# OpenCompass/opencompass/summarizers/subjective/compass_arena_bradley_terry.py

# flake8: noqa
import functools
import getpass
import json
import math
import multiprocessing as mp
import os
import os.path as osp
from datetime import datetime
from functools import partial
from typing import Any, Dict, List, Optional, Tuple
import mmengine
import numpy as np
import pandas as pd
import tabulate
from mmengine import ConfigDict
from scipy.optimize import minimize
from scipy.special import expit
from tqdm import tqdm
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.summarizers.default_subjective import \
model_abbr_from_cfg_used_in_summarizer
from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg,
get_infer_output_path, get_logger,
model_abbr_from_cfg)
from opencompass.utils.prompt import get_prompt_hash
STYLE_CONTROL_VARIABLES_V1 = [
'sum_assistant_tokens',
'header_count',
'list_count',
'bold_count',
]
EXTRA_CONTROL_VARIABLES = []
def get_matchups_models(df):
n_rows = len(df)
model_indices, models = pd.factorize(
pd.concat([df['model_a'], df['model_b']]))
matchups = np.column_stack(
[model_indices[:n_rows], model_indices[n_rows:]])
return matchups, models.to_list()
def preprocess_for_elo(df):
"""
    For Elo we want numpy arrays for matchups and outcomes:
    matchups: int32 (N, 2) array with the model ids of the two competitors in each match
    outcomes: float64 (N,) array with 1.0, 0.5, or 0.0 for a model_a win, tie, or loss
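    Illustrative example: two battles [A beats B, A ties C] give
    matchups == [[0, 1], [0, 2]], outcomes == [1.0, 0.5],
    models == ['A', 'B', 'C'] (ids follow pd.factorize order).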
"""
matchups, models = get_matchups_models(df)
outcomes = np.full(len(df), 0.5)
outcomes[df['winner'] == 'model_a'] = 1.0
outcomes[df['winner'] == 'model_b'] = 0.0
return matchups, outcomes, models
def preprocess_for_bt(df):
"""in BT we only need the unique (matchup,outcome) sets along with the
weights of how often they occur."""
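    # Illustrative example: if the pairing (A, B) with a model_a win occurs three
    # times in df, it is collapsed into a single matchups row with outcome 1.0
    # and weight 3.0, so the BT loss only evaluates that term once.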
n_rows = len(df)
# the 3 columns of schedule represent: model_a id, model_b id, outcome_id
schedule = np.full((n_rows, 3), fill_value=1, dtype=np.int32)
# set the two model cols by mapping the model names to their int ids
schedule[:, [0, 1]], models = get_matchups_models(df)
# map outcomes to integers (must be same dtype as model ids so it can be in the same array)
# model_a win -> 2, tie -> 1 (prefilled by default), model_b win -> 0
schedule[df['winner'] == 'model_a', 2] = 2
schedule[df['winner'] == 'model_b', 2] = 0
# count the number of occurrences of each observed result
matchups_outcomes, weights = np.unique(schedule,
return_counts=True,
axis=0)
matchups = matchups_outcomes[:, [0, 1]]
# map 2 -> 1.0, 1 -> 0.5, 0 -> 0.0 which will be used as labels during optimization
outcomes = matchups_outcomes[:, 2].astype(np.float64) / 2.0
weights = weights.astype(np.float64)
# each possible result is weighted according to number of times it occurred in the dataset
return matchups, outcomes, models, weights
def preprocess_for_style(
df,
apply_ratio: List[int] = None,
style_variables: List[str] = STYLE_CONTROL_VARIABLES_V1,
control_variables: List[str] = EXTRA_CONTROL_VARIABLES,
style_var_suffixes: List[str] = None,
add_one: bool = True,
normalize_style_features: bool = True,
):
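    # Builds the per-battle feature matrix used for style control: for every
    # style variable the (model_a - model_b) difference of the conv_metadata
    # counts, optionally divided by their sum (apply_ratio) and z-normalized,
    # plus any extra control columns taken directly from df.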
matchups, outcomes, models = preprocess_for_elo(
df) # this can use the same preprocessing as Elo
n = matchups.shape[0]
style_k = int(len(style_variables))
if control_variables is not None:
control_k = int(len(control_variables))
else:
control_k = 0
    if apply_ratio is None:
apply_ratio = np.repeat(1, style_k)
def extract_feature(x, feature):
val = x[feature]
if isinstance(val, int):
return val
else:
return sum(val.values())
## Style variables
if style_var_suffixes is None:
style_var_suffixes = ['_a', '_b']
style_vector = np.zeros(shape=(2 * style_k, n), dtype=np.int32)
for idx1, model_suffix in enumerate(style_var_suffixes):
for idx, element in enumerate(style_variables):
style_vector[idx + (idx1 * style_k), :] = df.conv_metadata.map(
partial(extract_feature,
feature=f'{element}{model_suffix}')).values
style_vector = np.ascontiguousarray(style_vector)
style_diff = (style_vector[:style_k] -
style_vector[style_k:]).astype(float)
style_sum = (style_vector[:style_k] + style_vector[style_k:]).astype(float)
# Add one to prevent division by zero
if add_one:
style_sum = style_sum + np.ones(style_diff.shape)
apply_ratio = np.flatnonzero(apply_ratio)
# Apply ratio where necessary (length, etc)
style_diff[apply_ratio] /= style_sum[apply_ratio]
style_mean = np.mean(style_diff, axis=1)
if normalize_style_features:
style_std = np.std(style_diff, axis=1)
# # features = normalize(style_diff)
style_features = ((style_diff - style_mean[:, np.newaxis]) /
style_std[:, np.newaxis]).T
else:
style_features = style_diff.T
## Other control variables
if control_k > 0:
control_vector = np.zeros(shape=(control_k, n), dtype=np.int32)
for idx, element in enumerate(control_variables):
control_vector[idx, :] = df[element]
control_vector = np.ascontiguousarray(control_vector).astype(float)
control_features = control_vector.T
# combine style and other control features
features = np.hstack([style_features, control_features])
else:
features = style_features
return matchups, features, outcomes, models
def fit_vectorized_elo(
matchups,
outcomes,
sample_indices,
num_models: int,
k: float = 4.0,
base: float = 10.0,
init_rating: float = 1000.0,
scale: float = 400.0,
):
"""fit multiple sets of Elo ratings on different samples of the data at the
same time."""
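    # sample_indices has shape (len(df), num_samples): column j holds the match
    # indices (drawn with replacement) of bootstrap sample j, so each loop
    # iteration below applies one sequential Elo update to every sample at once.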
alpha = math.log(base) / scale
num_samples = sample_indices.shape[1]
ratings = np.zeros(shape=(num_samples, num_models), dtype=np.float64)
# iterate over the rows of sample_indices, each column is an index into a match in the input arrays
sample_range = np.arange(num_samples)
for matchup_indices in sample_indices:
model_a_indices = matchups[matchup_indices, 0]
model_b_indices = matchups[matchup_indices, 1]
model_a_ratings = ratings[sample_range, model_a_indices]
model_b_ratings = ratings[sample_range, model_b_indices]
sample_outcomes = outcomes[matchup_indices]
probs = expit(alpha * (model_a_ratings - model_b_ratings))
updates = k * (sample_outcomes - probs)
ratings[sample_range, model_a_indices] += updates
ratings[sample_range, model_b_indices] -= updates
return ratings + init_rating
def compute_elo(
df,
k: float = 4.0,
base: float = 10.0,
init_rating: float = 1000.0,
scale: float = 400.0,
):
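    # Sequential (online) Elo: after every battle the winner gains and the loser
    # loses k * (outcome - expected), where
    # expected = 1 / (1 + base ** ((rating_b - rating_a) / scale)).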
matchups, outcomes, models = preprocess_for_elo(df)
alpha = math.log(base) / scale
ratings = np.full(shape=(len(models), ), fill_value=init_rating)
for (model_a_idx, model_b_idx), outcome in zip(matchups, outcomes):
prob = 1.0 / (1.0 +
math.exp(alpha *
(ratings[model_b_idx] - ratings[model_a_idx])))
update = k * (outcome - prob)
ratings[model_a_idx] += update
ratings[model_b_idx] -= update
return {model: ratings[idx] for idx, model in enumerate(models)}
def compute_bootstrap_elo(
df,
num_round: int = 100,
k: float = 4.0,
base: float = 10.0,
init_rating: float = 1000.0,
scale: float = 400.0,
):
matchups, outcomes, models = preprocess_for_elo(df)
sample_indices = np.random.randint(low=0,
high=len(df),
size=(len(df), num_round))
ratings = fit_vectorized_elo(matchups, outcomes, sample_indices,
len(models), k, base, init_rating, scale)
df = pd.DataFrame(data=ratings, columns=models)
return df[df.median().sort_values(ascending=False).index]
def bt_loss_and_grad(ratings, matchups, outcomes, weights, alpha=1.0):
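    # Weighted negative log-likelihood of the Bradley-Terry model:
    #   loss = -sum_i w_i * (y_i * log(p_i) + (1 - y_i) * log(1 - p_i)),
    # where p_i = sigmoid(alpha * (rating_a_i - rating_b_i)) and y_i is 1.0,
    # 0.5 or 0.0 for a model_a win, tie or loss.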
matchup_ratings = ratings[matchups]
logits = alpha * (matchup_ratings[:, 0] - matchup_ratings[:, 1])
probs = expit(logits)
# this form naturally counts a draw as half a win and half a loss
loss = -((np.log(probs) * outcomes + np.log(1.0 - probs) *
(1.0 - outcomes)) * weights).sum()
matchups_grads = -alpha * (outcomes - probs) * weights
model_grad = np.zeros_like(ratings)
# aggregate gradients at the model level using the indices in matchups
np.add.at(
model_grad,
matchups[:, [0, 1]],
matchups_grads[:, None] * np.array([1.0, -1.0], dtype=np.float64),
)
return loss, model_grad
def fit_bt(matchups, outcomes, weights, n_models, alpha, tol=1e-6):
initial_ratings = np.zeros(n_models, dtype=np.float64)
result = minimize(
fun=bt_loss_and_grad,
x0=initial_ratings,
args=(matchups, outcomes, weights, alpha),
jac=True,
method='L-BFGS-B',
options={
'disp': False,
'maxiter': 100,
'gtol': tol
},
)
return result['x']
def scale_and_offset(
ratings,
models,
scale: float = 400.0,
init_rating: float = 1000.0,
baseline_model: str = None,
baseline_rating: float = 1000.0,
):
"""convert ratings from the natural scale to the Elo rating scale with an
anchored baseline."""
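    # For example, with the defaults a natural-scale rating of 0.0 maps to 1000
    # and 1.0 maps to 1400; if baseline_model is given, every rating is then
    # shifted so that the baseline sits exactly at baseline_rating.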
scaled_ratings = (ratings * scale) + init_rating
if baseline_model is not None:
if baseline_model in models:
baseline_idx = models.index(baseline_model)
scaled_ratings += baseline_rating - scaled_ratings[...,
[baseline_idx]]
return scaled_ratings
def compute_bt(
df,
base: float = 10.0,
scale: float = 400.0,
init_rating: float = 1000.0,
baseline_model: str = None,
baseline_rating: float = 1000.0,
tol: float = 1e-6,
):
matchups, outcomes, models, weights = preprocess_for_bt(df)
ratings = fit_bt(matchups, outcomes, weights, len(models), math.log(base),
tol)
scaled_ratings = scale_and_offset(
ratings=ratings,
models=models,
scale=scale,
init_rating=init_rating,
baseline_model=baseline_model,
baseline_rating=baseline_rating,
)
return pd.Series(scaled_ratings, index=models).sort_values(ascending=False)
def compute_bootstrap_bt(
battles,
num_round: int,
base: float = 10.0,
scale: float = 400.0,
init_rating: float = 1000.0,
baseline_model: str = None,
baseline_rating: float = 1000.0,
tol: float = 1e-6,
num_cpu: int = None,
):
matchups, outcomes, models, weights = preprocess_for_bt(battles)
# bootstrap sample the unique outcomes and their counts directly using the multinomial distribution
rng = np.random.default_rng(seed=0)
idxs = rng.multinomial(n=len(battles),
pvals=weights / weights.sum(),
size=(num_round))
# only the distribution over their occurrence counts changes between samples (and it can be 0)
boot_weights = idxs.astype(np.float64) / len(battles)
# the only thing different across samples is the distribution of weights
bt_fn = partial(fit_bt,
matchups,
outcomes,
n_models=len(models),
alpha=np.log(base),
tol=tol)
with mp.Pool(num_cpu if num_cpu else os.cpu_count() - 1) as pool:
results = list(
tqdm(pool.imap_unordered(bt_fn, boot_weights), total=num_round))
ratings = np.array(results)
scaled_ratings = scale_and_offset(
ratings=ratings,
models=models,
scale=scale,
init_rating=init_rating,
baseline_model=baseline_model,
baseline_rating=baseline_rating,
)
df = pd.DataFrame(scaled_ratings, columns=models)
return df[df.median().sort_values(ascending=False).index]
DIFF_MASK = np.array(
[1.0, -1.0], dtype=np.float64
) # create globally to not incur the instantiation cost in each call
def contextual_bt_loss_and_grad(
params,
n_competitors,
matchups,
features,
outcomes,
alpha=1.0,
reg=1.0,
half_reg=0.5,
):
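    # Contextual Bradley-Terry: the win probability mixes the rating gap with a
    # linear term over the style/control features,
    #   p_i = sigmoid(alpha * (rating_a_i - rating_b_i) + features_i . beta),
    # and the loss is the L2-regularized negative log-likelihood.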
reg_loss = half_reg * np.inner(params, params)
# Split params into ratings and feature parameters
ratings = params[:n_competitors]
feature_params = params[n_competitors:]
matchup_ratings = ratings[matchups]
bt_logits = alpha * (matchup_ratings[:, 0] - matchup_ratings[:, 1])
context_logits = np.dot(features, feature_params)
probs = expit(bt_logits + context_logits)
loss = (-((np.log(probs) * outcomes + np.log(1.0 - probs) *
(1.0 - outcomes))).sum() + reg_loss)
error = outcomes - probs
grad = reg * params # initialize the grad as the regularization grad
matchups_grads = -alpha * error
np.add.at(grad[:n_competitors], matchups[:, [0, 1]],
matchups_grads[:, None] * DIFF_MASK)
grad[n_competitors:] -= np.dot(features.T, error)
return loss, grad
# Note on regularization:
# the default reg is 0.5, whereas the LogisticRegression default is 1.0:
# in the original implementation matchups were duplicated, which made the ratio
# of log loss to reg loss "twice as high", so in this non-duplicated version we
# also halve the reg to keep parity.
def fit_contextual_bt(
matchups,
features,
outcomes,
models,
idxs=None,
alpha=math.log(10.0),
reg=0.5,
tol=1e-6,
):
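    # Returns a single parameter vector of length n_models + n_features: the
    # first n_models entries are ratings, the rest are feature coefficients.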
n_features = features.shape[1]
n_models = len(models)
initial_params = np.zeros(n_models + n_features, dtype=np.float64)
half_reg = reg / 2.0
# sample idxs optionally allow for fitting on a bootstrap sample of the dataset
if idxs is not None:
matchups, features, outcomes = matchups[idxs], features[
idxs], outcomes[idxs]
result = minimize(
fun=contextual_bt_loss_and_grad,
x0=initial_params,
args=(n_models, matchups, features, outcomes, alpha, reg, half_reg),
jac=True,
method='L-BFGS-B',
options={
'disp': False,
'maxiter': 100,
'gtol': tol
},
)
return result['x']
def compute_style_control(
df: pd.DataFrame,
alpha: float = math.log(10.0),
reg: float = 0.5,
scale: float = 400.0,
init_rating: float = 1000.0,
baseline_model: str = None,
baseline_rating: float = 1000.0,
normalize_style_features: bool = True,
control_variables: List[str] = None,
odds_ratio: bool = True,
tol: float = 1e-6,
):
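    # Minimal usage sketch (illustrative names; df is expected to carry
    # 'model_a', 'model_b', 'winner' and a 'conv_metadata' dict with the
    # *_a / *_b counts listed in STYLE_CONTROL_VARIABLES_V1):
    #   ratings, coefs = compute_style_control(df, baseline_model='model-x',
    #                                           control_variables=['difficulty'])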
    if control_variables is not None:
        # One-hot encode the categorical control variables. The model is fitted
        # without an intercept, so we keep all levels of each categorical.
        _df = pd.get_dummies(
            data=df,
            columns=control_variables,
            drop_first=False,
        )
        # Collect the names of the one-hot encoded control columns
        one_hot_ctrls = []
        for col in _df.columns:
            for ctrl_var in control_variables:
                if col.startswith(ctrl_var):
                    one_hot_ctrls.append(col)
                    break
    else:
        # No extra control variables were provided; fall back to the raw df
        _df = df
        one_hot_ctrls = []
matchups, features, outcomes, models = preprocess_for_style(
_df,
normalize_style_features=normalize_style_features,
style_variables=STYLE_CONTROL_VARIABLES_V1,
control_variables=one_hot_ctrls,
)
ratings_params = fit_contextual_bt(
matchups,
features,
outcomes,
models=models,
alpha=alpha,
reg=reg,
tol=tol,
)
ratings = ratings_params[:len(models)]
if odds_ratio:
params = np.exp(ratings_params[len(models):])
else:
params = ratings_params[len(models):]
scaled_ratings = scale_and_offset(
ratings=ratings,
models=models,
scale=scale,
init_rating=init_rating,
baseline_model=baseline_model,
baseline_rating=baseline_rating,
)
scaled_ratings = pd.Series(scaled_ratings,
index=models).sort_values(ascending=False)
control_coefficients = {
k: v
for k, v in zip(STYLE_CONTROL_VARIABLES_V1 + one_hot_ctrls, params)
}
return scaled_ratings, control_coefficients
def compute_bootstrap_style_control(
df,
num_round: int,
alpha: float = math.log(10.0),
reg: float = 0.5,
scale: float = 400.0,
init_rating: float = 1000.0,
baseline_model: str = None,
baseline_rating: float = 1000.0,
normalize_style_features: bool = True,
control_variables: List[str] = None,
odds_ratio: bool = True,
tol: float = 1e-6,
num_cpu: int = None,
):
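    # Same model as compute_style_control, but refit num_round times on
    # bootstrap resamples of the battles; returns a (num_round x n_models)
    # DataFrame of ratings plus bootstrap-averaged control coefficients.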
    if control_variables is not None:
        # One-hot encode the categorical control variables. The model is fitted
        # without an intercept, so we keep all levels of each categorical.
        _df = pd.get_dummies(
            data=df,
            columns=control_variables,
            drop_first=False,
        )
        # Collect the names of the one-hot encoded control columns
        one_hot_ctrls = []
        for col in _df.columns:
            for ctrl_var in control_variables:
                if col.startswith(ctrl_var):
                    one_hot_ctrls.append(col)
                    break
    else:
        # No extra control variables were provided; fall back to the raw df
        _df = df
        one_hot_ctrls = []
matchups, features, outcomes, models = preprocess_for_style(
_df,
normalize_style_features=normalize_style_features,
style_variables=STYLE_CONTROL_VARIABLES_V1,
control_variables=one_hot_ctrls,
)
contextual_bt_fn = partial(
fit_contextual_bt,
matchups,
features,
outcomes,
models,
alpha=alpha,
reg=reg,
tol=tol,
)
boot_idxs = np.random.randint(low=0,
high=matchups.shape[0],
size=(num_round, matchups.shape[0]))
with mp.Pool(num_cpu if num_cpu else os.cpu_count()) as pool:
results = list(
tqdm(pool.imap_unordered(contextual_bt_fn, boot_idxs),
total=num_round))
ratings_params = np.array(results)
ratings = ratings_params[:, :len(models)]
if odds_ratio:
params = np.exp(ratings_params[:, len(models):].mean(axis=0))
else:
params = ratings_params[:, len(models):].mean(axis=0)
scaled_ratings = scale_and_offset(
ratings=ratings,
models=models,
scale=scale,
init_rating=init_rating,
baseline_model=baseline_model,
baseline_rating=baseline_rating,
)
df = pd.DataFrame(scaled_ratings, columns=models)
control_coefficients = {
k: v
for k, v in zip(STYLE_CONTROL_VARIABLES_V1 + one_hot_ctrls, params)
}
return df[df.median().sort_values(
ascending=False).index], control_coefficients
class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
"""Summarizer for fitting and Bradley-Terry model to pairwise matchups
according to https://github.com/lm-sys/FastChat/tree/main.
Args:
config (ConfigDict): The configuration object of the evaluation task. It's expected to be filled out at runtime.
dataset_abbrs (Optional[List[str]], optional): Dataset abbreviations to be listed in the summary. Defaults to None.
summary_groups (List, optional): Passed to DefaultSubjectiveSummarizer. Not used for this class. Defaults to None.
        prompt_db (Any, optional): Legacy parameter kept for backward compatibility. Defaults to None.
rating_system (str, optional): Rating system used. Currently only supports "bradleyterry". Defaults to "bradleyterry".
num_bootstrap (int, optional): The number of bootstraps for estimating the confidence intervals. Defaults to 300.
num_cpu (int, optional): The number of CPUs to use for the BT bootstrapping process. Defaults to None.
with_control_vars (bool, optional): Whether to include additional covariates (including style features and group variables) when fitting the BT model. Defaults to True.
normalize_style_features (bool, optional): Whether to normalize style features BEFORE fitting the BT model (implementation by FastChat). Turn this off for easier interpretation of odds ratios (when odds_ratio==True). Defaults to True.
odds_ratio (bool, optional): Whether to report odds ratios (np.exp(beta_k)) instead of the original coefficients. Defaults to True.
groups (List[str], optional): Group variables to include while fitting the BT model. These must be available in the input dataset for each observation. Defaults to None.
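
    Example:
        A minimal summarizer config sketch (illustrative values; the group
        names are placeholders and must exist as columns in the judged
        matches):

            summarizer = dict(
                type=CompassArenaBradleyTerrySummarizer,
                rating_system='bradleyterry',
                num_bootstrap=100,
                num_cpu=None,
                with_control_vars=True,
                normalize_style_features=False,
                odds_ratio=True,
                groups=['difficulty', 'category'],
            )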
"""
def __init__(
self,
config: ConfigDict,
dataset_abbrs: Optional[List[str]] = None,
summary_groups: List = None,
prompt_db=None,
rating_system: str = 'bradleyterry',
num_bootstrap: int = 300,
num_cpu: int = None,
with_control_vars: bool = True,
normalize_style_features: bool = True,
odds_ratio: bool = True,
groups: List[str] = None,
) -> None:
summary_groups = [] if summary_groups is None else summary_groups
super().__init__(config, dataset_abbrs, summary_groups, prompt_db)
self.summarizer_cfg = self.cfg['summarizer']
self.rating_system = 'bradleyterry' # Only bradleyterry supported
self.num_bootstrap = num_bootstrap
self.num_cpu = num_cpu
self.with_control_vars = with_control_vars
self.normalize_style_features = normalize_style_features
self.odds_ratio = odds_ratio
self.groups = [] if groups is None else groups
def _pick_up_results(self, judge_abbr):
"""The function reads the numerical results of evaluations from the
output folder based on the configuration file, and ultimately returns
four dictionaries, each containing processed information in different
formats. The contents of the four dictionaries are as follows:
- raw_results: contains the raw results of each model on each dataset (excluding details).
- parsed_results: contains the results of each model on each dataset for each metric, with metrics in METRIC_BLACKLIST being ignored.
- dataset_metrics: contains the list of metrics for each dataset, consistent with the metrics in parsed_results. The list is ordered according to the METRIC_WHITELIST,
with metrics appearing earlier considered more important.
- dataset_eval_mode: contains the evaluation mode for each dataset.
"""
        # raw_results: {dataset_abbr: {base_model_abbr: [matches]}}
raw_results: Dict[str, Dict[str, Any]] = {}
# # parsed_results: {model_abbr: {dataset_abbr: {metric: score}}}
# parsed_results: Dict[str, Dict[str, Dict[str, float]]] = {}
# # dataset_metrics: {dataset_abbr: [metric]}
# dataset_metrics: Dict[str, List[str]] = {}
for model in self.model_cfgs:
model_abbr = model_abbr_from_cfg_used_in_summarizer(model)
# parsed_results.setdefault(model_abbr, {})
# raw_results.setdefault(model_abbr, {})
for dataset in self.dataset_cfgs:
base_models = dataset.get('base_models', None)
if base_models is None:
                    raise ValueError(
                        'CompassArenaBradleyTerrySummarizer requires at least '
                        'one base model to be specified in `base_models` in '
                        'the dataset config.')
base_models_list = [item['abbr'] for item in base_models]
dataset_abbr = dataset_abbr_from_cfg(dataset)
raw_results.setdefault(dataset_abbr, {})
for base_model_abbr in base_models_list:
raw_results[dataset_abbr].setdefault(base_model_abbr, [])
origin_path = get_infer_output_path(
model, dataset, osp.join(self.work_dir, 'results'))
if base_model_abbr != '':
temp_path, dataset_json_name = (
origin_path.rsplit('/', 1)[0],
origin_path.rsplit('/', 1)[1],
)
filepath = osp.join(
temp_path.rsplit('/', 1)[0],
base_model_abbr + '_' +
temp_path.rsplit('/', 1)[1] + '_judged-by--' +
judge_abbr,
dataset_json_name,
)
else:
filepath = osp.join(
origin_path.rsplit('/', 1)[0] + '_judged-by--' +
judge_abbr,
origin_path.rsplit('/', 1)[1],
)
if not osp.exists(filepath):
continue
result = mmengine.load(filepath)
result.pop('details', None)
# raw_results[dataset_abbr] = result
raw_results[dataset_abbr][base_model_abbr].extend(
result['matches'])
if 'error' in result:
self.logger.debug(
f'error in {model_abbr} {dataset_abbr} {result["error"]}'
)
continue
# dataset_eval_mode: {dataset_abbr: eval_mode}
dataset_eval_mode: Dict[str, str] = {}
for dataset in self.dataset_cfgs:
inferencer = (dataset.get('infer_cfg', {}).get('inferencer',
{}).get('type', ''))
inferencer = (inferencer if isinstance(inferencer, str) else
inferencer.__name__)
dataset_abbr = dataset_abbr_from_cfg(dataset)
if 'GenInferencer' in inferencer:
dataset_eval_mode[dataset_abbr] = 'gen'
elif 'PPLInferencer' in inferencer:
dataset_eval_mode[dataset_abbr] = 'ppl'
elif 'LLInferencer' in inferencer:
dataset_eval_mode[dataset_abbr] = 'll'
else:
dataset_eval_mode[dataset_abbr] = 'unknown'
self.logger.warning(
f'unknown inferencer: {inferencer} - {dataset_abbr}')
# return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
return raw_results, dataset_eval_mode
def _calculate_ratings(
self,
matches: Dict,
base_model: str = None,
groups: List[str] = None,
) -> Tuple[pd.DataFrame, Dict]:
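        # Fits the (optionally style/group-controlled) Bradley-Terry model once
        # on all matches for the point estimates and num_bootstrap more times on
        # resampled matches for the confidence intervals, then assembles the
        # leaderboard table.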
rating_system = self.rating_system
num_bootstrap = self.num_bootstrap
num_cpu = self.num_cpu
with_control_vars = self.with_control_vars
matches_df = pd.DataFrame(matches)
num_battles = (matches_df['model_a'].value_counts().add(
matches_df['model_b'].value_counts(), fill_value=0))
# if rating_system == "bradleyterry":
if with_control_vars:
bootstrap_df, bootstrap_coef = compute_bootstrap_style_control(
df=matches_df,
num_round=num_bootstrap,
baseline_model=base_model,
normalize_style_features=self.normalize_style_features,
control_variables=groups,
odds_ratio=self.odds_ratio,
)
elo_rating_final, coef_final = compute_style_control(
df=matches_df,
baseline_model=base_model,
normalize_style_features=self.normalize_style_features,
control_variables=groups,
odds_ratio=self.odds_ratio,
)
else:
bootstrap_df = compute_bootstrap_bt(
battles=matches_df,
num_round=num_bootstrap,
baseline_model=base_model,
num_cpu=num_cpu,
)
elo_rating_final = compute_bt(
df=matches_df,
baseline_model=base_model,
)
# print(elo_rating_final)
# elif rating_system == "elo":
# bootstrap_df = compute_bootstrap_elo(
# df=matches_df,
# num_round=num_bootstrap,
# num_cpu=num_cpu,
# )
# elo_rating_final = compute_elo(matches_df)
model_rating_q025 = bootstrap_df.quantile(0.025)
model_rating_q975 = bootstrap_df.quantile(0.975)
# compute ranking based on CI
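        # ranking_ub: a model's rank only worsens when another model's 2.5%
        # quantile exceeds its own 97.5% quantile, i.e. when the bootstrap CIs
        # are clearly separated.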
model_order = list(elo_rating_final.index)
ranking = {}
for i, model_a in enumerate(model_order):
ranking[model_a] = 1
for j, model_b in enumerate(model_order):
if i == j:
continue
if model_rating_q025[model_b] > model_rating_q975[model_a]:
ranking[model_a] += 1
leaderboard_table_df = pd.DataFrame(
{
'rating': elo_rating_final,
'ranking_ub': pd.Series(ranking),
'std_dev': bootstrap_df.std(),
'rating_q975': model_rating_q975,
'rating_q025': model_rating_q025,
'num_battles': num_battles,
}, )
leaderboard_table_df['model_name'] = leaderboard_table_df.index
leaderboard_table_df.sort_values(
by=['rating'],
ascending=False,
inplace=True,
)
leaderboard_table_df['ranking'] = np.arange(
1,
len(leaderboard_table_df) + 1)
if rating_system == 'bradleyterry' and with_control_vars:
control_coefficients = {
'bootstrap': bootstrap_coef,
'final': coef_final,
}
else:
control_coefficients = {'final': []}
return leaderboard_table_df, control_coefficients['final']
def _output_to_file(
self,
output_path,
time_str: str,
tables: Dict,
metadata: Dict,
judge_abbr: str,
dataset_eval_mode: str,
):
# Output to file
if output_path is None:
output_path = osp.join(self.work_dir, 'summary',
f'summary_{time_str}.json')
output_csv_path = osp.join(self.work_dir, 'summary',
f'summary_{time_str}.csv')
else:
output_csv_path = output_path.replace('.json', '.csv')
output_path = output_path.split(
'.json')[0] + '_by_' + judge_abbr + '.json'
output_dir = osp.split(output_path)[0]
mmengine.mkdir_or_exist(output_dir)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(metadata, f, ensure_ascii=False, indent=4)
self.logger.info(f'write summary to {osp.abspath(output_path)}')
prompt_version = {
dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6]
for d in self.dataset_cfgs
}
full_results = []
for base_model_abbr, datasets in tables.items():
base_model_results = []
for dataset_abbr, table_df in datasets.items():
table_df['dataset'] = dataset_abbr
table_df['version'] = prompt_version.get(dataset_abbr, '-')
table_df['metric'] = 'bt_rating'
table_df['mode'] = dataset_eval_mode[dataset_abbr]
table_df['base_model'] = base_model_abbr
base_model_results.append(table_df)
cur_base_model_result_df = pd.concat(base_model_results)
full_results.append(cur_base_model_result_df)
full_results_df = pd.concat(full_results)
full_results_df = full_results_df[[
'dataset',
'version',
'base_model',
'metric',
'mode',
'ranking',
'ranking_ub',
'model_name',
'rating',
'rating_q975',
'rating_q025',
'std_dev',
'num_battles',
]]
output_csv_path = (output_csv_path.split('.csv')[0] + '_by_' +
judge_abbr + '.csv')
with pd.option_context(
'display.max_rows',
20,
'display.max_columns',
20,
'display.expand_frame_repr',
False,
):
print(full_results_df.reset_index(drop=True).round(2))
full_results_df.to_csv(
output_csv_path,
index=False,
)
self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')
def flip_dict_levels(self, original_dict: Dict):
"""Flips the two levels of a nested dictionary so that dict[lvl1][lvl2]
becomes dict[lvl2][lvl1].
Args:
original_dict (dict): The original nested dictionary.
Returns:
dict: The flipped dictionary.
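        Example:
            {'ds1': {'base1': 'x'}, 'ds2': {'base1': 'y'}} becomes
            {'base1': {'ds1': 'x', 'ds2': 'y'}}.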
"""
flipped_dict = {}
for lvl1, lvl2_dict in original_dict.items():
for lvl2, value in lvl2_dict.items():
if lvl2 not in flipped_dict:
flipped_dict[lvl2] = {}
flipped_dict[lvl2][lvl1] = value
return flipped_dict
def summarize(
self,
output_path: str = None,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
):
"""Summarize evaluation results and format output table.
Args:
output_path (str, optional): Output path. Defaults to None.
time_str (str, optional): Timestamp for file suffix. Defaults to
datetime.now().strftime('%Y%m%d_%H%M%S').
"""
all_scores = {}
for judge_model in self.judge_models:
control_coefficients = {}
leaderboard_tables = {}
judge_abbr = model_abbr_from_cfg(judge_model)
# pick up results
raw_results, dataset_eval_mode = self._pick_up_results(judge_abbr)
all_matches = []
for dataset_abbr, base_models in raw_results.items():
control_coefficients[dataset_abbr] = {}
leaderboard_tables[dataset_abbr] = {}
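                # Only the matches judged against the first base model are
                # pooled into all_matches for the combined all-dataset fit below.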
dataset_matches = base_models[list(base_models)[0]]
all_matches.extend(dataset_matches)
for base_model_abbr, matches in base_models.items():
cur_table_df, cur_ctrl_coefs = self._calculate_ratings(
matches=matches,
base_model=base_model_abbr,
groups=self.groups,
)
control_coefficients[dataset_abbr][
base_model_abbr] = cur_ctrl_coefs
leaderboard_tables[dataset_abbr][
base_model_abbr] = cur_table_df
print('-' * 10 +
f"{dataset_abbr + ':' + base_model_abbr}\n" +
'-' * 10)
# print(cur_table_df)
print(cur_ctrl_coefs)
leaderboard_tables = self.flip_dict_levels(leaderboard_tables)
# Output to .json / .csv files
self._output_to_file(
output_path=output_path,
time_str=time_str,
tables=leaderboard_tables,
metadata=control_coefficients,
judge_abbr=judge_abbr,
dataset_eval_mode=dataset_eval_mode,
)
# Fit another BT model with the first base_model and combining matches from all datasets
all_scores_df, all_scores_ctrl_coefs = self._calculate_ratings(
matches=all_matches,
base_model=list(base_models)[0],
groups=self.groups,
)
all_scores[judge_abbr] = pd.Series(
all_scores_df['rating'],
index=all_scores_df['model_name'],
).to_dict()
print(f'{all_scores=}')
print(f'{all_scores_ctrl_coefs=}')
return {'CompassArenaSubjBenchBradleyTerry': all_scores}