# flake8: noqa
import functools
import getpass
import json
import math
import multiprocessing as mp
import os
import os.path as osp
from collections import defaultdict
from datetime import datetime
from functools import partial
from typing import Any, Dict, List, Optional, Tuple

import mmengine
import numpy as np
import pandas as pd
import tabulate
from mmengine import ConfigDict
from scipy.optimize import minimize
from scipy.special import expit
from tqdm import tqdm

from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.summarizers.default_subjective import \
    model_abbr_from_cfg_used_in_summarizer
from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg,
                               get_infer_output_path, get_logger,
                               model_abbr_from_cfg)
from opencompass.utils.prompt import get_prompt_hash

STYLE_CONTROL_VARIABLES_V1 = [
    'sum_assistant_tokens',
    'header_count',
    'list_count',
    'bold_count',
]

EXTRA_CONTROL_VARIABLES = []


def get_matchups_models(df):
    n_rows = len(df)
    model_indices, models = pd.factorize(
        pd.concat([df['model_a'], df['model_b']]))
    matchups = np.column_stack(
        [model_indices[:n_rows], model_indices[n_rows:]])
    return matchups, models.to_list()


def preprocess_for_elo(df):
    """
    in Elo we want numpy arrays for matchups and outcomes
    matchups: int32 (N,2) contains model ids for the competitors in a match
    outcomes: float64 (N,) contains 1.0, 0.5, or 0.0 representing win, tie, or loss for model_a
    """
    matchups, models = get_matchups_models(df)
    outcomes = np.full(len(df), 0.5)
    outcomes[df['winner'] == 'model_a'] = 1.0
    outcomes[df['winner'] == 'model_b'] = 0.0
    return matchups, outcomes, models
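
# Illustrative sketch (comments only, not executed; the model names below are
# placeholders): given a battles frame such as
#
#     model_a    model_b    winner
#     gpt-4      llama-3    model_a
#     llama-3    gpt-4      tie
#
# preprocess_for_elo returns integer-id matchups of shape (2, 2), outcomes
# [1.0, 0.5] (any winner value other than 'model_a'/'model_b' is left at the
# 0.5 tie default), and the factorized list of model names.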


def preprocess_for_bt(df):
    """in BT we only need the unique (matchup,outcome) sets along with the
    weights of how often they occur."""
    n_rows = len(df)
    # the 3 columns of schedule represent: model_a id, model_b id, outcome_id
    schedule = np.full((n_rows, 3), fill_value=1, dtype=np.int32)
    # set the two model cols by mapping the model names to their int ids
    schedule[:, [0, 1]], models = get_matchups_models(df)
    # map outcomes to integers (must be same dtype as model ids so it can be in the same array)
    # model_a win -> 2, tie -> 1 (prefilled by default), model_b win -> 0
    schedule[df['winner'] == 'model_a', 2] = 2
    schedule[df['winner'] == 'model_b', 2] = 0
    # count the number of occurrences of each observed result
    matchups_outcomes, weights = np.unique(schedule,
                                           return_counts=True,
                                           axis=0)
    matchups = matchups_outcomes[:, [0, 1]]
    # map 2 -> 1.0, 1 -> 0.5, 0 -> 0.0 which will be used as labels during optimization
    outcomes = matchups_outcomes[:, 2].astype(np.float64) / 2.0
    weights = weights.astype(np.float64)
    # each possible result is weighted according to number of times it occurred in the dataset
    return matchups, outcomes, models, weights
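
# Illustrative sketch (comments only): if the same (model_a, model_b, winner)
# row occurs 10 times in the battles frame, preprocess_for_bt collapses it
# into a single matchup with outcome 1.0, 0.5, or 0.0 and weight 10.0, so the
# downstream optimizer works on unique results rather than individual battles.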


def preprocess_for_style(
    df,
    apply_ratio: List[int] = None,
    style_variables: List[str] = STYLE_CONTROL_VARIABLES_V1,
    control_variables: List[str] = EXTRA_CONTROL_VARIABLES,
    style_var_suffixes: List[str] = None,
    add_one: bool = True,
    normalize_style_features: bool = True,
):
    matchups, outcomes, models = preprocess_for_elo(
        df)  # this can use the same preprocessing as Elo

    n = matchups.shape[0]
    style_k = int(len(style_variables))

    if control_variables is not None:
        control_k = int(len(control_variables))
    else:
        control_k = 0

    if apply_ratio is None:
        apply_ratio = np.repeat(1, style_k)

    def extract_feature(x, feature):
        val = x[feature]
        if isinstance(val, int):
            return val
        else:
            return sum(val.values())

    ## Style variables
    if style_var_suffixes is None:
        style_var_suffixes = ['_a', '_b']

    style_vector = np.zeros(shape=(2 * style_k, n), dtype=np.int32)
    for idx1, model_suffix in enumerate(style_var_suffixes):
        for idx, element in enumerate(style_variables):
            style_vector[idx + (idx1 * style_k), :] = df.conv_metadata.map(
                partial(extract_feature,
                        feature=f'{element}{model_suffix}')).values

    style_vector = np.ascontiguousarray(style_vector)

    style_diff = (style_vector[:style_k] -
                  style_vector[style_k:]).astype(float)
    style_sum = (style_vector[:style_k] + style_vector[style_k:]).astype(float)

    # Add one to prevent division by zero
    if add_one:
        style_sum = style_sum + np.ones(style_diff.shape)

    apply_ratio = np.flatnonzero(apply_ratio)

    # Apply ratio where necessary (length, etc)
    style_diff[apply_ratio] /= style_sum[apply_ratio]

    style_mean = np.mean(style_diff, axis=1)

    if normalize_style_features:
        style_std = np.std(style_diff, axis=1)

        # # features = normalize(style_diff)
        style_features = ((style_diff - style_mean[:, np.newaxis]) /
                          style_std[:, np.newaxis]).T
    else:
        style_features = style_diff.T

    ## Other control variables
    if control_k > 0:
        control_vector = np.zeros(shape=(control_k, n), dtype=np.int32)
        for idx, element in enumerate(control_variables):
            control_vector[idx, :] = df[element]

        control_vector = np.ascontiguousarray(control_vector).astype(float)

        control_features = control_vector.T

        # combine style and other control features
        features = np.hstack([style_features, control_features])
    else:
        features = style_features

    return matchups, features, outcomes, models
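
# Illustrative sketch of the per-row metadata preprocess_for_style expects in
# df['conv_metadata'] (an assumption inferred from the feature names above;
# values may be plain ints or dicts whose values are summed by
# extract_feature -- the nested keys shown here are made up):
#
#     {
#         'sum_assistant_tokens_a': 512, 'sum_assistant_tokens_b': 640,
#         'header_count_a': {'h1': 0, 'h2': 2}, 'header_count_b': {'h1': 1, 'h2': 0},
#         'list_count_a': 3, 'list_count_b': 1,
#         'bold_count_a': 4, 'bold_count_b': 2,
#     }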


def fit_vectorized_elo(
    matchups,
    outcomes,
    sample_indices,
    num_models: int,
    k: float = 4.0,
    base: float = 10.0,
    init_rating: float = 1000.0,
    scale: float = 400.0,
):
    """fit multiple sets of Elo ratings on different samples of the data at
    the same time."""
    alpha = math.log(base) / scale
    num_samples = sample_indices.shape[1]
    ratings = np.zeros(shape=(num_samples, num_models), dtype=np.float64)
    # iterate over the rows of sample_indices, each column is an index into a match in the input arrays
    sample_range = np.arange(num_samples)
    for matchup_indices in sample_indices:
        model_a_indices = matchups[matchup_indices, 0]
        model_b_indices = matchups[matchup_indices, 1]
        model_a_ratings = ratings[sample_range, model_a_indices]
        model_b_ratings = ratings[sample_range, model_b_indices]
        sample_outcomes = outcomes[matchup_indices]
        probs = expit(alpha * (model_a_ratings - model_b_ratings))
        updates = k * (sample_outcomes - probs)
        ratings[sample_range, model_a_indices] += updates
        ratings[sample_range, model_b_indices] -= updates
    return ratings + init_rating


def compute_elo(
    df,
    k: float = 4.0,
    base: float = 10.0,
    init_rating: float = 1000.0,
    scale: float = 400.0,
):
    matchups, outcomes, models = preprocess_for_elo(df)
    alpha = math.log(base) / scale
    ratings = np.full(shape=(len(models), ), fill_value=init_rating)

    for (model_a_idx, model_b_idx), outcome in zip(matchups, outcomes):
        prob = 1.0 / (1.0 +
                      math.exp(alpha *
                               (ratings[model_b_idx] - ratings[model_a_idx])))
        update = k * (outcome - prob)
        ratings[model_a_idx] += update
        ratings[model_b_idx] -= update

    return {model: ratings[idx] for idx, model in enumerate(models)}
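
# Illustrative usage (comments only; 'battles' and the model names are
# placeholders):
#
# >>> battles = pd.DataFrame({
# ...     'model_a': ['gpt-4', 'llama-3', 'gpt-4'],
# ...     'model_b': ['llama-3', 'gpt-4', 'llama-3'],
# ...     'winner': ['model_a', 'tie', 'model_b'],
# ... })
# >>> compute_elo(battles)
# {'gpt-4': ..., 'llama-3': ...}  # ratings close to init_rating=1000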


def compute_bootstrap_elo(
    df,
    num_round: int = 100,
    k: float = 4.0,
    base: float = 10.0,
    init_rating: float = 1000.0,
    scale: float = 400.0,
):
    matchups, outcomes, models = preprocess_for_elo(df)
    sample_indices = np.random.randint(low=0,
                                       high=len(df),
                                       size=(len(df), num_round))
    ratings = fit_vectorized_elo(matchups, outcomes, sample_indices,
                                 len(models), k, base, init_rating, scale)
    df = pd.DataFrame(data=ratings, columns=models)
    return df[df.median().sort_values(ascending=False).index]


def bt_loss_and_grad(ratings, matchups, outcomes, weights, alpha=1.0):
    matchup_ratings = ratings[matchups]
    logits = alpha * (matchup_ratings[:, 0] - matchup_ratings[:, 1])
    probs = expit(logits)
    # this form naturally counts a draw as half a win and half a loss
    loss = -((np.log(probs) * outcomes + np.log(1.0 - probs) *
              (1.0 - outcomes)) * weights).sum()
    matchups_grads = -alpha * (outcomes - probs) * weights
    model_grad = np.zeros_like(ratings)
    # aggregate gradients at the model level using the indices in matchups
    np.add.at(
        model_grad,
        matchups[:, [0, 1]],
        matchups_grads[:, None] * np.array([1.0, -1.0], dtype=np.float64),
    )
    return loss, model_grad


def fit_bt(matchups, outcomes, weights, n_models, alpha, tol=1e-6):
    initial_ratings = np.zeros(n_models, dtype=np.float64)
    result = minimize(
        fun=bt_loss_and_grad,
        x0=initial_ratings,
        args=(matchups, outcomes, weights, alpha),
        jac=True,
        method='L-BFGS-B',
        options={
            'disp': False,
            'maxiter': 100,
            'gtol': tol
        },
    )
    return result['x']


def scale_and_offset(
    ratings,
    models,
    scale: float = 400.0,
    init_rating: float = 1000.0,
    baseline_model: str = None,
    baseline_rating: float = 1000.0,
):
    """convert ratings from the natural scale to the Elo rating scale with an
    anchored baseline."""
    scaled_ratings = (ratings * scale) + init_rating

    if baseline_model is not None:
        if baseline_model in models:
            baseline_idx = models.index(baseline_model)
            scaled_ratings += baseline_rating - scaled_ratings[...,
                                                               [baseline_idx]]

    return scaled_ratings
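
# Worked example (a sketch): with the defaults scale=400 and init_rating=1000,
# a natural-scale rating of 0.0 maps to 0.0 * 400 + 1000 = 1000 and 1.0 maps
# to 1400. If baseline_model is given (and present in `models`), the whole
# rating vector is shifted so the baseline lands exactly on baseline_rating.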


def compute_bt(
    df,
    base: float = 10.0,
    scale: float = 400.0,
    init_rating: float = 1000.0,
    baseline_model: str = None,
    baseline_rating: float = 1000.0,
    tol: float = 1e-6,
):
    matchups, outcomes, models, weights = preprocess_for_bt(df)
    ratings = fit_bt(matchups, outcomes, weights, len(models), math.log(base),
                     tol)

    scaled_ratings = scale_and_offset(
        ratings=ratings,
        models=models,
        scale=scale,
        init_rating=init_rating,
        baseline_model=baseline_model,
        baseline_rating=baseline_rating,
    )

    return pd.Series(scaled_ratings, index=models).sort_values(ascending=False)
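
# Illustrative usage (comments only; 'battles' and the baseline name are
# placeholders):
#
# >>> ratings = compute_bt(battles, baseline_model='llama-3')
# >>> ratings  # pd.Series of Elo-scaled ratings, best model first, with the
# ...          # baseline anchored at baseline_rating (1000 by default)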


def compute_bootstrap_bt(
    battles,
    num_round: int,
    base: float = 10.0,
    scale: float = 400.0,
    init_rating: float = 1000.0,
    baseline_model: str = None,
    baseline_rating: float = 1000.0,
    tol: float = 1e-6,
    num_cpu: int = None,
):
    matchups, outcomes, models, weights = preprocess_for_bt(battles)
    # bootstrap sample the unique outcomes and their counts directly using the multinomial distribution
    rng = np.random.default_rng(seed=0)
    idxs = rng.multinomial(n=len(battles),
                           pvals=weights / weights.sum(),
                           size=(num_round))
    # only the distribution over their occurrence counts changes between samples (and it can be 0)
    boot_weights = idxs.astype(np.float64) / len(battles)

    # the only thing different across samples is the distribution of weights
    bt_fn = partial(fit_bt,
                    matchups,
                    outcomes,
                    n_models=len(models),
                    alpha=np.log(base),
                    tol=tol)
    with mp.Pool(num_cpu if num_cpu else os.cpu_count() - 1) as pool:
        results = list(
            tqdm(pool.imap_unordered(bt_fn, boot_weights), total=num_round))

    ratings = np.array(results)

    scaled_ratings = scale_and_offset(
        ratings=ratings,
        models=models,
        scale=scale,
        init_rating=init_rating,
        baseline_model=baseline_model,
        baseline_rating=baseline_rating,
    )

    df = pd.DataFrame(scaled_ratings, columns=models)
    return df[df.median().sort_values(ascending=False).index]


DIFF_MASK = np.array(
    [1.0, -1.0], dtype=np.float64
)  # create globally to not incur the instantiation cost in each call


def contextual_bt_loss_and_grad(
    params,
    n_competitors,
    matchups,
    features,
    outcomes,
    alpha=1.0,
    reg=1.0,
    half_reg=0.5,
):
    reg_loss = half_reg * np.inner(params, params)

    # Split params into ratings and feature parameters
    ratings = params[:n_competitors]
    feature_params = params[n_competitors:]

    matchup_ratings = ratings[matchups]
    bt_logits = alpha * (matchup_ratings[:, 0] - matchup_ratings[:, 1])
    context_logits = np.dot(features, feature_params)
    probs = expit(bt_logits + context_logits)
    loss = (-((np.log(probs) * outcomes + np.log(1.0 - probs) *
               (1.0 - outcomes))).sum() + reg_loss)

    error = outcomes - probs
    grad = reg * params  # initialize the grad as the regularization grad
    matchups_grads = -alpha * error
    np.add.at(grad[:n_competitors], matchups[:, [0, 1]],
              matchups_grads[:, None] * DIFF_MASK)
    grad[n_competitors:] -= np.dot(features.T, error)
    return loss, grad


# note on regularization:
# default reg is 0.5 since the LogisticRegression default is 1.0
# in the original implementation, matchups were duplicated
# that made the ratio of log loss to reg loss "twice as high"
# in this non-duplicated version, for parity, we also reduce the reg by one half to match
def fit_contextual_bt(
    matchups,
    features,
    outcomes,
    models,
    idxs=None,
    alpha=math.log(10.0),
    reg=0.5,
    tol=1e-6,
):
    n_features = features.shape[1]
    n_models = len(models)
    initial_params = np.zeros(n_models + n_features, dtype=np.float64)
    half_reg = reg / 2.0

    # sample idxs optionally allow for fitting on a bootstrap sample of the dataset
    if idxs is not None:
        matchups, features, outcomes = matchups[idxs], features[
            idxs], outcomes[idxs]

    result = minimize(
        fun=contextual_bt_loss_and_grad,
        x0=initial_params,
        args=(n_models, matchups, features, outcomes, alpha, reg, half_reg),
        jac=True,
        method='L-BFGS-B',
        options={
            'disp': False,
            'maxiter': 100,
            'gtol': tol
        },
    )
    return result['x']


def compute_style_control(
    df: pd.DataFrame,
    alpha: float = math.log(10.0),
    reg: float = 0.5,
    scale: float = 400.0,
    init_rating: float = 1000.0,
    baseline_model: str = None,
    baseline_rating: float = 1000.0,
    normalize_style_features: bool = True,
    control_variables: List[str] = None,
    odds_ratio: bool = True,
    tol: float = 1e-6,
):
    if control_variables is not None:
        # Since the model is fitted without an intercept, we keep all levels
        # of each categorical (drop_first=False).
        _df = pd.get_dummies(
            data=df,
            columns=control_variables,
            drop_first=False,
        )

        # Collect the names of the one-hot encoded control columns
        one_hot_ctrls = []
        for col in _df.columns:
            for ctrl_var in control_variables:
                if col.startswith(ctrl_var):
                    one_hot_ctrls.append(col)
                    break
    else:
        # No extra control variables: fall back to the raw dataframe so the
        # style-only model can still be fitted.
        _df = df
        one_hot_ctrls = []

    matchups, features, outcomes, models = preprocess_for_style(
        _df,
        normalize_style_features=normalize_style_features,
        style_variables=STYLE_CONTROL_VARIABLES_V1,
        control_variables=one_hot_ctrls,
    )
    ratings_params = fit_contextual_bt(
        matchups,
        features,
        outcomes,
        models=models,
        alpha=alpha,
        reg=reg,
        tol=tol,
    )
    ratings = ratings_params[:len(models)]

    if odds_ratio:
        params = np.exp(ratings_params[len(models):])
    else:
        params = ratings_params[len(models):]

    scaled_ratings = scale_and_offset(
        ratings=ratings,
        models=models,
        scale=scale,
        init_rating=init_rating,
        baseline_model=baseline_model,
        baseline_rating=baseline_rating,
    )
    scaled_ratings = pd.Series(scaled_ratings,
                               index=models).sort_values(ascending=False)

    control_coefficients = {
        k: v
        for k, v in zip(STYLE_CONTROL_VARIABLES_V1 + one_hot_ctrls, params)
    }

    return scaled_ratings, control_coefficients
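
# Illustrative usage (comments only): the battles frame must carry a
# 'conv_metadata' column for the style features and, if control_variables is
# passed, one column per listed group variable ('difficulty' below is a
# made-up example column):
#
# >>> ratings, coefs = compute_style_control(
# ...     battles,
# ...     baseline_model='llama-3',
# ...     control_variables=['difficulty'],
# ... )
# >>> coefs  # odds ratios (odds_ratio=True) for the four style features plus
# ...        # one entry per one-hot encoded level of 'difficulty'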


def compute_bootstrap_style_control(
    df,
    num_round: int,
    alpha: float = math.log(10.0),
    reg: float = 0.5,
    scale: float = 400.0,
    init_rating: float = 1000.0,
    baseline_model: str = None,
    baseline_rating: float = 1000.0,
    normalize_style_features: bool = True,
    control_variables: List[str] = None,
    odds_ratio: bool = True,
    tol: float = 1e-6,
    num_cpu: int = None,
):
    if control_variables is not None:
        # Since the model is fitted without an intercept, we keep all levels
        # of each categorical (drop_first=False).
        _df = pd.get_dummies(
            data=df,
            columns=control_variables,
            drop_first=False,
        )

        # Collect the names of the one-hot encoded control columns
        one_hot_ctrls = []
        for col in _df.columns:
            for ctrl_var in control_variables:
                if col.startswith(ctrl_var):
                    one_hot_ctrls.append(col)
                    break
    else:
        # No extra control variables: fall back to the raw dataframe so the
        # style-only model can still be fitted.
        _df = df
        one_hot_ctrls = []

    matchups, features, outcomes, models = preprocess_for_style(
        _df,
        normalize_style_features=normalize_style_features,
        style_variables=STYLE_CONTROL_VARIABLES_V1,
        control_variables=one_hot_ctrls,
    )

    contextual_bt_fn = partial(
        fit_contextual_bt,
        matchups,
        features,
        outcomes,
        models,
        alpha=alpha,
        reg=reg,
        tol=tol,
    )

    boot_idxs = np.random.randint(low=0,
                                  high=matchups.shape[0],
                                  size=(num_round, matchups.shape[0]))

    with mp.Pool(num_cpu if num_cpu else os.cpu_count()) as pool:
        results = list(
            tqdm(pool.imap_unordered(contextual_bt_fn, boot_idxs),
                 total=num_round))

    ratings_params = np.array(results)
    ratings = ratings_params[:, :len(models)]

    if odds_ratio:
        params = np.exp(ratings_params[:, len(models):].mean(axis=0))
    else:
        params = ratings_params[:, len(models):].mean(axis=0)

    scaled_ratings = scale_and_offset(
        ratings=ratings,
        models=models,
        scale=scale,
        init_rating=init_rating,
        baseline_model=baseline_model,
        baseline_rating=baseline_rating,
    )
    df = pd.DataFrame(scaled_ratings, columns=models)

    control_coefficients = {
        k: v
        for k, v in zip(STYLE_CONTROL_VARIABLES_V1 + one_hot_ctrls, params)
    }

    return df[df.median().sort_values(
        ascending=False).index], control_coefficients


class CompassArenaBradleyTerrySummarizer(DefaultSubjectiveSummarizer):
    """Summarizer for fitting a Bradley-Terry model to pairwise matchups,
    following https://github.com/lm-sys/FastChat/tree/main.

    Args:
        config (ConfigDict): The configuration object of the evaluation task. It's expected to be filled out at runtime.
        dataset_abbrs (Optional[List[str]], optional): Dataset abbreviations to be listed in the summary. Defaults to None.
        summary_groups (List, optional): Passed to DefaultSubjectiveSummarizer. Not used for this class. Defaults to None.
        prompt_db (_type_, optional): Legacy parameter kept for backward compatibility. Defaults to None.
        rating_system (str, optional): Rating system used. Currently only supports "bradleyterry". Defaults to "bradleyterry".
        report_pred_win_rates (bool, optional): Whether to report the predicted win rates (against the baseline model) instead of the arena ratings. Defaults to True.
        num_bootstrap (int, optional): The number of bootstraps for estimating the confidence intervals. Defaults to 300.
        num_cpu (int, optional): The number of CPUs to use for the BT bootstrapping process. Defaults to None.
        with_control_vars (bool, optional): Whether to include additional covariates (including style features and group variables) when fitting the BT model. Defaults to True.
        normalize_style_features (bool, optional): Whether to normalize style features BEFORE fitting the BT model (implementation by FastChat). Turn this off for easier interpretation of odds ratios (when odds_ratio==True). Defaults to True.
        odds_ratio (bool, optional): Whether to report odds ratios (np.exp(beta_k)) instead of the original coefficients. Defaults to True.
        groups (List[str], optional): Group variables to include while fitting the BT model. These must be available in the input dataset for each observation. Defaults to None.
    """

    def __init__(
        self,
        config: ConfigDict,
        dataset_abbrs: Optional[List[str]] = None,
        summary_groups: List = None,
        prompt_db=None,
        rating_system: str = 'bradleyterry',
        report_pred_win_rates: bool = True,
        num_bootstrap: int = 300,
        num_cpu: int = None,
        with_control_vars: bool = True,
        normalize_style_features: bool = True,
        odds_ratio: bool = True,
        groups: List[str] = None,
    ) -> None:
        summary_groups = [] if summary_groups is None else summary_groups
        super().__init__(config, dataset_abbrs, summary_groups, prompt_db)

        self.summarizer_cfg = self.cfg['summarizer']
        self.rating_system = 'bradleyterry'  # Only bradleyterry is supported
        self.report_pred_win_rates = report_pred_win_rates
        self.num_bootstrap = num_bootstrap
        self.num_cpu = num_cpu
        self.with_control_vars = with_control_vars
        self.normalize_style_features = normalize_style_features
        self.odds_ratio = odds_ratio
        self.groups = [] if groups is None else groups

    def _pick_up_results(self, judge_abbr):
        """Read the numerical evaluation results from the output folder based
        on the configuration file and return two dictionaries:

        - raw_results: the pairwise match records for each dataset, grouped by
          base model (the 'details' field is dropped).
        - dataset_eval_mode: the evaluation mode for each dataset.
        """
        # raw_results: {dataset_abbr: {base_model_abbr: [match, ...]}}
        raw_results: Dict[str, Dict[str, Any]] = {}
        # # parsed_results: {model_abbr: {dataset_abbr: {metric: score}}}
        # parsed_results: Dict[str, Dict[str, Dict[str, float]]] = {}
        # # dataset_metrics: {dataset_abbr: [metric]}
        # dataset_metrics: Dict[str, List[str]] = {}

        for model in self.model_cfgs:
            model_abbr = model_abbr_from_cfg_used_in_summarizer(model)
            # parsed_results.setdefault(model_abbr, {})
            # raw_results.setdefault(model_abbr, {})

            for dataset in self.dataset_cfgs:
                base_models = dataset.get('base_models', None)
                if base_models is None:
                    raise ValueError(
                        'CompassArenaBradleyTerrySummarizer requires at least one `base_model` specified in the dataset config.'
                    )

                base_models_list = [item['abbr'] for item in base_models]

                dataset_abbr = dataset_abbr_from_cfg(dataset)
                raw_results.setdefault(dataset_abbr, {})

                for base_model_abbr in base_models_list:
                    raw_results[dataset_abbr].setdefault(base_model_abbr, [])

                    origin_path = get_infer_output_path(
                        model, dataset, osp.join(self.work_dir, 'results'))
                    if base_model_abbr != '':
                        temp_path, dataset_json_name = (
                            origin_path.rsplit('/', 1)[0],
                            origin_path.rsplit('/', 1)[1],
                        )
                        filepath = osp.join(
                            temp_path.rsplit('/', 1)[0],
                            base_model_abbr + '_' +
                            temp_path.rsplit('/', 1)[1] + '_judged-by--' +
                            judge_abbr,
                            dataset_json_name,
                        )
                    else:
                        filepath = osp.join(
                            origin_path.rsplit('/', 1)[0] + '_judged-by--' +
                            judge_abbr,
                            origin_path.rsplit('/', 1)[1],
                        )
                    if not osp.exists(filepath):
                        continue

                    result = mmengine.load(filepath)
                    result.pop('details', None)

                    # raw_results[dataset_abbr] = result
                    raw_results[dataset_abbr][base_model_abbr].extend(
                        result['matches'])

                    if 'error' in result:
                        self.logger.debug(
                            f'error in {model_abbr} {dataset_abbr} {result["error"]}'
                        )
                        continue

        # dataset_eval_mode: {dataset_abbr: eval_mode}
        dataset_eval_mode: Dict[str, str] = {}
        for dataset in self.dataset_cfgs:
            inferencer = (dataset.get('infer_cfg', {}).get('inferencer',
                                                           {}).get('type', ''))
            inferencer = (inferencer if isinstance(inferencer, str) else
                          inferencer.__name__)
            dataset_abbr = dataset_abbr_from_cfg(dataset)
            if 'GenInferencer' in inferencer:
                dataset_eval_mode[dataset_abbr] = 'gen'
            elif 'PPLInferencer' in inferencer:
                dataset_eval_mode[dataset_abbr] = 'ppl'
            elif 'LLInferencer' in inferencer:
                dataset_eval_mode[dataset_abbr] = 'll'
            else:
                dataset_eval_mode[dataset_abbr] = 'unknown'
                self.logger.warning(
                    f'unknown inferencer: {inferencer} - {dataset_abbr}')

        # return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
        return raw_results, dataset_eval_mode

    def _calculate_ratings(
        self,
        matches: Dict,
        base_model: str = None,
        groups: List[str] = None,
    ) -> Tuple[pd.DataFrame, Dict]:

        rating_system = self.rating_system
        num_bootstrap = self.num_bootstrap
        num_cpu = self.num_cpu
        with_control_vars = self.with_control_vars

        matches_df = pd.DataFrame(matches)

        num_battles = (matches_df['model_a'].value_counts().add(
            matches_df['model_b'].value_counts(), fill_value=0))

        # if rating_system == "bradleyterry":
        if with_control_vars:
            elo_rating_final, coef_final = compute_style_control(
                df=matches_df,
                baseline_model=base_model,
                normalize_style_features=self.normalize_style_features,
                control_variables=groups,
                odds_ratio=self.odds_ratio,
            )

            bootstrap_df, bootstrap_coef = compute_bootstrap_style_control(
                df=matches_df,
                num_round=num_bootstrap,
                baseline_model=base_model,
                normalize_style_features=self.normalize_style_features,
                control_variables=groups,
                odds_ratio=self.odds_ratio,
            )
        else:
            bootstrap_df = compute_bootstrap_bt(
                battles=matches_df,
                num_round=num_bootstrap,
                baseline_model=base_model,
                num_cpu=num_cpu,
            )
            elo_rating_final = compute_bt(
                df=matches_df,
                baseline_model=base_model,
            )

        # print(elo_rating_final)

        # elif rating_system == "elo":
        #     bootstrap_df = compute_bootstrap_elo(
        #         df=matches_df,
        #         num_round=num_bootstrap,
        #         num_cpu=num_cpu,
        #     )
        #     elo_rating_final = compute_elo(matches_df)

        model_rating_q025 = bootstrap_df.quantile(0.025)
        model_rating_q975 = bootstrap_df.quantile(0.975)

        # compute ranking based on CI
        model_order = list(elo_rating_final.index)

        ranking = {}
        for i, model_a in enumerate(model_order):
            ranking[model_a] = 1
            for j, model_b in enumerate(model_order):
                if i == j:
                    continue
                if model_rating_q025[model_b] > model_rating_q975[model_a]:
                    ranking[model_a] += 1

        leaderboard_table_df = pd.DataFrame(
            {
                'rating': elo_rating_final,
                'ranking_ub': pd.Series(ranking),
                'std_dev': bootstrap_df.std(),
                'rating_q975': model_rating_q975,
                'rating_q025': model_rating_q025,
                'num_battles': num_battles,
            }, )
        leaderboard_table_df['model_name'] = leaderboard_table_df.index

        leaderboard_table_df.sort_values(
            by=['rating'],
            ascending=False,
            inplace=True,
        )
        leaderboard_table_df['ranking'] = np.arange(
            1,
            len(leaderboard_table_df) + 1)

        if rating_system == 'bradleyterry' and with_control_vars:
            control_coefficients = {
                'bootstrap': bootstrap_coef,
                'final': coef_final,
            }
        else:
            control_coefficients = {'final': []}

        return leaderboard_table_df, control_coefficients['final']

    def _output_to_file(
        self,
        output_path,
        time_str: str,
        tables: Dict,
        metadata: Dict,
        judge_abbr: str,
        dataset_eval_mode: str,
    ):
        # Output to file
        if output_path is None:
            output_path = osp.join(self.work_dir, 'summary',
                                   f'summary_{time_str}.json')
            output_csv_path = osp.join(self.work_dir, 'summary',
                                       f'summary_{time_str}.csv')
        else:
            output_csv_path = output_path.replace('.json', '.csv')
        output_path = output_path.split(
            '.json')[0] + '_by_' + judge_abbr + '.json'

        output_dir = osp.split(output_path)[0]
        mmengine.mkdir_or_exist(output_dir)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=4)
        self.logger.info(f'write summary to {osp.abspath(output_path)}')

        prompt_version = {
            dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6]
            for d in self.dataset_cfgs
        }

        full_results = []
        for base_model_abbr, datasets in tables.items():
            base_model_results = []
            for dataset_abbr, table_df in datasets.items():
                table_df['dataset'] = dataset_abbr
                table_df['version'] = prompt_version.get(dataset_abbr, '-')
                table_df['metric'] = 'bt_rating'
                table_df['mode'] = dataset_eval_mode[dataset_abbr]
                table_df['base_model'] = base_model_abbr

                base_model_results.append(table_df)

            cur_base_model_result_df = pd.concat(base_model_results)
            full_results.append(cur_base_model_result_df)

        full_results_df = pd.concat(full_results)
        full_results_df = full_results_df[[
            'dataset',
            'version',
            'base_model',
            'metric',
            'mode',
            'ranking',
            'ranking_ub',
            'model_name',
            'predicted_win_rate',
            'rating',
            'rating_q975',
            'rating_q025',
            'std_dev',
            'num_battles',
        ]]

        output_csv_path = (output_csv_path.split('.csv')[0] + '_by_' +
                           judge_abbr + '.csv')

        with pd.option_context(
                'display.max_rows',
                20,
                'display.max_columns',
                20,
                'display.expand_frame_repr',
                False,
        ):
            print(full_results_df.reset_index(drop=True).round(2))

        full_results_df.to_csv(
            output_csv_path,
            index=False,
        )
        self.logger.info(f'write csv to {osp.abspath(output_csv_path)}')

    def flip_dict_levels(self, original_dict: Dict):
        """Flips the two levels of a nested dictionary so that
        dict[lvl1][lvl2] becomes dict[lvl2][lvl1].

        Args:
            original_dict (dict): The original nested dictionary.

        Returns:
            dict: The flipped dictionary.
        """
        flipped_dict = {}
        for lvl1, lvl2_dict in original_dict.items():
            for lvl2, value in lvl2_dict.items():
                if lvl2 not in flipped_dict:
                    flipped_dict[lvl2] = {}
                flipped_dict[lvl2][lvl1] = value

        return flipped_dict

    def predict_win_rate(
        self,
        ratings_df: pd.DataFrame,
        baseline_model: str,
        base: float = 10.0,
        scaling_factor: float = 400.0,
        round_win_rate: int = None,
    ) -> pd.DataFrame:
        """Predict each model's win rate against the baseline model from the
        Elo-scaled ratings.

        Args:
            ratings_df (pd.DataFrame): DataFrame containing model ratings with model names as index
            baseline_model (str): Name of baseline model to use as reference
            base (float): Base for the ELO formula (default 10.0)
            scaling_factor (float): Scaling factor for rating differences (default 400.0)
            round_win_rate (int, optional): Number of decimal places to round the predicted win rate to. Defaults to None (no rounding).

        Returns:
            pd.DataFrame: DataFrame with an additional column 'predicted_win_rate' containing
                the predicted win rate against the baseline model
        """
        if baseline_model not in ratings_df.index:
            raise ValueError(
                f'Baseline model {baseline_model} not found in ratings')

        # Create a copy of the ratings dataframe to avoid modifying the original
        result_df = ratings_df.copy()

        # Initialize the predicted_win_rate column with 0.5; the baseline model
        # keeps this value
        result_df['predicted_win_rate'] = 0.5

        # Get the baseline model's rating
        baseline_rating = ratings_df.loc[baseline_model, 'rating']

        # Calculate win probabilities for all models against the baseline
        for model, row in ratings_df.iterrows():
            if model != baseline_model:
                model_rating = row['rating']
                # ELO win probability formula
                win_rate = 1 / (1 + base**(
                    (baseline_rating - model_rating) / scaling_factor))
                result_df.loc[model, 'predicted_win_rate'] = win_rate

        if round_win_rate is not None:
            result_df['predicted_win_rate'] = result_df[
                'predicted_win_rate'].round(round_win_rate)

        return result_df
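
    # Worked example (a sketch): with base=10 and scaling_factor=400, a model
    # rated 1100 facing a baseline rated 1000 gets a predicted win rate of
    # 1 / (1 + 10 ** ((1000 - 1100) / 400)) ~= 0.64, while the baseline itself
    # keeps the 0.5 default.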

    def summarize(
        self,
        output_path: str = None,
        time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
    ):
        """Summarize evaluation results and format output table.

        Args:
            output_path (str, optional): Output path. Defaults to None.
            time_str (str, optional): Timestamp for file suffix. Defaults to
                datetime.now().strftime('%Y%m%d_%H%M%S').
        """
        all_scores_df_list = []
        all_scores = {}
        all_scores_ctrl_coefs = {}
        for judge_model in self.judge_models:
            control_coefficients = {}
            leaderboard_tables = {}

            judge_abbr = model_abbr_from_cfg(judge_model)

            # pick up results
            raw_results, dataset_eval_mode = self._pick_up_results(judge_abbr)

            all_matches = []
            for dataset_abbr, base_models in raw_results.items():
                control_coefficients[dataset_abbr] = {}
                leaderboard_tables[dataset_abbr] = {}

                dataset_matches = base_models[list(base_models)[0]]
                all_matches.extend(dataset_matches)

                for base_model_abbr, matches in base_models.items():
                    cur_table_df, cur_ctrl_coefs = self._calculate_ratings(
                        matches=matches,
                        base_model=base_model_abbr,
                        groups=self.groups,
                    )

                    # Calculate predicted win_rate
                    cur_table_df = self.predict_win_rate(
                        ratings_df=cur_table_df,
                        baseline_model=base_model_abbr,
                        round_win_rate=4,
                    )

                    control_coefficients[dataset_abbr][
                        base_model_abbr] = cur_ctrl_coefs
                    leaderboard_tables[dataset_abbr][
                        base_model_abbr] = cur_table_df

                    print('-' * 10 +
                          f"{dataset_abbr + ':' + base_model_abbr}\n" +
                          '-' * 10)
                    print(cur_table_df)
                    print(cur_ctrl_coefs)

            leaderboard_tables = self.flip_dict_levels(leaderboard_tables)

            # Output to .json / .csv files
            self._output_to_file(
                output_path=output_path,
                time_str=time_str,
                tables=leaderboard_tables,
                metadata=control_coefficients,
                judge_abbr=judge_abbr,
                dataset_eval_mode=dataset_eval_mode,
            )

            # Fit another BT model with the first base_model, combining
            # matches from all datasets
            cur_judge_all_scores_df, cur_judge_all_scores_ctrl_coefs = (
                self._calculate_ratings(
                    matches=all_matches,
                    base_model=list(base_models)[0],
                    groups=self.groups,
                ))
            # Calculate predicted win_rate
            cur_judge_all_scores_df = self.predict_win_rate(
                ratings_df=cur_judge_all_scores_df,
                baseline_model=list(base_models)[0],
                round_win_rate=4,
            )
            cur_judge_all_scores_df['judge'] = judge_abbr

            all_scores_df_list.append(cur_judge_all_scores_df)

            # Report predicted win rate or ratings
            if self.report_pred_win_rates:
                _scores = cur_judge_all_scores_df['predicted_win_rate']
            else:
                _scores = cur_judge_all_scores_df['rating']

            all_scores[judge_abbr] = pd.Series(
                _scores,
                index=cur_judge_all_scores_df['model_name'],
            ).to_dict()

            all_scores_ctrl_coefs[judge_abbr] = cur_judge_all_scores_ctrl_coefs

        all_scores_df = pd.concat(all_scores_df_list)

        output_path_all_scores_df = osp.join(
            self.work_dir, 'summary', f'summary_{time_str}_all_scores_df.csv')
        output_path_all_scores = osp.join(
            self.work_dir, 'summary', f'summary_{time_str}_all_scores.json')
        output_path_all_scores_ctrl_coefs = osp.join(
            self.work_dir, 'summary',
            f'summary_{time_str}_all_scores_ctrl_coefs.json')

        all_scores_df.to_csv(output_path_all_scores_df)

        with open(output_path_all_scores, 'w', encoding='utf-8') as f:
            json.dump(all_scores, f, ensure_ascii=False, indent=4)

        with open(output_path_all_scores_ctrl_coefs, 'w',
                  encoding='utf-8') as f:
            json.dump(all_scores_ctrl_coefs, f, ensure_ascii=False, indent=4)

        print(f'{all_scores_df=}')
        print(f'{all_scores=}')
        print(f'{all_scores_ctrl_coefs=}')

        return {'CompassArenaSubjBenchBradleyTerry': all_scores}