# flake8: noqa
# yapf: disable
import json
import math
import os
import os.path as osp
import re
from collections import defaultdict
from datetime import datetime
from itertools import product

import mmengine
import numpy as np
import pandas as pd
import tiktoken
from mmengine import ConfigDict
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

from opencompass.partitioners.sub_naive import remove_duplicate_pairs
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg

from .utils import get_outdir


def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
    """Fit Bradley-Terry ratings (scaled to an Elo-like range) to pairwise
    battle records via logistic regression."""
    models = pd.concat([df['model_a'], df['model_b']]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    # duplicate battles so that ties can be split into one win for each side
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]

    X = np.zeros([n, p])
    X[np.arange(n), models[df['model_a']]] = +math.log(BASE)
    X[np.arange(n), models[df['model_b']]] = -math.log(BASE)

    # one A win => two A wins (every battle is duplicated above)
    Y = np.zeros(n)
    Y[df['winner'] == 'model_a'] = 1.0

    # one tie => one A win + one B win: mark tie rows (including
    # 'tie (bothbad)') in the first copy only, so each tie contributes
    # exactly one win to each side
    tie_idx = (df['winner'] == 'tie') | (df['winner'] == 'tie (bothbad)')
    tie_idx[len(tie_idx) // 2:] = False
    Y[tie_idx] = 1.0

    # tol may need to be relaxed when the judge model is not GPT-4
    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
    lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING

    # anchor the ratings so that gpt4-0314 = 1000
    if 'gpt4-0314' in models.index:
        elo_scores += 1000 - elo_scores[models['gpt4-0314']]
    return pd.Series(elo_scores,
                     index=models.index).sort_values(ascending=False)


def get_bootstrap_result(battles, func_compute_elo, num_round):
    """Bootstrap the rating computation by resampling battles with
    replacement for `num_round` rounds."""
    rows = []
    for _ in tqdm(range(num_round), desc='bootstrap'):
        rows.append(func_compute_elo(battles.sample(frac=1.0, replace=True)))
    df = pd.DataFrame(rows)
    return df[df.median().sort_values(ascending=False).index]


def preety_print_two_ratings(ratings_1, ratings_2, column_names):
    df = pd.DataFrame(
        [[n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()],
        columns=['Model', column_names[0], column_names[1]]).sort_values(
            column_names[0], ascending=False).reset_index(drop=True)
    df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
    df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
    df.index = df.index + 1
    return df


def visualize_bootstrap_scores(df, title):
    # plotly is only needed for this optional visualization, so import it
    # lazily instead of making it a hard dependency of the summarizer.
    import plotly.express as px

    bars = pd.DataFrame(
        dict(lower=df.quantile(.025),
             rating=df.quantile(.5),
             upper=df.quantile(.975))).reset_index(names='model').sort_values(
                 'rating', ascending=False)
    bars['error_y'] = bars['upper'] - bars['rating']
    bars['error_y_minus'] = bars['rating'] - bars['lower']
    bars['rating_rounded'] = np.round(bars['rating'], 2)
    fig = px.scatter(bars,
                     x='model',
                     y='rating',
                     error_y='error_y',
                     error_y_minus='error_y_minus',
                     text='rating_rounded',
                     title=title)
    fig.update_layout(xaxis_title='Model', yaxis_title='Rating', height=600)
    return fig


def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
    """Convert ratings into a pairwise expected win-rate table."""
    names = sorted(list(elo_ratings.keys()))
    wins = defaultdict(lambda: defaultdict(lambda: 0))
    for a in names:
        for b in names:
            ea = 1 / (1 + BASE**((elo_ratings[b] - elo_ratings[a]) / SCALE))
            wins[a][b] = ea
            wins[b][a] = 1 - ea

    data = {
        a: [wins[a][b] if a != b else np.nan for b in names]
        for a in names
    }

    df = pd.DataFrame(data, index=names)
    df.index.name = 'model_a'
    df.columns.name = 'model_b'
    return df.T

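# The helpers above implement the Arena-Hard style rating pipeline. A minimal,
# hypothetical usage sketch (model names and outcomes below are made up for
# illustration only):
#
#     battles = pd.DataFrame([
#         {'model_a': 'model-x', 'model_b': 'gpt4-0314', 'winner': 'model_a'},
#         {'model_a': 'model-x', 'model_b': 'gpt4-0314', 'winner': 'tie'},
#         {'model_a': 'gpt4-0314', 'model_b': 'model-y', 'winner': 'model_a'},
#     ])
#     ratings = compute_mle_elo(battles)                           # point estimates
#     boot = get_bootstrap_result(battles, compute_mle_elo, 100)   # 100 resamples
#     win_rates = predict_win_rate(ratings.to_dict())              # pairwise table
#
# compute_mle_elo fits a logistic regression whose coefficients are scaled to
# the usual Elo range, and predict_win_rate maps rating differences back to
# expected win probabilities via 1 / (1 + BASE ** ((R_b - R_a) / SCALE)).
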
def model_abbr_from_cfg_used_in_summarizer(model):
    if model.get('summarizer_abbr', None):
        return model['summarizer_abbr']
    else:
        return model_abbr_from_cfg(model)


def post_process_compass_arena(s):
    # Extract the judge verdict, e.g. '[[A>>B]]' -> 'A>>B'
    if result := re.findall(r'\[\[([AB<>=]+)\]\]', s):
        return result[0]
    else:
        return None


def get_win_rate_column(df, column, baseline='gpt4-0314'):
    to_dict = df[['model', column]].set_index('model').to_dict()[column]
    win_rate_table = predict_win_rate(to_dict)
    return win_rate_table[baseline].fillna(0.5).apply(
        lambda x: round(x * 100, 2))


def get_battles_from_judgment(dataset, subdir_path, post_process, WEIGHT=3):
    """Turn judge outputs into battle records; decisive verdicts (A>>B, B>>A)
    are counted WEIGHT times."""
    arena_hard_battles = pd.DataFrame()
    dataset_abbr = dataset_abbr_from_cfg(dataset)
    filename = osp.join(subdir_path, dataset_abbr + '.json')
    partial_filename = osp.join(subdir_path, dataset_abbr + '_0.json')
    if osp.exists(osp.realpath(filename)):
        result = mmengine.load(filename)
    elif osp.exists(osp.realpath(partial_filename)):
        # judgments were written in numbered shards:
        # <abbr>_0.json, <abbr>_1.json, ...
        filename = partial_filename
        result = {}
        i = 1
        partial_dict_flag = 0
        while osp.exists(osp.realpath(filename)):
            res = mmengine.load(filename)
            for k, v in res.items():
                result[partial_dict_flag] = v
                partial_dict_flag += 1
            filename = osp.join(subdir_path,
                                dataset_abbr + '_' + str(i) + '.json')
            i += 1
    else:
        result = {}

    if len(result) == 0:
        print('*' * 100)
        print('There are no results for ' + filename + ' or ' +
              partial_filename)
        print('*' * 100)
    assert len(result) > 0

    for k, v in result.items():
        output = {
            'model_a': v['gold']['answer1'],
            'model_b': v['gold']['answer2']
        }

        processed_judge = post_process(v['prediction'])
        if processed_judge is not None:
            weight = 1
            if processed_judge == 'A=B':
                output['winner'] = 'tie'
            elif processed_judge == 'A>B':
                output['winner'] = 'model_a'
            elif processed_judge == 'A>>B':
                output['winner'] = 'model_a'
                weight = WEIGHT
            elif processed_judge == 'B>A':
                output['winner'] = 'model_b'
            elif processed_judge == 'B>>A':
                output['winner'] = 'model_b'
                weight = WEIGHT
            else:
                weight = 0
        else:
            weight = 0

        if weight:
            arena_hard_battles = pd.concat(
                [arena_hard_battles,
                 pd.DataFrame([output] * weight)])
    return arena_hard_battles


class ArenaHardSummarizer:
    """Do the subjectivity analysis based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
    """

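    # Illustrative only: the fields this summarizer reads from `config`
    # (the key names come from the accesses below; the layout of a complete
    # OpenCompass config is otherwise assumed):
    #
    #     eval.partitioner.base_models      : baseline model cfgs (e.g. gpt4-0314)
    #     eval.partitioner.compare_models   : model cfgs to be ranked
    #     eval.partitioner.meta_judge_model : optional meta-judge cfg
    #     judge_models                      : list of judge model cfgs
    #     datasets                          : list of dataset cfgs
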
""" def __init__(self, config: ConfigDict, judge_type='general', check_pos_bias=True, summary_type='single') -> None: self.tasks = [] self.cfg = config self.base_models = self.cfg['eval']['partitioner']['base_models'] self.compare_models = self.cfg['eval']['partitioner']['compare_models'] self.judge_models = self.cfg.get('judge_models', None) self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None) self.judge_type = judge_type assert self.judge_type in ['general'] self.judge_map = {'general': post_process_compass_arena} self.judge_function = self.judge_map[self.judge_type] self.check_pos_bias = check_pos_bias self.summary_type = summary_type def get_score(self, time_str): output_dir, results_folder = get_outdir(self.cfg, time_str) model_combinations = list(product(self.base_models, self.compare_models)) unique_combinations = remove_duplicate_pairs([combo for combo in model_combinations if combo[0] != combo[1]]) if self.meta_judge_model is not None: self.judge_models.append(self.meta_judge_model) scores = {} for idx, judge_model_cfg in enumerate(self.judge_models): judge_model = model_abbr_from_cfg(judge_model_cfg) for dataset in self.cfg['datasets']: dataset_abbr = dataset_abbr_from_cfg(dataset) battles = pd.DataFrame() print('Turning judgment results into battles...') for model_pair in unique_combinations: model1 = model_pair[0]['abbr'] model2 = model_pair[1]['abbr'] if idx == len(self.judge_models): subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model else: subdir = model1 + '_' + model2 + '_judged-by--' + judge_model subdir_path = os.path.join(results_folder, subdir) if not os.path.isdir(subdir_path): print(subdir_path + ' is not exist! please check!') continue new_battle = get_battles_from_judgment(dataset, subdir_path, self.judge_function) battles = pd.concat([battles, new_battle], ignore_index=True) battles.to_json(os.path.join(output_dir,'arena_hard_battles_judged-by--'+ judge_model+'.jsonl'), lines=True, orient='records') bootstrap_online_elo = compute_mle_elo(battles) np.random.seed(42) bootstrap_elo_lu = get_bootstrap_result(battles, compute_mle_elo, 100) bootstrap_elo_lu.to_json(os.path.join(output_dir,'bootstrapping_results'+ judge_model+'.jsonl'), lines=True, orient='records') stats = pd.DataFrame() stats['results'] = None stats['results'] = stats['results'].astype('object') for i, model in enumerate(bootstrap_online_elo.index): assert model in bootstrap_elo_lu.columns stats.at[i, 'model'] = model stats.at[i, 'score'] = bootstrap_online_elo[model] stats.at[i, 'lower'] = np.percentile(bootstrap_elo_lu[model], 2.5) stats.at[i, 'upper'] = np.percentile(bootstrap_elo_lu[model], 97.5) if model == 'gpt4-0314': stats.at[i, 'avg_tokens'] = 423 else: with open(os.path.join(output_dir.split('summary')[0], 'predictions', model, dataset_abbr+'.json'), 'r') as f: model_preds = json.load(f) pred_length = 0 for k, v in model_preds.items(): pred_length += len(tiktoken.encoding_for_model('gpt-3.5-turbo').encode(v['prediction'])) pred_length /= len(model_preds) stats.at[i, 'avg_tokens'] = pred_length stats.at[i, 'results'] = bootstrap_elo_lu[model].tolist() stats.sort_values(by='model', inplace=True) stats['score'] = get_win_rate_column(stats, 'score', 'gpt4-0314').tolist() stats['lower'] = get_win_rate_column(stats, 'lower', 'gpt4-0314').tolist() stats['upper'] = get_win_rate_column(stats, 'upper', 'gpt4-0314').tolist() decimal = 1 stats.sort_values(by='score', ascending=False, inplace=True) for _, row in stats.iterrows(): interval = str((round(row['lower'] 
- row['score'], decimal), round(row['upper'] - row['score'], decimal))) print(f"{row['model'] : <30} | score: {round(row['score'], decimal) : ^5} | 95% CI: {interval : ^12} | average #tokens: {int(row['avg_tokens'])}") stats.to_json(os.path.join(output_dir,'arena_hard_leaderboard_judged-by--'+judge_model+'.json'), orient='records', indent=4) stats.to_csv(os.path.join(output_dir,'arena_hard_leaderboard_judged-by--'+judge_model+'.csv')) def summarize( self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'), ): """Summarize the subjectivity analysis based on evaluation results. Args: time_str (str): Timestamp for file naming. Returns: pd.DataFrame: The summary results. """ self.get_score(time_str)
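
# A minimal, hypothetical usage sketch (assumes `cfg` is the fully resolved
# OpenCompass config of a subjective Arena-Hard run; the config path below is
# made up, and the timestamp format matches the default in `summarize`):
#
#     from mmengine import Config
#
#     cfg = Config.fromfile('configs/eval_arena_hard.py')  # hypothetical path
#     summarizer = ArenaHardSummarizer(cfg)
#     summarizer.summarize(time_str='20240101_000000')
#
# `summarize` reads the per-pair judge outputs under the run's results folder,
# converts them into battles, fits the ratings, and writes
# `arena_hard_leaderboard_judged-by--<judge>.json/.csv` into the summary dir.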