import json
import math
import os.path as osp
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.linear_model import LogisticRegression

from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset
from .utils import get_judgeanswer_and_reference


@LOAD_DATASET.register_module()
class ArenaHardDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        path = get_data_path(path, local_mode=True)
        filename = osp.join(path, f'{name}.jsonl')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as file:
            for line in file:
                problem = json.loads(line)
                question_id = problem['question_id']
                cluster = problem['cluster']
                # Arena-Hard prompts contain only one turn.
                question = problem['turns'][0]['content']
                raw_data.append({
                    'question': question,
                    'capability': cluster,
                    'judge': {
                        'capability': cluster,
                        'question': question,
                        'question_id': question_id
                    }
                })
        return Dataset.from_list(raw_data)


def post_process_arenahard(completion):
    """Extract the judge verdict, e.g. ``[[A>>B]]``, from a prediction."""
    s = completion['prediction']
    if result := re.findall(r'\[\[([AB<>=]+)\]\]', s):
        return result[0]
    else:
        return None


def get_battles_from_judgment(judged_answers, references, WEIGHT=3):
    """Turn judge verdicts into battle rows; decisive verdicts
    (``A>>B`` / ``B>>A``) are duplicated ``WEIGHT`` times."""
    arena_hard_battles = pd.DataFrame()
    for judged_answer, reference in zip(judged_answers, references):
        output = {
            'model_a': reference['answer1'],
            'model_b': reference['answer2']
        }

        if judged_answer is not None:
            weight = 1
            if judged_answer == 'A=B':
                output['winner'] = 'tie'
            elif judged_answer == 'A>B':
                output['winner'] = 'model_a'
            elif judged_answer == 'A>>B':
                output['winner'] = 'model_a'
                weight = WEIGHT
            elif judged_answer == 'B>A':
                output['winner'] = 'model_b'
            elif judged_answer == 'B>>A':
                output['winner'] = 'model_b'
                weight = WEIGHT
            else:
                weight = 0
        else:
            weight = 0

        if weight:
            arena_hard_battles = pd.concat(
                [arena_hard_battles,
                 pd.DataFrame([output] * weight)])
    return arena_hard_battles


def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
    """Fit a Bradley-Terry model via logistic regression and map its
    coefficients onto an Elo-like scale."""
    models = pd.concat([df['model_a'], df['model_b']]).unique()
    models = pd.Series(np.arange(len(models)), index=models)

    # duplicate battles so a tie can be split into one win for each side
    df = pd.concat([df, df], ignore_index=True)
    p = len(models.index)
    n = df.shape[0]

    X = np.zeros([n, p])
    X[np.arange(n), models[df['model_a']]] = +math.log(BASE)
    X[np.arange(n), models[df['model_b']]] = -math.log(BASE)

    # one A win => two A wins (battles were duplicated above)
    Y = np.zeros(n)
    Y[df['winner'] == 'model_a'] = 1.0

    # one tie => one A win + one B win:
    # mark only the first copy of each tie / tie (both bad) battle
    tie_idx = (df['winner'] == 'tie') | (df['winner'] == 'tie (bothbad)')
    tie_idx[len(tie_idx) // 2:] = False
    Y[tie_idx] = 1.0

    lr = LogisticRegression(
        fit_intercept=False, penalty=None, tol=1e-8
    )  # tol may need a small value when the judge model is not GPT-4
    lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING

    # set the anchor: gpt4-0314 = 1000
    if 'gpt4-0314' in models.index:
        elo_scores += 1000 - elo_scores[models['gpt4-0314']]
    return pd.Series(elo_scores,
                     index=models.index).sort_values(ascending=False)


def get_bootstrap_result(battles, func_compute_elo, num_round):
    """Bootstrap the rating fit ``num_round`` times by resampling battles."""
    rows = []
    for _ in range(num_round):
        rows.append(func_compute_elo(battles.sample(frac=1.0, replace=True)))
    df = pd.DataFrame(rows)
    return df[df.median().sort_values(ascending=False).index]


def pretty_print_two_ratings(ratings_1, ratings_2, column_names):
    df = pd.DataFrame(
        [[n, ratings_1[n], ratings_2[n]] for n in ratings_1.keys()],
        columns=['Model', column_names[0], column_names[1]]).sort_values(
            column_names[0], ascending=False).reset_index(drop=True)
    df[column_names[0]] = (df[column_names[0]] + 0.5).astype(int)
    df[column_names[1]] = (df[column_names[1]] + 0.5).astype(int)
    df.index = df.index + 1
    return df


def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
    """Convert ratings into a pairwise win-probability table."""
    names = sorted(list(elo_ratings.keys()))
    wins = defaultdict(lambda: defaultdict(lambda: 0))
    for a in names:
        for b in names:
            ea = 1 / (1 + BASE**((elo_ratings[b] - elo_ratings[a]) / SCALE))
            wins[a][b] = ea
            wins[b][a] = 1 - ea

    data = {
        a: [wins[a][b] if a != b else np.nan for b in names]
        for a in names
    }

    df = pd.DataFrame(data, index=names)
    df.index.name = 'model_a'
    df.columns.name = 'model_b'
    return df.T


def get_win_rate_column(df, column, baseline='gpt4-0314'):
    to_dict = df[['model', column]].set_index('model').to_dict()[column]
    win_rate_table = predict_win_rate(to_dict)
    return win_rate_table[baseline].fillna(0.5).apply(
        lambda x: round(x * 100, 2))


@DICT_POSTPROCESSORS.register_module('arenahard')
def arenahard_postprocess(output: dict, output_path: str) -> dict:
    judged_answers, references = get_judgeanswer_and_reference(
        output, output_path, post_process_arenahard)

    if len(judged_answers) == 0:
        # no parsable judgments, so there is nothing to score
        return {'score': None, 'details': output}

    battles = get_battles_from_judgment(judged_answers, references)

    bootstrap_online_elo = compute_mle_elo(battles)

    np.random.seed(42)
    bootstrap_elo_lu = get_bootstrap_result(battles, compute_mle_elo, 100)

    stats = pd.DataFrame()
    stats['results'] = None
    stats['results'] = stats['results'].astype('object')

    for i, model in enumerate(bootstrap_online_elo.index):
        assert model in bootstrap_elo_lu.columns
        stats.at[i, 'model'] = model
        stats.at[i, 'score'] = bootstrap_online_elo[model]
        # stats.at[i, 'lower'] = np.percentile(bootstrap_elo_lu[model], 2.5)
        # stats.at[i, 'upper'] = np.percentile(bootstrap_elo_lu[model], 97.5)
        # stats.at[i, 'results'] = bootstrap_elo_lu[model].tolist()

    # report the win rate against the baseline rather than the raw rating
    stats['score'] = get_win_rate_column(stats, 'score', 'gpt4-0314').tolist()
    models = stats['model']
    scores = stats['score']
    # skip the baseline's win rate against itself (always 50.0)
    if models[0] == 'gpt4-0314':
        score = scores[1]
    else:
        score = scores[0]

    results = {'score': score, 'details': output}
    return results
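

# ---------------------------------------------------------------------------
# Minimal usage sketch (an illustrative addition, not part of the OpenCompass
# pipeline): shows how ``compute_mle_elo`` fits Bradley-Terry ratings from a
# tiny synthetic battle set and how ``predict_win_rate`` turns those ratings
# into pairwise win probabilities. 'model_x' and 'model_y' are hypothetical
# model names; 'gpt4-0314' is the anchor used above.
if __name__ == '__main__':
    demo_battles = pd.DataFrame([
        {'model_a': 'model_x', 'model_b': 'gpt4-0314', 'winner': 'model_a'},
        {'model_a': 'model_x', 'model_b': 'gpt4-0314', 'winner': 'model_b'},
        {'model_a': 'model_x', 'model_b': 'gpt4-0314', 'winner': 'tie'},
        {'model_a': 'model_y', 'model_b': 'gpt4-0314', 'winner': 'model_a'},
        {'model_a': 'model_y', 'model_b': 'gpt4-0314', 'winner': 'model_b'},
        {'model_a': 'model_y', 'model_b': 'gpt4-0314', 'winner': 'model_b'},
    ])
    # gpt4-0314 is present, so its rating is anchored to 1000
    ratings = compute_mle_elo(demo_battles)
    print(ratings)
    # the 'gpt4-0314' column gives each model's expected win rate
    # against the anchor
    print(predict_win_rate(ratings.to_dict()))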