import copy as cp
import io
import json
import math
import multiprocessing as mp
import os
import os.path as osp
import pickle
import random as rd
from collections import defaultdict
from datetime import datetime
from typing import List, Optional

try:
    import cv2
except ImportError:
    import traceback

    traceback.print_exc()
    raise ImportError(
        'Import cv2 failed. Please install it with '
        '"pip install opencv-python-headless" and try again.\n\n'
        'If the prompt `ImportError: libGL.so.1` appears,'
        ' you may consider one of the following two methods:\n'
        'Method 1: Uninstall opencv-python, then install '
        'opencv-python-headless\n'
        'pip uninstall opencv-python; pip install opencv-python-headless\n\n'
        'Method 2: Install the missing dynamic link libraries\n'
        'sudo apt-get update; sudo apt-get install -y libgl1 libglib2.0-0')

import mmengine
import numpy as np
import pandas as pd
from mmengine import ConfigDict
from tabulate import tabulate
from tqdm import tqdm

from opencompass.utils import build_dataset_from_cfg, dataset_abbr_from_cfg


def dump(data, f):
    """Dump data to file, dispatching on the file suffix."""

    def dump_pkl(data, pth):
        with open(pth, 'wb') as fout:
            pickle.dump(data, fout)

    def dump_json(data, pth):
        with open(pth, 'w', encoding='utf-8') as fout:
            json.dump(data, fout, indent=4)

    def dump_jsonl(data, f):
        lines = [json.dumps(x, ensure_ascii=False) for x in data]
        with open(f, 'w', encoding='utf8') as fout:
            fout.write('\n'.join(lines))

    def dump_xlsx(data, f):
        data.to_excel(f, index=False)

    def dump_csv(data, f):
        data.to_csv(f, index=False)

    def dump_tsv(data, f):
        data.to_csv(f, sep='\t', index=False)

    handlers = dict(pkl=dump_pkl,
                    json=dump_json,
                    jsonl=dump_jsonl,
                    xlsx=dump_xlsx,
                    csv=dump_csv,
                    tsv=dump_tsv)
    suffix = f.split('.')[-1]
    return handlers[suffix](data, f)


def load(f):
    """Load data from file, dispatching on the file suffix."""

    def load_pkl(pth):
        with open(pth, 'rb') as fin:
            return pickle.load(fin)

    def load_json(pth):
        with open(pth, 'r', encoding='utf-8') as fin:
            return json.load(fin)

    def load_jsonl(f):
        with open(f, encoding='utf-8') as fin:
            lines = [x.strip() for x in fin.readlines()]
        if lines and lines[-1] == '':
            lines = lines[:-1]
        return [json.loads(x) for x in lines]

    def load_xlsx(f):
        return pd.read_excel(f)

    def load_csv(f):
        return pd.read_csv(f)

    def load_tsv(f):
        return pd.read_csv(f, sep='\t')

    handlers = dict(pkl=load_pkl,
                    json=load_json,
                    jsonl=load_jsonl,
                    xlsx=load_xlsx,
                    csv=load_csv,
                    tsv=load_tsv)
    suffix = f.split('.')[-1]
    return handlers[suffix](f)


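# Illustrative round trip (hypothetical paths): the handler is chosen purely
# by file suffix, so an unrecognized suffix raises a KeyError.
#
#   dump({'a': 1}, '/tmp/demo.json')                    # -> dump_json
#   assert load('/tmp/demo.json') == {'a': 1}
#   dump(pd.DataFrame({'x': [1, 2]}), '/tmp/demo.tsv')  # -> dump_tsv

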
def double_log(msg, fout=None):
    """Print a message and optionally write it to a file.

    Args:
        msg (str): The message to be printed and, if fout is provided,
            written to the file.
        fout (file, optional): A file object to write the message
            to (default is None).

    Returns:
        None
    """
    print(msg)
    if fout is not None:
        fout.write(str(msg) + '\n')
        fout.flush()


def stack_image(imgs, shape=(1, 3)):
    """Stack a list of images into a grid.

    Args:
        imgs (list): A list of image arrays to be stacked.
        shape (tuple): A tuple specifying the grid shape
            (rows, columns) for the stacked images (default is (1, 3)).

    Returns:
        numpy.ndarray: The stacked image grid.
    """
    total_imgs = shape[0] * shape[1]
    assert len(imgs) <= total_imgs
    h, w, _ = imgs[0].shape
    imgs = [cv2.resize(im, dsize=(w, h)) for im in imgs]
    # Pad the grid with mid-gray placeholder tiles.
    for i in range(total_imgs - len(imgs)):
        imgs.append(np.ones((h, w, 3)).astype(np.uint8) * 127)
    rows = []
    for i in range(shape[0]):
        if shape[1] == 1:
            rows.append(imgs[i])
        else:
            rows.append(np.hstack(imgs[i * shape[1]:(i + 1) * shape[1]]))
    if shape[0] == 1:
        return rows[0]
    else:
        return np.vstack(rows)


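# Example (hypothetical frames): stacking five images with shape=(2, 3)
# resizes every image to the first image's (h, w), appends one mid-gray
# (value 127) placeholder tile to fill the 2 x 3 grid, and returns a
# mosaic of shape (2 * h, 3 * w, 3).

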
def simple_count(data_in, lang=None, capa=None):
    """Count outcomes (win, lose, both, neither) per model in a dataset.

    Args:
        data_in (pd.DataFrame): The input data containing 'A', 'B', and
            'extracted' columns.
        lang (str, optional): Filter by language (default is None).
        capa (str, optional): Filter by capability (default is None).

    Returns:
        dict: A dictionary containing outcome counts for each
            model appearing in 'A' and 'B'.
    """
    data = cp.deepcopy(data_in)
    if lang is not None and 'lang' in data:
        data = data[data['lang'] == lang]
    if capa is not None and 'capability' in data:
        flag = [(capa in x) for x in data['capability']]
        data = data[flag]

    # Verdict 'A' is a win for the first model and a loss for the second;
    # 'C' (both good) and 'D' (neither good) count for both models.
    ans_map = dict(A=['win', 'lose'],
                   B=['lose', 'win'],
                   C=['both', 'both'],
                   D=['neither', 'neither'])
    A, B, ext = data['A'], data['B'], data['extracted']
    res = {}
    for a, b, choice in zip(A, B, ext):
        if a not in res:
            res[a] = defaultdict(lambda: 0)
        if b not in res:
            res[b] = defaultdict(lambda: 0)
        ak, bk = ans_map[choice]
        res[a][ak] += 1
        res[b][bk] += 1
    return res


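# Illustrative bookkeeping (hypothetical models): for a row with
# A='model_x', B='model_y', extracted='A', ans_map['A'] is ['win', 'lose'],
# so res['model_x']['win'] and res['model_y']['lose'] each gain one; a 'C'
# verdict would instead increment 'both' for both models.

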
def calc_win_rate(data_copy, models, lang=None, capa=None):
    """Calculate win rates and tie rates between models based on the given
    data.

    Args:
        data_copy (pd.DataFrame): The input data containing
            'A', 'B', 'extracted', 'lang', and 'capability' columns.
        models (list): List of model names to calculate rates for.
        lang (str, optional): Filter data by language (default is None).
        capa (str, optional): Filter data by capability (default is None).

    Returns:
        pd.DataFrame, pd.DataFrame: DataFrames containing win rates
            (cnt) and tie rates (ff) between models.
    """
    data = cp.deepcopy(data_copy)
    if lang is not None and 'lang' in data:
        data = data[data['lang'] == lang]
    if capa is not None and 'capability' in data:
        flag = [(capa in x) for x in data['capability']]
        data = data[flag]

    win = defaultdict(lambda: 0)
    tie = defaultdict(lambda: 0)
    lose = defaultdict(lambda: 0)

    for i in range(len(data)):
        v = data.iloc[i]
        o = v['extracted']
        key = v['A'] + ';' + v['B']

        if o == 'A':
            win[key] += 1
        if o == 'B':
            lose[key] += 1
        if o in ['C', 'D']:
            tie[key] += 1

    nmodel = len(models)
    cnt = pd.DataFrame({k: [0] * nmodel for k in models}, index=models)
    ff = pd.DataFrame({k: [0] * nmodel for k in models}, index=models)
    tot = pd.DataFrame({k: [0] * nmodel for k in models}, index=models)
    # Iterate over every pair seen in any outcome, not just pairs with at
    # least one 'A' win, so lose-only and tie-only pairs are also counted.
    for k in set(win) | set(lose) | set(tie):
        m1, m2 = k.split(';')
        cnt.at[m1, m2] += win[k]
        cnt.at[m2, m1] += lose[k]
        ff.at[m1, m2] += tie[k]
        ff.at[m2, m1] += tie[k]
        tot.at[m1, m2] += tie[k] + win[k] + lose[k]
        tot.at[m2, m1] += tie[k] + win[k] + lose[k]

    for m1 in models:
        for m2 in models:
            if tot.at[m1, m2]:
                cnt.at[m1, m2] /= tot.at[m1, m2]
                ff.at[m1, m2] /= tot.at[m1, m2]
    return cnt, ff


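# Reading the output (hypothetical models m1, m2): cnt.at[m1, m2] is the
# fraction of m1-vs-m2 comparisons that m1 won outright and ff.at[m1, m2]
# the fraction judged a tie ('C' or 'D'), so for any compared pair
# cnt.at[m1, m2] + ff.at[m1, m2] + cnt.at[m2, m1] == 1.

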
def find_inconsistent(data, vals=['A', 'B', 'C', 'D']):
    """Find inconsistent data entries based on specified values.

    An entry is consistent with its swapped counterpart (the same question
    with models A and B exchanged) when the verdicts mirror each other:
    'A' <-> 'B', or the same tie verdict on both sides.

    Args:
        data (pd.DataFrame): The input data containing
            'cmp_index' and 'extracted' columns.
        vals (list, optional): List of possible values
            (default is ['A', 'B', 'C', 'D']).

    Returns:
        pd.DataFrame, pd.DataFrame: DataFrames containing
            consistent (cons) and inconsistent (incons) data entries.
    """
    assert 'extracted' in data
    cons, incons = [], []
    pred_map = {x: y for x, y in zip(data['cmp_index'], data['extracted'])}
    for k in data['cmp_index']:
        parts = k.split(';')
        # cmp_index has the form '<question index>;<model A>;<model B>';
        # swapping the last two parts gives the counterpart comparison.
        kct = ';'.join([parts[0], parts[2], parts[1]])
        if kct not in pred_map:
            cons.append(k)
            continue
        cons_tups = [(vals[0], vals[1]), (vals[1], vals[0]),
                     (vals[2], vals[2]), (vals[3], vals[3])]
        flag = True
        for tup in cons_tups:
            if pred_map[k] == tup[0] and pred_map[kct] == tup[1]:
                flag = False
                cons.append(k)
                break
        if flag:
            incons.append(k)
    cons, incons = data[data['cmp_index'].isin(cons)], data[
        data['cmp_index'].isin(incons)]
    return cons, incons


def extract_vispair(data, vals='ABCD', vispair=None):
    """Extract comparisons involving a specific model pair and write them to
    Excel files, one file per outcome (win / lose / both / neither).

    Args:
        data (pd.DataFrame): The input data containing
            'A', 'B', and 'extracted' columns.
        vals (str, optional): A string of possible
            values (default is 'ABCD').
        vispair (tuple, optional): A tuple specifying the pair
            of models to extract (e.g., ('A', 'B')).

    Returns:
        None
    """
    assert vispair is not None
    ma, mb = vispair
    indices_map = defaultdict(list)
    lt = len(data)
    for i in range(lt):
        item = data.iloc[i]
        if (item['A'] == ma and item['B'] == mb
                and item['extracted'] == vals[0]):
            indices_map[f'{ma}_win_{mb}'].append(i)

        if (item['A'] == mb and item['B'] == ma
                and item['extracted'] == vals[1]):
            indices_map[f'{ma}_win_{mb}'].append(i)

        if (item['A'] == ma and item['B'] == mb
                and item['extracted'] == vals[1]):
            indices_map[f'{ma}_lose_{mb}'].append(i)

        if (item['A'] == mb and item['B'] == ma
                and item['extracted'] == vals[0]):
            indices_map[f'{ma}_lose_{mb}'].append(i)

        if (set([item['A'], item['B']]) == set([ma, mb])
                and item['extracted'] == vals[2]):
            indices_map[f'{ma}_both_{mb}'].append(i)

        if (set([item['A'], item['B']]) == set([ma, mb])
                and item['extracted'] == vals[3]):
            indices_map[f'{ma}_neither_{mb}'].append(i)

    for k in indices_map:
        data_sub = data.iloc[indices_map[k]]
        dump(data_sub, f'{k}.xlsx')


def get_shape(lt):
    """Calculate the grid shape (rows, columns) for a given number of
    elements.

    Args:
        lt (int): The total number of elements in the grid.

    Returns:
        tuple: A tuple containing the calculated number
            of rows and columns.
    """
    h = int(math.sqrt(lt))
    w = lt // h
    if h * w < lt:
        w += 1
    return h, w


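# Quick examples: get_shape(9) -> (3, 3), get_shape(10) -> (3, 4),
# get_shape(5) -> (2, 3); after the adjustment, rows * columns >= lt always
# holds, so every element gets a cell.

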
def compute_elo_score(data,
                      K=32,
                      SCALE=400,
                      BASE=10,
                      INIT_RATING=1000,
                      seed=2680,
                      vals='ABCD'):
    """Compute Elo ratings for models based on the provided data.

    Args:
        data (pd.DataFrame): The input data containing
            'A', 'B', and 'extracted' columns.
        K (float, optional): The K factor for the Elo
            update (default is 32).
        SCALE (float, optional): The Elo scale factor (default is 400).
        BASE (float, optional): The Elo base factor (default is 10).
        INIT_RATING (float, optional): The initial rating
            for models (default is 1000).
        seed (int, optional): Random seed for shuffling
            battles (default is 2680).
        vals (str, optional): A string of possible values
            (default is 'ABCD').

    Returns:
        dict: A dictionary containing model ratings.
    """
    rating = defaultdict(lambda: INIT_RATING)
    battles = []
    lt = len(data)
    # Map verdicts to scores for model A: win 1, lose 0, tie ('C'/'D') 0.5;
    # anything unrecognized also counts as 0.5.
    score_map = {vals[0]: 1, vals[1]: 0, vals[2]: 0.5, vals[3]: 0.5}
    for i in range(lt):
        item = data.iloc[i]
        score = score_map.get(item['extracted'], 0.5)
        battles.append((item['A'], item['B'], score))

    rd.seed(seed)
    rd.shuffle(battles)

    for m0, m1, v in battles:
        ra = rating[m0]
        rb = rating[m1]
        # Expected scores under the logistic Elo model.
        ea = 1 / (1 + BASE**((rb - ra) / SCALE))
        eb = 1 / (1 + BASE**((ra - rb) / SCALE))
        sa = v
        rating[m0] += K * (sa - ea)
        rating[m1] += K * (1 - sa - eb)
    return dict(rating)


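# Worked single update (hypothetical ratings): with the defaults K=32,
# SCALE=400, BASE=10, two models both rated 1000 have expected scores
# ea = eb = 1 / (1 + 10 ** 0) = 0.5. If model A wins (sa = 1.0):
#   rating[A] = 1000 + 32 * (1.0 - 0.5) = 1016.0
#   rating[B] = 1000 + 32 * (0.0 - 0.5) = 984.0
# A tie ('C' or 'D', score 0.5) leaves both ratings unchanged here.

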
def compute_elo_score_pack(tup):
    """Unpack a (data, seed, vals) tuple; helper for multiprocessing."""
    return compute_elo_score(tup[0], seed=tup[1], vals=tup[2])


def mrlines(fname, sp='\n'):
    """Read a file and split it on `sp`, dropping trailing empty fields."""
    with open(fname) as fin:
        f = fin.read().split(sp)
    while f != [] and f[-1] == '':
        f = f[:-1]
    return f


def get_bootstrap_result(data,
                         num_round,
                         base_seed=1000,
                         num_thread=20,
                         vals='ABCD'):
    """Compute Elo scores with bootstrapping and return the results as a
    DataFrame.

    Args:
        data (pd.DataFrame): The input data containing 'A', 'B',
            and 'extracted' columns.
        num_round (int): The number of bootstrap rounds to perform.
        base_seed (int, optional): The base seed for randomization
            (default is 1000).
        num_thread (int, optional): The number of worker processes to use
            for parallel processing (default is 20).
        vals (str, optional): A string of possible values
            (default is 'ABCD').

    Returns:
        pd.DataFrame: A DataFrame containing Elo scores for
            models based on bootstrapping.
    """
    # Each round recomputes the Elo scores with a different shuffling seed.
    tups = [(data, base_seed + i, vals) for i in range(num_round)]
    with mp.Pool(num_thread) as pool:
        rows = pool.map(compute_elo_score_pack, tups)
    df = pd.DataFrame(rows)
    # Order the columns (models) by descending median Elo score.
    return df[df.median().sort_values(ascending=False).index]


def bootstrap_elo(data, num_round=1000, times=10, vals='ABCD'):
    """Compute Elo scores with bootstrapping over multiple runs and return
    aggregated results.

    Args:
        data (pd.DataFrame): The input data containing 'A', 'B',
            and 'extracted' columns.
        num_round (int, optional): The number of bootstrap rounds
            to perform in each run (default is 1000).
        times (int, optional): The number of runs to perform
            (default is 10).
        vals (str, optional): A string of possible values
            (default is 'ABCD').

    Returns:
        pd.DataFrame: A DataFrame containing aggregated Elo
            scores with mean and standard deviation.
    """
    results = defaultdict(list)
    for i in tqdm(range(times)):
        bootstrap_elo_lu = get_bootstrap_result(data,
                                                num_round,
                                                base_seed=num_round * i,
                                                num_thread=20,
                                                vals=vals)
        bootstrap_lu_median = bootstrap_elo_lu.median().reset_index().set_axis(
            ['model', 'rating'], axis=1)
        for m, r in zip(bootstrap_lu_median['model'],
                        bootstrap_lu_median['rating']):
            results[m].append(r)
    res_dict = {}
    keys = list(results.keys())
    keys.sort()
    for k in keys:
        res_dict[k] = [np.mean(results[k]), np.std(results[k])]
    df = pd.DataFrame(res_dict, index=['elo_score [Mean]', 'elo_score [Std]'])
    return df


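# Typical use (assuming `data` has 'A', 'B' and 'extracted' columns):
# bootstrap_elo(data) runs 10 batches of 1000 reshuffled Elo computations
# and reports, per model, the mean and standard deviation of the ten
# per-batch median scores.

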
# Optional custom font for heatmap titles, supplied via the FONT_FILE
# environment variable.
FONT_FILE = os.environ.get('FONT_FILE', None)


def match_answer(s):
    """Match the selected answer (A, B, C, or D) in a given string.

    Args:
        s (str): The input string to search for the selected answer.

    Returns:
        str or None: The matched answer ('A', 'B', 'C', or 'D')
            or None if not found.
    """

    def match_char(s, chars):
        cin = [c in s for c in chars]
        if sum(cin) == 1:
            return chars[cin.index(True)]
        else:
            return None

    lines = s.split('\n')
    for line in lines:
        # '选择' is Chinese for 'Choice'; the verdict line is expected to
        # start with '选择:' and name exactly one option letter.
        if line.startswith('选择:'):
            return match_char(line, 'ABCD')
    return None


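# Examples (hypothetical judge replies; '选择' means 'Choice'):
#   match_answer('Analysis: ...\n选择: A')  ->  'A'
#   match_answer('选择: A or B')            ->  None  (two letters matched)
#   match_answer('no verdict line')         ->  None

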
def draw_heatmap(hmap, title):
    """Draw a heatmap using the given data.

    Args:
        hmap (pd.DataFrame): The data for the heatmap.
        title (str): The title for the heatmap.

    Returns:
        np.ndarray: An image of the heatmap.
    """
    from matplotlib import font_manager
    if FONT_FILE is None:
        fontP = font_manager.FontProperties()
    else:
        fontP = font_manager.FontProperties(fname=FONT_FILE)
    fontP.set_size(18)
    import matplotlib.pyplot as plt
    import seaborn as sns
    ax = sns.heatmap(hmap,
                     annot=True,
                     cmap='Blues',
                     annot_kws={'size': 35 / np.sqrt(len(hmap))})
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=12)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=12)
    plt.yticks(rotation=0)
    ax.xaxis.tick_top()  # x axis on top
    ax.xaxis.set_label_position('top')
    plt.title(title, color='Blue', fontproperties=fontP)
    plt.tight_layout()
    # Render the figure to an in-memory PNG and decode it into a BGR array
    # so it can be composited with cv2.
    buffer = io.BytesIO()
    plt.savefig(buffer, format='png', dpi=100)
    plt.close()
    buffer.seek(0)
    image_data = buffer.getvalue()
    image = cv2.imdecode(np.frombuffer(image_data, np.uint8), cv2.IMREAD_COLOR)
    return image


def proc_capa(capas):
    """Deduplicate a sequence of capability strings into a list."""
    return list(set(capas))


class SubjectiveSummarizer:
    """Do the subjectivity analysis based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
        vispair (List[str], optional): List of
            two models to visualize.
        refm (str, optional): Reference model
            for win rate comparison.
        col_name (str): Name of the column
            containing evaluation results.
        fout (str): Output file name.
        ignore (str, optional): Ignore certain
            comparisons based on a file.
    """

    def __init__(
        self,
        config: ConfigDict,
        vispair: Optional[List[str]] = None,
        refm: Optional[str] = None,
        col_name: str = 'gpt4',
        fout: str = 'report.md',
        ignore: Optional[str] = None,
    ) -> None:
        self.tasks = []
        self.cfg = config
        self.vispair = vispair
        self.refm = refm
        self.col_name = col_name
        self.fout = fout
        self.ignore = ignore

    def summarize(self,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
        """Summarize the subjectivity analysis based on evaluation results.

        Note that the default `time_str` is evaluated once, at import time,
        not once per call.

        Args:
            time_str (str): Timestamp for file naming.

        Returns:
            None
        """

        dataset_cfgs = self.cfg['datasets']
        eval_cfg = self.cfg['eval']
        work_dir = self.cfg['work_dir']
        self.work_dir = work_dir

        self.time_str = time_str
        output_path = osp.join(self.work_dir, 'summary',
                               f'summary_{self.time_str}.txt')
        output_dir = osp.join(osp.split(output_path)[0], f'{self.time_str}')
        mmengine.mkdir_or_exist(output_dir)
        # The report file is closed at the end of self.analyze().
        fout = open(osp.join(output_dir, self.fout), 'w')
        results_folder = osp.join(work_dir, 'results')
        data_list = []
        for subdir in os.listdir(results_folder):
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                model1, model2 = subdir.split('_')
                for dataset in dataset_cfgs:
                    origin_dataset = build_dataset_from_cfg(dataset)
                    dataset_abbr = dataset_abbr_from_cfg(dataset)
                    filepath = os.path.join(subdir_path,
                                            dataset_abbr + '.json')
                    result = mmengine.load(filepath)
                    if eval_cfg['partitioner']['mode'] == 'all':
                        for key, value in result.items():
                            prediction = value.get('prediction', None)
                            q_index = origin_dataset.test[int(key) % len(
                                origin_dataset.test)]['index']
                            cmp_index = f'{q_index};{model1};{model2}'
                            data_list.append(
                                [cmp_index, model1, model2, prediction])

        # Name the judge-output column after self.col_name ('gpt4' by
        # default) so the column lookups below stay consistent.
        data = pd.DataFrame(data_list,
                            columns=['cmp_index', 'A', 'B', self.col_name])
        meta = pd.read_excel(
            osp.join(dataset_cfgs[0]['path'],
                     dataset_cfgs[0]['name'] + '.xlsx'))

        if self.ignore is not None:
            q_index = [x.split(';')[0] for x in data['cmp_index']]
            to_ignore = set(mrlines(self.ignore))
            flag = [x not in to_ignore for x in q_index]
            data = data[flag]

        double_log('# Subjective Analysis', fout)
        capas = proc_capa(meta['capability'])
        capa_map = {i: c for i, c in zip(meta['index'], meta['capability'])}

        nonem = [x != 'EM' for x in data[self.col_name]]
        double_log(
            f'A total of {len(data)} comparisons, of which {sum(nonem)} '
            f'comparisons are meaningful (A / B answers inconsistent)', fout)
        data = data[nonem]

        data['capability'] = [
            capa_map[str(i).split(';')[0]] for i in data['cmp_index']
        ]
        data['extracted'] = [match_answer(ans) for ans in data[self.col_name]]

        succeed = [not pd.isna(x) for x in data['extracted']]
        succeed_rate = np.mean(succeed)
        double_log(
            f'A total of {len(succeed)} answer comparisons, successfully '
            f'extracted {sum(succeed)} answers from GPT-4 replies, with '
            f'an extraction success rate of {succeed_rate * 100:.2f}%', fout)
        data = data[succeed]

        cons, incons = find_inconsistent(data, 'ABCD')
        if len(cons) != len(data):
            double_log(
                f'A total of {len(data)} answer comparisons, {len(cons)} '
                f'pairs (A vs. B <-> B vs. A) are consistent; the consistent '
                f'rate is {len(cons) / len(data) * 100:.2f}%', fout)

        dump(cons, osp.join(output_dir, 'consistent_cmp.xlsx'))
        dump(incons, osp.join(output_dir, 'inconsistent_cmp.xlsx'))

        data = cons
        if self.vispair is not None and len(self.vispair) == 2:
            extract_vispair(data, vispair=self.vispair)

        data['lang'] = [x.split('-')[0] for x in data['cmp_index']]
        langs = [None, 'cn', 'en']
        return self.analyze(data, self.refm, langs, capas, fout)

    def analyze(self, data, refm, langs, capas, fout):
        """Do the subjectivity analysis based on evaluation results.

        Args:
            data (pd.DataFrame): The evaluation data.
            refm (str): Reference model for win rate comparison.
            langs (List[str]): List of languages to analyze.
            capas (List[str]): List of capabilities to analyze.
            fout (file): The open report file to write to.

        Returns:
            None
        """
        output_path = osp.join(self.work_dir, 'summary',
                               f'summary_{self.time_str}.txt')
        output_dir = osp.join(osp.split(output_path)[0], f'{self.time_str}')
        mmengine.mkdir_or_exist(output_dir)

        stats = defaultdict(list)
        scores = defaultdict(list)

        dim_key = 'Dimension \\ Stat [W / T / L / NB]'
        scores_dim_key = 'Dimension \\ Score'

        for lang in langs:
            name = (lang.upper() if lang is not None else 'Overall')
            stats[dim_key].append(f'LANG: {name}')
            scores[scores_dim_key].append(f'LANG: {name}')

            count_stat = simple_count(data, lang=lang)
            if count_stat == {}:
                for k, v in stats.items():
                    if k != dim_key:
                        v.append('N/A')
                for k, v in scores.items():
                    if k != scores_dim_key:
                        v.append('N/A')

            for k in count_stat:
                stat = count_stat[k]
                winr = stat['win'] / sum(stat.values())
                tier = (stat['both'] + stat['neither']) / sum(stat.values())
                loser = stat['lose'] / sum(stat.values())
                not_bad = (stat['win'] + stat['both']) / sum(stat.values())
                msg = f'{winr * 100:.1f}% / {tier * 100:.1f}% / {loser * 100:.1f}% / {not_bad * 100:.1f}%'  # noqa
                stats[k].append(msg)
                score = 3 * stat['win'] + stat['both'] - stat[
                    'neither'] - 3 * stat['lose']
                scores[k].append(score)
        for capa in capas:
            stats[dim_key].append(f'CAPA: {capa}')
            scores[scores_dim_key].append(f'CAPA: {capa}')
            count_stat = simple_count(data, capa=capa)
            if count_stat == {}:
                for k, v in stats.items():
                    if k != dim_key:
                        v.append('N/A')
                for k, v in scores.items():
                    if k != scores_dim_key:
                        v.append('N/A')

            for k in count_stat:
                stat = count_stat[k]
                winr = stat['win'] / sum(stat.values())
                tier = (stat['both'] + stat['neither']) / sum(stat.values())
                loser = stat['lose'] / sum(stat.values())
                not_bad = (stat['win'] + stat['both']) / sum(stat.values())
                msg = f'{winr * 100:.1f}% / {tier * 100:.1f}% / {loser * 100:.1f}% / {not_bad * 100:.1f}%'  # noqa
                stats[k].append(msg)
                score = 3 * stat['win'] + stat['both'] - stat[
                    'neither'] - 3 * stat['lose']
                scores[k].append(score)
        double_log(
            '### Basic statistics (4 stats: win / tie / lose / not bad)', fout)
        all_models = list(stats.keys())
        all_models.remove(dim_key)

        # Print the wide stats table in slices of `table_width` model columns.
        table_width = 3
        num_tables = len(all_models) // table_width + (
            len(all_models) % table_width != 0)
        for i in range(num_tables):
            cur_keys = [dim_key
                        ] + all_models[i * table_width:(i + 1) * table_width]
            sub_stats = {k: stats[k] for k in cur_keys}
            double_log(tabulate(sub_stats, headers='keys', tablefmt='github'),
                       fout)

        # Embed the two heatmap grids (written below) into the markdown
        # report.
        image_url1 = 'by_capa.png'
        image_url2 = 'by_lang.png'
        double_log(
            f'\n\n![Capability Dimension Result]({image_url1})\n\n'
            f'![Language Dimension Result]({image_url2})\n\n', fout)

        double_log(
            '\n\n### Model scores (base score is 0, win +3,'
            ' both +1, neither -1, lose -3)', fout)
        double_log(tabulate(scores, headers='keys', tablefmt='github'), fout)

        double_log('### Bootstrap Elo, median of n=1000 rounds', fout)
        elo_table = bootstrap_elo(data)
        double_log(tabulate(elo_table, headers='keys', tablefmt='github'),
                   fout)

        # Reuses the `count_stat` left over from the last loop above to
        # enumerate the model names.
        models = list(count_stat.keys())
        models.sort()

        images = []
        for lang in langs:
            wr, dr = calc_win_rate(data, models, lang=lang)
            lang_name = lang.upper() if lang is not None else 'Overall'

            wr_table = defaultdict(list)
            if refm is not None:
                for m in models:
                    if m == refm:
                        continue
                    wr_table['model'].append(m)
                    wr_table['win_rate'].append(wr.at[m, refm])
                    wr_table['draw_rate'].append(dr.at[m, refm])
                    wr_table['win + draw'].append(dr.at[m, refm] +
                                                  wr.at[m, refm])
                double_log(
                    f'By language {lang_name}, calculate '
                    f'the win rate against {refm}:', fout)
                double_log(
                    tabulate(wr_table, headers='keys', tablefmt='github'),
                    fout)

            im = draw_heatmap(
                wr, f'Language: {lang if lang is not None else "All"}')
            images.append(im)
        image = stack_image(images, shape=(1, 3))
        cv2.imwrite(osp.join(output_dir, 'by_lang.png'), image)

        images = []
        for capa in capas:
            wr, dr = calc_win_rate(data, models, capa=capa)

            wr_table = defaultdict(list)
            if refm is not None:
                for m in models:
                    if m == refm:
                        continue
                    wr_table['model'].append(m)
                    wr_table['win_rate'].append(wr.at[m, refm])
                    wr_table['draw_rate'].append(dr.at[m, refm])
                    wr_table['win + draw'].append(dr.at[m, refm] +
                                                  wr.at[m, refm])
                double_log(
                    f'By capability {capa}, calculate the '
                    f'win rate against {refm}:', fout)
                double_log(
                    tabulate(wr_table, headers='keys', tablefmt='github'),
                    fout)

            im = draw_heatmap(wr, f'Capability: {capa}')
            images.append(im)

        lt = len(capas)
        h, w = get_shape(lt)
        image = stack_image(images, shape=(h, w))
        cv2.imwrite(osp.join(output_dir, 'by_capa.png'), image)
        dump(data, osp.join(output_dir, 'tmp.xlsx'))
        fout.close()