OpenCompass/opencompass/datasets/evalplus/perf/select_pe_tasks.py
"""Analyze the variance of PE and their time cost.
Filter those with high variance and low time cost.
"""
import json
import math
from datetime import datetime
from typing import List

import numpy as np
from rich.console import Console
from rich.syntax import Syntax
from termcolor import colored

from evalplus.config import PREF_CURATE_MIN_INSTRUCTION


def cv(time_costs: List[float]) -> float:
    """
    We use the Coefficient of Variation (CV) as the variance of PE.
    CV = 100 * standard deviation / mean
    """
    if len(time_costs) == 0:
        raise ValueError("time_costs is empty.")
    return 100 * np.std(time_costs) / np.mean(time_costs)
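

# For intuition: np.std here is the population standard deviation, so e.g.
# cv([0.9, 1.0, 1.1]) ≈ 8.2, i.e. the measurements spread roughly 8% around
# their mean (illustrative numbers, not taken from a real profile).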


def filter_by_profile_size(task2profile: dict, threshold: int = 10):
    """Drop tasks whose profile is missing, shorter than `threshold`, or contains failed (None) runs."""
    to_remove = []
    for task_id, profile in task2profile.items():
        if (
            profile is None
            or len(profile) < threshold
            or any(None in p for p in profile)
        ):
            print(colored(f"⚠️ {task_id} skipped: #profile < {threshold}", "red"))
            to_remove.append(task_id)
    for task_id in to_remove:
        del task2profile[task_id]
    return task2profile


def filter_by_compute_cost(
    task2profile: dict, thresh: float = PREF_CURATE_MIN_INSTRUCTION
):
    """Filter out tasks that can be solved with fewer than `thresh` #instructions."""
    to_remove = []
    for task_id, profile in task2profile.items():
        if (
            min(np.mean(p) for p in profile) < thresh
        ):  # filter if some solution is too fast
            print(
                colored(
                    f"⚠️ {task_id} skipped: some solution is faster than {thresh} #instructions",
                    "red",
                )
            )
            to_remove.append(task_id)
    for task_id in to_remove:
        del task2profile[task_id]
    return task2profile


def filter_by_cv(task2profile: dict, thresh: float, percentile: int = 95):
    """Drop tasks where the P`percentile` of per-solution CVs exceeds `thresh`%."""
    to_remove = []
    for task_id, profile in task2profile.items():
        mean_var = np.percentile([cv(p) for p in profile], percentile)
        if mean_var > thresh:
            print(
                colored(
                    f"⚠️ {task_id} skipped: P{percentile} CV = {mean_var:.1f}% > {thresh}%",
                    "red",
                )
            )
            to_remove.append(task_id)
    for task_id in to_remove:
        del task2profile[task_id]
    return task2profile


# smaller time, larger threshold
def thresh_fn(base_thresh, x, weight=0.002):
    return base_thresh + math.sqrt(weight / x)
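

# For intuition (illustrative numbers): thresh_fn(0.10, x, weight=0.002) gives
# 0.10 + sqrt(0.002 / 0.002) = 1.10 for a very cheap input (x = 0.002), but only
# 0.10 + sqrt(0.002 / 2.0) ≈ 0.13 for an expensive one (x = 2.0), so cheap and
# noisy measurements need a much larger relative gap before a split is made.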


def adaptive_seg1d(arr1d, base_thresh=0.10):
    # sort from large to small
    arr1d = np.sort(arr1d)[::-1]
    # relative distance between neighboring values
    relative_distance = -np.diff(arr1d) / arr1d[:-1]
    splitter_idx = []
    for i, rel in enumerate(relative_distance):
        if rel > thresh_fn(base_thresh, arr1d[i], weight=PREF_CURATE_MIN_INSTRUCTION):
            splitter_idx.append(i + 1)
    # [9, 8, 7, |-> 3, 2, 1]
    # splitter_idx points to the slowest in each cluster
    return np.split(arr1d, splitter_idx)
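

# np.split reference example: np.split(np.array([9, 8, 7, 3, 2, 1]), [3]) yields
# [array([9, 8, 7]), array([3, 2, 1])], i.e. each splitter index is the position
# of the largest (slowest) element of the next cluster.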


def filter_by_clustering(task2profile: dict, base_threshold=0.2, min_clusters=3):
    to_remove = []
    for task_id, profile in task2profile.items():
        n_clusters = len(adaptive_seg1d(np.mean(profile, axis=1), base_threshold))
        if n_clusters < min_clusters:
            print(
                colored(
                    f"⚠️ {task_id} skipped: #Cluster = {n_clusters} < {min_clusters} with {base_threshold=}",
                    "red",
                )
            )
            to_remove.append(task_id)
    for task_id in to_remove:
        del task2profile[task_id]
    return task2profile


def brief_list_repr(lst, head_count=4, tail_count=4):
    if len(lst) <= head_count + tail_count:
        return f"{lst}"
    else:
        head = ", ".join(str(x) for x in lst[:head_count])
        tail = ", ".join(str(x) for x in lst[-tail_count:])
        return f"[{head}, ..., {tail}]"


def script(
    profiled_solutions: str,
    output_dataset: str = f"evalperf-{datetime.now():%Y%m%d}.jsonl",
    debug_tasks: List[str] = [],
    min_clusters=4,
):
    assert profiled_solutions.endswith(".jsonl")
    assert output_dataset.endswith(".jsonl")

    # read jsonl
    with open(profiled_solutions, "r") as f:
        profiled_solutions = [json.loads(l) for l in f if l.strip()]

    console = Console()
    task2profile = {d["task_id"]: d["counter_profile"] for d in profiled_solutions}
    print(f"Loaded {len(task2profile)} tasks.")

    # * Criteria 1: Profile cannot be empty or too short
    task2profile = filter_by_profile_size(task2profile)
    print(f"{len(task2profile)} tasks with profile.")

    # * Criteria 2: Every solution should cost at least PREF_CURATE_MIN_INSTRUCTION #instructions
    task2profile = filter_by_compute_cost(task2profile)
    print(
        f"{len(task2profile)} tasks whose fastest solution costs >= {PREF_CURATE_MIN_INSTRUCTION} #instructions."
    )

    # * Criteria 3: P99-CV should be less than 5%
    final_thresh = 5
    percentile = 99
    task2profile = filter_by_cv(
        task2profile, thresh=final_thresh, percentile=percentile
    )
    print(f"{len(task2profile)} tasks with P{percentile} CV <= {final_thresh}%.")

    # * Criteria 4: There should be at least `min_clusters` clusters
    task2profile = filter_by_clustering(
        task2profile, base_threshold=0.2, min_clusters=min_clusters
    )
    print(f"{len(task2profile)} tasks with #Cluster >= {min_clusters}.")

    # export dataset
    task2solution = {d["task_id"]: d for d in profiled_solutions}
    # each item is {"task_id": ..., "reference": [...], "pe_input": ..., "scores": [...]}
    export_dataset = []
    total_clusters = 0
    for task_id, profile in task2profile.items():
        print(colored(f"-========== {task_id} ==========-", "green"))
        if task_id in debug_tasks:
            print(colored(f"Debugging {task_id}", "red"))

        mean_runtime = [np.mean(p) for p in profile]
        clusters = adaptive_seg1d(mean_runtime)  # descend
        print(colored(f"#seg = {len(clusters)}", "green"))

        accumulative_ratio = []
        ref_idx = []
        for i, cluster in enumerate(clusters):
            prior_ar = 0 if i == 0 else accumulative_ratio[-1]
            ratio = 100 * len(cluster) / len(mean_runtime)
            acc_ratio = prior_ar + ratio
            brief_list_str = brief_list_repr([round(1000 * v) for v in cluster])
            print(
                f"#{i} |{len(cluster):<3}| ({acc_ratio:<4.1f}) @cv {cv(cluster):.1f}: {brief_list_str}"
            )
            accumulative_ratio.append(acc_ratio)
            ref_idx.append(np.where(mean_runtime == cluster[0])[0][0])

            if task_id in debug_tasks:
                # print solutions
                solution_text = task2solution[task_id]["solutions"][ref_idx[-1]]
                # remove empty lines
                solution_text = "\n".join(
                    line for line in solution_text.split("\n") if line.strip()
                )
                console.print(Syntax(solution_text, "python"))
                print(colored("-" * 32, "green"))

        total_clusters += len(clusters)
        # add reference solution and check consistency
        for i in range(len(ref_idx)):
            if i == 0:
                continue
            # prior runtime must be larger than current
            assert mean_runtime[ref_idx[i - 1]] > mean_runtime[ref_idx[i]]

        reference = [task2solution[task_id]["solutions"][idx] for idx in ref_idx]
        assert len(reference) == len(clusters)
        assert len(accumulative_ratio) == len(reference)

        item = {
            "task_id": task_id,
            "reference": reference,
            "pe_input": task2solution[task_id]["pe_input"],
            "scores": accumulative_ratio,
        }
        export_dataset.append(item)

    print(f"Total clusters: {total_clusters}")

    with open(output_dataset, "w") as f:
        for item in export_dataset:
            f.write(json.dumps(item) + "\n")
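

# Each exported JSONL line follows the item shape above (values illustrative):
#   {"task_id": "...", "reference": ["<slowest solution>", ..., "<fastest solution>"],
#    "pe_input": ..., "scores": [12.5, 43.8, ..., 100.0]}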


def main():
    from fire import Fire

    Fire(script)


if __name__ == "__main__":
    main()
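

# A hypothetical invocation via `fire` (file names are placeholders):
#   python select_pe_tasks.py --profiled_solutions profiled_solutions.jsonl \
#       --output_dataset evalperf-20240101.jsonl --min_clusters 4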