# OpenCompass/opencompass/datasets/evalplus/evalperf.py

"""Compute the Differential Performance Scores (DPS) and DPS_{norm} of given samples from a model.

Check our COLM paper for more details: https://www.arxiv.org/abs/2408.06450

Updates since the COLM paper:
* Condition to activate efficiency evaluation for a task:
    * Paper: at least one correct solution is required, and up to 10 correct solutions
      are selected for efficiency sampling.
    * Here: at least `min_correct` correct solutions are required, and the efficiency
      of all correct solutions is evaluated.
    * Rationale: make the evaluation more statistically robust.

@inproceedings{liu2024evaluating,
    title = {Evaluating Language Models for Efficient Code Generation},
    author = {Liu, Jiawei and Xie, Songrun and Wang, Junhao and Wei, Yuxiang and Ding, Yifeng and Zhang, Lingming},
    booktitle = {First Conference on Language Modeling},
    year = {2024},
    url = {https://openreview.net/forum?id=IBCBMeAhmC},
}
"""
import json
import multiprocessing
import os
import socket
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import closing
from datetime import datetime
from statistics import mean
from typing import Dict, List, Optional, Tuple

import rich
from rich.rule import Rule
from rich.syntax import Syntax
from rich.table import Table

from opencompass.datasets.evalplus.codegen import run_codegen
from opencompass.datasets.evalplus.config import *
from opencompass.datasets.evalplus.config import PERF_EVAL_TIMEOUT_SECOND
from opencompass.datasets.evalplus.data import (
get_evalperf_data,
get_human_eval_plus,
get_human_eval_plus_hash,
get_mbpp_plus,
get_mbpp_plus_hash,
)
from opencompass.datasets.evalplus.data.mbpp import mbpp_deserialize_inputs
from opencompass.datasets.evalplus.data.utils import stream_jsonl
from opencompass.datasets.evalplus.eval import PASS, untrusted_check
from opencompass.datasets.evalplus.eval._special_oracle import MBPP_OUTPUT_NOT_NONE_TASKS
from opencompass.datasets.evalplus.evaluate import get_groundtruth
from opencompass.datasets.evalplus.perf.profile import (
are_profiles_broken,
default_parallelism,
profile,
simple_test_profiler,
)
from opencompass.datasets.evalplus.utils import progress


def rule(msg: str):
    rich.print(Rule(msg))


def not_none(l: list) -> list:
    return [x for x in l if x is not None]


def get_free_port():
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(("", 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return s.getsockname()[1]


def correctness_check(
    solution: str, dataset: str, task: Dict, expected_output: Dict
) -> Tuple:
assert isinstance(solution, str)
result = untrusted_check(
dataset,
solution,
task["base_input"] + list(task["plus_input"]),
task["entry_point"],
expected_output["base"] + expected_output["plus"],
task["atol"],
expected_output["base_time"] + expected_output["plus_time"],
fast_check=True,
min_time_limit=DEFAULT_MIN_TIME_LIMIT,
gt_time_limit_factor=DEFAULT_GT_TIME_LIMIT_FACTOR,
)
return result, solution


def get_evalplus_data():
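    """Load HumanEval+ and MBPP+ (noextreme) problems and their ground-truth outputs.

    Returns a pair of dicts keyed by task_id: (problems, expected_output).
    """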
problems_he = get_human_eval_plus(noextreme=True)
dataset_hash = get_human_eval_plus_hash(noextreme=True)
expected_output_human = get_groundtruth(problems_he, dataset_hash, [])
problems_mbpp = get_mbpp_plus(noextreme=True)
dataset_hash = get_mbpp_plus_hash(noextreme=True)
expected_output_mbpp = get_groundtruth(
problems_mbpp,
dataset_hash,
MBPP_OUTPUT_NOT_NONE_TASKS,
)
problems = {**problems_he, **problems_mbpp}
expected_output = {**expected_output_human, **expected_output_mbpp}
return problems, expected_output


def table_print(table_name: str, kv: Dict):
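    """Print a single-row rich table: columns are the keys of `kv`, the row its values."""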
table = Table(
title=table_name,
show_header=True,
header_style="bold",
)
for col_name in kv:
table.add_column(col_name)
table.add_row(*[str(v) for v in kv.values()])
rich.print(table)


def correctness_worker(task_id: str, samples: list, ctask: Dict, expected_output: Dict):
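    """Check every sample of `task_id` for correctness.

    Returns (task_id, results) where each result records the solution, whether it
    passes, and placeholders for the profiling fields filled in by `perf_worker`.
    """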
assert isinstance(
samples, list
), f"{task_id}: samples is not a list but {type(samples)}"
results = []
for solution in samples:
result, solution = correctness_check(
solution, task_id.split("/")[0].lower(), ctask, expected_output
)
results.append(
{
"solution": solution,
"pass": result[0] == PASS,
"profiled": False,
"matching_cluster_idx": None,
"dps": None,
"dps_norm": None,
}
)
return task_id, results


def perf_worker(
task_id: str,
ptask: Dict, # EvalPerf data
ret_dict: Dict,
lazy_evaluation: bool,
max_profile: int,
):
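    """Profile the passing samples of one task against its EvalPerf references.

    References are ordered from slowest (index 0) to fastest (index n_reference - 1),
    each carrying a pre-computed score in `ptask["scores"]`. A sample is matched to
    the fastest reference `j` whose average #CPU-instruction count it does not exceed
    and receives `dps = ptask["scores"][j]` and `dps_norm = 100 * (j + 1) / n_reference`
    (e.g., a match at j = 2 out of 4 references gives dps_norm = 75.0); a sample slower
    than every reference keeps `matching_cluster_idx = -1` and scores 0. At most
    `max_profile` passing samples are profiled, and with `lazy_evaluation` the
    references themselves are only profiled on demand.
    """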
rich.print(f"{task_id}: Started")
start_time = time.time()
######################### Profiling Setup #########################
n_reference = len(ptask["reference"])
entry_point = ptask["entry_point"]
pe_input = (
mbpp_deserialize_inputs(task_id, ptask["pe_input"])[0]
if task_id.startswith("Mbpp/")
else ptask["pe_input"][0]
)
####################################################################
####################################################################
############### Lazily profile reference solutions #################
####################################################################
cache_ref_num_inst = [None] * n_reference
def get_avg_ref_profile(idx, check_order=True) -> Optional[Tuple]:
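        """Profile reference `idx` on demand and return (avg #CPU instructions, score).

        References must be profiled back to front so each one can be sanity-checked
        against the already-profiled, faster reference at `idx + 1`; results are
        memoized in `cache_ref_num_inst`. Returns None when `check_order` is set and
        reference `idx` turns out to be faster than reference `idx + 1`.
        """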
nonlocal cache_ref_num_inst
        assert (
            idx < n_reference - 1
            and cache_ref_num_inst[idx + 1] is not None
            or idx == n_reference - 1
        ), f"get_avg_ref_profile({idx}) must not be called before get_avg_ref_profile({idx + 1})! {n_reference = }"
if cache_ref_num_inst[idx] is not None:
return cache_ref_num_inst[idx], ptask["scores"][idx]
evaluation_time = PERF_EVAL_TIMEOUT_SECOND
ref_solution = ptask["reference"][idx]
for _ in range(2): # at most retry twice
profiles = profile(
ref_solution,
entry_point,
[pe_input],
timeout_second_per_test=evaluation_time,
)
# Bad thing#1: timeout / failure happens
if are_profiles_broken(profiles):
print(f"{task_id}: [WARNING] Error in ref: {profiles}")
rich.print(Syntax(ref_solution, "python"))
print(f"{task_id}: Retrying w/ +10s timeout...")
evaluation_time += 10
else:
break
avg_profile = mean(profiles)
# Bad thing#2: if the current #instruction is faster than that of i+1
if idx < n_reference - 1 and avg_profile < cache_ref_num_inst[idx + 1]:
print(f"{task_id}: [WARNING] #{idx} ref faster than #{idx + 1}")
print(f"ref {idx}: #inst {avg_profile}\tscore {ptask['scores'][idx]:.1f}")
print(
f"ref {idx+1}: #inst {cache_ref_num_inst[idx+1]}\tscore {ptask['scores'][idx+1]:.1f}"
)
rich.print(Syntax(ref_solution, "python"))
if check_order:
return None
cache_ref_num_inst[idx] = avg_profile
ret_dict["ref"][idx]["_num_cpu_instructions"] = avg_profile
return cache_ref_num_inst[idx], ptask["scores"][idx]
####################################################################
############################## END #################################
####################################################################
if not lazy_evaluation: # compute everything ahead of time
for i in range(n_reference - 1, -1, -1):
if get_avg_ref_profile(i) is None:
break
assert (
None not in cache_ref_num_inst
), f"{task_id}: Failed to profile certain reference: {cache_ref_num_inst = }"
profile_cache = {}
cur_profiled = 0
for result in ret_dict["results"]:
if cur_profiled >= max_profile:
rich.print(f"{task_id}: Reached max_profile limit {max_profile}, stopped")
break
if not result["pass"]:
continue
solution = result["solution"]
if solution in profile_cache: # reuse cache
sample_profiles = profile_cache[solution]
else:
sample_profiles = profile(
solution,
entry_point,
[pe_input],
timeout_second_per_test=PERF_EVAL_TIMEOUT_SECOND,
)
profile_cache[solution] = sample_profiles # store cache
score = 0
norm_score = 0
result["matching_cluster_idx"] = -1 # -1 means even slower than the slowest ref
# if the solution results in a timeout, score is 0
if are_profiles_broken(sample_profiles):
            print(
                f"{task_id}: Tested solution errored out: {sample_profiles} ... regarded as 0 score"
            )
rich.print(Syntax(solution, "python"))
else:
avg_sample_profile = result["_num_cpu_instructions"] = mean(sample_profiles)
# Get profiles from fast to slow (back to front):
for j in range(n_reference - 1, -1, -1):
avg_ref_profile, ref_score = get_avg_ref_profile(j, check_order=False)
if avg_sample_profile <= avg_ref_profile:
result["matching_cluster_idx"] = j
score = ref_score
norm_score = 100 * (j + 1) / n_reference
break
result["dps"] = score
result["dps_norm"] = norm_score
result["profiled"] = True
cur_profiled += 1
ret_dict["dps"] = mean(not_none([r["dps"] for r in ret_dict["results"]]))
ret_dict["dps_norm"] = mean(not_none([r["dps_norm"] for r in ret_dict["results"]]))
ret_dict["n_profiled"] = cur_profiled
table_print(
f"[bold green]{task_id} Completed[/]",
{
"Duration": f"{time.time() - start_time:.1f}s",
"DPS": f"[green]{ret_dict['dps']:.1f}[/]",
"DPS_norm": f"[green]{ret_dict['dps_norm']:.1f}[/]",
"# Profiled": f"{cur_profiled} / {len(ret_dict['results'])}",
"Pass@1": f"{ret_dict['pass@1']:.1f}%",
},
)
return ret_dict


# TODO(@ganler): OPTIMIZATION: reuse the samples from the generations of other datasets
def script(
samples: Optional[str] = None,
min_correct: int = 10,
max_profile: Optional[int] = None,
n_samples: int = 100,
temperature: float = 1.0,
parallel: Optional[int] = None,
lazy_evaluation: bool = True,
i_just_wanna_run: bool = False,
**model_kwargs,
):
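    """Evaluate the efficiency (DPS / DPS_norm) of model samples on EvalPerf tasks.

    Args:
        samples: Path to a `.jsonl` sample file or a sample directory; overridden by
            the generated sample path when `model_kwargs` are given (via `run_codegen`).
        min_correct: Minimum number of correct solutions a task needs to be evaluated.
        max_profile: Maximum number of correct solutions profiled per task;
            defaults to `min(min_correct * 2, n_samples)`.
        n_samples: Number of samples expected per task.
        temperature: Sampling temperature (also used to match resumable results).
        parallel: Number of worker threads; defaults to `default_parallelism(divisor=4)`.
        lazy_evaluation: Profile reference solutions lazily instead of ahead of time.
        i_just_wanna_run: Ignore previously saved results instead of resuming from them.
        model_kwargs: Extra keyword arguments forwarded to `run_codegen`.
    """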
max_profile = max_profile or min(min_correct * 2, n_samples)
assert min_correct <= max_profile <= n_samples
simple_test_profiler() # test linux perf setup
if model_kwargs:
# To suppress the warning of tokenizers
os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get(
"TOKENIZERS_PARALLELISM", "false"
)
# overwrite parameters
samples = run_codegen(
dataset="evalperf",
n_samples=n_samples,
temperature=temperature,
**model_kwargs,
)
assert samples is not None, "Please provide the path to the samples"
# Data loading
problems, expected_output = get_evalplus_data()
ptasks = get_evalperf_data()
# Parallelism
max_workers = parallel or max(1, default_parallelism(divisor=4))
assert 0 < max_workers < multiprocessing.cpu_count(), "Invalid max CPU workers"
if os.path.isdir(samples):
result_path = os.path.join(samples, "evalperf_results.json")
else:
assert samples.endswith(".jsonl")
result_path = samples.replace(".jsonl", "_evalperf_results.json")
brief_result_path = result_path.replace(
"evalperf_results.json", "evalperf_results.brief.json"
)
# resume results
eval_results = {}
    if not i_just_wanna_run and os.path.exists(result_path):
        with open(result_path, "r") as f:
            resumed_result = json.load(f)
if (
resumed_result["n_samples"] == n_samples
and resumed_result["temperature"] == temperature
and resumed_result["min_correct"] == min_correct
and resumed_result["max_profile"] == max_profile
):
eval_results = resumed_result["eval"]
for etask in eval_results:
ptasks.pop(etask, None)
rich.print(f"Resumed {len(eval_results)} results from {result_path}")
# Load model's samples: task_id -> a list of samples
sample_iter = stream_jsonl(samples)
samples = defaultdict(list)
for task in sample_iter:
samples[task["task_id"].replace("_", "/")].append(task["solution"])
samples = {k: v[:n_samples] for k, v in samples.items()}
# assert each task has n_samples
for task_id, s in samples.items():
assert len(s) == n_samples, f"{task_id} has {len(s)} samples != {n_samples}"
# Initialize eval_results
for task_id, ptask in ptasks.items():
eval_results[task_id] = {
"task_id": task_id,
"results": [],
"ref": [
{"solution": s, "score": r, "_num_cpu_instructions": None}
for s, r in zip(ptask["reference"], ptask["scores"])
],
"dps": None,
"dps_norm": None,
"pass@1": None,
"n_profiled": None,
}
rule("Correctness Checking...")
with progress("Correctness") as p:
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [
executor.submit(
correctness_worker,
task_id,
samples[task_id],
problems[task_id],
expected_output[task_id],
)
for task_id in ptasks
]
for future in p.track(as_completed(futures), total=len(futures)):
task_id, results = future.result()
eval_results[task_id]["results"] = results
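                # pass@1 is estimated as the percentage of the n_samples generations that pass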
eval_results[task_id]["pass@1"] = (
100 * len([r for r in results if r["pass"]]) / n_samples
)
rule("EvalPerf Configurations")
    if lazy_evaluation:
        rich.print(
            "[bold yellow]Lazy evaluation is enabled[/]: "
            "fast evaluation without exhaustively checking reference order consistency."
        )
table_print(
"Configurations",
{
"Max CPU": max_workers,
"#Tasks": len(ptasks),
"#Samples per task": n_samples,
"Min correct": min_correct,
"Max profile": max_profile,
"Result path": result_path,
},
)
rich.print(f"IDs of tasks to evaluate: {list(ptasks.keys())}")
rule("Evaluation Start")
undone = []
with progress("Profiling") as p:
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = []
for task_id, ptask in ptasks.items():
n_pass = len([r for r in eval_results[task_id]["results"] if r["pass"]])
if n_pass < min_correct:
rich.print(
f"{task_id}: [bold yellow]{n_pass} < {min_correct} correct solutions, skipped[/]"
)
continue
futures.append(
executor.submit(
perf_worker,
task_id,
ptask,
eval_results[task_id],
lazy_evaluation,
max_profile,
)
)
undone.append(task_id)
rich.print(f"{task_id}: Queued")
for future in p.track(as_completed(futures), total=len(futures)):
result = future.result()
eval_results[result["task_id"]] = result
undone.remove(result["task_id"])
if undone and len(undone) < max_workers:
print(f"Still running: {undone}")
rule("Evaluation Summary")
dps = mean(not_none([res["dps"] for res in eval_results.values()]))
dps_norm = mean(not_none([res["dps_norm"] for res in eval_results.values()]))
pass_1 = mean(not_none([res["pass@1"] for res in eval_results.values()]))
n_evalperfed = len(not_none([res["dps"] for res in eval_results.values()]))
table_print(
"EvalPerf Summary",
{
"DPS": f"{dps:.1f}",
"DPS_norm": f"{dps_norm:.1f}",
"Pass@1": f"{pass_1:.1f}%",
"#EvalPerf-ed tasks": f"{n_evalperfed} / {len(eval_results)}",
"min_correct": min_correct,
"n_samples": n_samples,
"temperature": temperature,
},
)
# Save full results
with open(result_path, "w") as f:
f.write(
json.dumps(
{
"date": datetime.now().strftime("%Y-%m-%d %H:%M"),
"n_samples": n_samples,
"temperature": temperature,
"min_correct": min_correct,
"max_profile": max_profile,
"eval": eval_results,
}
)
)
rich.print(f"Full results have been saved to {result_path}")
# Save brief results
with open(brief_result_path, "w") as f:
f.write(
json.dumps(
{
"date": datetime.now().strftime("%Y-%m-%d %H:%M"),
"config": {
"n_samples": n_samples,
"temperature": temperature,
"min_correct": min_correct,
"max_profile": max_profile,
},
"summary": {
"dps": dps,
"dps_norm": dps_norm,
"pass@1": pass_1,
},
"eval": {
task_id: {
"dps": res["dps"],
"dps_norm": res["dps_norm"],
"pass@1": res["pass@1"],
"profiled": [
{
"solution": r["solution"],
"matching_cluster_idx": r["matching_cluster_idx"],
}
for r in res["results"]
if r["profiled"]
],
}
for task_id, res in eval_results.items()
},
}
)
)
rich.print(f"Brief results have been saved to {brief_result_path}")
rule("To visualize win-rates and pair-wise DPS, run:")
rich.print(
Syntax(
f"""\
git clone git@github.com:evalplus/evalplus.github.io.git
git --git-dir=evalplus.github.io/.git pull
cp {brief_result_path} evalplus.github.io/results/evalperf
python evalplus.github.io/results/evalperf/stats.py
python -m http.server -d evalplus.github.io {get_free_port()}""",
"bash",
)
)


def main():
    from fire import Fire

    Fire(script)


if __name__ == "__main__":
    main()
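

# Example invocation via the Fire-based CLI (module path assumed from this file's
# location; adjust it to your installation):
#   python -m opencompass.datasets.evalplus.evalperf \
#       --samples /path/to/samples.jsonl --min_correct 10 --max_profile 20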