"""Compute the Differential Performance Scores (DPS) and DPS_{norm} of given samples from a model.
|
|
|
|
Check our COLM paper for more details: https://www.arxiv.org/abs/2408.06450
|
|
|
|
^Updates from the COLM paper:
|
|
* Condition to activate efficiency evaluation for a task:
|
|
* Paper: as long as you have at least one correct solution, and we select up to 10 correct solutions for efficiency sampling
|
|
* Here: you need to have at least `min_correct` correct solutions, and we evaluate the efficiency of all correct solutions
|
|
* Updating rationale: to make the evaluation more statistically robust
|
|
|
|
@inproceedings{liu2024evaluating,
|
|
title = {Evaluating Language Models for Efficient Code Generation},
|
|
author = {Liu, Jiawei and Xie, Songrun and Wang, Junhao and Wei, Yuxiang and Ding, Yifeng and Zhang, Lingming},
|
|
booktitle = {First Conference on Language Modeling},
|
|
year = {2024},
|
|
url = {https://openreview.net/forum?id=IBCBMeAhmC},
|
|
}
|
|
"""

import json
import multiprocessing
import os
import socket
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import closing
from datetime import datetime
from statistics import mean
from typing import Dict, List, Optional, Tuple

import rich
from rich.rule import Rule
from rich.syntax import Syntax
from rich.table import Table

from opencompass.datasets.evalplus.codegen import run_codegen
from opencompass.datasets.evalplus.config import *
from opencompass.datasets.evalplus.config import PERF_EVAL_TIMEOUT_SECOND
from opencompass.datasets.evalplus.data import (
    get_evalperf_data,
    get_human_eval_plus,
    get_human_eval_plus_hash,
    get_mbpp_plus,
    get_mbpp_plus_hash,
)
from opencompass.datasets.evalplus.data.mbpp import mbpp_deserialize_inputs
from opencompass.datasets.evalplus.data.utils import stream_jsonl
from opencompass.datasets.evalplus.eval import PASS, untrusted_check
from opencompass.datasets.evalplus.eval._special_oracle import MBPP_OUTPUT_NOT_NONE_TASKS
from opencompass.datasets.evalplus.evaluate import get_groundtruth
from opencompass.datasets.evalplus.perf.profile import (
    are_profiles_broken,
    default_parallelism,
    profile,
    simple_test_profiler,
)
from opencompass.datasets.evalplus.utils import progress
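# NOTE: DEFAULT_MIN_TIME_LIMIT and DEFAULT_GT_TIME_LIMIT_FACTOR used in
# correctness_check() below are expected to come from the wildcard `config` import above.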


def rule(msg: str):
    rich.print(Rule(msg))


def not_none(l: list) -> list:
    return [x for x in l if x is not None]
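

# Used only to suggest a free localhost port for the `python -m http.server` preview
# command printed at the end of `script`.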
def get_free_port():
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(("", 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return s.getsockname()[1]
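

# Functional correctness of one sample, checked against both the base and the plus
# test inputs via EvalPlus's untrusted_check with fast_check enabled.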
def correctness_check(
    solution: str, dataset: str, task: Dict, expected_output: Dict
) -> Tuple:
    assert isinstance(solution, str)
    result = untrusted_check(
        dataset,
        solution,
        task["base_input"] + list(task["plus_input"]),
        task["entry_point"],
        expected_output["base"] + expected_output["plus"],
        task["atol"],
        expected_output["base_time"] + expected_output["plus_time"],
        fast_check=True,
        min_time_limit=DEFAULT_MIN_TIME_LIMIT,
        gt_time_limit_factor=DEFAULT_GT_TIME_LIMIT_FACTOR,
    )
    return result, solution


def get_evalplus_data():
    problems_he = get_human_eval_plus(noextreme=True)
    dataset_hash = get_human_eval_plus_hash(noextreme=True)
    expected_output_human = get_groundtruth(problems_he, dataset_hash, [])
    problems_mbpp = get_mbpp_plus(noextreme=True)
    dataset_hash = get_mbpp_plus_hash(noextreme=True)
    expected_output_mbpp = get_groundtruth(
        problems_mbpp,
        dataset_hash,
        MBPP_OUTPUT_NOT_NONE_TASKS,
    )
    problems = {**problems_he, **problems_mbpp}
    expected_output = {**expected_output_human, **expected_output_mbpp}
    return problems, expected_output


def table_print(table_name: str, kv: Dict):
    table = Table(
        title=table_name,
        show_header=True,
        header_style="bold",
    )
    for col_name in kv:
        table.add_column(col_name)

    table.add_row(*[str(v) for v in kv.values()])
    rich.print(table)


def correctness_worker(task_id: str, samples: list, ctask: Dict, expected_output: Dict):
    assert isinstance(
        samples, list
    ), f"{task_id}: samples is not a list but {type(samples)}"

    results = []

    for solution in samples:
        result, solution = correctness_check(
            solution, task_id.split("/")[0].lower(), ctask, expected_output
        )
        results.append(
            {
                "solution": solution,
                "pass": result[0] == PASS,
                "profiled": False,
                "matching_cluster_idx": None,
                "dps": None,
                "dps_norm": None,
            }
        )

    return task_id, results
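

# Profile the passing samples of one task against its reference solutions and fill in
# per-sample dps / dps_norm as well as the task-level aggregates in ret_dict.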
def perf_worker(
    task_id: str,
    ptask: Dict,  # EvalPerf data
    ret_dict: Dict,
    lazy_evaluation: bool,
    max_profile: int,
):
    rich.print(f"{task_id}: Started")
    start_time = time.time()

    ######################### Profiling Setup #########################
    n_reference = len(ptask["reference"])
    entry_point = ptask["entry_point"]
    pe_input = (
        mbpp_deserialize_inputs(task_id, ptask["pe_input"])[0]
        if task_id.startswith("Mbpp/")
        else ptask["pe_input"][0]
    )
    ####################################################################

    ####################################################################
    ############### Lazily profile reference solutions #################
    ####################################################################
    cache_ref_num_inst = [None] * n_reference
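
    # References are stored from slow (index 0) to fast (index n_reference - 1), so
    # profiles are filled back to front: get_avg_ref_profile(idx) requires idx + 1 to
    # be profiled first, which lets it sanity-check that slower references indeed use
    # more instructions than faster ones.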
    def get_avg_ref_profile(idx, check_order=True) -> Optional[Tuple]:
        nonlocal cache_ref_num_inst

        assert (
            idx < n_reference - 1
            and cache_ref_num_inst[idx + 1] is not None
            or idx == n_reference - 1
        ), f"Calling get_avg_ref_profile({idx}) before get_avg_ref_profile({idx+1}) is not allowed! {n_reference = }"

        if cache_ref_num_inst[idx] is not None:
            return cache_ref_num_inst[idx], ptask["scores"][idx]

        evaluation_time = PERF_EVAL_TIMEOUT_SECOND
        ref_solution = ptask["reference"][idx]
        for _ in range(2):  # retry at most twice
            profiles = profile(
                ref_solution,
                entry_point,
                [pe_input],
                timeout_second_per_test=evaluation_time,
            )

            # Bad thing #1: a timeout or failure happens
            if are_profiles_broken(profiles):
                print(f"{task_id}: [WARNING] Error in ref: {profiles}")
                rich.print(Syntax(ref_solution, "python"))
                print(f"{task_id}: Retrying w/ +10s timeout...")
                evaluation_time += 10
            else:
                break

        avg_profile = mean(profiles)
        # Bad thing #2: the current reference uses fewer instructions than reference idx + 1
        if idx < n_reference - 1 and avg_profile < cache_ref_num_inst[idx + 1]:
            print(f"{task_id}: [WARNING] #{idx} ref faster than #{idx + 1}")
            print(f"ref {idx}: #inst {avg_profile}\tscore {ptask['scores'][idx]:.1f}")
            print(
                f"ref {idx+1}: #inst {cache_ref_num_inst[idx+1]}\tscore {ptask['scores'][idx+1]:.1f}"
            )
            rich.print(Syntax(ref_solution, "python"))
            if check_order:
                return None

        cache_ref_num_inst[idx] = avg_profile
        ret_dict["ref"][idx]["_num_cpu_instructions"] = avg_profile
        return cache_ref_num_inst[idx], ptask["scores"][idx]

    ####################################################################
    ############################## END #################################
    ####################################################################

    if not lazy_evaluation:  # compute everything ahead of time
        for i in range(n_reference - 1, -1, -1):
            if get_avg_ref_profile(i) is None:
                break

        assert (
            None not in cache_ref_num_inst
        ), f"{task_id}: Failed to profile certain reference: {cache_ref_num_inst = }"
    profile_cache = {}

    cur_profiled = 0
    for result in ret_dict["results"]:
        if cur_profiled >= max_profile:
            rich.print(f"{task_id}: Reached max_profile limit {max_profile}, stopped")
            break
        if not result["pass"]:
            continue

        solution = result["solution"]

        if solution in profile_cache:  # reuse cache
            sample_profiles = profile_cache[solution]
        else:
            sample_profiles = profile(
                solution,
                entry_point,
                [pe_input],
                timeout_second_per_test=PERF_EVAL_TIMEOUT_SECOND,
            )
            profile_cache[solution] = sample_profiles  # store cache

        score = 0
        norm_score = 0
        result["matching_cluster_idx"] = -1  # -1 means even slower than the slowest ref
        # if the solution results in a timeout, score is 0
        if are_profiles_broken(sample_profiles):
            print(
                f"{task_id}: Tested solution error'ed out: {sample_profiles} ... regarded as 0 score"
            )
            rich.print(Syntax(solution, "python"))
        else:
            avg_sample_profile = result["_num_cpu_instructions"] = mean(sample_profiles)
            # Get profiles from fast to slow (back to front):
            for j in range(n_reference - 1, -1, -1):
                avg_ref_profile, ref_score = get_avg_ref_profile(j, check_order=False)
                if avg_sample_profile <= avg_ref_profile:
                    result["matching_cluster_idx"] = j
                    score = ref_score
                    norm_score = 100 * (j + 1) / n_reference
                    break

        result["dps"] = score
        result["dps_norm"] = norm_score
        result["profiled"] = True
        cur_profiled += 1
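
    # Task-level DPS / DPS_norm are the means over the profiled (passing) samples;
    # unprofiled samples keep dps = None and are filtered out by not_none().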
    ret_dict["dps"] = mean(not_none([r["dps"] for r in ret_dict["results"]]))
    ret_dict["dps_norm"] = mean(not_none([r["dps_norm"] for r in ret_dict["results"]]))
    ret_dict["n_profiled"] = cur_profiled

    table_print(
        f"[bold green]{task_id} Completed[/]",
        {
            "Duration": f"{time.time() - start_time:.1f}s",
            "DPS": f"[green]{ret_dict['dps']:.1f}[/]",
            "DPS_norm": f"[green]{ret_dict['dps_norm']:.1f}[/]",
            "# Profiled": f"{cur_profiled} / {len(ret_dict['results'])}",
            "Pass@1": f"{ret_dict['pass@1']:.1f}%",
        },
    )

    return ret_dict


# TODO(@ganler): OPTIMIZATION: reuse the samples from the generations of other datasets
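# Entry point: generate (or load) samples, check correctness, then profile tasks with
# at least `min_correct` passing samples. `samples` may be a directory of generations
# or a .jsonl file; when model_kwargs are given, generation is delegated to run_codegen.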
def script(
    samples: Optional[str] = None,
    min_correct: int = 10,
    max_profile: Optional[int] = None,
    n_samples: int = 100,
    temperature: float = 1.0,
    parallel: Optional[int] = None,
    lazy_evaluation: bool = True,
    i_just_wanna_run: bool = False,
    **model_kwargs,
):
    max_profile = max_profile or min(min_correct * 2, n_samples)
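    # e.g., with the defaults (min_correct=10, n_samples=100) at most
    # min(10 * 2, 100) = 20 correct solutions per task are profiled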
    assert min_correct <= max_profile <= n_samples
    simple_test_profiler()  # test linux perf setup

    if model_kwargs:
        # To suppress the warning of tokenizers
        os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get(
            "TOKENIZERS_PARALLELISM", "false"
        )
        # overwrite parameters
        samples = run_codegen(
            dataset="evalperf",
            n_samples=n_samples,
            temperature=temperature,
            **model_kwargs,
        )

    assert samples is not None, "Please provide the path to the samples"

    # Data loading
    problems, expected_output = get_evalplus_data()
    ptasks = get_evalperf_data()

    # Parallelism
    max_workers = parallel or max(1, default_parallelism(divisor=4))
    assert 0 < max_workers < multiprocessing.cpu_count(), "Invalid max CPU workers"

    if os.path.isdir(samples):
        result_path = os.path.join(samples, "evalperf_results.json")
    else:
        assert samples.endswith(".jsonl")
        result_path = samples.replace(".jsonl", "_evalperf_results.json")
    brief_result_path = result_path.replace(
        "evalperf_results.json", "evalperf_results.brief.json"
    )

    # resume results
    eval_results = {}
    if not i_just_wanna_run and os.path.exists(result_path):
        resumed_result = json.load(open(result_path, "r"))
        if (
            resumed_result["n_samples"] == n_samples
            and resumed_result["temperature"] == temperature
            and resumed_result["min_correct"] == min_correct
            and resumed_result["max_profile"] == max_profile
        ):
            eval_results = resumed_result["eval"]
            for etask in eval_results:
                ptasks.pop(etask, None)

            rich.print(f"Resumed {len(eval_results)} results from {result_path}")

    # Load model's samples: task_id -> a list of samples
    sample_iter = stream_jsonl(samples)
    samples = defaultdict(list)
    for task in sample_iter:
        samples[task["task_id"].replace("_", "/")].append(task["solution"])
    samples = {k: v[:n_samples] for k, v in samples.items()}

    # assert each task has n_samples
    for task_id, s in samples.items():
        assert len(s) == n_samples, f"{task_id} has {len(s)} samples != {n_samples}"

    # Initialize eval_results
    for task_id, ptask in ptasks.items():
        eval_results[task_id] = {
            "task_id": task_id,
            "results": [],
            "ref": [
                {"solution": s, "score": r, "_num_cpu_instructions": None}
                for s, r in zip(ptask["reference"], ptask["scores"])
            ],
            "dps": None,
            "dps_norm": None,
            "pass@1": None,
            "n_profiled": None,
        }
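
    # Phase 1: check correctness of every sample for every EvalPerf task.
    # Phase 2: profile tasks that have at least min_correct passing samples.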
    rule("Correctness Checking...")
    with progress("Correctness") as p:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(
                    correctness_worker,
                    task_id,
                    samples[task_id],
                    problems[task_id],
                    expected_output[task_id],
                )
                for task_id in ptasks
            ]

            for future in p.track(as_completed(futures), total=len(futures)):
                task_id, results = future.result()
                eval_results[task_id]["results"] = results
                eval_results[task_id]["pass@1"] = (
                    100 * len([r for r in results if r["pass"]]) / n_samples
                )

    rule("EvalPerf Configurations")
    if lazy_evaluation:
        rich.print(
            "[bold yellow]Lazy evaluation is enabled[/]: "
            "Fast evaluation without enumeratively checking reference order consistency."
        )

    table_print(
        "Configurations",
        {
            "Max CPU": max_workers,
            "#Tasks": len(ptasks),
            "#Samples per task": n_samples,
            "Min correct": min_correct,
            "Max profile": max_profile,
            "Result path": result_path,
        },
    )

    rich.print(f"IDs of tasks to evaluate: {list(ptasks.keys())}")
    rule("Evaluation Start")
    undone = []
    with progress("Profiling") as p:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            for task_id, ptask in ptasks.items():
                n_pass = len([r for r in eval_results[task_id]["results"] if r["pass"]])
                if n_pass < min_correct:
                    rich.print(
                        f"{task_id}: [bold yellow]{n_pass} < {min_correct} correct solutions, skipped[/]"
                    )
                    continue
                futures.append(
                    executor.submit(
                        perf_worker,
                        task_id,
                        ptask,
                        eval_results[task_id],
                        lazy_evaluation,
                        max_profile,
                    )
                )
                undone.append(task_id)
                rich.print(f"{task_id}: Queued")

            for future in p.track(as_completed(futures), total=len(futures)):
                result = future.result()
                eval_results[result["task_id"]] = result
                undone.remove(result["task_id"])
                if undone and len(undone) < max_workers:
                    print(f"Still running: {undone}")

    rule("Evaluation Summary")
    dps = mean(not_none([res["dps"] for res in eval_results.values()]))
    dps_norm = mean(not_none([res["dps_norm"] for res in eval_results.values()]))
    pass_1 = mean(not_none([res["pass@1"] for res in eval_results.values()]))
    n_evalperfed = len(not_none([res["dps"] for res in eval_results.values()]))

    table_print(
        "EvalPerf Summary",
        {
            "DPS": f"{dps:.1f}",
            "DPS_norm": f"{dps_norm:.1f}",
            "Pass@1": f"{pass_1:.1f}%",
            "#EvalPerf-ed tasks": f"{n_evalperfed} / {len(eval_results)}",
            "min_correct": min_correct,
            "n_samples": n_samples,
            "temperature": temperature,
        },
    )

    # Save full results
    with open(result_path, "w") as f:
        f.write(
            json.dumps(
                {
                    "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
                    "n_samples": n_samples,
                    "temperature": temperature,
                    "min_correct": min_correct,
                    "max_profile": max_profile,
                    "eval": eval_results,
                }
            )
        )
    rich.print(f"Full results have been saved to {result_path}")

    # Save brief results
    with open(brief_result_path, "w") as f:
        f.write(
            json.dumps(
                {
                    "date": datetime.now().strftime("%Y-%m-%d %H:%M"),
                    "config": {
                        "n_samples": n_samples,
                        "temperature": temperature,
                        "min_correct": min_correct,
                        "max_profile": max_profile,
                    },
                    "summary": {
                        "dps": dps,
                        "dps_norm": dps_norm,
                        "pass@1": pass_1,
                    },
                    "eval": {
                        task_id: {
                            "dps": res["dps"],
                            "dps_norm": res["dps_norm"],
                            "pass@1": res["pass@1"],
                            "profiled": [
                                {
                                    "solution": r["solution"],
                                    "matching_cluster_idx": r["matching_cluster_idx"],
                                }
                                for r in res["results"]
                                if r["profiled"]
                            ],
                        }
                        for task_id, res in eval_results.items()
                    },
                }
            )
        )

    rich.print(f"Brief results have been saved to {brief_result_path}")

    rule("To visualize win-rates and pair-wise DPS, run:")
    rich.print(
        Syntax(
            f"""\
git clone git@github.com:evalplus/evalplus.github.io.git
git --git-dir=evalplus.github.io/.git pull
cp {brief_result_path} evalplus.github.io/results/evalperf
python evalplus.github.io/results/evalperf/stats.py
python -m http.server -d evalplus.github.io {get_free_port()}""",
            "bash",
        )
    )


def main():
    from fire import Fire

    Fire(script)


if __name__ == "__main__":
    main()