"""Analyze the variance of PE and their time cost.
|
|
Filter those with high variance and low time cost.
|
|
"""
|
|
|
|
import json
import math
from datetime import datetime
from typing import List

import numpy as np
from rich.console import Console
from rich.syntax import Syntax
from termcolor import colored

from evalplus.config import PREF_CURATE_MIN_INSTRUCTION


def cv(time_costs: List[float]) -> float:
    """We use the Coefficient of Variation (CV) as the variance measure of PE.

    CV = 100 * standard deviation / mean
    """
    if len(time_costs) == 0:
        raise ValueError("time_costs is empty.")
    return 100 * np.std(time_costs) / np.mean(time_costs)
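

# A hypothetical sanity check for cv(): three runs costing [10.0, 10.2, 9.8]
# have mean 10.0 and population std ~0.163, so cv() returns ~1.6 (percent).
#
#     assert 1.5 < cv([10.0, 10.2, 9.8]) < 1.7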


def filter_by_profile_size(task2profile: dict, threshold: int = 10):
    to_remove = []
    for task_id, profile in task2profile.items():
        if (
            profile is None
            or len(profile) < threshold
            or any(None in p for p in profile)
        ):
            print(
                colored(
                    f"⚠️ {task_id} skipped: profile missing/incomplete or #profile < {threshold}",
                    "red",
                )
            )
            to_remove.append(task_id)
    for task_id in to_remove:
        del task2profile[task_id]
    return task2profile


def filter_by_compute_cost(
    task2profile: dict, thresh: float = PREF_CURATE_MIN_INSTRUCTION
):
    """Filter out tasks where some solution runs under `thresh` #instructions."""
    to_remove = []
    for task_id, profile in task2profile.items():
        # drop the task if its fastest solution is cheaper than the threshold
        if min(np.mean(p) for p in profile) < thresh:
            print(
                colored(
                    f"⚠️ {task_id} skipped: some solution is faster than {thresh} #instructions",
                    "red",
                )
            )
            to_remove.append(task_id)
    for task_id in to_remove:
        del task2profile[task_id]
    return task2profile


|
def filter_by_cv(task2profile: dict, thresh: float, percentile: int = 95):
    to_remove = []
    for task_id, profile in task2profile.items():
        cv_at_percentile = np.percentile([cv(p) for p in profile], percentile)
        if cv_at_percentile > thresh:
            print(
                colored(
                    f"⚠️ {task_id} skipped: P{percentile} CV = {cv_at_percentile:.1f}% > {thresh}%",
                    "red",
                )
            )
            to_remove.append(task_id)
    for task_id in to_remove:
        del task2profile[task_id]
    return task2profile
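

# Reading the filter above with e.g. percentile=95 and thresh=5 (illustrative
# numbers): a task is kept only if at least 95% of its solutions have a CV of
# at most 5%.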


# smaller time cost x => larger splitting threshold
def thresh_fn(base_thresh, x, weight=0.002):
    return base_thresh + math.sqrt(weight / x)
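

# Rough numbers with the default weight=0.002: thresh_fn(0.1, 1000) ~= 0.101,
# while thresh_fn(0.1, 0.002) = 1.1 -- so cheap (fast) solutions need a much
# larger relative gap to start a new cluster. Note that the pipeline itself
# passes weight=PREF_CURATE_MIN_INSTRUCTION instead of the default.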


def adaptive_seg1d(arr1d, base_thresh=0.10):
    # sort from large to small
    arr1d = np.sort(arr1d)[::-1]
    # relative drop between adjacent values
    relative_distance = -np.diff(arr1d) / arr1d[:-1]

    splitter_idx = []
    for i, rel in enumerate(relative_distance):
        if rel > thresh_fn(base_thresh, arr1d[i], weight=PREF_CURATE_MIN_INSTRUCTION):
            splitter_idx.append(i + 1)

    # [9, 8, 7, |-> 3, 2, 1]
    # each splitter index points at the first (slowest) element of a new cluster
    return np.split(arr1d, splitter_idx)
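

# Illustrative trace (pretending thresh_fn returned a flat 0.5): for
# [9, 8, 7, 3, 2, 1] the relative drops are [0.11, 0.125, 0.57, 0.33, 0.5];
# only the 7 -> 3 drop exceeds 0.5, so the split is [[9, 8, 7], [3, 2, 1]].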


def filter_by_clustering(task2profile: dict, base_threshold=0.2, min_clusters=3):
    to_remove = []
    for task_id, profile in task2profile.items():
        n_clusters = len(adaptive_seg1d(np.mean(profile, axis=1), base_threshold))
        if n_clusters < min_clusters:
            print(
                colored(
                    f"⚠️ {task_id} skipped: #Cluster = {n_clusters} < {min_clusters} with {base_threshold=}",
                    "red",
                )
            )
            to_remove.append(task_id)
    for task_id in to_remove:
        del task2profile[task_id]
    return task2profile


def brief_list_repr(lst, head_count=4, tail_count=4):
    if len(lst) <= head_count + tail_count:
        return f"{lst}"
    else:
        head = ", ".join(str(x) for x in lst[:head_count])
        tail = ", ".join(str(x) for x in lst[-tail_count:])
        return f"[{head}, ..., {tail}]"


def script(
    profiled_solutions: str,
    output_dataset: str = f"evalperf-{datetime.now():%Y%m%d}.jsonl",
    debug_tasks: List[str] = [],
    min_clusters=4,
):
    assert profiled_solutions.endswith(".jsonl")
    assert output_dataset.endswith(".jsonl")

    # read jsonl
    with open(profiled_solutions, "r") as f:
        profiled_solutions = [json.loads(l) for l in f if l.strip()]

    console = Console()

    task2profile = {d["task_id"]: d["counter_profile"] for d in profiled_solutions}
    print(f"Loaded {len(task2profile)} tasks.")

    # * Criterion 1: profiles must exist and be complete
    task2profile = filter_by_profile_size(task2profile)
    print(f"{len(task2profile)} tasks with profile.")

    # * Criterion 2: every solution should cost at least PREF_CURATE_MIN_INSTRUCTION #instructions
    task2profile = filter_by_compute_cost(task2profile)
    print(
        f"{len(task2profile)} tasks whose fastest solution costs >= {PREF_CURATE_MIN_INSTRUCTION} #instructions."
    )

    # * Criterion 3: P99 CV should be less than 5%
    final_thresh = 5
    percentile = 99
    task2profile = filter_by_cv(
        task2profile, thresh=final_thresh, percentile=percentile
    )
    print(f"{len(task2profile)} tasks with P{percentile} CV <= {final_thresh}%.")

    # * Criterion 4: there should be at least `min_clusters` runtime clusters
    task2profile = filter_by_clustering(
        task2profile, base_threshold=0.2, min_clusters=min_clusters
    )
    print(f"{len(task2profile)} tasks with #Cluster >= {min_clusters}.")

    # export dataset
    task2solution = {d["task_id"]: d for d in profiled_solutions}
    # each exported item is {"task_id": ..., "reference": [...], "pe_input": ..., "scores": [...]}
    export_dataset = []
    total_clusters = 0
    for task_id, profile in task2profile.items():
        print(colored(f"-========== {task_id} ==========-", "green"))
        if task_id in debug_tasks:
            print(colored(f"Debugging {task_id}", "red"))
        mean_runtime = [np.mean(p) for p in profile]
        # NOTE: this uses adaptive_seg1d's default base_thresh (0.10), not the
        # 0.2 that filter_by_clustering used above
        clusters = adaptive_seg1d(mean_runtime)  # sorted descending
        print(colored(f"#seg = {len(clusters)}", "green"))

        accumulative_ratio = []
        ref_idx = []
        for i, cluster in enumerate(clusters):
            prior_ar = 0 if i == 0 else accumulative_ratio[-1]
            ratio = 100 * len(cluster) / len(mean_runtime)
            acc_ratio = prior_ar + ratio
            brief_list_str = brief_list_repr([round(1000 * v) for v in cluster])
            print(
                f"#{i} |{len(cluster):<3}| ({acc_ratio:<4.1f}) @cv {cv(cluster):.1f}: {brief_list_str}"
            )
            accumulative_ratio.append(acc_ratio)
            # the first (slowest) element of each cluster becomes its reference
            ref_idx.append(np.where(mean_runtime == cluster[0])[0][0])

            if task_id in debug_tasks:
                # print the reference solution of this cluster
                solution_text = task2solution[task_id]["solutions"][ref_idx[-1]]
                # remove empty lines
                solution_text = "\n".join(
                    line for line in solution_text.split("\n") if line.strip()
                )
                console.print(Syntax(solution_text, "python"))
                print(colored("-" * 32, "green"))

        total_clusters += len(clusters)

        # check that references are ordered from slowest to fastest
        for i in range(len(ref_idx)):
            if i == 0:
                continue
            # the prior reference must be slower (more instructions) than the current
            assert mean_runtime[ref_idx[i - 1]] > mean_runtime[ref_idx[i]]

        # collect the reference solutions
        reference = [task2solution[task_id]["solutions"][idx] for idx in ref_idx]

        assert len(reference) == len(clusters)
        assert len(accumulative_ratio) == len(reference)
        item = {
            "task_id": task_id,
            "reference": reference,
            "pe_input": task2solution[task_id]["pe_input"],
            "scores": accumulative_ratio,
        }
        export_dataset.append(item)

    print(f"Total clusters: {total_clusters}")

    with open(output_dataset, "w") as f:
        for item in export_dataset:
            f.write(json.dumps(item) + "\n")
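

# A sketch of one exported JSONL line (hypothetical values):
#   {"task_id": "HumanEval/0",
#    "reference": ["<slowest solution>", "...", "<fastest solution>"],
#    "pe_input": "...", "scores": [33.3, 66.7, 100.0]}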


def main():
    from fire import Fire

    Fire(script)


if __name__ == "__main__":
    main()
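

# Example invocation (hypothetical file names; Fire maps CLI flags onto
# `script`'s keyword arguments):
#   python curate_dataset.py profiled_solutions.jsonl \
#       --output_dataset evalperf.jsonl --min_clusters 4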