import json
import os
from typing import Dict, List, Optional

from .data import get_evalperf_data, get_human_eval_plus, get_mbpp_plus
from .provider import DecoderBase, make_model
from .sanitize import sanitize
from .utils import progress

def codegen(
    target_path: str,
    model: DecoderBase,
    dataset: Dict,
    greedy=False,
    n_samples=1,
    id_range=None,
    resume=True,
):
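    """Generate `n_samples` solutions per task in `dataset` with `model`.

    Sanitized solutions go to `target_path` (a JSONL file or one directory
    per task); raw model outputs go to a parallel ".raw" location. With
    `resume`, previously saved samples are counted and only the remainder
    is generated.
    """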
    task2nexist = {}
    if resume and target_path.endswith(".jsonl") and os.path.isfile(target_path):
        with open(target_path, "r") as f:
            for line in f:
                if not line.strip():
                    continue
                task_id = json.loads(line)["task_id"]
                task2nexist[task_id] = task2nexist.get(task_id, 0) + 1

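    # Raw (unsanitized) outputs are written alongside the sanitized ones.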
if target_path.endswith(".jsonl"):
|
||
|
raw_target_path = target_path.replace(".jsonl", ".raw.jsonl")
|
||
|
else:
|
||
|
raw_target_path = target_path + ".raw"
|
||
|
os.makedirs(target_path, exist_ok=True)
|
||
|
|
||
|
print(f"Sanitized code outputs will be saved to {target_path}")
|
||
|
print(f"Raw outputs will be saved to {raw_target_path}")
|
||
|
|
||
|
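    # Main generation loop, with a per-backend progress display.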
    backend_type: str = type(model).__name__
    with progress(backend_type) as p:
        for task_id, task in p.track(dataset.items()):
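            # Optionally restrict generation to task ids within [low, high).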
            if id_range is not None:
                id_num = int(task_id.split("/")[1])
                low, high = id_range
                if id_num < low or id_num >= high:
                    p.console.print(f"Skipping {task_id} as it is not in {id_range}")
                    continue

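            # In directory mode, count existing .py files so resume can skip them.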
if not target_path.endswith(".jsonl"):
|
||
|
p_name = task_id.replace("/", "_")
|
||
|
os.makedirs(os.path.join(target_path, p_name), exist_ok=True)
|
||
|
task2nexist[task_id] = len(
|
||
|
[
|
||
|
f
|
||
|
for f in os.listdir(os.path.join(target_path, p_name))
|
||
|
if f.endswith(".py")
|
||
|
]
|
||
|
)
|
||
|
|
||
|
            n_more_samples = n_samples
            log = f"Codegen: {task_id} @ {model}"
            if resume and task2nexist.get(task_id, 0) > 0:
                log += f" (resuming from {task2nexist[task_id]})"
                n_more_samples -= task2nexist[task_id]

            p.console.print(log)

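            # sidx is the index of the next sample; generate the rest in batches.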
            sidx = n_samples - n_more_samples
            while sidx < n_samples:
                prompt = task["prompt"].strip() + "\n"
                outputs = model.codegen(
                    prompt,
                    do_sample=not greedy,
                    num_samples=n_samples - sidx,
                )
                assert outputs, "No outputs from model!"
                for impl in outputs:
                    solution = prompt + impl if model.is_direct_completion() else impl
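                    # Clean the completion (strip non-code text) around the
                    # task's entry point.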
                    sanitized_solution = sanitize(
                        solution, entrypoint=task["entry_point"]
                    )
                    if target_path.endswith(".jsonl"):
                        # Writing the sanitized version
                        with open(target_path, "a") as f:
                            f.write(
                                json.dumps(
                                    {"task_id": task_id, "solution": sanitized_solution}
                                )
                                + "\n"
                            )

                        # Writing the raw version
                        with open(raw_target_path, "a") as f:
                            f.write(
                                json.dumps({"task_id": task_id, "solution": solution})
                                + "\n"
                            )
                    else:
                        # Writing the sanitized version
                        with open(
                            os.path.join(target_path, p_name, f"{sidx}.py"),
                            "w",
                            encoding="utf-8",
                        ) as f:
                            f.write(sanitized_solution)

                        # Writing the raw version
                        with open(
                            os.path.join(raw_target_path, p_name, f"{sidx}.py"),
                            "w",
                            encoding="utf-8",
                        ) as f:
                            f.write(solution)
                    sidx += 1


def run_codegen(
    model: str,
    dataset: str,
    root: str = "evalplus_results",
    bs: Optional[int] = None,
    n_samples: int = 1,
    temperature: float = 0.0,
    resume: bool = True,
    greedy: bool = False,
    id_range: Optional[List] = None,
    version: str = "default",
    backend: str = "vllm",
    force_base_prompt: bool = False,
    base_url: Optional[str] = None,
    tp: int = 1,
    evalperf_type: Optional[str] = None,  # For EvalPerf
    jsonl_fmt: bool = True,
    attn_implementation: str = "eager",
    device_map: Optional[str] = None,
    trust_remote_code: bool = False,
    enable_prefix_caching: bool = False,
    enable_chunked_prefill: bool = False,
    dtype: str = "bfloat16",
    gptqmodel_backend: str = "auto",  # For GPTQModel
    lang: str = "en",
):
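    """Drive end-to-end generation: resolve the dataset, build the model
    runner via `make_model`, and run `codegen`, returning the path where
    solutions are written."""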
    assert dataset in ["humaneval", "mbpp", "evalperf"], f"Invalid dataset {dataset}"
    assert evalperf_type is None or evalperf_type in [
        "instruct",
        "perf-instruct",
        "perf-CoT",
    ]

    # Make a directory for the code generated by each model
    identifier = model.strip("./").replace("/", "--") + f"_{backend}_temp_{temperature}"
    if evalperf_type:
        identifier += f"-{evalperf_type}"

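    # Non-English runs get a language prefix in the output path.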
    target_path = (
        os.path.join(root, dataset, identifier)
        if lang == "en"
        else os.path.join(root, dataset, f"{lang}_{identifier}")
    )
    if jsonl_fmt:
        target_path += ".jsonl"
    else:
        os.makedirs(target_path, exist_ok=True)

if dataset == "humaneval":
|
||
|
dataset_dict = get_human_eval_plus(version=version, lang=lang)
|
||
|
elif dataset == "mbpp":
|
||
|
dataset_dict = get_mbpp_plus(version=version)
|
||
|
elif dataset == "evalperf":
|
||
|
original_dataset = {**get_human_eval_plus(), **get_mbpp_plus()}
|
||
|
dataset_dict = {k: original_dataset[k] for k in get_evalperf_data()}
|
||
|
assert id_range is None, "id_range not supported for evalperf"
|
||
|
else:
|
||
|
raise ValueError(f"Invalid dataset {dataset}")
|
||
|
|
||
|
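    # Skip generation entirely if the JSONL cache already holds at least
    # n_samples solutions for every task.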
    all_tasks_complete = False
    if jsonl_fmt and os.path.isfile(target_path):
        task_counts = {}
        with open(target_path, "r") as f:
            for line in f:
                if not line.strip():
                    continue
                data = json.loads(line)
                task_id = data["task_id"]
                task_counts[task_id] = task_counts.get(task_id, 0) + 1

        all_tasks_complete = all(
            task_counts.get(task_id, 0) >= n_samples
            for task_id in dataset_dict.keys()
        )

    if all_tasks_complete:
        print("All samples are already cached. Skipping codegen.")
        return target_path

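    # Greedy decoding implies exactly one deterministic sample per task.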
    if greedy and (temperature != 0 or bs != 1 or n_samples != 1):
        temperature = 0.0
        bs = 1
        n_samples = 1
        print("Greedy decoding ON (--greedy): setting bs=1, n_samples=1, temperature=0")

    if id_range is not None:
        assert len(id_range) == 2, "id_range must be a list of length 2"
        assert id_range[0] < id_range[1], "id_range must be increasing"
        id_range = tuple(id_range)

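    # Default batch size: the number of requested samples, capped at 32.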
    if bs is None:
        bs = min(n_samples, 32)
        print(f"Setting batch size to {bs}")

    # Make project dir
    os.makedirs(root, exist_ok=True)
    # Make dataset dir
    os.makedirs(os.path.join(root, dataset), exist_ok=True)

    # Model instructions
    instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:"
    response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:"

    if evalperf_type == "perf-instruct":
        instruction_prefix = "Please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
        response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
    elif evalperf_type == "perf-CoT":
        instruction_prefix = "Think step by step: please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
        response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
    elif evalperf_type is not None and evalperf_type != "instruct":
        raise ValueError(f"Invalid evalperf_type: {evalperf_type}")

    # Model creation
    model_runner = make_model(
        model=model,
        backend=backend,
        batch_size=bs,
        temperature=temperature,
        force_base_prompt=force_base_prompt,
        dataset=dataset,
        base_url=base_url,
        tp=tp,
        instruction_prefix=instruction_prefix,
        response_prefix=response_prefix,
        device_map=device_map,
        attn_implementation=attn_implementation,
        trust_remote_code=trust_remote_code,
        enable_prefix_caching=enable_prefix_caching,
        enable_chunked_prefill=enable_chunked_prefill,
        dtype=dtype,
        gptqmodel_backend=gptqmodel_backend,
    )

    codegen(
        target_path=target_path,
        dataset=dataset_dict,
        greedy=greedy,
        model=model_runner,
        n_samples=n_samples,
        resume=resume,
        id_range=id_range,
    )

    # force shutdown the model runner
    del model_runner
    import gc

    gc.collect()

    return target_path


def main():
    from fire import Fire

    Fire(run_codegen)


if __name__ == "__main__":
    main()
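
# Illustrative CLI usage (file name, model name, and flag values below are
# examples, not project defaults); Fire exposes every run_codegen keyword
# argument as a command-line flag of the same name:
#   python codegen.py --model "bigcode/starcoder2-7b" --dataset humaneval \
#       --backend vllm --greedy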