# OpenCompass/opencompass/datasets/evalplus/codegen.py

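"""Code generation driver for the EvalPlus datasets vendored into OpenCompass.

Samples solutions for HumanEval+, MBPP+, or EvalPerf tasks from a configurable
decoder backend and stores both the raw completions and their sanitized
versions, either as JSONL files or as per-task directories of ``.py`` files.
"""
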
import json
import os
from typing import Dict, List, Optional

from .data import get_evalperf_data, get_human_eval_plus, get_mbpp_plus
from .provider import DecoderBase, make_model
from .sanitize import sanitize
from .utils import progress


def codegen(
target_path: str,
model: DecoderBase,
dataset: Dict,
    greedy: bool = False,
    n_samples: int = 1,
    id_range: Optional[tuple] = None,
    resume: bool = True,
):
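    """Generate and persist ``n_samples`` solutions for every task in ``dataset``.

    If ``target_path`` ends with ``.jsonl``, sanitized and raw solutions are
    appended to ``target_path`` and its ``.raw.jsonl`` sibling; otherwise one
    ``<sidx>.py`` file per sample is written into a per-task directory. With
    ``resume=True``, existing samples are counted and only the missing ones
    are generated. ``id_range`` optionally restricts generation to task ids
    in ``[low, high)``.
    """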
task2nexist = {}
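    # In JSONL mode, count existing samples per task so resuming can skip
    # work that is already done.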
if resume and target_path.endswith(".jsonl") and os.path.isfile(target_path):
with open(target_path, "r") as f:
for line in f:
if not line.strip():
continue
task_id = json.loads(line)["task_id"]
task2nexist[task_id] = task2nexist.get(task_id, 0) + 1
    if target_path.endswith(".jsonl"):
        raw_target_path = target_path.replace(".jsonl", ".raw.jsonl")
    else:
        raw_target_path = target_path + ".raw"
        # Directory mode: both output trees must exist before writing files.
        os.makedirs(target_path, exist_ok=True)
        os.makedirs(raw_target_path, exist_ok=True)
    print(f"Sanitized code outputs will be saved to {target_path}")
    print(f"Raw outputs will be saved to {raw_target_path}")
backend_type: str = type(model).__name__
with progress(backend_type) as p:
for task_id, task in p.track(dataset.items()):
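            # Honor the optional [low, high) task-id filter.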
if id_range is not None:
id_num = int(task_id.split("/")[1])
low, high = id_range
if id_num < low or id_num >= high:
p.console.print(f"Skipping {task_id} as it is not in {id_range}")
continue
            if not target_path.endswith(".jsonl"):
                p_name = task_id.replace("/", "_")
                os.makedirs(os.path.join(target_path, p_name), exist_ok=True)
                # The raw tree mirrors the sanitized one, so its per-task
                # directory must exist as well before any file is written.
                os.makedirs(os.path.join(raw_target_path, p_name), exist_ok=True)
task2nexist[task_id] = len(
[
f
for f in os.listdir(os.path.join(target_path, p_name))
if f.endswith(".py")
]
)
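            # When resuming, generate only the samples that are still missing.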
n_more_samples = n_samples
log = f"Codegen: {task_id} @ {model}"
if resume and task2nexist.get(task_id, 0) > 0:
log += f" (resuming from {task2nexist[task_id]})"
n_more_samples -= task2nexist[task_id]
p.console.print(log)
sidx = n_samples - n_more_samples
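            # Keep sampling until n_samples solutions exist for this task;
            # a backend batch may return fewer completions than requested.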
while sidx < n_samples:
prompt = task["prompt"].strip() + "\n"
outputs = model.codegen(
prompt,
do_sample=not greedy,
num_samples=n_samples - sidx,
)
assert outputs, "No outputs from model!"
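                # Direct-completion backends return only the continuation, so
                # the prompt is prepended; other backends return the full solution.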
for impl in outputs:
solution = prompt + impl if model.is_direct_completion() else impl
sanitized_solution = sanitize(
solution, entrypoint=task["entry_point"]
)
if target_path.endswith(".jsonl"):
# Writing the sanitized version
with open(target_path, "a") as f:
f.write(
json.dumps(
{"task_id": task_id, "solution": sanitized_solution}
)
+ "\n"
)
# Writing the raw version
with open(raw_target_path, "a") as f:
f.write(
json.dumps({"task_id": task_id, "solution": solution})
+ "\n"
)
else:
# Writing the sanitized version
with open(
os.path.join(target_path, p_name, f"{sidx}.py"),
"w",
encoding="utf-8",
) as f:
f.write(sanitized_solution)
# Writing the raw version
with open(
os.path.join(raw_target_path, p_name, f"{sidx}.py"),
"w",
encoding="utf-8",
) as f:
f.write(solution)
sidx += 1


def run_codegen(
model: str,
dataset: str,
root: str = "evalplus_results",
bs: Optional[int] = None,
n_samples: int = 1,
temperature: float = 0.0,
resume: bool = True,
greedy: bool = False,
    id_range: Optional[List] = None,
version: str = "default",
backend: str = "vllm",
force_base_prompt: bool = False,
    base_url: Optional[str] = None,
tp: int = 1,
    evalperf_type: Optional[str] = None,  # For EvalPerf
jsonl_fmt: bool = True,
attn_implementation: str = "eager",
device_map: Optional[str] = None,
trust_remote_code: bool = False,
enable_prefix_caching: bool = False,
enable_chunked_prefill: bool = False,
dtype: str = "bfloat16",
    gptqmodel_backend: str = "auto",  # For GPTQModel
    lang: str = "en",  # Dataset language variant, forwarded to the loaders
):
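    """Entry point: configure the decoder backend and generate samples.

    Returns the path holding the generated solutions: a ``.jsonl`` file when
    ``jsonl_fmt`` is True, otherwise a directory of per-task ``.py`` files.
    """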
assert dataset in ["humaneval", "mbpp", "evalperf"], f"Invalid dataset {dataset}"
assert evalperf_type is None or evalperf_type in [
"instruct",
"perf-instruct",
"perf-CoT",
]
    # Normalize sampling settings first, since temperature is baked into the
    # output path below.
    if greedy and (temperature != 0 or bs != 1 or n_samples != 1):
        temperature = 0.0
        bs = 1
        n_samples = 1
        print("Greedy decoding ON (--greedy): setting bs=1, n_samples=1, temperature=0")

    # Make dir for codes generated by each model
    identifier = model.strip("./").replace("/", "--") + f"_{backend}_temp_{temperature}"
    if evalperf_type:
        identifier += f"-{evalperf_type}"
    if lang == "en":
        target_path = os.path.join(root, dataset, identifier)
    else:
        target_path = os.path.join(root, dataset, f"{lang}_{identifier}")
    if jsonl_fmt:
        target_path += ".jsonl"
    else:
        os.makedirs(target_path, exist_ok=True)
if dataset == "humaneval":
dataset_dict = get_human_eval_plus(version=version, lang=lang)
elif dataset == "mbpp":
dataset_dict = get_mbpp_plus(version=version)
elif dataset == "evalperf":
original_dataset = {**get_human_eval_plus(), **get_mbpp_plus()}
dataset_dict = {k: original_dataset[k] for k in get_evalperf_data()}
assert id_range is None, "id_range not supported for evalperf"
else:
raise ValueError(f"Invalid dataset {dataset}")
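
    # Skip generation when the JSONL cache already holds at least n_samples
    # solutions for every task.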
all_tasks_complete = False
if jsonl_fmt and os.path.isfile(target_path):
task_counts = {}
with open(target_path, "r") as f:
for line in f:
if not line.strip():
continue
data = json.loads(line)
task_id = data["task_id"]
task_counts[task_id] = task_counts.get(task_id, 0) + 1
all_tasks_complete = all(
task_counts.get(task_id, 0) >= n_samples
for task_id in dataset_dict.keys()
)
if all_tasks_complete:
print("All samples are already cached. Skipping codegen.")
return target_path
if id_range is not None:
assert len(id_range) == 2, "id_range must be a list of length 2"
assert id_range[0] < id_range[1], "id_range must be increasing"
id_range = tuple(id_range)
if bs is None:
bs = min(n_samples, 32)
print(f"Setting batch size to {bs}")
# Make project dir
os.makedirs(root, exist_ok=True)
# Make dataset dir
os.makedirs(os.path.join(root, dataset), exist_ok=True)
# Model instructions
instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:"
response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:"
if evalperf_type == "perf-instruct":
instruction_prefix = "Please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
elif evalperf_type == "perf-CoT":
instruction_prefix = "Think step by step: please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
elif evalperf_type is not None and evalperf_type != "instruct":
raise ValueError(f"Invalid evalperf_type: {evalperf_type}")
# Model creation
model_runner = make_model(
model=model,
backend=backend,
batch_size=bs,
temperature=temperature,
force_base_prompt=force_base_prompt,
dataset=dataset,
base_url=base_url,
tp=tp,
instruction_prefix=instruction_prefix,
response_prefix=response_prefix,
device_map=device_map,
attn_implementation=attn_implementation,
trust_remote_code=trust_remote_code,
enable_prefix_caching=enable_prefix_caching,
enable_chunked_prefill=enable_chunked_prefill,
dtype=dtype,
gptqmodel_backend=gptqmodel_backend,
)
codegen(
target_path=target_path,
dataset=dataset_dict,
greedy=greedy,
model=model_runner,
n_samples=n_samples,
resume=resume,
id_range=id_range,
)
    # Force shutdown of the model runner so its resources can be reclaimed.
    del model_runner
    import gc

    gc.collect()
return target_path


def main():
    from fire import Fire

    Fire(run_codegen)


if __name__ == "__main__":
    main()