import json
import os
from typing import Dict, List, Optional

from .data import get_evalperf_data, get_human_eval_plus, get_mbpp_plus
from .provider import DecoderBase, make_model
from .sanitize import sanitize
from .utils import progress


def codegen(
    target_path: str,
    model: DecoderBase,
    dataset: Dict,
    greedy=False,
    n_samples=1,
    id_range=None,
    resume=True,
):
    # Count existing samples per task so generation can resume where it left off
    task2nexist = {}
    if resume and target_path.endswith(".jsonl") and os.path.isfile(target_path):
        with open(target_path, "r") as f:
            for line in f:
                if not line.strip():
                    continue
                task_id = json.loads(line)["task_id"]
                task2nexist[task_id] = task2nexist.get(task_id, 0) + 1

    if target_path.endswith(".jsonl"):
        raw_target_path = target_path.replace(".jsonl", ".raw.jsonl")
    else:
        raw_target_path = target_path + ".raw"
        os.makedirs(target_path, exist_ok=True)

    print(f"Sanitized code outputs will be saved to {target_path}")
    print(f"Raw outputs will be saved to {raw_target_path}")

    backend_type: str = type(model).__name__
    with progress(backend_type) as p:
        for task_id, task in p.track(dataset.items()):
            if id_range is not None:
                id_num = int(task_id.split("/")[1])
                low, high = id_range
                if id_num < low or id_num >= high:
                    p.console.print(f"Skipping {task_id} as it is not in {id_range}")
                    continue

            if not target_path.endswith(".jsonl"):
                p_name = task_id.replace("/", "_")
                os.makedirs(os.path.join(target_path, p_name), exist_ok=True)
                # Ensure the raw-output subdirectory exists before files are written into it
                os.makedirs(os.path.join(raw_target_path, p_name), exist_ok=True)
                task2nexist[task_id] = len(
                    [
                        f
                        for f in os.listdir(os.path.join(target_path, p_name))
                        if f.endswith(".py")
                    ]
                )

            n_more_samples = n_samples
            log = f"Codegen: {task_id} @ {model}"
            if resume and task2nexist.get(task_id, 0) > 0:
                log += f" (resuming from {task2nexist[task_id]})"
                n_more_samples -= task2nexist[task_id]

            p.console.print(log)

            sidx = n_samples - n_more_samples
            while sidx < n_samples:
                prompt = task["prompt"].strip() + "\n"
                outputs = model.codegen(
                    prompt,
                    do_sample=not greedy,
                    num_samples=n_samples - sidx,
                )
                assert outputs, "No outputs from model!"
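                # Persist each generated sample: JSONL mode appends one record per
                # solution; directory mode writes one .py file per sample index.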
                for impl in outputs:
                    solution = prompt + impl if model.is_direct_completion() else impl
                    sanitized_solution = sanitize(
                        solution, entrypoint=task["entry_point"]
                    )
                    if target_path.endswith(".jsonl"):
                        # Writing the sanitized version
                        with open(target_path, "a") as f:
                            f.write(
                                json.dumps(
                                    {"task_id": task_id, "solution": sanitized_solution}
                                )
                                + "\n"
                            )

                        # Writing the raw version
                        with open(raw_target_path, "a") as f:
                            f.write(
                                json.dumps({"task_id": task_id, "solution": solution})
                                + "\n"
                            )
                    else:
                        # Writing the sanitized version
                        with open(
                            os.path.join(target_path, p_name, f"{sidx}.py"),
                            "w",
                            encoding="utf-8",
                        ) as f:
                            f.write(sanitized_solution)

                        # Writing the raw version
                        with open(
                            os.path.join(raw_target_path, p_name, f"{sidx}.py"),
                            "w",
                            encoding="utf-8",
                        ) as f:
                            f.write(solution)

                    sidx += 1


def run_codegen(
    model: str,
    dataset: str,
    root: str = "evalplus_results",
    bs: Optional[int] = None,
    n_samples: int = 1,
    temperature: float = 0.0,
    resume: bool = True,
    greedy: bool = False,
    id_range: Optional[List] = None,
    version: str = "default",
    backend: str = "vllm",
    force_base_prompt: bool = False,
    base_url: Optional[str] = None,
    tp: int = 1,
    evalperf_type: Optional[str] = None,  # For EvalPerf
    jsonl_fmt: bool = True,
    attn_implementation: str = "eager",
    device_map: Optional[str] = None,
    trust_remote_code: bool = False,
    enable_prefix_caching: bool = False,
    enable_chunked_prefill: bool = False,
    dtype: str = "bfloat16",
    gptqmodel_backend: str = "auto",  # For GPTQModel
    lang: str = "en",
):
    assert dataset in ["humaneval", "mbpp", "evalperf"], f"Invalid dataset {dataset}"
    assert evalperf_type is None or evalperf_type in [
        "instruct",
        "perf-instruct",
        "perf-CoT",
    ]

    # Make a directory for the code generated by each model
    identifier = model.strip("./").replace("/", "--") + f"_{backend}_temp_{temperature}"
    if evalperf_type:
        identifier += f"-{evalperf_type}"

    target_path = (
        os.path.join(root, dataset, identifier)
        if lang == "en"
        else os.path.join(root, dataset, f"{lang}_{identifier}")
    )
    if jsonl_fmt:
        target_path += ".jsonl"
    else:
        os.makedirs(target_path, exist_ok=True)

    if dataset == "humaneval":
        dataset_dict = get_human_eval_plus(version=version, lang=lang)
    elif dataset == "mbpp":
        dataset_dict = get_mbpp_plus(version=version)
    elif dataset == "evalperf":
        original_dataset = {**get_human_eval_plus(), **get_mbpp_plus()}
        dataset_dict = {k: original_dataset[k] for k in get_evalperf_data()}
        assert id_range is None, "id_range not supported for evalperf"
    else:
        raise ValueError(f"Invalid dataset {dataset}")

    all_tasks_complete = False
    if jsonl_fmt and os.path.isfile(target_path):
        task_counts = {}
        with open(target_path, "r") as f:
            for line in f:
                if not line.strip():
                    continue
                data = json.loads(line)
                task_id = data["task_id"]
                task_counts[task_id] = task_counts.get(task_id, 0) + 1

        all_tasks_complete = all(
            task_counts.get(task_id, 0) >= n_samples
            for task_id in dataset_dict.keys()
        )
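    # Skip generation entirely when every task already has >= n_samples cached solutions.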
    if all_tasks_complete:
        print("All samples are already cached. Skipping codegen.")
        return target_path

    if greedy and (temperature != 0 or bs != 1 or n_samples != 1):
        temperature = 0.0
        bs = 1
        n_samples = 1
        print("Greedy decoding ON (--greedy): setting bs=1, n_samples=1, temperature=0")

    if id_range is not None:
        assert len(id_range) == 2, "id_range must be a list of length 2"
        assert id_range[0] < id_range[1], "id_range must be increasing"
        id_range = tuple(id_range)

    if bs is None:
        bs = min(n_samples, 32)
        print(f"Setting batch size to {bs}")

    # Make project dir
    os.makedirs(root, exist_ok=True)
    # Make dataset dir
    os.makedirs(os.path.join(root, dataset), exist_ok=True)

    # Model instructions
    instruction_prefix = "Please provide a self-contained Python script that solves the following problem in a markdown code block:"
    response_prefix = "Below is a Python script with a self-contained function that solves the problem and passes corresponding tests:"

    if evalperf_type == "perf-instruct":
        instruction_prefix = "Please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
        response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
    elif evalperf_type == "perf-CoT":
        instruction_prefix = "Think step by step: please provide an efficient and self-contained Python script that solves the following problem in a markdown code block:"
        response_prefix = "Below is a Python script with a self-contained function that efficiently solves the problem and passes corresponding tests:"
    elif evalperf_type is not None and evalperf_type != "instruct":
        raise ValueError(f"Invalid evalperf_type: {evalperf_type}")

    # Model creation
    model_runner = make_model(
        model=model,
        backend=backend,
        batch_size=bs,
        temperature=temperature,
        force_base_prompt=force_base_prompt,
        dataset=dataset,
        base_url=base_url,
        tp=tp,
        instruction_prefix=instruction_prefix,
        response_prefix=response_prefix,
        device_map=device_map,
        attn_implementation=attn_implementation,
        trust_remote_code=trust_remote_code,
        enable_prefix_caching=enable_prefix_caching,
        enable_chunked_prefill=enable_chunked_prefill,
        dtype=dtype,
        gptqmodel_backend=gptqmodel_backend,
    )

    codegen(
        target_path=target_path,
        dataset=dataset_dict,
        greedy=greedy,
        model=model_runner,
        n_samples=n_samples,
        resume=resume,
        id_range=id_range,
    )

    # Force shutdown the model runner
    del model_runner
    import gc

    gc.collect()

    return target_path


def main():
    from fire import Fire

    Fire(run_codegen)


if __name__ == "__main__":
    main()
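
# Example invocation sketch (hypothetical model name and module path; Fire maps
# run_codegen's keyword arguments to CLI flags of the same name):
#
#   python -m evalplus.codegen \
#       --model "bigcode/starcoder2-3b" \
#       --dataset humaneval \
#       --backend vllm \
#       --greedy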