mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
169 lines
5.0 KiB
Python
169 lines
5.0 KiB
Python
"""This file implements the "Synthesizing a Synthesizer" (SaS) idea using the OpenAI API.
|
|
Specifically, for each HumanEval+ and MBPP+ task, we generate n test input synthesizers
|
|
by querying a vLLM server (https://docs.vllm.ai/en/latest/).
|
|
"""
|
|
|
|
import json
|
|
from typing import Optional
|
|
|
|
import openai
|
|
from tqdm import tqdm
|
|
|
|
from evalplus.data import get_human_eval_plus, get_mbpp_plus
|
|
|
|
|
|
def fewshot_cot(
    task_id,
    client: openai.OpenAI,
    entry_point: str,
    code: str,
    model: str,
    n: int = 1,
    max_tokens: int = 2048,
) -> list:
    """Sample few-shot chain-of-thought input synthesizers for one task.

    A two-exemplar prompt is sent to the completions endpoint of `client`.
    The prompt ends right after ``1. Input format: ``, so each completion
    continues the analysis for `entry_point` and, following the second
    exemplar, is expected to end with a ``perf_input_gen(scale)`` definition.

    Args:
        task_id: Identifier of the task; only used in the truncation warning.
        client: OpenAI-compatible client used to issue the completion request.
        entry_point: Name of the function whose performance is to be exercised.
        code: Source code of that function, embedded verbatim in the prompt.
        model: Model name forwarded to the server.
        n: Number of completions to sample in this single request.
        max_tokens: Per-completion generation budget.

    Returns:
        A list with the raw text of every returned choice.
    """
    # NOTE(review): the `prime_num` exemplar loops over
    # `range(2, math.isqrt(num))`, which excludes the integer square root
    # itself (so e.g. 9 is never tested against 3). It is kept as-is so the
    # prompt stays comparable with previously generated data -- confirm
    # before changing.
    prompt = f'''\
You are an AI programming assistant, proficient in analyzing and generating Python code. \
You are going to produce a self-contained Python function to generate a large input for a given function, \
to test its performance at scale.
### Instruction:
Generate a `perf_input_gen(scale: int)` function to produce a "large" input to exercise the performance of the `add` function:
```python3
def add(x: int, y: int):
    """Add two numbers x and y
    >>> add(2, 3)
    5
    >>> add(5, 7)
    12
    """
    return x + y
```
### Response:
Analysis:
1. Input format: two integers `x` and `y`
2. Is this task O(1) solvable? Yes
### Instruction:
Generate a `perf_input_gen(scale: int)` function to produce a "large" input to exercise the performance of the `prime_num` function:
```python3
"""
Write a function to check if a number is prime or not.
assert prime_num(2) == True
"""
import math
def prime_num(num):
    if num < 2: return False
    for i in range(2, math.isqrt(num)):
        if num % i == 0:
            return False
    return True
```
### Response:
Analysis:
1. Input format: An integer `n`
2. Is this task O(1) solvable? No
3. Time complexity: O(n)
4. Space complexity: O(1)
5. What kind of input can exercise its performance? Large prime numbers
```python3
# Can reuse the `prime_num` function
# `scale` is a rough estimate of the input size -- larger `scale` means larger input
# use case: prime_num(*perf_input_gen(scale))
import random
def perf_input_gen(scale: int):
    for i in range(scale, 2, -1):
        if prime_num(i):
            return (i,)
    return (2,)
```
### Instruction:
Generate a `perf_input_gen(scale: int)` function to produce a "large" input to exercise the performance of the `{entry_point}` function:
```python3
{code}
```
### Response:
Analysis:
1. Input format: '''

    responses = client.completions.create(
        model=model,
        prompt=prompt,
        n=n,
        # Stop at the closing fence of the generated code block, or as soon as
        # the model declares the task O(1) solvable (the first exemplar ends
        # there without producing a generator).
        stop=["\n```\n", "\n2. Is this task O(1) solvable? Yes"],
        max_tokens=max_tokens,
        temperature=0.2,
    )

    # warn if any response is out of context
    for choice in responses.choices:
        if choice.finish_reason == "length":
            print(f"Warning: response is too long for {task_id}")

    return [choice.text for choice in responses.choices]
|
|
|
|
|
|
def main(
    output: str,  # output file
    n: int = 16,  # sample size and batch size
    model: Optional[str] = "TheBloke/deepseek-coder-33B-instruct-AWQ",
    port: int = 8088,
):
    """Generate input synthesizers for every HumanEval+ and MBPP+ task.

    For each task, `n` completions are requested through `fewshot_cot` from an
    OpenAI-compatible server at ``http://localhost:{port}/v1`` and one JSON
    line is appended to `output` with the keys ``task_id``, ``ref_code`` and
    ``synthesizers``.

    Args:
        output: Path of the result file; must end with ``.jsonl``. The file is
            opened in write mode, so existing content is overwritten.
        n: Number of completions sampled per task.
        model: Model name passed to the server with each request.
        port: Local port of the server.

    Raises:
        AssertionError: If `output` does not end with ``.jsonl``.
    """
    assert output.endswith(".jsonl"), "output must be a .jsonl file"

    base_url = f"http://localhost:{port}/v1"
    print(f"Trying to query vLLM model: {model} at {base_url}")
    print(f"Note: To use SaS, you need to first set up a vLLM server for {model}")
    print("For example:")
    print(
        f"""python -m vllm.entrypoints.openai.api_server \\
--model "{model}" \\
--port {port} \\
--tensor-parallel-size 2 \\
--max-num-seqs 16 \\
--gpu-memory-utilization 1.0"""
    )

    # "task_id" -> { "task_id", "entry_point", "ref_code", }
    # HumanEval+ is loaded first, then MBPP+; a later task_id would overwrite
    # an earlier one with the same key.
    tasks = {}
    for load_tasks in (get_human_eval_plus, get_mbpp_plus):
        for task_id, item in load_tasks().items():
            tasks[task_id] = {
                "task_id": task_id,
                "entry_point": item["entry_point"],
                "ref_code": item["prompt"] + item["canonical_solution"],
            }

    # Using vLLM as a backend, please make sure that a vLLM server is available first.
    # vLLM document: https://docs.vllm.ai/en/latest/
    client = openai.OpenAI(api_key="none", base_url=base_url)

    with open(output, "w", encoding="utf-8") as f:
        for task_id, item in tqdm(tasks.items(), total=len(tasks)):
            responses = fewshot_cot(
                task_id=task_id,
                client=client,
                entry_point=item["entry_point"],
                code=item["ref_code"],
                model=model,
                n=n,
            )
            record = {
                "task_id": task_id,
                "ref_code": item["ref_code"],
                "synthesizers": responses,
            }
            f.write(json.dumps(record) + "\n")
            # Flush after every task so finished results reach the file
            # even if a later request fails.
            f.flush()
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: hand `main` to Python Fire for command-line dispatch.
    from fire import Fire

    Fire(main)
|