# OpenCompass/opencompass/datasets/evalplus/perf/sas.py
"""This file implements the "Synthesizing an Synthesizer" idea using OpenAI API.
Specifically, for each HumanEval+ and MBPP+ task, we generate n test input synthesizers
by querying a vLLM server (https://docs.vllm.ai/en/latest/).
"""
import json
from typing import Optional

import openai
from tqdm import tqdm

from evalplus.data import get_human_eval_plus, get_mbpp_plus


def fewshot_cot(
    task_id,
    client: openai.OpenAI,
    entry_point: str,
    code: str,
    model: str,
    n: int = 1,
    max_tokens: int = 2048,
):
    responses = client.completions.create(
        model=model,
        prompt=f'''\
You are an AI programming assistant, proficient in analyzing and generating Python code. \
You are going to produce a self-contained Python function to generate a large input for a given function, \
to test its performance at scale.
### Instruction:
Generate a `perf_input_gen(scale: int)` function to produce a "large" input to exercise the performance of the `add` function:
```python3
def add(x: int, y: int):
    """Add two numbers x and y
    >>> add(2, 3)
    5
    >>> add(5, 7)
    12
    """
    return x + y
```
### Response:
Analysis:
1. Input format: two integers `x` and `y`
2. Is this task O(1) solvable? Yes
### Instruction:
Generate a `perf_input_gen(scale: int)` function to produce a "large" input to exercise the performance of the `prime_num` function:
```python3
"""
Write a function to check if a number is prime or not.
assert prime_num(2) == True
"""
import math
def prime_num(num):
    if num < 2: return False
    for i in range(2, math.isqrt(num) + 1):
        if num % i == 0:
            return False
    return True
```
### Response:
Analysis:
1. Input format: An integer `n`
2. Is this task O(1) solvable? No
3. Time complexity: O(n)
4. Space complexity: O(1)
5. What kind of input can exercise its performance? Large prime numbers
```python3
# Can reuse the `prime_num` function
# `scale` is a rough estimate of the input size -- larger `scale` means larger input
# use case: prime_num(*perf_input_gen(scale))
import random
def perf_input_gen(scale: int):
    for i in range(scale, 2, -1):
        if prime_num(i):
            return (i,)
    return (2,)
```
### Instruction:
Generate a `perf_input_gen(scale: int)` function to produce a "large" input to exercise the performance of the `{entry_point}` function:
```python3
{code}
```
### Response:
Analysis:
1. Input format: ''',
        n=n,
        # Stop either at the closing code fence, or early when the model
        # declares the task O(1)-solvable (mirroring the first few-shot example).
        stop=["\n```\n", "\n2. Is this task O(1) solvable? Yes"],
        max_tokens=max_tokens,
        temperature=0.2,
    )
    # warn if any response was truncated by the token limit
    for r in responses.choices:
        if r.finish_reason == "length":
            print(f"Warning: response is too long for {task_id}")
    return [r.text for r in responses.choices]
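

# A minimal, hypothetical helper sketch (not part of the original pipeline):
# each completion returned by `fewshot_cot` continues the "Analysis:" section
# and, for non-O(1) tasks, emits a ```python3 block containing the generated
# `perf_input_gen`. This extracts that block, assuming the fence layout of the
# few-shot prompt above.
def extract_synthesizer_code(response: str) -> Optional[str]:
    marker = "```python3"
    if marker not in response:
        return None  # e.g. the task was declared O(1) solvable
    code = response.split(marker, 1)[1]
    # Drop the trailing fence if the model closed it before a stop token fired.
    return code.split("```", 1)[0].strip()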


def main(
    output: str,  # output file
    n: int = 16,  # sample size and batch size
    model: Optional[str] = "TheBloke/deepseek-coder-33B-instruct-AWQ",
    port: int = 8088,
):
    assert output.endswith(".jsonl"), "output must be a .jsonl file"

    base_url = f"http://localhost:{port}/v1"
    print(f"Trying to query vLLM model: {model} at {base_url}")
    print(f"Note: To use SaS, you need to first set up a vLLM server for {model}")
    print("For example:")
    print(
        f"""python -m vllm.entrypoints.openai.api_server \\
    --model "{model}" \\
    --port {port} \\
    --tensor-parallel-size 2 \\
    --max-num-seqs 16 \\
    --gpu-memory-utilization 1.0"""
    )
# "task_id" -> { "task_id", "entry_point", "ref_code", }
tasks = {}
for task_id, item in get_human_eval_plus().items():
tasks[task_id] = {
"task_id": task_id,
"entry_point": item["entry_point"],
"ref_code": item["prompt"] + item["canonical_solution"],
}
for task_id, item in get_mbpp_plus().items():
tasks[task_id] = {
"task_id": task_id,
"entry_point": item["entry_point"],
"ref_code": item["prompt"] + item["canonical_solution"],
}
    # vLLM is used as the backend; make sure a vLLM server is available first.
    # vLLM documentation: https://docs.vllm.ai/en/latest/
    client = openai.OpenAI(api_key="none", base_url=base_url)
    with open(output, "w") as f:
        for task_id, item in tqdm(tasks.items(), total=len(tasks)):
            responses = fewshot_cot(
                task_id=task_id,
                client=client,
                entry_point=item["entry_point"],
                code=item["ref_code"],
                model=model,
                n=n,
            )
            f.write(
                json.dumps(
                    {
                        "task_id": task_id,
                        "ref_code": item["ref_code"],
                        "synthesizers": responses,
                    }
                )
                + "\n"
            )
            f.flush()
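

# A small, hypothetical companion sketch (not in the original file) showing how
# the emitted JSONL could be read back: one record per task, mapping "task_id"
# to the list of n raw synthesizer completions.
def load_synthesizers(path: str) -> dict:
    task_to_synthesizers = {}
    with open(path) as f:
        for line in f:
            record = json.loads(line)
            task_to_synthesizers[record["task_id"]] = record["synthesizers"]
    return task_to_synthesizers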


if __name__ == "__main__":
    import fire

    fire.Fire(main)