import json
import os
import time
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
from typing import Dict, List, Optional, Union

import requests

from opencompass.registry import MODELS
from opencompass.utils.prompt import PromptList

from .base_api import BaseAPIModel

PromptType = Union[PromptList, str]

OPENAI_API_BASE = 'https://api.openai.com/v1/chat/completions'


@MODELS.register_module()
class OpenAI(BaseAPIModel):
    """Model wrapper around OpenAI's models.

    Args:
        path (str): The name of OpenAI's model.
        max_seq_len (int): The maximum allowed sequence length of a model.
            Note that the length of prompt + generated tokens shall not
            exceed this value. Defaults to 4096.
        query_per_second (int): The maximum queries allowed per second
            between two consecutive calls of the API. Defaults to 1.
        retry (int): Number of retries if the API call fails. Defaults to 2.
        key (str or List[str]): OpenAI key(s). In particular, when it is set
            to "ENV", the key will be fetched from the environment variable
            $OPENAI_API_KEY, mirroring the openai library's default behavior.
            If it's a list, the keys will be used in a round-robin manner.
            Defaults to 'ENV'.
        org (str or List[str], optional): OpenAI organization(s). If not
            specified, OpenAI uses the default organization bound to each API
            key. If specified, the orgs will be posted with each request in a
            round-robin manner. Defaults to None.
        meta_template (Dict, optional): The model's meta prompt template, in
            case any meta instructions need to be injected or wrapped.
        openai_api_base (str): The base url of OpenAI's API. Defaults to
            'https://api.openai.com/v1/chat/completions'.
        temperature (float, optional): What sampling temperature to use.
            If not None, it overrides the temperature passed to `generate()`.
            Defaults to None.
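
    A minimal usage sketch (illustrative, not part of the original module;
    it assumes a valid $OPENAI_API_KEY in the environment):

        >>> model = OpenAI(path='gpt-3.5-turbo')
        >>> model.generate(['Hello!'], max_out_len=32)  # doctest: +SKIP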
    """

    is_api: bool = True

    def __init__(self,
                 path: str = 'gpt-3.5-turbo',
                 max_seq_len: int = 4096,
                 query_per_second: int = 1,
                 retry: int = 2,
                 key: Union[str, List[str]] = 'ENV',
                 org: Optional[Union[str, List[str]]] = None,
                 meta_template: Optional[Dict] = None,
                 openai_api_base: str = OPENAI_API_BASE,
                 temperature: Optional[float] = None):
        super().__init__(path=path,
                         max_seq_len=max_seq_len,
                         meta_template=meta_template,
                         query_per_second=query_per_second,
                         retry=retry)
        import tiktoken
        self.tiktoken = tiktoken
        self.temperature = temperature

        if isinstance(key, str):
            self.keys = [os.getenv('OPENAI_API_KEY') if key == 'ENV' else key]
        else:
            self.keys = key

        # record invalid keys and skip them when requesting API
        # - keys have insufficient_quota
        self.invalid_keys = set()

        # A single lock shared by all worker threads to guard key/org
        # rotation; constructing a fresh Lock() at each use site would
        # always acquire immediately and provide no mutual exclusion.
        self._lock = Lock()

        self.key_ctr = 0
        if isinstance(org, str):
            self.orgs = [org]
        else:
            self.orgs = org
        self.org_ctr = 0
        self.url = openai_api_base
        self.path = path

    def generate(
        self,
        inputs: List[PromptType],
        max_out_len: int = 512,
        temperature: float = 0.7,
    ) -> List[str]:
        """Generate results given a list of inputs.

        Args:
            inputs (List[PromptType]): A list of strings or PromptLists.
                Each PromptList should be organized in OpenCompass' API
                format.
            max_out_len (int): The maximum length of the output.
            temperature (float): What sampling temperature to use,
                between 0 and 2. Higher values like 0.8 will make the output
                more random, while lower values like 0.2 will make it more
                focused and deterministic. Defaults to 0.7.

        Returns:
            List[str]: A list of generated strings.
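
        Example (a hypothetical sketch of the two accepted input forms;
        the PromptList fields follow the role mapping in `_generate`):

            >>> model.generate(['a plain string prompt'])  # doctest: +SKIP
            >>> model.generate([PromptList([
            ...     {'role': 'SYSTEM', 'prompt': 'You are helpful.'},
            ...     {'role': 'HUMAN', 'prompt': 'Hi!'},
            ... ])])  # doctest: +SKIP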
        """
        if self.temperature is not None:
            temperature = self.temperature

        # Fan the inputs out across worker threads; each thread handles one
        # prompt end to end, including retries.
        with ThreadPoolExecutor() as executor:
            results = list(
                executor.map(self._generate, inputs,
                             [max_out_len] * len(inputs),
                             [temperature] * len(inputs)))
        return results

    def _generate(self, input: PromptType, max_out_len: int,
                  temperature: float) -> str:
        """Generate results given a single input.

        Args:
            input (PromptType): A string or PromptList. The PromptList
                should be organized in OpenCompass' API format.
            max_out_len (int): The maximum length of the output.
            temperature (float): What sampling temperature to use,
                between 0 and 2. Higher values like 0.8 will make the output
                more random, while lower values like 0.2 will make it more
                focused and deterministic.

        Returns:
            str: The generated string.
        """
        assert isinstance(input, (str, PromptList))

        if isinstance(input, str):
            messages = [{'role': 'user', 'content': input}]
        else:
            # Map OpenCompass roles onto OpenAI chat roles.
            messages = []
            for item in input:
                msg = {'content': item['prompt']}
                if item['role'] == 'HUMAN':
                    msg['role'] = 'user'
                elif item['role'] == 'BOT':
                    msg['role'] = 'assistant'
                elif item['role'] == 'SYSTEM':
                    msg['role'] = 'system'
                messages.append(msg)

        # max num token for gpt-3.5-turbo is 4097
        context_window = 4096
        if '32k' in self.path:
            context_window = 32768
        elif '16k' in self.path:
            context_window = 16384
        elif 'gpt-4' in self.path:
            context_window = 8192

        # Hold out 100 tokens due to potential errors in tiktoken calculation
        max_out_len = min(
            max_out_len,
            context_window - self.get_token_len(str(input)) - 100)
        if max_out_len <= 0:
            return ''
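
        # Worked example (hypothetical numbers): with a 4096-token window, a
        # 1000-token prompt leaves min(max_out_len, 4096 - 1000 - 100) output
        # tokens; a prompt longer than the window makes the budget
        # non-positive, hence the early return above.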

        max_num_retries = 0
        while max_num_retries < self.retry:
            self.wait()

            # Rotate to the next valid key under the shared lock; reading
            # the counter without it would race with other worker threads.
            with self._lock:
                if len(self.invalid_keys) == len(self.keys):
                    raise RuntimeError('All keys have insufficient quota.')

                # find the next valid key
                while True:
                    self.key_ctr += 1
                    if self.key_ctr == len(self.keys):
                        self.key_ctr = 0

                    if self.keys[self.key_ctr] not in self.invalid_keys:
                        break

                key = self.keys[self.key_ctr]

            header = {
                'Authorization': f'Bearer {key}',
                'content-type': 'application/json',
            }

            if self.orgs:
                with self._lock:
                    self.org_ctr += 1
                    if self.org_ctr == len(self.orgs):
                        self.org_ctr = 0
                    header['OpenAI-Organization'] = self.orgs[self.org_ctr]

            try:
                data = dict(
                    model=self.path,
                    messages=messages,
                    max_tokens=max_out_len,
                    n=1,
                    stop=None,
                    temperature=temperature,
                )
                raw_response = requests.post(self.url,
                                             headers=header,
                                             data=json.dumps(data))
            except requests.ConnectionError:
                self.logger.error('Got connection error, retrying...')
                continue
            try:
                response = raw_response.json()
            except requests.JSONDecodeError:
                self.logger.error(
                    f'JsonDecode error, got {raw_response.content}')
                continue
            try:
                return response['choices'][0]['message']['content'].strip()
            except KeyError:
                if 'error' in response:
                    if response['error']['code'] == 'rate_limit_exceeded':
                        time.sleep(1)
                        continue
                    elif response['error']['code'] == 'insufficient_quota':
                        self.invalid_keys.add(key)
                        self.logger.warning(f'insufficient_quota key: {key}')
                        continue

                    self.logger.error('Found error message in response: '
                                      f"{response['error']}")
            max_num_retries += 1

        raise RuntimeError('Calling OpenAI failed after retrying for '
                           f'{max_num_retries} times. Check the logs for '
                           'details.')

    def get_token_len(self, prompt: str) -> int:
        """Get the length of the tokenized string.

        The length is computed with tiktoken's encoding for the current
        model, so it matches how OpenAI counts prompt tokens.

        Args:
            prompt (str): Input string.

        Returns:
            int: Length of the input tokens.
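
        Example (illustrative; the exact count depends on the model's
        tokenizer):

            >>> model.get_token_len('Hello world')  # doctest: +SKIP
            2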
        """
        enc = self.tiktoken.encoding_for_model(self.path)
        return len(enc.encode(prompt))
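

if __name__ == '__main__':
    # A quick smoke-test sketch, not part of the original module. Because of
    # the relative import above, run it as a module (e.g. `python -m
    # opencompass.models.openai_api`, assuming that is this file's path).
    # It needs a valid OPENAI_API_KEY in the environment and network access.
    model = OpenAI(path='gpt-3.5-turbo')
    print(model.generate(['Say hello in one word.'], max_out_len=16)[0])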