diff --git a/opencompass/datasets/bigcodebench/bigcodebench.py b/opencompass/datasets/bigcodebench/bigcodebench.py
index f1109b1d..f347e9e2 100644
--- a/opencompass/datasets/bigcodebench/bigcodebench.py
+++ b/opencompass/datasets/bigcodebench/bigcodebench.py
@@ -121,8 +121,40 @@ class BigCodeBenchEvaluator(BaseEvaluator):
         logger.info('Start to extract code from predictions')
         sanitized_predictions = []
         for prediction, entrypoint in zip(predictions, entrypoints):
-            sanitized_prediction = extract_code_generation(
-                prediction, entrypoint=entrypoint)
+            try:
+                import signal
+                from contextlib import contextmanager
+
+                @contextmanager
+                def timeout_handler(seconds):
+
+                    def _handle_timeout(signum, frame):
+                        raise TimeoutError(f'Code extraction timed out '
+                                           f'after {seconds} seconds')
+
+                    original_handler = signal.signal(signal.SIGALRM,
+                                                     _handle_timeout)
+                    signal.alarm(seconds)
+                    try:
+                        yield
+                    finally:
+                        signal.alarm(0)
+                        signal.signal(signal.SIGALRM, original_handler)
+
+                with timeout_handler(10):
+                    sanitized_prediction = extract_code_generation(
+                        prediction, entrypoint=entrypoint)
+
+            except TimeoutError as e:
+                logger.warning(
+                    f'Code extraction timeout for entrypoint {entrypoint}: '
+                    f'{str(e)}')
+                sanitized_prediction = ''
+            except Exception as e:
+                logger.warning(
+                    f'Code extraction failed for entrypoint {entrypoint}: '
+                    f'{str(e)}')
+                sanitized_prediction = ''
             sanitized_predictions.append(sanitized_prediction)
 
         # Prepare for submission
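Note on the timeout guard added above: it relies on `signal.SIGALRM`, which exists only on POSIX and can only be installed from the main thread, so the guard cannot fire on Windows or inside worker threads. A minimal standalone sketch of the same pattern (illustration only, not part of the patch):

    import signal
    import time
    from contextlib import contextmanager


    @contextmanager
    def timeout_handler(seconds):
        """Raise TimeoutError if the wrapped block runs longer than `seconds`."""

        def _handle_timeout(signum, frame):
            raise TimeoutError(f'timed out after {seconds} seconds')

        # Install the handler and arm the alarm; restore both on exit so
        # the previous handler and any later alarms are unaffected.
        original_handler = signal.signal(signal.SIGALRM, _handle_timeout)
        signal.alarm(seconds)
        try:
            yield
        finally:
            signal.alarm(0)
            signal.signal(signal.SIGALRM, original_handler)


    try:
        with timeout_handler(2):
            time.sleep(5)  # stand-in for a slow extract_code_generation call
    except TimeoutError as e:
        print(e)  # -> timed out after 2 seconds

One design consequence: `signal.alarm` takes whole seconds and a process has a single alarm, so the guard should not be nested; the patch arms it once per prediction, which is safe.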
diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py
index 2781d160..d5ac02d8 100644
--- a/opencompass/models/openai_api.py
+++ b/opencompass/models/openai_api.py
@@ -25,12 +25,7 @@ OPENAI_API_BASE = os.path.join(
 OPENAISDK_API_BASE = os.environ.get('OPENAI_BASE_URL',
                                     'https://api.openai.com/v1/')
 
-O1_MODEL_LIST = [
-    'o1-preview-2024-09-12',
-    'o1-mini-2024-09-12',
-    'o1-preview',
-    'o1-mini',
-]
+O1_MODEL_LIST = ['o1', 'o3']
 
 
 @MODELS.register_module()
@@ -96,7 +91,6 @@ class OpenAI(BaseAPIModel):
                  temperature: Optional[float] = None,
                  tokenizer_path: Optional[str] = None,
                  extra_body: Optional[Dict] = None,
-                 max_completion_tokens: int = 16384,
                  verbose: bool = False,
     ):
 
@@ -151,9 +145,6 @@ class OpenAI(BaseAPIModel):
         self.proxy_url = openai_proxy_url
 
         self.path = path
-        self.max_completion_tokens = max_completion_tokens
-        self.logger.warning(
-            f'Max Completion tokens for {path} is {max_completion_tokens}')
 
     def generate(
         self,
@@ -250,16 +241,15 @@ class OpenAI(BaseAPIModel):
                 header['OpenAI-Organization'] = self.orgs[self.org_ctr]
 
             try:
-                if self.path in O1_MODEL_LIST:
+                if any(model in self.path for model in O1_MODEL_LIST):
                     self.logger.warning(
                         f"'max_token' is unsupported for model {self.path}")
                     self.logger.warning(
-                        f'We use max_completion_tokens: '
-                        f'{self.max_completion_tokens}for this query')
+                        f'We use max_out_len: {max_out_len} for this query')
                     data = dict(
                         model=self.path,
                         messages=messages,
-                        max_completion_tokens=self.max_completion_tokens,
+                        max_completion_tokens=max_out_len,
                         n=1,
                         logprobs=self.logprobs,
                         top_logprobs=self.top_logprobs,
@@ -440,7 +430,7 @@ class OpenAI(BaseAPIModel):
             if mode == 'front':
                 cur_prompt = sep.join(words[-mid:])
             elif mode == 'mid':
-                cur_prompt = (sep.join(words[:mid]) + sep.join(words[-mid:]))
+                cur_prompt = sep.join(words[:mid]) + sep.join(words[-mid:])
             elif mode == 'rear':
                 cur_prompt = sep.join(words[:mid])
@@ -480,7 +470,9 @@ class OpenAI(BaseAPIModel):
         """
         # Check input length when mode is 'none'
         if mode == 'none':
-            input_len = get_token_len_func(str(input))
+            input_len = (get_token_len_func(input) if isinstance(
+                input, str) else sum(
+                    get_token_len_func(item['prompt']) for item in input))
             if input_len > max_seq_len:
                 raise ValueError(
                     f'Input length ({input_len}) exceeds max_seq_len '
@@ -499,12 +491,15 @@ class OpenAI(BaseAPIModel):
         # Convert input to messages format
         if isinstance(input, str):
             messages = [{'role': 'user', 'content': input}]
+            input_len = get_token_len_func(input)
         else:
             messages = []
+            processed_prompts = []
             for item in input:
                 input_content = item['prompt']
                 if mode != 'none':
                     input_content = bin_trim_wrapper(input_content)
+                processed_prompts.append(input_content)
                 msg = {'content': input_content}
                 if item['role'] == 'HUMAN':
                     msg['role'] = 'user'
                 elif item['role'] == 'BOT':
                     msg['role'] = 'assistant'
                 elif item['role'] == 'SYSTEM':
                     msg['role'] = 'system'
                 messages.append(msg)
+            input_len = sum(
+                get_token_len_func(prompt) for prompt in processed_prompts)
 
         # Adjust max_out_len
         if max_out_len is not None:
             original_max_out_len = max_out_len
-            max_out_len = min(
-                max_out_len,
-                max_seq_len - get_token_len_func(str(input)) - 100)
+            max_out_len = min(max_out_len, max_seq_len - input_len - 100)
             if max_out_len <= 0:
                 raise ValueError(
                     f'max_out_len ({max_out_len}) is less than or equal to 0. '
-                    f'This may be due to input length '
-                    f'({get_token_len_func(str(input))}) being too close to '
-                    f'max_seq_len ({max_seq_len}). Please either increase '
+                    f'This may be due to input length ({input_len}) being too '
+                    f'close to max_seq_len ({max_seq_len}). Please increase '
                     f'max_seq_len or use a truncation mode other than "none".')
             if max_out_len < original_max_out_len:
                 self.logger.warning(
@@ -555,7 +549,6 @@ class OpenAISDK(OpenAI):
                  temperature: float | None = None,
                  tokenizer_path: str | None = None,
                  extra_body: Dict | None = None,
-                 max_completion_tokens: int = 16384,
                  verbose: bool = False,
                  status_code_mappings: dict = {},
     ):
@@ -577,7 +570,6 @@ class OpenAISDK(OpenAI):
             tokenizer_path,
             extra_body,
             verbose=verbose,
-            max_completion_tokens=max_completion_tokens,
         )
 
         from openai import OpenAI
@@ -605,8 +597,23 @@ class OpenAISDK(OpenAI):
             self.logger.info(f'Used openai_client: {self.openai_client}')
         self.status_code_mappings = status_code_mappings
 
-    def _generate(self, input: PromptList | str, max_out_len: int,
-                  temperature: float) -> str:
+    def _generate(self,
+                  input: PromptList | str,
+                  max_out_len: int,
+                  temperature: float,
+                  timeout: int = 3600) -> str:
+        """Generate results given a list of inputs.
+
+        Args:
+            input (PromptType): A string or PromptDict.
+            max_out_len (int): The maximum length of the output.
+            temperature (float): What sampling temperature to use.
+            timeout (int, optional): Timeout in seconds for the API call.
+                Defaults to 3600 (60 minutes).
+
+        Returns:
+            str: The generated string.
+        """
         from openai import APIStatusError, BadRequestError
 
         assert isinstance(input, (str, PromptList))
@@ -618,16 +625,14 @@ class OpenAISDK(OpenAI):
             num_retries = 0
         while num_retries < self.retry:
             self.wait()
-
-            if self.path in O1_MODEL_LIST:
+            if any(model in self.path for model in O1_MODEL_LIST):
                 self.logger.warning(
                     f"'max_token' is unsupported for model {self.path}")
                 self.logger.warning(
-                    f'We use max_completion_tokens: '
-                    f'{self.max_completion_tokens}for this query')
+                    f'We use max_out_len: {max_out_len} for this query')
                 query_data = dict(
                     model=self.path,
-                    max_completion_tokens=self.max_completion_tokens,
+                    max_completion_tokens=max_out_len,
                     n=1,
                     messages=messages,
                     extra_body=self.extra_body,
@@ -646,7 +651,8 @@ class OpenAISDK(OpenAI):
                 if self.verbose:
                     self.logger.info('Start calling OpenAI API')
                 responses = self.openai_client.chat.completions.create(
-                    **query_data)
+                    **query_data, timeout=timeout)  # timeout in seconds
+
                 if self.verbose:
                     self.logger.info(
                         'Successfully get response from OpenAI API')
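The `input_len` rework above changes what gets counted against the context window: previously `get_token_len_func(str(input))` tokenized the repr of the whole `PromptList`, so dict syntax such as `{'role': ...}` inflated the measured prompt length; now only the `prompt` fields are summed. A toy walk-through of the clamping arithmetic, using a hypothetical whitespace tokenizer in place of the real `get_token_len_func`:

    def get_token_len_func(text: str) -> int:
        # Stand-in tokenizer for illustration: one token per word.
        # The real code calls the model's tokenizer behind this name.
        return len(text.split())


    max_seq_len = 4096
    max_out_len = 8192
    prompt_list = [
        {'role': 'HUMAN', 'prompt': 'word ' * 50},  # 50 tokens
        {'role': 'BOT', 'prompt': 'word ' * 10},    # 10 tokens
    ]

    # New behaviour: sum only the prompt fields (60 tokens here).
    input_len = sum(get_token_len_func(item['prompt']) for item in prompt_list)

    # Clamp the output budget, keeping the patch's 100-token safety margin.
    max_out_len = min(max_out_len, max_seq_len - input_len - 100)
    print(input_len, max_out_len)  # 60 3936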
+ """ from openai import APIStatusError, BadRequestError assert isinstance(input, (str, PromptList)) @@ -618,16 +625,14 @@ class OpenAISDK(OpenAI): num_retries = 0 while num_retries < self.retry: self.wait() - - if self.path in O1_MODEL_LIST: + if any(model in self.path for model in O1_MODEL_LIST): self.logger.warning( f"'max_token' is unsupported for model {self.path}") self.logger.warning( - f'We use max_completion_tokens: ' - f'{self.max_completion_tokens}for this query') + f'We use max_out_len: {max_out_len} for this query') query_data = dict( model=self.path, - max_completion_tokens=self.max_completion_tokens, + max_completion_tokens=max_out_len, n=1, messages=messages, extra_body=self.extra_body, @@ -646,7 +651,8 @@ class OpenAISDK(OpenAI): if self.verbose: self.logger.info('Start calling OpenAI API') responses = self.openai_client.chat.completions.create( - **query_data) + **query_data, timeout=timeout) # timeout in seconds + if self.verbose: self.logger.info( 'Successfully get response from OpenAI API') diff --git a/opencompass/summarizers/subjective/compassbench.py b/opencompass/summarizers/subjective/compassbench.py index 67c01243..7ed1ee53 100644 --- a/opencompass/summarizers/subjective/compassbench.py +++ b/opencompass/summarizers/subjective/compassbench.py @@ -34,39 +34,29 @@ MAP = { '总分', '中文总分', '英文总分', - 'instruct/compassbenchv1_4_IF_en_fofo_sub', - 'instruct/compassbenchv1_4_IF_zh_fofo_sub', + 'instruct/compassbench_2501_IF_en_chatIF_sub', + 'instruct/compassbench_2501_IF_en_functionalIF_sub', + 'instruct/compassbench_2501_IF_cn_chatIF_sub', + 'instruct/compassbench_2501_IF_cn_functionalIF_sub', ], 'language': [ '总分', '中文总分', '英文总分', - 'language/compassbenchv1_4_language_zh_chat_sub', - 'language/compassbenchv1_4_language_zh_creation_sub', - 'language/compassbenchv1_4_language_zh_NLP_sub', - 'language/compassbenchv1_4_language_en_chat_sub', - 'language/compassbenchv1_4_language_en_creation_sub', - 'language/compassbenchv1_4_language_en_NLP_sub', + 'language/compassbench_v2501_language_zh_chat_sub', + 'language/compassbench_v2501_language_zh_nlp_sub', + 'language/compassbench_v2501_language_zh_creation_sub', + 'language/compassbench_v2501_language_en_chat_sub', + 'language/compassbench_v2501_language_en_nlp_sub', + 'language/compassbench_v2501_language_en_creation_sub', ], - 'reasoning': [ + + 'code': [ '总分', '中文总分', '英文总分', - 'reasoning/compassbenchv1_4_reasoning_en_CommonSenseSense_sub', - 'reasoning/compassbenchv1_4_reasoning_en_Humanities_sub', - 'reasoning/compassbenchv1_4_reasoning_en_ScienceEngineering_sub', - 'reasoning/compassbenchv1_4_reasoning_en_Social_sub', - 'reasoning/compassbenchv1_4_reasoning_zh_CommonSenseSense_sub', - 'reasoning/compassbenchv1_4_reasoning_zh_Humanities_sub', - 'reasoning/compassbenchv1_4_reasoning_zh_ScienceEngineering_sub', - 'reasoning/compassbenchv1_4_reasoning_zh_Social_sub', - ], - 'coding': [ - '总分', - '中文总分', - '英文总分', - 'coding/compassbenchv1_4_coding_en_sub', - 'coding/compassbenchv1_4_coding_zh_sub', + 'code/compassbench_2501_code_arena_en_sub', + 'code/compassbench_2501_code_arena_zh_sub', ], }