From 65ff602cf556f59a279cf7a81ded95a9b37322d4 Mon Sep 17 00:00:00 2001
From: Linchen Xiao
Date: Tue, 15 Apr 2025 11:33:16 +0800
Subject: [PATCH] [Update] Fix LLM Judge metrics calculation & Add reasoning
 content concat to OpenAI SDK

---
 opencompass/datasets/generic.py  |  3 +-
 opencompass/models/openai_api.py | 48 ++++++++++++++++++--------------
 2 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/opencompass/datasets/generic.py b/opencompass/datasets/generic.py
index 07b6a0bb..deca2486 100644
--- a/opencompass/datasets/generic.py
+++ b/opencompass/datasets/generic.py
@@ -37,7 +37,6 @@ def get_final_results(judged_answers,
     is_correct = is_correct_count / count
     is_incorrect = is_incorrect_count / count
     is_given_attempted = is_correct + is_incorrect
-    loose_accuracy = is_correct / count
     accuracy_given_attempted = (is_correct / is_given_attempted
                                 if is_given_attempted > 0 else 0)
     attempted_judge_ratio = attempted_judge_count / count
@@ -46,7 +45,7 @@
           (accuracy_given_attempted + is_correct) if
           (accuracy_given_attempted + is_correct) > 0 else 0)
     result = {
-        metric_name: loose_accuracy * 100,
+        metric_name: is_correct * 100,
         f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
         'f1': f1,
         'attempted_ratio': attempted_judge_ratio * 100,
diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py
index 7b2c2c53..6ef11b8f 100644
--- a/opencompass/models/openai_api.py
+++ b/opencompass/models/openai_api.py
@@ -531,27 +531,26 @@ class OpenAI(BaseAPIModel):
 
 class OpenAISDK(OpenAI):
 
-    def __init__(
-        self,
-        path: str = 'gpt-3.5-turbo',
-        max_seq_len: int = 16384,
-        query_per_second: int = 1,
-        rpm_verbose: bool = False,
-        retry: int = 2,
-        key: str | List[str] = 'ENV',
-        org: str | List[str] | None = None,
-        meta_template: Dict | None = None,
-        openai_api_base: str | List[str] = OPENAISDK_API_BASE,
-        openai_proxy_url: Optional[str] = None,
-        mode: str = 'none',
-        logprobs: bool | None = False,
-        top_logprobs: int | None = None,
-        temperature: float | None = None,
-        tokenizer_path: str | None = None,
-        extra_body: Dict | None = None,
-        verbose: bool = False,
-        status_code_mappings: dict = {},
-    ):
+    def __init__(self,
+                 path: str = 'gpt-3.5-turbo',
+                 max_seq_len: int = 16384,
+                 query_per_second: int = 1,
+                 rpm_verbose: bool = False,
+                 retry: int = 2,
+                 key: str | List[str] = 'ENV',
+                 org: str | List[str] | None = None,
+                 meta_template: Dict | None = None,
+                 openai_api_base: str | List[str] = OPENAISDK_API_BASE,
+                 openai_proxy_url: Optional[str] = None,
+                 mode: str = 'none',
+                 logprobs: bool | None = False,
+                 top_logprobs: int | None = None,
+                 temperature: float | None = None,
+                 tokenizer_path: str | None = None,
+                 extra_body: Dict | None = None,
+                 verbose: bool = False,
+                 status_code_mappings: dict = {},
+                 think_tag: str = '</think>'):
         super().__init__(
             path,
             max_seq_len,
@@ -596,6 +595,7 @@ class OpenAISDK(OpenAI):
         if self.verbose:
             self.logger.info(f'Used openai_client: {self.openai_client}')
         self.status_code_mappings = status_code_mappings
+        self.think_tag = think_tag
 
     def _generate(self,
                   input: PromptList | str,
@@ -670,6 +670,12 @@ class OpenAISDK(OpenAI):
                     num_retries += 1
                     # Continue to retry instead of returning empty response
                     continue
+                # If the model has reasoning_content, concat it
+                # with the content
+                if hasattr(responses.choices[0].message, 'reasoning_content'):
+                    return (responses.choices[0].message.reasoning_content +
+                            self.think_tag +
+                            responses.choices[0].message.content)
                 return responses.choices[0].message.content