From 65ff602cf556f59a279cf7a81ded95a9b37322d4 Mon Sep 17 00:00:00 2001
From: Linchen Xiao
Date: Tue, 15 Apr 2025 11:33:16 +0800
Subject: [PATCH] [Update] Fix LLM Judge metrics calculation & Add reasoning
 content concat to OpenAI SDK

---
 opencompass/datasets/generic.py  |  3 +-
 opencompass/models/openai_api.py | 48 ++++++++++++++++++--------------
 2 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/opencompass/datasets/generic.py b/opencompass/datasets/generic.py
index 07b6a0bb..deca2486 100644
--- a/opencompass/datasets/generic.py
+++ b/opencompass/datasets/generic.py
@@ -37,7 +37,6 @@ def get_final_results(judged_answers,
     is_correct = is_correct_count / count
     is_incorrect = is_incorrect_count / count
     is_given_attempted = is_correct + is_incorrect
-    loose_accuracy = is_correct / count
     accuracy_given_attempted = (is_correct / is_given_attempted
                                 if is_given_attempted > 0 else 0)
     attempted_judge_ratio = attempted_judge_count / count
@@ -46,7 +45,7 @@
           (accuracy_given_attempted + is_correct) if
           (accuracy_given_attempted + is_correct) > 0 else 0)
     result = {
-        metric_name: loose_accuracy * 100,
+        metric_name: is_correct * 100,
         f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
         'f1': f1,
         'attempted_ratio': attempted_judge_ratio * 100,
diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py
index 7b2c2c53..6ef11b8f 100644
--- a/opencompass/models/openai_api.py
+++ b/opencompass/models/openai_api.py
@@ -531,27 +531,26 @@ class OpenAI(BaseAPIModel):
 
 class OpenAISDK(OpenAI):
 
-    def __init__(
-        self,
-        path: str = 'gpt-3.5-turbo',
-        max_seq_len: int = 16384,
-        query_per_second: int = 1,
-        rpm_verbose: bool = False,
-        retry: int = 2,
-        key: str | List[str] = 'ENV',
-        org: str | List[str] | None = None,
-        meta_template: Dict | None = None,
-        openai_api_base: str | List[str] = OPENAISDK_API_BASE,
-        openai_proxy_url: Optional[str] = None,
-        mode: str = 'none',
-        logprobs: bool | None = False,
-        top_logprobs: int | None = None,
-        temperature: float | None = None,
-        tokenizer_path: str | None = None,
-        extra_body: Dict | None = None,
-        verbose: bool = False,
-        status_code_mappings: dict = {},
-    ):
+    def __init__(self,
+                 path: str = 'gpt-3.5-turbo',
+                 max_seq_len: int = 16384,
+                 query_per_second: int = 1,
+                 rpm_verbose: bool = False,
+                 retry: int = 2,
+                 key: str | List[str] = 'ENV',
+                 org: str | List[str] | None = None,
+                 meta_template: Dict | None = None,
+                 openai_api_base: str | List[str] = OPENAISDK_API_BASE,
+                 openai_proxy_url: Optional[str] = None,
+                 mode: str = 'none',
+                 logprobs: bool | None = False,
+                 top_logprobs: int | None = None,
+                 temperature: float | None = None,
+                 tokenizer_path: str | None = None,
+                 extra_body: Dict | None = None,
+                 verbose: bool = False,
+                 status_code_mappings: dict = {},
+                 think_tag: str = '</think>'):
         super().__init__(
             path,
             max_seq_len,
@@ -596,6 +595,7 @@ class OpenAISDK(OpenAI):
         if self.verbose:
             self.logger.info(f'Used openai_client: {self.openai_client}')
         self.status_code_mappings = status_code_mappings
+        self.think_tag = think_tag
 
     def _generate(self,
                   input: PromptList | str,
@@ -670,6 +670,12 @@ class OpenAISDK(OpenAI):
                     num_retries += 1
                     # Continue to retry instead of returning empty response
                     continue
+                # If the model has reasoning_content, concat it
+                # with the content
+                if hasattr(responses.choices[0].message, 'reasoning_content'):
+                    return (responses.choices[0].message.reasoning_content +
+                            self.think_tag +
+                            responses.choices[0].message.content)
                 return responses.choices[0].message.content