[Update] Fix LLM Judge metrics calculation & Add reasoning content concat to OpenAI SDK

2025-05-30 16:03:24 +08:00 · 2025-04-15 11:33:16 +08:00 · 2025-04-15 11:33:16 +08:00 · 65ff602cf5
commit 65ff602cf5
parent 75e7834b59
2 changed files with 28 additions and 23 deletions
--- a/opencompass/datasets/generic.py
+++ b/opencompass/datasets/generic.py
@ -37,7 +37,6 @@ def get_final_results(judged_answers,
    is_correct = is_correct_count / count
    is_incorrect = is_incorrect_count / count
    is_given_attempted = is_correct + is_incorrect
-    loose_accuracy = is_correct / count
    accuracy_given_attempted = (is_correct / is_given_attempted
                                if is_given_attempted > 0 else 0)
    attempted_judge_ratio = attempted_judge_count / count
@ -46,7 +45,7 @@ def get_final_results(judged_answers,
          (accuracy_given_attempted + is_correct) if
          (accuracy_given_attempted + is_correct) > 0 else 0)
    result = {
-        metric_name: loose_accuracy * 100,
+        metric_name: is_correct * 100,
        f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
        'f1': f1,
        'attempted_ratio': attempted_judge_ratio * 100,
--- a/opencompass/models/openai_api.py
+++ b/opencompass/models/openai_api.py
@ -531,8 +531,7 @@ class OpenAI(BaseAPIModel):

 class OpenAISDK(OpenAI):

-    def __init__(
-        self,
+    def __init__(self,
                 path: str = 'gpt-3.5-turbo',
                 max_seq_len: int = 16384,
                 query_per_second: int = 1,
@ -551,7 +550,7 @@ class OpenAISDK(OpenAI):
                 extra_body: Dict | None = None,
                 verbose: bool = False,
                 status_code_mappings: dict = {},
-    ):
+                 think_tag: str = '</think>'):
        super().__init__(
            path,
            max_seq_len,
@ -596,6 +595,7 @@ class OpenAISDK(OpenAI):
        if self.verbose:
            self.logger.info(f'Used openai_client: {self.openai_client}')
        self.status_code_mappings = status_code_mappings
+        self.think_tag = think_tag

    def _generate(self,
                  input: PromptList | str,
@ -670,6 +670,12 @@ class OpenAISDK(OpenAI):
                    num_retries += 1
                    # Continue to retry instead of returning empty response
                    continue
+                # If the model has reasoning_content, concat it
+                # with the content
+                if hasattr(responses.choices[0].message, 'reasoning_content'):
+                    return (responses.choices[0].message.reasoning_content +
+                            self.think_tag +
+                            responses.choices[0].message.content)

                return responses.choices[0].message.content