[Update] Fix LLM Judge metrics calculation & Add reasoning content concat to OpenAI SDK

Linchen Xiao authored on 2025-04-15 11:33:16 +08:00 (committed by GitHub)
parent 75e7834b59
commit 65ff602cf5
2 changed files with 28 additions and 23 deletions


@@ -37,7 +37,6 @@ def get_final_results(judged_answers,
     is_correct = is_correct_count / count
     is_incorrect = is_incorrect_count / count
     is_given_attempted = is_correct + is_incorrect
-    loose_accuracy = is_correct / count
     accuracy_given_attempted = (is_correct / is_given_attempted
                                 if is_given_attempted > 0 else 0)
     attempted_judge_ratio = attempted_judge_count / count
@@ -46,7 +45,7 @@ def get_final_results(judged_answers,
          (accuracy_given_attempted + is_correct) if
          (accuracy_given_attempted + is_correct) > 0 else 0)
     result = {
-        metric_name: loose_accuracy * 100,
+        metric_name: is_correct * 100,
         f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
        'f1': f1,
        'attempted_ratio': attempted_judge_ratio * 100,
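
For context: `is_correct` is already the ratio `is_correct_count / count`, so the removed `loose_accuracy = is_correct / count` divided by `count` a second time and under-reported the headline metric. A minimal sketch of the arithmetic, with variable names taken from the diff and the surrounding `get_final_results` logic omitted:

```python
# Minimal sketch of the bug fixed above; names are taken from the diff.
count = 50                 # total judged answers
is_correct_count = 30      # answers the judge marked correct

is_correct = is_correct_count / count  # 0.6 -- already a ratio

# Old (buggy): divides the ratio by count a second time.
loose_accuracy = is_correct / count
print(loose_accuracy * 100)  # 1.2 -- wildly under-reported

# New (fixed): report the ratio directly.
print(is_correct * 100)      # 60.0
```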


@@ -531,27 +531,26 @@ class OpenAI(BaseAPIModel):
 
 class OpenAISDK(OpenAI):
 
-    def __init__(
-        self,
-        path: str = 'gpt-3.5-turbo',
-        max_seq_len: int = 16384,
-        query_per_second: int = 1,
-        rpm_verbose: bool = False,
-        retry: int = 2,
-        key: str | List[str] = 'ENV',
-        org: str | List[str] | None = None,
-        meta_template: Dict | None = None,
-        openai_api_base: str | List[str] = OPENAISDK_API_BASE,
-        openai_proxy_url: Optional[str] = None,
-        mode: str = 'none',
-        logprobs: bool | None = False,
-        top_logprobs: int | None = None,
-        temperature: float | None = None,
-        tokenizer_path: str | None = None,
-        extra_body: Dict | None = None,
-        verbose: bool = False,
-        status_code_mappings: dict = {},
-    ):
+    def __init__(self,
+                 path: str = 'gpt-3.5-turbo',
+                 max_seq_len: int = 16384,
+                 query_per_second: int = 1,
+                 rpm_verbose: bool = False,
+                 retry: int = 2,
+                 key: str | List[str] = 'ENV',
+                 org: str | List[str] | None = None,
+                 meta_template: Dict | None = None,
+                 openai_api_base: str | List[str] = OPENAISDK_API_BASE,
+                 openai_proxy_url: Optional[str] = None,
+                 mode: str = 'none',
+                 logprobs: bool | None = False,
+                 top_logprobs: int | None = None,
+                 temperature: float | None = None,
+                 tokenizer_path: str | None = None,
+                 extra_body: Dict | None = None,
+                 verbose: bool = False,
+                 status_code_mappings: dict = {},
+                 think_tag: str = '</think>'):
         super().__init__(
             path,
             max_seq_len,
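
The only functional change in the signature is the new `think_tag` keyword argument, defaulting to `'</think>'`; the rest is a formatting reflow. A hypothetical construction showing where it plugs in (the model path and API base below are placeholders, not values from the commit):

```python
# Hypothetical usage; path and openai_api_base are placeholder values.
model = OpenAISDK(
    path='deepseek-reasoner',
    openai_api_base='https://api.example.com/v1',
    key='ENV',             # read the API key from the environment
    think_tag='</think>',  # separator placed between reasoning and answer
)
```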
@@ -596,6 +595,7 @@ class OpenAISDK(OpenAI):
         if self.verbose:
             self.logger.info(f'Used openai_client: {self.openai_client}')
         self.status_code_mappings = status_code_mappings
+        self.think_tag = think_tag
 
     def _generate(self,
                   input: PromptList | str,
@@ -670,6 +670,12 @@ class OpenAISDK(OpenAI):
                     num_retries += 1
                     # Continue to retry instead of returning empty response
                     continue
 
+                # If the model has reasoning_content, concat it
+                # with the content
+                if hasattr(responses.choices[0].message, 'reasoning_content'):
+                    return (responses.choices[0].message.reasoning_content +
+                            self.think_tag +
+                            responses.choices[0].message.content)
                 return responses.choices[0].message.content
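
Some reasoning models served through OpenAI-compatible endpoints (DeepSeek-R1-style APIs, for instance) return the chain of thought in a separate `reasoning_content` field on the message. The branch above joins it to `content` with the configurable `think_tag`, so callers receive a single string they can later split on the tag. A standalone sketch of that behaviour, using a stand-in message object rather than a real SDK response:

```python
from types import SimpleNamespace

think_tag = '</think>'  # default value of the new think_tag argument

# Stand-in for responses.choices[0].message from an OpenAI-compatible SDK.
message = SimpleNamespace(
    reasoning_content='Compare the two candidate answers step by step...',
    content='The answer is B.',
)

# Mirrors the branch added in the diff above.
if hasattr(message, 'reasoning_content'):
    output = message.reasoning_content + think_tag + message.content
else:
    output = message.content

print(output)
# Compare the two candidate answers step by step...</think>The answer is B.
```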