[Update] Fix LLM Judge metrics calculation & Add reasoning content concat to OpenAI SDK

Linchen Xiao authored on 2025-04-15 11:33:16 +08:00 (committed by GitHub)
parent 75e7834b59
commit 65ff602cf5
2 changed files with 28 additions and 23 deletions


@@ -37,7 +37,6 @@ def get_final_results(judged_answers,
     is_correct = is_correct_count / count
     is_incorrect = is_incorrect_count / count
     is_given_attempted = is_correct + is_incorrect
-    loose_accuracy = is_correct / count
     accuracy_given_attempted = (is_correct / is_given_attempted
                                 if is_given_attempted > 0 else 0)
     attempted_judge_ratio = attempted_judge_count / count
@@ -46,7 +45,7 @@ def get_final_results(judged_answers,
          (accuracy_given_attempted + is_correct) if
          (accuracy_given_attempted + is_correct) > 0 else 0)
     result = {
-        metric_name: loose_accuracy * 100,
+        metric_name: is_correct * 100,
         f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
        'f1': f1,
        'attempted_ratio': attempted_judge_ratio * 100,
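
For context: `is_correct` is already the ratio `is_correct_count / count`, so the removed `loose_accuracy = is_correct / count` divided by `count` a second time and under-reported the headline metric. A minimal sketch of the arithmetic, with variable names taken from the diff and the surrounding `get_final_results` logic omitted:

```python
# Minimal sketch of the bug fixed above; names are taken from the diff.
count = 50                 # total judged answers
is_correct_count = 30      # answers the judge marked correct

is_correct = is_correct_count / count  # 0.6 -- already a ratio

# Old (buggy): divides the ratio by count a second time.
loose_accuracy = is_correct / count
print(loose_accuracy * 100)  # 1.2 -- wildly under-reported

# New (fixed): report the ratio directly.
print(is_correct * 100)      # 60.0
```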


@@ -531,27 +531,26 @@ class OpenAI(BaseAPIModel):
 
 class OpenAISDK(OpenAI):
 
-    def __init__(
-        self,
-        path: str = 'gpt-3.5-turbo',
-        max_seq_len: int = 16384,
-        query_per_second: int = 1,
-        rpm_verbose: bool = False,
-        retry: int = 2,
-        key: str | List[str] = 'ENV',
-        org: str | List[str] | None = None,
-        meta_template: Dict | None = None,
-        openai_api_base: str | List[str] = OPENAISDK_API_BASE,
-        openai_proxy_url: Optional[str] = None,
-        mode: str = 'none',
-        logprobs: bool | None = False,
-        top_logprobs: int | None = None,
-        temperature: float | None = None,
-        tokenizer_path: str | None = None,
-        extra_body: Dict | None = None,
-        verbose: bool = False,
-        status_code_mappings: dict = {},
-    ):
+    def __init__(self,
+                 path: str = 'gpt-3.5-turbo',
+                 max_seq_len: int = 16384,
+                 query_per_second: int = 1,
+                 rpm_verbose: bool = False,
+                 retry: int = 2,
+                 key: str | List[str] = 'ENV',
+                 org: str | List[str] | None = None,
+                 meta_template: Dict | None = None,
+                 openai_api_base: str | List[str] = OPENAISDK_API_BASE,
+                 openai_proxy_url: Optional[str] = None,
+                 mode: str = 'none',
+                 logprobs: bool | None = False,
+                 top_logprobs: int | None = None,
+                 temperature: float | None = None,
+                 tokenizer_path: str | None = None,
+                 extra_body: Dict | None = None,
+                 verbose: bool = False,
+                 status_code_mappings: dict = {},
+                 think_tag: str = '</think>'):
         super().__init__(
             path,
             max_seq_len,
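
The only functional change in the signature is the new `think_tag` keyword argument, defaulting to `'</think>'`; the rest is a formatting reflow. A hypothetical construction showing where it plugs in (the model path and API base below are placeholders, not values from the commit):

```python
# Hypothetical usage; path and openai_api_base are placeholder values.
model = OpenAISDK(
    path='deepseek-reasoner',
    openai_api_base='https://api.example.com/v1',
    key='ENV',             # read the API key from the environment
    think_tag='</think>',  # separator placed between reasoning and answer
)
```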
@@ -596,6 +595,7 @@ class OpenAISDK(OpenAI):
         if self.verbose:
             self.logger.info(f'Used openai_client: {self.openai_client}')
         self.status_code_mappings = status_code_mappings
+        self.think_tag = think_tag
 
     def _generate(self,
                   input: PromptList | str,
@@ -670,6 +670,12 @@ class OpenAISDK(OpenAI):
                     num_retries += 1
                     # Continue to retry instead of returning empty response
                     continue
 
+                # If the model has reasoning_content, concat it
+                # with the content
+                if hasattr(responses.choices[0].message, 'reasoning_content'):
+                    return (responses.choices[0].message.reasoning_content +
+                            self.think_tag +
+                            responses.choices[0].message.content)
                 return responses.choices[0].message.content
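
Some reasoning models served through OpenAI-compatible endpoints (DeepSeek-R1-style APIs, for instance) return the chain of thought in a separate `reasoning_content` field on the message. The branch above joins it to `content` with the configurable `think_tag`, so callers receive a single string they can later split on the tag. A standalone sketch of that behaviour, using a stand-in message object rather than a real SDK response:

```python
from types import SimpleNamespace

think_tag = '</think>'  # default value of the new think_tag argument

# Stand-in for responses.choices[0].message from an OpenAI-compatible SDK.
message = SimpleNamespace(
    reasoning_content='Compare the two candidate answers step by step...',
    content='The answer is B.',
)

# Mirrors the branch added in the diff above.
if hasattr(message, 'reasoning_content'):
    output = message.reasoning_content + think_tag + message.content
else:
    output = message.content

print(output)
# Compare the two candidate answers step by step...</think>The answer is B.
```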