[Update] Fix LLM Judge metrics calculation & Add reasoning content concat to OpenAI SDK

This commit is contained in:
Linchen Xiao 2025-04-15 11:33:16 +08:00 committed by GitHub
parent 75e7834b59
commit 65ff602cf5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 28 additions and 23 deletions

View File

@ -37,7 +37,6 @@ def get_final_results(judged_answers,
is_correct = is_correct_count / count
is_incorrect = is_incorrect_count / count
is_given_attempted = is_correct + is_incorrect
loose_accuracy = is_correct / count
accuracy_given_attempted = (is_correct / is_given_attempted
if is_given_attempted > 0 else 0)
attempted_judge_ratio = attempted_judge_count / count
@ -46,7 +45,7 @@ def get_final_results(judged_answers,
(accuracy_given_attempted + is_correct) if
(accuracy_given_attempted + is_correct) > 0 else 0)
result = {
metric_name: loose_accuracy * 100,
metric_name: is_correct * 100,
f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
'f1': f1,
'attempted_ratio': attempted_judge_ratio * 100,

View File

@ -531,8 +531,7 @@ class OpenAI(BaseAPIModel):
class OpenAISDK(OpenAI):
def __init__(
self,
def __init__(self,
path: str = 'gpt-3.5-turbo',
max_seq_len: int = 16384,
query_per_second: int = 1,
@ -551,7 +550,7 @@ class OpenAISDK(OpenAI):
extra_body: Dict | None = None,
verbose: bool = False,
status_code_mappings: dict = {},
):
think_tag: str = '</think>'):
super().__init__(
path,
max_seq_len,
@ -596,6 +595,7 @@ class OpenAISDK(OpenAI):
if self.verbose:
self.logger.info(f'Used openai_client: {self.openai_client}')
self.status_code_mappings = status_code_mappings
self.think_tag = think_tag
def _generate(self,
input: PromptList | str,
@ -670,6 +670,12 @@ class OpenAISDK(OpenAI):
num_retries += 1
# Continue to retry instead of returning empty response
continue
# If the model has reasoning_content, concat it
# with the content
if hasattr(responses.choices[0].message, 'reasoning_content'):
return (responses.choices[0].message.reasoning_content +
self.think_tag +
responses.choices[0].message.content)
return responses.choices[0].message.content