mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Update] Fix LLM Judge metrics calculation & Add reasoning content concat to OpenAI SDK
This commit is contained in:
parent
75e7834b59
commit
65ff602cf5
@ -37,7 +37,6 @@ def get_final_results(judged_answers,
|
|||||||
is_correct = is_correct_count / count
|
is_correct = is_correct_count / count
|
||||||
is_incorrect = is_incorrect_count / count
|
is_incorrect = is_incorrect_count / count
|
||||||
is_given_attempted = is_correct + is_incorrect
|
is_given_attempted = is_correct + is_incorrect
|
||||||
loose_accuracy = is_correct / count
|
|
||||||
accuracy_given_attempted = (is_correct / is_given_attempted
|
accuracy_given_attempted = (is_correct / is_given_attempted
|
||||||
if is_given_attempted > 0 else 0)
|
if is_given_attempted > 0 else 0)
|
||||||
attempted_judge_ratio = attempted_judge_count / count
|
attempted_judge_ratio = attempted_judge_count / count
|
||||||
@ -46,7 +45,7 @@ def get_final_results(judged_answers,
|
|||||||
(accuracy_given_attempted + is_correct) if
|
(accuracy_given_attempted + is_correct) if
|
||||||
(accuracy_given_attempted + is_correct) > 0 else 0)
|
(accuracy_given_attempted + is_correct) > 0 else 0)
|
||||||
result = {
|
result = {
|
||||||
metric_name: loose_accuracy * 100,
|
metric_name: is_correct * 100,
|
||||||
f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
|
f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
|
||||||
'f1': f1,
|
'f1': f1,
|
||||||
'attempted_ratio': attempted_judge_ratio * 100,
|
'attempted_ratio': attempted_judge_ratio * 100,
|
||||||
|
@ -531,27 +531,26 @@ class OpenAI(BaseAPIModel):
|
|||||||
|
|
||||||
class OpenAISDK(OpenAI):
|
class OpenAISDK(OpenAI):
|
||||||
|
|
||||||
def __init__(
|
def __init__(self,
|
||||||
self,
|
path: str = 'gpt-3.5-turbo',
|
||||||
path: str = 'gpt-3.5-turbo',
|
max_seq_len: int = 16384,
|
||||||
max_seq_len: int = 16384,
|
query_per_second: int = 1,
|
||||||
query_per_second: int = 1,
|
rpm_verbose: bool = False,
|
||||||
rpm_verbose: bool = False,
|
retry: int = 2,
|
||||||
retry: int = 2,
|
key: str | List[str] = 'ENV',
|
||||||
key: str | List[str] = 'ENV',
|
org: str | List[str] | None = None,
|
||||||
org: str | List[str] | None = None,
|
meta_template: Dict | None = None,
|
||||||
meta_template: Dict | None = None,
|
openai_api_base: str | List[str] = OPENAISDK_API_BASE,
|
||||||
openai_api_base: str | List[str] = OPENAISDK_API_BASE,
|
openai_proxy_url: Optional[str] = None,
|
||||||
openai_proxy_url: Optional[str] = None,
|
mode: str = 'none',
|
||||||
mode: str = 'none',
|
logprobs: bool | None = False,
|
||||||
logprobs: bool | None = False,
|
top_logprobs: int | None = None,
|
||||||
top_logprobs: int | None = None,
|
temperature: float | None = None,
|
||||||
temperature: float | None = None,
|
tokenizer_path: str | None = None,
|
||||||
tokenizer_path: str | None = None,
|
extra_body: Dict | None = None,
|
||||||
extra_body: Dict | None = None,
|
verbose: bool = False,
|
||||||
verbose: bool = False,
|
status_code_mappings: dict = {},
|
||||||
status_code_mappings: dict = {},
|
think_tag: str = '</think>'):
|
||||||
):
|
|
||||||
super().__init__(
|
super().__init__(
|
||||||
path,
|
path,
|
||||||
max_seq_len,
|
max_seq_len,
|
||||||
@ -596,6 +595,7 @@ class OpenAISDK(OpenAI):
|
|||||||
if self.verbose:
|
if self.verbose:
|
||||||
self.logger.info(f'Used openai_client: {self.openai_client}')
|
self.logger.info(f'Used openai_client: {self.openai_client}')
|
||||||
self.status_code_mappings = status_code_mappings
|
self.status_code_mappings = status_code_mappings
|
||||||
|
self.think_tag = think_tag
|
||||||
|
|
||||||
def _generate(self,
|
def _generate(self,
|
||||||
input: PromptList | str,
|
input: PromptList | str,
|
||||||
@ -670,6 +670,12 @@ class OpenAISDK(OpenAI):
|
|||||||
num_retries += 1
|
num_retries += 1
|
||||||
# Continue to retry instead of returning empty response
|
# Continue to retry instead of returning empty response
|
||||||
continue
|
continue
|
||||||
|
# If the model has reasoning_content, concat it
|
||||||
|
# with the content
|
||||||
|
if hasattr(responses.choices[0].message, 'reasoning_content'):
|
||||||
|
return (responses.choices[0].message.reasoning_content +
|
||||||
|
self.think_tag +
|
||||||
|
responses.choices[0].message.content)
|
||||||
|
|
||||||
return responses.choices[0].message.content
|
return responses.choices[0].message.content
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user