Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Commit f2c17190c9 ("enable tested reasoning model"), parent 61ceb02c23.

This commit wires tested reasoning models (OpenAI o1-mini and DeepSeek-R1 served through Aliyun DashScope) into the HuLifeQA evaluation: it adds Aliyun API configs for DeepSeek-R1/V3, a return_reasoning_content option on the OpenAI/OpenAISDK wrappers that prepends the reasoning trace as <think>...</think>, and a rm_<think>_before_eval postprocessor that strips the trace before judging.
HuLifeQA eval entry config (filename not preserved in this mirror view):

@@ -1,5 +1,7 @@
 from mmengine.config import read_base
 
+from opencompass.utils.text_postprocessors import remove_reasoning_part_before_evaluation
+
 with read_base():
     from opencompass.configs.datasets.OpenHuEval.HuLifeQA import (
         hu_life_qa_datasets,
@@ -8,7 +10,7 @@ with read_base():
 
     from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
     from opencompass.configs.models.openai.gpt_4o_2024_11_20 import models as gpt_4o_20241120_model
-    from opencompass.configs.models.deepseek.deepseek_v3_api_siliconflow import models as deepseek_v3_api_siliconflow_model
+    from opencompass.configs.models.deepseek.deepseek_v3_api_aliyun import models as deepseek_v3_api_aliyun_model
 
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as lmdeploy_qwen2_5_72b_instruct_model
@@ -17,8 +19,9 @@ with read_base():
 
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model
 
+    from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
    from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
-    from opencompass.configs.models.deepseek.deepseek_r1_siliconflow import models as deepseek_r1_siliconflow_model
+    from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import models as deepseek_r1_api_aliyun_model
 
 from opencompass.models import OpenAI
 from opencompass.partitioners import (
@@ -36,10 +39,21 @@ api_meta_template = dict(round=[
     dict(role='BOT', api_role='BOT', generate=True),
 ])
 
+for model in deepseek_r1_api_aliyun_model:
+    model['return_reasoning_content'] = True
+    model['pred_postprocessor'] = {
+        'open_hu_eval_*': {
+            'type': 'rm_<think>_before_eval'
+        }
+    }
+del model
+
 models = [
-    # *gpt_4o_mini_20240718_model,
-    # *gpt_4o_20241120_model,
-    # *deepseek_v3_api_siliconflow_model,
+    *gpt_4o_mini_20240718_model,
+    *gpt_4o_20241120_model,
+    *o1_mini_2024_09_12_model,
+    *deepseek_v3_api_aliyun_model,
+    *deepseek_r1_api_aliyun_model,
     *lmdeploy_qwen2_5_7b_instruct_model,
     *lmdeploy_qwen2_5_72b_instruct_model,
     *lmdeploy_llama3_1_8b_instruct_model,
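The 'open_hu_eval_*' key in pred_postprocessor above is matched against dataset abbreviations, which is why the dataset abbr is renamed to 'open_hu_eval_hu_life_qa' in the dataset config below. A minimal sketch of that per-dataset selection, assuming fnmatch-style wildcard matching (the actual lookup lives in OpenCompass internals and is not part of this diff):

    from fnmatch import fnmatch

    # Hypothetical illustration: pick the postprocessor config whose
    # pattern matches this dataset's abbr.
    pred_postprocessor = {'open_hu_eval_*': {'type': 'rm_<think>_before_eval'}}
    dataset_abbr = 'open_hu_eval_hu_life_qa'

    matched = next((cfg for pattern, cfg in pred_postprocessor.items()
                    if fnmatch(dataset_abbr, pattern)), None)
    assert matched == {'type': 'rm_<think>_before_eval'}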
HuLifeQA dataset config (imported as opencompass.configs.datasets.OpenHuEval.HuLifeQA; exact filename not preserved):

@@ -5,11 +5,11 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 
 hu_life_qa_reader_cfg = dict(
-    input_columns=["dialogue", "prompt"],
-    output_column="judge",
+    input_columns=['dialogue', 'prompt'],
+    output_column='judge',
 )
 
-data_path ="/mnt/hwfile/opendatalab/yanghaote/share/HuLifeQA_20250131.jsonl"
+data_path ='/mnt/hwfile/opendatalab/yanghaote/share/HuLifeQA_20250131.jsonl'
 
 hu_life_qa_datasets = []
 hu_life_qa_infer_cfg = dict(
@@ -21,8 +21,8 @@ hu_life_qa_infer_cfg = dict(
     inferencer=dict(
         type=ChatInferencer,
         max_seq_len=4096,
-        max_out_len=512,
-        infer_mode="last",
+        max_out_len=2048,
+        infer_mode='last',
     ),
 )
 
@@ -34,12 +34,12 @@ hu_life_qa_eval_cfg = dict(
             template="""{prompt}"""
         ),
     ),
-    pred_role="BOT",
+    pred_role='BOT',
 )
 
 hu_life_qa_datasets.append(
     dict(
-        abbr="hu_life_qa",
+        abbr='open_hu_eval_hu_life_qa',
         type=WildBenchDataset,
         path=data_path,
         reader_cfg=hu_life_qa_reader_cfg,
@@ -49,22 +49,22 @@ hu_life_qa_datasets.append(
 )
 
 task_group_new = {
-    "life_culture_custom": "life_culture_custom",
-    "childbearing and education": "life_culture_custom",
-    "culture and community": "life_culture_custom",
-    'culture and customs': "life_culture_custom",
-    "food and drink": "life_culture_custom",
-    "health": "life_culture_custom",
-    "holidays": "life_culture_custom",
-    "home": "life_culture_custom",
-    "person": "life_culture_custom",
-    "transport": "life_culture_custom",
-    "science": "life_culture_custom",
-    "travel": "life_culture_custom",
-    "business_finance": "business_finance",
-    "business and finance": "business_finance",
-    "education_profession": "education_profession",
-    "public education and courses": "education_profession",
-    "politics_policy_law": "politics_policy_law",
-    "politics": "politics_policy_law",
+    'life_culture_custom': 'life_culture_custom',
+    'childbearing and education': 'life_culture_custom',
+    'culture and community': 'life_culture_custom',
+    'culture and customs': 'life_culture_custom',
+    'food and drink': 'life_culture_custom',
+    'health': 'life_culture_custom',
+    'holidays': 'life_culture_custom',
+    'home': 'life_culture_custom',
+    'person': 'life_culture_custom',
+    'transport': 'life_culture_custom',
+    'science': 'life_culture_custom',
+    'travel': 'life_culture_custom',
+    'business_finance': 'business_finance',
+    'business and finance': 'business_finance',
+    'education_profession': 'education_profession',
+    'public education and courses': 'education_profession',
+    'politics_policy_law': 'politics_policy_law',
+    'politics': 'politics_policy_law',
 }
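Two changes above are easy to miss: max_out_len rises from 512 to 2048, presumably to give reasoning models room for longer answers, and the task_group_new mapping folds the fine-grained HuLifeQA question tags into four report groups. A usage sketch of the mapping (how the summarizer applies it is not shown in this view):

    # Hypothetical illustration: collapse a raw question tag into its
    # report group; unknown tags pass through unchanged.
    raw_tag = 'food and drink'
    group = task_group_new.get(raw_tag, raw_tag)
    assert group == 'life_culture_custom'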
opencompass/configs/models/deepseek/deepseek_r1_api_aliyun.py (new file):

@@ -0,0 +1,17 @@
+from opencompass.models import OpenAISDK
+
+models = [
+    dict(
+        abbr='deepseek_r1_api_aliyun',
+        type=OpenAISDK,
+        path='deepseek-r1',
+        key='ENV_ALIYUN',
+        openai_api_base='https://dashscope.aliyuncs.com/compatible-mode/v1',
+        query_per_second=1,
+        max_out_len=2048,
+        max_seq_len=4096,
+        batch_size=8,
+        retry=30,
+        verbose=True,
+    ),
+]
opencompass/configs/models/deepseek/deepseek_v3_api_aliyun.py (new file):

@@ -0,0 +1,17 @@
+from opencompass.models import OpenAISDK
+
+models = [
+    dict(
+        abbr='deepseek_v3_api_aliyun',
+        type=OpenAISDK,
+        path='deepseek-v3',
+        key='ENV_ALIYUN',
+        openai_api_base='https://dashscope.aliyuncs.com/compatible-mode/v1',
+        query_per_second=1,
+        max_out_len=2048,
+        max_seq_len=4096,
+        batch_size=8,
+        retry=30,
+        verbose=True,
+    ),
+]
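Both new files follow the repository's OpenAI-compatible config pattern: key='ENV_ALIYUN' defers to the DASHSCOPE_API_KEY environment variable (handled by the OpenAI wrapper change below), and openai_api_base points at DashScope's OpenAI-compatible endpoint. A sketch of pulling one of them into an eval config, mirroring the imports in the first file of this commit:

    from mmengine.config import read_base

    with read_base():
        from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import \
            models as deepseek_r1_api_aliyun_model

    # DASHSCOPE_API_KEY must be exported before running, since key='ENV_ALIYUN'.
    models = [*deepseek_r1_api_aliyun_model]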
opencompass/models/openai_api.py (OpenAI and OpenAISDK wrappers):

@@ -100,6 +100,7 @@ class OpenAI(BaseAPIModel):
                  tokenizer_path: Optional[str] = None,
                  extra_body: Optional[Dict] = None,
                  max_completion_tokens: int = 16384,
+                 return_reasoning_content: Optional[bool] = False,
                  verbose: bool = False,
     ):
 
@@ -123,6 +124,7 @@ class OpenAI(BaseAPIModel):
         self.tokenizer_path = tokenizer_path
         self.hf_tokenizer = None
         self.extra_body = extra_body
+        self.return_reasoning_content = return_reasoning_content
 
         if isinstance(key, str):
             if key == 'ENV':
@@ -137,6 +139,10 @@ class OpenAI(BaseAPIModel):
                 if 'DEEPSEEK_API_KEY' not in os.environ:
                     raise ValueError('Deepseek API key is not set.')
                 self.keys = os.getenv('DEEPSEEK_API_KEY').split(',')
+            elif key == 'ENV_ALIYUN':
+                if 'DASHSCOPE_API_KEY' not in os.environ:
+                    raise ValueError('DASHSCOPE API key (aliyun) is not set.')
+                self.keys = os.getenv('DASHSCOPE_API_KEY').split(',')
             else:
                 self.keys = [key]
         else:
@@ -340,7 +346,16 @@ class OpenAI(BaseAPIModel):
             if self.logprobs:
                 return response['choices']
             else:
-                return response['choices'][0]['message']['content'].strip()
+                message = response['choices'][0]['message']
+                content = message['content'].strip()
+                if self.return_reasoning_content:
+                    r_content = message.get('reasoning_content',
+                                            '').strip()
+                    if r_content:
+                        r_content = '<think>' + r_content + '</think>'
+                    return r_content + content
+                else:
+                    return content
         except KeyError:
             if 'error' in response:
                 if response['error']['code'] == 'rate_limit_exceeded':
@@ -567,6 +582,7 @@ class OpenAISDK(OpenAI):
                  tokenizer_path: str | None = None,
                  extra_body: Dict | None = None,
                  max_completion_tokens: int = 16384,
+                 return_reasoning_content: Optional[bool] = False,
                  verbose: bool = False,
                  status_code_mappings: dict = {},
     ):
@@ -588,6 +604,7 @@ class OpenAISDK(OpenAI):
             tokenizer_path,
             extra_body,
             verbose=verbose,
+            return_reasoning_content=return_reasoning_content,
             max_completion_tokens=max_completion_tokens,
         )
         key = random.choice(self.keys)
@@ -670,7 +687,18 @@ class OpenAISDK(OpenAI):
                 self.logger.error(
                     'Response is empty, it is an internal server error \
                         from the API provider.')
-            return responses.choices[0].message.content
+
+            message = responses.choices[0].message
+            content = message.content
+            if self.return_reasoning_content:
+                try:
+                    r_content = message.reasoning_content
+                    r_content = '<think>' + r_content + '</think>'
+                except AttributeError:
+                    r_content = ''
+                return r_content + content
+            else:
+                return content
 
         except (BadRequestError, APIStatusError) as e:
             # Handle BadRequest status
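With return_reasoning_content=True, both the dict-based OpenAI path and the SDK attribute path emit the reasoning trace inline: it is wrapped in <think> tags and prepended to the answer, while models that return no reasoning_content fall through to the plain answer (the .get('reasoning_content', '') default in one path, the except AttributeError guard in the other). A sketch of the resulting prediction string, with invented contents:

    reasoning_content = 'First check the visa rules for Hungary...'
    content = 'No visa is required for EU citizens.'
    prediction = '<think>' + reasoning_content + '</think>' + content
    # '<think>First check the visa rules for Hungary...</think>No visa is required for EU citizens.'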
opencompass/utils/text_postprocessors.py:

@@ -229,3 +229,15 @@ def match_answer_pattern(response_text: str, answer_pattern: str):
     match = re.search(answer_pattern, response_text)
     extracted_answer = match.group(1) if match else ''
     return extracted_answer
+
+
+@TEXT_POSTPROCESSORS.register_module('rm_<think>_before_eval')
+def remove_reasoning_part_before_evaluation(text: str):
+    if text.startswith('<think>'):
+        reasoning_end = text.rfind('</think>')
+        if reasoning_end == -1:
+            return text
+        else:
+            return text[reasoning_end + 8:]
+    else:
+        return text
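A quick behavioral sketch of the new postprocessor, with invented inputs: the + 8 offset is len('</think>'), and rfind means everything up to the last closing tag is stripped, while a truncated trace with no closing tag is returned unchanged:

    assert remove_reasoning_part_before_evaluation(
        '<think>step-by-step reasoning...</think>final answer') == 'final answer'
    assert remove_reasoning_part_before_evaluation(
        '<think>truncated reasoning') == '<think>truncated reasoning'
    assert remove_reasoning_part_before_evaluation('plain text') == 'plain text'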