enable tested reasoning model

hoteryoung 2025-02-10 09:38:49 +08:00 committed by jxd
parent 61ceb02c23
commit f2c17190c9
6 changed files with 120 additions and 32 deletions

View File

@@ -1,5 +1,7 @@
 from mmengine.config import read_base
+from opencompass.utils.text_postprocessors import remove_reasoning_part_before_evaluation
 with read_base():
     from opencompass.configs.datasets.OpenHuEval.HuLifeQA import (
         hu_life_qa_datasets,
@@ -8,7 +10,7 @@ with read_base():
     from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
     from opencompass.configs.models.openai.gpt_4o_2024_11_20 import models as gpt_4o_20241120_model
-    from opencompass.configs.models.deepseek.deepseek_v3_api_siliconflow import models as deepseek_v3_api_siliconflow_model
+    from opencompass.configs.models.deepseek.deepseek_v3_api_aliyun import models as deepseek_v3_api_aliyun_model
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
     from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as lmdeploy_qwen2_5_72b_instruct_model
@@ -17,8 +19,9 @@ with read_base():
     from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model
+    from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
     from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
-    from opencompass.configs.models.deepseek.deepseek_r1_siliconflow import models as deepseek_r1_siliconflow_model
+    from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import models as deepseek_r1_api_aliyun_model
 from opencompass.models import OpenAI
 from opencompass.partitioners import (
@@ -36,10 +39,21 @@ api_meta_template = dict(round=[
     dict(role='BOT', api_role='BOT', generate=True),
 ])
+for model in deepseek_r1_api_aliyun_model:
+    model['return_reasoning_content'] = True
+    model['pred_postprocessor'] = {
+        'open_hu_eval_*': {
+            'type': 'rm_<think>_before_eval'
+        }
+    }
+del model
 models = [
-    # *gpt_4o_mini_20240718_model,
-    # *gpt_4o_20241120_model,
-    # *deepseek_v3_api_siliconflow_model,
+    *gpt_4o_mini_20240718_model,
+    *gpt_4o_20241120_model,
+    *o1_mini_2024_09_12_model,
+    *deepseek_v3_api_aliyun_model,
+    *deepseek_r1_api_aliyun_model,
     *lmdeploy_qwen2_5_7b_instruct_model,
     *lmdeploy_qwen2_5_72b_instruct_model,
     *lmdeploy_llama3_1_8b_instruct_model,

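For context: the new pred_postprocessor entry keys on a dataset-abbreviation glob. A minimal sketch, assuming fnmatch-style matching (an assumption, not OpenCompass's actual resolution code), of how 'open_hu_eval_*' would select the renamed HuLifeQA abbreviation:

# Hypothetical illustration only: glob-style matching of a dataset abbr
# against the 'open_hu_eval_*' key (assumes fnmatch semantics).
from fnmatch import fnmatch

pred_postprocessor = {
    'open_hu_eval_*': {'type': 'rm_<think>_before_eval'},
}

dataset_abbr = 'open_hu_eval_hu_life_qa'  # abbr set in the HuLifeQA config below
matched = {p: cfg for p, cfg in pred_postprocessor.items() if fnmatch(dataset_abbr, p)}
print(matched)  # {'open_hu_eval_*': {'type': 'rm_<think>_before_eval'}}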
View File

@@ -5,11 +5,11 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 hu_life_qa_reader_cfg = dict(
-    input_columns=["dialogue", "prompt"],
-    output_column="judge",
+    input_columns=['dialogue', 'prompt'],
+    output_column='judge',
 )
-data_path ="/mnt/hwfile/opendatalab/yanghaote/share/HuLifeQA_20250131.jsonl"
+data_path ='/mnt/hwfile/opendatalab/yanghaote/share/HuLifeQA_20250131.jsonl'
 hu_life_qa_datasets = []
 hu_life_qa_infer_cfg = dict(
@@ -21,8 +21,8 @@ hu_life_qa_infer_cfg = dict(
     inferencer=dict(
         type=ChatInferencer,
         max_seq_len=4096,
-        max_out_len=512,
-        infer_mode="last",
+        max_out_len=2048,
+        infer_mode='last',
     ),
 )
@@ -34,12 +34,12 @@ hu_life_qa_eval_cfg = dict(
             template="""{prompt}"""
         ),
     ),
-    pred_role="BOT",
+    pred_role='BOT',
 )
 hu_life_qa_datasets.append(
     dict(
-        abbr="hu_life_qa",
+        abbr='open_hu_eval_hu_life_qa',
         type=WildBenchDataset,
         path=data_path,
         reader_cfg=hu_life_qa_reader_cfg,
@@ -49,22 +49,22 @@ hu_life_qa_datasets.append(
 )
 task_group_new = {
-    "life_culture_custom": "life_culture_custom",
-    "childbearing and education": "life_culture_custom",
-    "culture and community": "life_culture_custom",
-    'culture and customs': "life_culture_custom",
-    "food and drink": "life_culture_custom",
-    "health": "life_culture_custom",
-    "holidays": "life_culture_custom",
-    "home": "life_culture_custom",
-    "person": "life_culture_custom",
-    "transport": "life_culture_custom",
-    "science": "life_culture_custom",
-    "travel": "life_culture_custom",
-    "business_finance": "business_finance",
-    "business and finance": "business_finance",
-    "education_profession": "education_profession",
-    "public education and courses": "education_profession",
-    "politics_policy_law": "politics_policy_law",
-    "politics": "politics_policy_law",
+    'life_culture_custom': 'life_culture_custom',
+    'childbearing and education': 'life_culture_custom',
+    'culture and community': 'life_culture_custom',
+    'culture and customs': 'life_culture_custom',
+    'food and drink': 'life_culture_custom',
+    'health': 'life_culture_custom',
+    'holidays': 'life_culture_custom',
+    'home': 'life_culture_custom',
+    'person': 'life_culture_custom',
+    'transport': 'life_culture_custom',
+    'science': 'life_culture_custom',
+    'travel': 'life_culture_custom',
+    'business_finance': 'business_finance',
+    'business and finance': 'business_finance',
+    'education_profession': 'education_profession',
+    'public education and courses': 'education_profession',
+    'politics_policy_law': 'politics_policy_law',
+    'politics': 'politics_policy_law',
 }

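The task_group_new mapping collapses the fine-grained topic labels into four reporting groups. A small illustrative sketch of per-group averaging (hypothetical; not the repository's summarizer):

from collections import defaultdict

# Illustrative only: average per-topic scores into the coarse groups.
# A few entries copied from task_group_new above so this runs standalone.
task_group_new = {
    'holidays': 'life_culture_custom',
    'food and drink': 'life_culture_custom',
    'politics': 'politics_policy_law',
}
topic_scores = {'holidays': 8.0, 'food and drink': 7.5, 'politics': 6.0}

grouped = defaultdict(list)
for topic, score in topic_scores.items():
    grouped[task_group_new[topic]].append(score)

print({g: sum(v) / len(v) for g, v in grouped.items()})
# {'life_culture_custom': 7.75, 'politics_policy_law': 6.0}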
View File

@@ -0,0 +1,17 @@
+from opencompass.models import OpenAISDK
+
+models = [
+    dict(
+        abbr='deepseek_r1_api_aliyun',
+        type=OpenAISDK,
+        path='deepseek-r1',
+        key='ENV_ALIYUN',
+        openai_api_base='https://dashscope.aliyuncs.com/compatible-mode/v1',
+        query_per_second=1,
+        max_out_len=2048,
+        max_seq_len=4096,
+        batch_size=8,
+        retry=30,
+        verbose=True,
+    ),
+]

View File

@@ -0,0 +1,17 @@
+from opencompass.models import OpenAISDK
+
+models = [
+    dict(
+        abbr='deepseek_v3_api_aliyun',
+        type=OpenAISDK,
+        path='deepseek-v3',
+        key='ENV_ALIYUN',
+        openai_api_base='https://dashscope.aliyuncs.com/compatible-mode/v1',
+        query_per_second=1,
+        max_out_len=2048,
+        max_seq_len=4096,
+        batch_size=8,
+        retry=30,
+        verbose=True,
+    ),
+]

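Both Aliyun configs pass key='ENV_ALIYUN', which the openai_api.py change below resolves from the DASHSCOPE_API_KEY environment variable. A minimal sketch of that expectation (the key values are placeholders):

import os

# Placeholder values; a real DashScope key must be exported before the run.
os.environ['DASHSCOPE_API_KEY'] = 'sk-xxxx,sk-yyyy'

# Mirrors the ENV_ALIYUN branch added in openai_api.py: comma-separated
# keys become a pool of credentials.
keys = os.getenv('DASHSCOPE_API_KEY').split(',')
print(keys)  # ['sk-xxxx', 'sk-yyyy']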
View File

@@ -100,6 +100,7 @@ class OpenAI(BaseAPIModel):
         tokenizer_path: Optional[str] = None,
         extra_body: Optional[Dict] = None,
         max_completion_tokens: int = 16384,
+        return_reasoning_content: Optional[bool] = False,
         verbose: bool = False,
     ):
@@ -123,6 +124,7 @@ class OpenAI(BaseAPIModel):
         self.tokenizer_path = tokenizer_path
         self.hf_tokenizer = None
         self.extra_body = extra_body
+        self.return_reasoning_content = return_reasoning_content
         if isinstance(key, str):
             if key == 'ENV':
@@ -137,6 +139,10 @@ class OpenAI(BaseAPIModel):
                 if 'DEEPSEEK_API_KEY' not in os.environ:
                     raise ValueError('Deepseek API key is not set.')
                 self.keys = os.getenv('DEEPSEEK_API_KEY').split(',')
+            elif key == 'ENV_ALIYUN':
+                if 'DASHSCOPE_API_KEY' not in os.environ:
+                    raise ValueError('DASHSCOPE API key (aliyun) is not set.')
+                self.keys = os.getenv('DASHSCOPE_API_KEY').split(',')
             else:
                 self.keys = [key]
         else:
@@ -340,7 +346,16 @@ class OpenAI(BaseAPIModel):
             if self.logprobs:
                 return response['choices']
             else:
-                return response['choices'][0]['message']['content'].strip()
+                message = response['choices'][0]['message']
+                content = message['content'].strip()
+                if self.return_reasoning_content:
+                    r_content = message.get('reasoning_content',
+                                            '').strip()
+                    if r_content:
+                        r_content = '<think>' + r_content + '</think>'
+                    return r_content + content
+                else:
+                    return content
         except KeyError:
             if 'error' in response:
                 if response['error']['code'] == 'rate_limit_exceeded':
@@ -567,6 +582,7 @@ class OpenAISDK(OpenAI):
         tokenizer_path: str | None = None,
         extra_body: Dict | None = None,
         max_completion_tokens: int = 16384,
+        return_reasoning_content: Optional[bool] = False,
         verbose: bool = False,
         status_code_mappings: dict = {},
     ):
@@ -588,6 +604,7 @@ class OpenAISDK(OpenAI):
             tokenizer_path,
             extra_body,
             verbose=verbose,
+            return_reasoning_content=return_reasoning_content,
             max_completion_tokens=max_completion_tokens,
         )
         key = random.choice(self.keys)
@@ -670,7 +687,18 @@ class OpenAISDK(OpenAI):
                 self.logger.error(
                     'Response is empty, it is an internal server error \
 from the API provider.')
-            return responses.choices[0].message.content
+            message = responses.choices[0].message
+            content = message.content
+            if self.return_reasoning_content:
+                try:
+                    r_content = message.reasoning_content
+                    r_content = '<think>' + r_content + '</think>'
+                except AttributeError:
+                    r_content = ''
+                return r_content + content
+            else:
+                return content
         except (BadRequestError, APIStatusError) as e:
             # Handle BadRequest status

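With return_reasoning_content enabled, the parsed reply is the reasoning wrapped in <think></think> tags followed by the final answer. A standalone sketch mirroring the dict-based branch above, with made-up message values and no API call:

# Mirrors the new response-parsing branch; no API call, made-up message.
message = {
    'content': 'Easter Monday is a public holiday in Hungary.',
    'reasoning_content': 'The question asks about Hungarian public holidays...',
}

content = message['content'].strip()
r_content = message.get('reasoning_content', '').strip()
if r_content:
    r_content = '<think>' + r_content + '</think>'
print(r_content + content)
# <think>The question asks about Hungarian public holidays...</think>Easter Monday is a public holiday in Hungary.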
View File

@@ -229,3 +229,15 @@ def match_answer_pattern(response_text: str, answer_pattern: str):
     match = re.search(answer_pattern, response_text)
     extracted_answer = match.group(1) if match else ''
     return extracted_answer
+
+
+@TEXT_POSTPROCESSORS.register_module('rm_<think>_before_eval')
+def remove_reasoning_part_before_evaluation(text: str):
+    if text.startswith('<think>'):
+        reasoning_end = text.rfind('</think>')
+        if reasoning_end == -1:
+            return text
+        else:
+            return text[reasoning_end + 8:]
+    else:
+        return text
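
A quick usage check of the new postprocessor; text without a leading <think> block, or with an unterminated one, passes through unchanged:

from opencompass.utils.text_postprocessors import remove_reasoning_part_before_evaluation

# len('</think>') == 8, which is why the slice starts at reasoning_end + 8.
raw = '<think>Considering Hungarian holidays...</think>Easter Monday.'
print(remove_reasoning_part_before_evaluation(raw))                    # 'Easter Monday.'
print(remove_reasoning_part_before_evaluation('<think>unterminated'))  # unchanged
print(remove_reasoning_part_before_evaluation('plain answer'))         # unchanged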