From f2c17190c9f393c0f7012bae512e0f43b949ee78 Mon Sep 17 00:00:00 2001 From: hoteryoung Date: Mon, 10 Feb 2025 09:38:49 +0800 Subject: [PATCH] enable tested reasoning model --- examples/eval_OpenHuEval_HuLifeQA.py | 24 +++++++-- .../configs/datasets/OpenHuEval/HuLifeQA.py | 50 +++++++++---------- .../models/deepseek/deepseek_r1_api_aliyun.py | 17 +++++++ .../models/deepseek/deepseek_v3_api_aliyun.py | 17 +++++++ opencompass/models/openai_api.py | 32 +++++++++++- opencompass/utils/text_postprocessors.py | 12 +++++ 6 files changed, 120 insertions(+), 32 deletions(-) create mode 100644 opencompass/configs/models/deepseek/deepseek_r1_api_aliyun.py create mode 100644 opencompass/configs/models/deepseek/deepseek_v3_api_aliyun.py diff --git a/examples/eval_OpenHuEval_HuLifeQA.py b/examples/eval_OpenHuEval_HuLifeQA.py index 14808cb2..d726fd86 100644 --- a/examples/eval_OpenHuEval_HuLifeQA.py +++ b/examples/eval_OpenHuEval_HuLifeQA.py @@ -1,5 +1,7 @@ from mmengine.config import read_base +from opencompass.utils.text_postprocessors import remove_reasoning_part_before_evaluation + with read_base(): from opencompass.configs.datasets.OpenHuEval.HuLifeQA import ( hu_life_qa_datasets, @@ -8,7 +10,7 @@ with read_base(): from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model from opencompass.configs.models.openai.gpt_4o_2024_11_20 import models as gpt_4o_20241120_model - from opencompass.configs.models.deepseek.deepseek_v3_api_siliconflow import models as deepseek_v3_api_siliconflow_model + from opencompass.configs.models.deepseek.deepseek_v3_api_aliyun import models as deepseek_v3_api_aliyun_model from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as lmdeploy_qwen2_5_72b_instruct_model @@ -17,8 +19,9 @@ with read_base(): from 
opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model + from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model - from opencompass.configs.models.deepseek.deepseek_r1_siliconflow import models as deepseek_r1_siliconflow_model + from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import models as deepseek_r1_api_aliyun_model from opencompass.models import OpenAI from opencompass.partitioners import ( @@ -36,10 +39,21 @@ api_meta_template = dict(round=[ dict(role='BOT', api_role='BOT', generate=True), ]) +for model in deepseek_r1_api_aliyun_model: + model['return_reasoning_content'] = True + model['pred_postprocessor'] = { + 'open_hu_eval_*': { + 'type': 'rm__before_eval' + } + } +del model + models = [ - # *gpt_4o_mini_20240718_model, - # *gpt_4o_20241120_model, - # *deepseek_v3_api_siliconflow_model, + *gpt_4o_mini_20240718_model, + *gpt_4o_20241120_model, + *o1_mini_2024_09_12_model, + *deepseek_v3_api_aliyun_model, + *deepseek_r1_api_aliyun_model, *lmdeploy_qwen2_5_7b_instruct_model, *lmdeploy_qwen2_5_72b_instruct_model, *lmdeploy_llama3_1_8b_instruct_model, diff --git a/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py b/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py index 610d7cf8..e62ddc93 100644 --- a/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py +++ b/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py @@ -5,11 +5,11 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever hu_life_qa_reader_cfg = dict( - input_columns=["dialogue", "prompt"], - output_column="judge", + input_columns=['dialogue', 'prompt'], + output_column='judge', ) -data_path ="/mnt/hwfile/opendatalab/yanghaote/share/HuLifeQA_20250131.jsonl" +data_path 
='/mnt/hwfile/opendatalab/yanghaote/share/HuLifeQA_20250131.jsonl' hu_life_qa_datasets = [] hu_life_qa_infer_cfg = dict( @@ -21,8 +21,8 @@ hu_life_qa_infer_cfg = dict( inferencer=dict( type=ChatInferencer, max_seq_len=4096, - max_out_len=512, - infer_mode="last", + max_out_len=2048, + infer_mode='last', ), ) @@ -34,12 +34,12 @@ hu_life_qa_eval_cfg = dict( template="""{prompt}""" ), ), - pred_role="BOT", + pred_role='BOT', ) hu_life_qa_datasets.append( dict( - abbr="hu_life_qa", + abbr='open_hu_eval_hu_life_qa', type=WildBenchDataset, path=data_path, reader_cfg=hu_life_qa_reader_cfg, @@ -49,22 +49,22 @@ hu_life_qa_datasets.append( ) task_group_new = { - "life_culture_custom": "life_culture_custom", - "childbearing and education": "life_culture_custom", - "culture and community": "life_culture_custom", - 'culture and customs': "life_culture_custom", - "food and drink": "life_culture_custom", - "health": "life_culture_custom", - "holidays": "life_culture_custom", - "home": "life_culture_custom", - "person": "life_culture_custom", - "transport": "life_culture_custom", - "science": "life_culture_custom", - "travel": "life_culture_custom", - "business_finance": "business_finance", - "business and finance": "business_finance", - "education_profession": "education_profession", - "public education and courses": "education_profession", - "politics_policy_law": "politics_policy_law", - "politics": "politics_policy_law", + 'life_culture_custom': 'life_culture_custom', + 'childbearing and education': 'life_culture_custom', + 'culture and community': 'life_culture_custom', + 'culture and customs': 'life_culture_custom', + 'food and drink': 'life_culture_custom', + 'health': 'life_culture_custom', + 'holidays': 'life_culture_custom', + 'home': 'life_culture_custom', + 'person': 'life_culture_custom', + 'transport': 'life_culture_custom', + 'science': 'life_culture_custom', + 'travel': 'life_culture_custom', + 'business_finance': 'business_finance', + 'business and finance': 
'business_finance', + 'education_profession': 'education_profession', + 'public education and courses': 'education_profession', + 'politics_policy_law': 'politics_policy_law', + 'politics': 'politics_policy_law', } diff --git a/opencompass/configs/models/deepseek/deepseek_r1_api_aliyun.py b/opencompass/configs/models/deepseek/deepseek_r1_api_aliyun.py new file mode 100644 index 00000000..1fc3a55c --- /dev/null +++ b/opencompass/configs/models/deepseek/deepseek_r1_api_aliyun.py @@ -0,0 +1,17 @@ +from opencompass.models import OpenAISDK + +models = [ + dict( + abbr='deepseek_r1_api_aliyun', + type=OpenAISDK, + path='deepseek-r1', + key='ENV_ALIYUN', + openai_api_base='https://dashscope.aliyuncs.com/compatible-mode/v1', + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=8, + retry=30, + verbose=True, + ), +] diff --git a/opencompass/configs/models/deepseek/deepseek_v3_api_aliyun.py b/opencompass/configs/models/deepseek/deepseek_v3_api_aliyun.py new file mode 100644 index 00000000..38370e40 --- /dev/null +++ b/opencompass/configs/models/deepseek/deepseek_v3_api_aliyun.py @@ -0,0 +1,17 @@ +from opencompass.models import OpenAISDK + +models = [ + dict( + abbr='deepseek_v3_api_aliyun', + type=OpenAISDK, + path='deepseek-v3', + key='ENV_ALIYUN', + openai_api_base='https://dashscope.aliyuncs.com/compatible-mode/v1', + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=8, + retry=30, + verbose=True, + ), +] diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index e239c6ae..aae7d862 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -100,6 +100,7 @@ class OpenAI(BaseAPIModel): tokenizer_path: Optional[str] = None, extra_body: Optional[Dict] = None, max_completion_tokens: int = 16384, + return_reasoning_content: Optional[bool] = False, verbose: bool = False, ): @@ -123,6 +124,7 @@ class OpenAI(BaseAPIModel): self.tokenizer_path = tokenizer_path self.hf_tokenizer = 
None self.extra_body = extra_body + self.return_reasoning_content = return_reasoning_content if isinstance(key, str): if key == 'ENV': @@ -137,6 +139,10 @@ class OpenAI(BaseAPIModel): if 'DEEPSEEK_API_KEY' not in os.environ: raise ValueError('Deepseek API key is not set.') self.keys = os.getenv('DEEPSEEK_API_KEY').split(',') + elif key == 'ENV_ALIYUN': + if 'DASHSCOPE_API_KEY' not in os.environ: + raise ValueError('DASHSCOPE API key (aliyun) is not set.') + self.keys = os.getenv('DASHSCOPE_API_KEY').split(',') else: self.keys = [key] else: @@ -340,7 +346,16 @@ class OpenAI(BaseAPIModel): if self.logprobs: return response['choices'] else: - return response['choices'][0]['message']['content'].strip() + message = response['choices'][0]['message'] + content = message['content'].strip() + if self.return_reasoning_content: + r_content = message.get('reasoning_content', + '').strip() + if r_content: + r_content = '<think>' + r_content + '</think>' + return r_content + content + else: + return content except KeyError: if 'error' in response: if response['error']['code'] == 'rate_limit_exceeded': @@ -567,6 +582,7 @@ class OpenAISDK(OpenAI): tokenizer_path: str | None = None, extra_body: Dict | None = None, max_completion_tokens: int = 16384, + return_reasoning_content: Optional[bool] = False, verbose: bool = False, status_code_mappings: dict = {}, ): @@ -588,6 +604,7 @@ class OpenAISDK(OpenAI): tokenizer_path, extra_body, verbose=verbose, + return_reasoning_content=return_reasoning_content, max_completion_tokens=max_completion_tokens, ) key = random.choice(self.keys) @@ -670,7 +687,18 @@ class OpenAISDK(OpenAI): self.logger.error( 'Response is empty, it is an internal server error \ from the API provider.') - return responses.choices[0].message.content + + message = responses.choices[0].message + content = message.content + if self.return_reasoning_content: + try: + r_content = message.reasoning_content + r_content = '<think>' + r_content + '</think>' + except AttributeError: + r_content = '' + return 
r_content + content + else: + return content except (BadRequestError, APIStatusError) as e: # Handle BadRequest status diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index eb7469ab..be45ba52 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -229,3 +229,15 @@ def match_answer_pattern(response_text: str, answer_pattern: str): match = re.search(answer_pattern, response_text) extracted_answer = match.group(1) if match else '' return extracted_answer + + +@TEXT_POSTPROCESSORS.register_module('rm__before_eval') +def remove_reasoning_part_before_evaluation(text: str): + if text.startswith('<think>'): + reasoning_end = text.rfind('</think>') + if reasoning_end == -1: + return text + else: + return text[reasoning_end + 8:] + else: + return text