From f2c17190c9f393c0f7012bae512e0f43b949ee78 Mon Sep 17 00:00:00 2001 From: hoteryoung Date: Mon, 10 Feb 2025 09:38:49 +0800 Subject: [PATCH] enable tested reasoning model --- examples/eval_OpenHuEval_HuLifeQA.py | 24 +++++++-- .../configs/datasets/OpenHuEval/HuLifeQA.py | 50 +++++++++---------- .../models/deepseek/deepseek_r1_api_aliyun.py | 17 +++++++ .../models/deepseek/deepseek_v3_api_aliyun.py | 17 +++++++ opencompass/models/openai_api.py | 32 +++++++++++- opencompass/utils/text_postprocessors.py | 12 +++++ 6 files changed, 120 insertions(+), 32 deletions(-) create mode 100644 opencompass/configs/models/deepseek/deepseek_r1_api_aliyun.py create mode 100644 opencompass/configs/models/deepseek/deepseek_v3_api_aliyun.py diff --git a/examples/eval_OpenHuEval_HuLifeQA.py b/examples/eval_OpenHuEval_HuLifeQA.py index 14808cb2..d726fd86 100644 --- a/examples/eval_OpenHuEval_HuLifeQA.py +++ b/examples/eval_OpenHuEval_HuLifeQA.py @@ -1,5 +1,7 @@ from mmengine.config import read_base +from opencompass.utils.text_postprocessors import remove_reasoning_part_before_evaluation + with read_base(): from opencompass.configs.datasets.OpenHuEval.HuLifeQA import ( hu_life_qa_datasets, @@ -8,7 +10,7 @@ with read_base(): from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model from opencompass.configs.models.openai.gpt_4o_2024_11_20 import models as gpt_4o_20241120_model - from opencompass.configs.models.deepseek.deepseek_v3_api_siliconflow import models as deepseek_v3_api_siliconflow_model + from opencompass.configs.models.deepseek.deepseek_v3_api_aliyun import models as deepseek_v3_api_aliyun_model from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as lmdeploy_qwen2_5_72b_instruct_model @@ -17,8 +19,9 @@ with read_base(): from 
opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model + from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model - from opencompass.configs.models.deepseek.deepseek_r1_siliconflow import models as deepseek_r1_siliconflow_model + from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import models as deepseek_r1_api_aliyun_model from opencompass.models import OpenAI from opencompass.partitioners import ( @@ -36,10 +39,21 @@ api_meta_template = dict(round=[ dict(role='BOT', api_role='BOT', generate=True), ]) +for model in deepseek_r1_api_aliyun_model: + model['return_reasoning_content'] = True + model['pred_postprocessor'] = { + 'open_hu_eval_*': { + 'type': 'rm__before_eval' + } + } +del model + models = [ - # *gpt_4o_mini_20240718_model, - # *gpt_4o_20241120_model, - # *deepseek_v3_api_siliconflow_model, + *gpt_4o_mini_20240718_model, + *gpt_4o_20241120_model, + *o1_mini_2024_09_12_model, + *deepseek_v3_api_aliyun_model, + *deepseek_r1_api_aliyun_model, *lmdeploy_qwen2_5_7b_instruct_model, *lmdeploy_qwen2_5_72b_instruct_model, *lmdeploy_llama3_1_8b_instruct_model, diff --git a/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py b/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py index 610d7cf8..e62ddc93 100644 --- a/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py +++ b/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py @@ -5,11 +5,11 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever hu_life_qa_reader_cfg = dict( - input_columns=["dialogue", "prompt"], - output_column="judge", + input_columns=['dialogue', 'prompt'], + output_column='judge', ) -data_path ="/mnt/hwfile/opendatalab/yanghaote/share/HuLifeQA_20250131.jsonl" +data_path 
='/mnt/hwfile/opendatalab/yanghaote/share/HuLifeQA_20250131.jsonl' hu_life_qa_datasets = [] hu_life_qa_infer_cfg = dict( @@ -21,8 +21,8 @@ hu_life_qa_infer_cfg = dict( inferencer=dict( type=ChatInferencer, max_seq_len=4096, - max_out_len=512, - infer_mode="last", + max_out_len=2048, + infer_mode='last', ), ) @@ -34,12 +34,12 @@ hu_life_qa_eval_cfg = dict( template="""{prompt}""" ), ), - pred_role="BOT", + pred_role='BOT', ) hu_life_qa_datasets.append( dict( - abbr="hu_life_qa", + abbr='open_hu_eval_hu_life_qa', type=WildBenchDataset, path=data_path, reader_cfg=hu_life_qa_reader_cfg, @@ -49,22 +49,22 @@ hu_life_qa_datasets.append( ) task_group_new = { - "life_culture_custom": "life_culture_custom", - "childbearing and education": "life_culture_custom", - "culture and community": "life_culture_custom", - 'culture and customs': "life_culture_custom", - "food and drink": "life_culture_custom", - "health": "life_culture_custom", - "holidays": "life_culture_custom", - "home": "life_culture_custom", - "person": "life_culture_custom", - "transport": "life_culture_custom", - "science": "life_culture_custom", - "travel": "life_culture_custom", - "business_finance": "business_finance", - "business and finance": "business_finance", - "education_profession": "education_profession", - "public education and courses": "education_profession", - "politics_policy_law": "politics_policy_law", - "politics": "politics_policy_law", + 'life_culture_custom': 'life_culture_custom', + 'childbearing and education': 'life_culture_custom', + 'culture and community': 'life_culture_custom', + 'culture and customs': 'life_culture_custom', + 'food and drink': 'life_culture_custom', + 'health': 'life_culture_custom', + 'holidays': 'life_culture_custom', + 'home': 'life_culture_custom', + 'person': 'life_culture_custom', + 'transport': 'life_culture_custom', + 'science': 'life_culture_custom', + 'travel': 'life_culture_custom', + 'business_finance': 'business_finance', + 'business and finance': 
'business_finance', + 'education_profession': 'education_profession', + 'public education and courses': 'education_profession', + 'politics_policy_law': 'politics_policy_law', + 'politics': 'politics_policy_law', } diff --git a/opencompass/configs/models/deepseek/deepseek_r1_api_aliyun.py b/opencompass/configs/models/deepseek/deepseek_r1_api_aliyun.py new file mode 100644 index 00000000..1fc3a55c --- /dev/null +++ b/opencompass/configs/models/deepseek/deepseek_r1_api_aliyun.py @@ -0,0 +1,17 @@ +from opencompass.models import OpenAISDK + +models = [ + dict( + abbr='deepseek_r1_api_aliyun', + type=OpenAISDK, + path='deepseek-r1', + key='ENV_ALIYUN', + openai_api_base='https://dashscope.aliyuncs.com/compatible-mode/v1', + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=8, + retry=30, + verbose=True, + ), +] diff --git a/opencompass/configs/models/deepseek/deepseek_v3_api_aliyun.py b/opencompass/configs/models/deepseek/deepseek_v3_api_aliyun.py new file mode 100644 index 00000000..38370e40 --- /dev/null +++ b/opencompass/configs/models/deepseek/deepseek_v3_api_aliyun.py @@ -0,0 +1,17 @@ +from opencompass.models import OpenAISDK + +models = [ + dict( + abbr='deepseek_v3_api_aliyun', + type=OpenAISDK, + path='deepseek-v3', + key='ENV_ALIYUN', + openai_api_base='https://dashscope.aliyuncs.com/compatible-mode/v1', + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=8, + retry=30, + verbose=True, + ), +] diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index e239c6ae..aae7d862 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -100,6 +100,7 @@ class OpenAI(BaseAPIModel): tokenizer_path: Optional[str] = None, extra_body: Optional[Dict] = None, max_completion_tokens: int = 16384, + return_reasoning_content: Optional[bool] = False, verbose: bool = False, ): @@ -123,6 +124,7 @@ class OpenAI(BaseAPIModel): self.tokenizer_path = tokenizer_path self.hf_tokenizer = 
None self.extra_body = extra_body + self.return_reasoning_content = return_reasoning_content if isinstance(key, str): if key == 'ENV': @@ -137,6 +139,10 @@ class OpenAI(BaseAPIModel): if 'DEEPSEEK_API_KEY' not in os.environ: raise ValueError('Deepseek API key is not set.') self.keys = os.getenv('DEEPSEEK_API_KEY').split(',') + elif key == 'ENV_ALIYUN': + if 'DASHSCOPE_API_KEY' not in os.environ: + raise ValueError('DASHSCOPE API key (aliyun) is not set.') + self.keys = os.getenv('DASHSCOPE_API_KEY').split(',') else: self.keys = [key] else: @@ -340,7 +346,16 @@ class OpenAI(BaseAPIModel): if self.logprobs: return response['choices'] else: - return response['choices'][0]['message']['content'].strip() + message = response['choices'][0]['message'] + content = message['content'].strip() + if self.return_reasoning_content: + r_content = message.get('reasoning_content', + '').strip() + if r_content: + r_content = '<think>' + r_content + '</think>' + return r_content + content + else: + return content except KeyError: if 'error' in response: if response['error']['code'] == 'rate_limit_exceeded': @@ -567,6 +582,7 @@ class OpenAISDK(OpenAI): tokenizer_path: str | None = None, extra_body: Dict | None = None, max_completion_tokens: int = 16384, + return_reasoning_content: Optional[bool] = False, verbose: bool = False, status_code_mappings: dict = {}, ): @@ -588,6 +604,7 @@ class OpenAISDK(OpenAI): tokenizer_path, extra_body, verbose=verbose, + return_reasoning_content=return_reasoning_content, max_completion_tokens=max_completion_tokens, ) key = random.choice(self.keys) @@ -670,7 +687,18 @@ class OpenAISDK(OpenAI): self.logger.error( 'Response is empty, it is an internal server error \ from the API provider.') - return responses.choices[0].message.content + + message = responses.choices[0].message + content = message.content + if self.return_reasoning_content: + try: + r_content = message.reasoning_content + r_content = '<think>' + r_content + '</think>' + except AttributeError: + r_content = '' + return 
r_content + content + else: + return content except (BadRequestError, APIStatusError) as e: # Handle BadRequest status diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index eb7469ab..be45ba52 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -229,3 +229,15 @@ def match_answer_pattern(response_text: str, answer_pattern: str): match = re.search(answer_pattern, response_text) extracted_answer = match.group(1) if match else '' return extracted_answer + + +@TEXT_POSTPROCESSORS.register_module('rm__before_eval') +def remove_reasoning_part_before_evaluation(text: str): + if text.startswith('<think>'): + reasoning_end = text.rfind('</think>') + if reasoning_end == -1: + return text + else: + return text[reasoning_end + 8:] + else: + return text