enable tested reasoning model

2025-05-30 16:03:24 +08:00 · 2025-02-10 09:38:49 +08:00 · 2025-02-10 09:38:49 +08:00 · f2c17190c9
commit f2c17190c9
parent 61ceb02c23
6 changed files with 120 additions and 32 deletions
--- a/examples/eval_OpenHuEval_HuLifeQA.py
+++ b/examples/eval_OpenHuEval_HuLifeQA.py
@ -1,5 +1,7 @@
 from mmengine.config import read_base

+from opencompass.utils.text_postprocessors import remove_reasoning_part_before_evaluation
+
 with read_base():
    from opencompass.configs.datasets.OpenHuEval.HuLifeQA import (
        hu_life_qa_datasets,
@ -8,7 +10,7 @@ with read_base():

    from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
    from opencompass.configs.models.openai.gpt_4o_2024_11_20 import models as gpt_4o_20241120_model
-    from opencompass.configs.models.deepseek.deepseek_v3_api_siliconflow import models as deepseek_v3_api_siliconflow_model
+    from opencompass.configs.models.deepseek.deepseek_v3_api_aliyun import models as deepseek_v3_api_aliyun_model

    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as lmdeploy_qwen2_5_72b_instruct_model
@ -17,8 +19,9 @@ with read_base():

    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model

+    from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model
    from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
-    from opencompass.configs.models.deepseek.deepseek_r1_siliconflow import models as deepseek_r1_siliconflow_model
+    from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import models as deepseek_r1_api_aliyun_model

 from opencompass.models import OpenAI
 from opencompass.partitioners import (
@ -36,10 +39,21 @@ api_meta_template = dict(round=[
    dict(role='BOT', api_role='BOT', generate=True),
 ])

+# DeepSeek-R1 only: ask the API wrapper to return the model's reasoning
+# (it is prepended to the answer wrapped in <think>...</think>) and register
+# the 'rm_<think>_before_eval' postprocessor, which removes that span again.
+for model in deepseek_r1_api_aliyun_model:
+    model['return_reasoning_content'] = True
+    model['pred_postprocessor'] = {
+        # NOTE(review): presumably a glob matched against dataset abbrs
+        # (HuLifeQA's abbr is 'open_hu_eval_hu_life_qa') - confirm against
+        # the code that looks up pred_postprocessor.
+        'open_hu_eval_*': {
+            'type': 'rm_<think>_before_eval'
+        }
+    }
+# NOTE(review): presumably removes the loop variable so it does not become a
+# top-level entry of this config - confirm.
+del model
+
 models = [
-    # *gpt_4o_mini_20240718_model,
-    # *gpt_4o_20241120_model,
-    # *deepseek_v3_api_siliconflow_model,
+    *gpt_4o_mini_20240718_model,
+    *gpt_4o_20241120_model,
+    *o1_mini_2024_09_12_model,
+    *deepseek_v3_api_aliyun_model,
+    *deepseek_r1_api_aliyun_model,
    *lmdeploy_qwen2_5_7b_instruct_model,
    *lmdeploy_qwen2_5_72b_instruct_model,
    *lmdeploy_llama3_1_8b_instruct_model,
--- a/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py
+++ b/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py
@ -5,11 +5,11 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever

+# Reader config: 'dialogue' and 'prompt' are the input columns, 'judge' is
+# the output column.
 hu_life_qa_reader_cfg = dict(
-    input_columns=["dialogue", "prompt"],
-    output_column="judge",
+    input_columns=['dialogue', 'prompt'],
+    output_column='judge',
 )

-data_path ="/mnt/hwfile/opendatalab/yanghaote/share/HuLifeQA_20250131.jsonl"
+# NOTE(review): absolute path inside one user's share directory; presumably
+# only resolves on machines that mount /mnt/hwfile - confirm before reuse.
+data_path ='/mnt/hwfile/opendatalab/yanghaote/share/HuLifeQA_20250131.jsonl'

+# Starts empty; the dataset entry is appended further down in this file.
 hu_life_qa_datasets = []
 hu_life_qa_infer_cfg = dict(
@ -21,8 +21,8 @@ hu_life_qa_infer_cfg = dict(
    inferencer=dict(
        type=ChatInferencer,
        max_seq_len=4096,
-        max_out_len=512,
-        infer_mode="last",
+        max_out_len=2048,
+        infer_mode='last',
    ),
 )

@ -34,12 +34,12 @@ hu_life_qa_eval_cfg = dict(
            template="""{prompt}"""
        ),
    ),
-    pred_role="BOT",
+    pred_role='BOT',
 )

 hu_life_qa_datasets.append(
    dict(
-        abbr="hu_life_qa",
+        abbr='open_hu_eval_hu_life_qa',
        type=WildBenchDataset,
        path=data_path,
        reader_cfg=hu_life_qa_reader_cfg,
@ -49,22 +49,22 @@ hu_life_qa_datasets.append(
 )

+# Collapses fine-grained category names into four coarse groups
+# (life_culture_custom, business_finance, education_profession,
+# politics_policy_law). Each group name also maps to itself.
+# NOTE(review): the consumer of this table is not visible here; presumably
+# it is used when aggregating scores per group - confirm.
 task_group_new = {
-    "life_culture_custom": "life_culture_custom",
-    "childbearing and education": "life_culture_custom",
-    "culture and community": "life_culture_custom",
-    'culture and customs': "life_culture_custom",
-    "food and drink": "life_culture_custom",
-    "health": "life_culture_custom",
-    "holidays": "life_culture_custom",
-    "home": "life_culture_custom",
-    "person": "life_culture_custom",
-    "transport": "life_culture_custom",
-    "science": "life_culture_custom",
-    "travel": "life_culture_custom",
-    "business_finance": "business_finance",
-    "business and finance": "business_finance",
-    "education_profession": "education_profession",
-    "public education and courses": "education_profession",
-    "politics_policy_law": "politics_policy_law",
-    "politics": "politics_policy_law",
+    'life_culture_custom': 'life_culture_custom',
+    'childbearing and education': 'life_culture_custom',
+    'culture and community': 'life_culture_custom',
+    'culture and customs': 'life_culture_custom',
+    'food and drink': 'life_culture_custom',
+    'health': 'life_culture_custom',
+    'holidays': 'life_culture_custom',
+    'home': 'life_culture_custom',
+    'person': 'life_culture_custom',
+    'transport': 'life_culture_custom',
+    'science': 'life_culture_custom',
+    'travel': 'life_culture_custom',
+    'business_finance': 'business_finance',
+    'business and finance': 'business_finance',
+    'education_profession': 'education_profession',
+    'public education and courses': 'education_profession',
+    'politics_policy_law': 'politics_policy_law',
+    'politics': 'politics_policy_law',
 }
--- a/opencompass/configs/models/deepseek/deepseek_r1_api_aliyun.py
+++ b/opencompass/configs/models/deepseek/deepseek_r1_api_aliyun.py
@ -0,0 +1,17 @@
+from opencompass.models import OpenAISDK
+
+# Model config for 'deepseek-r1' reached through the DashScope (Aliyun)
+# OpenAI-compatible endpoint, using the OpenAISDK wrapper.
+models = [
+    dict(
+        abbr='deepseek_r1_api_aliyun',
+        type=OpenAISDK,
+        path='deepseek-r1',
+        # Sentinel, not a real key: the OpenAI wrapper maps 'ENV_ALIYUN' to
+        # the DASHSCOPE_API_KEY environment variable (comma-separated keys)
+        # and raises ValueError when that variable is unset.
+        key='ENV_ALIYUN',
+        openai_api_base='https://dashscope.aliyuncs.com/compatible-mode/v1',
+        query_per_second=1,
+        max_out_len=2048,
+        max_seq_len=4096,
+        batch_size=8,
+        retry=30,
+        verbose=True,
+    ),
+]
--- a/opencompass/configs/models/deepseek/deepseek_v3_api_aliyun.py
+++ b/opencompass/configs/models/deepseek/deepseek_v3_api_aliyun.py
@ -0,0 +1,17 @@
+from opencompass.models import OpenAISDK
+
+# Model config for 'deepseek-v3' reached through the DashScope (Aliyun)
+# OpenAI-compatible endpoint, using the OpenAISDK wrapper.
+models = [
+    dict(
+        abbr='deepseek_v3_api_aliyun',
+        type=OpenAISDK,
+        path='deepseek-v3',
+        # Sentinel, not a real key: the OpenAI wrapper maps 'ENV_ALIYUN' to
+        # the DASHSCOPE_API_KEY environment variable (comma-separated keys)
+        # and raises ValueError when that variable is unset.
+        key='ENV_ALIYUN',
+        openai_api_base='https://dashscope.aliyuncs.com/compatible-mode/v1',
+        query_per_second=1,
+        max_out_len=2048,
+        max_seq_len=4096,
+        batch_size=8,
+        retry=30,
+        verbose=True,
+    ),
+]
--- a/opencompass/models/openai_api.py
+++ b/opencompass/models/openai_api.py
@ -100,6 +100,7 @@ class OpenAI(BaseAPIModel):
        tokenizer_path: Optional[str] = None,
        extra_body: Optional[Dict] = None,
        max_completion_tokens: int = 16384,
+        return_reasoning_content: Optional[bool] = False,
        verbose: bool = False,
    ):

@ -123,6 +124,7 @@ class OpenAI(BaseAPIModel):
        self.tokenizer_path = tokenizer_path
        self.hf_tokenizer = None
        self.extra_body = extra_body
+        self.return_reasoning_content = return_reasoning_content

        if isinstance(key, str):
            if key == 'ENV':
@ -137,6 +139,10 @@ class OpenAI(BaseAPIModel):
                if 'DEEPSEEK_API_KEY' not in os.environ:
                    raise ValueError('Deepseek API key is not set.')
                self.keys = os.getenv('DEEPSEEK_API_KEY').split(',')
+            elif key == 'ENV_ALIYUN':
+                if 'DASHSCOPE_API_KEY' not in os.environ:
+                    raise ValueError('DASHSCOPE API key (aliyun) is not set.')
+                self.keys = os.getenv('DASHSCOPE_API_KEY').split(',')
            else:
                self.keys = [key]
        else:
@ -340,7 +346,16 @@ class OpenAI(BaseAPIModel):
                if self.logprobs:
                    return response['choices']
                else:
-                    return response['choices'][0]['message']['content'].strip()
+                    message = response['choices'][0]['message']
+                    content = message['content'].strip()
+                    if self.return_reasoning_content:
+                        r_content = message.get('reasoning_content',
+                                                '').strip()
+                        if r_content:
+                            r_content = '<think>' + r_content + '</think>'
+                        return r_content + content
+                    else:
+                        return content
            except KeyError:
                if 'error' in response:
                    if response['error']['code'] == 'rate_limit_exceeded':
@ -567,6 +582,7 @@ class OpenAISDK(OpenAI):
        tokenizer_path: str | None = None,
        extra_body: Dict | None = None,
        max_completion_tokens: int = 16384,
+        return_reasoning_content: Optional[bool] = False,
        verbose: bool = False,
        status_code_mappings: dict = {},
    ):
@ -588,6 +604,7 @@ class OpenAISDK(OpenAI):
            tokenizer_path,
            extra_body,
            verbose=verbose,
+            return_reasoning_content=return_reasoning_content,
            max_completion_tokens=max_completion_tokens,
        )
        key = random.choice(self.keys)
@ -670,7 +687,18 @@ class OpenAISDK(OpenAI):
                    self.logger.error(
                        'Response is empty, it is an internal server error \
                            from the API provider.')
-                return responses.choices[0].message.content
+
+                message = responses.choices[0].message
+                content = message.content
+                if self.return_reasoning_content:
+                    try:
+                        r_content = message.reasoning_content
+                        r_content = '<think>' + r_content + '</think>'
+                    except AttributeError:
+                        r_content = ''
+                    return r_content + content
+                else:
+                    return content

            except (BadRequestError, APIStatusError) as e:
                # Handle BadRequest status
--- a/opencompass/utils/text_postprocessors.py
+++ b/opencompass/utils/text_postprocessors.py
@ -229,3 +229,15 @@ def match_answer_pattern(response_text: str, answer_pattern: str):
    match = re.search(answer_pattern, response_text)
    extracted_answer = match.group(1) if match else ''
    return extracted_answer
+
+
+@TEXT_POSTPROCESSORS.register_module('rm_<think>_before_eval')
+def remove_reasoning_part_before_evaluation(text: str) -> str:
+    """Drop a leading ``<think>...</think>`` reasoning span from ``text``.
+
+    Args:
+        text: Model output. When reasoning content is requested from the
+            API wrapper it is prepended as ``<think>reasoning</think>``.
+
+    Returns:
+        Everything after the last ``</think>`` when ``text`` (ignoring
+        leading whitespace) opens with ``<think>``. ``text`` is returned
+        unchanged when it does not open with ``<think>`` or when no closing
+        tag is present.
+    """
+    open_tag, close_tag = '<think>', '</think>'
+    # Some models emit whitespace or a newline before the opening tag.
+    if not text.lstrip().startswith(open_tag):
+        return text
+    # Use the last closing tag so a literal '</think>' quoted inside the
+    # reasoning does not cut the span short.
+    reasoning_end = text.rfind(close_tag)
+    if reasoning_end == -1:
+        # Unterminated reasoning: there is no answer part to isolate, so
+        # the text is left untouched.
+        return text
+    return text[reasoning_end + len(close_tag):]