From 879b181c1b2f4dab6d32e7f0f4e8581bc6626d06 Mon Sep 17 00:00:00 2001
From: Hoter Young
Date: Fri, 14 Feb 2025 20:44:53 +0800
Subject: [PATCH] add some features (#32)

* [Feature] Support answer extraction of QwQ when evaluating HuSimpleQA

* [Feature] Support multi-language summarization in HuSimpleQASummarizer

* [Feature] Support DeepSeek-R1-Distill-Qwen-32B_turbomind
---
 examples/eval_OpenHuEval_HuSimpleQA.py        | 13 ++++-
 ...y_deepseek_r1_distill_qwen_32b_instruct.py | 15 +++++
 .../summarizers/subjective/husimpleqa.py      | 58 ++++++++++---------
 opencompass/utils/text_postprocessors.py      | 47 ++++++++++++++-
 4 files changed, 101 insertions(+), 32 deletions(-)
 create mode 100644 opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_32b_instruct.py

diff --git a/examples/eval_OpenHuEval_HuSimpleQA.py b/examples/eval_OpenHuEval_HuSimpleQA.py
index 05e3cb3a..9cc2b5fb 100644
--- a/examples/eval_OpenHuEval_HuSimpleQA.py
+++ b/examples/eval_OpenHuEval_HuSimpleQA.py
@@ -3,7 +3,7 @@ from mmengine.config import read_base
 from opencompass.summarizers.subjective.husimpleqa import HuSimpleQASummarizer
 
 with read_base():
-    from opencompass.configs.datasets.OpenHuEval.HuSimpleQA.HuSimpleQA import HuSimpleQA_datasets
+    from opencompass.configs.datasets.OpenHuEval.HuSimpleQA.HuSimpleQA import HuSimpleQA_datasets, PROMPT_LANGUAGES
 
     from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model
     from opencompass.configs.models.openai.gpt_4o_2024_11_20 import models as gpt_4o_2024_11_20_model
@@ -14,7 +14,7 @@ with read_base():
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import models as lmdeploy_llama3_1_8b_instruct_model
     from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_70b_instruct import models as lmdeploy_llama3_1_70b_instruct_model
 
-    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model
+    # from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model
     from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model
 
     from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import models as deepseek_r1_api_aliyun_model
@@ -45,6 +45,12 @@ for model in models:
                 'type': 'rm__before_eval'
             }
         }
+    if model['abbr'].startswith('QwQ'):
+        model['pred_postprocessor'] = {
+            'OpenHuEval_*': {
+                'type': 'extract_qwq_answer_before_eval_for_husimpleqa'
+            }
+        }
 
 del model
 
@@ -92,7 +98,8 @@ eval = dict(
         task=dict(type=SubjectiveEvalTask)),
 )
 
-summarizer = dict(type=HuSimpleQASummarizer)
+summarizer = dict(type=HuSimpleQASummarizer,
+                  prompt_languages=PROMPT_LANGUAGES)
 
 work_dir = (
     './outputs/' + __file__.split('/')[-1].split('.')[0] + '/'
diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_32b_instruct.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_32b_instruct.py
new file mode 100644
index 00000000..0723f4da
--- /dev/null
+++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_32b_instruct.py
@@ -0,0 +1,15 @@
+from opencompass.models import TurboMindModelwithChatTemplate
+
+models = [
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='deepseek_r1_distill_qwen_32b_turbomind',
+        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
+        engine_config=dict(session_len=16384, max_batch_size=16, tp=2),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
+        max_seq_len=16384,
+        max_out_len=4096,
+        batch_size=16,
+        run_cfg=dict(num_gpus=2),
+    )
+]
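
For context, a minimal sketch of how the new TurboMind config could be pulled into examples/eval_OpenHuEval_HuSimpleQA.py through the same read_base() import pattern the script already uses. The `models` aggregation at the bottom is illustrative only and is not part of this patch:

# Sketch only: wiring the new config into the example script (assumed usage).
from mmengine.config import read_base

with read_base():
    # Import path follows the new file added by this patch.
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b_instruct import \
        models as lmdeploy_deepseek_r1_distill_qwen_32b_instruct_model

# Collect the imported model configs into the `models` list the script iterates over.
models = [*lmdeploy_deepseek_r1_distill_qwen_32b_instruct_model]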
diff --git a/opencompass/summarizers/subjective/husimpleqa.py b/opencompass/summarizers/subjective/husimpleqa.py
index 03d2f594..267f93f3 100644
--- a/opencompass/summarizers/subjective/husimpleqa.py
+++ b/opencompass/summarizers/subjective/husimpleqa.py
@@ -60,6 +60,7 @@ def get_capability_results(
         writer.writerow(col_name)
         writer.writerow(column)
 
+
 class HuSimpleQASummarizer:
     """Do the subjectivity analyze based on evaluation results.
 
@@ -67,10 +68,11 @@ class HuSimpleQASummarizer:
         config (ConfigDict): The configuration object of the evaluation task.
     """
 
-    def __init__(self, config: ConfigDict) -> None:
+    def __init__(self, config: ConfigDict, prompt_languages) -> None:
         self.judge_type = 'single'
         self.tasks = []
        self.cfg = config
+        self.prompt_languages = prompt_languages
         self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
 
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
@@ -85,30 +87,32 @@ class HuSimpleQASummarizer:
         Returns:
             pd.DataFrame: The summary results.
         """
+        for language in self.prompt_languages:
+            dataset_cfgs = self.cfg['datasets']
+            output_dir, results_folder = get_outdir(self.cfg, time_str)
+            fout_flag = 0
+            for eval_model_cfg in self.eval_model_cfgs:
+                eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
+                show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
+                subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + self.judge_abbr)
+                if os.path.isdir(subdir_path):
+                    fout = osp.join(output_dir, 'judged-by--' + self.judge_abbr + '-capability' + '_' + language + '.csv')
+                    overall_judged_answers, overall_references = [], []
+                    for dataset in dataset_cfgs:
+                        if not dataset['abbr'].endswith('_' + language):
+                            continue
+                        judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
+                        judged_answers = [item['judge'] for item in judged_answers]
+                        overall_judged_answers += judged_answers
+                        overall_references += references
 
-        dataset_cfgs = self.cfg['datasets']
-        output_dir, results_folder = get_outdir(self.cfg, time_str)
-        fout_flag = 0
-        for eval_model_cfg in self.eval_model_cfgs:
-            eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
-            show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
-            subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + self.judge_abbr)
-            if os.path.isdir(subdir_path):
-                fout = osp.join(output_dir, 'judged-by--' + self.judge_abbr + '-capability.csv')
-                overall_judged_answers, overall_references = [], []
-                for dataset in dataset_cfgs:
-                    judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
-                    judged_answers = [item['judge'] for item in judged_answers]
-                    overall_judged_answers += judged_answers
-                    overall_references += references
-
-            get_capability_results(
-                overall_judged_answers,
-                overall_references,
-                fout,
-                fout_flag,
-                show_model_abbr,
-            )
-            fout_flag += 1
-        else:
-            print(subdir_path + ' is not exist! please check!')
+                    get_capability_results(
+                        overall_judged_answers,
+                        overall_references,
+                        fout,
+                        fout_flag,
+                        show_model_abbr,
+                    )
+                    fout_flag += 1
+                else:
+                    print(subdir_path + ' is not exist! please check!')
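
A small self-contained sketch of the convention the new per-language loop relies on. The language codes, dataset abbrs, and judge abbr below are illustrative placeholders; only the '_<language>' suffix matching and the per-language CSV name come from the patch:

# Illustrative values; the real ones come from PROMPT_LANGUAGES and the dataset/judge configs.
prompt_languages = ['en', 'hu']
dataset_abbrs = ['HuSimpleQA_en', 'HuSimpleQA_hu']
judge_abbr = 'gpt_4o_2024_11_20'

for language in prompt_languages:
    # Datasets are grouped per language by their '_<language>' abbr suffix ...
    selected = [abbr for abbr in dataset_abbrs if abbr.endswith('_' + language)]
    # ... and each language gets its own capability CSV, as written by the summarizer above.
    out_csv = 'judged-by--' + judge_abbr + '-capability_' + language + '.csv'
    print(language, selected, '->', out_csv)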
diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py
index 056b5776..899610c3 100644
--- a/opencompass/utils/text_postprocessors.py
+++ b/opencompass/utils/text_postprocessors.py
@@ -248,10 +248,11 @@ def extract_answer_before_evaluation(text: str):
     """Overall, there are three situations in responses of QWQ:
 
     1. There is a **Final Answer** title in the whole context.
-    2. There is only one sentence in the context.
-    3. There are more than one sentences in the context, \
+        2. There is only one sentence in the context.
+        3. There are more than one sentences in the context, \
         and the last one is the answer.
     """
+    text = text.strip('\n')
     if '**Final Answer**' in text:
         answer = text.split('\n\n**Final Answer**\n\n')[-1]
     else:
@@ -264,3 +265,45 @@
         else:
             answer = text_split[-1] + '.'
     return answer
+
+
+@TEXT_POSTPROCESSORS.register_module(
+    'extract_qwq_answer_before_eval_for_husimpleqa')
+def extract_qwq_answer_before_evaluation(text: str):
+    """The format of the answer from QwQ when inferring HuSimpleQA \
+    differs from that of other models due to the special prompt."""
+    max_sentence_len = 6
+    text_split = text.split('\n\n')
+    last_try_idx = max(len(text_split) - max_sentence_len, 0)
+    ans_start_idx = last_try_idx
+    has_answer = False
+    has_score = False
+    score_flags = [
+        'score', 'Score', 'confidence', 'Confidence', 'szcore', 'Szcore',
+        'pontszám', 'Pontszám', 'Biztonság', 'biztonság', 'Biztoságskor',
+        'biztoságskor', 'Biztoság', '信心', '分数'
+    ]
+    answer_flags = ['answer', 'Answer', 'Válasz', 'válasz', '答案', '回答']
+
+    for idx, s in enumerate(reversed(text_split)):
+        sen_idx = len(text_split) - 1 - idx
+        if sen_idx < last_try_idx:
+            break
+
+        for sf in score_flags:
+            if sf in s:
+                has_score = True
+                break
+
+        for af in answer_flags:
+            if af in s:
+                has_answer = True
+                break
+
+        if has_answer and has_score:
+            ans_start_idx = sen_idx
+            break
+
+    answer = '\n\n'.join(text_split[max(ans_start_idx - 1, 0):])
+
+    return answer
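
As a usage illustration, a minimal sketch of how the newly registered postprocessor can be exercised directly. It assumes opencompass is installed, that TEXT_POSTPROCESSORS is importable from opencompass.registry (the registry used by opencompass/utils/text_postprocessors.py), and the QwQ-style response below is a made-up example:

# Minimal sketch; assumes a local opencompass installation.
from opencompass.registry import TEXT_POSTPROCESSORS
import opencompass.utils.text_postprocessors  # noqa: F401  (runs the register_module decorators)

# Look the function up by the name registered in this patch.
extract = TEXT_POSTPROCESSORS.get('extract_qwq_answer_before_eval_for_husimpleqa')

# Toy QwQ-style response: reasoning paragraphs, then an answer and a confidence
# score separated by blank lines, which is what the extractor scans for.
qwq_response = ('Let me reason about the question first.\n\n'
                'Hungary has several large cities, but one stands out.\n\n'
                'Válasz: Budapest\n\n'
                'Pontszám: 0.95')

# The extractor walks the trailing paragraphs until it has seen both an answer
# flag and a score flag, then keeps everything from one paragraph before that
# point onward.
print(extract(qwq_response))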