From f407930475e4d7cf3338eb9a5b1ac4f03916d7f6 Mon Sep 17 00:00:00 2001 From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> Date: Thu, 20 Feb 2025 12:19:46 +0800 Subject: [PATCH] [Feature] Support subjective evaluation for reasoning model (#1868) * fix pip version * fix pip version * add subeval for reasoning model * add subeval for reasoning model * update configs * update config * update config * update config * update files --- .../alignbench_judgeby_critiquellm.py | 2 +- .../alignbench_judgeby_critiquellm_new.py | 2 +- .../alignbench_v1_1_judgeby_critiquellm.py | 2 +- .../alignbench_v1_1_judgeby_critiquellm_new.py | 2 +- .../alpaca_eval/alpacav2_judgeby_gpt4.py | 3 ++- .../alpacav2_judgeby_gpt4_bradleyterry.py | 2 +- .../alpaca_eval/alpacav2_judgeby_gpt4_new.py | 2 +- .../arena_hard/arena_hard_compare.py | 2 +- .../arena_hard_compare_bradleyterry.py | 2 +- .../arena_hard/arena_hard_compare_new.py | 2 +- .../compassarena/compassarena_compare.py | 2 +- .../compassarena_compare_bradleyterry.py | 4 ++-- .../compassarena/compassarena_compare_new.py | 2 +- .../subjective/fofo/fofo_bilingual_judge.py | 2 +- .../fofo/fofo_bilingual_judge_new.py | 2 +- .../datasets/subjective/fofo/fofo_judge.py | 2 +- .../datasets/subjective/fofo/fofo_judge_new.py | 2 +- .../followbench/followbench_llmeval.py | 2 +- .../followbench/followbench_llmeval_new.py | 2 +- .../subjective/multiround/mtbench101_judge.py | 2 +- .../multiround/mtbench101_judge_new.py | 2 +- .../openicl/icl_evaluator/lm_evaluator.py | 2 +- opencompass/tasks/subjective_eval.py | 18 +++++++++++++++--- opencompass/utils/text_postprocessors.py | 12 ++++++++++++ 24 files changed, 51 insertions(+), 26 deletions(-) diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py index 86c2a80b..0bc7df77 100644 --- 
a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py @@ -32,7 +32,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py index 20797b0f..d3f59b9f 100644 --- a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py @@ -31,7 +31,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py index 024f66a1..44f63f4f 100644 --- a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py @@ -32,7 +32,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py index 2ff09a3e..216e6ffa 100644 --- 
a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py @@ -31,7 +31,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py index 137e5ca0..ad0d4ef4 100644 --- a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py +++ b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py @@ -73,12 +73,13 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( evaluator=dict( type=LMEvaluator, + prompt_template=dict( type=PromptTemplate, template=dict( diff --git a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py index 99f2e2be..19fe1559 100644 --- a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py +++ b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py @@ -74,7 +74,7 @@ for _name in subjective_all_sets: ), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py index 06a82efe..a0510f5c 100644 --- 
a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py +++ b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py @@ -72,7 +72,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py index 90837c7b..7446fdd7 100644 --- a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py +++ b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py @@ -38,7 +38,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py index 7a0e9ae8..dc4b250e 100644 --- a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py +++ b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py @@ -39,7 +39,7 @@ for _name in subjective_all_sets: ), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py index 08b27ca7..dbad40ef 100644 --- a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py +++ b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py @@ -37,7 +37,7 @@ for _name in 
subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py index 90141e66..47cc7b31 100644 --- a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py +++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py @@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items(): ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py index 8a687889..38d7927a 100644 --- a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py +++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py @@ -1,6 +1,6 @@ from opencompass.datasets import ( CompassArenaDataset, - compassarena_bradleyterry_postprocess, + compassarena_bradleyterry_postprocess ) from opencompass.openicl.icl_evaluator import LMEvaluator from opencompass.openicl.icl_inferencer import GenInferencer @@ -127,7 +127,7 @@ for _name, _prompt in sub_map.items(): ), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py index a32691ad..83266765 100644 --- 
a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py +++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py @@ -105,7 +105,7 @@ for _name, _prompt in sub_map.items(): ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py index 089fd101..9516e074 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py @@ -91,7 +91,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py index 81e160b5..f732dba0 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py @@ -90,7 +90,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_judge.py b/opencompass/configs/datasets/subjective/fofo/fofo_judge.py index 89400892..8944be01 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_judge.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_judge.py @@ -59,7 +59,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + 
inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py b/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py index 691aff2b..03dcf190 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py @@ -58,7 +58,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py index e601bda3..1c4203fd 100644 --- a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py +++ b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py @@ -29,7 +29,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py index b0aacd86..970605b6 100644 --- a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py +++ b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py @@ -28,7 +28,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py index 00924ecb..53ab1631 100644 --- 
a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py +++ b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py @@ -24,7 +24,7 @@ for _name in subjective_all_sets: template="""{dialogue}""", ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'), + inferencer=dict(type=ChatInferencer, infer_mode='last'), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py index 938f2f5e..01b9c12f 100644 --- a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py +++ b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py @@ -23,7 +23,7 @@ for _name in subjective_all_sets: template="""{dialogue}""", ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'), + inferencer=dict(type=ChatInferencer, infer_mode='last'), ) subjective_eval_cfg = dict( diff --git a/opencompass/openicl/icl_evaluator/lm_evaluator.py b/opencompass/openicl/icl_evaluator/lm_evaluator.py index 53814070..94f2cf94 100644 --- a/opencompass/openicl/icl_evaluator/lm_evaluator.py +++ b/opencompass/openicl/icl_evaluator/lm_evaluator.py @@ -329,4 +329,4 @@ class LMEvaluator: else: kwargs = self.dict_postprocessor proc = DICT_POSTPROCESSORS.get(kwargs.pop('type')) - return proc(output, self.output_path, **kwargs) + return proc(output, self.output_path, **kwargs) \ No newline at end of file diff --git a/opencompass/tasks/subjective_eval.py b/opencompass/tasks/subjective_eval.py index 417c5cdb..0ddd7b0c 100644 --- a/opencompass/tasks/subjective_eval.py +++ b/opencompass/tasks/subjective_eval.py @@ -198,14 +198,24 @@ class SubjectiveEvalTask(BaseTask): if fnmatch.fnmatch(ds_abbr, pattern): pred_postprocessor = model_postprocessors[pattern] break + if 
'pred_postprocessor' in eval_cfg['evaluator'] or pred_postprocessor: kwargs = pred_postprocessor or eval_cfg['evaluator'][ 'pred_postprocessor'] - proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type')) + temp_kwargs = copy.deepcopy(kwargs) + proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type')) self.logger.info('Get postprocessor {postprocessor}.') - pred_strs = [proc(s, **kwargs) for s in pred_strs] + pred_strs = [proc(s, **temp_kwargs) for s in pred_strs] else: - self.logger.info('No postprocessor found.') + self.logger.info('No dataset postprocessor found.') + + if 'pred_postprocessor' in model_cfg or pred_postprocessor: + kwargs = pred_postprocessor or model_cfg['pred_postprocessor'] + temp_kwargs = copy.deepcopy(kwargs) + proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type')) + pred_strs = [proc(s, **temp_kwargs) for s in pred_strs] + else: + self.logger.info('No model postprocessor found.') return { 'model_name': model_abbr_from_cfg(model_cfg), @@ -329,7 +339,9 @@ class SubjectiveEvalTask(BaseTask): if fnmatch.fnmatch(ds_abbr, pattern): pred_postprocessor = model_postprocessors[pattern] break + if 'pred_postprocessor' in eval_cfg or pred_postprocessor: + kwargs = copy.deepcopy(pred_postprocessor or eval_cfg['pred_postprocessor']) proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type')) pred_strs = [proc(s, **kwargs) for s in pred_strs] diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index eb7469ab..7110e752 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -57,6 +57,18 @@ def last_capital_postprocess(text: str) -> str: return '' +@TEXT_POSTPROCESSORS.register_module('think_pred') +def think_pred_postprocess( + prediction: str, + re_pattern: str, +) -> str: + match = re.search(re_pattern, prediction) + if match: + return match.group(1).strip() + else: + return prediction + + def first_option_postprocess(text: str, options: str, cushion=True) -> str: """Find first valid option for 
text."""