[Feature] Support subjective evaluation for reasoning model (#1868)

* fix pip version

* fix pip version

* add subeval for reasoning model

* add subeval for reasoning model

* update configs

* update config

* update config

* update config

* update files
This commit is contained in:
bittersweet1999 2025-02-20 12:19:46 +08:00 committed by GitHub
parent 68a9838907
commit f407930475
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
24 changed files with 51 additions and 26 deletions

View File

@ -32,7 +32,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -31,7 +31,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -32,7 +32,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -31,7 +31,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -73,12 +73,13 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(

View File

@ -74,7 +74,7 @@ for _name in subjective_all_sets:
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -72,7 +72,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -38,7 +38,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -39,7 +39,7 @@ for _name in subjective_all_sets:
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -37,7 +37,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items():
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -1,6 +1,6 @@
from opencompass.datasets import (
CompassArenaDataset,
compassarena_bradleyterry_postprocess,
compassarena_bradleyterry_postprocess
)
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
@ -127,7 +127,7 @@ for _name, _prompt in sub_map.items():
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -105,7 +105,7 @@ for _name, _prompt in sub_map.items():
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -91,7 +91,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -90,7 +90,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -59,7 +59,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -58,7 +58,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer,),
)
subjective_eval_cfg = dict(

View File

@ -29,7 +29,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -28,7 +28,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -24,7 +24,7 @@ for _name in subjective_all_sets:
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'),
inferencer=dict(type=ChatInferencer, infer_mode='last'),
)
subjective_eval_cfg = dict(

View File

@ -23,7 +23,7 @@ for _name in subjective_all_sets:
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'),
inferencer=dict(type=ChatInferencer, infer_mode='last'),
)
subjective_eval_cfg = dict(

View File

@ -329,4 +329,4 @@ class LMEvaluator:
else:
kwargs = self.dict_postprocessor
proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
return proc(output, self.output_path, **kwargs)
return proc(output, self.output_path, **kwargs)

View File

@ -198,14 +198,24 @@ class SubjectiveEvalTask(BaseTask):
if fnmatch.fnmatch(ds_abbr, pattern):
pred_postprocessor = model_postprocessors[pattern]
break
if 'pred_postprocessor' in eval_cfg['evaluator'] or pred_postprocessor:
kwargs = pred_postprocessor or eval_cfg['evaluator'][
'pred_postprocessor']
proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
temp_kwargs = copy.deepcopy(kwargs)
proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type'))
self.logger.info('Get postprocessor {postprocessor}.')
pred_strs = [proc(s, **kwargs) for s in pred_strs]
pred_strs = [proc(s, **temp_kwargs) for s in pred_strs]
else:
self.logger.info('No postprocessor found.')
self.logger.info('No dataset postprocessor found.')
if 'pred_postprocessor' in model_cfg or pred_postprocessor:
kwargs = pred_postprocessor or model_cfg['pred_postprocessor']
temp_kwargs = copy.deepcopy(kwargs)
proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type'))
pred_strs = [proc(s, **temp_kwargs) for s in pred_strs]
else:
self.logger.info('No model postprocessor found.')
return {
'model_name': model_abbr_from_cfg(model_cfg),
@ -329,7 +339,9 @@ class SubjectiveEvalTask(BaseTask):
if fnmatch.fnmatch(ds_abbr, pattern):
pred_postprocessor = model_postprocessors[pattern]
break
if 'pred_postprocessor' in eval_cfg or pred_postprocessor:
kwargs = pred_postprocessor or eval_cfg['pred_postprocessor']
proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
pred_strs = [proc(s, **kwargs) for s in pred_strs]

View File

@ -57,6 +57,18 @@ def last_capital_postprocess(text: str) -> str:
return ''
@TEXT_POSTPROCESSORS.register_module('think_pred')
def think_pred_postprocess(
    prediction: str,
    re_pattern: str,
) -> str:
    """Extract the final answer from a reasoning-model prediction.

    Searches ``prediction`` with ``re_pattern``; when the pattern matches,
    the first capture group is returned with surrounding whitespace
    stripped. When there is no match, the prediction is returned
    unchanged as a passthrough fallback.
    """
    found = re.search(re_pattern, prediction)
    return found.group(1).strip() if found else prediction
def first_option_postprocess(text: str, options: str, cushion=True) -> str:
"""Find first valid option for text."""