Mirror of https://github.com/open-compass/opencompass.git, synced 2025-05-30 16:03:24 +08:00
[Feature] Support subjective evaluation for reasoning model (#1868)
* fix pip version
* fix pip version
* add subeval for reasoning model
* add subeval for reasoning model
* update configs
* update config
* update config
* update config
* update files
This commit is contained in:
parent 68a9838907
commit f407930475
@@ -32,7 +32,7 @@ for _name in subjective_all_sets:
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=2048),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(
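The recurring change across these dataset configs is the removal of the hard-coded max_out_len (and, where present, max_seq_len) from the inferencer, so long chains of thought from reasoning models are no longer truncated at the dataset level; the output budget is presumably taken from the model config instead. A minimal sketch of the resulting shape, with the prompt field and its wording as illustrative placeholders rather than values from the commit:

# Sketch only: dataset-side inference config after this change.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer

subjective_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(role='HUMAN', prompt='{question}'),  # placeholder prompt
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    # No max_out_len here any more; the limit for a long reasoning trace is
    # left to the model-side configuration.
    inferencer=dict(type=GenInferencer),
)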
@@ -31,7 +31,7 @@ for _name in subjective_all_sets:
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=2048),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(

@@ -32,7 +32,7 @@ for _name in subjective_all_sets:
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=2048),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(

@@ -31,7 +31,7 @@ for _name in subjective_all_sets:
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=2048),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(
@@ -73,12 +73,13 @@ for _name in subjective_all_sets:
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=4096),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(
         evaluator=dict(
             type=LMEvaluator,
+
             prompt_template=dict(
                 type=PromptTemplate,
                 template=dict(
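The same hunk also shows the judging side: each subjective set keeps an LMEvaluator whose prompt template hands the model's prediction to a judge model. A hedged, minimal sketch of that evaluator block; the judge prompt wording is invented for illustration and is not the text used in this commit:

# Sketch of a subjective eval config built around LMEvaluator.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_evaluator import LMEvaluator

subjective_eval_cfg = dict(
    evaluator=dict(
        type=LMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN',
                     prompt='Judge the following answer.\n'
                            'Question: {question}\nAnswer: {prediction}'),
            ]),
        ),
    ),
    pred_role='BOT',
)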
@@ -74,7 +74,7 @@ for _name in subjective_all_sets:
             ),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=4096),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(

@@ -72,7 +72,7 @@ for _name in subjective_all_sets:
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=4096),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(

@@ -38,7 +38,7 @@ for _name in subjective_all_sets:
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=4096),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(

@@ -39,7 +39,7 @@ for _name in subjective_all_sets:
             ),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=4096),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(

@@ -37,7 +37,7 @@ for _name in subjective_all_sets:
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=4096),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(
@@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items():
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(
@@ -1,6 +1,6 @@
 from opencompass.datasets import (
     CompassArenaDataset,
-    compassarena_bradleyterry_postprocess,
+    compassarena_bradleyterry_postprocess
 )
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.openicl.icl_inferencer import GenInferencer
@@ -127,7 +127,7 @@ for _name, _prompt in sub_map.items():
             ),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(

@@ -105,7 +105,7 @@ for _name, _prompt in sub_map.items():
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(
@@ -91,7 +91,7 @@ for _name in subjective_all_sets:
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=4096),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(

@@ -90,7 +90,7 @@ for _name in subjective_all_sets:
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=4096),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(

@@ -59,7 +59,7 @@ for _name in subjective_all_sets:
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=4096),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(

@@ -58,7 +58,7 @@ for _name in subjective_all_sets:
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=4096),
+        inferencer=dict(type=GenInferencer,),
     )

     subjective_eval_cfg = dict(
@@ -29,7 +29,7 @@ for _name in subjective_all_sets:
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=2048),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(

@@ -28,7 +28,7 @@ for _name in subjective_all_sets:
             ]),
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=GenInferencer, max_out_len=2048),
+        inferencer=dict(type=GenInferencer),
     )

     subjective_eval_cfg = dict(
@@ -24,7 +24,7 @@ for _name in subjective_all_sets:
             template="""{dialogue}""",
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'),
+        inferencer=dict(type=ChatInferencer, infer_mode='last'),
     )

     subjective_eval_cfg = dict(

@@ -23,7 +23,7 @@ for _name in subjective_all_sets:
             template="""{dialogue}""",
         ),
         retriever=dict(type=ZeroRetriever),
-        inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'),
+        inferencer=dict(type=ChatInferencer, infer_mode='last'),
     )

     subjective_eval_cfg = dict(
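The multi-turn sets differ only in using ChatInferencer with infer_mode='last', which replays the scripted dialogue and generates a reply for the final turn only; the same length caps are dropped there as well. A sketch of that shape, where the '{dialogue}' template comes from the diff and everything else is illustrative:

# Sketch: multi-turn subjective config using ChatInferencer.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer

subjective_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template="""{dialogue}""",
    ),
    retriever=dict(type=ZeroRetriever),
    # infer_mode='last': only the final turn of the dialogue is generated,
    # again without a dataset-level output cap.
    inferencer=dict(type=ChatInferencer, infer_mode='last'),
)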
@@ -198,14 +198,24 @@ class SubjectiveEvalTask(BaseTask):
                 if fnmatch.fnmatch(ds_abbr, pattern):
                     pred_postprocessor = model_postprocessors[pattern]
                     break

             if 'pred_postprocessor' in eval_cfg['evaluator'] or pred_postprocessor:
                 kwargs = pred_postprocessor or eval_cfg['evaluator'][
                     'pred_postprocessor']
-                proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
+                temp_kwargs = copy.deepcopy(kwargs)
+                proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type'))
                 self.logger.info('Get postprocessor {postprocessor}.')
-                pred_strs = [proc(s, **kwargs) for s in pred_strs]
+                pred_strs = [proc(s, **temp_kwargs) for s in pred_strs]
             else:
-                self.logger.info('No postprocessor found.')
+                self.logger.info('No dataset postprocessor found.')

+            if 'pred_postprocessor' in model_cfg or pred_postprocessor:
+                kwargs = pred_postprocessor or model_cfg['pred_postprocessor']
+                temp_kwargs = copy.deepcopy(kwargs)
+                proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type'))
+                pred_strs = [proc(s, **temp_kwargs) for s in pred_strs]
+            else:
+                self.logger.info('No model postprocessor found.')
+
             return {
                 'model_name': model_abbr_from_cfg(model_cfg),
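The added branch is what makes a reasoning model's output usable for judging: a postprocessor can now live in the model config itself (model_cfg['pred_postprocessor']) rather than only in the dataset's evaluator, and its 'type' key is resolved via TEXT_POSTPROCESSORS with the remaining keys passed as kwargs. A sketch of a model entry that would trigger this branch, assuming the think_pred postprocessor registered later in this diff; the abbr/path, token limit and regex are illustrative assumptions, not values from the commit:

# Illustrative model entry; abbr, path, max_out_len and the regex are assumptions.
from opencompass.models import OpenAISDK

models = [
    dict(
        type=OpenAISDK,
        abbr='my-reasoning-model',
        path='my-reasoning-model',
        max_out_len=8192,
        # Picked up by the new "'pred_postprocessor' in model_cfg" branch:
        # 'type' is popped and looked up in TEXT_POSTPROCESSORS, the rest
        # becomes keyword arguments of the postprocessor.
        pred_postprocessor=dict(
            type='think_pred',
            re_pattern=r'(?s)</think>(.*)',  # keep only the text after </think>
        ),
    ),
]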
@@ -329,7 +339,9 @@ class SubjectiveEvalTask(BaseTask):
                 if fnmatch.fnmatch(ds_abbr, pattern):
                     pred_postprocessor = model_postprocessors[pattern]
                     break

             if 'pred_postprocessor' in eval_cfg or pred_postprocessor:

                 kwargs = pred_postprocessor or eval_cfg['pred_postprocessor']
                 proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
                 pred_strs = [proc(s, **kwargs) for s in pred_strs]
@@ -57,6 +57,18 @@ def last_capital_postprocess(text: str) -> str:
     return ''


+@TEXT_POSTPROCESSORS.register_module('think_pred')
+def think_pred_postprocess(
+    prediction: str,
+    re_pattern: str,
+) -> str:
+    match = re.search(re_pattern, prediction)
+    if match:
+        return match.group(1).strip()
+    else:
+        return prediction
+
+
 def first_option_postprocess(text: str, options: str, cushion=True) -> str:
     """Find first valid option for text."""

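think_pred_postprocess applies a caller-supplied regex and, when it matches, keeps the first capture group (stripped); otherwise it returns the prediction untouched. A small usage sketch, assuming the hunk lands in opencompass.utils.text_postprocessors alongside the neighbouring postprocessors; the sample output and pattern are illustrative:

# Illustrative use of the new 'think_pred' postprocessor on reasoning output.
from opencompass.utils.text_postprocessors import think_pred_postprocess

raw = '<think>Compare both answers step by step ...</think>Answer: A'

# The pattern needs one capture group; group(1) becomes the cleaned prediction.
print(think_pred_postprocess(raw, re_pattern=r'(?s)</think>(.*)'))        # Answer: A

# No match: the original prediction is returned unchanged.
print(think_pred_postprocess('Answer: B', re_pattern=r'(?s)</think>(.*)'))  # Answer: B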