OpenCompass/opencompass/tasks/subjective_eval.py

import argparse
import copy
import fnmatch
import os.path as osp
import random
import time
from typing import List, Union
import mmengine
from mmengine.config import Config, ConfigDict
from mmengine.utils import mkdir_or_exist
from opencompass.registry import ICL_EVALUATORS, MODELS, TEXT_POSTPROCESSORS
from opencompass.tasks.base import BaseTask
from opencompass.tasks.openicl_eval import extract_role_pred
from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
get_infer_output_path, get_logger,
model_abbr_from_cfg, task_abbr_from_cfg)
class SubjectiveEvalTask(BaseTask):
"""Subjective Evaluation Task.
This task is used to evaluate the metric between predictions and
references.
Args:
cfg (ConfigDict): The configuration of the entire evaluation task.
"""
name_prefix = 'SubjectiveEval'
log_subdir = 'logs/eval'
output_subdir = 'results'
def __init__(self, cfg: ConfigDict):
super().__init__(cfg)
self.logger = get_logger()
judge_cfg = cfg.eval.runner.task.get('judge_cfg', {})
        if type(judge_cfg) is not ConfigDict:
            print('*' * 100)
            print('Different judge models need different summarizers and '
                  'prompts, so evaluating multiple judge models in one run '
                  'is not supported. Please set judge_cfg to a single dict '
                  '(e.g. the first element of your list) instead of a list. '
                  'To evaluate several judge models, launch one config per '
                  'judge from a shell or batch script.')
            print('*' * 100)
        assert type(judge_cfg) is ConfigDict
run_cfg = judge_cfg.get('run_cfg', {})
self.num_gpus = run_cfg.get('num_gpus', 0)
self.num_procs = run_cfg.get('num_procs', 1)
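        # The judge's run_cfg decides how get_command launches this script:
        # torchrun when num_gpus > 0, otherwise a plain python invocation.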
self.judge_cfg = copy.deepcopy(judge_cfg)
self.given_pred = cfg.eval.get('given_pred', [])
def get_command(self, cfg_path, template):
"""Get the command template for the task.
Args:
cfg_path (str): The path to the config file of the task.
template (str): The template which have '{task_cmd}' to format
the command.
"""
script_path = __file__
if self.num_gpus > 0:
port = random.randint(12000, 32000)
command = (f'torchrun --master_port={port} '
f'--nproc_per_node {self.num_procs} '
f'{script_path} {cfg_path}')
else:
command = f'python {script_path} {cfg_path}'
return template.format(task_cmd=command)
def run(self):
# model_cfg can be a list of model configs
for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):
for dataset_cfg in dataset_cfgs:
# Load Dataset
eval_cfg = dataset_cfg.get('eval_cfg')
output_column = dataset_cfg['reader_cfg']['output_column']
if type(model_cfg) == ConfigDict:
model_cfg = (model_cfg, )
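                # Append a pseudo config carrying the judge abbr so that the
                # results path below is namespaced per judge model
                # ('judged-by--<judge_abbr>').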
model_cfg += ({
'abbr':
'judged-by--' + model_abbr_from_cfg(self.judge_cfg)
}, )
out_path = get_infer_output_path(
model_cfg, dataset_cfg, osp.join(self.work_dir, 'results'))
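                # Skip datasets whose judged results already exist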
if osp.exists(out_path):
continue
self._score(model_cfg, dataset_cfg, eval_cfg, output_column)
def _load_model_pred(
self,
model_cfg: Union[ConfigDict, List[ConfigDict]],
dataset_cfg: ConfigDict,
eval_cfg: ConfigDict,
given_preds: List[dict],
) -> Union[None, List[str]]:
if isinstance(model_cfg, (tuple, list)):
return [
self._load_model_pred(m, dataset_cfg, eval_cfg, given_preds)
for m in model_cfg
]
pred_strs = None
        # There are five situations to handle:
        # 1. No partition in either the infer or the judge stage
        # 2. No partition in the infer stage, partition in the judge stage
        # 3. Partition in the infer stage, no partition in the judge stage
        # 4. Partition in both stages, with the same partition size
        # 5. Partition in both stages, with different partition sizes
        # If SubjectSizePartition is used in the judge stage, strip the
        # partition suffix to get the base filename
if 'test_range' in dataset_cfg['reader_cfg']:
filename = get_infer_output_path(
model_cfg, dataset_cfg, osp.join(self.work_dir, 'predictions'))
root, ext = osp.splitext(filename)
last_underscore_index = root.rfind('_')
root = root[:last_underscore_index]
filename = root + ext
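            # e.g. '<dataset>_3.json' is reduced to '<dataset>.json'
            # (illustrative names)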
        # If SubjectNaivePartition is used in the judge stage, keep the
        # filename as is
else:
filename = get_infer_output_path(
model_cfg, dataset_cfg, osp.join(self.work_dir, 'predictions'))
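        # Each entry of given_preds is expected to look like
        # {'abbr': '<model_abbr>', 'path': '<dir_with_prediction_files>'}
        # (illustrative values); a matching abbr redirects prediction loading
        # to that directory.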
for given_pred in given_preds:
abbr = given_pred['abbr']
path = given_pred['path']
if abbr == model_cfg['abbr']:
filename = osp.join(path, osp.basename(filename))
# Get partition name
root, ext = osp.splitext(filename)
partial_filename = root + '_0' + ext
        # Abort if no prediction files are found in the predictions dir
        assert osp.exists(filename) or osp.exists(
            osp.realpath(partial_filename)
        ), f'No predictions found for {filename}.'
        # If the infer stage used the Naive partition
if osp.exists(osp.realpath(filename)):
preds = mmengine.load(filename)
pred_strs = [
preds[str(i)]['prediction'] for i in range(len(preds))
]
        # If the infer stage used the Size partition
        else:
filename = partial_filename
pred_strs = []
i = 1
while osp.exists(osp.realpath(filename)):
preds = mmengine.load(filename)
filename = root + f'_{i}' + ext
i += 1
pred_strs += [
preds[str(i)]['prediction'] for i in range(len(preds))
]
        # All predictions are now collected in pred_strs
        # If SubjectSizePartition is used in the judge stage, slice pred_strs
        # according to test_range
if 'test_range' in dataset_cfg['reader_cfg']:
test_range = dataset_cfg['reader_cfg']['test_range']
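            # test_range is expected to be a slice expression such as
            # '[0:100]' (an illustrative value), so the eval below amounts to
            # pred_strs[0:100]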
pred_strs = eval('pred_strs' + test_range)
        # If SubjectNaivePartition is used in the judge stage, keep all
        # pred_strs as they are
else:
pred_strs = pred_strs
if ('pred_role' in eval_cfg and 'meta_template' in model_cfg
and not MODELS.get(model_cfg['type']).is_api
and isinstance(pred_strs[0], str)):
# Create a prompt template for role config parsing
from opencompass.models.base import LMTemplateParser
parser = LMTemplateParser(model_cfg['meta_template'])
role = parser.roles[eval_cfg['pred_role']]
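            # Keep only the text between the role's begin/end markers so the
            # judge sees the model's own reply rather than the full prompt
            # template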
pred_strs = [
extract_role_pred(pred, role.get('begin', None),
role.get('end', None)) for pred in pred_strs
]
# Postprocess predictions if necessary
ds_abbr = dataset_abbr_from_cfg(dataset_cfg)
model_postprocessors = model_cfg.get('pred_postprocessor', {})
pred_postprocessor = None
for pattern in model_postprocessors.keys():
if fnmatch.fnmatch(ds_abbr, pattern):
pred_postprocessor = model_postprocessors[pattern]
break
        if 'pred_postprocessor' in eval_cfg or pred_postprocessor:
            # Copy first so that popping 'type' does not mutate the shared
            # config
            kwargs = copy.deepcopy(pred_postprocessor
                                   or eval_cfg['pred_postprocessor'])
            proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
            pred_strs = [proc(s, **kwargs) for s in pred_strs]
return {
'model_name': model_abbr_from_cfg(model_cfg),
'model_preds': pred_strs
}
def _score(self, model_cfg, dataset_cfg, eval_cfg, output_column):
test_set = build_dataset_from_cfg(dataset_cfg).test
# Postprocess dataset if necessary
if 'dataset_postprocessor' in eval_cfg:
proc = TEXT_POSTPROCESSORS.get(
eval_cfg['dataset_postprocessor']['type'])
def postprocess(sample):
s = sample[output_column]
sample[output_column] = proc(s)
return sample
test_set = test_set.map(postprocess)
# Get out_path
out_path = get_infer_output_path(model_cfg, dataset_cfg,
osp.join(self.work_dir, 'results'))
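        # model_cfg is a tuple of real model configs plus the single-key
        # 'judged-by--*' abbr dict appended in run(); keep only entries with
        # more than one key, i.e. the real model configs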
new_model_cfg = []
for m_cfg in model_cfg:
if len(m_cfg) > 1:
new_model_cfg.append(m_cfg)
if len(new_model_cfg) == 1:
new_model_cfg = new_model_cfg[0]
model_preds = self._load_model_pred(new_model_cfg, dataset_cfg,
eval_cfg, self.given_pred)
if not self.judge_cfg:
raise ValueError('missing "eval.runner.task.judge_cfg"')
eval_cfg['evaluator']['judge_cfg'] = self.judge_cfg
eval_cfg['evaluator']['dataset_cfg'] = dataset_cfg
eval_cfg['evaluator']['output_path'] = out_path
icl_evaluator = ICL_EVALUATORS.build(eval_cfg['evaluator'])
references = (test_set[output_column] if output_column else None)
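        # If an 'error' entry was propagated while loading predictions, pass
        # it through instead of scoring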
if 'error' not in model_preds:
result = icl_evaluator.score(predictions=model_preds,
references=references)
else:
result = model_preds
if 'error' in result:
self.logger.error(
f'Task {task_abbr_from_cfg(self.cfg)}: {result["error"]}')
return
else:
self.logger.info(
f'Task {task_abbr_from_cfg(self.cfg)}') #: {result}')
# Save result
mkdir_or_exist(osp.split(out_path)[0])
        with open(out_path, 'w', encoding='utf-8') as f:
            mmengine.dump(result,
                          f,
                          file_format='json',
                          ensure_ascii=False,
                          indent=4)
def get_output_paths(self, file_extension: str = 'json') -> List[str]:
"""Get the paths to the output files. Every file should exist if the
task succeeds.
Args:
file_extension (str): The file extension of the output files.
Default: 'json'.
"""
output_paths = []
for model, datasets in zip(self.model_cfgs, self.dataset_cfgs):
for dataset in datasets:
if type(model) == ConfigDict:
model = (model, )
model += ({
'abbr':
'judged-by--' + model_abbr_from_cfg(self.judge_cfg)
}, )
output_paths.append(
get_infer_output_path(
model, dataset,
osp.join(self.work_dir, self.output_subdir),
file_extension))
return output_paths
def parse_args():
parser = argparse.ArgumentParser(description='Score Calculator')
parser.add_argument('config', help='Config file path')
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
cfg = Config.fromfile(args.config)
start_time = time.time()
inferencer = SubjectiveEvalTask(cfg)
inferencer.run()
end_time = time.time()
get_logger().info(f'time elapsed: {end_time - start_time:.2f}s')