OpenCompass/opencompass/tasks/subjective_eval.py

import argparse
import copy
import fnmatch
import os.path as osp
import random
import time
from typing import List, Union
import mmengine
from mmengine.config import Config, ConfigDict
from mmengine.utils import mkdir_or_exist
from opencompass.registry import ICL_EVALUATORS, MODELS, TEXT_POSTPROCESSORS
from opencompass.tasks.base import BaseTask
from opencompass.tasks.openicl_eval import extract_role_pred
from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
get_infer_output_path, get_logger,
model_abbr_from_cfg, task_abbr_from_cfg)
class SubjectiveEvalTask(BaseTask):
"""Subjective Evaluation Task.
This task is used to evaluate the metric between predictions and
references.
Args:
cfg (ConfigDict): The configuration of the entire evaluation task.
"""
name_prefix = 'SubjectiveEval'
log_subdir = 'logs/eval'
output_subdir = 'results'
def __init__(self, cfg: ConfigDict):
super().__init__(cfg)
self.logger = get_logger()
judge_cfg = cfg.eval.runner.task.get('judge_cfg', {})
        if type(judge_cfg) is not ConfigDict:
            print('*' * 100)
            print('Different judge models need different summarizers and '
                  'prompts, so evaluating multiple judge models in one run '
                  'is not supported. Please set judge_cfg to a single dict '
                  '(e.g. the first element of your list) instead of a list. '
                  'To evaluate several judge models, launch one config per '
                  'judge from a shell or batch script.')
            print('*' * 100)
        assert type(judge_cfg) is ConfigDict
run_cfg = judge_cfg.get('run_cfg', {})
self.num_gpus = run_cfg.get('num_gpus', 0)
self.num_procs = run_cfg.get('num_procs', 1)
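        # The judge's run_cfg decides how get_command launches this script:
        # torchrun when num_gpus > 0, otherwise a plain python invocation.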
self.judge_cfg = copy.deepcopy(judge_cfg)
self.given_pred = cfg.eval.get('given_pred', [])
def get_command(self, cfg_path, template):
"""Get the command template for the task.
Args:
cfg_path (str): The path to the config file of the task.
template (str): The template which have '{task_cmd}' to format
the command.
"""
script_path = __file__
if self.num_gpus > 0:
port = random.randint(12000, 32000)
command = (f'torchrun --master_port={port} '
f'--nproc_per_node {self.num_procs} '
f'{script_path} {cfg_path}')
else:
command = f'python {script_path} {cfg_path}'
return template.format(task_cmd=command)
def run(self):
# model_cfg can be a list of model configs
for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):
for dataset_cfg in dataset_cfgs:
# Load Dataset
eval_cfg = dataset_cfg.get('eval_cfg')
output_column = dataset_cfg['reader_cfg']['output_column']
if type(model_cfg) == ConfigDict:
model_cfg = (model_cfg, )
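                # Append a pseudo config carrying the judge abbr so that the
                # results path below is namespaced per judge model
                # ('judged-by--<judge_abbr>').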
model_cfg += ({
'abbr':
'judged-by--' + model_abbr_from_cfg(self.judge_cfg)
}, )
out_path = get_infer_output_path(
model_cfg, dataset_cfg, osp.join(self.work_dir, 'results'))
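                # Skip datasets whose judged results already exist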
if osp.exists(out_path):
continue
self._score(model_cfg, dataset_cfg, eval_cfg, output_column)
def _load_model_pred(
self,
model_cfg: Union[ConfigDict, List[ConfigDict]],
dataset_cfg: ConfigDict,
eval_cfg: ConfigDict,
given_preds: List[dict],
) -> Union[None, List[str]]:
if isinstance(model_cfg, (tuple, list)):
return [
self._load_model_pred(m, dataset_cfg, eval_cfg, given_preds)
for m in model_cfg
]
pred_strs = None
        # There are five situations to handle:
        # 1. No partition in either the infer or the judge stage
        # 2. No partition in the infer stage, partition in the judge stage
        # 3. Partition in the infer stage, no partition in the judge stage
        # 4. Partition in both stages, with the same partition size
        # 5. Partition in both stages, with different partition sizes
        # If SubjectSizePartition is used in the judge stage, strip the
        # partition suffix to get the base filename
if 'test_range' in dataset_cfg['reader_cfg']:
filename = get_infer_output_path(
model_cfg, dataset_cfg, osp.join(self.work_dir, 'predictions'))
root, ext = osp.splitext(filename)
last_underscore_index = root.rfind('_')
root = root[:last_underscore_index]
filename = root + ext
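            # e.g. '<dataset>_3.json' is reduced to '<dataset>.json'
            # (illustrative names)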
        # If SubjectNaivePartition is used in the judge stage, keep the
        # filename as is
else:
filename = get_infer_output_path(
model_cfg, dataset_cfg, osp.join(self.work_dir, 'predictions'))
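        # Each entry of given_preds is expected to look like
        # {'abbr': '<model_abbr>', 'path': '<dir_with_prediction_files>'}
        # (illustrative values); a matching abbr redirects prediction loading
        # to that directory.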
for given_pred in given_preds:
abbr = given_pred['abbr']
path = given_pred['path']
if abbr == model_cfg['abbr']:
filename = osp.join(path, osp.basename(filename))
# Get partition name
root, ext = osp.splitext(filename)
partial_filename = root + '_0' + ext
        # Abort if no prediction files are found in the predictions dir
        assert osp.exists(filename) or osp.exists(
            osp.realpath(partial_filename)
        ), f'No predictions found for {filename}.'
        # If the infer stage used the Naive partition
if osp.exists(osp.realpath(filename)):
preds = mmengine.load(filename)
pred_strs = [
preds[str(i)]['prediction'] for i in range(len(preds))
]
        # If the infer stage used the Size partition
        else:
filename = partial_filename
pred_strs = []
i = 1
while osp.exists(osp.realpath(filename)):
preds = mmengine.load(filename)
filename = root + f'_{i}' + ext
i += 1
pred_strs += [
preds[str(i)]['prediction'] for i in range(len(preds))
]
        # All predictions are now collected in pred_strs
        # If SubjectSizePartition is used in the judge stage, slice pred_strs
        # according to test_range
if 'test_range' in dataset_cfg['reader_cfg']:
test_range = dataset_cfg['reader_cfg']['test_range']
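            # test_range is expected to be a slice expression such as
            # '[0:100]' (an illustrative value), so the eval below amounts to
            # pred_strs[0:100]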
pred_strs = eval('pred_strs' + test_range)
        # If SubjectNaivePartition is used in the judge stage, keep all
        # pred_strs as they are
else:
pred_strs = pred_strs
if ('pred_role' in eval_cfg and 'meta_template' in model_cfg
and not MODELS.get(model_cfg['type']).is_api
and isinstance(pred_strs[0], str)):
# Create a prompt template for role config parsing
from opencompass.models.base import LMTemplateParser
parser = LMTemplateParser(model_cfg['meta_template'])
role = parser.roles[eval_cfg['pred_role']]
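            # Keep only the text between the role's begin/end markers so the
            # judge sees the model's own reply rather than the full prompt
            # template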
pred_strs = [
extract_role_pred(pred, role.get('begin', None),
role.get('end', None)) for pred in pred_strs
]
# Postprocess predictions if necessary
ds_abbr = dataset_abbr_from_cfg(dataset_cfg)
model_postprocessors = model_cfg.get('pred_postprocessor', {})
pred_postprocessor = None
for pattern in model_postprocessors.keys():
if fnmatch.fnmatch(ds_abbr, pattern):
pred_postprocessor = model_postprocessors[pattern]
break
        if 'pred_postprocessor' in eval_cfg or pred_postprocessor:
            # Copy first so that popping 'type' does not mutate the shared
            # config
            kwargs = copy.deepcopy(pred_postprocessor
                                   or eval_cfg['pred_postprocessor'])
            proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
            pred_strs = [proc(s, **kwargs) for s in pred_strs]
return {
'model_name': model_abbr_from_cfg(model_cfg),
'model_preds': pred_strs
}
def _score(self, model_cfg, dataset_cfg, eval_cfg, output_column):
test_set = build_dataset_from_cfg(dataset_cfg).test
# Postprocess dataset if necessary
if 'dataset_postprocessor' in eval_cfg:
proc = TEXT_POSTPROCESSORS.get(
eval_cfg['dataset_postprocessor']['type'])
def postprocess(sample):
s = sample[output_column]
sample[output_column] = proc(s)
return sample
test_set = test_set.map(postprocess)
# Get out_path
out_path = get_infer_output_path(model_cfg, dataset_cfg,
osp.join(self.work_dir, 'results'))
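        # model_cfg is a tuple of real model configs plus the single-key
        # 'judged-by--*' abbr dict appended in run(); keep only entries with
        # more than one key, i.e. the real model configs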
new_model_cfg = []
for m_cfg in model_cfg:
if len(m_cfg) > 1:
new_model_cfg.append(m_cfg)
if len(new_model_cfg) == 1:
new_model_cfg = new_model_cfg[0]
model_preds = self._load_model_pred(new_model_cfg, dataset_cfg,
eval_cfg, self.given_pred)
if not self.judge_cfg:
raise ValueError('missing "eval.runner.task.judge_cfg"')
eval_cfg['evaluator']['judge_cfg'] = self.judge_cfg
eval_cfg['evaluator']['dataset_cfg'] = dataset_cfg
eval_cfg['evaluator']['output_path'] = out_path
icl_evaluator = ICL_EVALUATORS.build(eval_cfg['evaluator'])
references = (test_set[output_column] if output_column else None)
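        # If an 'error' entry was propagated while loading predictions, pass
        # it through instead of scoring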
if 'error' not in model_preds:
result = icl_evaluator.score(predictions=model_preds,
references=references)
else:
result = model_preds
if 'error' in result:
self.logger.error(
f'Task {task_abbr_from_cfg(self.cfg)}: {result["error"]}')
return
else:
self.logger.info(
f'Task {task_abbr_from_cfg(self.cfg)}') #: {result}')
# Save result
mkdir_or_exist(osp.split(out_path)[0])
        with open(out_path, 'w', encoding='utf-8') as f:
            mmengine.dump(result,
                          f,
                          file_format='json',
                          ensure_ascii=False,
                          indent=4)
def get_output_paths(self, file_extension: str = 'json') -> List[str]:
"""Get the paths to the output files. Every file should exist if the
task succeeds.
Args:
file_extension (str): The file extension of the output files.
Default: 'json'.
"""
output_paths = []
for model, datasets in zip(self.model_cfgs, self.dataset_cfgs):
for dataset in datasets:
if type(model) == ConfigDict:
model = (model, )
model += ({
'abbr':
'judged-by--' + model_abbr_from_cfg(self.judge_cfg)
}, )
output_paths.append(
get_infer_output_path(
model, dataset,
osp.join(self.work_dir, self.output_subdir),
file_extension))
return output_paths
def parse_args():
parser = argparse.ArgumentParser(description='Score Calculator')
parser.add_argument('config', help='Config file path')
args = parser.parse_args()
return args
if __name__ == '__main__':
args = parse_args()
cfg = Config.fromfile(args.config)
start_time = time.time()
inferencer = SubjectiveEvalTask(cfg)
inferencer.run()
end_time = time.time()
get_logger().info(f'time elapsed: {end_time - start_time:.2f}s')