diff --git a/opencompass/openicl/icl_evaluator/__init__.py b/opencompass/openicl/icl_evaluator/__init__.py
index 7cc1bbcc..4fc76e7f 100644
--- a/opencompass/openicl/icl_evaluator/__init__.py
+++ b/opencompass/openicl/icl_evaluator/__init__.py
@@ -7,5 +7,6 @@ from .icl_hf_evaluator import *  # noqa
 from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator  # noqa
 from .icl_misc_evaluator import AverageMinKEvaluator  # noqa
 from .icl_misc_evaluator import AveragePPLEvaluator  # noqa
+from .icl_plugin_evaluator import TEvalEvaluator  # noqa
 from .icl_toxic_evaluator import ToxicEvaluator  # noqa
 from .lm_evaluator import LMEvaluator  # noqa
diff --git a/opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py b/opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py
new file mode 100644
index 00000000..61fdbd23
--- /dev/null
+++ b/opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py
@@ -0,0 +1,102 @@
+"""Plugin Evaluator."""
+
+import json
+
+
+class TEvalEvaluator:
+    """Evaluator for the T-Eval benchmark, which measures several
+    capability dimensions of an LLM; ``subset`` selects the dimension.
+
+    Specifically, InstructEvaluator evaluates the instruction-following
+    capability of the LLM, i.e. the ability of the model to perform tool
+    calls according to a predefined format. PlanningEvaluator evaluates
+    the model's ability to plan a solution or program based on a given
+    task. ReviewEvaluator evaluates the model's ability to review whether
+    a task was successfully completed. ReasonRetrieveUnderstandEvaluator
+    evaluates, separately or jointly, the model's ability to reason about
+    the next execution step based on historical observations and to
+    retrieve a subset of tools relevant to the given task from a large
+    number of tools.
+ """ + + def __init__(self, subset) -> None: + + from opencompass.datasets.teval.evaluators import ( + InstructEvaluator, PlanningEvaluator, + ReasonRetrieveUnderstandEvaluator, ReviewEvaluator) + + super().__init__() + self.subset = subset + if subset == 'instruct': + self.evaluator = InstructEvaluator('') + elif subset == 'plan': + self.evaluator = PlanningEvaluator('') + elif subset == 'review': + self.evaluator = ReviewEvaluator('') + elif subset == 'reason_retrieve_understand': + self.evaluator = ReasonRetrieveUnderstandEvaluator('') + elif subset == 'reason': + self.evaluator = ReasonRetrieveUnderstandEvaluator( + '', default_prompt_type='str', eval_type='reason') + elif subset == 'retrieve': + self.evaluator = ReasonRetrieveUnderstandEvaluator( + '', default_prompt_type='str', eval_type='retrieve') + elif subset == 'understand': + self.evaluator = ReasonRetrieveUnderstandEvaluator( + '', default_prompt_type='str', eval_type='understand') + + elif subset == 'instruct_zh': + self.evaluator = InstructEvaluator('') + elif subset == 'plan_zh': + self.evaluator = PlanningEvaluator( + '', bert_score_model='thenlper/gte-large-zh') + elif subset == 'review_zh': + self.evaluator = ReviewEvaluator('') + elif subset == 'reason_retrieve_understand_zh': + self.evaluator = ReasonRetrieveUnderstandEvaluator( + '', bert_score_model='thenlper/gte-large-zh') + elif subset == 'reason_zh': + self.evaluator = ReasonRetrieveUnderstandEvaluator( + '', + default_prompt_type='str', + eval_type='reason', + bert_score_model='thenlper/gte-large-zh') + elif subset == 'retrieve_zh': + self.evaluator = ReasonRetrieveUnderstandEvaluator( + '', default_prompt_type='str', eval_type='retrieve') + elif subset == 'understand_zh': + self.evaluator = ReasonRetrieveUnderstandEvaluator( + '', + default_prompt_type='str', + eval_type='understand', + bert_score_model='thenlper/gte-large-zh') + else: + raise NotImplementedError + + def score(self, predictions, references): + + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different ' + 'length' + } + + results_list = [] + for prediction, reference in zip(predictions, references): + + datum = json.loads(reference) + datum['prediction'] = prediction + + data_sample = self.evaluator._process_response(datum) + if isinstance(data_sample, tuple): + data_sample = data_sample[0] + metrics_result = self.evaluator._evaluate(data_sample) + results_list.append(metrics_result) + results_dict = self.evaluator._post_process(results_list) + results_dict = {k: v * 100 for k, v in results_dict.items()} + return results_dict