[Sync] Fix TEvalEvaluator (#929)

Fengzhe Zhou 2024-02-28 16:05:30 +08:00 committed by GitHub
parent ba7cd58da3
commit 9afbfa3639
2 changed files with 98 additions and 0 deletions

opencompass/openicl/icl_evaluator/__init__.py

@@ -7,5 +7,6 @@ from .icl_hf_evaluator import *  # noqa
from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator # noqa
from .icl_misc_evaluator import AverageMinKEvaluator # noqa
from .icl_misc_evaluator import AveragePPLEvaluator # noqa
from .icl_plugin_evaluator import TEvalEvaluator # noqa
from .icl_toxic_evaluator import ToxicEvaluator # noqa
from .lm_evaluator import LMEvaluator # noqa
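
With the new export in place, downstream code can import the evaluator from the package directly (a one-line sketch; the fully qualified path opencompass.openicl.icl_evaluator is assumed from the relative imports above):

from opencompass.openicl.icl_evaluator import TEvalEvaluator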

opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py

@@ -0,0 +1,97 @@
"""Plugin Evaluator."""
import json
class TEvalEvaluator:
"""This module contains the following evaluators for evaluating the
capabilities of the various dimensions of the LLM.
specifically, InstructEvaluator is used to evaluate the instruction
following capability of LLM, i.e. the ability of the model to perform tool
calls according to an predefined format. ReasoningEvaluator is used to
evaluate the model's ability to reason about the next execution step based
on historical observations. PlanningEvaluator is used to evaluate the
model's ability to plan a solution or program based on a given task.
APIRetrievalEvaluator is used to evaluate the model's ability to retrieve a
subset of tools relevant to the given task from a large number of tools.
ReviewEvaluator is used to evaluate the model's ability to review whether a
task was successfully completed.
"""
def __init__(self, subset) -> None:
from opencompass.datasets.teval.evaluators import (
InstructEvaluator, PlanningEvaluator,
ReasonRetrieveUnderstandEvaluator, ReviewEvaluator)
super().__init__()
self.subset = subset
if subset == 'instruct':
self.evaluator = InstructEvaluator('')
elif subset == 'plan':
self.evaluator = PlanningEvaluator('')
elif subset == 'review':
self.evaluator = ReviewEvaluator('')
elif subset == 'reason_retrieve_understand':
self.evaluator = ReasonRetrieveUnderstandEvaluator('')
elif subset == 'reason':
self.evaluator = ReasonRetrieveUnderstandEvaluator(
'', default_prompt_type='str', eval_type='reason')
elif subset == 'retrieve':
self.evaluator = ReasonRetrieveUnderstandEvaluator(
'', default_prompt_type='str', eval_type='retrieve')
elif subset == 'understand':
self.evaluator = ReasonRetrieveUnderstandEvaluator(
'', default_prompt_type='str', eval_type='understand')
elif subset == 'instruct_zh':
self.evaluator = InstructEvaluator('')
elif subset == 'plan_zh':
self.evaluator = PlanningEvaluator(
'', bert_score_model='thenlper/gte-large-zh')
elif subset == 'review_zh':
self.evaluator = ReviewEvaluator('')
elif subset == 'reason_retrieve_understand_zh':
self.evaluator = ReasonRetrieveUnderstandEvaluator(
'', bert_score_model='thenlper/gte-large-zh')
elif subset == 'reason_zh':
self.evaluator = ReasonRetrieveUnderstandEvaluator(
'',
default_prompt_type='str',
eval_type='reason',
bert_score_model='thenlper/gte-large-zh')
elif subset == 'retrieve_zh':
self.evaluator = ReasonRetrieveUnderstandEvaluator(
'', default_prompt_type='str', eval_type='retrieve')
elif subset == 'understand_zh':
self.evaluator = ReasonRetrieveUnderstandEvaluator(
'',
default_prompt_type='str',
eval_type='understand',
bert_score_model='thenlper/gte-large-zh')
else:
raise NotImplementedError
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
results_list = []
for prediction, reference in zip(predictions, references):
datum = json.loads(reference)
datum['prediction'] = prediction
data_sample = self.evaluator._process_response(datum)
if isinstance(data_sample, tuple):
data_sample = data_sample[0]
metrics_result = self.evaluator._evaluate(data_sample)
results_list.append(metrics_result)
results_dict = self.evaluator._post_process(results_list)
results_dict = {k: v * 100 for k, v in results_dict.items()}
return results_dict
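
A minimal usage sketch of the evaluator, assuming opencompass and the teval dataset dependencies are installed; load_teval_samples and run_model below are hypothetical placeholders for however the JSON-serialized teval samples and model outputs are produced, and the exact reference fields are whatever the chosen teval sub-evaluator's _process_response expects.

import json

from opencompass.openicl.icl_evaluator import TEvalEvaluator

# Pick a teval subset; '_zh' variants use a Chinese bert-score model where
# applicable.
evaluator = TEvalEvaluator(subset='instruct')

# Hypothetical data plumbing: each reference must json.loads() into the dict
# the wrapped teval evaluator expects; 'prediction' is attached inside score().
samples = load_teval_samples()                  # hypothetical loader
predictions = [run_model(s) for s in samples]   # hypothetical model call
references = [json.dumps(s) for s in samples]

scores = evaluator.score(predictions, references)
print(scores)  # {metric_name: value}, scaled to percentages by score()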