[Sync] Fix TEvalEvaluator (#929)

Fengzhe Zhou 2024-02-28 16:05:30 +08:00 committed by GitHub
parent ba7cd58da3
commit 9afbfa3639
2 changed files with 98 additions and 0 deletions

opencompass/openicl/icl_evaluator/__init__.py

@@ -7,5 +7,6 @@ from .icl_hf_evaluator import *  # noqa
from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator # noqa
from .icl_misc_evaluator import AverageMinKEvaluator # noqa
from .icl_misc_evaluator import AveragePPLEvaluator # noqa
from .icl_plugin_evaluator import TEvalEvaluator # noqa
from .icl_toxic_evaluator import ToxicEvaluator # noqa
from .lm_evaluator import LMEvaluator # noqa
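
With the new export in place, downstream code can import the evaluator from the package directly (a one-line sketch; the fully qualified path opencompass.openicl.icl_evaluator is assumed from the relative imports above):

from opencompass.openicl.icl_evaluator import TEvalEvaluator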

opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py

@@ -0,0 +1,97 @@
"""Plugin Evaluator."""
import json
class TEvalEvaluator:
"""This module contains the following evaluators for evaluating the
capabilities of the various dimensions of the LLM.
specifically, InstructEvaluator is used to evaluate the instruction
following capability of LLM, i.e. the ability of the model to perform tool
calls according to an predefined format. ReasoningEvaluator is used to
evaluate the model's ability to reason about the next execution step based
on historical observations. PlanningEvaluator is used to evaluate the
model's ability to plan a solution or program based on a given task.
APIRetrievalEvaluator is used to evaluate the model's ability to retrieve a
subset of tools relevant to the given task from a large number of tools.
ReviewEvaluator is used to evaluate the model's ability to review whether a
task was successfully completed.
"""
def __init__(self, subset) -> None:
from opencompass.datasets.teval.evaluators import (
InstructEvaluator, PlanningEvaluator,
ReasonRetrieveUnderstandEvaluator, ReviewEvaluator)
super().__init__()
self.subset = subset
if subset == 'instruct':
self.evaluator = InstructEvaluator('')
elif subset == 'plan':
self.evaluator = PlanningEvaluator('')
elif subset == 'review':
self.evaluator = ReviewEvaluator('')
elif subset == 'reason_retrieve_understand':
self.evaluator = ReasonRetrieveUnderstandEvaluator('')
elif subset == 'reason':
self.evaluator = ReasonRetrieveUnderstandEvaluator(
'', default_prompt_type='str', eval_type='reason')
elif subset == 'retrieve':
self.evaluator = ReasonRetrieveUnderstandEvaluator(
'', default_prompt_type='str', eval_type='retrieve')
elif subset == 'understand':
self.evaluator = ReasonRetrieveUnderstandEvaluator(
'', default_prompt_type='str', eval_type='understand')
elif subset == 'instruct_zh':
self.evaluator = InstructEvaluator('')
elif subset == 'plan_zh':
self.evaluator = PlanningEvaluator(
'', bert_score_model='thenlper/gte-large-zh')
elif subset == 'review_zh':
self.evaluator = ReviewEvaluator('')
elif subset == 'reason_retrieve_understand_zh':
self.evaluator = ReasonRetrieveUnderstandEvaluator(
'', bert_score_model='thenlper/gte-large-zh')
elif subset == 'reason_zh':
self.evaluator = ReasonRetrieveUnderstandEvaluator(
'',
default_prompt_type='str',
eval_type='reason',
bert_score_model='thenlper/gte-large-zh')
elif subset == 'retrieve_zh':
self.evaluator = ReasonRetrieveUnderstandEvaluator(
'', default_prompt_type='str', eval_type='retrieve')
elif subset == 'understand_zh':
self.evaluator = ReasonRetrieveUnderstandEvaluator(
'',
default_prompt_type='str',
eval_type='understand',
bert_score_model='thenlper/gte-large-zh')
else:
raise NotImplementedError
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
results_list = []
for prediction, reference in zip(predictions, references):
datum = json.loads(reference)
datum['prediction'] = prediction
data_sample = self.evaluator._process_response(datum)
if isinstance(data_sample, tuple):
data_sample = data_sample[0]
metrics_result = self.evaluator._evaluate(data_sample)
results_list.append(metrics_result)
results_dict = self.evaluator._post_process(results_list)
results_dict = {k: v * 100 for k, v in results_dict.items()}
return results_dict
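
A minimal usage sketch of the evaluator, assuming opencompass and the teval dataset dependencies are installed; load_teval_samples and run_model below are hypothetical placeholders for however the JSON-serialized teval samples and model outputs are produced, and the exact reference fields are whatever the chosen teval sub-evaluator's _process_response expects.

import json

from opencompass.openicl.icl_evaluator import TEvalEvaluator

# Pick a teval subset; '_zh' variants use a Chinese bert-score model where
# applicable.
evaluator = TEvalEvaluator(subset='instruct')

# Hypothetical data plumbing: each reference must json.loads() into the dict
# the wrapped teval evaluator expects; 'prediction' is attached inside score().
samples = load_teval_samples()                  # hypothetical loader
predictions = [run_model(s) for s in samples]   # hypothetical model call
references = [json.dumps(s) for s in samples]

scores = evaluator.score(predictions, references)
print(scores)  # {metric_name: value}, scaled to percentages by score()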