OpenCompass/opencompass/datasets/teval/evaluators/review_evaluator.py


from collections import defaultdict
from mmengine import load
from ..schema import ResponseDataSample
import numpy as np
from ..utils.format_load import format_load


class ReviewEvaluator:
    """Review Capability Evaluation.

    Args:
        dataset_path (str): File path of the evaluation dataset.
    """

    def __init__(
        self,
        dataset_path: str,
        # bert_score_model: str = "all-mpnet-base-v2",
        **kwargs,
    ) -> None:
        self.dataset_path = dataset_path
        # self.bert_score_model = bert_score_model
        # self.sentence_model = SentenceTransformer(self.bert_score_model)

    def _load_dataset(self):
        self.dataset = []
        dataset = load(self.dataset_path)
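        # Each record is expected to carry the keys accessed in
        # `_process_response` below: 'origin_prompt', 'template', 'prediction',
        # 'ground_truth' (with an 'answer' entry) and 'meta_data' (with a
        # 'response_format' entry).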
        for key in dataset.keys():
            datum = dataset[key]
            data_sample = self._process_response(datum)
            self.dataset.append(
                dict(
                    origin_prompt=datum['origin_prompt'],
                    response_data_sample=data_sample))
        self.num_samples = len(self.dataset)

    def _process_response(
        self,
        datum: dict,
    ) -> ResponseDataSample:
        """Process a raw prediction into the format needed for evaluation.

        Args:
            datum (dict): One dataset record.

        Returns:
            ResponseDataSample: Processed response data sample.
        """
        template = datum['template']
        pred_data = datum['prediction']
        gt_data = datum['ground_truth']['answer']
        meta_data = datum['meta_data']
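        # Two response formats are handled: a JSON object emitted by the model
        # (parsed via `json_format_parse`) or a plain multiple-choice reply
        # whose first character after the colon should be one of A-E.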
        if meta_data['response_format'] == 'json':
            pred_data = self.json_format_parse(pred_data)
        else:
            pred_data = pred_data[pred_data.find(":") + 1:]
            pred_data = pred_data.strip()
            if len(pred_data) > 0 and pred_data[0] in ['A', 'B', 'C', 'D', 'E']:
                pred_data = pred_data[0]
            else:
                pred_data = None

        return ResponseDataSample(
            template=template, pred=pred_data, gt=gt_data, meta_data=meta_data)

    def _evaluate(self, data_sample) -> dict:
        metrics_result = dict(
            parse_rate=0,
            review_quality=0,
        )
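        # `parse_rate` marks whether the prediction could be parsed at all;
        # `review_quality` is an exact match of the parsed prediction against
        # the ground-truth answer.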
        pred_data = data_sample.pred
        if pred_data is not None:
            metrics_result['review_quality'] = 1.0 if pred_data == \
                data_sample.gt else 0.0
            metrics_result['parse_rate'] = 1.0
        return metrics_result

    # def compute_sen_similarity(self, gt, pred):
    #     gt_embed = self.sentence_model.encode(gt, convert_to_tensor=True)
    #     pred_embed = self.sentence_model.encode(pred, convert_to_tensor=True)
    #     sen_sim = max(0, util.cos_sim(gt_embed, pred_embed).item())
    #     return sen_sim

    def json_format_parse(self, pred_data):
        try:
            data = format_load(pred_data)
        except Exception:
            return None
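        # The parsed object must expose a boolean 'is_finished' field; it is
        # remapped to 'review', and anything else counts as a parse failure.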
        try:
            new_data = dict()
            new_data['review'] = data['is_finished']
            assert new_data['review'] in [True, False]
        except Exception:
            return None
        return new_data

    def evaluate(self):
        self._load_dataset()
        results_list = []
        for data_sample in self.dataset:
            metrics_result = self._evaluate(
                data_sample['response_data_sample'])
            results_list.append(metrics_result)
        return self._post_process(results_list)

    def _post_process(self, results_list):
        # Turn the list of per-sample metric dicts into a dict of lists,
        # then average each metric over the whole dataset.
        results_dict = defaultdict(list)
        for sub in results_list:
            for key in sub:
                results_dict[key].append(sub[key])
        metric_list = ['parse_rate', 'review_quality']
        for metric in metric_list:
            results_dict[metric] = np.round(
                np.mean(results_dict[metric]), decimals=4)
        return results_dict
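

# Illustrative usage sketch (the dataset path below is a placeholder, not a
# real file shipped with this module):
#
#     evaluator = ReviewEvaluator(dataset_path='path/to/review_dataset.json')
#     print(evaluator.evaluate())  # {'parse_rate': ..., 'review_quality': ...}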