from collections import defaultdict

import numpy as np
from mmengine import load

from ..schema import ResponseDataSample
from ..utils.format_load import format_load


class ReviewEvaluator:
    """Review Capability Evaluation.

    Args:
        dataset_path (str): File path of evaluation dataset.
    """

    def __init__(
        self,
        dataset_path: str,
        # bert_score_model: str = "all-mpnet-base-v2",
        **kwargs,
    ) -> None:
        self.dataset_path = dataset_path
        # self.bert_score_model = bert_score_model
        # self.sentence_model = SentenceTransformer(self.bert_score_model)

    def _load_dataset(self):
        self.dataset = []
        dataset = load(self.dataset_path)

        for key in dataset.keys():
            datum = dataset[key]
            data_sample = self._process_response(datum)

            self.dataset.append(
                dict(
                    origin_prompt=datum['origin_prompt'],
                    response_data_sample=data_sample))
        self.num_samples = len(self.dataset)

    def _process_response(
        self,
        datum: dict,
    ) -> ResponseDataSample:
        """Process the response to needed format.

        Args:
            datum (dict): Inputs.

        Returns:
            ResponseDataSample: Processed response data sample.
        """
        template = datum['template']
        pred_data = datum['prediction']
        gt_data = datum['ground_truth']['answer']
        meta_data = datum['meta_data']

        if meta_data['response_format'] == 'json':
            # JSON-style responses are parsed into a dict (or None on failure).
            pred_data = self.json_format_parse(pred_data)
        else:
            # Otherwise keep only the option letter after the first colon,
            # e.g. 'Answer: B' -> 'B'; anything else is treated as unparsable.
            pred_data = pred_data[pred_data.find(':') + 1:]
            pred_data = pred_data.strip()
            if len(pred_data) > 0 and pred_data[0] in ['A', 'B', 'C', 'D', 'E']:
                pred_data = pred_data[0]
            else:
                pred_data = None

        return ResponseDataSample(
            template=template, pred=pred_data, gt=gt_data, meta_data=meta_data)
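
    # Illustrative sketch, not taken from the repository: judging from the
    # field accesses above, each datum loaded from `dataset_path` is assumed
    # to look roughly like the following (all values are placeholders):
    #
    #     {
    #         'origin_prompt': '...',
    #         'template': '...',
    #         'prediction': 'Answer: B',
    #         'ground_truth': {'answer': 'B'},
    #         'meta_data': {'response_format': 'str'},  # or 'json'
    #     }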

    def _evaluate(self, data_sample) -> dict:
        # parse_rate: whether the prediction could be parsed at all;
        # review_quality: exact match between the parsed prediction and the gt.
        metrics_result = dict(
            parse_rate=0,
            review_quality=0,
        )

        pred_data = data_sample.pred
        if pred_data is not None:
            metrics_result['review_quality'] = 1.0 if pred_data == \
                data_sample.gt else 0.0
            metrics_result['parse_rate'] = 1.0
        return metrics_result

    # def compute_sen_similarity(self, gt, pred):
    #     gt_embed = self.sentence_model.encode(gt, convert_to_tensor=True)
    #     pred_embed = self.sentence_model.encode(pred, convert_to_tensor=True)
    #     sen_sim = max(0, util.cos_sim(gt_embed, pred_embed).item())
    #     return sen_sim

    def json_format_parse(self, pred_data):
        # Parse the raw prediction string; any parsing failure or a missing /
        # non-boolean 'is_finished' field makes the prediction count as
        # unparsed.
        try:
            data = format_load(pred_data)
        except Exception:
            return None
        try:
            new_data = dict()
            new_data['review'] = data['is_finished']
            assert new_data['review'] in [True, False]
        except Exception:
            return None
        return new_data
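
    # Illustrative note (assumed, not asserted by the repository): if
    # format_load parses a prediction such as '{"is_finished": true}' into a
    # dict, this method returns {'review': True}; unparsable or malformed
    # predictions return None and are scored as parse failures in _evaluate.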

    def evaluate(self):
        self._load_dataset()
        results_list = []
        for data_sample in self.dataset:
            metrics_result = self._evaluate(
                data_sample['response_data_sample'])
            results_list.append(metrics_result)
        return self._post_process(results_list)

    def _post_process(self, results_list):
        # list of dict to dict of list
        results_dict = defaultdict(list)
        for sub in results_list:
            for key in sub:
                results_dict[key].append(sub[key])
        metric_list = ['parse_rate', 'review_quality']
        for metric in metric_list:
            results_dict[metric] = np.round(
                np.mean(results_dict[metric]), decimals=4)
        return results_dict
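

# Minimal usage sketch (illustrative only; the dataset path below is a
# placeholder, and the file it points to must follow the per-sample schema
# consumed by _process_response):
#
#     evaluator = ReviewEvaluator(dataset_path='path/to/review_dataset.json')
#     scores = evaluator.evaluate()
#     print(scores['parse_rate'], scores['review_quality'])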