# flake8: noqa
# yapf: disable
import json
from typing import Dict, Optional

from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET

from .base import BaseDataset
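
# Reading of the task, inferred from the code below: every sample is a
# truncated passage from either an original ('origin') or a paraphrased
# ('rewritten') benchmark file, and the model must reproduce the next
# `num_gram` tokens. A large gap between origin and rewritten scores
# suggests the original benchmark leaked into the model's training data.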


@LOAD_DATASET.register_module()
class BenBenchDataset(BaseDataset):

    @staticmethod
    def load(path: str,
             tokenizer_path: str,
             tokenizer_kwargs: Optional[Dict] = None,
             num_gram: int = 5,
             num_replica: int = 5):
        import numpy as np
        from transformers import AutoTokenizer

        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path,
                                                  trust_remote_code=True,
                                                  **(tokenizer_kwargs or {}))

        data = []
        with open(path, encoding='utf-8') as f:
            for index, line in enumerate(f):
                line = json.loads(line)
                if 'rewritten' in path:
                    text = line['rewritten_question'] + ' ' + line['rewritten_answer']
                elif 'origin' in path:
                    text = line['question'] + ' ' + line['answer']
                else:
                    raise ValueError(f'Unknown file type: {path}')
                tokens = tokenizer.encode(text, add_special_tokens=False)
                # Spread num_replica cut points evenly over the token
                # sequence; clamp the range for texts too short to leave
                # room for a full n-gram after every cut.
                if len(tokens) >= num_gram + max(num_replica, 2):
                    starting_points = np.linspace(
                        2, len(tokens) - num_gram, num=num_replica,
                        endpoint=True, dtype=int).tolist()
                else:
                    starting_points = np.linspace(
                        2, max(2, len(tokens)), num=num_replica,
                        endpoint=True, dtype=int).tolist()
                for s in starting_points:
                    data.append({
                        'index': index,
                        'prompt': tokenizer.decode(tokens[:s]),
                        'reference': tokenizer.decode(tokens[s:s + num_gram]),
                    })
        dataset = Dataset.from_list(data)
        return dataset
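
# A minimal usage sketch; the JSONL path and tokenizer name below are
# illustrative assumptions, not files shipped with this module. Each input
# record yields num_replica (prompt, reference) pairs:
#
#   ds = BenBenchDataset.load(
#       path='data/benbench/gsm8k_origin.jsonl',   # hypothetical path
#       tokenizer_path='internlm/internlm2-7b',    # hypothetical tokenizer
#       num_gram=5,
#       num_replica=5)
#   print(ds[0]['prompt'], '->', ds[0]['reference'])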


def exact_match_score(predicted_text, original_text):
    # Strict string equality between prediction and reference.
    return predicted_text == original_text


def edit_similarity_score(predicted_text, original_text):
    # Character-level edit distance, normalized to a similarity in [0, 1].
    # The trailing 1 in max() guards against division by zero when both
    # strings are empty.
    import editdistance

    edit_dist = editdistance.eval(predicted_text, original_text)
    max_length = max(len(predicted_text), len(original_text), 1)
    edit_similarity = 1 - (edit_dist / max_length)
    return edit_similarity
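
# Worked example (hand-checked): for 'kitten' vs 'sitting' the edit distance
# is 3 and the longer string has 7 characters, so the similarity is
# 1 - 3/7 ≈ 0.571, below the 0.75 threshold used by BenbenEvaluator.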


def rouge_l_score(predicted_text, original_text):
    # ROUGE-L F-measure, based on the longest common subsequence of tokens.
    # Note the rouge_score argument order: target first, then prediction.
    from rouge_score import rouge_scorer

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_score = scorer.score(original_text, predicted_text)['rougeL'].fmeasure
    return rouge_score
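
# Worked example (hand-checked): scoring prediction 'the cat sat on' against
# reference 'the cat sat' gives an LCS of 3 tokens, so precision = 3/4,
# recall = 3/3, and F-measure = 2 * 0.75 * 1.0 / 1.75 ≈ 0.857.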


@ICL_EVALUATORS.register_module()
class BenbenEvaluator(BaseEvaluator):

    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {'error': 'predictions and references have different lengths'}

        valid_exact_match, valid_edit_similarity, valid_rouge_score = 0, 0, 0
        total = len(predictions)
        for pred, ref in zip(predictions, references):
            exact_match = exact_match_score(pred, ref)
            edit_similarity = edit_similarity_score(pred, ref)
            rougeL = rouge_l_score(pred, ref)

            # Exact matches are counted directly; the soft metrics count a
            # sample as valid once its score clears the 0.75 threshold.
            valid_exact_match += exact_match
            valid_edit_similarity += edit_similarity > 0.75
            valid_rouge_score += rougeL > 0.75

        return {
            'exact_match': valid_exact_match / total * 100,
            'edit_similarity': valid_edit_similarity / total * 100,
            'rougeL': valid_rouge_score / total * 100,
        }
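
# Illustrative call on toy data (OpenCompass normally invokes `score` as part
# of its evaluation pipeline, so this is only a sanity-check sketch):
#
#   evaluator = BenbenEvaluator()
#   print(evaluator.score(['the cat sat'], ['the cat sat']))
#   # -> {'exact_match': 100.0, 'edit_similarity': 100.0, 'rougeL': 100.0}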