OpenCompass/opencompass/datasets/benbench.py

# flake8: noqa
# yapf: disable
import json
from typing import Dict, Optional

from datasets import Dataset

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class BenBenchDataset(BaseDataset):

    @staticmethod
    def load(path: str, tokenizer_path: str, tokenizer_kwargs: Optional[Dict] = None, num_gram: int = 5, num_replica: int = 5):
        import numpy as np
        from transformers import AutoTokenizer

        # Default to an empty dict here rather than using a mutable default argument.
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True, **(tokenizer_kwargs or {}))
        data = []
        with open(path, encoding='utf-8') as f:
            for index, line in enumerate(f):
                line = json.loads(line)
                # The file name encodes which variant it holds: paraphrased
                # ('rewritten') samples or the original benchmark samples.
                if 'rewritten' in path:
                    text = line['rewritten_question'] + ' ' + line['rewritten_answer']
                elif 'origin' in path:
                    text = line['question'] + ' ' + line['answer']
                else:
                    raise ValueError(f'Unknown file type: {path}')
                tokens = tokenizer.encode(text, add_special_tokens=False)
                # Choose `num_replica` evenly spaced split points; each yields
                # a (prompt, reference) pair where the reference is the next
                # `num_gram` tokens the model is asked to reproduce.
                if len(tokens) >= num_gram + max(num_replica, 2):
                    starting_points = np.linspace(2, len(tokens) - num_gram, num=num_replica, endpoint=True, dtype=int).tolist()
                else:
                    # Sample too short for full n-grams; take what is available.
                    starting_points = np.linspace(2, max(2, len(tokens)), num=num_replica, endpoint=True, dtype=int).tolist()
                for s in starting_points:
                    data.append({
                        'index': index,
                        'prompt': tokenizer.decode(tokens[:s]),
                        'reference': tokenizer.decode(tokens[s:s + num_gram])
                    })
        dataset = Dataset.from_list(data)
        return dataset
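
# Example usage (a sketch; the jsonl path and tokenizer name below are
# illustrative assumptions, not shipped fixtures). Each sample in the
# resulting Dataset carries a truncated `prompt` plus the `num_gram`-token
# `reference` expected to follow it:
#
#     ds = BenBenchDataset.load(
#         path='data/benbench/gsm8k_origin.jsonl',
#         tokenizer_path='meta-llama/Llama-2-7b-hf',
#         num_gram=5,
#         num_replica=5,
#     )
#     print(ds[0]['prompt'], '->', ds[0]['reference'])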


def exact_match_score(predicted_text, original_text):
    # Strict string equality between the predicted and original n-gram.
    return predicted_text == original_text


def edit_similarity_score(predicted_text, original_text):
    # Normalized edit similarity: 1 - Levenshtein distance / longer length.
    import editdistance
    edit_dist = editdistance.eval(predicted_text, original_text)
    max_length = max(len(predicted_text), len(original_text), 1)
    edit_similarity = 1 - (edit_dist / max_length)
    return edit_similarity
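
# Worked example: '7 + 5 = 12' and '7 + 5 = 13' differ in one character out of
# ten, so edit_similarity_score returns 1 - 1/10 = 0.9, which clears the 0.75
# threshold applied by BenbenEvaluator below.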


def rouge_l_score(predicted_text, original_text):
    # ROUGE-L F-measure; rouge_scorer expects (target, prediction) order.
    from rouge_score import rouge_scorer
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_score = scorer.score(original_text, predicted_text)['rougeL'].fmeasure
    return rouge_score


@ICL_EVALUATORS.register_module()
class BenbenEvaluator(BaseEvaluator):

    def score(self, predictions, references):
        if len(predictions) != len(references):
            return {'error': 'pred and refr length mismatch'}
        valid_exact_match, valid_edit_similarity, valid_rouge_score = 0, 0, 0
        total = len(predictions)
        for pred, ref in zip(predictions, references):
            exact_match = exact_match_score(pred, ref)
            edit_similarity = edit_similarity_score(pred, ref)
            rougeL = rouge_l_score(pred, ref)
            # Count a prediction as a (near-)verbatim reproduction when it
            # matches exactly, or when its similarity exceeds 0.75.
            valid_exact_match += exact_match
            valid_edit_similarity += edit_similarity > 0.75
            valid_rouge_score += rougeL > 0.75
        # Report each rate as a percentage over all prompt/reference pairs.
        return {
            'exact_match': valid_exact_match / total * 100,
            'edit_similarity': valid_edit_similarity / total * 100,
            'rougeL': valid_rouge_score / total * 100,
        }
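

# Minimal smoke test (toy strings for illustration only; it assumes the
# OpenCompass BaseEvaluator needs no constructor arguments). With one verbatim
# prediction out of two, every metric should report 50.0:
if __name__ == '__main__':
    evaluator = BenbenEvaluator()
    print(evaluator.score(
        predictions=['the answer is 42', 'a completely different string'],
        references=['the answer is 42', 'the answer is 7'],
    ))
    # -> {'exact_match': 50.0, 'edit_similarity': 50.0, 'rougeL': 50.0}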