import difflib
import re
import string
from collections import Counter
from typing import List

import jieba
from fuzzywuzzy import fuzz
from rouge import Rouge

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def normalize_zh_answer(s):
    """Lower text and remove punctuation and extra whitespace."""

    def white_space_fix(text):
        return ''.join(text.split())

    def remove_punc(text):
        cn_punctuation = '！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.'  # noqa: E501
        all_punctuation = set(string.punctuation + cn_punctuation)
        return ''.join(ch for ch in text if ch not in all_punctuation)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_punc(lower(s)))


@ICL_EVALUATORS.register_module()
class LongBenchF1Evaluator(BaseEvaluator):
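    """Max-over-references token-level F1 evaluator for LongBench QA tasks.

    English text is normalized and split on whitespace; Chinese text is
    segmented with jieba and each token is normalized. Per-sample scores are
    averaged and reported on a 0-100 scale.
    """
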
    def __init__(self, language: str = 'en') -> None:
        super().__init__()
        assert language in ['en', 'zh']
        self.language = language

    def score(self, predictions: List, references: List) -> dict:

        def f1_score(prediction, reference, **kwargs):
            common = Counter(prediction) & Counter(reference)
            num_same = sum(common.values())
            if num_same == 0:
                return 0
            precision = 1.0 * num_same / len(prediction)
            recall = 1.0 * num_same / len(reference)
            f1 = (2 * precision * recall) / (precision + recall)
            return f1

        score = 0.
        for i in range(len(predictions)):
            prediction = predictions[i]
            reference_list = references[i]
            task_score = 0.
            for reference in reference_list:
                if self.language == 'en':
                    normalized_prediction = normalize_answer(prediction)
                    normalized_reference = normalize_answer(reference)

                    prediction_tokens = normalized_prediction.split()
                    reference_tokens = normalized_reference.split()

                else:
                    prediction_tokens = list(
                        jieba.cut(prediction, cut_all=False))
                    reference_tokens = list(
                        jieba.cut(reference, cut_all=False))
                    prediction_tokens = [
                        normalize_zh_answer(token)
                        for token in prediction_tokens
                    ]
                    reference_tokens = [
                        normalize_zh_answer(token)
                        for token in reference_tokens
                    ]
                    prediction_tokens = [
                        token for token in prediction_tokens if len(token) > 0
                    ]
                    reference_tokens = [
                        token for token in reference_tokens if len(token) > 0
                    ]

                task_score = max(task_score,
                                 f1_score(prediction_tokens, reference_tokens))

            score += task_score

        score = score / len(predictions) * 100
        return {'score': score}


@ICL_EVALUATORS.register_module()
class LongBenchCountEvaluator(BaseEvaluator):
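    """Counting-task evaluator for LongBench.

    Extracts every integer from the prediction and scores the fraction that
    equals the reference count. Per-sample scores are averaged and reported
    on a 0-100 scale.
    """
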
    def score(self, predictions: List, references: List) -> dict:
        score = 0.
        for i in range(len(predictions)):
            prediction = predictions[i]
            reference_list = references[i]
            for reference in reference_list:
                numbers = re.findall(r'\d+', prediction)
                right_num = 0
                for number in numbers:
                    if str(number) == str(reference):
                        right_num += 1
                score += 0.0 if len(numbers) == 0 else float(
                    right_num / len(numbers))

        score = score / len(predictions) * 100
        return {'score': score}


@ICL_EVALUATORS.register_module()
class LongBenchRetrievalEvaluator(BaseEvaluator):
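    """Passage-retrieval evaluator for LongBench.

    Parses the gold paragraph id from each reference ('Paragraph N' in
    English, '段落N' in Chinese) and scores the fraction of numbers in the
    prediction that match it. Per-sample scores are averaged and reported on
    a 0-100 scale.
    """
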
    def __init__(self, language: str = 'en') -> None:
        super().__init__()
        assert language in ['en', 'zh']
        self.language = language

    def score(self, predictions: List, references: List) -> dict:
        score = 0.
        for i in range(len(predictions)):
            prediction = predictions[i]
            reference_list = references[i]
            for reference in reference_list:
                if self.language == 'en':
                    pattern = r'Paragraph (\d+)'
                else:
                    pattern = r'段落(\d+)'

                matches = re.findall(pattern, reference)
                reference_id = matches[0]
                numbers = re.findall(r'\d+', prediction)
                right_num = 0
                for number in numbers:
                    if str(number) == str(reference_id):
                        right_num += 1

                score += 0.0 if len(numbers) == 0 else float(
                    right_num / len(numbers))

        score = score / len(predictions) * 100
        return {'score': score}


@ICL_EVALUATORS.register_module()
class LongBenchRougeEvaluator(BaseEvaluator):
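    """Max-over-references ROUGE-L evaluator for LongBench.

    Chinese predictions and references are segmented with jieba before
    scoring. The ROUGE-L F-measure of the best-matching reference is kept
    per sample, averaged, and reported on a 0-100 scale.
    """
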
    def __init__(self, language: str = 'en') -> None:
        super().__init__()
        assert language in ['en', 'zh']
        self.language = language

    def score(self, predictions: List, references: List) -> dict:
        score = 0.
        for i in range(len(predictions)):
            prediction = predictions[i]
            reference_list = references[i]
            task_score = 0.
            for reference in reference_list:
                if self.language == 'zh':
                    prediction = ' '.join(
                        list(jieba.cut(prediction, cut_all=False)))
                    reference = ' '.join(
                        list(jieba.cut(reference, cut_all=False)))

                rouge = Rouge()
                if prediction != '':
                    cur_score = rouge.get_scores([prediction], [reference],
                                                 avg=True)['rouge-l']['f']
                else:
                    cur_score = 0.
                task_score = max(task_score, cur_score)

            score += task_score

        score = score / len(predictions) * 100
        return {'score': score}


@ICL_EVALUATORS.register_module()
class LongBenchCodeSimEvaluator(BaseEvaluator):
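    """Fuzzy-similarity evaluator for LongBench code completion.

    Keeps the first line of the prediction that is neither a code fence nor
    a comment and compares it to each reference with fuzzywuzzy's ratio,
    taking the best match. Per-sample scores are averaged and reported on a
    0-100 scale.
    """
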
    def score(self, predictions: List, references: List) -> dict:
        score = 0.
        for i in range(len(predictions)):
            prediction = predictions[i]
            reference_list = references[i]
            task_score = 0.
            for reference in reference_list:
                all_lines = prediction.lstrip('\n').split('\n')
                prediction = ''
                for line in all_lines:
                    # Keep the first line that is neither a code fence nor a
                    # comment.
                    if ('`' not in line) and ('#' not in line) and (
                            '//' not in line):
                        prediction = line
                        break
                task_score = max(task_score,
                                 fuzz.ratio(prediction, reference) / 100)

            score += task_score

        score = score / len(predictions) * 100
        return {'score': score}


@ICL_EVALUATORS.register_module()
class LongBenchClassificationEvaluator(BaseEvaluator):
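    """Classification evaluator for LongBench.

    Credits class names that appear verbatim in the prediction, splitting
    the score across ties; when no class name matches, falls back to the
    class most similar to the prediction under difflib.SequenceMatcher.
    Per-sample scores are averaged and reported on a 0-100 scale.
    """
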
    def score(self, predictions: List, references: List) -> dict:
        score = 0.
        for i in range(len(predictions)):
            prediction = predictions[i]
            reference_list = references[i]['answers']
            for reference in reference_list:
                em_match_list = []
                all_classes = references[i]['all_classes']
                for class_name in all_classes:
                    if class_name in prediction:
                        em_match_list.append(class_name)
                for match_term in em_match_list:
                    if match_term in reference and match_term != reference:
                        em_match_list.remove(match_term)
                if len(em_match_list) != 0:
                    if reference in em_match_list:
                        score += (1.0 / len(em_match_list))
                else:
                    # No exact class-name match: fall back to the class name
                    # most similar to the prediction.
                    best_match = None
                    highest_similarity = 0
                    for names in all_classes:
                        similarity = difflib.SequenceMatcher(
                            None, names, prediction).ratio()
                        if similarity > highest_similarity:
                            highest_similarity = similarity
                            best_match = names
                    score += float(best_match == reference)

        score = score / len(predictions) * 100
        return {'score': score}