OpenCompass/opencompass/datasets/calm/evaluation/accuracy/open-ended.py
Peng Bo edd0ffdf70
Calm dataset (#1287)
* add calm dataset

* modify config max_out_len

* update README

* Modify README

* update README

* update README

* update README

* update README

* update README

* add summarizer and modify readme

* delete summarizer config comment

* update summarizer

* modify same response to all questions

* update README
2024-07-26 11:48:16 +08:00

32 lines
838 B
Python

# -*- coding: utf-8 -*-
import jieba
from nltk import bleu
from nltk.translate.bleu_score import SmoothingFunction
from rouge import Rouge
def is_chinese(text: str) -> bool:
    """Return True if *text* contains at least one Chinese character.

    A character counts as Chinese when it lies in the inclusive range
    U+4E00..U+9FFF. An empty string yields False.
    """
    return any('\u4e00' <= char <= '\u9fff' for char in text)
def compute_acc(gt_list, pred_list) -> float:
    """Return the mean ROUGE-L recall of predictions against references.

    Predictions and references are paired positionally with ``zip``. When a
    prediction contains Chinese characters, both it and its reference are
    segmented with ``jieba`` and re-joined with spaces before scoring.

    Args:
        gt_list: Reference (ground-truth) strings.
        pred_list: Predicted strings, aligned with ``gt_list``.

    Returns:
        The sum of per-pair ROUGE-L recall values divided by
        ``len(gt_list)``. Pairs whose scoring raises an exception, and
        references left without a paired prediction, contribute 0 to the
        sum. Returns 0.0 when ``gt_list`` is empty.
    """
    # Nothing to score; also avoids dividing by zero below.
    if len(gt_list) == 0:
        return 0.0

    rouge = Rouge()
    rouge_l = 0.0
    for pred, gold in zip(pred_list, gt_list):
        if is_chinese(pred):
            # Presumably needed because Rouge compares space-separated
            # tokens and Chinese text has no spaces -- verify against the
            # rouge package docs.
            pred = " ".join(jieba.cut(pred))
            gold = " ".join(jieba.cut(gold))
        try:
            scores = rouge.get_scores(pred, gold)
            rouge_l += scores[0]['rouge-l']['r']
        except Exception:
            # Best effort: a pair that cannot be scored counts as 0.
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            continue
    return rouge_l / len(gt_list)