mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
34 lines
981 B
Python
34 lines
981 B
Python
# flake8: noqa
|
|
"""KOR-Bench Evaluator."""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
|
|
from .icl_base_evaluator import BaseEvaluator
|
|
|
|
|
|
class JudgeEvaluator(BaseEvaluator):
    """Score parsed "Choice" labels in predictions against reference winners.

    Each prediction is a string that is searched for the marker
    ``"Choice": "Model `` and the single character following the last
    occurrence of that marker is taken as the predicted choice. Each
    reference is a mapping whose ``'winner'`` entry (default ``''``) is the
    expected choice.
    """

    def score(self, predictions, references):
        """Compute the accuracy of the predicted choices.

        Args:
            predictions: Sequence of prediction strings.
            references: Sequence of mappings; the ``'winner'`` value of each
                one is compared with the choice parsed from the prediction
                at the same position.

        Returns:
            ``{'error': ...}`` when the two sequences differ in length;
            otherwise a dict with ``'accuracy'`` (percentage in 0-100) and
            ``'details'`` (one dict per sample holding ``'pred'``,
            ``'answer'`` and ``'correct'``). Empty input yields an accuracy
            of ``0.0`` and an empty ``'details'`` list.
        """
        if len(predictions) != len(references):
            return {'error': 'preds and refrs have different length'}

        marker = '"Choice": "Model '
        correct = 0
        details = []
        for prediction, reference in zip(predictions, references):
            # Text after the last marker; when the marker is absent, split()
            # returns the whole prediction, so its first character is used.
            tail = prediction.split(marker)[-1]
            # Slice rather than index: an empty prediction, or one that ends
            # right after the marker, gives '' instead of an IndexError.
            choice = tail[:1]
            gold_winner = reference.get('winner', '')
            # An empty choice means nothing could be parsed; never count it
            # as correct, even if the reference has no 'winner' either.
            is_correct = bool(choice) and choice == gold_winner
            if is_correct:
                correct += 1
            details.append({
                'pred': prediction,
                'answer': gold_winner,
                'correct': is_correct,
            })

        count = len(details)
        # Guard against division by zero when there is nothing to score.
        accuracy = 100 * correct / count if count else 0.0
        return {'accuracy': accuracy, 'details': details}
|