mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
34 lines
981 B
Python
34 lines
981 B
Python
![]() |
# flake8: noqa
"""Judge Evaluator: accuracy of a judge model's chosen winner vs. the reference."""
|
||
|
|
||
|
import json
|
||
|
import os
|
||
|
import re
|
||
|
|
||
|
from .icl_base_evaluator import BaseEvaluator
|
||
|
|
||
|
|
||
|
class JudgeEvaluator(BaseEvaluator):
    """Measure how often a judge model's choice matches the reference winner."""

    def score(self, predictions, references):
        """Compute the accuracy of judge choices against reference winners.

        The choice is the single character that follows the last occurrence
        of ``"Choice": "Model `` in each prediction. When that marker is
        absent, the first character of the prediction is used instead.

        Args:
            predictions: Sequence of judge output strings.
            references: Sequence of mappings; the ``'winner'`` key (default
                ``''``) holds the expected choice character.

        Returns:
            ``{'error': ...}`` when the two sequences differ in length,
            otherwise ``{'accuracy': <percentage>, 'details': [...]}`` where
            each detail holds ``'pred'``, ``'answer'`` and ``'correct'``.
        """
        if len(predictions) != len(references):
            return {'error': 'preds and refrs have different length'}

        marker = '"Choice": "Model '
        correct = 0
        details = []
        for prediction, reference in zip(predictions, references):
            # Slice instead of indexing: an empty tail (empty prediction, or
            # a prediction that ends right after the marker) yields '' rather
            # than raising IndexError.
            choice = prediction.split(marker)[-1][:1]
            gold_winner = reference.get('winner', '')
            # An empty choice must never match the default '' winner.
            is_correct = bool(choice) and choice == gold_winner
            if is_correct:
                correct += 1
            details.append({
                'pred': prediction,
                'answer': gold_winner,
                'correct': is_correct,
            })

        count = len(details)
        # Guard against ZeroDivisionError when there is nothing to score.
        accuracy = 100 * correct / count if count else 0.0
        return {'accuracy': accuracy, 'details': details}
|