OpenCompass/opencompass/datasets/subjective/commonbench.py
zhulinJulia24 ae8354cb8a update
2025-04-08 20:22:11 +08:00

57 lines
1.7 KiB
Python

# flake8: noqa: E501
import re
from collections import defaultdict
from typing import Optional
from .utils import get_judgeanswer_and_reference
def post_process_mtbench(judgement: dict):
    """Extract the numeric rating from a judge model's output.

    Args:
        judgement: Mapping whose ``'prediction'`` entry holds the judge's
            text, e.g. ``'... Rating: [[5]] ...'``.

    Returns:
        ``{'score': <float>}`` built from the first ``Rating: [[x]]``
        occurrence, or ``None`` when no rating is present or the captured
        text is not a valid number (e.g. ``Rating: [[1.2.3]]``).
    """
    prediction = judgement['prediction']
    match = re.search(r'Rating:\s*\[\[([\d.]+)\]\]', prediction)
    if match is None:
        return None
    try:
        score = float(match.group(1))
    except ValueError:
        # The character class also admits strings such as '.' or '1.2.3',
        # which are not parseable; treat them like a missing rating.
        return None
    return {'score': score}
def get_capability_results(judged_answers, references):
    """Average the judge scores overall and per capability.

    Args:
        judged_answers: Iterable of mappings, each with a numeric
            ``'score'`` entry.
        references: Iterable of mappings, each with a ``'capability'``
            entry; paired positionally with ``judged_answers``.

    Returns:
        A ``defaultdict(float)`` mapping ``'total'`` and every capability
        seen to its mean score rounded to two decimals. ``'total'`` is the
        first key because it is accumulated first. The mapping is empty
        when there are no answer/reference pairs.
    """
    score_sums = defaultdict(int)
    score_counts = defaultdict(int)
    for ans, ref in zip(judged_answers, references):
        score_sums['total'] += ans['score']
        score_counts['total'] += 1
        score_sums[ref['capability']] += ans['score']
        score_counts[ref['capability']] += 1

    capability_avg_ratings = defaultdict(float)
    for capability, total_score in score_sums.items():
        # Every key in score_sums has a matching non-zero count.
        capability_avg_ratings[capability] = round(
            total_score / score_counts[capability], 2)
    return capability_avg_ratings
@DICT_POSTPROCESSORS.register_module('commenbench')
def commonbench_postprocess(
    output: dict,
    output_path: str,
    post_process: Optional[callable] = post_process_mtbench,
) -> dict:
    """Summarise judge output into per-capability average scores.

    Args:
        output: Raw judge output; forwarded to
            ``get_judgeanswer_and_reference`` and attached unchanged to the
            result under ``'details'``.
        output_path: Forwarded to ``get_judgeanswer_and_reference``.
        post_process: Callable forwarded to
            ``get_judgeanswer_and_reference``; defaults to
            ``post_process_mtbench``.

    Returns:
        The mapping produced by ``get_capability_results`` with an extra
        ``'details'`` entry holding ``output``.
    """
    # NOTE(review): DICT_POSTPROCESSORS is not imported in the visible part
    # of this file -- confirm the registry import exists.
    answers, refs = get_judgeanswer_and_reference(
        output,
        output_path,
        post_process,
    )
    summary = get_capability_results(answers, refs)
    summary['details'] = output
    return summary