OpenCompass/opencompass/datasets/subjective/utils.py
Alexander Lam 1bd594fc62
[Feature] Added CompassArena-SubjectiveBench with Bradley-Terry Model (#1751)
* fix lint issues

* updated gitignore

* changed infer_order from random to double for the pairwise_judge.py (not changing for pairwise_bt_judge.py)

* added return statement to CompassArenaBradleyTerrySummarizer to return overall score for each judger model
2024-12-16 13:41:28 +08:00

34 lines
1.1 KiB
Python

# flake8: noqa: E501
def get_judgeanswer_and_reference(result, filename, post_process):
    """Extract judgements (scores) and references from raw judge results.

    Args:
        result (dict): Mapping from example id to a result record; each
            record must contain at least a ``'gold'`` entry.
        filename (str): Model path in results dir (used only in warning
            messages).
        post_process (function): The pre-defined extract function; applied
            to each record, it returns the parsed judgement or ``None``
            when extraction failed.

    Returns:
        tuple: ``(judged_answers, references)`` — two aligned lists holding
        the successfully extracted judgements and their gold references.
    """
    if not result:
        print('*' * 100)
        print('There are no results for ' + filename)
        print('*' * 100)
        # Nothing to extract; return early so the low-extraction warning
        # below is not printed a second time for an empty result set.
        return [], []
    judged_answers = []
    references = []
    for v in result.values():
        processed_judge = post_process(v)
        if processed_judge is not None:
            judged_answers.append(processed_judge)
            references.append(v['gold'])
    # Warn when more than 5% of the judgements could not be parsed.
    if len(judged_answers) <= 0.95 * len(result):
        print('*' * 100)
        print(
            f'For your {filename} judge. Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements, please check!'
        )
        print('*' * 100)
    return judged_answers, references