OpenCompass/opencompass/datasets/generic.py

"""Generic post-processing utilities for LLM-as-judge evaluation results."""
import re


def get_final_results(judged_answers,
                      references,
                      original_responses,
                      metric_name='accuracy'):
    """Aggregate A/B judge verdicts into accuracy and F1 scores.

    Each judge response is scanned for the letter 'A' (correct) or 'B'
    (incorrect); responses without either letter default to 'B'. Assumes
    the three input lists are non-empty and aligned element-wise.
    """
    count = 0
    is_correct_count = 0
    is_incorrect_count = 0
    is_not_attempted_count = 0
    details = []
    for i, j, k in zip(judged_answers, references, original_responses):
        match = re.search(r'(A|B)', i)
        grade_letter = match.group(
            0) if match else 'B'  # Default to "INCORRECT" if no match
        detail = {
            'pred': k,
            'ref': j,
            'origin_grade_response': i,
            'grade_letter': grade_letter,
            'correct': False
        }
        count += 1
        if grade_letter == 'A':
            is_correct_count += 1
            detail['correct'] = True
        elif grade_letter == 'B':
            is_incorrect_count += 1
        else:
            is_not_attempted_count += 1
        details.append(detail)

    is_correct = is_correct_count / count
    is_incorrect = is_incorrect_count / count
    # is_not_attempted = is_not_attempted_count / count
    is_given_attempted = is_correct + is_incorrect
    accuracy_given_attempted = is_correct / is_given_attempted \
        if is_given_attempted > 0 else 0
    f1 = 2 * accuracy_given_attempted * is_correct / (
        accuracy_given_attempted + is_correct) if (accuracy_given_attempted +
                                                   is_correct) > 0 else 0
    result = {
        # 'accuracy_given_attempted': accuracy_given_attempted,
        metric_name: accuracy_given_attempted * 100,
        'f1': f1,
        'details': details
    }
    return result


def _generic_llmjudge_postprocess(judgement: str):
    """Extract the grade letter ('A' or 'B') from one judge response."""
    match = re.search(r'(A|B)', judgement)
    grade_letter = match.group(
        0) if match else 'B'  # Default to "INCORRECT" if no match
    return grade_letter


def generic_llmjudge_postprocess(
    output: dict,
    output_path: str,
) -> dict:
    """Collect judge verdicts from ``output`` and compute final scores."""
    judged_answers = []
    original_responses = []
    references = []
    for k, v in output.items():
        original_responses.append(v['prediction'])
        processed_judge = _generic_llmjudge_postprocess(v['prediction'])
        if processed_judge is not None:
            judged_answers.append(processed_judge)
            references.append(v['gold'])
    results = get_final_results(judged_answers, references,
                                original_responses)
    results['details'] = output
    return results


def generic_llmjudge_academic_postprocess(
    output: dict,
    output_path: str,
    metric_name: str = 'accuracy',
) -> dict:
    """Like ``generic_llmjudge_postprocess`` but reports a single metric
    under ``metric_name`` for the academic summarizer and drops F1.
    """
    judged_answers = []
    original_responses = []
    references = []
    for k, v in output.items():
        original_responses.append(v['prediction'])
        processed_judge = _generic_llmjudge_postprocess(v['prediction'])
        if processed_judge is not None:
            judged_answers.append(processed_judge)
            references.append(v['gold'])
    results = get_final_results(judged_answers, references,
                                original_responses, metric_name)
    results['details'] = output
    # For academic summarizer
    results.pop('f1', None)
    return results
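

# A minimal usage sketch, not part of the upstream module: the dict below is
# hypothetical but mirrors the {'prediction': ..., 'gold': ...} structure the
# postprocessors above expect, where 'prediction' is the judge model's graded
# response and 'gold' is the reference answer.
if __name__ == '__main__':
    _example_output = {
        '0': {
            'prediction': 'A. The response matches the reference.',
            'gold': 'Paris'
        },
        '1': {
            'prediction': 'B. The response does not match the reference.',
            'gold': '42'
        },
    }
    print(generic_llmjudge_postprocess(_example_output, output_path=''))
    print(
        generic_llmjudge_academic_postprocess(_example_output,
                                              output_path='',
                                              metric_name='accuracy'))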