mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00

* BigCodeBench update * update LCBench * update LCBench 2 * update code * academicBench update * academic bench ifeval&math update * generic_llmjudge_aime_academic_postprocess delete * aime delete * postprocessors update * ifeval delete * update work_dir * linting * linting double-quote-string-fixer * r1-distill out_len update * fix lint --------- Co-authored-by: MaiziXiao <xxllcc1993@gmail.com>
97 lines
3.1 KiB
Python
97 lines
3.1 KiB
Python
import re
|
|
|
|
|
|
def get_final_results(judged_answers,
                      references,
                      origial_responses,
                      metric_name='accuracy'):
    """Aggregate per-sample judge grades into summary metrics.

    Each judged answer is searched for the first ``A`` or ``B`` character;
    ``A`` counts as correct and ``B`` as incorrect. When neither letter is
    present the sample is graded ``B`` (incorrect).

    Args:
        judged_answers: Iterable of judge output strings, one per sample.
        references: Iterable of reference answers, stored in the details.
        origial_responses: Iterable of responses, stored as ``pred`` in the
            details.
        metric_name: Key under which the percentage accuracy is reported.

    Returns:
        dict: ``metric_name`` (accuracy over graded samples, scaled to
        0-100), ``'f1'`` (computed on the 0-1 scale) and ``'details'``
        (one dict per sample). For empty input both metrics are ``0`` and
        ``'details'`` is an empty list.
    """
    is_correct_count = 0
    is_incorrect_count = 0
    details = []
    for judged, reference, response in zip(judged_answers, references,
                                           origial_responses):
        # NOTE(review): the pattern matches the first capital 'A' or 'B'
        # anywhere in the text, including inside a word.
        match = re.search(r'(A|B)', judged)
        # Default to "INCORRECT" if no match
        grade_letter = match.group(0) if match else 'B'
        correct = grade_letter == 'A'
        if correct:
            is_correct_count += 1
        else:
            is_incorrect_count += 1
        details.append({
            'pred': response,
            'ref': reference,
            'origin_grade_response': judged,
            'grade_letter': grade_letter,
            'correct': correct,
        })

    count = len(details)
    if count == 0:
        # Nothing to grade: report zeros instead of dividing by zero.
        return {metric_name: 0, 'f1': 0, 'details': details}

    is_correct = is_correct_count / count
    is_incorrect = is_incorrect_count / count
    is_given_attempted = is_correct + is_incorrect
    if is_given_attempted > 0:
        accuracy_given_attempted = is_correct / is_given_attempted
    else:
        accuracy_given_attempted = 0

    denominator = accuracy_given_attempted + is_correct
    if denominator > 0:
        f1 = 2 * accuracy_given_attempted * is_correct / denominator
    else:
        f1 = 0

    return {
        metric_name: accuracy_given_attempted * 100,
        'f1': f1,
        'details': details,
    }
|
|
|
|
|
|
def _generic_llmjudge_postprocess(judgement: str):
|
|
match = re.search(r'(A|B)', judgement)
|
|
grade_letter = match.group(
|
|
0) if match else 'B' # Default to "INCORRECT" if no match
|
|
return grade_letter
|
|
|
|
|
|
def generic_llmjudge_postprocess(
    output: dict,
    output_path: str,
) -> dict:
    """Score judge outputs and return the aggregated results.

    Args:
        output: Mapping whose values are dicts with a ``'prediction'`` entry
            (passed to the grade-letter extractor) and a ``'gold'`` entry.
        output_path: Not used by this function.

    Returns:
        dict: The result of ``get_final_results`` with its ``'details'``
        entry replaced by ``output`` itself.
    """
    grades, golds, raw_predictions = [], [], []
    for entry in output.values():
        prediction = entry['prediction']
        raw_predictions.append(prediction)
        grade = _generic_llmjudge_postprocess(prediction)
        if grade is None:
            continue
        grades.append(grade)
        golds.append(entry['gold'])

    summary = get_final_results(grades, golds, raw_predictions)
    # Report the raw judge records rather than the per-sample grade dicts.
    summary['details'] = output
    return summary
|
|
|
|
|
|
def generic_llmjudge_academic_postprocess(
    output: dict,
    output_path: str,
    metric_name: str = 'accuracy',
) -> dict:
    """Score judge outputs for the academic summarizer.

    Args:
        output: Mapping whose values are dicts with a ``'prediction'`` entry
            (passed to the grade-letter extractor) and a ``'gold'`` entry.
        output_path: Not used by this function.
        metric_name: Key under which the accuracy is reported.

    Returns:
        dict: The result of ``get_final_results`` with ``'details'`` replaced
        by ``output`` itself and the ``'f1'`` entry removed.
    """
    grades, golds, raw_predictions = [], [], []
    for entry in output.values():
        prediction = entry['prediction']
        raw_predictions.append(prediction)
        grade = _generic_llmjudge_postprocess(prediction)
        if grade is None:
            continue
        grades.append(grade)
        golds.append(entry['gold'])

    summary = get_final_results(grades, golds, raw_predictions, metric_name)
    summary['details'] = output
    # For academic summarizer
    summary.pop('f1', None)
    return summary
|