From 6a6a1a5c0b24bb22f3b3fee4f395b03c21edb348 Mon Sep 17 00:00:00 2001
From: Linchen Xiao
Date: Fri, 11 Apr 2025 19:01:39 +0800
Subject: [PATCH] [Feature] LLM Judge sanity check (#2012)

* update

* update
---
 opencompass/datasets/generic.py | 37 ++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/opencompass/datasets/generic.py b/opencompass/datasets/generic.py
index dc783167..07b6a0bb 100644
--- a/opencompass/datasets/generic.py
+++ b/opencompass/datasets/generic.py
@@ -11,17 +11,18 @@ def get_final_results(judged_answers,
     is_correct_count = 0
     is_incorrect_count = 0
     is_not_attempted_count = 0
+    attempted_judge_count = 0
     details = []
     for i, j, k in zip(judged_answers, references, origial_responses):
-        match = re.search(r'(A|B)', i)
-        grade_letter = match.group(
-            0) if match else 'B'  # Default to "INCORRECT" if no match
+        if i in ['A', 'B']:
+            attempted_judge_count += 1
+        grade_letter = i
         detail = {
             'pred': k,
             'ref': j,
             'origin_grade_response': i,
             'grade_letter': grade_letter,
-            'correct': False
+            'correct': False,
         }
         count += 1
         if grade_letter == 'A':
@@ -35,26 +36,32 @@ def get_final_results(judged_answers,
     is_correct = is_correct_count / count
     is_incorrect = is_incorrect_count / count
-    # is_not_attempted = is_not_attempted_count / count
     is_given_attempted = is_correct + is_incorrect
-    accuracy_given_attempted = is_correct / is_given_attempted \
-        if is_given_attempted > 0 else 0
-    f1 = 2 * accuracy_given_attempted * is_correct / (
-        accuracy_given_attempted + is_correct) if (accuracy_given_attempted +
-                                                   is_correct) > 0 else 0
+    loose_accuracy = is_correct / count
+    accuracy_given_attempted = (is_correct / is_given_attempted
+                                if is_given_attempted > 0 else 0)
+    attempted_judge_ratio = attempted_judge_count / count
+
+    f1 = (2 * accuracy_given_attempted * is_correct /
+          (accuracy_given_attempted + is_correct) if
+          (accuracy_given_attempted + is_correct) > 0 else 0)
     result = {
-        # 'accuracy_given_attempted': accuracy_given_attempted,
-        metric_name: accuracy_given_attempted * 100,
+        metric_name: loose_accuracy * 100,
+        f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
         'f1': f1,
-        'details': details
+        'attempted_ratio': attempted_judge_ratio * 100,
+        'correct_count': is_correct_count,
+        'incorrect_count': is_incorrect_count,
+        'not_attempted_count': is_not_attempted_count,
+        'details': details,
     }
     return result
 
 
 def _generic_llmjudge_postprocess(judgement: str):
     match = re.search(r'(A|B)', judgement)
-    grade_letter = match.group(
-        0) if match else 'B'  # Default to "INCORRECT" if no match
+    grade_letter = (match.group(0) if match else 'unknown'
+                    )  # Return 'unknown' if no match
     return grade_letter
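
The sketch below (not part of the patch) illustrates the sanity-check idea this change introduces: judge replies that contain no parsable A/B verdict are mapped to 'unknown' instead of silently defaulting to 'B' (incorrect), and the summary reports how many judgements were actually attempted. It computes the reported metrics in a straightforward per-count form rather than copying get_final_results() line for line; the helper names (parse_grade, summarize) and the sample judge replies are hypothetical.

import re


def parse_grade(judgement: str) -> str:
    """Extract the judge's verdict letter; 'unknown' if none is found."""
    match = re.search(r'(A|B)', judgement)
    return match.group(0) if match else 'unknown'


def summarize(grades):
    """Compute sanity-check metrics from a list of grade letters."""
    count = len(grades)
    correct = sum(g == 'A' for g in grades)
    incorrect = sum(g == 'B' for g in grades)
    not_attempted = count - correct - incorrect
    attempted = correct + incorrect

    accuracy = correct / count  # over all judgements, unparsable ones included
    accuracy_given_attempted = correct / attempted if attempted else 0
    f1 = (2 * accuracy_given_attempted * accuracy /
          (accuracy_given_attempted + accuracy)
          if (accuracy_given_attempted + accuracy) > 0 else 0)
    return {
        'accuracy': accuracy * 100,
        'accuracy_given_attempted': accuracy_given_attempted * 100,
        'f1': f1,
        'attempted_ratio': attempted / count * 100,
        'correct_count': correct,
        'incorrect_count': incorrect,
        'not_attempted_count': not_attempted,
    }


if __name__ == '__main__':
    # Hypothetical judge replies: two clear verdicts and one unparsable reply.
    replies = ['A', 'B: the answer misses the key fact', 'no verdict given']
    grades = [parse_grade(r) for r in replies]  # ['A', 'B', 'unknown']
    print(summarize(grades))
    # attempted_ratio comes out at ~66.7, flagging that one judgement
    # could not be parsed instead of counting it as incorrect.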