[Feature] LLM Judge sanity check (#2012)

* update
* update
2025-05-30 16:03:24 +08:00 · 2025-04-11 19:01:39 +08:00 · 2025-04-11 19:01:39 +08:00 · 6a6a1a5c0b
commit 6a6a1a5c0b
parent 3f50b1dc49
1 changed files with 22 additions and 15 deletions
--- a/opencompass/datasets/generic.py
+++ b/opencompass/datasets/generic.py
@@ -11,17 +11,18 @@ def get_final_results(judged_answers,
    is_correct_count = 0
    is_incorrect_count = 0
    is_not_attempted_count = 0
+    attempted_judge_count = 0
    details = []
    for i, j, k in zip(judged_answers, references, origial_responses):
-        match = re.search(r'(A|B)', i)
-        grade_letter = match.group(
-            0) if match else 'B'  # Default to "INCORRECT" if no match
+        if i in ['A', 'B']:
+            attempted_judge_count += 1
+        grade_letter = i
        detail = {
            'pred': k,
            'ref': j,
            'origin_grade_response': i,
            'grade_letter': grade_letter,
-            'correct': False
+            'correct': False,
        }
        count += 1
        if grade_letter == 'A':
@@ -35,26 +36,32 @@ def get_final_results(judged_answers,

    is_correct = is_correct_count / count
    is_incorrect = is_incorrect_count / count
-    # is_not_attempted = is_not_attempted_count / count
    is_given_attempted = is_correct + is_incorrect
-    accuracy_given_attempted = is_correct / is_given_attempted \
-        if is_given_attempted > 0 else 0
-    f1 = 2 * accuracy_given_attempted * is_correct / (
-        accuracy_given_attempted + is_correct) if (accuracy_given_attempted +
-                                                   is_correct) > 0 else 0
+    loose_accuracy = is_correct / count
+    accuracy_given_attempted = (is_correct / is_given_attempted
+                                if is_given_attempted > 0 else 0)
+    attempted_judge_ratio = attempted_judge_count / count
+
+    f1 = (2 * accuracy_given_attempted * is_correct /
+          (accuracy_given_attempted + is_correct) if
+          (accuracy_given_attempted + is_correct) > 0 else 0)
    result = {
-        # 'accuracy_given_attempted': accuracy_given_attempted,
-        metric_name: accuracy_given_attempted * 100,
+        metric_name: loose_accuracy * 100,
+        f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
        'f1': f1,
-        'details': details
+        'attempted_ratio': attempted_judge_ratio * 100,
+        'correct_count': is_correct_count,
+        'incorrect_count': is_incorrect_count,
+        'not_attempted_count': is_not_attempted_count,
+        'details': details,
    }
    return result


 def _generic_llmjudge_postprocess(judgement: str):
    match = re.search(r'(A|B)', judgement)
-    grade_letter = match.group(
-        0) if match else 'B'  # Default to "INCORRECT" if no match
+    grade_letter = (match.group(0) if match else 'unknown'
+                    )  # Return 'unknown' if no match
    return grade_letter