From 20660ab5071ab66215345d5fe8c4344df8bfcca6 Mon Sep 17 00:00:00 2001
From: Junnan Liu
Date: Thu, 10 Apr 2025 19:47:21 +0800
Subject: [PATCH 1/3] [Fix] Fix compare error when k is a list in base_evaluator (#2010)

* fix G-Pass compare error when k is a list

* fix the same compare error at line 177
---
 opencompass/openicl/icl_evaluator/icl_base_evaluator.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
index f7ff0277..10cc3fe4 100644
--- a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
@@ -159,9 +159,10 @@ class BaseEvaluator:
                     can_calculate = True
                     c += int(example['detail']['is_correct'])
 
-            if can_calculate and n > 1 and k > 1:
+            k_list = [k] if isinstance(k, int) else k
+            if can_calculate and n > 1 and max(k_list) > 1:
                 thresholds = [0.0, 0.25, 0.5, 0.75, 1.0]
-                for _k in [k] if isinstance(k, int) else k:
+                for _k in k_list:
                     for threshold in thresholds:
                         g_pass = compute_g_pass_at_k(n=n,
                                                      c=c,
@@ -174,7 +175,7 @@ class BaseEvaluator:
 
             eval_details.append(detail)
 
-        if can_calculate and n > 1 and k > 1:
+        if can_calculate and n > 1 and max(k_list) > 1:
             eval_results.update(self.reduce(eval_details))
 
         # Store eval_details in eval_results

From 3f50b1dc49778431fd6fefdd9a2e845e1e79f12e Mon Sep 17 00:00:00 2001
From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com>
Date: Fri, 11 Apr 2025 16:59:40 +0800
Subject: [PATCH 2/3] [Fix] Fix order bug in arena_hard.py (#2015)

---
 opencompass/datasets/subjective/arena_hard.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opencompass/datasets/subjective/arena_hard.py b/opencompass/datasets/subjective/arena_hard.py
index b146f3ac..1403c978 100644
--- a/opencompass/datasets/subjective/arena_hard.py
+++ b/opencompass/datasets/subjective/arena_hard.py
@@ -146,7 +146,7 @@ def preety_print_two_ratings(ratings_1, ratings_2, column_names):
 
 
 def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
-    names = sorted(list(elo_ratings.keys()))
+    names = list(elo_ratings.keys())
     wins = defaultdict(lambda: defaultdict(lambda: 0))
     for a in names:
         for b in names:

From 6a6a1a5c0b24bb22f3b3fee4f395b03c21edb348 Mon Sep 17 00:00:00 2001
From: Linchen Xiao
Date: Fri, 11 Apr 2025 19:01:39 +0800
Subject: [PATCH 3/3] [Feature] LLM Judge sanity check (#2012)

* update

* update
---
 opencompass/datasets/generic.py | 37 ++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/opencompass/datasets/generic.py b/opencompass/datasets/generic.py
index dc783167..07b6a0bb 100644
--- a/opencompass/datasets/generic.py
+++ b/opencompass/datasets/generic.py
@@ -11,17 +11,18 @@ def get_final_results(judged_answers,
     is_correct_count = 0
     is_incorrect_count = 0
     is_not_attempted_count = 0
+    attempted_judge_count = 0
     details = []
     for i, j, k in zip(judged_answers, references, origial_responses):
-        match = re.search(r'(A|B)', i)
-        grade_letter = match.group(
-            0) if match else 'B'  # Default to "INCORRECT" if no match
+        if i in ['A', 'B']:
+            attempted_judge_count += 1
+        grade_letter = i
         detail = {
             'pred': k,
             'ref': j,
             'origin_grade_response': i,
             'grade_letter': grade_letter,
-            'correct': False
+            'correct': False,
         }
         count += 1
         if grade_letter == 'A':
@@ -35,26 +36,32 @@ def get_final_results(judged_answers,
 
     is_correct = is_correct_count / count
     is_incorrect = is_incorrect_count / count
-    # is_not_attempted = is_not_attempted_count / count
     is_given_attempted = is_correct + is_incorrect
-    accuracy_given_attempted = is_correct / is_given_attempted \
-        if is_given_attempted > 0 else 0
-    f1 = 2 * accuracy_given_attempted * is_correct / (
-        accuracy_given_attempted + is_correct) if (accuracy_given_attempted +
-                                                   is_correct) > 0 else 0
+    loose_accuracy = is_correct / count
+    accuracy_given_attempted = (is_correct / is_given_attempted
+                                if is_given_attempted > 0 else 0)
+    attempted_judge_ratio = attempted_judge_count / count
+
+    f1 = (2 * accuracy_given_attempted * is_correct /
+          (accuracy_given_attempted + is_correct) if
+          (accuracy_given_attempted + is_correct) > 0 else 0)
     result = {
-        # 'accuracy_given_attempted': accuracy_given_attempted,
-        metric_name: accuracy_given_attempted * 100,
+        metric_name: loose_accuracy * 100,
+        f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
         'f1': f1,
-        'details': details
+        'attempted_ratio': attempted_judge_ratio * 100,
+        'correct_count': is_correct_count,
+        'incorrect_count': is_incorrect_count,
+        'not_attempted_count': is_not_attempted_count,
+        'details': details,
     }
     return result
 
 
 def _generic_llmjudge_postprocess(judgement: str):
     match = re.search(r'(A|B)', judgement)
-    grade_letter = match.group(
-        0) if match else 'B'  # Default to "INCORRECT" if no match
+    grade_letter = (match.group(0) if match else 'unknown'
+                    )  # Return 'unknown' if no match
     return grade_letter
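
The guard fixed in PATCH 1/3 ran into a Python 3 restriction: ordering comparisons between a list and an int raise TypeError, so `k > 1` crashed whenever the evaluator received `k` as a list. A minimal sketch of the failure and of the normalization the patch applies (the standalone helper `should_compute_g_pass` is illustrative, not a function in opencompass):

    # Old guard: `if can_calculate and n > 1 and k > 1` raises
    # `TypeError: '>' not supported between instances of 'list' and 'int'`
    # whenever k arrives as a list such as [2, 4, 8].
    def should_compute_g_pass(n, k, can_calculate=True):
        # The patched guard: normalize k into a list once, then compare
        # its maximum against 1, exactly as the diff above does.
        k_list = [k] if isinstance(k, int) else k
        return can_calculate and n > 1 and max(k_list) > 1

    assert should_compute_g_pass(n=8, k=4)          # int k: worked before and after
    assert should_compute_g_pass(n=8, k=[2, 4, 8])  # list k: used to raise TypeError

Normalizing once into `k_list` also lets the two later guards and the `for _k in k_list:` loop share the same value instead of repeating the `isinstance` branch.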
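PATCH 3/3 moves the A/B parsing into `_generic_llmjudge_postprocess` and stops counting unparseable judge output as INCORRECT: a judgement containing neither letter now maps to 'unknown', feeds `not_attempted_count`, and is surfaced through the new `attempted_ratio` metric, while the headline `metric_name` score becomes accuracy over all examples and the old attempted-only accuracy is kept under `{metric_name}_given_attempted`. A small self-contained sketch of the new postprocessing behavior (re-implemented here for illustration rather than imported from opencompass):

    import re

    def postprocess(judgement: str) -> str:
        # After the patch: return 'unknown' instead of silently defaulting
        # to 'B' (INCORRECT) when the judge emitted neither A nor B, so a
        # misbehaving judge model shows up in attempted_ratio rather than
        # hiding as a low accuracy score.
        match = re.search(r'(A|B)', judgement)
        return match.group(0) if match else 'unknown'

    assert postprocess('A: the prediction matches the reference.') == 'A'
    assert postprocess('no grade given') == 'unknown'  # was graded 'B' before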