From 20660ab5071ab66215345d5fe8c4344df8bfcca6 Mon Sep 17 00:00:00 2001
From: Junnan Liu
Date: Thu, 10 Apr 2025 19:47:21 +0800
Subject: [PATCH 1/3] [Fix] Fix compare error when k is a list in base_evaluator (#2010)

* fix G-Pass compare error when k is a list

* fix the same compare error at line 177
---
 opencompass/openicl/icl_evaluator/icl_base_evaluator.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
index f7ff0277..10cc3fe4 100644
--- a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
@@ -159,9 +159,10 @@ class BaseEvaluator:
                     can_calculate = True
                     c += int(example['detail']['is_correct'])
 
-            if can_calculate and n > 1 and k > 1:
+            k_list = [k] if isinstance(k, int) else k
+            if can_calculate and n > 1 and max(k_list) > 1:
                 thresholds = [0.0, 0.25, 0.5, 0.75, 1.0]
-                for _k in [k] if isinstance(k, int) else k:
+                for _k in k_list:
                     for threshold in thresholds:
                         g_pass = compute_g_pass_at_k(n=n,
                                                      c=c,
@@ -174,7 +175,7 @@ class BaseEvaluator:
 
             eval_details.append(detail)
 
-        if can_calculate and n > 1 and k > 1:
+        if can_calculate and n > 1 and max(k_list) > 1:
             eval_results.update(self.reduce(eval_details))
 
         # Store eval_details in eval_results

From 3f50b1dc49778431fd6fefdd9a2e845e1e79f12e Mon Sep 17 00:00:00 2001
From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com>
Date: Fri, 11 Apr 2025 16:59:40 +0800
Subject: [PATCH 2/3] [Fix] Fix order bug in arena_hard.py (#2015)

---
 opencompass/datasets/subjective/arena_hard.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/opencompass/datasets/subjective/arena_hard.py b/opencompass/datasets/subjective/arena_hard.py
index b146f3ac..1403c978 100644
--- a/opencompass/datasets/subjective/arena_hard.py
+++ b/opencompass/datasets/subjective/arena_hard.py
@@ -146,7 +146,7 @@ def preety_print_two_ratings(ratings_1, ratings_2, column_names):
 
 
 def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
-    names = sorted(list(elo_ratings.keys()))
+    names = list(elo_ratings.keys())
     wins = defaultdict(lambda: defaultdict(lambda: 0))
     for a in names:
         for b in names:

From 6a6a1a5c0b24bb22f3b3fee4f395b03c21edb348 Mon Sep 17 00:00:00 2001
From: Linchen Xiao
Date: Fri, 11 Apr 2025 19:01:39 +0800
Subject: [PATCH 3/3] [Feature] LLM Judge sanity check (#2012)

* update

* update
---
 opencompass/datasets/generic.py | 37 ++++++++++++++++++++-------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/opencompass/datasets/generic.py b/opencompass/datasets/generic.py
index dc783167..07b6a0bb 100644
--- a/opencompass/datasets/generic.py
+++ b/opencompass/datasets/generic.py
@@ -11,17 +11,18 @@ def get_final_results(judged_answers,
     is_correct_count = 0
     is_incorrect_count = 0
     is_not_attempted_count = 0
+    attempted_judge_count = 0
     details = []
     for i, j, k in zip(judged_answers, references, origial_responses):
-        match = re.search(r'(A|B)', i)
-        grade_letter = match.group(
-            0) if match else 'B'  # Default to "INCORRECT" if no match
+        if i in ['A', 'B']:
+            attempted_judge_count += 1
+        grade_letter = i
         detail = {
             'pred': k,
             'ref': j,
             'origin_grade_response': i,
             'grade_letter': grade_letter,
-            'correct': False
+            'correct': False,
         }
         count += 1
         if grade_letter == 'A':
@@ -35,26 +36,32 @@ def get_final_results(judged_answers,
 
     is_correct = is_correct_count / count
     is_incorrect = is_incorrect_count / count
-    # is_not_attempted = is_not_attempted_count / count
     is_given_attempted = is_correct + is_incorrect
-    accuracy_given_attempted = is_correct / is_given_attempted \
-        if is_given_attempted > 0 else 0
-    f1 = 2 * accuracy_given_attempted * is_correct / (
-        accuracy_given_attempted + is_correct) if (accuracy_given_attempted +
-                                                   is_correct) > 0 else 0
+    loose_accuracy = is_correct / count
+    accuracy_given_attempted = (is_correct / is_given_attempted
+                                if is_given_attempted > 0 else 0)
+    attempted_judge_ratio = attempted_judge_count / count
+
+    f1 = (2 * accuracy_given_attempted * is_correct /
+          (accuracy_given_attempted + is_correct) if
+          (accuracy_given_attempted + is_correct) > 0 else 0)
     result = {
-        # 'accuracy_given_attempted': accuracy_given_attempted,
-        metric_name: accuracy_given_attempted * 100,
+        metric_name: loose_accuracy * 100,
+        f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
         'f1': f1,
-        'details': details
+        'attempted_ratio': attempted_judge_ratio * 100,
+        'correct_count': is_correct_count,
+        'incorrect_count': is_incorrect_count,
+        'not_attempted_count': is_not_attempted_count,
+        'details': details,
     }
     return result
 
 
 def _generic_llmjudge_postprocess(judgement: str):
     match = re.search(r'(A|B)', judgement)
-    grade_letter = match.group(
-        0) if match else 'B'  # Default to "INCORRECT" if no match
+    grade_letter = (match.group(0) if match else 'unknown'
+                    )  # Return 'unknown' if no match
     return grade_letter
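
The guard fixed in PATCH 1/3 ran into a Python 3 restriction: ordering comparisons between a list and an int raise TypeError, so `k > 1` crashed whenever the evaluator received `k` as a list. A minimal sketch of the failure and of the normalization the patch applies (the standalone helper `should_compute_g_pass` is illustrative, not a function in opencompass):

    # Old guard: `if can_calculate and n > 1 and k > 1` raises
    # `TypeError: '>' not supported between instances of 'list' and 'int'`
    # whenever k arrives as a list such as [2, 4, 8].
    def should_compute_g_pass(n, k, can_calculate=True):
        # The patched guard: normalize k into a list once, then compare
        # its maximum against 1, exactly as the diff above does.
        k_list = [k] if isinstance(k, int) else k
        return can_calculate and n > 1 and max(k_list) > 1

    assert should_compute_g_pass(n=8, k=4)          # int k: worked before and after
    assert should_compute_g_pass(n=8, k=[2, 4, 8])  # list k: used to raise TypeError

Normalizing once into `k_list` also lets the two later guards and the `for _k in k_list:` loop share the same value instead of repeating the `isinstance` branch.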
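PATCH 3/3 moves the A/B parsing into `_generic_llmjudge_postprocess` and stops counting unparseable judge output as INCORRECT: a judgement containing neither letter now maps to 'unknown', feeds `not_attempted_count`, and is surfaced through the new `attempted_ratio` metric, while the headline `metric_name` score becomes accuracy over all examples and the old attempted-only accuracy is kept under `{metric_name}_given_attempted`. A small self-contained sketch of the new postprocessing behavior (re-implemented here for illustration rather than imported from opencompass):

    import re

    def postprocess(judgement: str) -> str:
        # After the patch: return 'unknown' instead of silently defaulting
        # to 'B' (INCORRECT) when the judge emitted neither A nor B, so a
        # misbehaving judge model shows up in attempted_ratio rather than
        # hiding as a low accuracy score.
        match = re.search(r'(A|B)', judgement)
        return match.group(0) if match else 'unknown'

    assert postprocess('A: the prediction matches the reference.') == 'A'
    assert postprocess('no grade given') == 'unknown'  # was graded 'B' before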