Merge branch 'open-compass:main' into main

This commit is contained in:
Deadwalk 2025-04-14 08:13:48 +08:00 committed by GitHub
commit 6ce8643cfc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 27 additions and 19 deletions

View File

@@ -11,17 +11,18 @@ def get_final_results(judged_answers,
is_correct_count = 0 is_correct_count = 0
is_incorrect_count = 0 is_incorrect_count = 0
is_not_attempted_count = 0 is_not_attempted_count = 0
attempted_judge_count = 0
details = [] details = []
for i, j, k in zip(judged_answers, references, origial_responses): for i, j, k in zip(judged_answers, references, origial_responses):
match = re.search(r'(A|B)', i) if i in ['A', 'B']:
grade_letter = match.group( attempted_judge_count += 1
0) if match else 'B' # Default to "INCORRECT" if no match grade_letter = i
detail = { detail = {
'pred': k, 'pred': k,
'ref': j, 'ref': j,
'origin_grade_response': i, 'origin_grade_response': i,
'grade_letter': grade_letter, 'grade_letter': grade_letter,
'correct': False 'correct': False,
} }
count += 1 count += 1
if grade_letter == 'A': if grade_letter == 'A':
@@ -35,26 +36,32 @@ def get_final_results(judged_answers,
is_correct = is_correct_count / count is_correct = is_correct_count / count
is_incorrect = is_incorrect_count / count is_incorrect = is_incorrect_count / count
# is_not_attempted = is_not_attempted_count / count
is_given_attempted = is_correct + is_incorrect is_given_attempted = is_correct + is_incorrect
accuracy_given_attempted = is_correct / is_given_attempted \ loose_accuracy = is_correct / count
if is_given_attempted > 0 else 0 accuracy_given_attempted = (is_correct / is_given_attempted
f1 = 2 * accuracy_given_attempted * is_correct / ( if is_given_attempted > 0 else 0)
accuracy_given_attempted + is_correct) if (accuracy_given_attempted + attempted_judge_ratio = attempted_judge_count / count
is_correct) > 0 else 0
f1 = (2 * accuracy_given_attempted * is_correct /
(accuracy_given_attempted + is_correct) if
(accuracy_given_attempted + is_correct) > 0 else 0)
result = { result = {
# 'accuracy_given_attempted': accuracy_given_attempted, metric_name: loose_accuracy * 100,
metric_name: accuracy_given_attempted * 100, f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
'f1': f1, 'f1': f1,
'details': details 'attempted_ratio': attempted_judge_ratio * 100,
'correct_count': is_correct_count,
'incorrect_count': is_incorrect_count,
'not_attempted_count': is_not_attempted_count,
'details': details,
} }
return result return result
def _generic_llmjudge_postprocess(judgement: str): def _generic_llmjudge_postprocess(judgement: str):
match = re.search(r'(A|B)', judgement) match = re.search(r'(A|B)', judgement)
grade_letter = match.group( grade_letter = (match.group(0) if match else 'unknown'
0) if match else 'B' # Default to "INCORRECT" if no match ) # Return 'unknown' if no match
return grade_letter return grade_letter

View File

@@ -146,7 +146,7 @@ def preety_print_two_ratings(ratings_1, ratings_2, column_names):
def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000): def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
names = sorted(list(elo_ratings.keys())) names = list(elo_ratings.keys())
wins = defaultdict(lambda: defaultdict(lambda: 0)) wins = defaultdict(lambda: defaultdict(lambda: 0))
for a in names: for a in names:
for b in names: for b in names:

View File

@@ -159,9 +159,10 @@ class BaseEvaluator:
can_calculate = True can_calculate = True
c += int(example['detail']['is_correct']) c += int(example['detail']['is_correct'])
if can_calculate and n > 1 and k > 1: k_list = [k] if isinstance(k, int) else k
if can_calculate and n > 1 and max(k_list) > 1:
thresholds = [0.0, 0.25, 0.5, 0.75, 1.0] thresholds = [0.0, 0.25, 0.5, 0.75, 1.0]
for _k in [k] if isinstance(k, int) else k: for _k in k_list:
for threshold in thresholds: for threshold in thresholds:
g_pass = compute_g_pass_at_k(n=n, g_pass = compute_g_pass_at_k(n=n,
c=c, c=c,
@@ -174,7 +175,7 @@ class BaseEvaluator:
eval_details.append(detail) eval_details.append(detail)
if can_calculate and n > 1 and k > 1: if can_calculate and n > 1 and max(k_list) > 1:
eval_results.update(self.reduce(eval_details)) eval_results.update(self.reduce(eval_details))
# Store eval_details in eval_results # Store eval_details in eval_results