[Feature] LLM Judge sanity check (#2012)

* update

* update
This commit is contained in:
Linchen Xiao 2025-04-11 19:01:39 +08:00 committed by GitHub
parent 3f50b1dc49
commit 6a6a1a5c0b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -11,17 +11,18 @@ def get_final_results(judged_answers,
is_correct_count = 0
is_incorrect_count = 0
is_not_attempted_count = 0
attempted_judge_count = 0
details = []
for i, j, k in zip(judged_answers, references, origial_responses):
match = re.search(r'(A|B)', i)
grade_letter = match.group(
0) if match else 'B' # Default to "INCORRECT" if no match
if i in ['A', 'B']:
attempted_judge_count += 1
grade_letter = i
detail = {
'pred': k,
'ref': j,
'origin_grade_response': i,
'grade_letter': grade_letter,
'correct': False
'correct': False,
}
count += 1
if grade_letter == 'A':
@ -35,26 +36,32 @@ def get_final_results(judged_answers,
is_correct = is_correct_count / count
is_incorrect = is_incorrect_count / count
# is_not_attempted = is_not_attempted_count / count
is_given_attempted = is_correct + is_incorrect
accuracy_given_attempted = is_correct / is_given_attempted \
if is_given_attempted > 0 else 0
f1 = 2 * accuracy_given_attempted * is_correct / (
accuracy_given_attempted + is_correct) if (accuracy_given_attempted +
is_correct) > 0 else 0
loose_accuracy = is_correct / count
accuracy_given_attempted = (is_correct / is_given_attempted
if is_given_attempted > 0 else 0)
attempted_judge_ratio = attempted_judge_count / count
f1 = (2 * accuracy_given_attempted * is_correct /
(accuracy_given_attempted + is_correct) if
(accuracy_given_attempted + is_correct) > 0 else 0)
result = {
# 'accuracy_given_attempted': accuracy_given_attempted,
metric_name: accuracy_given_attempted * 100,
metric_name: loose_accuracy * 100,
f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
'f1': f1,
'details': details
'attempted_ratio': attempted_judge_ratio * 100,
'correct_count': is_correct_count,
'incorrect_count': is_incorrect_count,
'not_attempted_count': is_not_attempted_count,
'details': details,
}
return result
def _generic_llmjudge_postprocess(judgement: str):
match = re.search(r'(A|B)', judgement)
grade_letter = match.group(
0) if match else 'B' # Default to "INCORRECT" if no match
grade_letter = (match.group(0) if match else 'unknown'
) # Return 'unknown' if no match
return grade_letter