mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Merge branch 'open-compass:main' into main

commit 6ce8643cfc
@@ -11,17 +11,18 @@ def get_final_results(judged_answers,
     is_correct_count = 0
     is_incorrect_count = 0
     is_not_attempted_count = 0
+    attempted_judge_count = 0
     details = []
     for i, j, k in zip(judged_answers, references, origial_responses):
-        match = re.search(r'(A|B)', i)
-        grade_letter = match.group(
-            0) if match else 'B'  # Default to "INCORRECT" if no match
+        if i in ['A', 'B']:
+            attempted_judge_count += 1
+        grade_letter = i
         detail = {
             'pred': k,
             'ref': j,
             'origin_grade_response': i,
             'grade_letter': grade_letter,
-            'correct': False
+            'correct': False,
         }
         count += 1
         if grade_letter == 'A':
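For context, a minimal sketch of the reworked loop with made-up toy data (the inputs and expected outputs below are illustrative assumptions, not from the source; the misspelled `origial_responses` mirrors the parameter name in the repo). The point of the change: `grade_letter` is now taken verbatim from the judge postprocessor instead of being re-parsed with a regex, and `attempted_judge_count` records how many judgements actually parsed to 'A' or 'B'.

# Hedged sketch with invented toy inputs.
judged_answers = ['A', 'B', 'unknown']   # e.g. output of _generic_llmjudge_postprocess
references = ['ref-1', 'ref-2', 'ref-3']
origial_responses = ['pred-1', 'pred-2', 'pred-3']

attempted_judge_count = 0
details = []
for i, j, k in zip(judged_answers, references, origial_responses):
    if i in ['A', 'B']:           # the judge produced a parseable verdict
        attempted_judge_count += 1
    grade_letter = i              # taken verbatim; 'unknown' passes through
    details.append({
        'pred': k,
        'ref': j,
        'origin_grade_response': i,
        'grade_letter': grade_letter,
        'correct': grade_letter == 'A',
    })

print(attempted_judge_count)      # -> 2: only 'A'/'B' verdicts count as attempted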
@@ -35,26 +36,32 @@ def get_final_results(judged_answers,

     is_correct = is_correct_count / count
     is_incorrect = is_incorrect_count / count
-    # is_not_attempted = is_not_attempted_count / count
     is_given_attempted = is_correct + is_incorrect
-    accuracy_given_attempted = is_correct / is_given_attempted \
-        if is_given_attempted > 0 else 0
-    f1 = 2 * accuracy_given_attempted * is_correct / (
-        accuracy_given_attempted + is_correct) if (accuracy_given_attempted +
-                                                   is_correct) > 0 else 0
+    loose_accuracy = is_correct / count
+    accuracy_given_attempted = (is_correct / is_given_attempted
+                                if is_given_attempted > 0 else 0)
+    attempted_judge_ratio = attempted_judge_count / count
+
+    f1 = (2 * accuracy_given_attempted * is_correct /
+          (accuracy_given_attempted + is_correct) if
+          (accuracy_given_attempted + is_correct) > 0 else 0)
     result = {
-        # 'accuracy_given_attempted': accuracy_given_attempted,
-        metric_name: accuracy_given_attempted * 100,
+        metric_name: loose_accuracy * 100,
+        f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
         'f1': f1,
-        'details': details
+        'attempted_ratio': attempted_judge_ratio * 100,
+        'correct_count': is_correct_count,
+        'incorrect_count': is_incorrect_count,
+        'not_attempted_count': is_not_attempted_count,
+        'details': details,
     }
     return result


 def _generic_llmjudge_postprocess(judgement: str):
     match = re.search(r'(A|B)', judgement)
-    grade_letter = match.group(
-        0) if match else 'B'  # Default to "INCORRECT" if no match
+    grade_letter = (match.group(0) if match else 'unknown'
+                    )  # Return 'unknown' if no match
     return grade_letter
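To make the new metric family concrete, here is a hedged, self-contained recomputation with invented counts (the variable names mirror the diff; every number is illustrative only). It also reflects the second change in this hunk: unparseable judgements now surface as 'unknown' instead of silently defaulting to 'B' (incorrect), which is what makes `attempted_judge_count` meaningful.

count = 10                       # total judged samples (invented)
is_correct_count = 6
is_incorrect_count = 2
is_not_attempted_count = 2
attempted_judge_count = 8        # judgements that parsed to 'A' or 'B'

is_correct = is_correct_count / count                # 0.6
is_incorrect = is_incorrect_count / count            # 0.2
is_given_attempted = is_correct + is_incorrect       # 0.8
loose_accuracy = is_correct / count                  # 0.6 -> new headline metric
accuracy_given_attempted = (is_correct / is_given_attempted
                            if is_given_attempted > 0 else 0)  # 0.75
attempted_judge_ratio = attempted_judge_count / count          # 0.8
f1 = (2 * accuracy_given_attempted * is_correct /
      (accuracy_given_attempted + is_correct) if
      (accuracy_given_attempted + is_correct) > 0 else 0)      # ~0.667

# The headline metric is now the stricter accuracy over *all* samples
# (60.0 here), while the old accuracy-over-attempted value (75.0) is
# kept under the f'{metric_name}_given_attempted' key.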
@@ -146,7 +146,7 @@ def preety_print_two_ratings(ratings_1, ratings_2, column_names):


 def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
-    names = sorted(list(elo_ratings.keys()))
+    names = list(elo_ratings.keys())
     wins = defaultdict(lambda: defaultdict(lambda: 0))
     for a in names:
         for b in names:
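One behavioral note, sketched under the assumption that elo_ratings is a plain dict (the model names below are hypothetical): dropping sorted() makes the row/column order of the win-rate matrix follow insertion order rather than alphabetical order.

elo_ratings = {'model-b': 1100, 'model-a': 900}   # hypothetical ratings

# Before: alphabetical order, regardless of insertion order.
print(sorted(list(elo_ratings.keys())))   # ['model-a', 'model-b']

# After: insertion order is preserved (guaranteed for dicts since Python 3.7).
print(list(elo_ratings.keys()))           # ['model-b', 'model-a']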
@@ -159,9 +159,10 @@ class BaseEvaluator:
                 can_calculate = True
                 c += int(example['detail']['is_correct'])

-        if can_calculate and n > 1 and k > 1:
+        k_list = [k] if isinstance(k, int) else k
+        if can_calculate and n > 1 and max(k_list) > 1:
             thresholds = [0.0, 0.25, 0.5, 0.75, 1.0]
-            for _k in [k] if isinstance(k, int) else k:
+            for _k in k_list:
                 for threshold in thresholds:
                     g_pass = compute_g_pass_at_k(n=n,
                                                  c=c,
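The k_list normalization lets k be either a single int or a list of cutoffs; the old `k > 1` guard would raise a TypeError for a list. A hedged sketch of the pattern in isolation (`normalize_k` is a hypothetical helper name; the diff does this inline):

def normalize_k(k):
    # Accept a single cutoff or a list of cutoffs.
    return [k] if isinstance(k, int) else k

print(normalize_k(4))            # [4]
print(normalize_k([2, 4, 8]))    # [2, 4, 8]

# The guard then becomes max(k_list) > 1, so G-Pass@k statistics are
# computed whenever *any* requested cutoff exceeds 1.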
@@ -174,7 +175,7 @@ class BaseEvaluator:

             eval_details.append(detail)

-        if can_calculate and n > 1 and k > 1:
+        if can_calculate and n > 1 and max(k_list) > 1:
             eval_results.update(self.reduce(eval_details))

             # Store eval_details in eval_results