zhangsongyang 2025-05-06 13:13:11 +00:00
parent ba0186ba1c
commit 7f76b12ae6


@@ -174,22 +174,22 @@ class korbenchEvaluator(BaseEvaluator):
        super().__init__()

    def sample_score(self, prediction, reference, test_item=None):
        """Evaluate a single sample.

        Args:
            prediction: The model's prediction
            reference: The reference answer
            test_item: Additional information about the test sample

        Returns:
            Dict: A dictionary containing evaluation results
        """
        if test_item is None:
            raise ValueError('Test item is required.')
        prompt_mode = test_item.get('prompt_mode')

        # Build data for a single sample
        entry = {
            'prediction': prediction,
            'gold': reference,
@@ -200,10 +200,10 @@ class korbenchEvaluator(BaseEvaluator):
            'base_path': test_item.get('base_path', None),
        }

        # Evaluate the single sample
        data = {0: entry}

        # Evaluate based on different prompt_mode
        if prompt_mode == '0_shot':
            evaluation_results = evaluate_responses(data, '0_shot')
        elif prompt_mode == '3_shot':
@@ -218,21 +218,21 @@ class korbenchEvaluator(BaseEvaluator):
                'answer': reference
            }

        # Return evaluation results
        result = evaluation_results[0]
        result['correct'] = result['is_correct']
        result.update({'pred': prediction, 'answer': reference})
        return result

    def score(self, predictions, references, test_set):
        """Evaluate each sample using sample_score."""
        if not test_set:
            raise ValueError('Test set is empty.')

        details = []
        correct_count = 0

        # Call sample_score for each sample
        for i in range(len(predictions)):
            result = self.sample_score(predictions[i], references[i],
                                       test_set[i])
@@ -240,9 +240,9 @@ class korbenchEvaluator(BaseEvaluator):
            if result.get('is_correct', False):
                correct_count += 1

        # Calculate accuracy
        accuracy = (correct_count /
                    len(predictions)) * 100 if predictions else 0

        # Return evaluation results
        return {'accuracy': accuracy, 'details': details}
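
For anyone exercising the new per-sample API outside a full OpenCompass run, here is a minimal sketch of how score and sample_score fit together. It assumes korbenchEvaluator is importable from the module this diff touches; the test_item field values, and any fields hidden in the elided parts of the entry dict, are hypothetical and shown only for illustration.

# Minimal usage sketch (assumed values; not an official example).
# korbenchEvaluator is the class patched in this commit; the test_item
# contents below are hypothetical placeholders.
evaluator = korbenchEvaluator()

predictions = ['model answer']
references = ['gold answer']
test_set = [{
    'prompt_mode': '0_shot',  # routes to evaluate_responses(data, '0_shot')
    'base_path': None,        # optional; sample_score defaults it to None
    # ...plus whatever fields the elided part of `entry` forwards.
}]

result = evaluator.score(predictions, references, test_set)
print(result['accuracy'])     # a percentage: score multiplies by 100
for detail in result['details']:
    print(detail['correct'], detail['pred'], detail['answer'])

Note that each entry in details carries both the raw is_correct flag from evaluate_responses and the normalized correct alias that sample_score adds alongside pred and answer.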