zhangsongyang 2025-04-30 08:50:25 +00:00
parent 7605cc2ca4
commit 4c2e66d335
5 changed files with 95 additions and 45 deletions

View File

@ -10,6 +10,7 @@ Setting:
Available Models:
- Instruct/Chat Models
"""
from datasets import parallel
from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
@ -114,6 +115,7 @@ for category in categories:
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
parallel=False,
)
)
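For orientation, a minimal sketch of where the new parallel=False flag sits in this kind of config. The surrounding structure is an assumption (the diff only shows the judge settings and the added flag): the CascadeEvaluator/GenericLLMEvaluator import path and the use of korbenchEvaluator as the rule evaluator are guesses, and as I read CascadeEvaluator, parallel=False means only samples that fail the rule check are sent to the LLM judge.

from opencompass.datasets.korbench.korbench import korbenchEvaluator
from opencompass.evaluator import CascadeEvaluator, GenericLLMEvaluator  # assumed import path

korbench_eval_cfg = dict(
    evaluator=dict(
        type=CascadeEvaluator,
        # Rule-based pass: the refactored korbenchEvaluator scores one sample at a time.
        rule_evaluator=dict(type=korbenchEvaluator),
        # LLM-judge pass for samples the rule pass marks incorrect.
        llm_evaluator=dict(
            type=GenericLLMEvaluator,
            judge_cfg=dict(),
        ),
        # Flag added in this commit: cascade instead of judging every sample.
        parallel=False,
    ),
)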

View File

@ -173,44 +173,76 @@ class korbenchEvaluator(BaseEvaluator):
def __init__(self):
super().__init__()
def score(self, predictions, references, test_set):
"""Evaluate predictions for a single prompt_mode in KOR-Bench."""
if not test_set:
raise ValueError('Test set is empty.')
prompt_mode = test_set[0][
'prompt_mode'] # Determine the prompt_mode from the first entry
data = {}
# Organize data for the given prompt_mode
for i in range(len(predictions)):
def sample_score(self, prediction, reference, test_item=None):
"""Evaluate a single sample.
Args:
prediction: The model's prediction for this sample.
reference: The reference (gold) answer.
test_item: Additional information about the test sample.
Returns:
Dict: A dictionary containing the evaluation result.
"""
if test_item is None:
raise ValueError('Test item is required.')
prompt_mode = test_item.get('prompt_mode')
# Build the data for a single sample
entry = {
'prediction': predictions[i],
'gold': references[i],
'rule_id': test_set[i].get('rule_id', None),
'category': test_set[i].get('category', None),
'rule_list': test_set[i].get('rule_list', None),
'question_list': test_set[i].get('question_list', None),
'base_path': test_set[i].get('base_path', None),
'prediction': prediction,
'gold': reference,
'rule_id': test_item.get('rule_id', None),
'category': test_item.get('category', None),
'rule_list': test_item.get('rule_list', None),
'question_list': test_item.get('question_list', None),
'base_path': test_item.get('base_path', None),
}
data[i] = entry
if not data:
raise ValueError(f"No data found for prompt_mode '{prompt_mode}'")
# Evaluate the single sample
data = {0: entry}
# Evaluate based on the prompt_mode
# Evaluate according to the prompt_mode
if prompt_mode == '0_shot':
evaluation_results = evaluate_responses(data, '0_shot')
elif prompt_mode == '3_shot':
evaluation_results = evaluate_responses(data, '3_shot')
elif prompt_mode in ['Multi-Q', 'Multi-R', 'Multi-RQ', 'mixed']:
evaluation_results = evaluate_responses(data, 'mixed',
test_set[0]['base_path'])
test_item.get('base_path'))
else:
raise ValueError(f'Unsupported prompt_mode: {prompt_mode}')
# Calculate accuracy
correct_count = sum(res['is_correct'] for res in evaluation_results)
accuracy = (correct_count / len(evaluation_results)) * 100
return {
'is_correct': False,
'pred': prediction,
'answer': reference
}
# Return scores
return {'accuracy': accuracy}
# Return the evaluation result
result = evaluation_results[0]
result['correct'] = result['is_correct']
result.update({'pred': prediction, 'answer': reference})
return result
def score(self, predictions, references, test_set):
"""使用 sample_score 对每个样本进行评测。"""
if not test_set:
raise ValueError('Test set is empty.')
details = []
correct_count = 0
# Call sample_score on each sample
for i in range(len(predictions)):
result = self.sample_score(predictions[i], references[i],
test_set[i])
details.append(result)
if result.get('is_correct', False):
correct_count += 1
# Calculate accuracy
accuracy = (correct_count /
len(predictions)) * 100 if predictions else 0
# Return the evaluation results
return {'accuracy': accuracy, 'details': details}
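To make the refactor concrete, here is a self-contained sketch of the per-sample delegation pattern the new score() follows (dummy_sample_score is a placeholder, not the real KOR-Bench rule logic): iterate over the dataset, hand each row to a sample-level scorer together with its test item, and aggregate accuracy from the is_correct flags.

from typing import Any, Dict, List


def dummy_sample_score(pred: str, ref: str, item: Dict[str, Any]) -> Dict[str, Any]:
    # Stand-in for korbenchEvaluator.sample_score: a plain exact-match rule.
    return {'is_correct': pred.strip() == ref.strip(), 'pred': pred, 'answer': ref}


def score(predictions: List[str], references: List[str],
          test_set: List[Dict[str, Any]]) -> Dict[str, Any]:
    # Same shape as the new korbenchEvaluator.score: delegate, then aggregate.
    details = [
        dummy_sample_score(p, r, item)
        for p, r, item in zip(predictions, references, test_set)
    ]
    correct = sum(d['is_correct'] for d in details)
    accuracy = correct / len(predictions) * 100 if predictions else 0
    return {'accuracy': accuracy, 'details': details}


print(score(['A', 'B'], ['A', 'C'], [{}, {}]))  # {'accuracy': 50.0, ...}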

View File

@ -34,7 +34,8 @@ class CascadeEvaluator(BaseEvaluator):
sample_score_fn: Optional[Callable] = None,
parallel: bool = True,
) -> None:
self.logger = get_logger()
super().__init__()
self.logger = get_logger(__name__)
# Initialize the LLM evaluator
llm_evaluator_type = llm_evaluator.pop('type')
@ -58,7 +59,10 @@ class CascadeEvaluator(BaseEvaluator):
raise ValueError(
'Either rule_evaluator or sample_score_fn must be provided')
def sample_score(self, prediction: str, reference: str) -> Dict[str, Any]:
def sample_score(self,
prediction: str,
reference: str,
test_set=None) -> Dict[str, Any]:
"""Score a single sample using sample_score_fn or rule_evaluator.
Args:
@ -70,7 +74,7 @@ class CascadeEvaluator(BaseEvaluator):
"""
if self.sample_score_fn:
# Use user-provided function to evaluate a single sample
result = self.sample_score_fn(prediction, reference)
result = self.sample_score_fn(prediction, reference, test_set)
if not isinstance(result, dict):
# Ensure result is a dictionary with at least 'correct' field
result = {
@ -82,7 +86,8 @@ class CascadeEvaluator(BaseEvaluator):
else:
# Use rule_evaluator to evaluate a single sample by calling
# the score method with single-element lists
result = self.rule_evaluator.score([prediction], [reference])
result = self.rule_evaluator.score([prediction], [reference],
[test_set])
if 'details' in result and len(result['details']) > 0:
return result['details'][0]
else:
@ -137,7 +142,11 @@ class CascadeEvaluator(BaseEvaluator):
failed_indices = []
for i, (pred, ref) in enumerate(zip(predictions, references)):
result = self.sample_score(pred, ref)
if test_set is not None:
test_item = test_set[i]
else:
test_item = None
result = self.sample_score(pred, ref, test_item)
result['evaluation_method'] = 'rule'
details.append({'rule_evaluation': result})
@ -182,8 +191,9 @@ class CascadeEvaluator(BaseEvaluator):
self.llm_evaluator._out_dir = f'{self._out_dir}_llm_judge'
# Generate random hash suffix
llm_results_path = f'{self.llm_evaluator._out_dir}_replica{self.dataset_replica_idx}' # noqa
llm_results_path = f'{self.llm_evaluator._out_dir}_replica{self.dataset_replica_idx}.json' # noqa
self.logger.info(f'LLM evaluation results will be saved at '
f'{llm_results_path}')
# Check if results already exist to avoid re-evaluation
if os.path.exists(llm_results_path):
self.logger.info(
@ -214,7 +224,9 @@ class CascadeEvaluator(BaseEvaluator):
# Use GenericLLMEvaluator to evaluate samples
# unset dataset_cfg for GenericLLMEvaluator to
# directly use test_set
self.llm_evaluator.output_path = llm_results_path
# self.llm_evaluator.output_path = llm_results_path
self.llm_evaluator._dataset_replica_idx = \
self._dataset_replica_idx
self.llm_evaluator.dataset_cfg = None
llm_results = self.llm_evaluator.score(

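One consequence of these changes: a user-supplied sample_score_fn is now called with the per-sample test item as a third argument. A hedged sketch of such a callable (the function name and the metadata key it reads are illustrative, not part of this diff); it would be handed to CascadeEvaluator via sample_score_fn=my_sample_score_fn.

from typing import Any, Dict, Optional


def my_sample_score_fn(prediction: str, reference: str,
                       test_item: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    # Rule check plus metadata pulled from the test item (keys are illustrative).
    item = test_item or {}
    return {
        'correct': prediction.strip() == reference.strip(),
        'pred': prediction,
        'answer': reference,
        'category': item.get('category'),
    }


print(my_sample_score_fn(' 42 ', '42', {'category': 'math'}))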
View File

@ -15,6 +15,8 @@ from opencompass.registry import (DICT_POSTPROCESSORS, ICL_PROMPT_TEMPLATES,
from opencompass.utils import build_dataset_from_cfg, build_model_from_cfg
from opencompass.utils.logging import get_logger
logger = get_logger(__name__)
class GenericLLMEvaluator(BaseEvaluator):
"""Generic LLM evaluator.
@ -37,8 +39,7 @@ class GenericLLMEvaluator(BaseEvaluator):
dict_postprocessor: Optional[ConfigDict] = None,
keep_predictions: bool = False,
) -> None:
self.logger = get_logger()
super().__init__(pred_postprocessor=pred_postprocessor)
# If judge_cfg is not provided, fall back to the default configuration
if not judge_cfg:
self.judge_cfg = self.default_judge_cfg
@ -57,14 +58,12 @@ class GenericLLMEvaluator(BaseEvaluator):
def build_inferencer(self):
"""Build LLM Inference."""
if not self.output_path:
self.output_path = self._out_dir
self.output_path = f'{self._out_dir}_replica{self.dataset_replica_idx}.json' # noqa
logger.info(f'LLM judge details will be saved at:{self.output_path}')
out_dir, out_name = osp.split(self.output_path)
out_name = f'{out_name}.json'
self.output_path = osp.join(out_dir, out_name)
self.logger.info(
logger.info(
f'Set self.output_path to {self.output_path} for current task')
assert self.output_path is not None, 'output_path is None'
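The effect of the new default is one judge-output file per dataset replica. A tiny sketch of the resulting file names (the directory is a placeholder for self._out_dir):

out_dir = 'outputs/llm_judge/korbench_0shot'  # placeholder for self._out_dir
for replica_idx in range(2):                  # stands in for self.dataset_replica_idx
    print(f'{out_dir}_replica{replica_idx}.json')
# outputs/llm_judge/korbench_0shot_replica0.json
# outputs/llm_judge/korbench_0shot_replica1.json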
@ -194,7 +193,7 @@ class GenericLLMEvaluator(BaseEvaluator):
@property
def default_judge_cfg(self):
from opencompass.models import OpenAISDK
self.logger.info('Please set your judge model in `OC_JUDGE_MODEL`, \
logger.info('Please set your judge model in `OC_JUDGE_MODEL`, \
`OC_JUDGE_API_KEY`, `OC_JUDGE_API_BASE` environment variables.')
DEFAULT_JUDGE_CFG = dict(
type=OpenAISDK,

View File

@ -9,6 +9,9 @@ from datasets import Dataset
from scipy.stats import hypergeom
from opencompass.registry import TEXT_POSTPROCESSORS
from opencompass.utils.logging import get_logger
logger = get_logger(__name__)
def compute_pass_at_k(n, c, k):
@ -43,6 +46,7 @@ class BaseEvaluator:
def __init__(self, pred_postprocessor=None) -> None:
self.pred_postprocessor = pred_postprocessor
self._dataset_replica_idx = 0 # Default value for dataset_replica_idx
@property
def output_dir(self):
@ -117,6 +121,7 @@ class BaseEvaluator:
all_results = []
for i in range(n):
self._dataset_replica_idx = i
logger.info(f'Running {i}-th replica of evaluation')
def select_fn(i, real_size, x):
if isinstance(x, Dataset):
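Taken together, the BaseEvaluator changes add a module-level logger, a default replica index, and a per-replica log line. A self-contained sketch of that bookkeeping pattern, using the standard logging module instead of opencompass.utils.logging (the class and method names are illustrative, not the real BaseEvaluator API):

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ReplicaAwareEvaluator:
    """Illustrative stand-in for the replica bookkeeping added to BaseEvaluator."""

    def __init__(self) -> None:
        self._dataset_replica_idx = 0  # default value, as in this commit

    @property
    def dataset_replica_idx(self) -> int:
        return self._dataset_replica_idx

    def evaluate(self, n: int) -> None:
        for i in range(n):
            self._dataset_replica_idx = i
            logger.info(f'Running {i}-th replica of evaluation')
            # ... select the i-th replica of predictions and score it ...


ReplicaAwareEvaluator().evaluate(2)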