Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Commit 4c2e66d335 ("Update"), parent 7605cc2ca4
@@ -10,6 +10,7 @@ Setting:
Avaliable Models:
- Instruct/Chat Models
"""
from datasets import parallel
from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate

@@ -114,6 +115,7 @@ for category in categories:
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
parallel=False,
)
)
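For context, the `parallel=False` flag added above is an option on the dataset's evaluator config. A minimal sketch of how the surrounding evaluator block might look, assuming a CascadeEvaluator that wraps the rule-based korbenchEvaluator and a GenericLLMEvaluator judge; the nesting, import paths, and everything not visible in the hunk are assumptions, and the prompt template and judge settings are elided:

```python
# Sketch only: import paths and the CascadeEvaluator/GenericLLMEvaluator
# wiring are assumptions for illustration; prompt template and judge
# settings are omitted.
from opencompass.datasets.korbench.korbench import korbenchEvaluator
from opencompass.evaluator import CascadeEvaluator, GenericLLMEvaluator  # assumed path
from opencompass.datasets import generic_llmjudge_postprocess  # assumed path

korbench_eval_cfg = dict(
    evaluator=dict(
        type=CascadeEvaluator,
        # Fast rule-based check first.
        rule_evaluator=dict(type=korbenchEvaluator),
        # LLM judge for samples the rule check cannot confirm.
        llm_evaluator=dict(
            type=GenericLLMEvaluator,
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
        # Added in this commit: presumably switch from judging every sample
        # with the LLM to a cascade where only rule-failed samples reach it.
        parallel=False,
    ),
)
```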
@@ -173,44 +173,76 @@ class korbenchEvaluator(BaseEvaluator):
def __init__(self):
super().__init__()

def score(self, predictions, references, test_set):
"""Evaluate predictions for a single prompt_mode in KOR-Bench."""
if not test_set:
raise ValueError('Test set is empty.')
def sample_score(self, prediction, reference, test_item=None):
"""Evaluate a single sample.

prompt_mode = test_set[0][
'prompt_mode'] # Determine the prompt_mode from the first entry
data = {}
Args:
prediction: The model's prediction
reference: The reference answer
test_item: Additional information about the test sample

# Organize data for the given prompt_mode
for i in range(len(predictions)):
Returns:
Dict: A dictionary containing the evaluation result
"""
if test_item is None:
raise ValueError('Test item is required.')

prompt_mode = test_item.get('prompt_mode')

# Build the data for a single sample
entry = {
'prediction': predictions[i],
'gold': references[i],
'rule_id': test_set[i].get('rule_id', None),
'category': test_set[i].get('category', None),
'rule_list': test_set[i].get('rule_list', None),
'question_list': test_set[i].get('question_list', None),
'base_path': test_set[i].get('base_path', None),
'prediction': prediction,
'gold': reference,
'rule_id': test_item.get('rule_id', None),
'category': test_item.get('category', None),
'rule_list': test_item.get('rule_list', None),
'question_list': test_item.get('question_list', None),
'base_path': test_item.get('base_path', None),
}
data[i] = entry

if not data:
raise ValueError(f"No data found for prompt_mode '{prompt_mode}'")
# Evaluate the single sample
data = {0: entry}

# Evaluate based on the prompt_mode
# Evaluate according to the prompt_mode
if prompt_mode == '0_shot':
evaluation_results = evaluate_responses(data, '0_shot')
elif prompt_mode == '3_shot':
evaluation_results = evaluate_responses(data, '3_shot')
elif prompt_mode in ['Multi-Q', 'Multi-R', 'Multi-RQ', 'mixed']:
evaluation_results = evaluate_responses(data, 'mixed',
test_set[0]['base_path'])
test_item.get('base_path'))
else:
raise ValueError(f'Unsupported prompt_mode: {prompt_mode}')
# Calculate accuracy
correct_count = sum(res['is_correct'] for res in evaluation_results)
accuracy = (correct_count / len(evaluation_results)) * 100
return {
'is_correct': False,
'pred': prediction,
'answer': reference
}

# Return scores
return {'accuracy': accuracy}
# Return the evaluation result
result = evaluation_results[0]
result['correct'] = result['is_correct']
result.update({'pred': prediction, 'answer': reference})
return result

def score(self, predictions, references, test_set):
"""Evaluate every sample with sample_score."""
if not test_set:
raise ValueError('Test set is empty.')

details = []
correct_count = 0

# Call sample_score on each sample
for i in range(len(predictions)):
result = self.sample_score(predictions[i], references[i],
test_set[i])
details.append(result)
if result.get('is_correct', False):
correct_count += 1

# Compute the accuracy
accuracy = (correct_count /
len(predictions)) * 100 if predictions else 0

# Return the evaluation results
return {'accuracy': accuracy, 'details': details}
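The net effect of this hunk is that korbenchEvaluator now exposes a per-sample entry point, sample_score, and score becomes a thin loop over it, which is what lets a cascade-style evaluator re-check individual samples. Below is a minimal, self-contained sketch of that pattern, with a toy exact-match rule standing in for KOR-Bench's evaluate_responses; the class name and matching rule are invented for illustration:

```python
# Illustrative only: a toy evaluator showing the score()-over-sample_score()
# refactor; the matching rule and class name are not OpenCompass code.
from typing import Any, Dict, List, Optional


class PerSampleEvaluator:

    def sample_score(self, prediction: str, reference: str,
                     test_item: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        # Toy rule: exact match after stripping whitespace.
        is_correct = prediction.strip() == reference.strip()
        return {'is_correct': is_correct, 'correct': is_correct,
                'pred': prediction, 'answer': reference}

    def score(self, predictions: List[str], references: List[str],
              test_set: List[Dict[str, Any]]) -> Dict[str, Any]:
        # score() is now just an accuracy-accumulating loop over sample_score().
        details = [self.sample_score(p, r, t)
                   for p, r, t in zip(predictions, references, test_set)]
        correct_count = sum(d['is_correct'] for d in details)
        accuracy = (correct_count / len(predictions)) * 100 if predictions else 0
        return {'accuracy': accuracy, 'details': details}


if __name__ == '__main__':
    ev = PerSampleEvaluator()
    print(ev.score(['42', 'B'], ['42', 'A'], [{}, {}]))
    # {'accuracy': 50.0, 'details': [...]}
```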
@@ -34,7 +34,8 @@ class CascadeEvaluator(BaseEvaluator):
sample_score_fn: Optional[Callable] = None,
parallel: bool = True,
) -> None:
self.logger = get_logger()
super().__init__()
self.logger = get_logger(__name__)

# Initialize the LLM evaluator
llm_evaluator_type = llm_evaluator.pop('type')

@@ -58,7 +59,10 @@ class CascadeEvaluator(BaseEvaluator):
raise ValueError(
'Either rule_evaluator or sample_score_fn must be provided')

def sample_score(self, prediction: str, reference: str) -> Dict[str, Any]:
def sample_score(self,
prediction: str,
reference: str,
test_set=None) -> Dict[str, Any]:
"""Score a single sample using sample_score_fn or rule_evaluator.

Args:

@@ -70,7 +74,7 @@ class CascadeEvaluator(BaseEvaluator):
"""
if self.sample_score_fn:
# Use user-provided function to evaluate a single sample
result = self.sample_score_fn(prediction, reference)
result = self.sample_score_fn(prediction, reference, test_set)
if not isinstance(result, dict):
# Ensure result is a dictionary with at least 'correct' field
result = {

@@ -82,7 +86,8 @@ class CascadeEvaluator(BaseEvaluator):
else:
# Use rule_evaluator to evaluate a single sample by calling
# the score method with single-element lists
result = self.rule_evaluator.score([prediction], [reference])
result = self.rule_evaluator.score([prediction], [reference],
[test_set])
if 'details' in result and len(result['details']) > 0:
return result['details'][0]
else:

@@ -137,7 +142,11 @@ class CascadeEvaluator(BaseEvaluator):
failed_indices = []

for i, (pred, ref) in enumerate(zip(predictions, references)):
result = self.sample_score(pred, ref)
if test_set is not None:
test_item = test_set[i]
else:
test_item = None
result = self.sample_score(pred, ref, test_item)
result['evaluation_method'] = 'rule'
details.append({'rule_evaluation': result})

@@ -182,8 +191,9 @@ class CascadeEvaluator(BaseEvaluator):
self.llm_evaluator._out_dir = f'{self._out_dir}_llm_judge'

# Generate random hash suffix
llm_results_path = f'{self.llm_evaluator._out_dir}_replica{self.dataset_replica_idx}' # noqa

llm_results_path = f'{self.llm_evaluator._out_dir}_replica{self.dataset_replica_idx}.json' # noqa
self.logger.info(f'LLM evaluation results will be saved at '
f'{llm_results_path}')
# Check if results already exist to avoid re-evaluation
if os.path.exists(llm_results_path):
self.logger.info(

@@ -214,7 +224,9 @@ class CascadeEvaluator(BaseEvaluator):
# Use GenericLLMEvaluator to evaluate samples
# unset dataset_cfg for GenericLLMEvaluator to
# directly use test_set
self.llm_evaluator.output_path = llm_results_path
# self.llm_evaluator.output_path = llm_results_path
self.llm_evaluator._dataset_replica_idx = \
self._dataset_replica_idx
self.llm_evaluator.dataset_cfg = None

llm_results = self.llm_evaluator.score(
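These hunks thread the per-sample test item through CascadeEvaluator.sample_score into either the user-supplied sample_score_fn or the rule evaluator's score. A user-provided scoring function can therefore use dataset fields in addition to the prediction/reference pair. A minimal sketch of such a function follows; the field name `prompt_mode` is taken from the KOR-Bench items above, while the comparison rules are invented for illustration:

```python
# Sketch of a sample_score_fn compatible with the new three-argument call;
# the comparison logic is an assumption, not OpenCompass behaviour.
from typing import Any, Dict, Optional


def my_sample_score_fn(prediction: str, reference: str,
                       test_item: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
    mode = (test_item or {}).get('prompt_mode', '0_shot')
    if mode == 'mixed':
        # Hypothetical: compare case-insensitively for mixed-mode questions.
        correct = prediction.strip().lower() == reference.strip().lower()
    else:
        correct = prediction.strip() == reference.strip()
    return {'correct': correct, 'pred': prediction, 'answer': reference}
```

Such a function would be passed as `sample_score_fn` when constructing the CascadeEvaluator; judging by the `failed_indices` bookkeeping above, samples it marks incorrect are the ones forwarded to the LLM judge when `parallel` is disabled.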
@@ -15,6 +15,8 @@ from opencompass.registry import (DICT_POSTPROCESSORS, ICL_PROMPT_TEMPLATES,
from opencompass.utils import build_dataset_from_cfg, build_model_from_cfg
from opencompass.utils.logging import get_logger

logger = get_logger(__name__)


class GenericLLMEvaluator(BaseEvaluator):
"""Generic LLM evaluator.

@@ -37,8 +39,7 @@ class GenericLLMEvaluator(BaseEvaluator):
dict_postprocessor: Optional[ConfigDict] = None,
keep_predictions: bool = False,
) -> None:

self.logger = get_logger()
super().__init__(pred_postprocessor=pred_postprocessor)
# If judge_cfg is not provided, fall back to the default configuration
if not judge_cfg:
self.judge_cfg = self.default_judge_cfg

@@ -57,14 +58,12 @@ class GenericLLMEvaluator(BaseEvaluator):

def build_inferencer(self):
"""Build LLM Inference."""
if not self.output_path:
self.output_path = self._out_dir

self.output_path = f'{self._out_dir}_replica{self.dataset_replica_idx}.json' # noqa
logger.info(f'LLM judge details will be saved at:{self.output_path}')
out_dir, out_name = osp.split(self.output_path)
out_name = f'{out_name}.json'
self.output_path = osp.join(out_dir, out_name)

self.logger.info(
logger.info(
f'Set self.output_path to {self.output_path} for current task')
assert self.output_path is not None, 'output_path is None'

@@ -194,7 +193,7 @@ class GenericLLMEvaluator(BaseEvaluator):
@property
def default_judge_cfg(self):
from opencompass.models import OpenAISDK
self.logger.info('Please set your judge model in `OC_JUDGE_MODEL`, \
logger.info('Please set your judge model in `OC_JUDGE_MODEL`, \
`OC_JUDGE_API_KEY`, `OC_JUDGE_API_BASE` environment variables.')
DEFAULT_JUDGE_CFG = dict(
type=OpenAISDK,
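The log message in default_judge_cfg points at the three environment variables OpenCompass reads for the fallback judge model. As a reminder of how one might set them, with placeholder values only (the model name and endpoint below are examples, not defaults from this commit):

```python
# Placeholder values; export these (or set them in Python before the run)
# so GenericLLMEvaluator.default_judge_cfg can build an OpenAISDK judge.
import os

os.environ['OC_JUDGE_MODEL'] = 'Qwen2.5-72B-Instruct'         # example judge model name
os.environ['OC_JUDGE_API_KEY'] = 'sk-xxxx'                    # your API key
os.environ['OC_JUDGE_API_BASE'] = 'http://localhost:8000/v1'  # OpenAI-compatible endpoint
```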
@@ -9,6 +9,9 @@ from datasets import Dataset
from scipy.stats import hypergeom

from opencompass.registry import TEXT_POSTPROCESSORS
from opencompass.utils.logging import get_logger

logger = get_logger(__name__)


def compute_pass_at_k(n, c, k):

@@ -43,6 +46,7 @@ class BaseEvaluator:

def __init__(self, pred_postprocessor=None) -> None:
self.pred_postprocessor = pred_postprocessor
self._dataset_replica_idx = 0 # Default value for dataset_replica_idx

@property
def output_dir(self):

@@ -117,6 +121,7 @@ class BaseEvaluator:
all_results = []
for i in range(n):
self._dataset_replica_idx = i
logger.info(f'Running {i}-th replica of evaluation')

def select_fn(i, real_size, x):
if isinstance(x, Dataset):
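The new _dataset_replica_idx field and the per-replica log line support evaluating a dataset n times while letting subclasses such as GenericLLMEvaluator name per-replica output files. A small stand-in sketch of the looping pattern; the class and scoring callback are invented for illustration, and the real loop lives in BaseEvaluator.evaluate, which also repeats and slices the dataset:

```python
# Toy illustration of replica bookkeeping; not the OpenCompass implementation.
from typing import Callable, Dict, List


class ToyEvaluator:

    def __init__(self) -> None:
        self._dataset_replica_idx = 0

    @property
    def dataset_replica_idx(self) -> int:
        return self._dataset_replica_idx

    def evaluate(self, n: int, score_once: Callable[[], Dict]) -> List[Dict]:
        all_results = []
        for i in range(n):
            # Visible to anything that derives per-replica output paths.
            self._dataset_replica_idx = i
            print(f'Running {i}-th replica of evaluation')
            all_results.append(score_once())
        return all_results


if __name__ == '__main__':
    ev = ToyEvaluator()
    ev.evaluate(2, lambda: {'accuracy': 100.0})
```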