mirror of https://github.com/open-compass/opencompass.git
Commit 4c2e66d335 ("Update"), parent 7605cc2ca4
@@ -10,6 +10,7 @@ Setting:
 Avaliable Models:
 - Instruct/Chat Models
 """
+from datasets import parallel
 from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
@@ -114,6 +115,7 @@ for category in categories:
                 judge_cfg=dict(),
                 dict_postprocessor=dict(type=generic_llmjudge_postprocess),
             ),
+            parallel=False,
         )
     )
 
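For orientation, `parallel=False` here is forwarded to CascadeEvaluator's constructor (its `parallel: bool = True` parameter appears in a hunk further down). A minimal sketch of how such an eval config block is typically laid out; the variable name, the import paths, and the surrounding keys are assumptions for illustration and are not taken from this diff:

# Illustrative sketch only; import paths and nesting are assumed, not copied from this diff.
from opencompass.datasets import generic_llmjudge_postprocess  # assumed import path
from opencompass.datasets.korbench.korbench import korbenchEvaluator
from opencompass.evaluator import CascadeEvaluator, GenericLLMEvaluator  # assumed import path

korbench_eval_cfg = dict(
    evaluator=dict(
        type=CascadeEvaluator,
        rule_evaluator=dict(type=korbenchEvaluator),
        llm_evaluator=dict(
            type=GenericLLMEvaluator,
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
        parallel=False,
    ),
)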
@@ -173,44 +173,76 @@ class korbenchEvaluator(BaseEvaluator):
     def __init__(self):
         super().__init__()
 
-    def score(self, predictions, references, test_set):
-        """Evaluate predictions for a single prompt_mode in KOR-Bench."""
-        if not test_set:
-            raise ValueError('Test set is empty.')
-
-        prompt_mode = test_set[0][
-            'prompt_mode']  # Determine the prompt_mode from the first entry
-        data = {}
-
-        # Organize data for the given prompt_mode
-        for i in range(len(predictions)):
-            entry = {
-                'prediction': predictions[i],
-                'gold': references[i],
-                'rule_id': test_set[i].get('rule_id', None),
-                'category': test_set[i].get('category', None),
-                'rule_list': test_set[i].get('rule_list', None),
-                'question_list': test_set[i].get('question_list', None),
-                'base_path': test_set[i].get('base_path', None),
-            }
-            data[i] = entry
-
-        if not data:
-            raise ValueError(f"No data found for prompt_mode '{prompt_mode}'")
-
-        # Evaluate based on the prompt_mode
-        if prompt_mode == '0_shot':
-            evaluation_results = evaluate_responses(data, '0_shot')
-        elif prompt_mode == '3_shot':
-            evaluation_results = evaluate_responses(data, '3_shot')
-        elif prompt_mode in ['Multi-Q', 'Multi-R', 'Multi-RQ', 'mixed']:
-            evaluation_results = evaluate_responses(data, 'mixed',
-                                                    test_set[0]['base_path'])
-        else:
-            raise ValueError(f'Unsupported prompt_mode: {prompt_mode}')
-        # Calculate accuracy
-        correct_count = sum(res['is_correct'] for res in evaluation_results)
-        accuracy = (correct_count / len(evaluation_results)) * 100
-
-        # Return scores
-        return {'accuracy': accuracy}
+    def sample_score(self, prediction, reference, test_item=None):
+        """Evaluate a single sample.
+
+        Args:
+            prediction: The model's prediction.
+            reference: The reference answer.
+            test_item: Additional metadata for the test sample.
+
+        Returns:
+            Dict: A dictionary containing the evaluation result.
+        """
+        if test_item is None:
+            raise ValueError('Test item is required.')
+
+        prompt_mode = test_item.get('prompt_mode')
+
+        # Build the data entry for a single sample
+        entry = {
+            'prediction': prediction,
+            'gold': reference,
+            'rule_id': test_item.get('rule_id', None),
+            'category': test_item.get('category', None),
+            'rule_list': test_item.get('rule_list', None),
+            'question_list': test_item.get('question_list', None),
+            'base_path': test_item.get('base_path', None),
+        }
+
+        # Evaluate the single sample
+        data = {0: entry}
+
+        # Dispatch evaluation according to the prompt_mode
+        if prompt_mode == '0_shot':
+            evaluation_results = evaluate_responses(data, '0_shot')
+        elif prompt_mode == '3_shot':
+            evaluation_results = evaluate_responses(data, '3_shot')
+        elif prompt_mode in ['Multi-Q', 'Multi-R', 'Multi-RQ', 'mixed']:
+            evaluation_results = evaluate_responses(data, 'mixed',
+                                                    test_item.get('base_path'))
+        else:
+            return {
+                'is_correct': False,
+                'pred': prediction,
+                'answer': reference
+            }
+
+        # Return the evaluation result
+        result = evaluation_results[0]
+        result['correct'] = result['is_correct']
+        result.update({'pred': prediction, 'answer': reference})
+        return result
+
+    def score(self, predictions, references, test_set):
+        """Evaluate each sample via sample_score."""
+        if not test_set:
+            raise ValueError('Test set is empty.')
+
+        details = []
+        correct_count = 0
+
+        # Call sample_score on each sample
+        for i in range(len(predictions)):
+            result = self.sample_score(predictions[i], references[i],
+                                       test_set[i])
+            details.append(result)
+            if result.get('is_correct', False):
+                correct_count += 1
+
+        # Compute accuracy
+        accuracy = (correct_count /
+                    len(predictions)) * 100 if predictions else 0
+
+        # Return the evaluation results
+        return {'accuracy': accuracy, 'details': details}
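The net effect of this hunk is that score() becomes a thin aggregation loop over the new per-sample sample_score(). A self-contained stub that illustrates just that contract (the class name and the exact-match check are illustrative and stand in for KOR-Bench's real rule evaluation):

# Illustrative stub, not OpenCompass code: demonstrates the score()/sample_score() contract.
class _StubEvaluator:

    def sample_score(self, prediction, reference, test_item=None):
        # A real evaluator would consult test_item (prompt_mode, rule_list, base_path, ...).
        is_correct = prediction.strip() == reference.strip()
        return {'is_correct': is_correct, 'pred': prediction, 'answer': reference}

    def score(self, predictions, references, test_set):
        details = [
            self.sample_score(p, r, t)
            for p, r, t in zip(predictions, references, test_set)
        ]
        correct = sum(d['is_correct'] for d in details)
        accuracy = correct / len(predictions) * 100 if predictions else 0
        return {'accuracy': accuracy, 'details': details}


print(_StubEvaluator().score(['A', 'B'], ['A', 'C'], [{}, {}]))
# {'accuracy': 50.0, 'details': [...]}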
@@ -34,7 +34,8 @@ class CascadeEvaluator(BaseEvaluator):
         sample_score_fn: Optional[Callable] = None,
         parallel: bool = True,
     ) -> None:
-        self.logger = get_logger()
+        super().__init__()
+        self.logger = get_logger(__name__)
 
         # Initialize the LLM evaluator
         llm_evaluator_type = llm_evaluator.pop('type')
@@ -58,7 +59,10 @@ class CascadeEvaluator(BaseEvaluator):
             raise ValueError(
                 'Either rule_evaluator or sample_score_fn must be provided')
 
-    def sample_score(self, prediction: str, reference: str) -> Dict[str, Any]:
+    def sample_score(self,
+                     prediction: str,
+                     reference: str,
+                     test_set=None) -> Dict[str, Any]:
         """Score a single sample using sample_score_fn or rule_evaluator.
 
         Args:
@@ -70,7 +74,7 @@ class CascadeEvaluator(BaseEvaluator):
         """
         if self.sample_score_fn:
             # Use user-provided function to evaluate a single sample
-            result = self.sample_score_fn(prediction, reference)
+            result = self.sample_score_fn(prediction, reference, test_set)
             if not isinstance(result, dict):
                 # Ensure result is a dictionary with at least 'correct' field
                 result = {
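Since sample_score_fn is now always called with a third positional argument, user-supplied scoring functions should accept the per-sample test item; a hypothetical example (the function name and the fields it reads are illustrative):

# Hypothetical user-supplied scorer; the third argument receives the per-sample test item.
def my_sample_score_fn(prediction, reference, test_item=None):
    category = (test_item or {}).get('category', 'unknown')
    return {
        'correct': prediction.strip() == reference.strip(),
        'pred': prediction,
        'answer': reference,
        'category': category,
    }

Note that a callback accepting only two arguments would now raise a TypeError, since the third argument is always passed.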
@@ -82,7 +86,8 @@ class CascadeEvaluator(BaseEvaluator):
         else:
             # Use rule_evaluator to evaluate a single sample by calling
             # the score method with single-element lists
-            result = self.rule_evaluator.score([prediction], [reference])
+            result = self.rule_evaluator.score([prediction], [reference],
+                                               [test_set])
             if 'details' in result and len(result['details']) > 0:
                 return result['details'][0]
             else:
@@ -137,7 +142,11 @@ class CascadeEvaluator(BaseEvaluator):
         failed_indices = []
 
         for i, (pred, ref) in enumerate(zip(predictions, references)):
-            result = self.sample_score(pred, ref)
+            if test_set is not None:
+                test_item = test_set[i]
+            else:
+                test_item = None
+            result = self.sample_score(pred, ref, test_item)
             result['evaluation_method'] = 'rule'
             details.append({'rule_evaluation': result})
 
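The guard above can also be written as a single conditional expression; behaviorally identical and shown only as a compact alternative:

# Equivalent one-line form of the test_item selection above.
test_item = test_set[i] if test_set is not None else None
result = self.sample_score(pred, ref, test_item)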
@@ -182,8 +191,9 @@ class CascadeEvaluator(BaseEvaluator):
             self.llm_evaluator._out_dir = f'{self._out_dir}_llm_judge'
 
             # Generate random hash suffix
-            llm_results_path = f'{self.llm_evaluator._out_dir}_replica{self.dataset_replica_idx}'  # noqa
+            llm_results_path = f'{self.llm_evaluator._out_dir}_replica{self.dataset_replica_idx}.json'  # noqa
+            self.logger.info(f'LLM evaluation results will be saved at '
+                             f'{llm_results_path}')
             # Check if results already exist to avoid re-evaluation
             if os.path.exists(llm_results_path):
                 self.logger.info(
@@ -214,7 +224,9 @@ class CascadeEvaluator(BaseEvaluator):
             # Use GenericLLMEvaluator to evaluate samples
             # unset dataset_cfg for GenericLLMEvaluator to
             # directly use test_set
-            self.llm_evaluator.output_path = llm_results_path
+            # self.llm_evaluator.output_path = llm_results_path
+            self.llm_evaluator._dataset_replica_idx = \
+                self._dataset_replica_idx
             self.llm_evaluator.dataset_cfg = None
 
             llm_results = self.llm_evaluator.score(
@@ -15,6 +15,8 @@ from opencompass.registry import (DICT_POSTPROCESSORS, ICL_PROMPT_TEMPLATES,
 from opencompass.utils import build_dataset_from_cfg, build_model_from_cfg
 from opencompass.utils.logging import get_logger
 
+logger = get_logger(__name__)
+
 
 class GenericLLMEvaluator(BaseEvaluator):
     """Generic LLM evaluator.
@@ -37,8 +39,7 @@ class GenericLLMEvaluator(BaseEvaluator):
         dict_postprocessor: Optional[ConfigDict] = None,
         keep_predictions: bool = False,
     ) -> None:
-        self.logger = get_logger()
-
+        super().__init__(pred_postprocessor=pred_postprocessor)
         # If judge_cfg is not provided, fall back to the default configuration
         if not judge_cfg:
             self.judge_cfg = self.default_judge_cfg
@@ -57,14 +58,12 @@ class GenericLLMEvaluator(BaseEvaluator):
 
     def build_inferencer(self):
         """Build LLM Inference."""
-        if not self.output_path:
-            self.output_path = self._out_dir
 
+        self.output_path = f'{self._out_dir}_replica{self.dataset_replica_idx}.json'  # noqa
+        logger.info(f'LLM judge details will be saved at:{self.output_path}')
         out_dir, out_name = osp.split(self.output_path)
-        out_name = f'{out_name}.json'
-        self.output_path = osp.join(out_dir, out_name)
 
-        self.logger.info(
+        logger.info(
             f'Set self.output_path to {self.output_path} for current task')
         assert self.output_path is not None, 'output_path is None'
 
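The replica-aware output path introduced above is an f-string over the evaluator's working directory and the current replica index; a tiny sketch with made-up values:

# Made-up values, only to show the resulting file name.
_out_dir = 'outputs/default/20250530/results/korbench'
dataset_replica_idx = 1
output_path = f'{_out_dir}_replica{dataset_replica_idx}.json'
print(output_path)  # outputs/default/20250530/results/korbench_replica1.json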
@@ -194,7 +193,7 @@ class GenericLLMEvaluator(BaseEvaluator):
     @property
     def default_judge_cfg(self):
         from opencompass.models import OpenAISDK
-        self.logger.info('Please set your judge model in `OC_JUDGE_MODEL`, \
+        logger.info('Please set your judge model in `OC_JUDGE_MODEL`, \
             `OC_JUDGE_API_KEY`, `OC_JUDGE_API_BASE` environment variables.')
         DEFAULT_JUDGE_CFG = dict(
             type=OpenAISDK,
@@ -9,6 +9,9 @@ from datasets import Dataset
 from scipy.stats import hypergeom
 
 from opencompass.registry import TEXT_POSTPROCESSORS
+from opencompass.utils.logging import get_logger
+
+logger = get_logger(__name__)
 
 
 def compute_pass_at_k(n, c, k):
@@ -43,6 +46,7 @@ class BaseEvaluator:
 
     def __init__(self, pred_postprocessor=None) -> None:
         self.pred_postprocessor = pred_postprocessor
+        self._dataset_replica_idx = 0  # Default value for dataset_replica_idx
 
     @property
     def output_dir(self):
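Other hunks in this commit read self.dataset_replica_idx (without the underscore) when building output paths, so BaseEvaluator presumably exposes the new attribute through a property. That accessor is not part of this diff; the sketch below is only an assumption of what it likely looks like:

# Assumed accessor, not shown in this diff.
@property
def dataset_replica_idx(self):
    return self._dataset_replica_idx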
@@ -117,6 +121,7 @@ class BaseEvaluator:
         all_results = []
         for i in range(n):
             self._dataset_replica_idx = i
+            logger.info(f'Running {i}-th replica of evaluation')
 
             def select_fn(i, real_size, x):
                 if isinstance(x, Dataset):