From 4c2e66d335861c879fecf43d67d3b0e6e3b28123 Mon Sep 17 00:00:00 2001
From: zhangsongyang
Date: Wed, 30 Apr 2025 08:50:25 +0000
Subject: [PATCH] Add sample-level scoring for KOR-Bench cascade evaluation

---
 ...ch_single_0shot_cascade_eval_gen_56cf43.py |  1 +
 opencompass/datasets/korbench/korbench.py     | 90 +++++++++++++------
 opencompass/evaluator/cascade_evaluator.py    | 28 ++++--
 .../evaluator/generic_llm_evaluator.py        | 15 ++--
 .../icl_evaluator/icl_base_evaluator.py       |  5 ++
 5 files changed, 94 insertions(+), 45 deletions(-)

diff --git a/opencompass/configs/datasets/korbench/korbench_single_0shot_cascade_eval_gen_56cf43.py b/opencompass/configs/datasets/korbench/korbench_single_0shot_cascade_eval_gen_56cf43.py
index cdf66e17..50f4f15f 100644
--- a/opencompass/configs/datasets/korbench/korbench_single_0shot_cascade_eval_gen_56cf43.py
+++ b/opencompass/configs/datasets/korbench/korbench_single_0shot_cascade_eval_gen_56cf43.py
@@ -114,6 +114,7 @@ for category in categories:
             judge_cfg=dict(),
             dict_postprocessor=dict(type=generic_llmjudge_postprocess),
         ),
+        parallel=False,
     )
 )
diff --git a/opencompass/datasets/korbench/korbench.py b/opencompass/datasets/korbench/korbench.py
index 856c844e..d4a216a9 100644
--- a/opencompass/datasets/korbench/korbench.py
+++ b/opencompass/datasets/korbench/korbench.py
@@ -173,44 +173,76 @@ class korbenchEvaluator(BaseEvaluator):
 
     def __init__(self):
         super().__init__()
 
-    def score(self, predictions, references, test_set):
-        """Evaluate predictions for a single prompt_mode in KOR-Bench."""
-        if not test_set:
-            raise ValueError('Test set is empty.')
+    def sample_score(self, prediction, reference, test_item=None):
+        """Score a single KOR-Bench sample.
 
-        prompt_mode = test_set[0][
-            'prompt_mode']  # Determine the prompt_mode from the first entry
-        data = {}
+        Args:
+            prediction: The model prediction for this sample.
+            reference: The reference answer.
+            test_item: The remaining fields of the test sample.
 
-        # Organize data for the given prompt_mode
-        for i in range(len(predictions)):
-            entry = {
-                'prediction': predictions[i],
-                'gold': references[i],
-                'rule_id': test_set[i].get('rule_id', None),
-                'category': test_set[i].get('category', None),
-                'rule_list': test_set[i].get('rule_list', None),
-                'question_list': test_set[i].get('question_list', None),
-                'base_path': test_set[i].get('base_path', None),
-            }
-            data[i] = entry
+        Returns:
+            Dict: A dict containing the evaluation result.
+        """
+        if test_item is None:
+            raise ValueError('Test item is required.')
 
-        if not data:
-            raise ValueError(f"No data found for prompt_mode '{prompt_mode}'")
+        prompt_mode = test_item.get('prompt_mode')
 
-        # Evaluate based on the prompt_mode
+        # Build the data entry for this single sample
+        entry = {
+            'prediction': prediction,
+            'gold': reference,
+            'rule_id': test_item.get('rule_id', None),
+            'category': test_item.get('category', None),
+            'rule_list': test_item.get('rule_list', None),
+            'question_list': test_item.get('question_list', None),
+            'base_path': test_item.get('base_path', None),
+        }
+
+        # Package the single sample for evaluation
+        data = {0: entry}
+
+        # Dispatch evaluation according to the prompt_mode
         if prompt_mode == '0_shot':
             evaluation_results = evaluate_responses(data, '0_shot')
         elif prompt_mode == '3_shot':
             evaluation_results = evaluate_responses(data, '3_shot')
         elif prompt_mode in ['Multi-Q', 'Multi-R', 'Multi-RQ', 'mixed']:
             evaluation_results = evaluate_responses(data, 'mixed',
-                                                    test_set[0]['base_path'])
+                                                    test_item.get('base_path'))
         else:
-            raise ValueError(f'Unsupported prompt_mode: {prompt_mode}')
-        # Calculate accuracy
-        correct_count = sum(res['is_correct'] for res in evaluation_results)
-        accuracy = (correct_count / len(evaluation_results)) * 100
+            return {
+                'is_correct': False,
+                'pred': prediction,
+                'answer': reference
+            }
 
-        # Return scores
-        return {'accuracy': accuracy}
+        # Return the evaluation result for this sample
+        result = evaluation_results[0]
+        result['correct'] = result['is_correct']
+        result.update({'pred': prediction, 'answer': reference})
+        return result
+
+    def score(self, predictions, references, test_set):
+        """Evaluate every sample with sample_score."""
+        if not test_set:
+            raise ValueError('Test set is empty.')
+
+        details = []
+        correct_count = 0
+
+        # Call sample_score on each sample
+        for i in range(len(predictions)):
+            result = self.sample_score(predictions[i], references[i],
+                                       test_set[i])
+            details.append(result)
+            if result.get('is_correct', False):
+                correct_count += 1
+
+        # Calculate accuracy
+        accuracy = (correct_count /
+                    len(predictions)) * 100 if predictions else 0
+
+        # Return overall accuracy and per-sample details
+        return {'accuracy': accuracy, 'details': details}
diff --git a/opencompass/evaluator/cascade_evaluator.py b/opencompass/evaluator/cascade_evaluator.py
index 8d86fe1b..62d11192 100644
--- a/opencompass/evaluator/cascade_evaluator.py
+++ b/opencompass/evaluator/cascade_evaluator.py
@@ -34,7 +34,8 @@ class CascadeEvaluator(BaseEvaluator):
         sample_score_fn: Optional[Callable] = None,
         parallel: bool = True,
     ) -> None:
-        self.logger = get_logger()
+        super().__init__()
+        self.logger = get_logger(__name__)
 
         # Initialize the LLM evaluator
         llm_evaluator_type = llm_evaluator.pop('type')
@@ -58,7 +59,10 @@ class CascadeEvaluator(BaseEvaluator):
             raise ValueError(
                 'Either rule_evaluator or sample_score_fn must be provided')
 
-    def sample_score(self, prediction: str, reference: str) -> Dict[str, Any]:
+    def sample_score(self,
+                     prediction: str,
+                     reference: str,
+                     test_set=None) -> Dict[str, Any]:
         """Score a single sample using sample_score_fn or rule_evaluator.
 
         Args:
@@ -70,7 +74,7 @@ class CascadeEvaluator(BaseEvaluator):
         """
         if self.sample_score_fn:
             # Use user-provided function to evaluate a single sample
-            result = self.sample_score_fn(prediction, reference)
+            result = self.sample_score_fn(prediction, reference, test_set)
             if not isinstance(result, dict):
                 # Ensure result is a dictionary with at least 'correct' field
                 result = {
@@ -82,7 +86,8 @@ class CascadeEvaluator(BaseEvaluator):
         else:
             # Use rule_evaluator to evaluate a single sample by calling
             # the score method with single-element lists
-            result = self.rule_evaluator.score([prediction], [reference])
+            result = self.rule_evaluator.score([prediction], [reference],
+                                               [test_set])
             if 'details' in result and len(result['details']) > 0:
                 return result['details'][0]
             else:
@@ -137,7 +142,11 @@ class CascadeEvaluator(BaseEvaluator):
         failed_indices = []
 
         for i, (pred, ref) in enumerate(zip(predictions, references)):
-            result = self.sample_score(pred, ref)
+            if test_set is not None:
+                test_item = test_set[i]
+            else:
+                test_item = None
+            result = self.sample_score(pred, ref, test_item)
             result['evaluation_method'] = 'rule'
             details.append({'rule_evaluation': result})
 
@@ -182,8 +191,9 @@ class CascadeEvaluator(BaseEvaluator):
             self.llm_evaluator._out_dir = f'{self._out_dir}_llm_judge'
 
             # Generate random hash suffix
-            llm_results_path = f'{self.llm_evaluator._out_dir}_replica{self.dataset_replica_idx}'  # noqa
-
+            llm_results_path = f'{self.llm_evaluator._out_dir}_replica{self.dataset_replica_idx}.json'  # noqa
+            self.logger.info(f'LLM evaluation results will be saved at '
+                             f'{llm_results_path}')
             # Check if results already exist to avoid re-evaluation
             if os.path.exists(llm_results_path):
                 self.logger.info(
@@ -214,7 +224,9 @@ class CascadeEvaluator(BaseEvaluator):
                 # Use GenericLLMEvaluator to evaluate samples
                 # unset dataset_cfg for GenericLLMEvaluator to
                 # directly use test_set
-                self.llm_evaluator.output_path = llm_results_path
+                # self.llm_evaluator.output_path = llm_results_path
+                self.llm_evaluator._dataset_replica_idx = \
+                    self._dataset_replica_idx
                 self.llm_evaluator.dataset_cfg = None
                 llm_results = self.llm_evaluator.score(
diff --git a/opencompass/evaluator/generic_llm_evaluator.py b/opencompass/evaluator/generic_llm_evaluator.py
index 4c101b34..9246edba 100644
--- a/opencompass/evaluator/generic_llm_evaluator.py
+++ b/opencompass/evaluator/generic_llm_evaluator.py
@@ -15,6 +15,8 @@ from opencompass.registry import (DICT_POSTPROCESSORS, ICL_PROMPT_TEMPLATES,
 from opencompass.utils import build_dataset_from_cfg, build_model_from_cfg
 from opencompass.utils.logging import get_logger
 
+logger = get_logger(__name__)
+
 
 class GenericLLMEvaluator(BaseEvaluator):
     """Generic LLM evaluator.
@@ -37,8 +39,7 @@ class GenericLLMEvaluator(BaseEvaluator):
         dict_postprocessor: Optional[ConfigDict] = None,
         keep_predictions: bool = False,
     ) -> None:
-
-        self.logger = get_logger()
+        super().__init__(pred_postprocessor=pred_postprocessor)
         # If judge_cfg is not provided, fall back to the default configuration
         if not judge_cfg:
             self.judge_cfg = self.default_judge_cfg
@@ -57,14 +58,12 @@ class GenericLLMEvaluator(BaseEvaluator):
 
     def build_inferencer(self):
         """Build LLM Inference."""
-        if not self.output_path:
-            self.output_path = self._out_dir
+        self.output_path = f'{self._out_dir}_replica{self.dataset_replica_idx}.json'  # noqa
+        logger.info(f'LLM judge details will be saved at: {self.output_path}')
 
         out_dir, out_name = osp.split(self.output_path)
-        out_name = f'{out_name}.json'
-        self.output_path = osp.join(out_dir, out_name)
 
-        self.logger.info(
+        logger.info(
             f'Set self.output_path to {self.output_path} for current task')
         assert self.output_path is not None, 'output_path is None'
@@ -194,7 +193,7 @@ class GenericLLMEvaluator(BaseEvaluator):
     @property
     def default_judge_cfg(self):
         from opencompass.models import OpenAISDK
-        self.logger.info('Please set your judge model in `OC_JUDGE_MODEL`, \
+        logger.info('Please set your judge model in `OC_JUDGE_MODEL`, \
             `OC_JUDGE_API_KEY`, `OC_JUDGE_API_BASE` environment variables.')
         DEFAULT_JUDGE_CFG = dict(
             type=OpenAISDK,
diff --git a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
index 1f605d4e..d497f7a8 100644
--- a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
@@ -9,6 +9,9 @@ from datasets import Dataset
 from scipy.stats import hypergeom
 
 from opencompass.registry import TEXT_POSTPROCESSORS
+from opencompass.utils.logging import get_logger
+
+logger = get_logger(__name__)
 
 
 def compute_pass_at_k(n, c, k):
@@ -43,6 +46,7 @@ class BaseEvaluator:
 
     def __init__(self, pred_postprocessor=None) -> None:
         self.pred_postprocessor = pred_postprocessor
+        self._dataset_replica_idx = 0  # Default value for dataset_replica_idx
 
     @property
     def output_dir(self):
@@ -117,6 +121,7 @@ class BaseEvaluator:
         all_results = []
         for i in range(n):
             self._dataset_replica_idx = i
+            logger.info(f'Running {i}-th replica of evaluation')
 
             def select_fn(i, real_size, x):
                 if isinstance(x, Dataset):
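
Usage sketch (illustrative, not part of the patch): the snippet below shows the
per-sample contract this change introduces -- a rule evaluator exposes
sample_score(prediction, reference, test_item) plus an aggregating score(),
mirroring the korbenchEvaluator rewrite above. The class name
ExactMatchSampleEvaluator and its exact-match rule are assumptions made for
illustration only; the import path and the method signatures come from the
files touched by this diff.

from opencompass.openicl.icl_evaluator import BaseEvaluator


class ExactMatchSampleEvaluator(BaseEvaluator):
    """Toy rule evaluator following the sample-level contract above."""

    def sample_score(self, prediction, reference, test_item=None):
        # test_item receives the extra dataset columns (e.g. prompt_mode)
        # that CascadeEvaluator now forwards from test_set.
        is_correct = str(prediction).strip() == str(reference).strip()
        return {
            'correct': is_correct,  # provide both keys used in this patch
            'is_correct': is_correct,
            'pred': prediction,
            'answer': reference,
        }

    def score(self, predictions, references, test_set):
        # Aggregate per-sample results the same way korbenchEvaluator does.
        details = [
            self.sample_score(pred, ref, item)
            for pred, ref, item in zip(predictions, references, test_set)
        ]
        correct = sum(d['is_correct'] for d in details)
        accuracy = (correct / len(predictions)) * 100 if predictions else 0
        return {'accuracy': accuracy, 'details': details}

With parallel=False, as the korbench config above now sets, CascadeEvaluator
runs this rule stage on every sample first, and only the samples the rule stage
marks as failed are forwarded to the LLM judge (see the failed_indices handling
in cascade_evaluator.py).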