From db96161a4eeb0fc5be9b174d05e443215b6a0879 Mon Sep 17 00:00:00 2001
From: Linchen Xiao
Date: Mon, 24 Mar 2025 14:25:12 +0800
Subject: [PATCH] [Update] Add SuperGPQA subset metrics (#1966)

---
 .../supergpqa_llmjudge_gen_12b8bc.py          |   4 +-
 opencompass/datasets/supergpqa/supergpqa.py   | 132 ++++++++++++++++++
 .../evaluator/generic_llm_evaluator.py        |  17 ++-
 .../icl_evaluator/icl_base_evaluator.py       |   8 ++
 4 files changed, 156 insertions(+), 5 deletions(-)

diff --git a/opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py b/opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py
index 02e6f2da..053eda07 100644
--- a/opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py
+++ b/opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py
@@ -1,5 +1,5 @@
 from opencompass.datasets.supergpqa.supergpqa import (
-    SuperGPQADataset,
+    SuperGPQADataset, supergpqa_llmjudge_postprocess
 )
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
@@ -87,7 +87,7 @@ eval_cfg = dict(
             reader_cfg=reader_cfg,
         ),
         judge_cfg=dict(),
-        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        dict_postprocessor=dict(type=supergpqa_llmjudge_postprocess),
     ),
 )
 supergpqa_dataset = dict(
diff --git a/opencompass/datasets/supergpqa/supergpqa.py b/opencompass/datasets/supergpqa/supergpqa.py
index 9dd96dd4..401422e1 100644
--- a/opencompass/datasets/supergpqa/supergpqa.py
+++ b/opencompass/datasets/supergpqa/supergpqa.py
@@ -1,4 +1,5 @@
 import os
+import re
 
 from datasets import Dataset, load_dataset
 
@@ -7,6 +8,7 @@ from opencompass.datasets.supergpqa.supergpqa_eval import (
 from opencompass.datasets.supergpqa.supergpqa_utils import load_yaml
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
+from opencompass.utils import get_logger
 
 from ..base import BaseDataset
 
@@ -180,3 +182,133 @@ class SuperGPQAEvaluator(BaseEvaluator):
             'details': details,
         }
 
+
+
+def _generic_llmjudge_postprocess(judgement: str):
+    match = re.search(r'(A|B)', judgement)
+    grade_letter = (match.group(0) if match else 'B'
+                    )  # Default to "INCORRECT" if no match
+    return grade_letter
+
+
+def supergpqa_llmjudge_postprocess(
+    output: dict,
+    output_path: str,
+    dataset: Dataset,
+) -> dict:
+    # Get the original dataset
+    original_dataset = dataset.reader.dataset['test']
+
+    judged_answers = []
+    original_responses = []
+    references = []
+    details = []
+
+    # Initialize statistics dictionaries
+    stats = {'discipline': {}, 'field': {}, 'subfield': {}}
+
+    total_correct = 0
+    total_count = 0
+
+    # Process each sample
+    for k, v in output.items():
+        idx = int(k)  # Convert key to integer for indexing
+        original_responses.append(v['prediction'])
+        processed_judge = _generic_llmjudge_postprocess(v['prediction'])
+
+        # Get category information from the dataset
+        sample = original_dataset[idx]
+        discipline = sample.get('discipline', 'unknown')
+        field = sample.get('field', 'unknown')
+        subfield = sample.get('subfield', 'unknown')
+
+        # Initialize category stats if not exists
+        for level, key in [
+            ('discipline', discipline),
+            ('field', f'{discipline}/{field}'),
+            ('subfield', f'{discipline}/{field}/{subfield}'),
+        ]:
+            if key not in stats[level]:
+                stats[level][key] = {'correct': 0, 'total': 0}
+
+        # Record the judgment
+        if processed_judge is not None:
+            judged_answers.append(processed_judge)
+            try:
+                gold = v['gold']
+                references.append(gold)
+            except KeyError:
+                get_logger().warning(
+                    f'No gold answer for {k}, use empty string as reference!')
+                gold = ''
+                references.append('')
+
+            # Check if the answer is correct (A means correct)
+            is_correct = processed_judge == 'A'
+            total_count += 1
+
+            if is_correct:
+                total_correct += 1
+                # Update category stats
+                for level, key in [
+                    ('discipline', discipline),
+                    ('field', f'{discipline}/{field}'),
+                    ('subfield', f'{discipline}/{field}/{subfield}'),
+                ]:
+                    stats[level][key]['correct'] += 1
+
+            # Update category totals
+            for level, key in [
+                ('discipline', discipline),
+                ('field', f'{discipline}/{field}'),
+                ('subfield', f'{discipline}/{field}/{subfield}'),
+            ]:
+                stats[level][key]['total'] += 1
+            # Add to details
+            details.append({
+                'id': k,
+                'question': sample['question'],
+                'options': sample['options'],
+                'origin_prompt': v['origin_prompt'],
+                'llm_judge': processed_judge,
+                'gold': gold,
+                'is_correct': is_correct,
+                'discipline': discipline,
+                'field': field,
+                'subfield': subfield,
+            })
+
+    # Calculate overall accuracy with two decimal places
+    overall_accuracy = (round(
+        (total_correct / total_count * 100), 2) if total_count > 0 else 0.00)
+
+    # Initialize results dictionary
+    results = {
+        'accuracy': overall_accuracy,
+        'total_correct': total_correct,
+        'total_count': total_count,
+        'details': details,
+    }
+
+    # Calculate accuracy for each category and flatten into results
+    for level in stats:
+        for key, value in stats[level].items():
+            if value['total'] > 0:
+                # Calculate accuracy with two decimal places
+                accuracy = round((value['correct'] / value['total'] * 100), 2)
+
+                # Create a flattened key for the category
+                flat_key = f'SuperGPQA-{level}'
+                if level == 'discipline':
+                    flat_key = f'SuperGPQA-{key}'
+                elif level == 'field':
+                    discipline, field = key.split('/')
+                    flat_key = f'SuperGPQA-{discipline}-{field}'
+                elif level == 'subfield':
+                    discipline, field, subfield = key.split('/')
+                    flat_key = f'SuperGPQA-{discipline}-{field}-{subfield}'
+
+                # Add to results
+                results[flat_key] = accuracy
+
+    return results
diff --git a/opencompass/evaluator/generic_llm_evaluator.py b/opencompass/evaluator/generic_llm_evaluator.py
index c0b33a69..2b829ba1 100644
--- a/opencompass/evaluator/generic_llm_evaluator.py
+++ b/opencompass/evaluator/generic_llm_evaluator.py
@@ -84,6 +84,8 @@ class GenericLLMEvaluator(BaseEvaluator):
         references: Optional[List] = None,
     ) -> Dict:
         """Apply to single-model scoring."""
+        assert len(predictions) == len(
+            references), 'predictions and references must have the same length'
         # -------------- Build Inferencer ----------------
         self.build_inferencer()
 
@@ -127,7 +129,7 @@ class GenericLLMEvaluator(BaseEvaluator):
             prompt_template=self.prompt_template)
 
         output = mmengine.load(self.output_path)
-        return self.output_postprocess(output)
+        return self.output_postprocess(output, dataset)
 
     def pred_postprocess(self, predictions: List) -> Dict:
         if self.pred_postprocessor is None:
@@ -137,15 +139,24 @@ class GenericLLMEvaluator(BaseEvaluator):
             proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
             return [proc(pred, **kwargs) for pred in predictions]
 
-    def output_postprocess(self, output: Dict) -> Dict:
+    def output_postprocess(self, output: Dict, dataset=None) -> Dict:
         """Postprocess output by adding necessary statistics or data into
         it."""
+        import inspect
+
         if self.dict_postprocessor is None:
             return output
         else:
             kwargs = self.dict_postprocessor
             proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
-            return proc(output, self.output_path, **kwargs)
+            sig = inspect.signature(proc)
+            if 'dataset' in sig.parameters:
+                return proc(output,
+                            self.output_path,
+                            dataset=dataset,
+                            **kwargs)
+            else:
+                return proc(output, self.output_path, **kwargs)
 
     @property
     def default_judge_cfg(self):
diff --git a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
index e2aad9be..794c0ed6 100644
--- a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
@@ -89,6 +89,14 @@ class BaseEvaluator:
         original_dataset: Dataset,
         **score_kwargs,
     ):
+        # Check if predictions and references have the
+        # same length if both are provided
+        if 'predictions' in score_kwargs and 'references' in score_kwargs:
+            if len(score_kwargs['predictions']) != len(
+                    score_kwargs['references']):
+                raise ValueError(
+                    'Predictions and references must have the same length')
+
         real_size = len(original_dataset) // n
         all_details = []
         all_results = []
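
Illustration (not part of the patch): supergpqa_llmjudge_postprocess above flattens the per-category counters into top-level result keys of the form SuperGPQA-{discipline}, SuperGPQA-{discipline}-{field}, and SuperGPQA-{discipline}-{field}-{subfield}. The sketch below reproduces only that flattening step on hand-made data; the category names are hypothetical and the snippet is a simplified equivalent of the logic in the diff, not the code it adds.

# Standalone sketch of the subset-metric flattening (toy data, assumed names).
stats = {
    'discipline': {'Science': {'correct': 8, 'total': 10}},
    'field': {'Science/Physics': {'correct': 5, 'total': 6}},
    'subfield': {'Science/Physics/Optics': {'correct': 3, 'total': 4}},
}

results = {}
for level, categories in stats.items():
    for key, value in categories.items():
        if value['total'] == 0:
            continue
        accuracy = round(value['correct'] / value['total'] * 100, 2)
        # 'Science/Physics' becomes 'SuperGPQA-Science-Physics', and so on.
        results['SuperGPQA-' + key.replace('/', '-')] = accuracy

print(results)
# -> {'SuperGPQA-Science': 80.0,
#     'SuperGPQA-Science-Physics': 83.33,
#     'SuperGPQA-Science-Physics-Optics': 75.0}

Keeping each category as its own flat key means every discipline, field, and subfield accuracy appears alongside the overall accuracy in the result dict without any change to downstream result handling.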
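
Illustration (not part of the patch): the key mechanism added to GenericLLMEvaluator.output_postprocess is signature-based dispatch. The dataset argument is forwarded only when the registered dict_postprocessor declares a dataset parameter, so older postprocessors such as generic_llmjudge_postprocess keep working unchanged. Below is a minimal self-contained sketch of that pattern; legacy_postprocess, subset_postprocess, and dispatch are illustrative names, not OpenCompass APIs.

import inspect
from typing import Callable, Dict, Optional


def legacy_postprocess(output: Dict, output_path: str) -> Dict:
    # Old-style postprocessor: knows nothing about the dataset.
    return {'accuracy': output.get('accuracy', 0.0)}


def subset_postprocess(output: Dict, output_path: str, dataset=None) -> Dict:
    # New-style postprocessor: may read extra metadata from the dataset.
    result = {'accuracy': output.get('accuracy', 0.0)}
    if dataset is not None:
        result['num_subsets'] = len(dataset)
    return result


def dispatch(proc: Callable, output: Dict, output_path: str,
             dataset: Optional[list] = None, **kwargs) -> Dict:
    # Forward the dataset only if the postprocessor accepts it.
    sig = inspect.signature(proc)
    if 'dataset' in sig.parameters:
        return proc(output, output_path, dataset=dataset, **kwargs)
    return proc(output, output_path, **kwargs)


if __name__ == '__main__':
    fake_output = {'accuracy': 87.5}
    print(dispatch(legacy_postprocess, fake_output, 'results.json'))
    print(dispatch(subset_postprocess, fake_output, 'results.json',
                   dataset=['physics', 'law']))

Probing the signature keeps the new dataset argument backward compatible: existing configs need no changes, while supergpqa_llmjudge_postprocess can receive the dataset it needs for per-subset bookkeeping.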