Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

SuperGPQA subset metrics

parent b9de8b0e2b
commit 03531e7a2f
@@ -1,5 +1,5 @@
 from opencompass.datasets.supergpqa.supergpqa import (
-    SuperGPQADataset,
+    SuperGPQADataset, supergpqa_llmjudge_postprocess
 )
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
@@ -87,7 +87,7 @@ eval_cfg = dict(
             reader_cfg=reader_cfg,
         ),
         judge_cfg=dict(),
-        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        dict_postprocessor=dict(type=supergpqa_llmjudge_postprocess),
     ),
 )
 supergpqa_dataset = dict(
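
For orientation, the judge output that the new postprocessor (added later in this commit) consumes is keyed by sample index, with each entry carrying the judge's raw verdict and the gold option letter. A minimal sketch of the assumed shape, with made-up values:

judge_output = {
    '0': {'prediction': 'A', 'gold': 'C', 'origin_prompt': '...'},  # judge verdict A = correct
    '1': {'prediction': 'B', 'gold': 'D', 'origin_prompt': '...'},  # judge verdict B = incorrect
}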
@@ -1,4 +1,5 @@
 import os
+import re

 from datasets import Dataset, load_dataset

@@ -7,6 +8,7 @@ from opencompass.datasets.supergpqa.supergpqa_eval import (
 from opencompass.datasets.supergpqa.supergpqa_utils import load_yaml
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
+from opencompass.utils import get_logger

 from ..base import BaseDataset

@@ -180,3 +182,133 @@ class SuperGPQAEvaluator(BaseEvaluator):
             'details':
             details,
         }
+
+
+def _generic_llmjudge_postprocess(judgement: str):
+    match = re.search(r'(A|B)', judgement)
+    grade_letter = (match.group(0) if match else 'B'
+                    )  # Default to "INCORRECT" if no match
+    return grade_letter
+
+
+def supergpqa_llmjudge_postprocess(
+    output: dict,
+    output_path: str,
+    dataset: Dataset,
+) -> dict:
+    # Get the original dataset
+    original_dataset = dataset.reader.dataset['test']
+
+    judged_answers = []
+    original_responses = []
+    references = []
+    details = []
+
+    # Initialize statistics dictionaries
+    stats = {'discipline': {}, 'field': {}, 'subfield': {}}
+
+    total_correct = 0
+    total_count = 0
+
+    # Process each sample
+    for k, v in output.items():
+        idx = int(k)  # Convert key to integer for indexing
+        original_responses.append(v['prediction'])
+        processed_judge = _generic_llmjudge_postprocess(v['prediction'])
+
+        # Get category information from the dataset
+        sample = original_dataset[idx]
+        discipline = sample.get('discipline', 'unknown')
+        field = sample.get('field', 'unknown')
+        subfield = sample.get('subfield', 'unknown')
+
+        # Initialize category stats if not exists
+        for level, key in [
+            ('discipline', discipline),
+            ('field', f'{discipline}/{field}'),
+            ('subfield', f'{discipline}/{field}/{subfield}'),
+        ]:
+            if key not in stats[level]:
+                stats[level][key] = {'correct': 0, 'total': 0}
+
+        # Record the judgment
+        if processed_judge is not None:
+            judged_answers.append(processed_judge)
+            try:
+                gold = v['gold']
+                references.append(gold)
+            except KeyError:
+                get_logger().warning(
+                    f'No gold answer for {k}, use empty string as reference!')
+                gold = ''
+                references.append('')
+
+            # Check if the answer is correct (A means correct)
+            is_correct = processed_judge == 'A'
+            total_count += 1
+
+            if is_correct:
+                total_correct += 1
+                # Update category stats
+                for level, key in [
+                    ('discipline', discipline),
+                    ('field', f'{discipline}/{field}'),
+                    ('subfield', f'{discipline}/{field}/{subfield}'),
+                ]:
+                    stats[level][key]['correct'] += 1
+
+            # Update category totals
+            for level, key in [
+                ('discipline', discipline),
+                ('field', f'{discipline}/{field}'),
+                ('subfield', f'{discipline}/{field}/{subfield}'),
+            ]:
+                stats[level][key]['total'] += 1
+            # Add to details
+            details.append({
+                'id': k,
+                'question': sample['question'],
+                'options': sample['options'],
+                'origin_prompt': v['origin_prompt'],
+                'llm_judge': processed_judge,
+                'gold': gold,
+                'is_correct': is_correct,
+                'discipline': discipline,
+                'field': field,
+                'subfield': subfield,
+            })
+
+    # Calculate overall accuracy with two decimal places
+    overall_accuracy = (round(
+        (total_correct / total_count * 100), 2) if total_count > 0 else 0.00)
+
+    # Initialize results dictionary
+    results = {
+        'accuracy': overall_accuracy,
+        'total_correct': total_correct,
+        'total_count': total_count,
+        'details': details,
+    }
+
+    # Calculate accuracy for each category and flatten into results
+    for level in stats:
+        for key, value in stats[level].items():
+            if value['total'] > 0:
+                # Calculate accuracy with two decimal places
+                accuracy = round((value['correct'] / value['total'] * 100), 2)

+                # Create a flattened key for the category
+                flat_key = f'SuperGPQA-{level}'
+                if level == 'discipline':
+                    flat_key = f'SuperGPQA-{key}'
+                elif level == 'field':
+                    discipline, field = key.split('/')
+                    flat_key = f'SuperGPQA-{discipline}-{field}'
+                elif level == 'subfield':
+                    discipline, field, subfield = key.split('/')
+                    flat_key = f'SuperGPQA-{discipline}-{field}-{subfield}'
+
+                # Add to results
+                results[flat_key] = accuracy
+
+    return results
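
A minimal, self-contained sketch (not part of the commit) of the flattened per-subset keys the function above emits; the stats values here are made up, and the key scheme mirrors the flat_key logic:

stats = {
    'discipline': {'Science': {'correct': 8, 'total': 10}},
    'field': {'Science/Physics': {'correct': 5, 'total': 6}},
    'subfield': {'Science/Physics/Optics': {'correct': 3, 'total': 3}},
}
results = {}
for level, buckets in stats.items():
    for key, value in buckets.items():
        # Same flattening as above: SuperGPQA-<discipline>[-<field>[-<subfield>]]
        results[f"SuperGPQA-{key.replace('/', '-')}"] = round(
            value['correct'] / value['total'] * 100, 2)
print(results)
# {'SuperGPQA-Science': 80.0, 'SuperGPQA-Science-Physics': 83.33,
#  'SuperGPQA-Science-Physics-Optics': 100.0}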
@@ -84,6 +84,8 @@ class GenericLLMEvaluator(BaseEvaluator):
         references: Optional[List] = None,
     ) -> Dict:
         """Apply to single-model scoring."""
+        assert len(predictions) == len(
+            references), 'predictions and references must have the same length'
         # -------------- Build Inferencer ----------------
         self.build_inferencer()

@@ -127,7 +129,7 @@ class GenericLLMEvaluator(BaseEvaluator):
             prompt_template=self.prompt_template)

         output = mmengine.load(self.output_path)
-        return self.output_postprocess(output)
+        return self.output_postprocess(output, dataset)

     def pred_postprocess(self, predictions: List) -> Dict:
         if self.pred_postprocessor is None:
@@ -137,15 +139,24 @@ class GenericLLMEvaluator(BaseEvaluator):
             proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
             return [proc(pred, **kwargs) for pred in predictions]

-    def output_postprocess(self, output: Dict) -> Dict:
+    def output_postprocess(self, output: Dict, dataset=None) -> Dict:
         """Postprocess output by adding necessary statistics or data into
         it."""
+        import inspect
+
         if self.dict_postprocessor is None:
             return output
         else:
             kwargs = self.dict_postprocessor
             proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
-            return proc(output, self.output_path, **kwargs)
+            sig = inspect.signature(proc)
+            if 'dataset' in sig.parameters:
+                return proc(output,
+                            self.output_path,
+                            dataset=dataset,
+                            **kwargs)
+            else:
+                return proc(output, self.output_path, **kwargs)

     @property
     def default_judge_cfg(self):
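
The inspect.signature check above keeps older dict postprocessors working: dataset is forwarded only when the postprocessor declares that parameter. A standalone sketch of the same dispatch pattern, using hypothetical postprocessor names:

import inspect

def old_style(output, output_path):                # hypothetical legacy postprocessor
    return {'n': len(output)}

def new_style(output, output_path, dataset=None):  # hypothetical dataset-aware postprocessor
    return {'n': len(output), 'dataset_size': len(dataset or [])}

def call_postprocessor(proc, output, output_path, dataset=None):
    # Forward `dataset` only if the callable accepts it, as output_postprocess does above.
    if 'dataset' in inspect.signature(proc).parameters:
        return proc(output, output_path, dataset=dataset)
    return proc(output, output_path)

print(call_postprocessor(old_style, {'0': 'A'}, 'judge.json'))                     # {'n': 1}
print(call_postprocessor(new_style, {'0': 'A'}, 'judge.json', dataset=[1, 2, 3]))  # {'n': 1, 'dataset_size': 3}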
@@ -89,6 +89,14 @@ class BaseEvaluator:
         original_dataset: Dataset,
         **score_kwargs,
     ):
+        # Check if predictions and references have the
+        # same length if both are provided
+        if 'predictions' in score_kwargs and 'references' in score_kwargs:
+            if len(score_kwargs['predictions']) != len(
+                    score_kwargs['references']):
+                raise ValueError(
+                    'Predictions and references must have the same length')
+
         real_size = len(original_dataset) // n
         all_details = []
         all_results = []
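
A tiny demo of the guard added above (simplified and outside the class); it only shows that mismatched lengths now fail fast with a clear message:

def _check_lengths(**score_kwargs):
    # Mirrors the new check: both keys present and lengths differ -> error.
    if 'predictions' in score_kwargs and 'references' in score_kwargs:
        if len(score_kwargs['predictions']) != len(score_kwargs['references']):
            raise ValueError('Predictions and references must have the same length')

try:
    _check_lengths(predictions=['A', 'B', 'C'], references=['A', 'B'])
except ValueError as err:
    print(err)  # Predictions and references must have the same length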