From db96161a4eeb0fc5be9b174d05e443215b6a0879 Mon Sep 17 00:00:00 2001
From: Linchen Xiao
Date: Mon, 24 Mar 2025 14:25:12 +0800
Subject: [PATCH] [Update] Add SuperGPQA subset metrics (#1966)

---
 .../supergpqa_llmjudge_gen_12b8bc.py          |   4 +-
 opencompass/datasets/supergpqa/supergpqa.py   | 132 ++++++++++++++++++
 .../evaluator/generic_llm_evaluator.py        |  17 ++-
 .../icl_evaluator/icl_base_evaluator.py       |   8 ++
 4 files changed, 156 insertions(+), 5 deletions(-)

diff --git a/opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py b/opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py
index 02e6f2da..053eda07 100644
--- a/opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py
+++ b/opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py
@@ -1,5 +1,5 @@
 from opencompass.datasets.supergpqa.supergpqa import (
-    SuperGPQADataset,
+    SuperGPQADataset, supergpqa_llmjudge_postprocess
 )
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_prompt_template import PromptTemplate
@@ -87,7 +87,7 @@ eval_cfg = dict(
             reader_cfg=reader_cfg,
         ),
         judge_cfg=dict(),
-        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        dict_postprocessor=dict(type=supergpqa_llmjudge_postprocess),
     ),
 )
 supergpqa_dataset = dict(
diff --git a/opencompass/datasets/supergpqa/supergpqa.py b/opencompass/datasets/supergpqa/supergpqa.py
index 9dd96dd4..401422e1 100644
--- a/opencompass/datasets/supergpqa/supergpqa.py
+++ b/opencompass/datasets/supergpqa/supergpqa.py
@@ -1,4 +1,5 @@
 import os
+import re
 
 from datasets import Dataset, load_dataset
 
@@ -7,6 +8,7 @@ from opencompass.datasets.supergpqa.supergpqa_eval import (
 from opencompass.datasets.supergpqa.supergpqa_utils import load_yaml
 from opencompass.openicl.icl_evaluator import BaseEvaluator
 from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
+from opencompass.utils import get_logger
 
 from ..base import BaseDataset
 
@@ -180,3 +182,133 @@ class SuperGPQAEvaluator(BaseEvaluator):
             'details': details,
         }
 
+
+
+def _generic_llmjudge_postprocess(judgement: str):
+    match = re.search(r'(A|B)', judgement)
+    grade_letter = (match.group(0) if match else 'B'
+                    )  # Default to "INCORRECT" if no match
+    return grade_letter
+
+
+def supergpqa_llmjudge_postprocess(
+    output: dict,
+    output_path: str,
+    dataset: Dataset,
+) -> dict:
+    # Get the original dataset
+    original_dataset = dataset.reader.dataset['test']
+
+    judged_answers = []
+    original_responses = []
+    references = []
+    details = []
+
+    # Initialize statistics dictionaries
+    stats = {'discipline': {}, 'field': {}, 'subfield': {}}
+
+    total_correct = 0
+    total_count = 0
+
+    # Process each sample
+    for k, v in output.items():
+        idx = int(k)  # Convert key to integer for indexing
+        original_responses.append(v['prediction'])
+        processed_judge = _generic_llmjudge_postprocess(v['prediction'])
+
+        # Get category information from the dataset
+        sample = original_dataset[idx]
+        discipline = sample.get('discipline', 'unknown')
+        field = sample.get('field', 'unknown')
+        subfield = sample.get('subfield', 'unknown')
+
+        # Initialize category stats if not exists
+        for level, key in [
+            ('discipline', discipline),
+            ('field', f'{discipline}/{field}'),
+            ('subfield', f'{discipline}/{field}/{subfield}'),
+        ]:
+            if key not in stats[level]:
+                stats[level][key] = {'correct': 0, 'total': 0}
+
+        # Record the judgment
+        if processed_judge is not None:
+            judged_answers.append(processed_judge)
+            try:
+                gold = v['gold']
+                references.append(gold)
+            except KeyError:
+                get_logger().warning(
+                    f'No gold answer for {k}, use empty string as reference!')
+                gold = ''
+                references.append('')
+
+            # Check if the answer is correct (A means correct)
+            is_correct = processed_judge == 'A'
+            total_count += 1
+
+            if is_correct:
+                total_correct += 1
+                # Update category stats
+                for level, key in [
+                    ('discipline', discipline),
+                    ('field', f'{discipline}/{field}'),
+                    ('subfield', f'{discipline}/{field}/{subfield}'),
+                ]:
+                    stats[level][key]['correct'] += 1
+
+            # Update category totals
+            for level, key in [
+                ('discipline', discipline),
+                ('field', f'{discipline}/{field}'),
+                ('subfield', f'{discipline}/{field}/{subfield}'),
+            ]:
+                stats[level][key]['total'] += 1
+            # Add to details
+            details.append({
+                'id': k,
+                'question': sample['question'],
+                'options': sample['options'],
+                'origin_prompt': v['origin_prompt'],
+                'llm_judge': processed_judge,
+                'gold': gold,
+                'is_correct': is_correct,
+                'discipline': discipline,
+                'field': field,
+                'subfield': subfield,
+            })
+
+    # Calculate overall accuracy with two decimal places
+    overall_accuracy = (round(
+        (total_correct / total_count * 100), 2) if total_count > 0 else 0.00)
+
+    # Initialize results dictionary
+    results = {
+        'accuracy': overall_accuracy,
+        'total_correct': total_correct,
+        'total_count': total_count,
+        'details': details,
+    }
+
+    # Calculate accuracy for each category and flatten into results
+    for level in stats:
+        for key, value in stats[level].items():
+            if value['total'] > 0:
+                # Calculate accuracy with two decimal places
+                accuracy = round((value['correct'] / value['total'] * 100), 2)
+
+                # Create a flattened key for the category
+                flat_key = f'SuperGPQA-{level}'
+                if level == 'discipline':
+                    flat_key = f'SuperGPQA-{key}'
+                elif level == 'field':
+                    discipline, field = key.split('/')
+                    flat_key = f'SuperGPQA-{discipline}-{field}'
+                elif level == 'subfield':
+                    discipline, field, subfield = key.split('/')
+                    flat_key = f'SuperGPQA-{discipline}-{field}-{subfield}'
+
+                # Add to results
+                results[flat_key] = accuracy
+
+    return results
diff --git a/opencompass/evaluator/generic_llm_evaluator.py b/opencompass/evaluator/generic_llm_evaluator.py
index c0b33a69..2b829ba1 100644
--- a/opencompass/evaluator/generic_llm_evaluator.py
+++ b/opencompass/evaluator/generic_llm_evaluator.py
@@ -84,6 +84,8 @@ class GenericLLMEvaluator(BaseEvaluator):
         references: Optional[List] = None,
     ) -> Dict:
         """Apply to single-model scoring."""
+        assert len(predictions) == len(
+            references), 'predictions and references must have the same length'
         # -------------- Build Inferencer ----------------
         self.build_inferencer()
 
@@ -127,7 +129,7 @@ class GenericLLMEvaluator(BaseEvaluator):
             prompt_template=self.prompt_template)
 
         output = mmengine.load(self.output_path)
-        return self.output_postprocess(output)
+        return self.output_postprocess(output, dataset)
 
     def pred_postprocess(self, predictions: List) -> Dict:
         if self.pred_postprocessor is None:
@@ -137,15 +139,24 @@ class GenericLLMEvaluator(BaseEvaluator):
             proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
             return [proc(pred, **kwargs) for pred in predictions]
 
-    def output_postprocess(self, output: Dict) -> Dict:
+    def output_postprocess(self, output: Dict, dataset=None) -> Dict:
         """Postprocess output by adding necessary statistics or data into
         it."""
+        import inspect
+
         if self.dict_postprocessor is None:
             return output
         else:
             kwargs = self.dict_postprocessor
             proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
-            return proc(output, self.output_path, **kwargs)
+            sig = inspect.signature(proc)
+            if 'dataset' in sig.parameters:
+                return proc(output,
+                            self.output_path,
+                            dataset=dataset,
+                            **kwargs)
+            else:
+                return proc(output, self.output_path, **kwargs)
 
     @property
     def default_judge_cfg(self):
diff --git a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
index e2aad9be..794c0ed6 100644
--- a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py
@@ -89,6 +89,14 @@ class BaseEvaluator:
         original_dataset: Dataset,
         **score_kwargs,
     ):
+        # Check if predictions and references have the
+        # same length if both are provided
+        if 'predictions' in score_kwargs and 'references' in score_kwargs:
+            if len(score_kwargs['predictions']) != len(
+                    score_kwargs['references']):
+                raise ValueError(
+                    'Predictions and references must have the same length')
+
         real_size = len(original_dataset) // n
         all_details = []
         all_results = []
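
Illustration (not part of the patch): supergpqa_llmjudge_postprocess above flattens the per-category counters into top-level result keys of the form SuperGPQA-{discipline}, SuperGPQA-{discipline}-{field}, and SuperGPQA-{discipline}-{field}-{subfield}. The sketch below reproduces only that flattening step on hand-made data; the category names are hypothetical and the snippet is a simplified equivalent of the logic in the diff, not the code it adds.

# Standalone sketch of the subset-metric flattening (toy data, assumed names).
stats = {
    'discipline': {'Science': {'correct': 8, 'total': 10}},
    'field': {'Science/Physics': {'correct': 5, 'total': 6}},
    'subfield': {'Science/Physics/Optics': {'correct': 3, 'total': 4}},
}

results = {}
for level, categories in stats.items():
    for key, value in categories.items():
        if value['total'] == 0:
            continue
        accuracy = round(value['correct'] / value['total'] * 100, 2)
        # 'Science/Physics' becomes 'SuperGPQA-Science-Physics', and so on.
        results['SuperGPQA-' + key.replace('/', '-')] = accuracy

print(results)
# -> {'SuperGPQA-Science': 80.0,
#     'SuperGPQA-Science-Physics': 83.33,
#     'SuperGPQA-Science-Physics-Optics': 75.0}

Keeping each category as its own flat key means every discipline, field, and subfield accuracy appears alongside the overall accuracy in the result dict without any change to downstream result handling.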
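
Illustration (not part of the patch): the key mechanism added to GenericLLMEvaluator.output_postprocess is signature-based dispatch. The dataset argument is forwarded only when the registered dict_postprocessor declares a dataset parameter, so older postprocessors such as generic_llmjudge_postprocess keep working unchanged. Below is a minimal self-contained sketch of that pattern; legacy_postprocess, subset_postprocess, and dispatch are illustrative names, not OpenCompass APIs.

import inspect
from typing import Callable, Dict, Optional


def legacy_postprocess(output: Dict, output_path: str) -> Dict:
    # Old-style postprocessor: knows nothing about the dataset.
    return {'accuracy': output.get('accuracy', 0.0)}


def subset_postprocess(output: Dict, output_path: str, dataset=None) -> Dict:
    # New-style postprocessor: may read extra metadata from the dataset.
    result = {'accuracy': output.get('accuracy', 0.0)}
    if dataset is not None:
        result['num_subsets'] = len(dataset)
    return result


def dispatch(proc: Callable, output: Dict, output_path: str,
             dataset: Optional[list] = None, **kwargs) -> Dict:
    # Forward the dataset only if the postprocessor accepts it.
    sig = inspect.signature(proc)
    if 'dataset' in sig.parameters:
        return proc(output, output_path, dataset=dataset, **kwargs)
    return proc(output, output_path, **kwargs)


if __name__ == '__main__':
    fake_output = {'accuracy': 87.5}
    print(dispatch(legacy_postprocess, fake_output, 'results.json'))
    print(dispatch(subset_postprocess, fake_output, 'results.json',
                   dataset=['physics', 'law']))

Probing the signature keeps the new dataset argument backward compatible: existing configs need no changes, while supergpqa_llmjudge_postprocess can receive the dataset it needs for per-subset bookkeeping.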