mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
phybench
This commit is contained in:
parent
37cbaf8d92
commit
71173c4fef
149
opencompass/datasets/PHYBench/PHYBench.py
Normal file
149
opencompass/datasets/PHYBench/PHYBench.py
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
import re
|
||||||
|
from typing import Dict, List
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import sympy
|
||||||
|
from datasets import load_dataset
|
||||||
|
|
||||||
|
from opencompass.datasets.base import BaseDataset
|
||||||
|
from opencompass.datasets.PHYBench.EED.EED import EED
|
||||||
|
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
||||||
|
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
|
||||||
|
|
||||||
|
|
||||||
|
@LOAD_DATASET.register_module()
class PHYBenchDataset(BaseDataset):
    """PHYBench dataset loader.

    Loads the PHYBench dataset via HuggingFace ``datasets`` and keeps
    only a prefix of the ``train`` split (100 examples by default).
    """

    @staticmethod
    def load(path: str, num_examples: int = 100):
        """Load the train split and truncate it to ``num_examples`` rows.

        Args:
            path (str): Dataset path/name understood by ``load_dataset``.
            num_examples (int): Maximum number of examples to keep.
                Defaults to 100, matching the original behaviour.

        Returns:
            The (possibly truncated) ``train`` split.
        """
        dataset = load_dataset(path, split='train')
        # Clamp so that splits smaller than ``num_examples`` do not
        # raise an out-of-range error inside ``select``.
        return dataset.select(range(min(num_examples, len(dataset))))
|
||||||
|
|
||||||
|
|
||||||
|
def extract_last_latex(prediction: str) -> str:
    """Extract the last LaTeX answer fragment from a model prediction.

    Extraction is attempted in three stages, from most to least
    reliable:

    1. The content of the last ``\\boxed{...}`` (brace-balanced scan,
       so nested braces are handled).
    2. The last fragment wrapped in math delimiters (``$$..$$``,
       ``\\[..\\]``, ``$..$`` or ``\\(..\\)``), after stripping
       markdown headings, horizontal rules and list bullets.
    3. The text following a ``Final Answer:`` marker; if even that is
       absent, the whole stripped prediction is returned.

    Args:
        prediction (str): Raw model output, possibly containing
            markdown and LaTeX.

    Returns:
        str: The extracted answer fragment, stripped of surrounding
        whitespace.
    """
    boxed = _last_boxed_content(prediction)
    if boxed is not None:
        return boxed

    delimited = _last_delimited_fragment(prediction)
    if delimited is not None:
        return delimited

    # 3) final fallback: 'Final Answer: ...' marker, else the raw text.
    m2 = re.search(r'Final\s*Answer\s*:?\s*(.+)$', prediction, re.DOTALL)
    return m2.group(1).strip() if m2 else prediction.strip()


def _last_boxed_content(prediction: str):
    """Return the content of the last ``\\boxed{...}``, or None.

    Uses a manual brace-depth scan instead of a regex so that nested
    braces (e.g. ``\\boxed{\\frac{a}{b}}``) are matched correctly.
    """
    boxed_contents = []
    for m in re.finditer(r'\\boxed\s*\{', prediction):
        brace_start = prediction.find('{', m.start())
        if brace_start == -1:
            continue
        depth = 0
        # Scan forward to find the matching closing brace.
        for i in range(brace_start, len(prediction)):
            if prediction[i] == '{':
                depth += 1
            elif prediction[i] == '}':
                depth -= 1
                if depth == 0:
                    boxed_contents.append(
                        prediction[brace_start + 1:i].strip())
                    break
    return boxed_contents[-1] if boxed_contents else None


def _last_delimited_fragment(prediction: str):
    """Return the last math-delimited fragment, or None.

    Markdown noise (headings, horizontal rules, list bullets) is
    stripped first so that decoration characters do not confuse the
    delimiter matching.  Delimiter styles are tried in priority order;
    the first style with any match wins and its last occurrence is
    returned.
    """
    cleaned = re.sub(r'^###.*$', '', prediction, flags=re.MULTILINE)
    cleaned = re.sub(r'[*\\-]{3,}', '', cleaned)
    cleaned = re.sub(r'(^|\n)[ \t]*[-*+] ', r'\1', cleaned)

    patterns = [
        r'\$\$(.*?)\$\$',
        r'\\\[(.*?)\\\]',
        r'\$(.*?)\$',
        r'\\\((.*?)\\\)',
    ]
    for pat in patterns:
        fragments = [
            mm.group(1).strip()
            for mm in re.finditer(pat, cleaned, re.DOTALL)
        ]
        if fragments:
            return fragments[-1]
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _calculate_eed_score(pred_str: str, ref_str: str) -> float:
    """Calculate the Expression Edit Distance (EED) score.

    Both strings are first reduced to a bare LaTeX expression: the last
    boxed/delimited fragment, then the right-hand side of the rightmost
    ``=``.  EED is computed on those raw strings, and — when both sides
    can be round-tripped through sympy — also on the sympy-normalised
    forms; the better of the two scores is kept.

    Args:
        pred_str (str): Predicted answer string (LaTeX format)
        ref_str (str): Reference answer string (LaTeX format)

    Returns:
        float: EED score between 0 and 100; 0.0 on any failure.
    """
    try:
        # Normalize the inputs first:
        # references are usually wrapped in $$...$$,
        # predictions can contain arbitrary prose.
        clean_pred = extract_last_latex(pred_str)
        if '$$' in ref_str:
            # take the content between the first pair of $$ markers
            clean_ref = ref_str.split('$$')[1].strip()
        else:
            clean_ref = extract_last_latex(ref_str)

        # only compare the rhs of rightmost =
        clean_pred = clean_pred.split('=')[-1].strip()
        clean_ref = clean_ref.split('=')[-1].strip()

        # Try to round-trip through sympy to normalise notation.
        # NOTE(review): ``sympify`` parses Python/sympy syntax, not
        # LaTeX, so this succeeds only for simple expressions —
        # ``sympy.parsing.latex.parse_latex`` may be intended; confirm.
        try:
            clean_pred_expr = sympy.latex(sympy.sympify(clean_pred))
            clean_ref_expr = sympy.latex(sympy.sympify(clean_ref))
        except Exception:
            clean_pred_expr = None
            clean_ref_expr = None

        # EED returns a tuple whose first element is the score.
        eed_result = EED(clean_ref, clean_pred)
        if clean_pred_expr and clean_ref_expr:
            clean_eed_result = EED(clean_ref_expr, clean_pred_expr)
            final_eed_result = max(clean_eed_result[0], eed_result[0])
        else:
            final_eed_result = eed_result[0]
        return final_eed_result
    except Exception:
        # Scoring is best-effort: any parsing/EED failure scores 0.0
        # (was ``0`` — return a float to match the annotation).
        return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
class PHYBenchEvaluator(BaseEvaluator):
    """Evaluator for PHYBench: exact-match accuracy plus mean EED score."""

    def score(self,
              predictions: List[str],
              references: List[str],
              test_set: List[Dict] = None) -> Dict:
        """Evaluate predictions for PHYBench based on Accuracy and EED
        Score.

        Args:
            predictions (List[str]): Model outputs, one per example.
            references (List[str]): Gold answers, one per example.
            test_set (List[Dict], optional): Unused; kept for interface
                compatibility with the evaluator framework.

        Returns:
            Dict: ``{'accuracy': ..., 'eed_score': ...}`` on a 0-100
            scale, or ``{'error': ...}`` on a length mismatch.
        """
        if len(predictions) != len(references):
            return {'error': 'Number of predictions and references mismatch.'}

        total_count = len(predictions)
        correct_count = 0
        eed_scores = []

        for pred_str, ref_str in zip(predictions, references):
            eed = _calculate_eed_score(pred_str, ref_str)
            eed_scores.append(eed)
            # A score of exactly 100 means the expressions match.
            if abs(eed - 100) < 1e-6:
                correct_count += 1

        accuracy = (correct_count /
                    total_count) * 100 if total_count > 0 else 0
        # Cast the NumPy scalar to a plain float so the result
        # serialises cleanly (e.g. to JSON).
        average_eed_score = float(np.mean(eed_scores)) if eed_scores else 0

        # Return results as a dictionary
        return {'accuracy': accuracy, 'eed_score': average_eed_score}
|
Loading…
Reference in New Issue
Block a user