[Feature] Support OlympiadBench Benchmark (#1841)

* Support OlympiadBench Benchmark * Support OlympiadBench Benchmark * Support OlympiadBench Benchmark * update dataset path * Update olmpiadBench * Update olmpiadBench * Update olmpiadBench --------- Co-authored-by: liushz <qq1791167085@163.com>
2025-05-30 16:03:24 +08:00 · 2025-01-24 10:00:01 +08:00 · 2025-01-24 10:00:01 +08:00 · 412199f802
commit 412199f802
parent 70f2c963d3
9 changed files with 932 additions and 0 deletions
--- a/examples/eval_OlympiadBench.py
+++ b/examples/eval_OlympiadBench.py
@ -0,0 +1,36 @@
+from mmengine.config import read_base
+
+with read_base():
+    from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_gen_be8b13 import olympiadbench_datasets
+
+    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model
+
+    from opencompass.configs.summarizers.OlympiadBench import summarizer
+
+
+datasets = sum([v for k, v in locals().items() if k.endswith('_datasets') or k == 'datasets'], [])
+models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
+
+from opencompass.runners import LocalRunner
+from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
+from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
+
+infer = dict(
+    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=8,
+        task=dict(type=OpenICLInferTask)
+    ),
+)
+
+eval = dict(
+    partitioner=dict(type=NaivePartitioner, n=10),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=256,
+        task=dict(type=OpenICLEvalTask)
+    ),
+)
+
+work_dir = 'outputs/debug/OlympiadBench'
--- a/opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_gen_be8b13.py
+++ b/opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_gen_be8b13.py
@ -0,0 +1,50 @@
+from mmengine.config import read_base
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import OlympiadBenchDataset, OlympiadBenchEvaluator, olympiadbench_postprocess_v2
+
+
+with read_base():
+    from .OlympiadBench_categories import categories
+
+# Create prompter instance for problems
+olympiadbench_prompter_cfg = dict(
+    type='OlympiadBenchPrompter'
+)
+
+olympiadbench_reader_cfg = dict(
+    input_columns=[
+        'problem', 'language', 'subject', 'question_type', 
+        'answer_type', 'is_multiple_answer', 'unit', 'questions'
+    ], 
+    output_column='solution'
+)
+
+olympiadbench_datasets = []
+for _name in categories:
+    olympiadbench_infer_cfg = dict(
+        prompt_template=dict(
+            type='OlympiadBenchTemplate'
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    olympiadbench_eval_cfg = dict(
+        evaluator=dict(type=OlympiadBenchEvaluator, version='v2'), 
+        pred_postprocessor=dict(type=olympiadbench_postprocess_v2),
+    )
+
+    olympiadbench_datasets.append(
+        dict(
+            type=OlympiadBenchDataset,
+            abbr=f'OlympiadBench_{_name}',
+            path='opencompass/OlympiadBench',
+            name=_name,
+            reader_cfg=olympiadbench_reader_cfg,
+            infer_cfg=olympiadbench_infer_cfg,
+            eval_cfg=olympiadbench_eval_cfg,
+        )
+    )
+
+del _name
--- a/opencompass/configs/datasets/OlympiadBench/OlympiadBench_categories.py
+++ b/opencompass/configs/datasets/OlympiadBench/OlympiadBench_categories.py
@ -0,0 +1,7 @@
+categories = [
+    'OE_TO_maths_en_COMP', # OpenEnded - TextOnly - maths - COMP
+    'OE_TO_maths_zh_COMP', # OpenEnded - TextOnly - maths - COMP
+    'OE_TO_maths_zh_CEE', # OpenEnded - TextOnly - maths - CEE
+    'OE_TO_physics_en_COMP', # OpenEnded - TextOnly - physics - COMP
+    'OE_TO_physics_zh_CEE' # OpenEnded - TextOnly - physics - CEE
+]
--- a/opencompass/configs/summarizers/OlympiadBench.py
+++ b/opencompass/configs/summarizers/OlympiadBench.py
@ -0,0 +1,15 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .groups.OlympiadBench import OlympiadBench_summary_groups
+
+summarizer = dict(
+    dataset_abbrs=[
+        'OlympiadBench_OE_TO_maths_en_COMP',
+        'OlympiadBench_OE_TO_maths_zh_COMP',
+        'OlympiadBench_OE_TO_maths_zh_CEE',
+        'OlympiadBench_OE_TO_physics_en_COMP',
+        'OlympiadBench_OE_TO_physics_zh_CEE'
+    ],
+    summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
+)
--- a/opencompass/configs/summarizers/groups/OlympiadBench.py
+++ b/opencompass/configs/summarizers/groups/OlympiadBench.py
@ -0,0 +1,11 @@
+categories = [
+    'OE_TO_maths_en_COMP', # OpenEnded - TextOnly - maths - COMP
+    'OE_TO_maths_zh_COMP', # OpenEnded - TextOnly - maths - COMP
+    'OE_TO_maths_zh_CEE', # OpenEnded - TextOnly - maths - CEE
+    'OE_TO_physics_en_COMP', # OpenEnded - TextOnly - physics - COMP
+    'OE_TO_physics_zh_CEE' # OpenEnded - TextOnly - physics - CEE
+]
+
+OlympiadBench_summary_groups = [
+    {'name': 'OlympiadBench', 'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in categories]},
+]
--- a/opencompass/datasets/OlympiadBench.py
+++ b/opencompass/datasets/OlympiadBench.py
@ -0,0 +1,801 @@
+import json
+import math
+import os
+import re
+from os import environ
+from typing import Dict
+
+import sympy as sp
+from datasets import Dataset, DatasetDict
+from sympy import Eq, Pow, simplify, sympify
+from sympy.parsing.latex import parse_latex
+
+from opencompass.openicl.icl_evaluator import BaseEvaluator
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.registry import (ICL_PROMPT_TEMPLATES, LOAD_DATASET,
+                                  TEXT_POSTPROCESSORS)
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+# Load Dataset
+
+
+@LOAD_DATASET.register_module()
+class OlympiadBenchDataset(BaseDataset):
+    """Dataset for OlympiadBench.
+
+    Args:
+        path (str): Path to dataset directory
+        name (str): Name of specific json file to load
+        e.g. 'OE_TO_maths_en_COMP'
+    """
+
+    @staticmethod
+    def load(path: str, name: str = None, **kwargs):
+        """Load dataset.
+
+        Args:
+            path (str): Path to dataset directory
+            name (str): Name of specific json file to load
+
+        Returns:
+            DatasetDict: Dataset with test and train splits
+        """
+        path = get_data_path(path)
+        dataset = DatasetDict()
+        raw_data = []
+
+        if environ.get('DATASET_SOURCE') == 'ModelScope':
+            from modelscope import MsDataset
+
+            ms_dataset = MsDataset.load(path, split='train')
+            for item in ms_dataset:
+                raw_data.append({
+                    'problem':
+                    item['question'],
+                    'solution':
+                    item['final_answer'][0],
+                    'language':
+                    item['language'],
+                    'subject':
+                    item['subject'],
+                    'question_type':
+                    item['question_type'],
+                    'answer_type':
+                    item['answer_type'],
+                    'is_multiple_answer':
+                    item['is_multiple_answer'],
+                    'unit':
+                    item['unit'],
+                    'error':
+                    item['error'],
+                    'questions':
+                    item,  # may not be used
+                })
+        else:
+            # Construct file path using name parameter
+            if name is None:
+                raise ValueError(
+                    "Must specify 'name' parameter to load specific json file")
+
+            # file_path = os.path.join(path, name, f'{name}.json')
+            file_path = os.path.join(path, f'{name}.json')
+            if not os.path.exists(file_path):
+                raise FileNotFoundError(f'File not found: {file_path}')
+
+            # Load the specified json file
+            data = json.load(open(file_path, encoding='utf-8'))
+            for item in data:
+                raw_data.append({
+                    'problem':
+                    item['question'],
+                    'solution':
+                    item['final_answer'][0],
+                    'language':
+                    item['language'],
+                    'subject':
+                    item['subject'],
+                    'question_type':
+                    item['question_type'],
+                    'answer_type':
+                    item['answer_type'],
+                    'is_multiple_answer':
+                    item['is_multiple_answer'],
+                    'unit':
+                    item['unit'],
+                    'error':
+                    item['error'],
+                    'questions':
+                    item,  # may not be used
+                })
+
+        dataset['test'] = Dataset.from_list(raw_data)
+        dataset['train'] = Dataset.from_list(raw_data)
+        return dataset
+
+
+# Construct Prompt
+
+
+def get_single_answer_type_text(answer_type, is_chinese):
+    if '-' in answer_type:  # No need now
+        answer_type = answer_type[:answer_type.find('-')]
+    chinese_answer_type_dict = {
+        'Numerical': '数值',
+        'Expression': '表达式',
+        'Equation': '方程',
+        'Interval': '区间',
+    }
+    english_answer_type_dict = {
+        'Numerical': 'a numerical value',
+        'Expression': 'an expression',
+        'Equation': 'an equation',
+        'Interval': 'an interval',
+    }
+
+    for t in ['Numerical', 'Expression', 'Equation', 'Interval']:
+        if t in answer_type:
+            if is_chinese:
+                return chinese_answer_type_dict[t]
+            else:
+                return english_answer_type_dict[t]
+    raise ValueError(f'Error parsing answer type {answer_type}!')
+
+
+def get_answer_type_text(answer_type, is_chinese, multiple_answer):
+    if ('Need_human_evaluate' in answer_type) or ('Tuple' in answer_type):
+        return ''
+    if not multiple_answer:
+        answer_text = get_single_answer_type_text(answer_type, is_chinese)
+        if is_chinese:
+            return f'，答案类型为{answer_text}'
+        else:
+            return (f'The answer of The problem should be '
+                    f'{answer_text}. ')
+    # Multiple answers case
+    if ',' not in answer_type:  # Same answer type for all answers
+        answer_text = get_single_answer_type_text(answer_type, is_chinese)
+        if is_chinese:
+            return f'，题目有多个答案，答案类型均为{answer_text}'
+        else:
+            return (f'The problem has multiple answers, each of them '
+                    f'should be {answer_text}. ')
+    # Different answer types
+    answer_types = answer_type.split(',')
+    answer_types = [
+        get_single_answer_type_text(t, is_chinese) for t in answer_types
+    ]
+    if len(set(answer_types)) == 1:
+        answer_text = answer_types[0]
+        if is_chinese:
+            return f'，题目有多个答案，答案类型均为{answer_text}'
+        else:
+            return (f'The problem has multiple answers, each of them '
+                    f'should be {answer_text}. ')
+    else:
+        if is_chinese:
+            answer_text = '、'.join(answer_types)
+            return f'，题目有多个答案，答案类型分别为{answer_text}'
+        else:
+            answer_text = ', '.join(answer_types)
+            return (f'The problem has multiple answers, '
+                    f'with the answers in order being {answer_text}. ')
+
+
+class OlympiadBenchPrompter:
+
+    def __init__(self):
+        pass
+
+    def make_prompt(
+        self,
+        language,
+        subject,
+        question_type,
+        answer_type,
+        is_multiple_answer,
+        unit,
+    ):
+        self.is_chinese = language == 'Chinese'
+        self.is_math = subject == 'Math'
+        self.is_theorem_proving = question_type == 'Theorem proof'
+        """Generate prompt based on question properties."""
+        if self.is_chinese:
+            subject_content = '数学' if self.is_math else '物理'
+            if self.is_theorem_proving:
+                prompt = (f'以下是中国{subject_content}竞赛中的证明题。请根据题目的要求，'
+                          f'运用逻辑推理及常用定理证明题目中的命题。证明过程中使用的变量和公式请使用LaTeX格式表示。')
+            else:
+                answer_type_text = get_answer_type_text(
+                    answer_type,
+                    is_chinese=True,
+                    multiple_answer=is_multiple_answer,
+                )
+                if is_multiple_answer:
+                    multiple_answer_text = '\\boxed{用英文逗号连接的多个答案}'
+                else:
+                    multiple_answer_text = '\\boxed{答案}'
+                unit_text = ''
+                if unit:
+                    multiple_answer_text += '(单位)'
+                    unit_text = '，注意答案的单位不要放在\\boxed{}中'
+                prompt = (f'以下是中国{subject_content}竞赛中的解答题{answer_type_text}。'
+                          f'请根据题目的要求和所提供的信息计算得出答案。解答过程和结果中使用的'
+                          f'变量和公式请使用LaTeX格式表示。请在最后以"所以最终答案是'
+                          f'{multiple_answer_text}。"显式给出结果{unit_text}。')
+        else:
+            subject_content = 'Math' if self.is_math else 'Physics'
+            if self.is_theorem_proving:
+                prompt = (
+                    f'The following is a theorem proving problem from an '
+                    f'International {subject_content} competition. Please use '
+                    f'logical reasoning and common theorems to prove the '
+                    f'proposition in the problem according to the given '
+                    f'requirements. Please use LaTeX format to represent the '
+                    f'variables and formulas used in the proof.')
+            else:
+                if is_multiple_answer:
+                    multiple_answer_text = (
+                        '\\boxed{multiple answers connected with commas}')
+                else:
+                    multiple_answer_text = '\\boxed{answer}'
+                unit_text = ''
+                if unit:
+                    multiple_answer_text += '(unit)'
+                    unit_text = (', note that the unit of the answer should '
+                                 'not be included in \\boxed{}')
+                answer_type_text = get_answer_type_text(
+                    answer_type,
+                    is_chinese=False,
+                    multiple_answer=is_multiple_answer,
+                )
+                prompt = (
+                    f'The following is an open-ended problem from an '
+                    f'International {subject_content} competition. '
+                    f'{answer_type_text}Please calculate the answer according '
+                    f'to the given requirements and the information provided. '
+                    f'Please use LaTeX format to represent the variables and '
+                    f'formulas used in the solution process and results. '
+                    f'Please end your solution with "So the final answer is '
+                    f'{multiple_answer_text}." and give the result explicitly'
+                    f'{unit_text}.')
+        # Add problem statement to the prompt
+        prompt = prompt + '\n' + '{problem}' + '\n'
+        # Add step-by-step reasoning instruction
+        if self.is_chinese:
+            prompt += '\n请通过逐步推理来解答问题，并把最终答案放置于\\boxed{}中。'
+        else:
+            prompt += ('\nPlease reason step by step, and put your final '
+                       'answer within \\boxed{}.')
+        return prompt
+
+
+# Evaluate
+
+
+class MathJudger:
+
+    def __init__(self):
+        self.special_signal_map = {
+            '\\left': '',
+            '\\right': '',
+            '∶': ':',
+            '，': ',',
+            '$': '',
+            '\\approx': '=',
+            '\\simeq': '=',
+            '\\sim': '=',
+            '^\\prime': "'",
+            '^{\\prime}': "'",
+            '^\\circ': '',
+            '%': '',
+        }
+        self.pi = parse_latex('\\pi')
+        self.precision = 1e-8
+
+    def split_by_comma(self, expr: str):
+        in_bracket_num = 0
+        splitted_expr = []
+        start_idx = 0
+        for i, char in enumerate(expr):
+            if char == '(' or char == '[':
+                in_bracket_num += 1
+            elif char == ')' or char == ']':
+                in_bracket_num -= 1
+            elif char == ',' and in_bracket_num == 0:
+                splitted_expr.append(expr[start_idx:i].strip())
+                start_idx = i + 1
+
+        if start_idx < len(expr):
+            splitted_expr.append(expr[start_idx:].strip())
+
+        return splitted_expr
+
+    def trans_plus_minus_sign(self, expr_list: list):
+        new_expr_list = []
+        for expr in expr_list:
+            if '\\pm' in expr:
+                new_expr_list.append(expr.replace('\\pm', '+'))
+                new_expr_list.append(expr.replace('\\pm', '-'))
+            else:
+                new_expr_list.append(expr)
+
+        return new_expr_list
+
+    def judge(self, expression1, expression2, precision=1e-8):
+        # (默认 expression1 为 Ground_Truth)
+        precision = precision if type(precision) == list else [precision]
+
+        try:
+            expression1, expression2 = self.preprocess(expression1,
+                                                       expression2)
+        except Exception:  # 处理具体异常
+            return False
+        if expression1 == expression2:
+            return True
+
+        # 去除字符串中的中文字符
+        expression1 = re.sub(r'[\u4e00-\u9fff]+', '', expression1)
+        expression2 = re.sub(r'[\u4e00-\u9fff]+', '', expression2)
+
+        expression1 = self.split_by_comma(expression1)
+        expression2 = self.split_by_comma(expression2)
+
+        temp_list1 = self.trans_plus_minus_sign(expression1)
+        temp_list2 = self.trans_plus_minus_sign(expression2)
+
+        # 设计误差值列表
+        if len(precision) <= 1:
+            precision = precision * len(temp_list1)
+
+        if len(temp_list1) != len(temp_list2):
+            return False
+
+        # 判断两个列表中的元素是否可以两两配对，并且两两相等
+        idx = -1
+        while len(temp_list1) != 0:
+            idx = (idx + 1) % len(temp_list1)
+
+            item1 = temp_list1[idx]
+            self.precision = precision[idx]
+
+            for item2 in temp_list2:
+                if self.is_equal(item1, item2):
+                    temp_list1.remove(item1)
+                    temp_list2.remove(item2)
+                    precision.remove(self.precision)
+                    break
+            else:
+                return False
+
+        # 如果所有元素都匹配并移除，列表可以配对
+        return True
+
+    def is_interval(self, epr):
+        return epr.startswith(('(', '[')) and epr.endswith((')', ']'))
+
+    def sympy_sub_pi(self, expression_sympy):
+        return expression_sympy.subs(self.pi, math.pi)
+
+    def is_equal(self, expression1, expression2):
+        if (expression1 == expression2 and expression1 != ''
+                and expression2 != ''):
+            return True
+
+        # 先判断是否是两个区间
+        if self.is_interval(expression1) and self.is_interval(expression2):
+            try:
+                if self.interval_equal(expression1, expression2):
+                    return True
+            except Exception:  # 处理具体异常
+                return False
+
+        # 再判断是否在数值上相等
+        try:
+            if self.numerical_equal(expression1, expression2):
+                return True
+        except Exception:  # 处理具体异常
+            pass
+
+        # 再判断是否是表达式相等
+        try:
+            if self.expression_equal(
+                    expression1, expression2) and not ('=' in expression1
+                                                       and '=' in expression2):
+                return True
+        except Exception:  # 处理具体异常
+            pass
+
+        # 再判断是否是等式相等
+        try:
+            if self.equation_equal(expression1, expression2):
+                return True
+        except Exception:  # 处理具体异常
+            pass
+
+        return False
+
+    def numerical_equal(
+        self,
+        expression1: str,
+        expression2: str,
+        include_percentage: bool = True,
+    ):
+        """
+        (默认 expression1 为 Ground_Truth)
+        函数: 判读两个数值是否在误差允许范围内相等
+        步骤1: 将可能出现的百分号的情况包含进来
+        步骤2: 使用 math.isclose 函数判断是否相等
+        """
+        reference = float(expression1)
+        prediction = float(expression2)
+
+        if include_percentage:
+            gt_result = [reference / 100, reference, reference * 100]
+        else:
+            gt_result = [reference]
+
+        for item in gt_result:
+            if abs(item - prediction) <= self.precision * 1.01:
+                return True
+        return False
+
+    def expression_equal(self, exp1, exp2):
+        """
+        (默认 expression1 为 Ground_Truth)
+        函数: 判断两个表达式是否在数学意义上等价
+        步骤1: 提取表达式, 防止有的模型会给出"x=1"而不是"1"
+        步骤2: 使用 sympy 库进行等价判断
+        """
+
+        # 只提取等号右边的表达式
+        def extract_expression(expression):
+            if '=' in expression:
+                expression = expression.split('=')[1]
+            return expression.strip()
+
+        exp1 = extract_expression(exp1)
+        exp2 = extract_expression(exp2)
+
+        # 将表达式转换为 sympy 中能够进行处理的格式
+        expr1_sym = sympify(parse_latex(exp1))
+        expr2_sym = sympify(parse_latex(exp2))
+
+        if expr1_sym == expr2_sym:
+            return True
+        else:
+            expr1_sym = self.sympy_sub_pi(expr1_sym)
+            expr2_sym = self.sympy_sub_pi(expr2_sym)
+
+            if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or (
+                    not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
+                return False
+            elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
+                try:
+                    if not (self.can_compute_power(expr1_sym)
+                            and self.can_compute_power(expr2_sym)):
+                        print(f'These two number can not be calculated by '
+                              f'current computer for: '
+                              f'"{str(expr1_sym)}" and "{str(expr2_sym)}"')
+                        return False
+
+                    if (abs(expr1_sym.evalf() - expr2_sym.evalf()) <=
+                            self.precision * 1.01):
+                        return True
+                    else:
+                        return False
+                except Exception:  # 处理具体异常
+                    return False
+            else:
+                try:
+                    simplified_expr = simplify(expr1_sym - expr2_sym)
+
+                    num_value = simplified_expr.evalf()
+
+                    return abs(num_value) < 1e-3
+                except Exception:  # 处理具体异常
+                    return False
+
+    def equation_equal(self, expression1, expression2):
+        """
+        (expression1 is assumed to be Ground_Truth)
+        Function: Check if two equations are mathematically equivalent
+        Step 1: Simplify equations to standard form with right side equal to 0
+        Step 2: Use sympy library to calculate quotient of left sides,
+        if quotient or its reciprocal is integer, equations are equivalent
+        """
+
+        # Convert equations to sympy format with right side moved to left side
+        def simplify_equation(latex_eq):
+            # Split left and right sides of equation
+            lhs, rhs = latex_eq.split('=')
+
+            # Parse LaTeX expressions using parse_latex
+            lhs_expr = parse_latex(lhs)
+            rhs_expr = parse_latex(rhs)
+
+            # Create equation object
+            equation = Eq(lhs_expr, rhs_expr)
+
+            # Simplify equation by moving right side to left
+            simplified_eq = simplify(equation.lhs - equation.rhs)
+
+            return simplified_eq
+
+        expr1_sym = simplify_equation(expression1)
+        expr2_sym = simplify_equation(expression2)
+
+        division_result_1 = simplify(expr1_sym / expr2_sym)
+        division_result_2 = simplify(expr2_sym / expr1_sym)
+
+        # If division result or its reciprocal is
+        # non-zero integer, equations are equivalent
+        if (division_result_1.is_Integer
+                and division_result_1 != 0) or (division_result_2.is_Integer
+                                                and division_result_2 != 0):
+            return True
+        else:
+            return False
+
+    def interval_equal(self, expression1, expression2):
+        """
+        Function: Check if two intervals are mathematically equivalent
+        Step 1: Simplify interval expressions,
+                remove irrelevant symbols
+                like "\\left", "\\right", and "x \\in"
+        Step 2: Compare brackets and mathematical expressions in between
+        """
+
+        def compare_two_interval(inter1, inter2):
+            # First compare brackets on both sides
+            if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]:
+                return False
+
+            inter1 = inter1.strip('[]()')
+            inter2 = inter2.strip('[]()')
+
+            # Split interval into left and right parts
+            items_1 = inter1.split(',')
+            items_2 = inter2.split(',')
+
+            for item_1, item_2 in zip(items_1, items_2):
+                if not self.expression_equal(item_1, item_2):
+                    return False
+            return True
+
+        interval1 = expression1
+        interval2 = expression2
+
+        if interval1 == interval2:
+            return True
+        else:
+            inter_list1 = interval1.split('\\cup')
+            inter_list2 = interval2.split('\\cup')
+
+            if len(inter_list1) != len(inter_list2):
+                return False
+            else:
+                for inter1, inter2 in zip(inter_list1, inter_list2):
+                    if not compare_two_interval(inter1, inter2):
+                        return False
+                return True
+
+    def preprocess(self, expression1, expression2):
+        """Extract and preprocess expressions from model output."""
+
+        def extract_boxed_content(latex_str):
+            # Find all \boxed{...} structures
+            boxed_matches = re.finditer(r'\\boxed{', latex_str)
+            results = ''
+
+            for match in boxed_matches:
+                start_index = match.end()
+                end_index = start_index
+                stack = 1
+
+                # Search from after \boxed{ until
+                # finding matching closing brace
+                while stack > 0 and end_index < len(latex_str):
+                    if latex_str[end_index] == '{':
+                        stack += 1
+                    elif latex_str[end_index] == '}':
+                        stack -= 1
+                    end_index += 1
+
+                if stack == 0:
+                    # Extract content inside \boxed{}
+                    content = latex_str[start_index:end_index - 1]
+                    results += content + ','
+                else:
+                    raise ValueError('Mismatched braces in LaTeX string.')
+
+            # If no \boxed{} found, extract formulas from last line
+            if results == '':
+                last_line_ans = latex_str.strip().split('\n')[-1]
+                dollar_pattern = r'\$(.*?)\$'
+                answers = re.findall(dollar_pattern, last_line_ans)
+
+                if answers:
+                    for ans in answers:
+                        results += ans + ','
+                else:
+                    results = latex_str
+
+            return results
+
+        def special_symbol_replace(expression):
+            if '\\in ' in expression:
+                expression = expression.split('\\in ')[1]
+
+            # Replace special characters that
+            # don't affect LaTeX parsing (decorative)
+            for signal in self.special_signal_map:
+                expression = expression.replace(
+                    signal, self.special_signal_map[signal])
+
+            expression = expression.strip('\n$,.:;^_=+`!@#$%^&*~，。')
+
+            pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}'
+            expression = re.sub(pattern, r'\1', expression)
+
+            return expression
+
+        exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(
+            expression2)
+        exp1, exp2 = special_symbol_replace(exp1), special_symbol_replace(exp2)
+
+        return exp1, exp2
+
+    def can_compute_power(self, expr):
+        """Check if the power expression can be computed.
+
+        Parameters:
+        expr (sympy expression): The expression to check.
+
+        Returns:
+        bool: True if the expression can be computed, False otherwise.
+        """
+        # Check if the expression is a power expression
+        if isinstance(expr, Pow):
+            # Extract the base and the exponent
+            base, exp = expr.as_base_exp()
+
+            # Check if the base and the exponent are numbers
+            if base.is_number and exp.is_number:
+                # Set a threshold for the maximum size of the exponent
+                # can be adjusted based on the computing environment
+                MAX_EXP = 1000
+
+                # Check if the exponent is greater than the threshold
+                if abs(exp.evalf()) > MAX_EXP:
+                    return False
+                else:
+                    return True
+            else:
+                # If the base or the exponent is not a number,
+                # we cannot compute the power
+                return False
+        else:
+            # If the expression is not a power expression,
+            # return True as it is not the case we are checking for
+            return True
+
+
+@TEXT_POSTPROCESSORS.register_module('olympiadbench_postprocess_v2')
+def olympiadbench_postprocess_v2(text: str,
+                                 is_chinese: bool = False,
+                                 is_deepseek: bool = False) -> str:
+    """Extract answer from model output."""
+    # deepseekmath has special answering format
+    if is_deepseek:
+        if is_chinese:
+            matches = re.findall('## 解题答案(.*)', text)
+        else:
+            matches = re.findall('The answer is: (.*)', text)
+    else:
+        if is_chinese:
+            matches = re.findall('所以最终答案是(.*)', text)
+        else:
+            matches = re.findall('So the final answer is (.*)', text)
+
+    # If found matches, take the last one, otherwise return the whole text
+    if matches:
+        return matches[-1].strip()
+    return text
+
+
+class OlympiadBenchEvaluator(BaseEvaluator):
+    """Evaluator for OlympiadBench dataset."""
+
+    def __init__(self, version='v1'):
+        assert version in ['v1', 'v2']
+        self.version = version
+        self.judger = MathJudger()
+
+    def score(self, predictions, references):  # Remove questions parameter
+        """Calculate accuracy score.
+
+        Args:
+            predictions (list): List of model predictions
+            references (list): List of ground truth answers
+        """
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different length'
+            }
+
+        correct = 0
+        count = 0
+        details = []
+
+        for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
+            count += 1
+
+            # Get precision/error threshold from reference if available
+            precision = 1e-8
+            if isinstance(ref, dict) and 'error' in ref:
+                if ',' in ref['error']:
+                    # Multiple precisions for multiple answers
+                    precisions = ref['error'].split(',')
+                    precisions = [float(p) if p else 1e-8 for p in precisions]
+                    precision = precisions
+                else:
+                    precision = float(ref['error'])
+
+            # Check if answer is correct
+            try:
+                if (isinstance(ref, dict) and 'answer_type' in ref
+                        and 'Tuple' in ref['answer_type']):
+                    # Special handling for tuple type answers
+                    is_correct = self.judger.judge(pred,
+                                                   ref['final_answer'][0],
+                                                   precision)
+                else:
+                    is_correct = self.judger.judge(pred, ref, precision)
+
+                if is_correct:
+                    correct += 1
+                    detail['correct'] = True
+            except Exception as e:  # 处理具体异常
+                detail['error'] = str(e)
+
+            details.append(detail)
+
+        result = {'accuracy': 100 * correct / count, 'details': details}
+        return result
+
+
+@ICL_PROMPT_TEMPLATES.register_module()
+class OlympiadBenchTemplate(PromptTemplate):
+    """Template for OlympiadBench dataset."""
+
+    def __init__(self):
+        # Define basic template structure
+        template = dict(round=[dict(role='HUMAN', prompt='{prompt}')])
+        super().__init__(template=template)
+        self.prompter = OlympiadBenchPrompter()
+
+    def generate_item(self, entry: Dict, *args, **kwargs) -> str:
+        """Generate prompt for a single item."""
+        problem = entry.get('problem', '')
+        language = entry.get('language', 'English')
+        subject = entry.get('subject', 'Math')
+        question_type = entry.get('question_type', '')
+        answer_type = entry.get('answer_type', '')
+        is_multiple_answer = entry.get('is_multiple_answer', False)
+        unit = entry.get('unit', '')
+
+        prompt = self.prompter.make_prompt(
+            language=language,
+            subject=subject,
+            question_type=question_type,
+            answer_type=answer_type,
+            is_multiple_answer=is_multiple_answer,
+            unit=unit,
+        )
+
+        new_entry = {'prompt': prompt, 'problem': problem}
+
+        return super().generate_item(new_entry, *args, **kwargs)
--- a/opencompass/datasets/init.py
+++ b/opencompass/datasets/init.py
@ -103,6 +103,7 @@ from .natural_question import *  # noqa: F401, F403
 from .natural_question_cn import *  # noqa: F401, F403
 from .NPHardEval import *  # noqa: F401, F403
 from .obqa import *  # noqa: F401, F403
+from .OlympiadBench import *  # noqa: F401, F403
 from .OpenFinData import *  # noqa: F401, F403
 from .piqa import *  # noqa: F401, F403
 from .py150 import *  # noqa: F401, F403
--- a/opencompass/utils/datasets_info.py
+++ b/opencompass/utils/datasets_info.py
@ -398,9 +398,18 @@ DATASETS_MAPPING = {
        "hf_id": "THUDM/LongBench-v2",
        "local": "./data/longbenchv2/data.json",
    },
+    "opencompass/OlympiadBench": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/OlympiadBench",
+    },
 }

 DATASETS_URL = {
+    "/OlympiadBench": {
+        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/OlympiadBench.zip",
+        "md5": "97e8b1ae7f6170d94817288a8930ef00",
+    },
    "/longbenchv2":{
        "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/longbenchv2.zip",
        "md5": "09b7e06e6f98c5cca8ad597b3d7b42f0",
--- a/requirements/extra.txt
+++ b/requirements/extra.txt
@ -1,5 +1,7 @@
 # Alpaca-eval
 alpaca-eval==0.6
+# OlympiadBench
+antlr4-python3-runtime==4.11
 cn2an
 # Dingo
 dingo-python==1.1.2