[Update] Update Skywork/Qwen-QwQ (#1728)

* Update JudgerBench

* Support O1-style Prompts

* Update Code

* Update OpenAI

* Update BigCodeBench

* Update BigCodeBench

* Update BigCodeBench

* Update BigCodeBench

* Update BigCodeBench

* Update
Songyang Zhang authored 2024-12-05 19:30:43 +08:00; committed by GitHub
parent 6181ac1122
commit fb43dd1906
49 changed files with 2104 additions and 131 deletions

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .lcbench_repeat10_gen_5ff288 import LCBench_datasets_repeat10 # noqa: F401, F403
from .lcbench_repeat10_gen_5ff288 import LCBench_repeat10_datasets # noqa: F401, F403

View File

@ -86,7 +86,7 @@ LC_cn_infer_cfg = dict(
LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT')
LCBench_datasets_repeat10 = [
LCBench_repeat10_datasets = [
dict(
type=LCDataset,
abbr='lcbench_en_repeat10',

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .lcbench_repeat10_gen_5ff288 import LCBench_datasets_repeat10 # noqa: F401, F403
from .lcbench_repeat10_gen_5ff288 import LCBench_repeat10_datasets # noqa: F401, F403

View File

@ -86,7 +86,7 @@ LC_cn_infer_cfg = dict(
LC_eval_cfg = dict(evaluator=dict(type=LCPassKEvaluator), pred_role='BOT')
LCBench_datasets_repeat10 = [
LCBench_repeat10_datasets = [
dict(
type=LCDataset,
abbr='lcbench_en_repeat10',

View File

@ -0,0 +1,87 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
aime2024_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
aime2024_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048)
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
aime2024_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
aime2024_datasets = [
dict(
abbr='aime2024',
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
infer_cfg=aime2024_infer_cfg,
eval_cfg=aime2024_eval_cfg,
mode='singlescore',
)
]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .bigcodebench_full_complete_gen_faf748 import bigcodebench_full_complete_datasets # noqa: F401, F403

View File

@ -0,0 +1,53 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
bigcodebench_full_reader_cfg = dict(
input_columns=['complete_prompt'],
output_column='test',
)
bigcodebench_full_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{complete_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
bigcodebench_full_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='complete',
remote_execute_api="https://bigcode-bigcodebench-evaluator.hf.space/",
dataset_version='full',
),
pred_role='BOT',
)
bigcodebench_full_complete_datasets = [
dict(
abbr='bigcodebench_full_complete',
type=BigCodeBenchDataset,
path="opencompass/bigcodebench",
reader_cfg=bigcodebench_full_reader_cfg,
infer_cfg=bigcodebench_full_infer_cfg,
eval_cfg=bigcodebench_full_eval_cfg,
release_version='v0.1.2'
)
]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .bigcodebench_full_instruct_gen_8815eb import bigcodebench_full_instruct_datasets # noqa: F401, F403

View File

@ -0,0 +1,53 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
bigcodebench_full_reader_cfg = dict(
input_columns=['instruct_prompt'],
output_column='test',
)
bigcodebench_full_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=8192)
)
bigcodebench_full_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='instruct',
remote_execute_api="https://bigcode-bigcodebench-evaluator.hf.space/",
dataset_version='full',
),
pred_role='BOT',
)
bigcodebench_full_instruct_datasets = [
dict(
abbr='bigcodebench_full_instruct',
type=BigCodeBenchDataset,
path="opencompass/bigcodebench",
reader_cfg=bigcodebench_full_reader_cfg,
infer_cfg=bigcodebench_full_infer_cfg,
eval_cfg=bigcodebench_full_eval_cfg,
release_version='v0.1.2'
)
]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .bigcodebench_hard_complete_gen_faf748 import bigcodebench_hard_complete_datasets # noqa: F401, F403

View File

@ -0,0 +1,54 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
bigcodebench_hard_reader_cfg = dict(
input_columns=['complete_prompt'],
output_column='test',
)
bigcodebench_hard_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{complete_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024)
)
bigcodebench_hard_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='complete',
remote_execute_api="https://bigcode-bigcodebench-evaluator.hf.space/",
dataset_version='hard',
),
pred_role='BOT',
)
bigcodebench_hard_complete_datasets = [
dict(
abbr='bigcodebench_hard_complete',
type=BigCodeBenchDataset,
path="opencompass/bigcodebench",
reader_cfg=bigcodebench_hard_reader_cfg,
infer_cfg=bigcodebench_hard_infer_cfg,
eval_cfg=bigcodebench_hard_eval_cfg,
release_version='v0.1.2',
dataset_version='hard',
)
]

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .bigcodebench_hard_instruct_gen_8815eb import bigcodebench_hard_instruct_datasets # noqa: F401, F403

View File

@ -0,0 +1,54 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
BigCodeBenchDataset,
BigCodeBenchEvaluator
)
bigcodebench_hard_reader_cfg = dict(
input_columns=['instruct_prompt'],
output_column='test',
)
bigcodebench_hard_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[dict(role='system',
fallback_role='HUMAN',
prompt='')],
round=[
dict(role='HUMAN', prompt='{instruct_prompt}'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=8192)
)
bigcodebench_hard_eval_cfg = dict(
evaluator=dict(
type=BigCodeBenchEvaluator,
release_version='v0.1.2',
eval_type='instruct',
remote_execute_api="https://bigcode-bigcodebench-evaluator.hf.space/",
dataset_version='hard',
),
pred_role='BOT',
)
bigcodebench_hard_instruct_datasets = [
dict(
abbr='bigcodebench_hard_instruct',
type=BigCodeBenchDataset,
path="opencompass/bigcodebench",
reader_cfg=bigcodebench_hard_reader_cfg,
infer_cfg=bigcodebench_hard_infer_cfg,
eval_cfg=bigcodebench_hard_eval_cfg,
release_version='v0.1.2',
dataset_version='hard',
)
]

View File

@ -0,0 +1,173 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
cmmlu_subject_mapping = {
'agronomy': '农学',
'anatomy': '解剖学',
'ancient_chinese': '古汉语',
'arts': '艺术学',
'astronomy': '天文学',
'business_ethics': '商业伦理',
'chinese_civil_service_exam': '中国公务员考试',
'chinese_driving_rule': '中国驾驶规则',
'chinese_food_culture': '中国饮食文化',
'chinese_foreign_policy': '中国外交政策',
'chinese_history': '中国历史',
'chinese_literature': '中国文学',
'chinese_teacher_qualification': '中国教师资格',
'clinical_knowledge': '临床知识',
'college_actuarial_science': '大学精算学',
'college_education': '大学教育学',
'college_engineering_hydrology': '大学工程水文学',
'college_law': '大学法律',
'college_mathematics': '大学数学',
'college_medical_statistics': '大学医学统计',
'college_medicine': '大学医学',
'computer_science': '计算机科学',
'computer_security': '计算机安全',
'conceptual_physics': '概念物理学',
'construction_project_management': '建设工程管理',
'economics': '经济学',
'education': '教育学',
'electrical_engineering': '电气工程',
'elementary_chinese': '小学语文',
'elementary_commonsense': '小学常识',
'elementary_information_and_technology': '小学信息技术',
'elementary_mathematics': '初等数学',
'ethnology': '民族学',
'food_science': '食品科学',
'genetics': '遗传学',
'global_facts': '全球事实',
'high_school_biology': '高中生物',
'high_school_chemistry': '高中化学',
'high_school_geography': '高中地理',
'high_school_mathematics': '高中数学',
'high_school_physics': '高中物理学',
'high_school_politics': '高中政治',
'human_sexuality': '人类性行为',
'international_law': '国际法学',
'journalism': '新闻学',
'jurisprudence': '法理学',
'legal_and_moral_basis': '法律与道德基础',
'logical': '逻辑学',
'machine_learning': '机器学习',
'management': '管理学',
'marketing': '市场营销',
'marxist_theory': '马克思主义理论',
'modern_chinese': '现代汉语',
'nutrition': '营养学',
'philosophy': '哲学',
'professional_accounting': '专业会计',
'professional_law': '专业法学',
'professional_medicine': '专业医学',
'professional_psychology': '专业心理学',
'public_relations': '公共关系',
'security_study': '安全研究',
'sociology': '社会学',
'sports_science': '体育学',
'traditional_chinese_medicine': '中医中药',
'virology': '病毒学',
'world_history': '世界历史',
'world_religions': '世界宗教'
}
QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一.
{question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
cmmlu_all_sets = list(cmmlu_subject_mapping.keys())
cmmlu_datasets = []
for _name in cmmlu_all_sets:
_ch_name = cmmlu_subject_mapping[_name]
prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, '
cmmlu_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=prompt_prefix+QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
cmmlu_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
cmmlu_datasets.append(
dict(
type=CMMLUDataset,
path='opencompass/cmmlu',
name=_name,
abbr=f'cmmlu-{_name}',
reader_cfg=dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
mode='singlescore',
))
del _name, _ch_name

View File

@ -0,0 +1,101 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GPQADataset, GPQA_Simple_Eval_postprocess, GPQAEvaluator
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
# openai_simple_eval prompt
align_prompt = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD.
{question}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
gpqa_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer')
gpqa_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=align_prompt),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
gpqa_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
gpqa_datasets = []
gpqa_subsets = {
# 'extended': 'gpqa_extended.csv',
# 'main': 'gpqa_main.csv',
'diamond': 'gpqa_diamond.csv'
}
for split in list(gpqa_subsets.keys()):
gpqa_datasets.append(
dict(
abbr='GPQA_' + split,
type=GPQADataset,
path='./data/gpqa/',
name=gpqa_subsets[split],
reader_cfg=gpqa_reader_cfg,
infer_cfg=gpqa_infer_cfg,
eval_cfg=gpqa_eval_cfg,
mode='singlescore',
)
)

View File

@ -0,0 +1,109 @@
from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
categories = ["cipher", "counterfactual", "logic", "operation", "puzzle"]
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{prompt}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
korbench_0shot_single_datasets = []
for category in categories:
# Prompt template
prompt_template = dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role="HUMAN",
prompt=""
)
],
round=[
dict(
role="HUMAN",
prompt="{prompt}" # f-string
)
]
)
)
# Reader configuration
reader_cfg = dict(
input_columns=["prompt"],
output_column="answer",
)
# Inference configuration
infer_cfg = dict(
prompt_template=prompt_template,
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024),
)
# Evaluation configuration
eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
# Dataset
korbench_dataset = dict(
type=korbenchDataset,
abbr=f"korbench_{category}",
path="opencompass/korbench",
prompt_mode='0_shot',
category=category,
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
mode='singlescore',
)
korbench_0shot_single_datasets.append(korbench_dataset)

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .livereasonbench_gen_0283c3 import livereasonbench_datasets # noqa: F401, F403

View File

@ -0,0 +1,136 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
# from opencompass.datasets import SimpleQADataset, simpleqa_postprocess
from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess
GRADER_TEMPLATE = """
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
First, I will give examples of each grade, and then you will grade a new example.
The following are examples of CORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia Obama and Sasha Obama
Predicted answer 1: sasha and malia obama
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
```
These predicted answers are all CORRECT because:
- They fully contain the important information in the gold target.
- They do not contain any information that contradicts the gold target.
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
The following are examples of INCORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: Malia.
Predicted answer 2: Malia, Sasha, and Susan.
Predicted answer 3: Barack Obama does not have any children.
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
Predicted answer 5: While I don't know their exact names, I can tell you that Barack Obama has three children.
Predicted answer 6: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
Predicted answer 7: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
```
These predicted answers are all INCORRECT because:
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
The following are examples of NOT_ATTEMPTED predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: I don't know.
Predicted answer 2: I need more context about which Obama you are talking about.
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
```
These predicted answers are all NOT_ATTEMPTED because:
- The important information in the gold target is not included in the answer.
- No statements in the answer contradict the gold target.
Also note the following things:
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
- Predicted answers "100k" and "113k" are INCORRECT.
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
- Do not punish for typos in people's name if it's clearly the same name.
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
C: NOT_ATTEMPTED
Just return the letters "A", "B", or "C", with no text around it.
Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
```
Question: {question}
Gold target: {answer}
Predicted answer: {prediction}
```
""".strip()
livereasonbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
livereasonbench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt="Question: {question}\n"),
],
)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=16384))
livereasonbench_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dict_postprocessor=dict(type=livereasonbench_postprocess),
),
pred_role='BOT',
)
livereasonbench_datasets = [
dict(
abbr='LiveReasonBench-20241202',
type=LiveReasonBenchDataset,
path='opencompass/LiveReasonBench',
reader_cfg=livereasonbench_reader_cfg,
infer_cfg=livereasonbench_infer_cfg,
eval_cfg=livereasonbench_eval_cfg,
version='livereasonbench-20241202',
mode='singlescore',
)
]

View File

@ -1,40 +1,11 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, GaoKaoMATHEvaluator
# from opencompass.utils.model_postprocessors import naive_model_postprocess, xfinder_postprocess
from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE
# ----------------------------- Eval Parameters -----------------------------
## Postprocess function
post_func = 're' # 're', 'xfinder_model', 'naive_model'
## Evalute function
eval_func = 'naive_model' # 're', 'naive_model'
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import MATHDataset
## Model api url
# xfinder_url = 'http://0.0.0.0:23333/v1' # for 'xFinder-qwen1505' if post_func is 'xfinder_model'
# naive_model_name = 'Qwen/Qwen2.5-72B-Instruct' # replace with your model name
naive_model_name = 'dlc_model'
# naive_model_url = [
# 'http://172.30.56.38:23001/v1',
# ] # Multi-apis for accerlation
naive_model_url = [
"http://172.30.56.38:23001/v1",
"http://172.30.8.4:23003/v1",
"http://172.30.8.14:23002/v1",
"http://172.30.48.80:23004/v1",
"http://172.30.56.132:23005/v1",
"http://172.30.16.115:23006/v1",
"http://172.30.48.82:23007/v1",
"http://172.30.24.53:23008/v1",
"http://172.30.56.141:23009/v1",
"http://172.30.8.35:23010/v1",
"http://172.30.48.85:23011/v1",
"http://172.30.16.116:23012/v1"
]
# ----------------------------- Detailed Config -----------------------------
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
@ -53,25 +24,57 @@ math_infer_cfg = dict(
)
if post_func == 're':
pred_postprocessor = dict(type=math_postprocess_v2)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
if eval_func == 're':
evaluator = dict(type=MATHEvaluator, version='v2')
elif eval_func == 'naive_model':
evaluator = dict(
type=GaoKaoMATHEvaluator,
judge_model_name=naive_model_name,
url=naive_model_url,
)
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# postprocess v2
# Evaluation configuration
math_eval_cfg = dict(
evaluator=evaluator,
pred_postprocessor=pred_postprocessor,
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
math_datasets = [
dict(
type=MATHDataset,
@ -81,5 +84,6 @@ math_datasets = [
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
mode='singlescore',
)
]

View File

@ -0,0 +1,104 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
with read_base():
# from .....configs.datasets.mmlu.mmlu_all_sets import mmlu_all_sets
from .mmlu_stem_sets import mmlu_all_sets
# None of the MMLU datasets on Hugging Face are parsed correctly, so we use our own dataset reader
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD.
{input}
A) {A}
B) {B}
C) {C}
D) {D}
""".strip()
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {input}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
mmlu_reader_cfg = dict(
input_columns=['input', 'A', 'B', 'C', 'D'],
output_column='target',
train_split='dev')
mmlu_datasets = []
for name in mmlu_all_sets:
mmlu_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
mmlu_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
mmlu_datasets.append(
dict(
abbr=f'lukaemon_mmlu_{name}',
type=MMLUDataset,
path='opencompass/mmlu',
name=name,
reader_cfg=mmlu_reader_cfg,
infer_cfg=mmlu_infer_cfg,
eval_cfg=mmlu_eval_cfg,
mode='singlescore',
))

View File

@ -0,0 +1,3 @@
mmlu_all_sets = [
'abstract_algebra', 'anatomy', 'astronomy', 'college_biology', 'college_chemistry', 'college_computer_science', 'college_mathematics', 'college_physics', 'computer_security', 'conceptual_physics', 'electrical_engineering', 'elementary_mathematics', 'high_school_biology', 'high_school_chemistry', 'high_school_computer_science', 'high_school_mathematics', 'high_school_physics', 'high_school_statistics', 'machine_learning'
]

View File

@ -0,0 +1,15 @@
from opencompass.models import TurboMindModelwithChatTemplate
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='QwQ-32B-Preview',
path='Qwen/QwQ-32B-Preview',
engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=8192),
max_seq_len=32768,
max_out_len=8192,
batch_size=16,
run_cfg=dict(num_gpus=2),
)
]
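
For readers unfamiliar with the TurboMind backend, the engine_config and gen_config fields above map roughly onto lmdeploy's pipeline API. A hedged sketch, assuming lmdeploy's TurbomindEngineConfig/GenerationConfig names (nothing below is part of this commit):

from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline

# Roughly what the OpenCompass wrapper sets up under the hood (illustrative).
pipe = pipeline(
    'Qwen/QwQ-32B-Preview',
    backend_config=TurbomindEngineConfig(session_len=32768,
                                         max_batch_size=16,
                                         tp=2),
)
outputs = pipe(
    ['How many positive divisors does 360 have?'],
    gen_config=GenerationConfig(top_k=1,
                                temperature=1e-6,
                                top_p=0.9,
                                max_new_tokens=8192),
)
print(outputs[0].text)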

View File

@ -0,0 +1,16 @@
from opencompass.models import TurboMindModelwithChatTemplate
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='Skywork-o1-Open-Llama-3_1-8B-turbomind',
path='Skywork/Skywork-o1-Open-Llama-3.1-8B',
engine_config=dict(max_batch_size=16, tp=1),
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096),
max_seq_len=16384,
max_out_len=8192,
batch_size=16,
run_cfg=dict(num_gpus=1),
stop_words=['<|end_of_text|>', '<|eot_id|>'],
)
]
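
A minimal sketch of how the model configs above and the dataset configs earlier in this diff are typically composed into a runnable evaluation config. The module paths, aliases, and launch command are illustrative assumptions, and datasets that use LMEvaluator additionally need a judge model wired in (the exact mechanism depends on the OpenCompass version):

# configs/eval_qwq_skywork_o1.py (hypothetical entry config)
from mmengine.config import read_base

with read_base():
    # dataset configs added in this commit (module paths assumed)
    from .datasets.aime2024.aime2024_llmjudge_gen import aime2024_datasets
    from .datasets.bigcodebench.bigcodebench_hard_instruct_gen_8815eb import \
        bigcodebench_hard_instruct_datasets
    # model configs added in this commit (module paths assumed)
    from .models.qwq.lmdeploy_qwq_32b_preview import models as qwq_models
    from .models.skywork.lmdeploy_skywork_o1_llama_3_1_8b import \
        models as skywork_models

datasets = [*aime2024_datasets, *bigcodebench_hard_instruct_datasets]
models = [*qwq_models, *skywork_models]

# Launch (assumed entry point): python run.py configs/eval_qwq_skywork_o1.py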

View File

@ -10,6 +10,7 @@ from .arc_prize_public_evaluation import * # noqa: F401, F403
from .ax import * # noqa: F401, F403
from .babilong import * # noqa: F401, F403
from .bbh import * # noqa: F401, F403
from .bigcodebench import * # noqa: F401, F403
from .boolq import * # noqa: F401, F403
from .bustum import * # noqa: F401, F403
from .c3 import * # noqa: F401, F403
@ -49,6 +50,7 @@ from .flores import * # noqa: F401, F403
from .game24 import * # noqa: F401, F403
from .gaokao_math import * # noqa: F401, F403
from .GaokaoBench import * # noqa: F401, F403
from .generic import * # noqa: F401, F403
from .govrepcrs import * # noqa: F401, F403
from .gpqa import * # noqa: F401, F403
from .gsm8k import * # noqa: F401, F403
@ -73,6 +75,8 @@ from .LCBench import * # noqa: F401, F403
from .lcsts import * # noqa: F401, F403
from .leval import * # noqa: F401, F403
from .livecodebench import * # noqa: F401, F403
from .livemathbench import * # noqa: F401, F403
from .livereasonbench import * # noqa: F401, F403
from .llm_compression import LLMCompressionDataset # noqa: F401, F403
from .longbench import * # noqa: F401, F403
from .lveval import * # noqa: F401, F403

View File

@ -12,7 +12,7 @@ from .base import BaseDataset
class Aime2024Dataset(BaseDataset):
@staticmethod
def load(path):
def load(path, **kwargs):
path = get_data_path(path)
dataset = []
with open(path, 'r') as f:

View File

@ -0,0 +1 @@
from .bigcodebench import BigCodeBenchDataset, BigCodeBenchEvaluator # noqa

View File

@ -0,0 +1,169 @@
# Copyright (c) 2024, BigCodeBench and its contributors.
# Copyright (c) 2023, OpenCompass and its contributors.
import os
import time
from concurrent.futures._base import CancelledError
import httpx
from datasets import Dataset, DatasetDict
from gradio_client import Client, handle_file
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.utils import JSONToolkit # noqa: F401, F403
from opencompass.utils import (check_url_accessibility, get_data_path,
get_logger, setup_proxies)
from ..base import BaseDataset
from .extractor import extract_code_generation
class BigCodeBenchDataset(BaseDataset):
@staticmethod
def load(path: str = 'opencompass/bigcodebench',
local_mode: bool = False,
release_version: str = 'v0.1.2',
dataset_version: str = 'full'):
"""
Args:
path (str): The path to the dataset.
local_mode (bool): Whether to load from the given local path instead
of downloading the dataset automatically.
release_version (str): The release version of the dataset.
dataset_version (str): The data version of the dataset;
only 'full' and 'hard' are supported.
"""
assert dataset_version in ['full', 'hard'], \
('dataset_version should be one of ["full", "hard"], '
f'but got {dataset_version}')
path = get_data_path(path, local_mode=local_mode)
dataset = DatasetDict()
# Valid Keys:
# 'task_id', 'complete_prompt', 'instruct_prompt',
# 'canonical_solution', 'code_prompt', 'test',
# 'entry_point', 'doc_struct', 'libs'
if dataset_version == 'full':
items = JSONToolkit.read_jsonl(
os.path.join(path, f'BigCodeBench-{release_version}.jsonl'))
else:
items = JSONToolkit.read_jsonl(
os.path.join(path,
f'BigCodeBench-Hard-{release_version}.jsonl'))
dataset['train'] = Dataset.from_list(items)
dataset['test'] = Dataset.from_list(items)
return dataset
class BigCodeBenchEvaluator(BaseEvaluator):
"""Evaluator for BigCodeBench.
Args:
release_version (str): Release version of BigCodeBench.
eval_type (str): Type of evaluation, either 'complete' or 'instruct'.
remote_execute_api (str): URL of the remote execution service.
dataset_version (str): Dataset variant to evaluate, 'full' or 'hard'.
"""
def __init__(
self,
release_version='v0.1.2',
eval_type='instruct',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', # noqa
dataset_version: str = 'full',
pass_k: str = '1,5,10',
parallel: int = -1,
min_time_limit: float = 1,
max_as_limit: int = 30 * 1024,
max_data_limit: int = 30 * 1024,
max_stack_limit: int = 10,
check_gt_only: bool = False,
no_gt: bool = False):
super().__init__()
self.dataset = BigCodeBenchDataset.load(
release_version=release_version,
dataset_version=dataset_version)['test']
self.eval_type = eval_type
self.remote_execute_api = remote_execute_api
self.eval_kwargs = dict(subset=dataset_version,
pass_k=pass_k,
parallel=parallel,
min_time_limit=min_time_limit,
max_as_limit=max_as_limit,
max_data_limit=max_data_limit,
max_stack_limit=max_stack_limit,
check_gt_only=check_gt_only,
no_gt=no_gt)
def score(self, predictions, references):
logger = get_logger()
entrypoints = [item['entry_point'] for item in self.dataset]
# For the 'complete' split, prepend the original prompt to the model completion
if self.eval_type == 'complete':
content = [item['complete_prompt'] for item in self.dataset]
predictions = [
content[idx] + item for idx, item in enumerate(predictions)
]
elif self.eval_type == 'instruct':
pass
else:
raise ValueError(f'Unknown eval_type: {self.eval_type}')
# Sanitize predictions for execution
logger.info('Start to extract code from predictions')
sanitized_predictions = []
for prediction, entrypoint in zip(predictions, entrypoints):
sanitized_prediction = extract_code_generation(
prediction, entrypoint=entrypoint)
sanitized_predictions.append(sanitized_prediction)
# Prepare for submission
submitted_contents = []
task_ids = [item['task_id'] for item in self.dataset]
for task_id, sanitized_prediction in zip(task_ids,
sanitized_predictions):
submitted_content = {
'task_id': task_id,
'solution': sanitized_prediction
}
submitted_contents.append(submitted_content)
submitted_contents_path = os.path.join(
self._out_dir, 'bigcodebench_submitted_contents.jsonl')
JSONToolkit.save_jsonl(submitted_contents, submitted_contents_path)
logger.info(f'Dump submitted contents to {submitted_contents_path}')
logger.info(
f'Start to connect to {self.remote_execute_api} for evaluating')
# Conduct evaluation with Eval Client
proxies = setup_proxies('BIGCODEBENCH_EVAL_PROXY_URL')
is_accessible, status_code = check_url_accessibility(
self.remote_execute_api)
if not is_accessible:
logger.error(f'Failed to connect to {self.remote_execute_api} '
f'with status code {status_code}')
return False
while True:
try:
eval_client = Client(self.remote_execute_api,
httpx_kwargs=dict(proxies=proxies))
results, pass_at_k = eval_client.predict(
split=self.eval_type,
samples=handle_file(submitted_contents_path),
api_name='/predict',
**self.eval_kwargs)
break
except (httpx.ReadTimeout, CancelledError):
logger.info('Read timeout error. Retrying in 4s...')
time.sleep(4)
dump_results = {'details': results}
dump_results.update(pass_at_k)
return dump_results
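
A short, hedged sketch of exercising the loader above outside the framework; normally OpenCompass drives both classes through the eval_cfg entries earlier in this diff, and the evaluator additionally needs network access to the remote execution Space:

# Illustrative only; assumes the BigCodeBench jsonl files are resolvable
# through get_data_path (downloaded or cached locally).
from opencompass.datasets import BigCodeBenchDataset

ds = BigCodeBenchDataset.load(dataset_version='hard',
                              release_version='v0.1.2')['test']
sample = ds[0]
print(sample['entry_point'])            # function name the hidden tests call
print(sample['instruct_prompt'][:200])  # natural-language task description
# An optional proxy for the remote evaluator can be supplied via the
# BIGCODEBENCH_EVAL_PROXY_URL environment variable (see setup_proxies above).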

View File

@ -0,0 +1,192 @@
# Copyright (c) 2024, BigCodeBench and its contributors.
# Copyright (c) 2023, OpenCompass and its contributors.
import ast
import traceback
from typing import Dict, Generator, List, Optional, Set, Tuple
from tree_sitter import Node
from tree_sitter_languages import get_parser
CLASS_TYPE = 'class_definition'
FUNCTION_TYPE = 'function_definition'
IMPORT_TYPE = ['import_statement', 'import_from_statement']
IDENTIFIER_TYPE = 'identifier'
ATTRIBUTE_TYPE = 'attribute'
RETURN_TYPE = 'return_statement'
EXPRESSION_TYPE = 'expression_statement'
ASSIGNMENT_TYPE = 'assignment'
def syntax_check(code, verbose=False):
try:
ast.parse(code)
return True
except (SyntaxError, MemoryError):
if verbose:
traceback.print_exc()
return False
def code_extract(text: str) -> str:
lines = text.split('\n')
longest_line_pair = (0, 0)
longest_so_far = 0
for i in range(len(lines)):
for j in range(i + 1, len(lines)):
current_lines = '\n'.join(lines[i:j + 1])
if syntax_check(current_lines):
current_length = sum(1 for line in lines[i:j + 1]
if line.strip())
if current_length > longest_so_far:
longest_so_far = current_length
longest_line_pair = (i, j)
return '\n'.join(lines[longest_line_pair[0]:longest_line_pair[1] + 1])
def get_deps(nodes: List[Tuple[str, Node]]) -> Dict[str, Set[str]]:
def dfs_get_deps(node: Node, deps: Set[str]) -> None:
for child in node.children:
if child.type == IDENTIFIER_TYPE:
deps.add(child.text.decode('utf8'))
else:
dfs_get_deps(child, deps)
name2deps = {}
for name, node in nodes:
deps = set()
dfs_get_deps(node, deps)
name2deps[name] = deps
return name2deps
def get_function_dependency(entrypoint: str,
call_graph: Dict[str, str]) -> Set[str]:
queue = [entrypoint]
visited = {entrypoint}
while queue:
current = queue.pop(0)
if current not in call_graph:
continue
for neighbour in call_graph[current]:
if not (neighbour in visited):
visited.add(neighbour)
queue.append(neighbour)
return visited
def get_definition_name(node: Node) -> str:
for child in node.children:
if child.type == IDENTIFIER_TYPE:
return child.text.decode('utf8')
def traverse_tree(node: Node) -> Generator[Node, None, None]:
cursor = node.walk()
depth = 0
visited_children = False
while True:
if not visited_children:
yield cursor.node
if not cursor.goto_first_child():
depth += 1
visited_children = True
elif cursor.goto_next_sibling():
visited_children = False
elif not cursor.goto_parent() or depth == 0:
break
else:
depth -= 1
def has_return_statement(node: Node) -> bool:
traverse_nodes = traverse_tree(node)
for node in traverse_nodes:
if node.type == RETURN_TYPE:
return True
return False
def extract_target_code_or_empty(code: str,
entrypoint: Optional[str] = None) -> str:
code = code_extract(code.strip())
code_bytes = bytes(code, 'utf8')
parser = get_parser('python')
tree = parser.parse(code_bytes)
class_names = set()
function_names = set()
variable_names = set()
root_node = tree.root_node
import_nodes = []
definition_nodes = []
for child in root_node.children:
if child.type in IMPORT_TYPE:
import_nodes.append(child)
elif child.type == CLASS_TYPE:
name = get_definition_name(child)
if not (name in class_names or name in variable_names
or name in function_names):
definition_nodes.append((name, child))
class_names.add(name)
elif child.type == FUNCTION_TYPE:
name = get_definition_name(child)
if not (name in function_names or name in variable_names
or name in class_names):
definition_nodes.append((name, child))
function_names.add(get_definition_name(child))
elif (child.type == EXPRESSION_TYPE
and child.children[0].type == ASSIGNMENT_TYPE):
subchild = child.children[0]
name = get_definition_name(subchild)
if not (name in variable_names or name in function_names
or name in class_names):
definition_nodes.append((name, subchild))
variable_names.add(name)
if entrypoint:
name2deps = get_deps(definition_nodes)
reachable = get_function_dependency(entrypoint, name2deps)
sanitized_output = b''
for node in import_nodes:
sanitized_output += code_bytes[node.start_byte:node.end_byte] + b'\n'
for pair in definition_nodes:
name, node = pair
if entrypoint and not (name in reachable):
continue
sanitized_output += code_bytes[node.start_byte:node.end_byte] + b'\n'
sanitized_output = sanitized_output[:-1].decode('utf8')
# ad-hoc approach to remove unnecessary lines, but it works
lines = sanitized_output.splitlines()
outer_lines = []
for i in range(len(lines) - 1, -1, -1):
if lines[i].startswith(' '):
break
if not lines[i].startswith(' ') and entrypoint in lines[i]:
outer_lines.append(i)
if outer_lines:
sanitized_output = '\n'.join(lines[:outer_lines[-1]])
return sanitized_output
def extract_code_generation(model_output: str,
entrypoint: Optional[str] = None):
# Extract code according to the entrypoint
sanitized_code = extract_target_code_or_empty(model_output,
entrypoint).strip()
# Fallback to extract first codeblock if sanitized_code is empty
sanitized_code = code_extract(
model_output) if not sanitized_code else sanitized_code
return sanitized_code
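
A small, self-contained illustration of the sanitizer above. The model output is made up, the entrypoint name follows BigCodeBench's task_func convention, and the module path is assumed from the imports elsewhere in this commit:

from opencompass.datasets.bigcodebench.extractor import extract_code_generation

model_output = '''Sure, here is my solution:
```python
import math

def helper(x):
    return x * x

def task_func(x):
    return math.sqrt(helper(x))
```
Hope this helps!'''

cleaned = extract_code_generation(model_output, entrypoint='task_func')
# Keeps the import plus the definitions reachable from `task_func`
# (here both `helper` and `task_func`) and drops the surrounding chatter.
print(cleaned)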

View File

@ -14,7 +14,7 @@ from .base import BaseDataset
class CMMLUDataset(BaseDataset):
@staticmethod
def load(path: str, name: str):
def load(path: str, name: str, **kwargs):
path = get_data_path(path)
if environ.get('DATASET_SOURCE') == 'ModelScope':
from modelscope import MsDataset

View File

@ -0,0 +1,71 @@
import re
def get_final_results(judged_answers, references, original_responses):
count = 0
is_correct_count = 0
is_incorrect_count = 0
is_not_attempted_count = 0
details = []
for i, j, k in zip(judged_answers, references, original_responses):
match = re.search(r'(A|B)', i)
grade_letter = match.group(
0) if match else 'B' # Default to "INCORRECT" if no match
detail = {
'pred': k,
'ref': j,
'origin_grade_response': i,
'grade_letter': grade_letter,
'correct': False
}
count += 1
if grade_letter == 'A':
is_correct_count += 1
detail['correct'] = True
elif grade_letter == 'B':
is_incorrect_count += 1
else:
is_not_attempted_count += 1
details.append(detail)
is_correct = is_correct_count / count
is_incorrect = is_incorrect_count / count
# is_not_attempted = is_not_attempted_count / count
is_given_attempted = is_correct + is_incorrect
accuracy_given_attempted = is_correct / is_given_attempted \
if is_given_attempted > 0 else 0
f1 = 2 * accuracy_given_attempted * is_correct / (
accuracy_given_attempted + is_correct) if (accuracy_given_attempted +
is_correct) > 0 else 0
result = {
# 'accuracy_given_attempted': accuracy_given_attempted,
'accuracy': accuracy_given_attempted * 100,
'f1': f1,
'details': details
}
return result
def _generic_llmjudge_postprocess(judgement: str):
match = re.search(r'(A|B)', judgement)
grade_letter = match.group(
0) if match else 'B' # Default to "INCORRECT" if no match
return grade_letter
def generic_llmjudge_postprocess(
output: dict,
output_path: str,
) -> dict:
judged_answers = []
origial_responses = []
references = []
for k, v in output.items():
origial_responses.append(v['prediction'])
processed_judge = _generic_llmjudge_postprocess(v['prediction'])
if processed_judge is not None:
judged_answers.append(processed_judge)
references.append(v['gold'])
results = get_final_results(judged_answers, references, origial_responses)
results['details'] = output
return results
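# Minimal sketch (fabricated judge outputs) of the structure this
# post-processor consumes: each entry carries the judge model's raw verdict in
# 'prediction' and the reference answer in 'gold'.
def _llmjudge_postprocess_demo():
    example_output = {
        0: {'prediction': 'A', 'gold': '42'},
        1: {'prediction': 'B. The candidate contradicts the gold target.',
            'gold': 'Paris'},
    }
    # With one 'A' and one 'B' verdict, 'accuracy' is expected to be 50.0.
    return generic_llmjudge_postprocess(example_output, output_path='')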

View File

@ -16,7 +16,7 @@ from .base import BaseDataset
class GPQADataset(BaseDataset):
@staticmethod
def load(path: str, name: str):
def load(path: str, name: str, **kwargs):
path = get_data_path(path, local_mode=True)
cnt = 0
data = []

View File

@ -185,6 +185,11 @@ def humaneval_postprocess_v2(text: str) -> str:
text = blocks[0]
return text
def humaneval_postprocess_v3(text: str) -> str:
blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL)
if len(blocks) >= 1:
text = blocks[-1]
return text
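# Illustrative sketch (hypothetical completion): unlike humaneval_postprocess_v2,
# which keeps the first fenced block, v3 keeps the last one, which suits models
# that emit a draft before their final code block.
def _postprocess_v3_demo():
    example = ('Draft:\n```python\ndef add(a, b):\n    return a - b\n```\n'
               'Final answer:\n```python\ndef add(a, b):\n    return a + b\n```\n')
    # Expected: 'def add(a, b):\n    return a + b\n'
    return humaneval_postprocess_v3(example)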
def humaneval_internal_v2_postprocess(text: str):
if text.startswith(' ') and not text.startswith(' '):

View File

@ -17,40 +17,40 @@ class korbenchDataset(BaseDataset):
"""Dataset loader for the task in KOR-Bench."""
@staticmethod
def load(path, mode, category):
def load(path, prompt_mode, category, **kwargs):
"""Load the dataset using shared ."""
base_path = get_data_path(path)
rule_file = None
sample_file = None
mixed_file = None
mixed_data = None
if '0_shot' in mode or '3_shot' in mode:
if '0_shot' in prompt_mode or '3_shot' in prompt_mode:
rule_file = find_file(base_path, os.path.join(category, 'rule'))
sample_file = find_file(base_path,
os.path.join(category, 'sample'))
elif mode == 'mixed':
elif prompt_mode == 'mixed':
mixed_file = find_file(base_path, os.path.join('mixed', category))
mixed_data = load_json_or_jsonl(mixed_file) or []
else:
raise ValueError(f'Unsupported mode: {mode}')
raise ValueError(f'Unsupported prompt_mode: {prompt_mode}')
three_shot_file = None
if mode == '3_shot':
if prompt_mode == '3_shot':
ts_path = os.path.join(category, 'three-shot')
three_shot_file = find_file(base_path, ts_path)
# Load data
if mode in ['0_shot', '3_shot']:
if prompt_mode in ['0_shot', '3_shot']:
rules = load_json_or_jsonl(rule_file) or []
samples = load_json_or_jsonl(sample_file) or []
template_path = None
if mode == '0_shot':
if prompt_mode == '0_shot':
template_path = os.path.join(
os.path.dirname(__file__),
'korbench_dataset_config/prompt/0_shot.yaml')
elif mode == '3_shot':
elif prompt_mode == '3_shot':
template_path = os.path.join(
os.path.dirname(__file__),
'korbench_dataset_config/prompt/3_shot.yaml')
elif mode == 'mixed':
elif prompt_mode == 'mixed':
template_path = os.path.join(
os.path.dirname(__file__),
'korbench_dataset_config/prompt/mixed.yaml')
@ -62,7 +62,7 @@ class korbenchDataset(BaseDataset):
# Process data
data = []
if mode == '0_shot':
if prompt_mode == '0_shot':
for sample in samples:
rule_id = sample['rule_id']
rule = next((r for r in rules if r['idx'] == rule_id), None)
@ -81,13 +81,13 @@ class korbenchDataset(BaseDataset):
'answer': sample['answer'],
'prompt': prompt,
'rule_id': rule['idx'],
'mode': '0_shot',
'prompt_mode': '0_shot',
'category': category,
})
return Dataset.from_list(data)
if mode == '3_shot':
if prompt_mode == '3_shot':
data = []
three_shot = load_json_or_jsonl(three_shot_file) or []
for sample in samples:
@ -111,13 +111,13 @@ class korbenchDataset(BaseDataset):
'answer': sample['answer'],
'prompt': prompt,
'rule_id': rule['idx'],
'mode': '3_shot',
'prompt_mode': '3_shot',
'category': category,
})
return Dataset.from_list(data)
if mode == 'mixed':
if prompt_mode == 'mixed':
# Process data
data = []
for item in mixed_data:
@ -159,7 +159,7 @@ class korbenchDataset(BaseDataset):
'rule_list': rule_list,
'question_list': question_list,
'prompt': prompt,
'mode': 'mixed',
'prompt_mode': 'mixed',
'answer': '',
'base_path': base_path,
})
@ -174,14 +174,15 @@ class korbenchEvaluator(BaseEvaluator):
super().__init__()
def score(self, predictions, references, test_set):
"""Evaluate predictions for a single mode in KOR-Bench."""
"""Evaluate predictions for a single prompt_mode in KOR-Bench."""
if not test_set:
raise ValueError('Test set is empty.')
mode = test_set[0]['mode'] # Determine the mode from the first entry
prompt_mode = test_set[0][
'prompt_mode'] # Determine the prompt_mode from the first entry
data = {}
# Organize data for the given mode
# Organize data for the given prompt_mode
for i in range(len(predictions)):
entry = {
'prediction': predictions[i],
@ -195,18 +196,18 @@ class korbenchEvaluator(BaseEvaluator):
data[i] = entry
if not data:
raise ValueError(f"No data found for mode '{mode}'")
raise ValueError(f"No data found for prompt_mode '{prompt_mode}'")
# Evaluate based on the mode
if mode == '0_shot':
# Evaluate based on the prompt_mode
if prompt_mode == '0_shot':
evaluation_results = evaluate_responses(data, '0_shot')
elif mode == '3_shot':
elif prompt_mode == '3_shot':
evaluation_results = evaluate_responses(data, '3_shot')
elif mode in ['Multi-Q', 'Multi-R', 'Multi-RQ', 'mixed']:
elif prompt_mode in ['Multi-Q', 'Multi-R', 'Multi-RQ', 'mixed']:
evaluation_results = evaluate_responses(data, 'mixed',
test_set[0]['base_path'])
else:
raise ValueError(f'Unsupported mode: {mode}')
raise ValueError(f'Unsupported prompt_mode: {prompt_mode}')
# Calculate accuracy
correct_count = sum(res['is_correct'] for res in evaluation_results)
accuracy = (correct_count / len(evaluation_results)) * 100
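# Usage sketch (path and category values are illustrative): both the loader and
# the rows it produces now use `prompt_mode`, and the evaluator above reads that
# field from the first test_set entry to pick the matching evaluation routine.
def _korbench_load_demo():
    return korbenchDataset.load(
        path='opencompass/korbench',  # assumed dataset key, resolved by get_data_path
        prompt_mode='0_shot',         # one of '0_shot', '3_shot', 'mixed'
        category='cipher',            # a KOR-Bench task category
    )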

View File

@ -13,6 +13,7 @@ from opencompass.utils import get_logger
from .execute_utils import BASE_IMPORTS, codeexecute_check_correctness
from .extract_utils import (extract_code_execution, extract_code_generation,
extract_code_generation_v2,
extract_test_output_code)
from .livecodebench import LCBCodeGenerationDataset
from .pass_k_utils import compute_metrics_from_results
@ -231,15 +232,22 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
def __init__(self,
num_process_evaluate,
timeout=6,
release_version='release_v1'):
release_version='release_v1',
extractor_version='v1'):
super().__init__()
self.num_process_evaluate = num_process_evaluate
self.timeout = timeout
self.dataset = LCBCodeGenerationDataset.load(
release_version=release_version)['test']
self.extractor_version = extractor_version
def score(self, predictions, references):
predictions = [[extract_code_generation(item)] for item in predictions]
if self.extractor_version == 'v1':
predictions = [[extract_code_generation(item)]
for item in predictions]
elif self.extractor_version == 'v2':
predictions = [[extract_code_generation_v2(item)]
for item in predictions]
evaluation_samples = dict()
for idx in range(len(self.dataset)):
@ -252,12 +260,9 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
BaseEvaluator.is_num_equal(predictions, references)
results = { # noqa: F841
'pass': 0,
'timeout': 0,
'failed': 0,
'wrong_answer': 0
} # noqa: F401, F403
extracted_predictions = {}
for idx, content in enumerate(predictions):
extracted_predictions[idx] = content
metrics, eval_results, final_metadata = codegen_metrics(
references,
@ -266,8 +271,13 @@ class LCBCodeGenerationEvaluator(BaseEvaluator):
num_process_evaluate=self.num_process_evaluate,
timeout=self.timeout,
)
results = {
'extracted_predictions': extracted_predictions,
'eval_results': eval_results
}
results.update(metrics)
return metrics
return results
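# Configuration sketch (values are illustrative): extractor_version='v2'
# switches post-processing to extract_code_generation_v2, so only the last
# fenced block of each completion is evaluated.
def _lcb_evaluator_demo():
    return LCBCodeGenerationEvaluator(
        num_process_evaluate=4,
        timeout=6,
        release_version='release_v1',
        extractor_version='v2',
    )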
def evaluate_score(args) -> list[bool]:

View File

@ -8,6 +8,22 @@ def extract_code_generation(model_output: str, model_type: str = 'chat'):
outputlines = model_output.split('\n')
# TODO: handle codellama
if model_type == 'base':
return model_output.strip()
elif model_type == 'chat':
indexlines = [i for i, line in enumerate(outputlines) if '```' in line]
else:
raise ValueError(f'Invalid model_type: {model_type}')
if len(indexlines) < 2:
return ''
return '\n'.join(outputlines[indexlines[0] + 1:indexlines[1]])
def extract_code_generation_v2(model_output: str, model_type: str = 'chat'):
# modified from
outputlines = model_output.split('\n')
# TODO: handle codellama
if model_type == 'base':
return model_output.strip()
elif model_type == 'chat':
@ -17,6 +33,10 @@ def extract_code_generation(model_output: str, model_type: str = 'chat'):
if len(indexlines) < 2:
return ''
elif len(indexlines) > 2:
# Only Keep the last code block
indexlines = indexlines[-2:]
return '\n'.join(outputlines[indexlines[0] + 1:indexlines[1]])
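# Illustrative sketch (hypothetical completion): when a response contains more
# than one fenced block, v2 keeps only the last one, whereas the original
# extractor above returns the first.
def _extract_v2_demo():
    example = ('```python\nprint("draft")\n```\n'
               'Revised solution:\n'
               '```python\nprint("final")\n```')
    # Expected: 'print("final")' from v2, 'print("draft")' from the v1 extractor.
    return extract_code_generation_v2(example, model_type='chat')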

View File

@ -12,6 +12,7 @@ from datasets import Dataset
from opencompass.models import OpenAISDK
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, MODELS
from opencompass.utils import get_data_path
from ..base import BaseDataset
from .prompts import (EXTRACT_PROMPT_CN, EXTRACT_PROMPT_EN, JUDGE_PROMPT_CN,
@ -31,6 +32,7 @@ class LiveMathBenchDataset(BaseDataset):
) -> List[Dict[str, Any]]:
dataset = []
dataset_info = {}
path = get_data_path(path)
for split, language in product(LiveMathBenchDataset.dataset_splits,
LiveMathBenchDataset.dataset_languages):
file_path = os.path.join(path, f'{split}_{language}.jsonl')
@ -101,10 +103,10 @@ class LiveMathBenchEvaluator(BaseEvaluator):
path=model_name,
openai_api_base=url,
key='EMPTY',
query_per_second=2,
query_per_second=128,
meta_template=self.api_meta_template,
temperature=kwargs.get('temperature', 0.01),
max_seq_len=kwargs.get('max_tokens', 2048),
temperature=kwargs.get('temperature', 0.001),
max_seq_len=kwargs.get('max_tokens', 16384),
)) for url in url
]
self.with_postprocess = with_postprocess

View File

@ -0,0 +1,2 @@
from .livereasonbench import LiveReasonBenchDataset # noqa: F401, F403
from .livereasonbench import livereasonbench_postprocess # noqa: F401, F403

View File

@ -0,0 +1,193 @@
# Edited from the official SimpleQA config: https://github.com/openai/simple-evals/blob/main/simpleqa_eval.py # noqa E501
import json
import os
import random
import re
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path
from ..base import BaseDataset
@LOAD_DATASET.register_module()
class LiveReasonBenchDataset(BaseDataset):
@staticmethod
def load(path: str,
num_examples: int | None = None,
n_repeats: int = 1,
version: str = 'livereasonbench-20241202',
**kwargs):
path = get_data_path(path)
dataset = DatasetDict()
# data = read
path = os.path.join(path, f'{version}.json')
with open(path, 'r', encoding='utf-8') as f:
examples = json.load(f)
if num_examples:
assert n_repeats == 1, \
'n_repeats only supported when num_examples = None'
rng = random.Random(0)
examples = rng.sample(examples, num_examples)
examples = examples * n_repeats
dataset['train'] = Dataset.from_list(examples)
dataset['test'] = Dataset.from_list(examples)
return dataset
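# Usage sketch (the version string is the default above): num_examples draws a
# fixed-seed random subsample, and n_repeats > 1 is only allowed when
# num_examples is None.
def _livereasonbench_load_demo():
    return LiveReasonBenchDataset.load(
        path='opencompass/LiveReasonBench',  # resolved via get_data_path
        num_examples=100,
        n_repeats=1,
        version='livereasonbench-20241202',
    )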
GRADER_TEMPLATE = """
Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
First, I will give examples of each grade, and then you will grade a new example.
The following are examples of CORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia Obama and Sasha Obama
Predicted answer 1: sasha and malia obama
Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
```
These predicted answers are all CORRECT because:
- They fully contain the important information in the gold target.
- They do not contain any information that contradicts the gold target.
- Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
- Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
The following are examples of INCORRECT predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: Malia.
Predicted answer 2: Malia, Sasha, and Susan.
Predicted answer 3: Barack Obama does not have any children.
Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
Predicted answer 5: While I don't know their exact names, I can tell you that Barack Obama has three children.
Predicted answer 6: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
Predicted answer 7: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
```
These predicted answers are all INCORRECT because:
- A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
The following are examples of NOT_ATTEMPTED predicted answers.
```
Question: What are the names of Barack Obama's children?
Gold target: Malia and Sasha
Predicted answer 1: I don't know.
Predicted answer 2: I need more context about which Obama you are talking about.
Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
```
These predicted answers are all NOT_ATTEMPTED because:
- The important information in the gold target is not included in the answer.
- No statements in the answer contradict the gold target.
Also note the following things:
- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
- Predicted answers "120k", "124k", and 115k" are all CORRECT.
- Predicted answers "100k" and "113k" are INCORRECT.
- Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
- For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
- For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
- Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
- For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
- For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
- Do not punish for typos in people's name if it's clearly the same name.
- For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
C: NOT_ATTEMPTED
Just return the letters "A", "B", or "C", with no text around it.
Here is a new example. Simply reply with one of the letters "A", "B", or "C". Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
```
Question: {question}
Gold target: {gold_answer}
Predicted answer: {answer}
```
""".strip() # noqa E501
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
])
def get_final_results(judged_answers, references, origial_responses):
count = 0
is_correct_count = 0
is_incorrect_count = 0
is_not_attempted_count = 0
details = []
for i, j, k in zip(judged_answers, references, origial_responses):
match = re.search(r'(A|B|C)', i)
grade_letter = match.group(
0) if match else 'C' # Default to "NOT_ATTEMPTED" if no match
detail = {
'pred': k,
'ref': j,
'origin_grade_response': i,
'grade_letter': grade_letter,
'correct': False
}
count += 1
if grade_letter == 'A':
is_correct_count += 1
detail['correct'] = True
elif grade_letter == 'B':
is_incorrect_count += 1
else:
is_not_attempted_count += 1
details.append(detail)
is_correct = is_correct_count / count
is_incorrect = is_incorrect_count / count
# is_not_attempted = is_not_attempted_count / count
is_given_attempted = is_correct + is_incorrect
accuracy_given_attempted = is_correct / is_given_attempted \
if is_given_attempted > 0 else 0
f1 = 2 * accuracy_given_attempted * is_correct / (
accuracy_given_attempted + is_correct) if (accuracy_given_attempted +
is_correct) > 0 else 0
result = {
'accuracy_given_attempted': accuracy_given_attempted,
'f1': f1,
'details': details
}
return result
def _livereasonbench_postprocess(judgement: str):
match = re.search(r'(A|B|C)', judgement)
grade_letter = match.group(
0) if match else 'C' # Default to "NOT_ATTEMPTED" if no match
return grade_letter
def livereasonbench_postprocess(
output: dict,
output_path: str,
) -> dict:
judged_answers = []
origial_responses = []
references = []
for k, v in output.items():
origial_responses.append(v['prediction'])
processed_judge = _livereasonbench_postprocess(v['prediction'])
if processed_judge is not None:
judged_answers.append(processed_judge)
references.append(v['gold'])
results = get_final_results(judged_answers, references, origial_responses)
results['details'] = output
return results
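# Worked example (fabricated verdicts): 6 'A', 2 'B' and 2 'C' grades out of 10
# give accuracy_given_attempted = 0.6 / 0.8 = 0.75 and
# f1 = 2 * 0.75 * 0.6 / (0.75 + 0.6) ~= 0.667.
def _metric_demo():
    judged = ['A'] * 6 + ['B'] * 2 + ['C'] * 2
    refs = ['ref'] * 10
    preds = ['pred'] * 10
    return get_final_results(judged, refs, preds)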

View File

@ -141,7 +141,7 @@ def extract_answer(response_text: str):
class MATHDataset(BaseDataset):
@staticmethod
def load(path: str, file_name: str = 'math.json'):
def load(path: str, file_name: str = 'math.json', **kwargs):
path = get_data_path(path)
dataset = DatasetDict()
raw_data = []

View File

@ -15,7 +15,7 @@ from .base import BaseDataset
class MMLUDataset(BaseDataset):
@staticmethod
def load(path: str, name: str):
def load(path: str, name: str, **kwargs):
path = get_data_path(path)
dataset = DatasetDict()
if environ.get('DATASET_SOURCE') == 'ModelScope':

View File

@ -9,6 +9,7 @@ from typing import Dict, List, Optional, Union
import httpx
import jieba
import requests
from tqdm import tqdm
from opencompass.registry import MODELS
from opencompass.utils.prompt import PromptList
@ -19,6 +20,8 @@ PromptType = Union[PromptList, str]
OPENAI_API_BASE = os.path.join(
os.environ.get('OPENAI_BASE_URL', 'https://api.openai.com/v1/'),
'chat/completions')
OPENAISDK_API_BASE = os.environ.get('OPENAI_BASE_URL',
'https://api.openai.com/v1/')
O1_MODEL_LIST = [
'o1-preview-2024-09-12',
@ -170,9 +173,11 @@ class OpenAI(BaseAPIModel):
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs),
[temperature] * len(inputs)))
tqdm(executor.map(self._generate, inputs,
[max_out_len] * len(inputs),
[temperature] * len(inputs)),
total=len(inputs),
desc='Inferencing'))
return results
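# Standalone sketch of the pattern above (worker and prompts are placeholders;
# ThreadPoolExecutor and tqdm come from the module's existing imports): tqdm
# wraps executor.map so a progress bar is shown while results keep input order.
def _parallel_generate_demo(prompts, worker):
    with ThreadPoolExecutor() as executor:
        return list(
            tqdm(executor.map(worker, prompts),
                 total=len(prompts),
                 desc='Inferencing'))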
def _generate(self, input: PromptType, max_out_len: int,
@ -476,7 +481,7 @@ class OpenAISDK(OpenAI):
key: str | List[str] = 'ENV',
org: str | List[str] | None = None,
meta_template: Dict | None = None,
openai_api_base: str = OPENAI_API_BASE,
openai_api_base: str = OPENAISDK_API_BASE,
openai_proxy_url: Optional[str] = None,
mode: str = 'none',
logprobs: bool | None = False,

View File

@ -1,13 +1,45 @@
import copy
import os
import re
from abc import abstractmethod
from typing import List
from typing import List, Optional
from mmengine.config import ConfigDict
from opencompass.utils import get_infer_output_path, task_abbr_from_cfg
def extract_role_pred(s: str, begin_str: Optional[str],
end_str: Optional[str]) -> str:
"""Extract the role prediction from the full prediction string. The role
prediction may be the substring between the begin and end string.
Args:
s (str): Full prediction string.
begin_str (str): The beginning string of the role
end_str (str): The ending string of the role.
Returns:
str: The extracted role prediction.
"""
start = 0
end = len(s)
if begin_str and re.match(r'\s*', begin_str) is None:
begin_idx = s.find(begin_str)
if begin_idx != -1:
start = begin_idx + len(begin_str)
if end_str and re.match(r'\s*', end_str) is None:
# TODO: Support calling tokenizer for the accurate eos token
# and avoid such hardcode
end_idx = s.find(end_str, start)
if end_idx != -1:
end = end_idx
return s[start:end]
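# Usage sketch (marker strings are illustrative): per the docstring, the helper
# is meant to keep only the substring between a role's begin and end markers,
# e.g. the assistant turn of a chat-formatted prediction.
def _extract_role_pred_demo():
    full_pred = '<|im_start|>assistant\nThe answer is 42.<|im_end|>'
    # Intended result: 'The answer is 42.'
    return extract_role_pred(full_pred,
                             begin_str='<|im_start|>assistant\n',
                             end_str='<|im_end|>')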
class BaseTask:
"""Base class for all tasks. There are two ways to run the task:
1. Directly by calling the `run` method.

View File

@ -4,13 +4,12 @@ import fnmatch
import math
import os
import os.path as osp
import re
import statistics
import sys
import time
from collections import Counter
from inspect import signature
from typing import List, Optional
from typing import List
import mmengine
from mmengine.config import Config, ConfigDict
@ -18,43 +17,12 @@ from mmengine.utils import mkdir_or_exist
from opencompass.registry import (ICL_EVALUATORS, MODELS, TASKS,
TEXT_POSTPROCESSORS)
from opencompass.tasks.base import BaseTask
from opencompass.tasks.base import BaseTask, extract_role_pred
from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
get_infer_output_path, get_logger,
task_abbr_from_cfg)
def extract_role_pred(s: str, begin_str: Optional[str],
end_str: Optional[str]) -> str:
"""Extract the role prediction from the full prediction string. The role
prediction may be the substring between the begin and end string.
Args:
s (str): Full prediction string.
begin_str (str): The beginning string of the role
end_str (str): The ending string of the role.
Returns:
str: The extracted role prediction.
"""
start = 0
end = len(s)
if begin_str and re.match(r'\s*', begin_str) is None:
begin_idx = s.find(begin_str)
if begin_idx != -1:
start = begin_idx + len(begin_str)
if end_str and re.match(r'\s*', end_str) is None:
# TODO: Support calling tokenizer for the accurate eos token
# and avoid such hardcode
end_idx = s.find(end_str, start)
if end_idx != -1:
end = end_idx
return s[start:end]
@TASKS.register_module()
class OpenICLEvalTask(BaseTask):
"""OpenICL Evaluation Task.

View File

@ -12,8 +12,7 @@ from mmengine.config import Config, ConfigDict
from mmengine.utils import mkdir_or_exist
from opencompass.registry import ICL_EVALUATORS, MODELS, TEXT_POSTPROCESSORS
from opencompass.tasks.base import BaseTask
from opencompass.tasks.openicl_eval import extract_role_pred
from opencompass.tasks.base import BaseTask, extract_role_pred
from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
deal_with_judge_model_abbr, get_data_path,
get_infer_output_path, get_logger,

View File

@ -11,6 +11,7 @@ from .lark import * # noqa
from .logging import * # noqa
from .menu import * # noqa
from .model_postprocessors import * # noqa
from .network import * # noqa
from .postprocessors import * # noqa
from .prompt import * # noqa
from .text_postprocessors import * # noqa

View File

@ -357,6 +357,21 @@ DATASETS_MAPPING = {
"ms_id": "",
"hf_id": "",
"local": "./data/simpleqa/simple_qa_test_set.csv",
},
"opencompass/LiveMathBench202412": {
"ms_id": "",
"hf_id": "",
"local": "./data/LiveMathBench/",
},
"opencompass/LiveReasonBench": {
"ms_id": "",
"hf_id": "",
"local": "./data/LiveReasonBench/",
},
"opencompass/bigcodebench": {
"ms_id": "",
"hf_id": "",
"local": "./data/bigcodebench/",
}
}
@ -584,5 +599,13 @@ DATASETS_URL = {
"P-MMEval": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/pmmeval.zip",
"md5": "09e401e6229a50647b9e13c429e634d1",
},
"LiveMathBench": {
'url': "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/LiveMathBench.zip",
"md5": "789df4604260d5cf3ba7a891077cf6a0",
},
"bigcodebench": {
"url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bigcodebench.zip",
"md5": "2c1c7956ca49a1124617e8c037ec57d8"
}
}

View File

@ -1,6 +1,7 @@
import gzip
import hashlib
import io
import json
import os
import os.path
import shutil
@ -10,10 +11,156 @@ import urllib.error
import urllib.request
import zipfile
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
import mmengine.fileio as fileio
from mmengine.fileio import LocalBackend, get_file_backend
from .logging import get_logger
logger = get_logger()
class JSONToolkit:
"""A toolkit for handling JSON and JSONL file operations."""
@staticmethod
def read_json(file_path: Union[str, Path]) -> Dict[str, Any]:
"""Read a JSON file and return its contents as a dictionary.
Args:
file_path: Path to the JSON file
Returns:
Dictionary containing the JSON data
Raises:
FileNotFoundError: If the file doesn't exist
json.JSONDecodeError: If the file contains invalid JSON
"""
file_path = Path(file_path)
try:
with file_path.open('r', encoding='utf-8') as f:
return json.load(f)
except FileNotFoundError:
logger.error(f'File not found: {file_path}')
raise
except json.JSONDecodeError as e:
logger.error(f'Invalid JSON in file {file_path}: {str(e)}')
raise
@staticmethod
def read_jsonl(file_path: Union[str, Path]) -> List[Dict[str, Any]]:
"""Read a JSONL file and return its contents as a list of dictionaries.
Args:
file_path: Path to the JSONL file
Returns:
List of dictionaries, each representing a JSON line
Raises:
FileNotFoundError: If the file doesn't exist
json.JSONDecodeError: If any line contains invalid JSON
"""
file_path = Path(file_path)
results = []
try:
with file_path.open('r', encoding='utf-8') as f:
for line_num, line in enumerate(f, 1):
line = line.strip()
if not line: # Skip empty lines
continue
try:
results.append(json.loads(line))
except json.JSONDecodeError as e:
logger.error(
f'Invalid JSON on line {line_num}: {str(e)}')
raise
except FileNotFoundError:
logger.error(f'File not found: {file_path}')
raise
return results
@staticmethod
def save_json(data: Dict[str, Any],
file_path: Union[str, Path],
indent: Optional[int] = 2) -> None:
"""Save a dictionary as a JSON file.
Args:
data: Dictionary to save
file_path: Path where to save the JSON file
indent: Number of spaces for indentation
(None for no pretty printing)
Raises:
TypeError: If data is not JSON serializable
"""
file_path = Path(file_path)
file_path.parent.mkdir(parents=True, exist_ok=True)
try:
with file_path.open('w', encoding='utf-8') as f:
json.dump(data, f, indent=indent, ensure_ascii=False)
logger.info(f'Successfully saved JSON to {file_path}')
except TypeError as e:
logger.error(f'Data is not JSON serializable: {str(e)}')
raise
@staticmethod
def save_jsonl(data: List[Dict[str, Any]], file_path: Union[str,
Path]) -> None:
"""Save a list of dictionaries as a JSONL file.
Args:
data: List of dictionaries to save
file_path: Path where to save the JSONL file
Raises:
TypeError: If any item in data is not JSON serializable
"""
file_path = Path(file_path)
file_path.parent.mkdir(parents=True, exist_ok=True)
try:
with file_path.open('w', encoding='utf-8') as f:
for item in data:
json_line = json.dumps(item, ensure_ascii=False)
f.write(json_line + '\n')
logger.info(f'Successfully saved JSONL to {file_path}')
except TypeError as e:
logger.error(f'Data is not JSON serializable: {str(e)}')
raise
@staticmethod
@contextmanager
def jsonl_writer(file_path: Union[str, Path]):
"""Context manager for writing JSONL files line by line.
Args:
file_path: Path where to save the JSONL file
Yields:
Function to write individual JSON lines
"""
file_path = Path(file_path)
file_path.parent.mkdir(parents=True, exist_ok=True)
def write_line(data: Dict[str, Any]):
nonlocal f
json_line = json.dumps(data, ensure_ascii=False)
f.write(json_line + '\n')
try:
with file_path.open('w', encoding='utf-8') as f:
yield write_line
logger.info(f'Successfully saved JSONL to {file_path}')
except TypeError as e:
logger.error(f'Data is not JSON serializable: {str(e)}')
raise
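# Usage sketch (file paths are illustrative): round-trip a record list through
# the JSONL helpers, then re-dump it incrementally with the streaming writer.
def _json_toolkit_demo(tmp_dir: str = '/tmp/oc_json_demo'):
    records = [{'idx': 0, 'answer': '42'}, {'idx': 1, 'answer': 'Paris'}]
    JSONToolkit.save_jsonl(records, f'{tmp_dir}/records.jsonl')
    loaded = JSONToolkit.read_jsonl(f'{tmp_dir}/records.jsonl')
    with JSONToolkit.jsonl_writer(f'{tmp_dir}/stream.jsonl') as write_line:
        for row in loaded:
            write_line(row)
    return loaded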
def patch_func(module, fn_name_to_wrap):
backup = getattr(patch_func, '_backup', [])

View File

@ -0,0 +1,142 @@
import os
import platform
import socket
import subprocess
from typing import Dict, Optional, Tuple
import requests
from .logging import get_logger
logger = get_logger()
def setup_proxies(proxy_env_name):
try:
proxy_url = os.environ[proxy_env_name]
if not proxy_url:
raise ValueError('Proxy URL environment variable is empty')
# Validate proxy URL format
if not proxy_url.startswith(('http://', 'https://')):
raise ValueError('Proxy URL must start with http:// or https://')
proxies = {'http': proxy_url, 'https': proxy_url}
return proxies
except KeyError:
# Handle a missing proxy_env_name environment variable
logger.warning(f'{proxy_env_name} environment variable not found')
return {}
except ValueError as e:
# Handle invalid proxy URL format
logger.error(f'Invalid proxy configuration: {str(e)}')
return None
except Exception as e:
# Handle any unexpected errors
logger.error(f'Unexpected error while setting up proxies: {str(e)}')
return None
def check_network_connectivity(
host: str = '8.8.8.8',
port: int = 53,
timeout: float = 3,
proxies: Optional[Dict[str, str]] = None) -> Tuple[bool, str]:
"""Check network connectivity using multiple methods with optional proxy
support.
Args:
host: str, target host to check (default: Google DNS "8.8.8.8")
port: int, target port to check (default: 53 for DNS)
timeout: float, timeout in seconds (default: 3)
proxies: Optional[Dict[str, str]], proxy configuration (default: None)
Example: {
'http': 'http://proxy:8080',
'https': 'https://proxy:8080'
}
Returns:
Tuple[bool, str]: (is_connected, message)
"""
# Method 1: Socket connection test (direct connection, no proxy)
def check_socket() -> bool:
try:
socket.create_connection((host, port), timeout=timeout)
return True
except OSError:
return False
# Method 2: HTTP request test (supports proxy)
def check_http() -> bool:
try:
response = requests.get('http://www.google.com',
timeout=timeout,
proxies=proxies)
return response.status_code == 200
except requests.RequestException:
return False
# Method 3: Ping test (direct connection, no proxy)
def check_ping() -> bool:
param = '-n' if platform.system().lower() == 'windows' else '-c'
command = ['ping', param, '1', host]
try:
return subprocess.call(command,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL) == 0
except subprocess.SubprocessError:
return False
# Try all methods
is_socket_connected = check_socket()
is_http_connected = check_http()
is_ping_connected = check_ping()
# Generate detailed message including proxy information
status_msg = (
f'Network Status:\n'
f"Socket Test: {'Success' if is_socket_connected else 'Failed'}\n"
f"HTTP Test (via {'Proxy' if proxies else 'Direct'}): "
f"{'Success' if is_http_connected else 'Failed'}\n"
f"Ping Test: {'Success' if is_ping_connected else 'Failed'}")
# If using proxy, add proxy details to message
if proxies:
status_msg += '\nProxy Configuration:'
for protocol, proxy in proxies.items():
status_msg += f'\n {protocol}: {proxy}'
is_connected = any(
[is_socket_connected, is_http_connected, is_ping_connected])
logger.info(status_msg)
return is_connected, status_msg
def check_url_accessibility(
url: str,
timeout: float = 3,
proxies: Optional[Dict[str,
str]] = None) -> Tuple[bool, Optional[int]]:
"""Check if a specific URL is accessible through optional proxy.
Args:
url: str, target URL to check
timeout: float, timeout in seconds (default: 3)
proxies: Optional[Dict[str, str]], proxy configuration (default: None)
Example: {
'http': 'http://proxy:8080',
'https': 'https://proxy:8080'}
Returns:
Tuple[bool, Optional[int]]: (is_accessible, status_code)
"""
try:
response = requests.get(url, timeout=timeout, proxies=proxies)
return True, response.status_code
except requests.RequestException as e:
logger.error(f'Failed to access URL {url}: {str(e)}')
return False, None
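# Usage sketch (environment variable name and URL are assumptions): resolve a
# proxy setting, then probe general connectivity and one specific endpoint.
def _network_check_demo():
    proxies = setup_proxies('OPENCOMPASS_PROXY')
    if proxies is None:  # malformed proxy URL
        proxies = {}
    connected, report = check_network_connectivity(proxies=proxies or None)
    ok, status_code = check_url_accessibility('https://api.openai.com/v1/models',
                                              timeout=5,
                                              proxies=proxies or None)
    return connected, ok, status_code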

View File

@ -6,12 +6,14 @@ einops==0.5.0
evaluate>=0.3.0
func_timeout
fuzzywuzzy
gradio-client
h5py
huggingface_hub<=0.24.7
immutabledict
importlib-metadata
jieba
json5
jsonlines
mmengine-lite
nltk==3.8
numpy>=1.23.4,<2.0.0
@ -42,4 +44,6 @@ tokenizers>=0.13.3
torch>=1.13.1
tqdm>=4.64.1
transformers>=4.29.1
tree-sitter==0.21.3
tree_sitter_languages>=1.10.2
typer