From ddc9cc0afbf5d8c14a05f61c47bbecc80c21a92f Mon Sep 17 00:00:00 2001 From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> Date: Wed, 7 May 2025 10:57:23 +0800 Subject: [PATCH 01/28] [Add] add a config to Judge dataset all (#2077) * fix pip version * fix pip version * add judgedatasetall * add judgedatasetall * add judgedatasetall --- examples/eval_judge_dataset_all.py | 61 +++++++++++++ .../configs/summarizers/judgedataset_all.py | 90 +++++++++++++++++++ .../icl_evaluator/icl_judge_evaluator.py | 6 +- 3 files changed, 155 insertions(+), 2 deletions(-) create mode 100644 examples/eval_judge_dataset_all.py create mode 100644 opencompass/configs/summarizers/judgedataset_all.py diff --git a/examples/eval_judge_dataset_all.py b/examples/eval_judge_dataset_all.py new file mode 100644 index 00000000..4cc237f4 --- /dev/null +++ b/examples/eval_judge_dataset_all.py @@ -0,0 +1,61 @@ +from mmengine.config import read_base +with read_base(): + from opencompass.configs.datasets.judge.judgerbenchv2 import get_judgerbenchv2_dataset as get_judgerbenchv2_datasets + from opencompass.configs.datasets.judge.rmb import get_rmb_dataset as get_rmb_datasets + from opencompass.configs.datasets.judge.rewardbench import get_rewardbench_datasets + from opencompass.configs.datasets.judge.judgebench import get_judgebench_datasets + + from opencompass.configs.summarizers.judgedataset_all import summarizer +from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI +from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner +from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner +from opencompass.partitioners.sub_size import SubjectiveSizePartitioner +from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner +from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner +from opencompass.runners import SlurmSequentialRunner +from opencompass.tasks import OpenICLInferTask +from opencompass.tasks.subjective_eval import SubjectiveEvalTask +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask +from opencompass.models import TurboMindModelwithChatTemplate + + +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ] +) +datasets = sum( + (v for k, v in locals().items() if k.endswith('_datasets')), + [], +) + + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='qwen-7b-hf', + path='Qwen/Qwen-7B', + engine_config=dict(session_len=16384, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048), + max_seq_len=16384, + max_out_len=2048, + batch_size=16, + run_cfg=dict(num_gpus=1), + ), +] + + + +infer = dict( + partitioner=dict(type=NumWorkerPartitioner, num_worker=8), + runner=dict( + type=LocalRunner, + max_num_workers=72, + task=dict(type=OpenICLInferTask), + ), +) + + + +work_dir = './outputs/judge_dataset_all/' diff --git a/opencompass/configs/summarizers/judgedataset_all.py b/opencompass/configs/summarizers/judgedataset_all.py new file mode 100644 index 00000000..229d322e --- /dev/null +++ b/opencompass/configs/summarizers/judgedataset_all.py @@ -0,0 +1,90 @@ +Judge_all_summary_groups = [] + + +# RewardBench +_Chat_weights = { +'alpacaeval-easy': 0.32355305466237944, +'alpacaeval-length': 0.32355305466237944, +'alpacaeval-hard': 0.32355305466237944, +'mt-bench-easy': 0.011254019292604502, +'mt-bench-med': 0.018086816720257234, +} + 
+_Chat_Hard_weights = { +'mt-bench-hard': 0.09698275862068965, +'llmbar-natural': 0.21551724137931033, +'llmbar-adver-neighbor': 0.28879310344827586, +'llmbar-adver-GPTInst': 0.19827586206896552, +'llmbar-adver-GPTOut': 0.10129310344827586, +'llmbar-adver-manual': 0.09913793103448276, +} + +_Safety_weights = { +'refusals-dangerous': 0.13513513513513514, +'refusals-offensive': 0.13513513513513514, +'xstest-should-refuse': 0.20810810810810812, +'xstest-should-respond': 0.33783783783783783, +'donotanswer': 0.1837837837837838, +} + +_Reasoning_weights = { +'math-prm': 0.31236897274633124, +'hep-cpp': 0.1146051712089448, +'hep-go': 0.1146051712089448, +'hep-java': 0.1146051712089448, +'hep-js': 0.1146051712089448, +'hep-python': 0.1146051712089448, +'hep-rust': 0.1146051712089448, +} + +_RewardBench_weights = {'alpacaeval-easy': 0.08088826366559486,'alpacaeval-length': 0.08088826366559486,'alpacaeval-hard': 0.08088826366559486,'mt-bench-easy': 0.0028135048231511255,'mt-bench-med': 0.004521704180064309,'mt-bench-hard': 0.024245689655172414,'llmbar-natural': 0.05387931034482758,'llmbar-adver-neighbor': 0.07219827586206896,'llmbar-adver-GPTInst': 0.04956896551724138,'llmbar-adver-GPTOut': 0.025323275862068964,'llmbar-adver-manual': 0.02478448275862069,'refusals-dangerous': 0.033783783783783786,'refusals-offensive': 0.033783783783783786,'xstest-should-refuse': 0.05202702702702703,'xstest-should-respond': 0.08445945945945946,'donotanswer': 0.04594594594594595,'math-prm': 0.07809224318658281,'hep-cpp': 0.0286512928022362,'hep-go': 0.0286512928022362,'hep-java': 0.0286512928022362,'hep-js': 0.0286512928022362,'hep-python': 0.0286512928022362,'hep-rust': 0.0286512928022362,} + +Judge_all_summary_groups.append({'name': 'RewardBench_avg', 'subsets': list(_RewardBench_weights.keys()), 'weights': _RewardBench_weights}) +Judge_all_summary_groups.append({'name': 'RewardBench_Chat', 'subsets': list(_Chat_weights.keys()), 'weights': _Chat_weights}) +Judge_all_summary_groups.append({'name': 'RewardBench_Chat Hard', 'subsets': list(_Chat_Hard_weights.keys()), 'weights': _Chat_Hard_weights}) +Judge_all_summary_groups.append({'name': 'RewardBench_Safety', 'subsets': list(_Safety_weights.keys()), 'weights': _Safety_weights}) +Judge_all_summary_groups.append({'name': 'RewardBench_Reasoning', 'subsets': list(_Reasoning_weights.keys()), 'weights': _Reasoning_weights}) + + + +# Judgerbenchv2 +Judgerbenchv2_tasks = ['Code_and_AI', 'Creation', 'LanTask', 'IF', 'chatQA', 'Hallucination', 'safe', 'Reason_and_analysis', 'Longtext', 'Knowledge'] +Judgerbenchv2_metrics = ['final_score', 'accuracy', 'normalized_diff', 'rank_diff', 'score_diff'] +Judgerbenchv2_summary_names = [] +for metric in Judgerbenchv2_metrics: + for task in Judgerbenchv2_tasks: + Judgerbenchv2_summary_names.append([task, metric]) + +Judge_all_summary_groups.append({'name': 'Judgerbenchv2_final_score', 'subsets': [[name, metric] for name, metric in Judgerbenchv2_summary_names if metric == 'final_score']}) +Judge_all_summary_groups.append({'name': 'Judgerbenchv2_accuracy', 'subsets': [[name, metric] for name, metric in Judgerbenchv2_summary_names if metric == 'accuracy']}) +Judge_all_summary_groups.append({'name': 'Judgerbenchv2_normalized_diff', 'subsets': [[name, metric] for name, metric in Judgerbenchv2_summary_names if metric == 'normalized_diff']}) +Judge_all_summary_groups.append({'name': 'Judgerbenchv2_rank_diff', 'subsets': [[name, metric] for name, metric in Judgerbenchv2_summary_names if metric == 'rank_diff']}) 
+Judge_all_summary_groups.append({'name': 'Judgerbenchv2_score_diff', 'subsets': [[name, metric] for name, metric in Judgerbenchv2_summary_names if metric == 'score_diff']}) + +Judge_all_summary_groups.append({'name': 'Judgebench', 'subsets': ['judgebench']}) +Judge_all_summary_groups.append({'name': 'rmb_dataset_total_avg', 'subsets': [['rmb_dataset', 'total_accuracy']]}) +Judge_all_summary_groups.append({'name': 'rmb_dataset_pair', 'subsets': [['rmb_dataset', 'pair_average']]}) +Judge_all_summary_groups.append({'name': 'rmb_dataset_bon', 'subsets': [['rmb_dataset', 'bon_average']]}) + +summarizer = dict( + dataset_abbrs=[ + 'Judgerbenchv2_final_score', + 'Judgebench', + 'rmb_dataset_total_avg', + 'RewardBench_avg', + '', + 'Judgerbenchv2_accuracy', + 'Judgerbenchv2_normalized_diff', + 'Judgerbenchv2_rank_diff', + 'Judgerbenchv2_score_diff', + '', + 'rmb_dataset_pair', + 'rmb_dataset_bon', + '', + 'RewardBench_Chat', + 'RewardBench_Chat Hard', + 'RewardBench_Safety', + 'RewardBench_Reasoning', + ], + summary_groups=Judge_all_summary_groups, +) diff --git a/opencompass/openicl/icl_evaluator/icl_judge_evaluator.py b/opencompass/openicl/icl_evaluator/icl_judge_evaluator.py index d7f3531a..e59cdc12 100644 --- a/opencompass/openicl/icl_evaluator/icl_judge_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_judge_evaluator.py @@ -16,7 +16,8 @@ class JudgeEvaluator(BaseEvaluator): count = 0 details = [] for prediction, reference in zip(predictions, references): - choice = prediction.split("\"Choice\": \"Model ")[-1][0] + choice = prediction.split("\"Choice\": \"Model ")[-1][0] if len( + prediction) != 0 else None gold_winner = reference.get('winner', '') detail = { 'pred': prediction, @@ -75,7 +76,8 @@ class RMBEvaluator(BaseEvaluator): pair_harm_list = [] for prediction, reference in zip(predictions, references): - choice = prediction.split("\"Choice\": \"Model ")[-1][0] + choice = prediction.split("\"Choice\": \"Model ")[-1][0] if len( + prediction) != 0 else None gold_winner = reference.get('winner', '') subset = reference.get('subset', '') goal = reference.get('goal', '') From af8432e1d63714a1766e1a403b5c1bbf71e78d8c Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Wed, 7 May 2025 14:06:40 +0800 Subject: [PATCH 02/28] [Update] OpenAI SDK model reasoning content (#2078) * update * update * update --- opencompass/models/openai_api.py | 83 ++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index 9c2baed1..692edcf1 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -531,26 +531,28 @@ class OpenAI(BaseAPIModel): class OpenAISDK(OpenAI): - def __init__(self, - path: str = 'gpt-3.5-turbo', - max_seq_len: int = 16384, - query_per_second: int = 1, - rpm_verbose: bool = False, - retry: int = 2, - key: str | List[str] = 'ENV', - org: str | List[str] | None = None, - meta_template: Dict | None = None, - openai_api_base: str | List[str] = OPENAISDK_API_BASE, - openai_proxy_url: Optional[str] = None, - mode: str = 'none', - logprobs: bool | None = False, - top_logprobs: int | None = None, - temperature: float | None = None, - tokenizer_path: str | None = None, - extra_body: Dict | None = None, - verbose: bool = False, - status_code_mappings: dict = {}, - think_tag: str = ''): + def __init__( + self, + path: str = 'gpt-3.5-turbo', + max_seq_len: int = 16384, + query_per_second: int = 1, + rpm_verbose: bool = False, + retry: int = 2, + key: str | 
List[str] = 'ENV', + org: str | List[str] | None = None, + meta_template: Dict | None = None, + openai_api_base: str | List[str] = OPENAISDK_API_BASE, + openai_proxy_url: Optional[str] = None, + mode: str = 'none', + logprobs: bool | None = False, + top_logprobs: int | None = None, + temperature: float | None = None, + tokenizer_path: str | None = None, + extra_body: Dict | None = None, + verbose: bool = False, + status_code_mappings: dict = {}, + think_tag: str = '', + ): super().__init__( path, max_seq_len, @@ -597,11 +599,13 @@ class OpenAISDK(OpenAI): self.status_code_mappings = status_code_mappings self.think_tag = think_tag - def _generate(self, - input: PromptList | str, - max_out_len: int, - temperature: float, - timeout: int = 3600) -> str: + def _generate( + self, + input: PromptList | str, + max_out_len: int, + temperature: float, + timeout: int = 3600, + ) -> str: """Generate results given a list of inputs. Args: @@ -662,7 +666,12 @@ class OpenAISDK(OpenAI): # Check if response is empty or content is empty if (not responses.choices or not responses.choices[0].message - or not responses.choices[0].message.content): + or + (not responses.choices[0].message.content and not getattr( + responses.choices[0].message, + 'reasoning_content', + '', + ))): # noqa: E125 self.logger.error( 'Failed to extract content from the responses. ' 'Please check the API response for detail information.' @@ -670,12 +679,13 @@ class OpenAISDK(OpenAI): responses, ) num_retries += 1 - # Continue to retry instead of returning empty response continue + reasoning_content = (getattr(responses.choices[0].message, + 'reasoning_content', '') or '') + content = responses.choices[0].message.content or '' # Concat Reasoning Content and tags to content - if (hasattr(responses.choices[0].message, 'reasoning_content') - and responses.choices[0].message.reasoning_content): + if reasoning_content: if self.verbose: self.logger.info( 'Follow' @@ -684,14 +694,17 @@ class OpenAISDK(OpenAI): 'Reasoning Content: %s, \n' 'Tags: %s, \n' 'Content: %s', - responses.choices[0].message.reasoning_content, + reasoning_content, self.think_tag, - responses.choices[0].message.content) - return (responses.choices[0].message.reasoning_content + - self.think_tag + - responses.choices[0].message.content) + content, + ) + if content: + return reasoning_content + self.think_tag + content + else: + return reasoning_content - return responses.choices[0].message.content + else: + return content except (BadRequestError, APIStatusError) as e: # Handle BadRequest status From d62b69aaefdd746c00bc530abcddf9d8245ab26b Mon Sep 17 00:00:00 2001 From: Dongsheng Zhu <59612926+Zhudongsheng75@users.noreply.github.com> Date: Wed, 7 May 2025 15:51:18 +0800 Subject: [PATCH 03/28] [Fix] Fix InternVL model config (#2068) * intervl-8b&38b * intervl adjustment * internvl fix --- .../models/internvl/lmdeploy_internvl_2_5_38b.py | 15 +++++++++++++++ .../models/internvl/lmdeploy_internvl_2_5_8b.py | 15 +++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 opencompass/configs/models/internvl/lmdeploy_internvl_2_5_38b.py create mode 100644 opencompass/configs/models/internvl/lmdeploy_internvl_2_5_8b.py diff --git a/opencompass/configs/models/internvl/lmdeploy_internvl_2_5_38b.py b/opencompass/configs/models/internvl/lmdeploy_internvl_2_5_38b.py new file mode 100644 index 00000000..98713696 --- /dev/null +++ b/opencompass/configs/models/internvl/lmdeploy_internvl_2_5_38b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + 
+models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='internvl2_5-38b-turbomind', + path='OpenGVLab/InternVL2_5-38B', + engine_config=dict(session_len=8192, max_batch_size=8, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=8192, + max_out_len=8192, + batch_size=8, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/opencompass/configs/models/internvl/lmdeploy_internvl_2_5_8b.py b/opencompass/configs/models/internvl/lmdeploy_internvl_2_5_8b.py new file mode 100644 index 00000000..3541249c --- /dev/null +++ b/opencompass/configs/models/internvl/lmdeploy_internvl_2_5_8b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='internvl2_5-8b-turbomind', + path='OpenGVLab/InternVL2_5-8B', + engine_config=dict(session_len=8192, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=8192), + max_seq_len=8192, + max_out_len=8192, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] From 43b2c4ed765755560f506f91739502756de60423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B0=A2=E6=98=95=E8=BE=B0?= Date: Wed, 7 May 2025 16:18:43 +0800 Subject: [PATCH 04/28] [Fix] Update lawbench data path (#2037) --- opencompass/datasets/lawbench/utils/modules/alignment.py | 5 +++-- opencompass/datasets/lawbench/utils/modules/classifier.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/opencompass/datasets/lawbench/utils/modules/alignment.py b/opencompass/datasets/lawbench/utils/modules/alignment.py index d11feb74..5330b2fa 100644 --- a/opencompass/datasets/lawbench/utils/modules/alignment.py +++ b/opencompass/datasets/lawbench/utils/modules/alignment.py @@ -8,6 +8,7 @@ REAL_PATH = os.path.split(os.path.realpath(__file__))[0] chinese_punct = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘'‛“”„‟…‧﹏" english_punct = punctuation punct = chinese_punct + english_punct +cache_dir = os.environ.get('COMPASS_DATA_CACHE', '') def check_all_chinese(word): """ @@ -22,7 +23,7 @@ def read_cilin(): Cilin 詞林 is a thesaurus with semantic information """ # TODO -- fix this path - lines = open(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "data", "lawbench", "eval_assets", "cilin.txt"), "r", encoding="gbk").read().strip().split("\n") + lines = open(os.path.join(cache_dir, "data", "lawbench", "eval_assets", "cilin.txt"), "r", encoding="gbk").read().strip().split("\n") semantic_dict = {} semantic_classes = {} for line in lines: @@ -39,7 +40,7 @@ def read_cilin(): def read_confusion(): confusion_dict = {} - with open(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "data", "lawbench", "eval_assets", "confusion_dict.txt"), "r", encoding="utf-8") as f: + with open(os.path.join(cache_dir, "data", "lawbench", "eval_assets", "confusion_dict.txt"), "r", encoding="utf-8") as f: for line in f: li = line.rstrip('\n').split(" ") confusion_dict[li[0]] = li[1:] diff --git a/opencompass/datasets/lawbench/utils/modules/classifier.py b/opencompass/datasets/lawbench/utils/modules/classifier.py index a8e9b921..b8ee407b 100644 --- a/opencompass/datasets/lawbench/utils/modules/classifier.py +++ b/opencompass/datasets/lawbench/utils/modules/classifier.py @@ -10,7 +10,8 @@ Correction = namedtuple( "inds", ], ) -char_smi = CharFuncs(os.path.join(os.path.dirname(__file__), "..", "..", "..", "..", "..", "data", "lawbench", "eval_assets", "char_meta.txt")) +cache_dir = 
os.environ.get('COMPASS_DATA_CACHE', '') +char_smi = CharFuncs(os.path.join(cache_dir, "data", "lawbench", "eval_assets", "char_meta.txt")) def check_spell_error(src_span: str, tgt_span: str, From ba0e32292c23cadd4c6c061a132b95b1c8b9e4e0 Mon Sep 17 00:00:00 2001 From: Dongsheng Zhu <59612926+Zhudongsheng75@users.noreply.github.com> Date: Wed, 7 May 2025 16:42:09 +0800 Subject: [PATCH 05/28] [Feature] Support InternSandbox (#2049) * internsandbox init * internsandbox * dataset_index * dataset_index_add --- dataset-index.yml | 6 ++ .../internsandbox/internsandbox_gen.py | 4 + .../internsandbox/internsandbox_gen_44b982.py | 59 ++++++++++++++ opencompass/datasets/__init__.py | 1 + opencompass/datasets/internsandbox.py | 78 +++++++++++++++++++ 5 files changed, 148 insertions(+) create mode 100644 opencompass/configs/datasets/internsandbox/internsandbox_gen.py create mode 100644 opencompass/configs/datasets/internsandbox/internsandbox_gen_44b982.py create mode 100644 opencompass/datasets/internsandbox.py diff --git a/dataset-index.yml b/dataset-index.yml index 9585f97c..4a920071 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -1023,3 +1023,9 @@ paper: https://arxiv.org/pdf/2402.09391 configpath: opencompass/configs/datasets/SmolInstruct/smolinstruct_gen.py configpath_llmjudge: '' +- internsandbox: + name: InternSandbox + category: Reasoning/Code/Agent + paper: '' + configpath: opencompass/configs/datasets/internsandbox/internsandbox_gen_44b982.py + configpath_llmjudge: '' \ No newline at end of file diff --git a/opencompass/configs/datasets/internsandbox/internsandbox_gen.py b/opencompass/configs/datasets/internsandbox/internsandbox_gen.py new file mode 100644 index 00000000..1af0955c --- /dev/null +++ b/opencompass/configs/datasets/internsandbox/internsandbox_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .internsandbox_gen_44b982 import internsandbox_datasets \ No newline at end of file diff --git a/opencompass/configs/datasets/internsandbox/internsandbox_gen_44b982.py b/opencompass/configs/datasets/internsandbox/internsandbox_gen_44b982.py new file mode 100644 index 00000000..368189a5 --- /dev/null +++ b/opencompass/configs/datasets/internsandbox/internsandbox_gen_44b982.py @@ -0,0 +1,59 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import InternSandboxDataset, InternSandboxEvaluator + + +_SANDBOXS_ = ['aquarium', 'arc', 'arrowmaze', 'bbehboardgameqa', 'bbehbooleanexpressions', 'BbehDyckLanguages', 'BbehGeometricShapes', 'BbehMultistepArithmetic', 'bbehobjectcounting', 'bbehobjectproperties', 'bbehshuffobject', 'BbehWebOfLies', 'BbehWordSorting', 'binairo', 'calcudoku', 'campsite', 'cipher', 'cryptomath', 'dominosa', 'futoshiki', 'galaxies', 'game24', 'kakurasu', 'korLogicAnalogicalReasoning', 'korLogicCanonicalPropositions', 'korLogicCooperativePrinciple', 'korLogicDefinitions', 'korLogicDerivativeReasoningOfPropositionalLogic', 'korLogicDisjunctiveNormalFormAndConjunctiveNormalForm', 'korLogicDynamicLogic', 'korLogicEnumerativeInductiveReasoning', 'korLogicEpistemicLogic', 'korLogicEquivalenceCalculus', 'korLogicFigureOfTheSyllogism', 'korLogicFormalFallacies', 'korLogicInductionParadox', 'korLogicLogicalMethodsForExploringCauseAndEffectRelationships', 'korLogicPredicateLogicFormalization', 'korLogicPropositionalLogicConcepts', 
'korLogicPropositionalLogicFormalization', 'korLogicResolution', 'korLogicSpeechActs', 'korLogicStatisticalReasoning', 'korLogicTemporalPropositions', 'korLogicTruthValueModalPropositions', 'korOperationUnicode20ac', 'korOperationUnicode2295', 'korOperationUnicode25a0', 'korOperationUnicode25a1', 'korOperationUnicode25b3', 'korOperationUnicode25bd', 'korOperationUnicode25cb', 'korOperationUnicode25ce', 'korOperationUnicode25cf', 'korOperationUnicode2605', 'korOperationUnicodeffe0', 'korOperationUnicodeffe1', 'korPuzzle24Points', 'korPuzzleArrowMaze', 'korPuzzleCalcudoko', 'korPuzzleCampsite', 'korPuzzleConnectWords', 'korPuzzleCryptoMath', 'korPuzzleKukurasu', 'korPuzzleLogicPuzzle', 'korPuzzleSkyscrapers', 'korPuzzleWordBrainTeasers', 'korPuzzleWordLadder', 'korPuzzleWordRootsAndAffixes', 'korPuzzleWordscapes', 'korPuzzleWordSearch', 'LightUp', 'maze', 'minesweeper', 'nonograms', 'starbattle', 'stitches', 'sudoku', 'tents', 'thermometers'] + +internsandbox_reader_cfg = dict( + input_columns=['prompt'], + output_column='ground_truth' +) + +internsandbox_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='You are a helpful assistant.', + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ), + ], + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +internsandbox_eval_cfg = { + sandbox: dict( + evaluator=dict( + type=InternSandboxEvaluator, + short_penalty=False, + format_penalty=False, + ), + pred_role='BOT', + ) for sandbox in _SANDBOXS_ +} + +internsandbox_datasets = [ + dict( + type=InternSandboxDataset, + abbr=f'internsandbox-{sandbox}', + path='./data/InternSandboxBenchmark_verified_V0.3.1/', + local_mode=True, + sandbox=sandbox, + reader_cfg=internsandbox_reader_cfg, + infer_cfg=internsandbox_infer_cfg, + eval_cfg=internsandbox_eval_cfg[sandbox], + ) for sandbox in _SANDBOXS_ +] diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index b00162d1..a7c037cf 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -68,6 +68,7 @@ from .hungarian_math import * # noqa: F401, F403 from .IFEval.ifeval import IFEvalDataset, IFEvaluator # noqa: F401, F403 from .inference_ppl import InferencePPLDataset # noqa: F401, F403 from .infinitebench import * # noqa: F401, F403 +from .internsandbox import * # noqa: F401, F403 from .iwslt2017 import * # noqa: F401, F403 from .jigsawmultilingual import * # noqa: F401, F403 from .jsonl import JsonlDataset # noqa: F401, F403 diff --git a/opencompass/datasets/internsandbox.py b/opencompass/datasets/internsandbox.py new file mode 100644 index 00000000..c71cc3f7 --- /dev/null +++ b/opencompass/datasets/internsandbox.py @@ -0,0 +1,78 @@ +import importlib +import json +import os.path as osp + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class InternSandboxDataset(BaseDataset): + + @staticmethod + def load(path: str, sandbox: str, local_mode: bool = False): + path = get_data_path(path, local_mode=local_mode) + file_path = osp.join(path, f'{sandbox}.jsonl') + data = [] + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + origin_data = json.loads(line) + origin_data['ground_truth'] = json.dumps( + 
origin_data['ground_truth']) + data.append(origin_data) + return Dataset.from_list(data) + + +@ICL_EVALUATORS.register_module() +class InternSandboxEvaluator(BaseEvaluator): + + def __init__(self, + short_penalty: bool = False, + format_penalty: bool = False): + super().__init__() + self.short_penalty = short_penalty + self.format_penalty = format_penalty + + def score(self, predictions, references, test_set): + + if len(predictions) != len(references): + return { + 'error': + 'predictions and references have different ' + f'length. len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' + } + + class_name = f"{test_set[0]['data_source']}Sandbox" + + details = [] + for pred, ref, ts in zip(predictions, references, test_set): + ref = json.loads(ref) + module = importlib.import_module('intern_sandbox') + score = getattr(module, class_name).verify_score( + pred, + ref, + short_penalty=self.short_penalty, + format_penalty=self.format_penalty) + try: + extracted = getattr(module, class_name).extract_output(pred) + except: # noqa: E722 + extracted = None + + res = { + 'prompt': ts['prompt'], + 'score': score, + 'extracted_output': extracted, + 'ground_truth': ref, + 'output': pred, + } + details.append(res) + + avg_score = sum(r['score'] for r in details) / len(details) + results = {'accuracy': avg_score, 'details': details} + return results From 9ec23c145b4c0ac441b51d6e326e2074975b2400 Mon Sep 17 00:00:00 2001 From: Jiahao Xu <48542487+xuxuxuxuxuxjh@users.noreply.github.com> Date: Thu, 8 May 2025 16:25:43 +0800 Subject: [PATCH 06/28] [Datasets] Add ClinicBench, PubMedQA and ScienceQA (#2061) * Add ClinicBench * Add PubMedQA & ScienceQA & ClinicBench * Add PubMedQA & ScienceQA & ClinicBench * Update datasets_info & hf_path * Update hf_path --- dataset-index.yml | 18 ++++ .../ClinicBench/ClinicBench_llmjudge_gen.py | 4 + .../ClinicBench_llmjudge_gen_d09668.py | 100 ++++++++++++++++++ .../PubMedQA/PubMedQA_llmjudge_gen.py | 4 + .../PubMedQA/PubMedQA_llmjudge_gen_f00302.py | 94 ++++++++++++++++ .../ScienceQA/ScienceQA_llmjudge_gen.py | 4 + .../ScienceQA_llmjudge_gen_f00302.py | 94 ++++++++++++++++ opencompass/datasets/ClinicBench.py | 19 ++++ opencompass/datasets/PubMedQA.py | 34 ++++++ opencompass/datasets/ScienceQA.py | 32 ++++++ 10 files changed, 403 insertions(+) create mode 100644 opencompass/configs/datasets/ClinicBench/ClinicBench_llmjudge_gen.py create mode 100644 opencompass/configs/datasets/ClinicBench/ClinicBench_llmjudge_gen_d09668.py create mode 100644 opencompass/configs/datasets/PubMedQA/PubMedQA_llmjudge_gen.py create mode 100644 opencompass/configs/datasets/PubMedQA/PubMedQA_llmjudge_gen_f00302.py create mode 100644 opencompass/configs/datasets/ScienceQA/ScienceQA_llmjudge_gen.py create mode 100644 opencompass/configs/datasets/ScienceQA/ScienceQA_llmjudge_gen_f00302.py create mode 100644 opencompass/datasets/ClinicBench.py create mode 100644 opencompass/datasets/PubMedQA.py create mode 100644 opencompass/datasets/ScienceQA.py diff --git a/dataset-index.yml b/dataset-index.yml index 4a920071..94e88200 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -128,6 +128,24 @@ paper: https://arxiv.org/abs/2501.18362 configpath: opencompass/configs/datasets/MedXpertQA/MedXpertQA_gen.py configpath_llmjudge: opencompass/configs/datasets/MedXpertQA/MedXpertQA_llmjudge_gen.py +- ClinicBench: + name: ClinicBench + category: Knowledge / Medicine + paper: https://arxiv.org/abs/2405.00716 + configpath: '' + configpath_llmjudge: 
opencompass/configs/datasets/ClinicBench/ClinicBench_llmjudge_gen.py +- ScienceQA: + name: ScienceQA + category: Knowledge / Medicine + paper: https://arxiv.org/abs/2209.09513 + configpath: '' + configpath_llmjudge: opencompass/configs/datasets/ScienceQA/ScienceQA_llmjudge_gen.py +- PubMedQA: + name: PubMedQA + category: Knowledge / Medicine + paper: https://arxiv.org/abs/1909.06146 + configpath: '' + configpath_llmjudge: opencompass/configs/datasets/PubMedQA/PubMedQA_llmjudge_gen.py - musr: name: MuSR category: Reasoning diff --git a/opencompass/configs/datasets/ClinicBench/ClinicBench_llmjudge_gen.py b/opencompass/configs/datasets/ClinicBench/ClinicBench_llmjudge_gen.py new file mode 100644 index 00000000..febfce11 --- /dev/null +++ b/opencompass/configs/datasets/ClinicBench/ClinicBench_llmjudge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .ClinicBench_llmjudge_gen_d09668 import ClinicBench_datasets \ No newline at end of file diff --git a/opencompass/configs/datasets/ClinicBench/ClinicBench_llmjudge_gen_d09668.py b/opencompass/configs/datasets/ClinicBench/ClinicBench_llmjudge_gen_d09668.py new file mode 100644 index 00000000..358a91f5 --- /dev/null +++ b/opencompass/configs/datasets/ClinicBench/ClinicBench_llmjudge_gen_d09668.py @@ -0,0 +1,100 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets.ClinicBench import ClinicBenchDataset + + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering. + +Question:\n +{question} + +Options:\n +{choices} + +""".strip() + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. 
Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {question}\n {choices} \n\n\n + : \n{label}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +ClinicBench_datasets = [] + +ClinicBench_reader_cfg = dict( + input_columns=['question', 'choices'], + output_column='label', +) + +ClinicBench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +ClinicBench_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=ClinicBenchDataset, + path='xuxuxuxuxu/Pharmacology-QA', + reader_cfg=ClinicBench_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), +) + +ClinicBench_datasets.append( + dict( + abbr=f'ClinicBench', + type=ClinicBenchDataset, + path='xuxuxuxuxu/Pharmacology-QA', + reader_cfg=ClinicBench_reader_cfg, + infer_cfg=ClinicBench_infer_cfg, + eval_cfg=ClinicBench_eval_cfg, + ) +) diff --git a/opencompass/configs/datasets/PubMedQA/PubMedQA_llmjudge_gen.py b/opencompass/configs/datasets/PubMedQA/PubMedQA_llmjudge_gen.py new file mode 100644 index 00000000..4055d0f5 --- /dev/null +++ b/opencompass/configs/datasets/PubMedQA/PubMedQA_llmjudge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .PubMedQA_llmjudge_gen_f00302 import PubMedQA_datasets \ No newline at end of file diff --git a/opencompass/configs/datasets/PubMedQA/PubMedQA_llmjudge_gen_f00302.py b/opencompass/configs/datasets/PubMedQA/PubMedQA_llmjudge_gen_f00302.py new file mode 100644 index 00000000..b38a8fe5 --- /dev/null +++ b/opencompass/configs/datasets/PubMedQA/PubMedQA_llmjudge_gen_f00302.py @@ -0,0 +1,94 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets.PubMedQA import PubMedQADataset + + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering. 
+Question:\n +{question} +Options:\n +{choices} +""".strip() + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ : {question}\n {choices} \n\n\n + : \n{label}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +PubMedQA_datasets = [] + +PubMedQA_reader_cfg = dict( + input_columns=['question', 'choices'], + output_column='label', +) + +PubMedQA_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +PubMedQA_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=PubMedQADataset, + path='qiaojin/PubMedQA', + reader_cfg=PubMedQA_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), +) + +PubMedQA_datasets.append( + dict( + abbr=f'PubMedQA', + type=PubMedQADataset, + path='qiaojin/PubMedQA', + reader_cfg=PubMedQA_reader_cfg, + infer_cfg=PubMedQA_infer_cfg, + eval_cfg=PubMedQA_eval_cfg, + ) +) \ No newline at end of file diff --git a/opencompass/configs/datasets/ScienceQA/ScienceQA_llmjudge_gen.py b/opencompass/configs/datasets/ScienceQA/ScienceQA_llmjudge_gen.py new file mode 100644 index 00000000..32305456 --- /dev/null +++ b/opencompass/configs/datasets/ScienceQA/ScienceQA_llmjudge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .ScienceQA_llmjudge_gen_f00302 import ScienceQA_datasets \ No newline at end of file diff --git a/opencompass/configs/datasets/ScienceQA/ScienceQA_llmjudge_gen_f00302.py b/opencompass/configs/datasets/ScienceQA/ScienceQA_llmjudge_gen_f00302.py new file mode 100644 index 00000000..e128c2a0 --- /dev/null +++ b/opencompass/configs/datasets/ScienceQA/ScienceQA_llmjudge_gen_f00302.py @@ -0,0 +1,94 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets.ScienceQA import ScienceQADataset + + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering. +Question:\n +{question} +Options:\n +{choices} +""".strip() + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. 
Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + : {question}\n {choices} \n\n\n + : \n{label}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +ScienceQA_datasets = [] + +ScienceQA_reader_cfg = dict( + input_columns=['question', 'choices'], + output_column='label', +) + +ScienceQA_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +ScienceQA_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=ScienceQADataset, + path='derek-thomas/ScienceQA', + reader_cfg=ScienceQA_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), +) + +ScienceQA_datasets.append( + dict( + abbr=f'ScienceQA', + type=ScienceQADataset, + path='derek-thomas/ScienceQA', + reader_cfg=ScienceQA_reader_cfg, + infer_cfg=ScienceQA_infer_cfg, + eval_cfg=ScienceQA_eval_cfg, + ) +) \ No newline at end of file diff --git a/opencompass/datasets/ClinicBench.py b/opencompass/datasets/ClinicBench.py new file mode 100644 index 00000000..86ef5082 --- /dev/null +++ b/opencompass/datasets/ClinicBench.py @@ -0,0 +1,19 @@ +from datasets import load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class ClinicBenchDataset(BaseDataset): + + @staticmethod + def load_single(path): + dataset = load_dataset(path)['train'] + return dataset + + @staticmethod + def load(path): + dataset = ClinicBenchDataset.load_single(path) + return dataset diff --git a/opencompass/datasets/PubMedQA.py b/opencompass/datasets/PubMedQA.py new file mode 100644 index 00000000..b0db32e3 --- /dev/null +++ 
b/opencompass/datasets/PubMedQA.py @@ -0,0 +1,34 @@ +from datasets import Dataset, load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class PubMedQADataset(BaseDataset): + + @staticmethod + def load_single(path): + dataset = [] + ds = load_dataset(path, 'pqa_labeled') + for data in ds['train']: + data['question'] = (f"CONTEXTS: {data['context']}\n" + f"QUESTION: {data['question']}") + choices = 'A. yes\nB. no\nC. maybe' + data['choices'] = choices + if data['final_decision'] == 'yes': + data['label'] = 'A. yes' + elif data['final_decision'] == 'no': + data['label'] = 'B. no' + else: + data['label'] = 'C. maybe' + + dataset.append(data) + + return Dataset.from_list(dataset) + + @staticmethod + def load(path): + dataset = PubMedQADataset.load_single(path) + return dataset diff --git a/opencompass/datasets/ScienceQA.py b/opencompass/datasets/ScienceQA.py new file mode 100644 index 00000000..1bc9c952 --- /dev/null +++ b/opencompass/datasets/ScienceQA.py @@ -0,0 +1,32 @@ +from datasets import Dataset, load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class ScienceQADataset(BaseDataset): + + @staticmethod + def load_single(path): + dataset = [] + ds = load_dataset(path) + for data in ds['test']: + if data['image'] is None: + data['label'] = chr(65 + data['answer'] + ) + '. ' + data['choices'][data['answer']] + choices = '' + for i in range(len(data['choices'])): + choices += chr(65 + i) + '. ' + data['choices'][i] + '\n' + data['choices'] = choices + # print(data) + + dataset.append(data) + + return Dataset.from_list(dataset) + + @staticmethod + def load(path): + dataset = ScienceQADataset.load_single(path) + return dataset From a685ed7daffcab6b7cfb27ba1332242e403d138f Mon Sep 17 00:00:00 2001 From: Wei Li <1253865871@qq.com> Date: Thu, 8 May 2025 16:44:05 +0800 Subject: [PATCH 07/28] [Dataset] Add nejm ai benchmark (#2063) * support nejm ai benchmark * add dataset files * revise gen name * revise gen name * revise class name & remove csv file & add dataset-index.yml info * update * update --------- Co-authored-by: MaiziXiao --- dataset-index.yml | 8 +- .../nejm_ai_benchmark/nejmaibench_gen.py | 4 + .../nejmaibench_gen_60c8f5.py | 59 ++++++++ .../nejmaibench_llmjudge_gen.py | 4 + .../nejmaibench_llmjudge_gen_60c8f5.py | 108 ++++++++++++++ opencompass/datasets/__init__.py | 1 + opencompass/datasets/nejmaibench.py | 139 ++++++++++++++++++ opencompass/utils/datasets_info.py | 10 ++ 8 files changed, 332 insertions(+), 1 deletion(-) create mode 100644 opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen.py create mode 100644 opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen_60c8f5.py create mode 100644 opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen.py create mode 100644 opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen_60c8f5.py create mode 100644 opencompass/datasets/nejmaibench.py diff --git a/dataset-index.yml b/dataset-index.yml index 94e88200..1bfbdbbc 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -1046,4 +1046,10 @@ category: Reasoning/Code/Agent paper: '' configpath: opencompass/configs/datasets/internsandbox/internsandbox_gen_44b982.py - configpath_llmjudge: '' \ No newline at end of file + configpath_llmjudge: '' +- nejmaibench: + name: nejmaibench + category: Science /Medicine + paper: https://arxiv.org/pdf/2308.04709 + configpath: 
opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen.py + configpath_llmjudge: opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen.py diff --git a/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen.py b/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen.py new file mode 100644 index 00000000..2116726c --- /dev/null +++ b/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .nejmaibench_gen_60c8f5 import nejmaibench_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen_60c8f5.py b/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen_60c8f5.py new file mode 100644 index 00000000..ec817c57 --- /dev/null +++ b/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen_60c8f5.py @@ -0,0 +1,59 @@ +from opencompass.datasets import NejmaibenchDataset, NejmaibenchEvaluator +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +import os + +SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this? +ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n' + +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'question', + 'options', + 'Subject', + 'prompt_mode', + + ], + output_column='label', +) + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT), + ], + round=[ + dict( + role='HUMAN', + prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict(type=NejmaibenchEvaluator), + pred_role='BOT', +) +nejmaibench_dataset = dict( + type=NejmaibenchDataset, + abbr='nejmaibench', + path='opencompass/nejmaibench', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + +) + +nejmaibench_datasets = [nejmaibench_dataset] diff --git a/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen.py b/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen.py new file mode 100644 index 00000000..de683ccc --- /dev/null +++ b/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .nejmaibench_llmjudge_gen_60c8f5 import nejmaibench_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen_60c8f5.py b/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen_60c8f5.py new file mode 100644 index 00000000..31be8049 --- /dev/null +++ b/opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen_60c8f5.py @@ -0,0 +1,108 @@ +from opencompass.datasets import NejmaibenchDataset +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.datasets import 
generic_llmjudge_postprocess +from opencompass.evaluator import GenericLLMEvaluator +import os + +SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this? +ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n' +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + : Q: {question}\nPlease select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n\n\n\n + : \n{label}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'question', + 'options', + 'Subject', + 'prompt_mode', + + ], + output_column='label', +) + + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT), + ], + round=[ + dict( + role='HUMAN', + prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=NejmaibenchDataset, + path='opencompass/nejmaibench', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), +) + + +nejmaibench_dataset = dict( + type=NejmaibenchDataset, + abbr='nejmaibench', + path='opencompass/nejmaibench', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + +) + +nejmaibench_datasets = [nejmaibench_dataset] diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index a7c037cf..220ce030 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -109,6 +109,7 @@ from .musr import * # noqa: F401, F403 from .narrativeqa import * # noqa: F401, F403 from .natural_question import * # noqa: F401, F403 from .natural_question_cn import * # noqa: F401, F403 +from .nejmaibench import * # noqa: F401, F403 from .NPHardEval import * # noqa: F401, F403 from .obqa import * # noqa: F401, F403 from .olymmath import * # noqa: F401, F403 diff --git a/opencompass/datasets/nejmaibench.py b/opencompass/datasets/nejmaibench.py new file mode 100644 index 00000000..768f4688 --- /dev/null +++ b/opencompass/datasets/nejmaibench.py @@ -0,0 +1,139 @@ +import re + +import pandas as pd +from datasets import Dataset + +from opencompass.openicl import BaseEvaluator +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +def _parse(item, prompt_mode): + # 1. 从 Choices 字符串里按行拆分出每个选项 + raw_choices = item.get('Choices', '') + # 去掉首尾空白并按行分割,过滤掉空行 + lines = [ + line.strip() for line in raw_choices.strip().splitlines() + if line.strip() + ] + + # 2. 用正则去掉行首的 "A. "/"B. " 等前缀,只保留选项内容 + options_list = [re.sub(r'^[A-Z]\.\s*', '', line) for line in lines] + + # 3. 写回 item + item['options'] = options_list + + # 4. 重建带标号的选项字符串 + options_str = '\n'.join(f'{chr(65 + i)}. {opt}' + for i, opt in enumerate(options_list)) + + # 5. 
+    item['question'] = f"{item['Question']}\n{options_str}"
+    item['label'] = item['Answer']
+    item['prompt_mode'] = prompt_mode
+    item['start'] = chr(65)
+    item['end'] = chr(65 + len(options_list) - 1)
+    return item
+
+
+@LOAD_DATASET.register_module()
+class NejmaibenchDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str, prompt_mode: str = 'zero-shot', **kwargs):
+        # Read the CSV file into a DataFrame and convert NaN values to empty strings
+        path = get_data_path(path)
+        df = pd.read_csv(path, encoding='utf-8')
+        df = df.fillna('')
+
+        # Convert to a list of dicts
+        data_list = df.to_dict(orient='records')
+
+        # Wrap the data list in a Dataset
+        dataset = Dataset.from_list(data_list)
+
+        # Parse according to the prompt mode
+        if prompt_mode == 'zero-shot':
+            dataset = dataset.map(lambda item: _parse(item, prompt_mode))
+        elif prompt_mode == 'few-shot':
+            pass  # TODO: Implement few-shot prompt handling
+        return dataset
+
+
+class NejmaibenchEvaluator(BaseEvaluator):
+
+    def score(self, predictions, references, test_set):
+        method = test_set['prompt_mode'][0]
+
+        if len(predictions) != len(references):
+            return {'error': 'preds and refs have different lengths'}
+        correct = 0
+        count = 0
+        details = []
+        for idx, (i, j) in enumerate(zip(predictions, references)):
+            i = answer_cleansing(method, i, test_set['options'][idx],
+                                 test_set['label'][idx])
+            detail = {
+                'pred': i,
+                'answer': j,
+                'correct': False,
+                'Subject': test_set['Subject'][idx],
+            }
+            count += 1
+            if i == j:
+                correct += 1
+                detail['correct'] = True
+            details.append(detail)
+        result = {'accuracy': 100 * correct / count, 'details': details}
+        return result
+
+
+@TEXT_POSTPROCESSORS.register_module()
+def answer_cleansing(
+    method: str,
+    prediction: str,
+    options: list,
+    label: str,
+) -> str:
+
+    # Clean up unwanted phrases in the prediction
+    for unwanted_phrase in [
+            'I understand',
+            'A through J',
+            'A through E',
+            'A through D',
+    ]:
+        prediction = prediction.replace(unwanted_phrase, '')
+
+    options_num = len(options)
+    options = [chr(65 + i) for i in range(options_num)]
+    options_str = r'\b(' + '|'.join(options) + r')\b'
+    prediction = re.findall(options_str, prediction)
+
+    if len(prediction) == 0:
+        prediction = []
+        return prediction
+    else:
+        # If there is a "label" and its length is 1,
+        # process prediction accordingly
+        if len(label) == 1:
+            if method == 'few-shot':
+                answer_flag = True if len(prediction) > 1 else False
+                # choose the first or last element based on the answer_flag
+                if answer_flag:
+                    prediction = [prediction[0]]
+                else:
+                    prediction = [prediction[-1]]
+            elif method == 'zero-shot':
+                # choose the first element in list
+                prediction = [prediction[0]]
+            else:
+                raise ValueError('Method is not properly defined ...')
+
+        # Remove trailing period if it exists
+        if prediction[0] and prediction[0].endswith('.'):
+            prediction[0] = prediction[0][:-1]
+
+    return prediction[0]
diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py
index 5048a496..10ca4436 100644
--- a/opencompass/utils/datasets_info.py
+++ b/opencompass/utils/datasets_info.py
@@ -446,6 +446,11 @@ DATASETS_MAPPING = {
         "hf_id": "",
         "local": "./data/ChemBench4K",
     },
+    "opencompass/nejmaibench": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/nejmaibench/NEJM_All_Questions_And_Answers.csv",
+    },
 }


@@ -798,6 +803,11 @@ DATASETS_URL = {
         "url":
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ChemBench4K.zip",
         "md5": "fc23fd21b2566a5dbbebfa4601d7779c"
+    },
+    "nejmaibench": {
+        "url":
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nejmaibench.zip", + "md5": "e6082cae3596b3ebea73e23ba445b99e" } } From ff3275edf023f2ce49c3fbfed90ff675ca27c1c9 Mon Sep 17 00:00:00 2001 From: Mo Li <2568818204@qq.com> Date: Thu, 8 May 2025 19:06:56 +0800 Subject: [PATCH 08/28] [Update] Add Long-Context configs for Gemma, OREAL, and Qwen2.5 models (#2048) * [Update] Update Gemma, Oreal, Qwen Config * fix lint --- .../models/gemma/vllm_gemma_3_12b_it.py | 16 ++++++++++++++ .../models/gemma/vllm_gemma_3_27b_it.py | 16 ++++++++++++++ .../models/gemma/vllm_gemma_3_4b_it.py | 17 +++++++++++++++ .../lmdeploy_internlm3_8b_instruct_128k.py | 19 +++++++++++++++++ .../models/hf_internlm/lmdeploy_oreal_32b.py | 20 ++++++++++++++++++ .../qwen2_5/vllm_qwen2_5_14b_instruct_128k.py | 21 +++++++++++++++++++ .../qwen2_5/vllm_qwen2_5_32b_instruct_128k.py | 21 +++++++++++++++++++ .../qwen2_5/vllm_qwen2_5_72b_instruct_128k.py | 21 +++++++++++++++++++ .../qwen2_5/vllm_qwen2_5_7b_instruct_128k.py | 21 +++++++++++++++++++ 9 files changed, 172 insertions(+) create mode 100644 opencompass/configs/models/gemma/vllm_gemma_3_12b_it.py create mode 100644 opencompass/configs/models/gemma/vllm_gemma_3_27b_it.py create mode 100644 opencompass/configs/models/gemma/vllm_gemma_3_4b_it.py create mode 100644 opencompass/configs/models/hf_internlm/lmdeploy_internlm3_8b_instruct_128k.py create mode 100644 opencompass/configs/models/hf_internlm/lmdeploy_oreal_32b.py create mode 100644 opencompass/configs/models/qwen2_5/vllm_qwen2_5_14b_instruct_128k.py create mode 100644 opencompass/configs/models/qwen2_5/vllm_qwen2_5_32b_instruct_128k.py create mode 100644 opencompass/configs/models/qwen2_5/vllm_qwen2_5_72b_instruct_128k.py create mode 100644 opencompass/configs/models/qwen2_5/vllm_qwen2_5_7b_instruct_128k.py diff --git a/opencompass/configs/models/gemma/vllm_gemma_3_12b_it.py b/opencompass/configs/models/gemma/vllm_gemma_3_12b_it.py new file mode 100644 index 00000000..2914640f --- /dev/null +++ b/opencompass/configs/models/gemma/vllm_gemma_3_12b_it.py @@ -0,0 +1,16 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='gemma-3-12b-it-vllm', + path='google/gemma-3-12b-it', + model_kwargs=dict(tensor_parallel_size=4, + # for long context + rope_scaling={'factor': 8.0, 'rope_type': 'linear'}), + max_out_len=4096, + batch_size=1, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=4), + ) +] diff --git a/opencompass/configs/models/gemma/vllm_gemma_3_27b_it.py b/opencompass/configs/models/gemma/vllm_gemma_3_27b_it.py new file mode 100644 index 00000000..b6f4b93b --- /dev/null +++ b/opencompass/configs/models/gemma/vllm_gemma_3_27b_it.py @@ -0,0 +1,16 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='gemma-3-27b-it-vllm', + path='google/gemma-3-27b-it', + model_kwargs=dict(tensor_parallel_size=4, + # for long context + rope_scaling={'factor': 8.0, 'rope_type': 'linear'}), + max_out_len=4096, + batch_size=1, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=4), + ) +] diff --git a/opencompass/configs/models/gemma/vllm_gemma_3_4b_it.py b/opencompass/configs/models/gemma/vllm_gemma_3_4b_it.py new file mode 100644 index 00000000..22516ff7 --- /dev/null +++ b/opencompass/configs/models/gemma/vllm_gemma_3_4b_it.py @@ -0,0 +1,17 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='gemma-3-4b-it-vllm', + 
path='google/gemma-3-4b-it', + model_kwargs=dict(tensor_parallel_size=2, + # for long context + rope_scaling={'factor': 8.0, 'rope_type': 'linear'}), + max_seq_len=140000, + max_out_len=4096, + batch_size=1, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=2), + ) +] diff --git a/opencompass/configs/models/hf_internlm/lmdeploy_internlm3_8b_instruct_128k.py b/opencompass/configs/models/hf_internlm/lmdeploy_internlm3_8b_instruct_128k.py new file mode 100644 index 00000000..1cc4e251 --- /dev/null +++ b/opencompass/configs/models/hf_internlm/lmdeploy_internlm3_8b_instruct_128k.py @@ -0,0 +1,19 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='internlm3-8b-instruct-turbomind', + path='internlm/internlm3-8b-instruct', + engine_config=dict(session_len=142000, max_batch_size=1, tp=2, + # for long context + rope_scaling_factor=6.0), + gen_config=dict( + top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=8192 + ), + max_seq_len=142000, + max_out_len=8192, + batch_size=1, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/opencompass/configs/models/hf_internlm/lmdeploy_oreal_32b.py b/opencompass/configs/models/hf_internlm/lmdeploy_oreal_32b.py new file mode 100644 index 00000000..1d10bd94 --- /dev/null +++ b/opencompass/configs/models/hf_internlm/lmdeploy_oreal_32b.py @@ -0,0 +1,20 @@ +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='OREAL-32B', + path='internlm/OREAL-32B', + engine_config=dict(session_len=32768, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, + temperature=1e-6, + top_p=0.9, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=16, + run_cfg=dict(num_gpus=4), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/qwen2_5/vllm_qwen2_5_14b_instruct_128k.py b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_14b_instruct_128k.py new file mode 100644 index 00000000..6dec3743 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_14b_instruct_128k.py @@ -0,0 +1,21 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2.5-14b-instruct-vllm', + path='Qwen/Qwen2.5-14B-Instruct', + model_kwargs=dict( + tensor_parallel_size=4, + rope_scaling={ + 'factor': 4.0, + 'original_max_position_embeddings': 32768, + 'rope_type': 'yarn' + }, + ), + max_out_len=4096, + batch_size=1, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=4), + ) +] diff --git a/opencompass/configs/models/qwen2_5/vllm_qwen2_5_32b_instruct_128k.py b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_32b_instruct_128k.py new file mode 100644 index 00000000..5c326734 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_32b_instruct_128k.py @@ -0,0 +1,21 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2.5-32b-instruct-vllm', + path='Qwen/Qwen2.5-32B-Instruct', + model_kwargs=dict( + tensor_parallel_size=8, + rope_scaling={ + 'factor': 4.0, + 'original_max_position_embeddings': 32768, + 'rope_type': 'yarn' + }, + ), + max_out_len=4096, + batch_size=1, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=8), + ) +] diff --git a/opencompass/configs/models/qwen2_5/vllm_qwen2_5_72b_instruct_128k.py 
b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_72b_instruct_128k.py new file mode 100644 index 00000000..2a4a52fa --- /dev/null +++ b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_72b_instruct_128k.py @@ -0,0 +1,21 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2_5-72b-instruct-vllm', + path='Qwen/Qwen2.5-72B-Instruct', + model_kwargs=dict( + tensor_parallel_size=8, + rope_scaling={ + 'factor': 4.0, + 'original_max_position_embeddings': 32768, + 'rope_type': 'yarn' + }, + ), + max_out_len=4096, + batch_size=1, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=8), + ) +] diff --git a/opencompass/configs/models/qwen2_5/vllm_qwen2_5_7b_instruct_128k.py b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_7b_instruct_128k.py new file mode 100644 index 00000000..db21f730 --- /dev/null +++ b/opencompass/configs/models/qwen2_5/vllm_qwen2_5_7b_instruct_128k.py @@ -0,0 +1,21 @@ +from opencompass.models import VLLMwithChatTemplate + +models = [ + dict( + type=VLLMwithChatTemplate, + abbr='qwen2.5-7b-instruct-vllm', + path='Qwen/Qwen2.5-7B-Instruct', + model_kwargs=dict( + tensor_parallel_size=4, + rope_scaling={ + 'factor': 4.0, + 'original_max_position_embeddings': 32768, + 'rope_type': 'yarn' + }, + ), + max_out_len=4096, + batch_size=1, + generation_kwargs=dict(temperature=0), + run_cfg=dict(num_gpus=4), + ) +] From a7f3ac20b259339b851e4664991686161fa0d866 Mon Sep 17 00:00:00 2001 From: huihui1999 <107675879+bio-mlhui@users.noreply.github.com> Date: Thu, 8 May 2025 19:44:46 +0800 Subject: [PATCH 09/28] [Dataset] Add CARDBiomedBench (#2071) * CARDBiomedBench * fix hash * fix dataset-index * use official llmjudge postprocess * use official llmjudge_postprocess * fix lint * fix init --- dataset-index.yml | 6 ++ .../CARDBiomedBench_llmjudge_gen_99a231.py | 101 ++++++++++++++++++ opencompass/datasets/CARDBiomedBench.py | 30 ++++++ opencompass/datasets/__init__.py | 1 + 4 files changed, 138 insertions(+) create mode 100644 opencompass/configs/datasets/CARDBiomedBench/CARDBiomedBench_llmjudge_gen_99a231.py create mode 100644 opencompass/datasets/CARDBiomedBench.py diff --git a/dataset-index.yml b/dataset-index.yml index 1bfbdbbc..f0960740 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -361,6 +361,12 @@ paper: https://arxiv.org/pdf/2004.05986 configpath: opencompass/configs/datasets/CLUE_C3/CLUE_C3_gen.py configpath_llmjudge: '' +- CARDBiomedBench: + name: CARDBiomedBench + category: Knowledge / Medicine + paper: https://www.biorxiv.org/content/10.1101/2025.01.15.633272v1 + configpath: opencompass/configs/datasets/CARDBiomedBench + configpath_llmjudge: 'opencompass/configs/datasets/CARDBiomedBench/CARDBiomedBench_llmjudge_gen_99a231.py' - cb: name: SuperGLUE / CB category: Reasoning diff --git a/opencompass/configs/datasets/CARDBiomedBench/CARDBiomedBench_llmjudge_gen_99a231.py b/opencompass/configs/datasets/CARDBiomedBench/CARDBiomedBench_llmjudge_gen_99a231.py new file mode 100644 index 00000000..c6acb71e --- /dev/null +++ b/opencompass/configs/datasets/CARDBiomedBench/CARDBiomedBench_llmjudge_gen_99a231.py @@ -0,0 +1,101 @@ +from opencompass.datasets import CARDBiomedBenchDataset +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.evaluator import GenericLLMEvaluator +ZERO_SHOT_PROMPT = 
'You are an expert in {expert}.\n{question}\n' + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + : Q: You are an expert in {expert}.\n{question}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + + +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'question', + 'answer', + 'Bio_Category', + 'SQL_Category', + 'uuid', + 'template uuid', + 'expert', + ], + output_column='answer', +) +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + + role='HUMAN', + prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=CARDBiomedBenchDataset, + path='NIH-CARD/CARDBiomedBench', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), +) +cardbiomedbench_dataset = dict( + type=CARDBiomedBenchDataset, + abbr='cardbiomedbench', + path='NIH-CARD/CARDBiomedBench', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, +) +cardbiomedbench_datasets = [cardbiomedbench_dataset] diff --git a/opencompass/datasets/CARDBiomedBench.py b/opencompass/datasets/CARDBiomedBench.py new file mode 100644 index 00000000..77ff9ee6 --- /dev/null +++ b/opencompass/datasets/CARDBiomedBench.py @@ -0,0 +1,30 @@ +from datasets import load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +def _parse(item, prompt_mode): + item['expert'] = item['Bio_Category'] + item['start'] = chr(65) + item['end'] = chr(65 + len(item.get('choices', {'label': []})['label']) - + 1) + item['prompt_mode'] = prompt_mode + return item + + +@LOAD_DATASET.register_module() +class CARDBiomedBenchDataset(BaseDataset): + + @staticmethod + def load(path: str, prompt_mode: str, **kwargs): + data_files = {'test': 'data/CARDBiomedBench.csv'} + dataset = load_dataset(path, data_files=data_files, split='test') + # dataset = dataset.select(range(200)) + if prompt_mode == 'zero-shot': + dataset = dataset.map(lambda item: _parse(item, prompt_mode), + load_from_cache_file=False) + elif prompt_mode == 'few-shot': + pass # TODO: Implement few-shot prompt + return dataset diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 220ce030..03e7d228 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -16,6 +16,7 @@ from .boolq import * # noqa: F401, F403 from .bustum import * # noqa: F401, F403 from .c3 import * # noqa: F401, F403 from .calm import * # noqa: F401, F403 +from .CARDBiomedBench import CARDBiomedBenchDataset # noqa: F401 from .cb import * # noqa: F401, F403 from .ceval import * # noqa: F401, F403 from .charm import * # noqa: F401, F403 From c5048bfec7890ad1d1a3efa77963e3eba0c97730 Mon Sep 17 00:00:00 2001 From: tcheng Date: Fri, 9 May 2025 14:31:12 +0800 Subject: [PATCH 10/28] [Dataset] Add Lifescience Sub-set Support for SciEval (#2059) * style: pass all formatting hooks (yapf & quote fixer) * revise name:Add Lifescience Sub-set Support for MMLU & SciEval 
(datasets + configs + loader) * revise name:Add Lifescience SciEval (datasets + configs + loader+dataset-index.yml) * Add Lifescience SciEval (datasets + configs + loader+dataset-index.yml) --------- Co-authored-by: root --- dataset-index.yml | 6 + .../SciEval_lifescience_0shot_gen_4043d4.py | 61 +++++++++ ...l_lifescience_0shot_llmjudge_gen_012dd1.py | 125 ++++++++++++++++++ .../SciEval_lifescience_sets.py | 3 + opencompass/datasets/SciEval_lifescience.py | 62 +++++++++ opencompass/datasets/__init__.py | 1 + 6 files changed, 258 insertions(+) create mode 100644 opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_gen_4043d4.py create mode 100644 opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_llmjudge_gen_012dd1.py create mode 100644 opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_sets.py create mode 100644 opencompass/datasets/SciEval_lifescience.py diff --git a/dataset-index.yml b/dataset-index.yml index f0960740..fcf34dcb 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -695,6 +695,12 @@ paper: https://arxiv.org/pdf/2009.03300 configpath: opencompass/configs/datasets/mmlu/mmlu_gen.py configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py +- SciEval: + name: SciEval + category: Understanding + paper: https://arxiv.org/pdf/2308.13149 + configpath: opencompass/configs/datasets/SciEval_lifscience/SciEval_lifscience_gen.py + configpath_llmjudge: opencompass/configs/datasets/SciEval_lifscience/SciEval_lifscience_llm_judge_gen.py - mmlu_cf: name: MMLU-CF category: Understanding diff --git a/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_gen_4043d4.py b/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_gen_4043d4.py new file mode 100644 index 00000000..5381abcf --- /dev/null +++ b/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_gen_4043d4.py @@ -0,0 +1,61 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.utils.text_postprocessors import first_option_postprocess +from opencompass.datasets import SciEvalDataset # 你自己实现的类 + +# 只评测 biology + multiple-choice 的 test split +_hint = ('Given a question and four options, please select the right answer. ' + "Your answer should be 'A', 'B', 'C' or 'D'.") + +scieval_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='test', +) + +scieval_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), +) + +scieval_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), +) + +scieval_datasets = [ + dict( + abbr='scieval_biology', + type=SciEvalDataset, + path='OpenDFM/SciEval', + name='default', + reader_cfg=scieval_reader_cfg, + infer_cfg=scieval_infer_cfg, + eval_cfg=scieval_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_llmjudge_gen_012dd1.py b/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_llmjudge_gen_012dd1.py new file mode 100644 index 00000000..26af5cd3 --- /dev/null +++ b/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_llmjudge_gen_012dd1.py @@ -0,0 +1,125 @@ +# SciEval_lifescience_llmjudge_gen.py + +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.utils.text_postprocessors import match_answer_pattern +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import SciEvalDataset + +with read_base(): + from .SciEval_lifescience_sets import SciEval_lifescience_subsets + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. + +{input} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + +GRADER_TEMPLATE = """ +Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + +Here are some evaluation criteria: +1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. +2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. +3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. +4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + +Please judge whether the following answers are consistent with the standard answer based on the above criteria. 
Grade the predicted answer of this new question as one of: +A: CORRECT +B: INCORRECT +Just return the letters "A" or "B", with no text around it. + +Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + +: {input} +A) {A} +B) {B} +C) {C} +D) {D} + + +: +{target} + + +: +{prediction} + + +Judging the correctness of candidates' answers: +""".strip() + +scieval_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='test', +) + +scieval_datasets = [] +for name in SciEval_lifescience_subsets: + scieval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + scieval_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt=( + 'You are a helpful assistant who evaluates the correctness ' + "and quality of models' outputs." + ), + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=SciEvalDataset, + path='OpenDFM/SciEval', + name='default', + reader_cfg=scieval_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', + ) + + scieval_datasets.append( + dict( + abbr=f'scieval_lifescience_{name}_llmjudge', + type=SciEvalDataset, + path='OpenDFM/SciEval', + name='default', + reader_cfg=scieval_reader_cfg, + infer_cfg=scieval_infer_cfg, + eval_cfg=scieval_eval_cfg, + mode='singlescore', + ) + ) diff --git a/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_sets.py b/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_sets.py new file mode 100644 index 00000000..8d0a0a83 --- /dev/null +++ b/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_sets.py @@ -0,0 +1,3 @@ +SciEval_lifescience_subsets = [ + 'biology', # 大学生物学 +] diff --git a/opencompass/datasets/SciEval_lifescience.py b/opencompass/datasets/SciEval_lifescience.py new file mode 100644 index 00000000..af93e496 --- /dev/null +++ b/opencompass/datasets/SciEval_lifescience.py @@ -0,0 +1,62 @@ +import re +from typing import List + +from datasets import Dataset, DatasetDict, load_dataset + +from opencompass.datasets.base import BaseDataset +from opencompass.registry import LOAD_DATASET + +# 预编译的多选题正则,按 PEP-8 每行 < 79 字符 +_PATTERN_MC = ( + r'^(?P.*?)' # 题干 + r'(?:A\.)\s*(?P.*?)\s*' # 选项 A + r'B\.\s*(?P.*?)\s*' # 选项 B + r'C\.\s*(?P.*?)\s*' # 选项 C + r'D\.\s*(?P.*?)' # 选项 D + r'Answer:' # 答案分隔符 +) + + +@LOAD_DATASET.register_module() +class SciEvalDataset(BaseDataset): + """Biology multiple-choice subset of SciEval.""" + + @staticmethod + def load(path: str, name: str, **kwargs) -> DatasetDict: + dataset = DatasetDict() + + for split in ('test', ): + raw_iter = load_dataset( + path, + name=name, + split=split, + streaming=True, + ) + + examples: List[dict] = [] + for ex in raw_iter: + if (ex.get('category') != 'biology' + or ex.get('type') != 'multiple-choice'): + continue + + ans_list = ex.get('answer') or ex.get('answers') or [] + if not ans_list: + continue + target = ans_list[0] + + match = re.search(_PATTERN_MC, ex.get('question', ''), re.S) + if not match: + continue + + examples.append({ + 'input': 
match.group('stem').strip(), + 'A': match.group('A').strip(), + 'B': match.group('B').strip(), + 'C': match.group('C').strip(), + 'D': match.group('D').strip(), + 'target': target, + }) + + dataset[split] = Dataset.from_list(examples) + + return dataset diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 03e7d228..a70b27d5 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -130,6 +130,7 @@ from .ruler import * # noqa: F401, F403 from .safety import * # noqa: F401, F403 from .scibench import ScibenchDataset, scibench_postprocess # noqa: F401, F403 from .scicode import * # noqa: F401, F403 +from .SciEval_lifescience import SciEvalDataset # noqa: F401 from .simpleqa import * # noqa: F401, F403 from .siqa import * # noqa: F401, F403 from .smolinstruct import * # noqa: F401, F403 From d72df59363ef1e4e67c6f7a3873268badf16c205 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Fri, 9 May 2025 14:46:27 +0800 Subject: [PATCH 11/28] [Revert] Add Lifescience Sub-set Support for SciEval (#2059) (#2087) This reverts commit c5048bfec7890ad1d1a3efa77963e3eba0c97730. --- dataset-index.yml | 6 - .../SciEval_lifescience_0shot_gen_4043d4.py | 61 --------- ...l_lifescience_0shot_llmjudge_gen_012dd1.py | 125 ------------------ .../SciEval_lifescience_sets.py | 3 - opencompass/datasets/SciEval_lifescience.py | 62 --------- opencompass/datasets/__init__.py | 1 - 6 files changed, 258 deletions(-) delete mode 100644 opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_gen_4043d4.py delete mode 100644 opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_llmjudge_gen_012dd1.py delete mode 100644 opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_sets.py delete mode 100644 opencompass/datasets/SciEval_lifescience.py diff --git a/dataset-index.yml b/dataset-index.yml index fcf34dcb..f0960740 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -695,12 +695,6 @@ paper: https://arxiv.org/pdf/2009.03300 configpath: opencompass/configs/datasets/mmlu/mmlu_gen.py configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py -- SciEval: - name: SciEval - category: Understanding - paper: https://arxiv.org/pdf/2308.13149 - configpath: opencompass/configs/datasets/SciEval_lifscience/SciEval_lifscience_gen.py - configpath_llmjudge: opencompass/configs/datasets/SciEval_lifscience/SciEval_lifscience_llm_judge_gen.py - mmlu_cf: name: MMLU-CF category: Understanding diff --git a/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_gen_4043d4.py b/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_gen_4043d4.py deleted file mode 100644 index 5381abcf..00000000 --- a/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_gen_4043d4.py +++ /dev/null @@ -1,61 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import FixKRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator -from opencompass.utils.text_postprocessors import first_option_postprocess -from opencompass.datasets import SciEvalDataset # 你自己实现的类 - -# 只评测 biology + multiple-choice 的 test split -_hint = ('Given a question and four options, please select the right answer. 
' - "Your answer should be 'A', 'B', 'C' or 'D'.") - -scieval_reader_cfg = dict( - input_columns=['input', 'A', 'B', 'C', 'D'], - output_column='target', - train_split='test', -) - -scieval_infer_cfg = dict( - ice_template=dict( - type=PromptTemplate, - template=dict(round=[ - dict( - role='HUMAN', - prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' - ), - dict(role='BOT', prompt='{target}\n') - ]), - ), - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin='', - round=[ - dict( - role='HUMAN', - prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' - ), - ], - ), - ice_token='', - ), - retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), - inferencer=dict(type=GenInferencer), -) - -scieval_eval_cfg = dict( - evaluator=dict(type=AccwithDetailsEvaluator), - pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), -) - -scieval_datasets = [ - dict( - abbr='scieval_biology', - type=SciEvalDataset, - path='OpenDFM/SciEval', - name='default', - reader_cfg=scieval_reader_cfg, - infer_cfg=scieval_infer_cfg, - eval_cfg=scieval_eval_cfg, - ) -] diff --git a/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_llmjudge_gen_012dd1.py b/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_llmjudge_gen_012dd1.py deleted file mode 100644 index 26af5cd3..00000000 --- a/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_0shot_llmjudge_gen_012dd1.py +++ /dev/null @@ -1,125 +0,0 @@ -# SciEval_lifescience_llmjudge_gen.py - -from mmengine.config import read_base -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.utils.text_postprocessors import match_answer_pattern -from opencompass.evaluator import GenericLLMEvaluator -from opencompass.datasets import generic_llmjudge_postprocess -from opencompass.datasets import SciEvalDataset - -with read_base(): - from .SciEval_lifescience_sets import SciEval_lifescience_subsets - -QUERY_TEMPLATE = """ -Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. - -{input} - -A) {A} -B) {B} -C) {C} -D) {D} -""".strip() - -GRADER_TEMPLATE = """ -Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. - -Here are some evaluation criteria: -1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. -2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. -3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. 
As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. -4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. - -Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: -A: CORRECT -B: INCORRECT -Just return the letters "A" or "B", with no text around it. - -Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. - -: {input} -A) {A} -B) {B} -C) {C} -D) {D} - - -: -{target} - - -: -{prediction} - - -Judging the correctness of candidates' answers: -""".strip() - -scieval_reader_cfg = dict( - input_columns=['input', 'A', 'B', 'C', 'D'], - output_column='target', - train_split='test', -) - -scieval_datasets = [] -for name in SciEval_lifescience_subsets: - scieval_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - round=[ - dict(role='HUMAN', prompt=QUERY_TEMPLATE), - ] - ) - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer), - ) - - scieval_eval_cfg = dict( - evaluator=dict( - type=GenericLLMEvaluator, - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin=[ - dict( - role='SYSTEM', - fallback_role='HUMAN', - prompt=( - 'You are a helpful assistant who evaluates the correctness ' - "and quality of models' outputs." 
- ), - ) - ], - round=[ - dict(role='HUMAN', prompt=GRADER_TEMPLATE), - ], - ), - ), - dataset_cfg=dict( - type=SciEvalDataset, - path='OpenDFM/SciEval', - name='default', - reader_cfg=scieval_reader_cfg, - ), - judge_cfg=dict(), - dict_postprocessor=dict(type=generic_llmjudge_postprocess), - ), - pred_role='BOT', - ) - - scieval_datasets.append( - dict( - abbr=f'scieval_lifescience_{name}_llmjudge', - type=SciEvalDataset, - path='OpenDFM/SciEval', - name='default', - reader_cfg=scieval_reader_cfg, - infer_cfg=scieval_infer_cfg, - eval_cfg=scieval_eval_cfg, - mode='singlescore', - ) - ) diff --git a/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_sets.py b/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_sets.py deleted file mode 100644 index 8d0a0a83..00000000 --- a/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_sets.py +++ /dev/null @@ -1,3 +0,0 @@ -SciEval_lifescience_subsets = [ - 'biology', # 大学生物学 -] diff --git a/opencompass/datasets/SciEval_lifescience.py b/opencompass/datasets/SciEval_lifescience.py deleted file mode 100644 index af93e496..00000000 --- a/opencompass/datasets/SciEval_lifescience.py +++ /dev/null @@ -1,62 +0,0 @@ -import re -from typing import List - -from datasets import Dataset, DatasetDict, load_dataset - -from opencompass.datasets.base import BaseDataset -from opencompass.registry import LOAD_DATASET - -# 预编译的多选题正则,按 PEP-8 每行 < 79 字符 -_PATTERN_MC = ( - r'^(?P.*?)' # 题干 - r'(?:A\.)\s*(?P.*?)\s*' # 选项 A - r'B\.\s*(?P.*?)\s*' # 选项 B - r'C\.\s*(?P.*?)\s*' # 选项 C - r'D\.\s*(?P.*?)' # 选项 D - r'Answer:' # 答案分隔符 -) - - -@LOAD_DATASET.register_module() -class SciEvalDataset(BaseDataset): - """Biology multiple-choice subset of SciEval.""" - - @staticmethod - def load(path: str, name: str, **kwargs) -> DatasetDict: - dataset = DatasetDict() - - for split in ('test', ): - raw_iter = load_dataset( - path, - name=name, - split=split, - streaming=True, - ) - - examples: List[dict] = [] - for ex in raw_iter: - if (ex.get('category') != 'biology' - or ex.get('type') != 'multiple-choice'): - continue - - ans_list = ex.get('answer') or ex.get('answers') or [] - if not ans_list: - continue - target = ans_list[0] - - match = re.search(_PATTERN_MC, ex.get('question', ''), re.S) - if not match: - continue - - examples.append({ - 'input': match.group('stem').strip(), - 'A': match.group('A').strip(), - 'B': match.group('B').strip(), - 'C': match.group('C').strip(), - 'D': match.group('D').strip(), - 'target': target, - }) - - dataset[split] = Dataset.from_list(examples) - - return dataset diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index a70b27d5..03e7d228 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -130,7 +130,6 @@ from .ruler import * # noqa: F401, F403 from .safety import * # noqa: F401, F403 from .scibench import ScibenchDataset, scibench_postprocess # noqa: F401, F403 from .scicode import * # noqa: F401, F403 -from .SciEval_lifescience import SciEvalDataset # noqa: F401 from .simpleqa import * # noqa: F401, F403 from .siqa import * # noqa: F401, F403 from .smolinstruct import * # noqa: F401, F403 From 6097186a95e8bbaa38d817dd990065aa83552fe6 Mon Sep 17 00:00:00 2001 From: Jin Ye Date: Fri, 9 May 2025 16:47:44 +1000 Subject: [PATCH 12/28] [Datasets] MedQA, ProteinLMBench; Add Models: huatuogpt, baichuanM1 (#2064) * Add Datasets: MedQA, ProteinLMBench; Add Models: huatuogpt, baichuanM1 * Fix bugs for MedQA. 
Add info in dataset-index * Add version code for MedQA and ProteinLMBench * Add version code for MedQA and ProteinLMBench --- dataset-index.yml | 12 ++ .../datasets/MedQA/MedQA_gen_3bf756.py | 63 ++++++++++ .../MedQA/MedQA_llmjudge_gen_3bf756.py | 108 ++++++++++++++++++ .../ProteinLMBench_gen_a67965.py | 46 ++++++++ .../ProteinLMBench_llmjudge_gen_a67965.py | 89 +++++++++++++++ .../baichuan/hf_baichuan_m1_14b_base.py | 14 +++ .../baichuan/hf_baichuan_m1_14b_instruct.py | 14 +++ .../models/huatuogpt/hf_huatuogpt2_13b.py | 17 +++ .../models/huatuogpt/hf_huatuogpt2_7b.py | 13 +++ .../models/huatuogpt/hf_huatuogpt_o1_7b.py | 15 +++ .../models/huatuogpt/hf_huatuogpt_o1_8b.py | 15 +++ opencompass/datasets/MedQA.py | 29 +++++ opencompass/datasets/ProteinLMBench.py | 58 ++++++++++ opencompass/datasets/__init__.py | 2 + 14 files changed, 495 insertions(+) create mode 100644 opencompass/configs/datasets/MedQA/MedQA_gen_3bf756.py create mode 100644 opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen_3bf756.py create mode 100644 opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_gen_a67965.py create mode 100644 opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llmjudge_gen_a67965.py create mode 100644 opencompass/configs/models/baichuan/hf_baichuan_m1_14b_base.py create mode 100644 opencompass/configs/models/baichuan/hf_baichuan_m1_14b_instruct.py create mode 100644 opencompass/configs/models/huatuogpt/hf_huatuogpt2_13b.py create mode 100644 opencompass/configs/models/huatuogpt/hf_huatuogpt2_7b.py create mode 100644 opencompass/configs/models/huatuogpt/hf_huatuogpt_o1_7b.py create mode 100644 opencompass/configs/models/huatuogpt/hf_huatuogpt_o1_8b.py create mode 100644 opencompass/datasets/MedQA.py create mode 100644 opencompass/datasets/ProteinLMBench.py diff --git a/dataset-index.yml b/dataset-index.yml index f0960740..abd0878a 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -122,6 +122,12 @@ paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138 configpath: opencompass/configs/datasets/MedBench/medbench_gen.py configpath_llmjudge: '' +- MedXpertQA: + name: MedQA + category: Knowledge / Medicine + paper: https://arxiv.org/abs/2009.13081 + configpath: opencompass/configs/datasets/MedQA/MedQA_gen.py + configpath_llmjudge: opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen.py - MedXpertQA: name: MedXpertQA category: Knowledge / Medicine @@ -763,6 +769,12 @@ paper: https://arxiv.org/pdf/1911.11641v1 configpath: opencompass/configs/datasets/piqa/piqa_gen.py configpath_llmjudge: '' +- ProteinLMBench: + name: ProteinLMBench + category: Knowledge / Biology (Protein) + paper: https://arxiv.org/abs/2406.05540 + configpath: opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_gen.py + configpath_llmjudge: opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llmjudge_gen.py - py150: name: py150 category: Code diff --git a/opencompass/configs/datasets/MedQA/MedQA_gen_3bf756.py b/opencompass/configs/datasets/MedQA/MedQA_gen_3bf756.py new file mode 100644 index 00000000..01306134 --- /dev/null +++ b/opencompass/configs/datasets/MedQA/MedQA_gen_3bf756.py @@ -0,0 +1,63 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.utils.text_postprocessors import first_option_postprocess +from opencompass.datasets.MedQA import MedQADataset + + 
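+# Note: a rough sketch of how the pieces below fit together, kept as comments
+# only. MedQADataset flattens each item's options into a single `choices`
+# string such as 'A. ...\nB. ...\n', which fills the {choices} slot of
+# QUERY_TEMPLATE; the accuracy evaluator then relies on
+# first_option_postprocess (configured with options='ABCD' further down) to
+# pull the final letter out of the model reply. Assumed usage, with
+# illustrative values that are not taken from the dataset:
+#
+#     from opencompass.utils.text_postprocessors import first_option_postprocess
+#
+#     reply = 'Step-by-step reasoning about the case...\nANSWER: B'
+#     first_option_postprocess(reply, options='ABCD')  # expected to return 'B'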
+QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering. + +Question:\n +{question} + +Options:\n +{choices} + +""".strip() + + +MedQA_datasets = [] + +MedQA_reader_cfg = dict( + input_columns=['question', 'choices'], + output_column='label', +) + +MedQA_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +MedQA_subsets = { + 'US': 'xuxuxuxuxu/MedQA_US_test', + 'Mainland': 'xuxuxuxuxu/MedQA_Mainland_test', + 'Taiwan': 'xuxuxuxuxu/MedQA_Taiwan_test', +} + +for split in list(MedQA_subsets.keys()): + + MedQA_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') + ) + + MedQA_datasets.append( + dict( + abbr=f'MedQA_{split}', + type=MedQADataset, + path=MedQA_subsets[split], + reader_cfg=MedQA_reader_cfg, + infer_cfg=MedQA_infer_cfg, + eval_cfg=MedQA_eval_cfg, + ) + ) diff --git a/opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen_3bf756.py b/opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen_3bf756.py new file mode 100644 index 00000000..d6c19119 --- /dev/null +++ b/opencompass/configs/datasets/MedQA/MedQA_llmjudge_gen_3bf756.py @@ -0,0 +1,108 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets.MedQA import MedQADataset + + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering. + +Question:\n +{question} + +Options:\n +{choices} + +""".strip() + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. 
For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {question}\n {choices} \n\n\n + : \n{label}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +MedQA_datasets = [] + +MedQA_reader_cfg = dict( + input_columns=['question', 'choices'], + output_column='label', +) + +MedQA_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +MedQA_subsets = { + 'US': 'xuxuxuxuxu/MedQA_US_test', + 'Mainland': 'xuxuxuxuxu/MedQA_Mainland_test', + 'Taiwan': 'xuxuxuxuxu/MedQA_Taiwan_test', +} + +for split in list(MedQA_subsets.keys()): + + MedQA_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MedQADataset, + path=MedQA_subsets[split], + reader_cfg=MedQA_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + ) + + MedQA_datasets.append( + dict( + abbr=f'MedQA_{split}', + type=MedQADataset, + path=MedQA_subsets[split], + reader_cfg=MedQA_reader_cfg, + infer_cfg=MedQA_infer_cfg, + eval_cfg=MedQA_eval_cfg, + ) + ) diff --git a/opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_gen_a67965.py b/opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_gen_a67965.py new file mode 100644 index 00000000..2cf2f220 --- /dev/null +++ b/opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_gen_a67965.py @@ -0,0 +1,46 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets.ProteinLMBench import ProteinLMBenchDataset, ProteinLMBenchEvaluator + +QUERY_TEMPLATE = "Answer the following multiple choice question. There is only one correct answer. 
The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is the letter among {start} through {end}.\n{question}" + + +# Reader configuration +reader_cfg = dict( + input_columns=['question', 'start', 'end', 'options'], + output_column='label', +) + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=QUERY_TEMPLATE + ) + ], ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict(type=ProteinLMBenchEvaluator), +) + +proteinlmbench_dataset = dict( + abbr='ProteinLMBench', + type=ProteinLMBenchDataset, + path='tsynbio/ProteinLMBench', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg +) + +proteinlmbench_datasets = [proteinlmbench_dataset] diff --git a/opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llmjudge_gen_a67965.py b/opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llmjudge_gen_a67965.py new file mode 100644 index 00000000..5254677e --- /dev/null +++ b/opencompass/configs/datasets/ProteinLMBench/ProteinLMBench_llmjudge_gen_a67965.py @@ -0,0 +1,89 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets.ProteinLMBench import ProteinLMBenchDataset + +QUERY_TEMPLATE = "Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is the letter among {start} through {end}.\n{question}" + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. 
And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {question}\n\n\n + : \n{label}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + + +reader_cfg = dict( + input_columns=['question', 'start', 'end', 'options'], + output_column='label', +) + +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=ProteinLMBenchDataset, + path='tsynbio/ProteinLMBench', + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), +) + +proteinlmbench_dataset = dict( + abbr='ProteinLMBench', + type=ProteinLMBenchDataset, + path='tsynbio/ProteinLMBench', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg +) + +proteinlmbench_datasets = [proteinlmbench_dataset] diff --git a/opencompass/configs/models/baichuan/hf_baichuan_m1_14b_base.py b/opencompass/configs/models/baichuan/hf_baichuan_m1_14b_base.py new file mode 100644 index 00000000..e5b59bfb --- /dev/null +++ b/opencompass/configs/models/baichuan/hf_baichuan_m1_14b_base.py @@ -0,0 +1,14 @@ +import torch +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='baichuan-m1-14b-base-hf', + path='baichuan-inc/Baichuan-M1-14B-Base', + max_out_len=1024, + batch_size=8, + model_kwargs=dict(device_map='auto', trust_remote_code=True, torch_dtype=torch.bfloat16), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/baichuan/hf_baichuan_m1_14b_instruct.py b/opencompass/configs/models/baichuan/hf_baichuan_m1_14b_instruct.py new file mode 100644 index 00000000..b90f39fb --- /dev/null +++ b/opencompass/configs/models/baichuan/hf_baichuan_m1_14b_instruct.py @@ -0,0 +1,14 @@ +import torch +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='baichuan-m1-14b-instruct-hf', + path='baichuan-inc/Baichuan-M1-14B-Instruct', + max_out_len=2048, + batch_size=8, + model_kwargs=dict(device_map='auto', trust_remote_code=True, torch_dtype=torch.bfloat16), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/huatuogpt/hf_huatuogpt2_13b.py b/opencompass/configs/models/huatuogpt/hf_huatuogpt2_13b.py new file mode 100644 index 00000000..d5ffbf6e --- /dev/null +++ b/opencompass/configs/models/huatuogpt/hf_huatuogpt2_13b.py @@ -0,0 +1,17 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + 
abbr='huatuogpt2-13b-hf', + path='FreedomIntelligence/HuatuoGPT2-13B', + tokenizer_kwargs=dict(padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=True,), + max_out_len=1024, + batch_size=8, + model_kwargs=dict(device_map='auto', trust_remote_code=True), + run_cfg=dict(num_gpus=4), + ) +] diff --git a/opencompass/configs/models/huatuogpt/hf_huatuogpt2_7b.py b/opencompass/configs/models/huatuogpt/hf_huatuogpt2_7b.py new file mode 100644 index 00000000..98d29ad2 --- /dev/null +++ b/opencompass/configs/models/huatuogpt/hf_huatuogpt2_7b.py @@ -0,0 +1,13 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='huatuogpt2-7b-hf', + path='FreedomIntelligence/HuatuoGPT2-7B', + max_out_len=1024, + batch_size=8, + model_kwargs=dict(device_map='auto', trust_remote_code=True), + run_cfg=dict(num_gpus=1), + ) +] diff --git a/opencompass/configs/models/huatuogpt/hf_huatuogpt_o1_7b.py b/opencompass/configs/models/huatuogpt/hf_huatuogpt_o1_7b.py new file mode 100644 index 00000000..db1130e1 --- /dev/null +++ b/opencompass/configs/models/huatuogpt/hf_huatuogpt_o1_7b.py @@ -0,0 +1,15 @@ +from opencompass.models import HuggingFacewithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='huatuogpt-o1-7b-hf', + path='FreedomIntelligence/HuatuoGPT-o1-7B', + max_out_len=2048, + batch_size=8, + model_kwargs=dict(device_map='auto', trust_remote_code=True), + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content, think_start_token='## Thinking', think_end_token='## Final Response'), + ) +] diff --git a/opencompass/configs/models/huatuogpt/hf_huatuogpt_o1_8b.py b/opencompass/configs/models/huatuogpt/hf_huatuogpt_o1_8b.py new file mode 100644 index 00000000..ba2e2c1d --- /dev/null +++ b/opencompass/configs/models/huatuogpt/hf_huatuogpt_o1_8b.py @@ -0,0 +1,15 @@ +from opencompass.models import HuggingFacewithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='huatuogpt-o1-8b-hf', + path='FreedomIntelligence/HuatuoGPT-o1-8B', + max_out_len=2048, + batch_size=8, + model_kwargs=dict(device_map='auto', trust_remote_code=True), + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content, think_start_token='## Thinking', think_end_token='## Final Response'), + ) +] diff --git a/opencompass/datasets/MedQA.py b/opencompass/datasets/MedQA.py new file mode 100644 index 00000000..256f9910 --- /dev/null +++ b/opencompass/datasets/MedQA.py @@ -0,0 +1,29 @@ +from datasets import Dataset, load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class MedQADataset(BaseDataset): + + @staticmethod + def load_single(path): + dataset = [] + ds = load_dataset(path) + for data in ds['train']: + data['label'] = data['answer_idx'] + choices = '' + for option in data['options']: + choices += option + '. 
' + data['options'][option] + '\n' + data['choices'] = choices + + dataset.append(data) + + return Dataset.from_list(dataset) + + @staticmethod + def load(path): + dataset = MedQADataset.load_single(path) + return dataset diff --git a/opencompass/datasets/ProteinLMBench.py b/opencompass/datasets/ProteinLMBench.py new file mode 100644 index 00000000..bebaadfd --- /dev/null +++ b/opencompass/datasets/ProteinLMBench.py @@ -0,0 +1,58 @@ +from datasets import load_dataset + +from opencompass.openicl import BaseEvaluator +from opencompass.registry import LOAD_DATASET +from opencompass.utils.text_postprocessors import first_option_postprocess + +from .base import BaseDataset + + +def _parse(item): + item['start'] = chr(65) + item['end'] = chr(65 + len(item.get('options', [])) - 1) + new_options = [] + choices = '' + for i in range(len(item['options'])): + new_options.append(item['options'][i].split(': ')[-1]) + choices += chr(65 + + i) + '. ' + item['options'][i].split(': ')[-1] + '\n' + item['question'] = (f'\nQuestion: {item["question"]}\n' + f'Answer Choices: \n{choices}') + item['options'] = new_options + item['label'] = chr(65 + int(item['answer'].split(' ')[-1]) - + 1) # Index from 1 in answer + return item + + +@LOAD_DATASET.register_module() +class ProteinLMBenchDataset(BaseDataset): + + @staticmethod + def load(path: str, **kwargs): + dataset = load_dataset(path, 'evaluation', split='train') + dataset = dataset.map(lambda item: _parse(item)) + + return dataset + + +class ProteinLMBenchEvaluator(BaseEvaluator): + + def score(self, predictions, references, test_set): + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + correct = 0 + count = 0 + details = [] + for idx, (prediction, + reference) in enumerate(zip(predictions, references)): + options = ''.join( + [chr(65 + i) for i in range(len(test_set['options'][idx]))]) + predict = first_option_postprocess(prediction, options) + detail = {'pred': predict, 'answer': reference, 'correct': False} + count += 1 + if predict == reference: + correct += 1 + detail['correct'] = True + details.append(detail) + result = {'accuracy': 100 * correct / count, 'details': details} + return result diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 03e7d228..5a98a942 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -97,6 +97,7 @@ from .math_intern import * # noqa: F401, F403 from .mathbench import * # noqa: F401, F403 from .mbpp import * # noqa: F401, F403 from .medbench import * # noqa: F401, F403 +from .MedQA import * # noqa: F401, F403 from .MedXpertQA import * # noqa: F401, F403 from .mgsm import * # noqa: F401, F403 from .mmlu import * # noqa: F401, F403 @@ -118,6 +119,7 @@ from .OlympiadBench import * # noqa: F401, F403 from .OpenFinData import * # noqa: F401, F403 from .physics import * # noqa: F401, F403 from .piqa import * # noqa: F401, F403 +from .ProteinLMBench import * # noqa: F401, F403 from .py150 import * # noqa: F401, F403 from .qasper import * # noqa: F401, F403 from .qaspercut import * # noqa: F401, F403 From 7bdd3c190451f83fc4fa8f250b77f0bf35c8d628 Mon Sep 17 00:00:00 2001 From: Kun Yuan <31314392+Flaick@users.noreply.github.com> Date: Fri, 9 May 2025 09:07:26 +0200 Subject: [PATCH 13/28] [Dataset] MMLU_Pro Biomedical Version Support (#2081) --- .../mmlu_pro_biomed_0shot_cot_gen_057927.py | 60 +++++++++++ ...d_0shot_nocot_genericllmeval_gen_057927.py | 101 ++++++++++++++++++ 2 files changed, 161 insertions(+) create 
mode 100644 opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_cot_gen_057927.py create mode 100644 opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_nocot_genericllmeval_gen_057927.py diff --git a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_cot_gen_057927.py b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_cot_gen_057927.py new file mode 100644 index 00000000..02766491 --- /dev/null +++ b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_cot_gen_057927.py @@ -0,0 +1,60 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUProDataset +from opencompass.utils.text_postprocessors import match_answer_pattern + +categories = [ + 'health', +] + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering. +Question:\n +{question} +Options:\n +{options_str} +""".strip() + +mmlu_pro_datasets = [] + +for category in categories: + mmlu_pro_reader_cfg = dict( + input_columns=['question', 'cot_content', 'options_str'], + output_column='answer', + train_split='validation', + test_split='test', + ) + mmlu_pro_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', + prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_pro_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict( + type=match_answer_pattern, + answer_pattern=r'(?i)ANSWER\s*:\s*([A-P])') + ) + + mmlu_pro_datasets.append( + dict( + abbr=f'mmlu_pro_{category.replace(" ", "_")}', + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + infer_cfg=mmlu_pro_infer_cfg, + eval_cfg=mmlu_pro_eval_cfg, + )) \ No newline at end of file diff --git a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_nocot_genericllmeval_gen_057927.py b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_nocot_genericllmeval_gen_057927.py new file mode 100644 index 00000000..58cd20b1 --- /dev/null +++ b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_biomed_0shot_nocot_genericllmeval_gen_057927.py @@ -0,0 +1,101 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import MMLUProDataset, generic_llmjudge_postprocess + +categories = [ + 'health', +] + + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering. 
+Question:\n +{question} +Options:\n +{options_str} +""".strip() + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ : {question}\n {options_str} \n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +mmlu_pro_datasets = [] + +for category in categories: + mmlu_pro_reader_cfg = dict( + input_columns=['question', 'cot_content', 'options_str'], + output_column='answer', + train_split='validation', + test_split='test', + ) + mmlu_pro_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_pro_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + ) + + mmlu_pro_datasets.append( + dict( + abbr=f'mmlu_pro_{category.replace(" ", "_")}', + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + infer_cfg=mmlu_pro_infer_cfg, + eval_cfg=mmlu_pro_eval_cfg, + ) + ) \ No newline at end of file From 508e2b0cb252ce6adb2a12b92ef4bfa38f13d4d7 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Fri, 9 May 2025 15:21:47 +0800 Subject: [PATCH 14/28] [Update] Set load_from_cache_file to False (#2085) --- opencompass/datasets/base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/opencompass/datasets/base.py b/opencompass/datasets/base.py index ac6c4570..1ccbe9fd 100644 --- a/opencompass/datasets/base.py +++ b/opencompass/datasets/base.py @@ -23,7 +23,8 @@ class BaseDataset: 'idx': idx }, with_indices=True, - writer_batch_size=16) + writer_batch_size=16, + load_from_cache_file=False) dataset = concatenate_datasets([dataset] * n) self.dataset = dataset else: @@ -34,7 +35,8 @@ class BaseDataset: 'idx': idx }, with_indices=True, - writer_batch_size=16) + writer_batch_size=16, + load_from_cache_file=False) dataset[key] = concatenate_datasets([dataset[key]] * n) self.dataset[key] = dataset[key] self._init_reader(**reader_cfg) From 44a7024ed556917e158c41852fd7d0e23719e884 Mon Sep 17 00:00:00 2001 From: huihui1999 <107675879+bio-mlhui@users.noreply.github.com> Date: Fri, 9 May 2025 16:58:55 +0800 Subject: [PATCH 15/28] [Dataset] MedCalc_Bench (#2072) * MedCalc_Bench * MedCal_Bench * add hash * fix hash * fix comments &dataset-index yml * fix lint * fix lint * fix lint * fix lint * fix lint --------- Co-authored-by: Linchen Xiao --- dataset-index.yml | 6 + .../MedCalcBench_official_gen_a5155f.py | 57 ++++ opencompass/datasets/MedCalc_Bench.py | 323 ++++++++++++++++++ opencompass/datasets/__init__.py | 2 + 4 files changed, 388 insertions(+) create mode 100644 opencompass/configs/datasets/MedCalc_Bench/MedCalcBench_official_gen_a5155f.py create mode 100644 opencompass/datasets/MedCalc_Bench.py diff --git a/dataset-index.yml b/dataset-index.yml index abd0878a..a2179b92 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -122,6 +122,12 @@ paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138 configpath: opencompass/configs/datasets/MedBench/medbench_gen.py 
configpath_llmjudge: '' +- MedCalc_Bench: + name: MedCalc_Bench + category: Knowledge / Medicine + paper: https://arxiv.org/abs/2406.12036 + configpath: opencompass/configs/datasets/MedCalc_Bench/MedCalcBench_official_gen_a5155f.py + configpath_llmjudge: '' - MedXpertQA: name: MedQA category: Knowledge / Medicine diff --git a/opencompass/configs/datasets/MedCalc_Bench/MedCalcBench_official_gen_a5155f.py b/opencompass/configs/datasets/MedCalc_Bench/MedCalcBench_official_gen_a5155f.py new file mode 100644 index 00000000..74fdff5e --- /dev/null +++ b/opencompass/configs/datasets/MedCalc_Bench/MedCalcBench_official_gen_a5155f.py @@ -0,0 +1,57 @@ +from opencompass.datasets import MedCalc_BenchDataset, MedCalcOfficial_Evaluator +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +ZERO_SHOT_PROMPT = 'You are a helpful assistant for calculating a score for a given patient note. Please think step-by-step to solve the question and then generate the required score. Your output should only contain a JSON dict formatted as {"step_by_step_thinking": str(your_step_by_step_thinking_procress_to_solve_the_question), "answer": str(short_and_direct_answer_of_the_question)}. \n Here is the patient note:\n{patient_note}\n\nHere is the task:\n{question}\n\nPlease directly output the JSON dict formatted as {"step_by_step_thinking": str(your_step_by_step_thinking_procress_to_solve_the_question), "answer": str(short_and_direct_answer_of_the_question)}:' +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'row_number', + 'calculator_id', + 'calculator_name', + 'category', + 'note_id', + 'output_type', + 'note_type', + 'patient_note', + 'question', + 'relevant_entities', + 'ground_truth_answer', + 'lower_limit', + 'upper_limit', + 'ground_truth_explanation' + ], + output_column='ground_truth_answer', +) + + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN',prompt=ZERO_SHOT_PROMPT), + ]) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict(type=MedCalcOfficial_Evaluator), + pred_role='BOT', +) +medcal_bench_dataset = dict( + type=MedCalc_BenchDataset, + abbr='medcal_bench_official_zero_shot_eval', + path='ncbi/MedCalc-Bench-v1.0', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, +) + +medcal_bench_datasets = [medcal_bench_dataset] diff --git a/opencompass/datasets/MedCalc_Bench.py b/opencompass/datasets/MedCalc_Bench.py new file mode 100644 index 00000000..66855d5c --- /dev/null +++ b/opencompass/datasets/MedCalc_Bench.py @@ -0,0 +1,323 @@ +import math +import re +from datetime import datetime + +import numpy as np +from datasets import load_dataset + +from opencompass.openicl import BaseEvaluator +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +def check_correctness(answer: str, ground_truth, calid, upper_limit, + lower_limit): + """""" + calid = int(calid) + + if calid in [13, 68]: + # Output Type: date + + if datetime.strptime( + answer, + '%m/%d/%Y').strftime('%-m/%-d/%Y') == datetime.strptime( + ground_truth, '%m/%d/%Y').strftime('%-m/%-d/%Y'): + correctness = 1 + else: + correctness = 0 + elif calid in [69]: + # Output Type: integer (A, B) + match = re.search( + 
r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,?" + r"\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", ground_truth) + ground_truth = f'({match.group(1)}, {match.group(3)})' + match = re.search( + r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?,?" + r"\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", answer) + if match: + weeks = match.group(1) + days = match.group(3) + answer = f'({weeks}, {days})' + if eval(answer) == eval(ground_truth): + correctness = 1 + else: + correctness = 0 + else: + correctness = 0 + elif calid in [ + 4, 15, 16, 17, 18, 20, 21, 25, 27, 28, 29, 32, 33, 36, 43, 45, 48, + 51, 69 + ]: + # Output Type: integer A + answer = round(eval(answer)) + if answer == eval(ground_truth): + correctness = 1 + else: + correctness = 0 + elif calid in [ + 2, 3, 5, 6, 7, 8, 9, 10, 11, 19, 22, 23, 24, 26, 30, 31, 38, 39, + 40, 44, 46, 49, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67 + ]: + # Output Type: decimal + answer = eval(answer) + if answer >= eval(lower_limit) and answer <= eval(upper_limit): + correctness = 1 + else: + correctness = 0 + else: + raise ValueError(f'Unknown calculator ID: {calid}') + return correctness + + +def extract_answer(answer, calid): + + calid = int(calid) + extracted_answer = re.findall(r'[Aa]nswer":\s*(.*?)\}', answer) + matches = re.findall( + r'"step_by_step_thinking":\s*"' + r'([^"]+)"\s*,\s*"[Aa]nswer"', answer) + + if matches: + # Select the last match + last_match = matches[-1] + explanation = last_match + else: + explanation = 'No Explanation' + + if len(extracted_answer) == 0: + extracted_answer = 'Not Found' + else: + extracted_answer = extracted_answer[-1].strip().strip('"') + if extracted_answer == 'str(short_and_direct\ + _answer_of_the_question)': + extracted_answer = 'Not Found' + if extracted_answer == 'str(value which is\ + the answer to the question)': + extracted_answer = 'Not Found' + if extracted_answer == 'X.XX': + extracted_answer = 'Not Found' + + if calid in [13, 68]: + # Output Type: date + match = re.search( + r'^(0?[1-9]|1[0-2])\/(0?[1-9]' + r'|[12][0-9]|3[01])\/(\d{4})', extracted_answer) + if match: + month = int(match.group(1)) + day = int(match.group(2)) + year = match.group(3) + answer = f'{month:02}/{day:02}/{year}' + else: + answer = 'N/A' + + elif calid in [69]: + # Output Type: integer (A, B) + match = re.search( + r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?," + r"\?\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", extracted_answer) + extracted_answer = extracted_answer.replace('[', '(').replace( + ']', ')').replace("'", '').replace('"', '') + match = re.search( + r"\(?[\"\']?(\d+)\s*(weeks?)?[\"\']?," + r"?\s*[\"\']?(\d+)\s*(days?)?[\"\']?\s*\)?", extracted_answer) + if match: + weeks = match.group(1) + days = match.group(3) + answer = f'({weeks}, {days})' + else: + answer = 'N/A' + elif calid in [ + 4, 15, 16, 17, 18, 20, 21, 25, 27, 28, 29, 32, 33, 36, 43, 45, 48, + 51, 69 + ]: + # Output Type: integer A + match = re.search(r'(\d+) out of', extracted_answer) + if match: # cases like "3 out of 5" + answer = match.group(1) + else: + match = re.search(r'-?\d+(, ?-?\d+)+', extracted_answer) + if match: # cases like "3, 4, 5" + answer = str(len(match.group(0).split(','))) + else: + # match = re.findall(r"(? 
0: # find the last integer + answer = match[-1][0] + # answer = match[-1].lstrip("0") + else: + answer = 'N/A' + elif calid in [ + 2, 3, 5, 6, 7, 8, 9, 10, 11, 19, 22, 23, 24, 26, 30, 31, 38, 39, + 40, 44, 46, 49, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67 + ]: + # Output Type: decimal + match = re.search(r'str\((.*)\)', extracted_answer) + if match: + expression = match.group(1).replace('^', '**').replace( + 'is odd', '% 2 == 1').replace('is even', '% 2 == 0').replace( + 'sqrt', 'math.sqrt').replace('.math', '').replace( + 'weight', + '').replace('height', '').replace('mg/dl', '').replace( + 'g/dl', '').replace('mmol/L', '').replace( + 'kg', '').replace('g', + '').replace('mEq/L', '') + expression = expression.split('#')[0] + if expression.count('(') > expression.count(')'): # add missing ') + expression += ')' * (expression.count('(') - + expression.count(')')) + elif expression.count(')') > expression.count( + '('): # add missing ( + expression = '(' * (expression.count(')') - + expression.count('(')) + expression + try: + answer = eval(expression, {'__builtins__': None}, { + 'min': min, + 'pow': pow, + 'round': round, + 'abs': abs, + 'int': int, + 'float': float, + 'math': math, + 'np': np, + 'numpy': np + }) + except Exception: + print(f'Error in evaluating expression: {expression}') + answer = 'N/A' + else: + match = re.search(r'(-?\d+(\.\d+)?)\s*mL/min/1.73', + extracted_answer) + if match: # cases like "8.1 mL/min/1.73 m\u00b2" + answer = eval(match.group(1)) + else: + match = re.findall(r'(-?\d+(\.\d+)?)\%', extracted_answer) + if len(match) > 0: # cases like "53.1%" + answer = eval(match[-1][0]) / 100 + else: + match = re.findall(r'(-?\d+(\.\d+)?)', extracted_answer) + if len( + match + ) > 0: # cases like "8.1 mL/min/1.73 m\u00b2" or "11.1" + answer = eval(match[-1][0]) + else: + answer = 'N/A' + if answer != 'N/A': + answer = str(answer) + + return answer, explanation + + +def _parse(item, prompt_mode): + item['row_number'] = item['Row Number'] + item['calculator_id'] = item['Calculator ID'] + item['calculator_name'] = item['Calculator Name'] + item['category'] = item['Category'] + item['output_type'] = item['Output Type'] + item['note_id'] = item['Note ID'] + item['note_type'] = item['Note Type'] + item['patient_note'] = item['Patient Note'] + item['question'] = item['Question'] + item['relevant_entities'] = item['Relevant Entities'] + item['ground_truth_answer'] = item['Ground Truth Answer'] + item['lower_limit'] = item['Lower Limit'] + item['upper_limit'] = item['Upper Limit'] + item['ground_truth_explanation'] = item['Ground Truth Explanation'] + return item + + +@LOAD_DATASET.register_module() +class MedCalc_BenchDataset(BaseDataset): + + @staticmethod + def load(path: str, prompt_mode: str, **kwargs): + data_files = { + 'test': 'data/test-00000-of-00001.parquet', + 'train': 'data/train-00000-of-00001.parquet' + } + dataset = load_dataset(path, data_files=data_files, split='test') + # dataset = dataset.select(range(2)) + if prompt_mode == 'zero-shot': + dataset = dataset.map(lambda item: _parse(item, prompt_mode), + load_from_cache_file=False) + elif prompt_mode == 'few-shot': + pass # TODO: Implement few-shot prompt + return dataset + + +class MedCalcOfficial_Evaluator(BaseEvaluator): + + def score(self, predictions, references, test_set): + + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + + correct = 0 + count = 0 + details = [] + for idx, (i, j) in enumerate(zip(predictions, references)): + calculator_id = 
test_set['calculator_id'][idx] + lower_limit = test_set['lower_limit'][idx] + upper_limit = test_set['upper_limit'][idx] + row_number = test_set['row_number'][idx] + note_id = test_set['note_id'][idx] + category = test_set['category'][idx] + question = test_set['question'][idx] + calculator_name = test_set['calculator_name'][idx] + patient_note = test_set['patient_note'][idx] + ground_truth_explanation = test_set['ground_truth_explanation'][ + idx] + ground_truth_answer = test_set['ground_truth_answer'][idx] + try: + answer_value, explanation = extract_answer( + i, int(calculator_id)) + + print(answer_value) + print(explanation) + + correctness = check_correctness(answer_value, + ground_truth_answer, + calculator_id, upper_limit, + lower_limit) + + status = 'Correct' if correctness else 'Incorrect' + + outputs = { + 'Row Number': int(row_number), + 'Calculator Name': calculator_name, + 'Calculator ID': calculator_id, + 'Category': category, + 'Note ID': note_id, + 'Patient Note': patient_note, + 'Question': question, + 'LLM Answer': answer_value, + 'LLM Explanation': explanation, + 'Ground Truth Answer': ground_truth_answer, + 'Ground Truth Explanation': ground_truth_explanation, + 'Result': status + } + + except Exception as e: + outputs = { + 'Row Number': int(row_number), + 'Calculator Name': calculator_name, + 'Calculator ID': calculator_id, + 'Category': category, + 'Note ID': note_id, + 'Patient Note': patient_note, + 'Question': question, + 'LLM Answer': str(e), + 'LLM Explanation': str(e), + 'Ground Truth Answer': ground_truth_answer, + 'Ground Truth Explanation': ground_truth_explanation, + 'Result': 'Incorrect' + } + status = 'Incorrect' + count += 1 + if status == 'Correct': + correct += 1 + details.append(outputs) + + result = {'accuracy': 100 * correct / count, 'details': details} + return result diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 5a98a942..babdcef2 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -97,6 +97,8 @@ from .math_intern import * # noqa: F401, F403 from .mathbench import * # noqa: F401, F403 from .mbpp import * # noqa: F401, F403 from .medbench import * # noqa: F401, F403 +from .MedCalc_Bench import MedCalc_BenchDataset # noqa: F401 +from .MedCalc_Bench import MedCalcOfficial_Evaluator # noqa: F401 from .MedQA import * # noqa: F401, F403 from .MedXpertQA import * # noqa: F401, F403 from .mgsm import * # noqa: F401, F403 From 8aa18df36859c5dc66e0e4292171910681cb6728 Mon Sep 17 00:00:00 2001 From: Kun Yuan <31314392+Flaick@users.noreply.github.com> Date: Mon, 12 May 2025 04:14:11 +0200 Subject: [PATCH 16/28] [Dataset] HLE Biomedical version support (#2080) * HLE Biomedical version support * set up default category value for hle --- .../HLE/hle_biomed_llm_verify_gen_6ff468.py | 88 +++++++++++++++++++ opencompass/datasets/hle.py | 11 ++- 2 files changed, 95 insertions(+), 4 deletions(-) create mode 100644 opencompass/configs/datasets/HLE/hle_biomed_llm_verify_gen_6ff468.py diff --git a/opencompass/configs/datasets/HLE/hle_biomed_llm_verify_gen_6ff468.py b/opencompass/configs/datasets/HLE/hle_biomed_llm_verify_gen_6ff468.py new file mode 100644 index 00000000..5e74c8d0 --- /dev/null +++ b/opencompass/configs/datasets/HLE/hle_biomed_llm_verify_gen_6ff468.py @@ -0,0 +1,88 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from 
opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import HLEDataset + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='answer') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ : \n{problem}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=HLEDataset, + path='cais/hle', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + + +hle_datasets = [ + dict( + type=HLEDataset, + abbr='hle_llmjudge', + path='cais/hle', + category='Biology/Medicine', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] \ No newline at end of file diff --git a/opencompass/datasets/hle.py b/opencompass/datasets/hle.py index 2d7cf74b..b14507cc 100644 --- a/opencompass/datasets/hle.py +++ b/opencompass/datasets/hle.py @@ -9,9 +9,12 @@ from .base import BaseDataset class HLEDataset(BaseDataset): @staticmethod - def load(path: str): + def load(path: str, category: str | None = None): dataset = load_dataset(path) - dataset['test'] = dataset['test'].filter(lambda x: x['image'] == '') - dataset['test'] = dataset['test'].rename_column('question', 'problem') - dataset['train'] = dataset['test'] + ds = dataset['test'].filter(lambda x: x['image'] == '') + if category: + ds = ds.filter(lambda x: x['category'] == category) + ds = ds.rename_column('question', 'problem') + dataset['train'] = ds + dataset['test'] = ds return dataset From 345674f700a0aa22bba16e6fde2dcdb5869e3d33 Mon Sep 17 00:00:00 2001 From: huihui1999 <107675879+bio-mlhui@users.noreply.github.com> Date: Mon, 12 May 2025 17:23:44 +0800 Subject: [PATCH 17/28] [Dataset] Add SciknowEval Dataset (#2070) * first * first * first * first * SciKnowEval * fix hash * fix dataset-index & use official llm_judge_postprocess * fix dataset-index.yml * use official llmjudge_postprocess * fix lint * fix lint * fix lint * fix lint * fix lint * merge with main --------- Co-authored-by: Linchen Xiao --- dataset-index.yml | 6 + .../SciKnowEval/SciKnowEval_gen_ebe47d.py | 92 +++++++ .../SciKnowEval_llmjudge_gen_ebe47d.py | 232 ++++++++++++++++++ opencompass/datasets/SciKnowEval.py | 107 ++++++++ opencompass/datasets/__init__.py | 1 + 5 files changed, 438 insertions(+) create mode 100644 opencompass/configs/datasets/SciKnowEval/SciKnowEval_gen_ebe47d.py create mode 100644 opencompass/configs/datasets/SciKnowEval/SciKnowEval_llmjudge_gen_ebe47d.py create mode 100644 opencompass/datasets/SciKnowEval.py diff --git a/dataset-index.yml b/dataset-index.yml index a2179b92..57bd924e 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -1065,6 +1065,12 @@ paper: https://arxiv.org/pdf/2402.09391 configpath: opencompass/configs/datasets/SmolInstruct/smolinstruct_gen.py configpath_llmjudge: '' +- SciKnowEval: + name: SciKnowEval + category: Science + paper: https://arxiv.org/abs/2406.09098 + configpath: opencompass/configs/datasets/SciKnowEval/SciKnowEval_gen_ebe47d.py + configpath_llmjudge: opencompass/configs/datasets/SciKnowEval/SciKnowEval_llmjudge_gen_ebe47d.py - internsandbox: name: InternSandbox category: Reasoning/Code/Agent diff --git a/opencompass/configs/datasets/SciKnowEval/SciKnowEval_gen_ebe47d.py 
b/opencompass/configs/datasets/SciKnowEval/SciKnowEval_gen_ebe47d.py new file mode 100644 index 00000000..7231d74b --- /dev/null +++ b/opencompass/configs/datasets/SciKnowEval/SciKnowEval_gen_ebe47d.py @@ -0,0 +1,92 @@ +from opencompass.datasets import SciKnowEvalDataset, SciKnowEvalEvaluator +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +ZERO_SHOT_PROMPT = '{q4}' + +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'prompt', + 'question', + 'choices', + 'label', + 'answerKey', + 'type', + 'domain', + 'details', + 'answer', + 'q4' + ], + output_column='answerKey', +) + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict(type=SciKnowEvalEvaluator), + pred_role='BOT', +) +sciknoweval_dataset_biology = dict( + type=SciKnowEvalDataset, + abbr='sciknoweval_biology', + path='hicai-zju/SciKnowEval', + prompt_mode='zero-shot', + subset='biology', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, +) + +sciknoweval_dataset_chemistry = dict( + type=SciKnowEvalDataset, + abbr='sciknoweval_chemistry', + path='hicai-zju/SciKnowEval', + subset='chemistry', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, +) + +sciknoweval_dataset_material = dict( + type=SciKnowEvalDataset, + abbr='sciknoweval_material', + path='hicai-zju/SciKnowEval', + subset='material', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, +) + +sciknoweval_dataset_physics = dict( + type=SciKnowEvalDataset, + abbr='sciknoweval_physics', + path='hicai-zju/SciKnowEval', + prompt_mode='zero-shot', + subset='physics', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, +) + + +sciknoweval_datasets = [sciknoweval_dataset_biology, sciknoweval_dataset_chemistry, sciknoweval_dataset_physics, sciknoweval_dataset_material] diff --git a/opencompass/configs/datasets/SciKnowEval/SciKnowEval_llmjudge_gen_ebe47d.py b/opencompass/configs/datasets/SciKnowEval/SciKnowEval_llmjudge_gen_ebe47d.py new file mode 100644 index 00000000..0a432e26 --- /dev/null +++ b/opencompass/configs/datasets/SciKnowEval/SciKnowEval_llmjudge_gen_ebe47d.py @@ -0,0 +1,232 @@ +from opencompass.datasets import SciKnowEvalDataset +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.evaluator import GenericLLMEvaluator + +ZERO_SHOT_PROMPT = '{q4}' + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. 
Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : Q: {q4}\n\n\n + : \n{answerKey}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + + +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'prompt', + 'question', + 'choices', + 'label', + 'answerKey', + 'type', + 'domain', + 'details', + 'answer', + 'q4' + ], + output_column='answerKey', +) + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + + role='HUMAN', + prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg_biology = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=SciKnowEvalDataset, + path='hicai-zju/SciKnowEval', + prompt_mode='zero-shot', + subset='biology', + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), +) + +eval_cfg_chemistry = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=SciKnowEvalDataset, + path='hicai-zju/SciKnowEval', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + subset='chemistry', + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), +) + +eval_cfg_material = dict( + evaluator=dict( + type=GenericLLMEvaluator, + 
prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=SciKnowEvalDataset, + path='hicai-zju/SciKnowEval', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + subset='material', + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), +) + +eval_cfg_physics = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=SciKnowEvalDataset, + path='hicai-zju/SciKnowEval', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + subset='physics', + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), +) + +sciknoweval_dataset_biology = dict( + type=SciKnowEvalDataset, + abbr='sciknoweval_biology_llmjudge', + path='hicai-zju/SciKnowEval', + prompt_mode='zero-shot', + subset='biology', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg_biology, +) + +sciknoweval_dataset_chemistry = dict( + type=SciKnowEvalDataset, + abbr='sciknoweval_chemistry_llmjudge', + path='hicai-zju/SciKnowEval', + subset='chemistry', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg_chemistry, +) +sciknoweval_dataset_material = dict( + type=SciKnowEvalDataset, + abbr='sciknoweval_material_llmjudge', + path='hicai-zju/SciKnowEval', + subset='material', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg_material, +) + + +sciknoweval_dataset_physics = dict( + type=SciKnowEvalDataset, + abbr='sciknoweval_physics_llmjudge', + path='hicai-zju/SciKnowEval', + prompt_mode='zero-shot', + subset='physics', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg_physics, +) +sciknoweval_datasets = [sciknoweval_dataset_biology, sciknoweval_dataset_chemistry, sciknoweval_dataset_physics, sciknoweval_dataset_material] \ No newline at end of file diff --git a/opencompass/datasets/SciKnowEval.py b/opencompass/datasets/SciKnowEval.py new file mode 100644 index 00000000..d9635d96 --- /dev/null +++ b/opencompass/datasets/SciKnowEval.py @@ -0,0 +1,107 @@ +import re + +from datasets import load_dataset + +from opencompass.openicl import BaseEvaluator +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS + +from .base import BaseDataset + + +def _parse(item, prompt_mode, discipline): + choices = item['choices'] + + item['q4'] = f'You are an expert in {discipline}.\n' + item['q4'] += item['prompt']['default'] + '\n' + item['question'] + '\n' + label_texts = [] + for label_meta, text_meta in zip(choices['label'], choices['text']): + label_texts.append(f'{label_meta}. 
{text_meta}') + item['q4'] += '\n'.join(label_texts) # noqa: E501, E741, E741 + item['prompt_mode'] = prompt_mode + return item + + +@LOAD_DATASET.register_module() +class SciKnowEvalDataset(BaseDataset): + + @staticmethod + def load(path: str, prompt_mode: str, **kwargs): + + def capitalize_first_letter(s): + if not s: # 检查字符串是否为空 + return s + return s[0].upper() + s[1:] + + subset = kwargs['subset'] + data_files = {} + test_file = f'data/{capitalize_first_letter(subset)}/' + test_file += f'sciknoweval_{subset}_test.jsonl' + data_files['test'] = test_file + dataset = load_dataset(path, data_files=data_files, split='test') + # dataset = dataset.select(range(20)) + if prompt_mode == 'zero-shot': + dataset = dataset.map( + lambda item: _parse(item, prompt_mode, subset), + load_from_cache_file=False) + elif prompt_mode == 'few-shot': + pass # TODO: Implement few-shot prompt + return dataset + + +class SciKnowEvalEvaluator(BaseEvaluator): + + def score(self, predictions, references, test_set): + method = test_set['prompt_mode'][0] + + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + correct = 0 + count = 0 + details = [] + for idx, (i, j) in enumerate(zip(predictions, references)): + i = answer_cleansing(method, i, test_set['choices'][idx]['label'], + test_set['answerKey'][idx]) + detail = {'pred': i, 'answer': j, 'correct': False} + count += 1 + if i == j: + correct += 1 + detail['correct'] = True + details.append(detail) + result = {'accuracy': 100 * correct / count, 'details': details} + return result + + +@TEXT_POSTPROCESSORS.register_module() +def answer_cleansing( + method: str, + prediction: str, + options: list, + label: str, +) -> str: + options_str = r'\b(' + '|'.join(options) + r')\b' + prediction = re.findall(options_str, prediction) + + if len(prediction) == 0: + prediction = [] + else: + # If there is a "label" and its length is 1, + # process prediction accordingly + if len(label) == 1: + if method == 'few-shot': + answer_flag = True if len(prediction) > 1 else False + # choose the first or last element based on the answer_flag + if answer_flag: + prediction = [prediction[0]] + else: + prediction = [prediction[-1]] + elif method == 'zero-shot': + # choose the first element in list + prediction = [prediction[0]] + else: + raise ValueError('Method is not properly defined ...') + + # Remove trailing period if it exists + if prediction[0] and prediction[0].endswith('.'): + prediction[0] = prediction[0][:-1] + + return prediction[0] diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index babdcef2..c441a2d8 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -134,6 +134,7 @@ from .ruler import * # noqa: F401, F403 from .safety import * # noqa: F401, F403 from .scibench import ScibenchDataset, scibench_postprocess # noqa: F401, F403 from .scicode import * # noqa: F401, F403 +from .SciKnowEval import * # noqa: F401, F403 from .simpleqa import * # noqa: F401, F403 from .siqa import * # noqa: F401, F403 from .smolinstruct import * # noqa: F401, F403 From 2c79dc522723fe294671a3ff52e138879b9571b6 Mon Sep 17 00:00:00 2001 From: Dongsheng Zhu <59612926+Zhudongsheng75@users.noreply.github.com> Date: Mon, 12 May 2025 18:38:13 +0800 Subject: [PATCH 18/28] [Dataset] Add human_eval/mbpp pro (#2092) * add bench * update * bug fix * time update * add index * fix repeat bug --- dataset-index.yml | 12 ++ .../configs/datasets/humaneval_pro/README.md | 17 +++ 
.../humaneval_pro/humaneval_pro_gen.py | 4 + .../humaneval_pro/humaneval_pro_gen_3dc067.py | 46 ++++++ .../humaneval_pro_repeat_gen_3dc067.py | 48 ++++++ .../configs/datasets/mbpp_pro/README.md | 17 +++ .../configs/datasets/mbpp_pro/mbpp_pro_gen.py | 4 + .../datasets/mbpp_pro/mbpp_pro_gen_3dc067.py | 46 ++++++ .../mbpp_pro/mbpp_pro_repeat_gen_3dc067.py | 48 ++++++ .../datasets/multipl_e/multiple_gen.py | 4 + ..._gen.py => multiple_top_ten_gen_f44aaf.py} | 2 - .../multiple_top_ten_repeat_gen_0cd6ce.py | 58 +++++++ opencompass/datasets/__init__.py | 2 + opencompass/datasets/humaneval_pro.py | 81 ++++++++++ opencompass/datasets/mbpp_pro.py | 81 ++++++++++ opencompass/datasets/multipl_e.py | 65 ++++++-- .../openicl/icl_evaluator/code_evaluator.py | 142 +++++++----------- opencompass/utils/datasets_info.py | 22 ++- 18 files changed, 593 insertions(+), 106 deletions(-) create mode 100644 opencompass/configs/datasets/humaneval_pro/README.md create mode 100644 opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py create mode 100644 opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen_3dc067.py create mode 100644 opencompass/configs/datasets/humaneval_pro/humaneval_pro_repeat_gen_3dc067.py create mode 100644 opencompass/configs/datasets/mbpp_pro/README.md create mode 100644 opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py create mode 100644 opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen_3dc067.py create mode 100644 opencompass/configs/datasets/mbpp_pro/mbpp_pro_repeat_gen_3dc067.py create mode 100644 opencompass/configs/datasets/multipl_e/multiple_gen.py rename opencompass/configs/datasets/multipl_e/{multiple_top_ten_gen.py => multiple_top_ten_gen_f44aaf.py} (97%) create mode 100644 opencompass/configs/datasets/multipl_e/multiple_top_ten_repeat_gen_0cd6ce.py create mode 100644 opencompass/datasets/humaneval_pro.py create mode 100644 opencompass/datasets/mbpp_pro.py diff --git a/dataset-index.yml b/dataset-index.yml index 57bd924e..5ebad535 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -611,6 +611,12 @@ paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790 configpath: opencompass/configs/datasets/humanevalx/humanevalx_gen.py configpath_llmjudge: '' +- humaneval_pro: + name: HumanEval Pro + category: Code + paper: https://arxiv.org/abs/2412.21199 + configpath: opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py + configpath_llmjudge: '' - hungarian_math: name: Hungarian_Math category: Math @@ -695,6 +701,12 @@ paper: '' configpath: opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py configpath_llmjudge: '' +- mbpp_pro: + name: MBPP Pro + category: Code + paper: https://arxiv.org/abs/2412.21199 + configpath: opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py + configpath_llmjudge: '' - mgsm: name: MGSM category: Language / Math diff --git a/opencompass/configs/datasets/humaneval_pro/README.md b/opencompass/configs/datasets/humaneval_pro/README.md new file mode 100644 index 00000000..853b59f2 --- /dev/null +++ b/opencompass/configs/datasets/humaneval_pro/README.md @@ -0,0 +1,17 @@ +# HumanEval pro + +## OC results + +| model | pass@1 | +|:--------------------------:|---------:| +|qwen2.5-coder-7b-instruct-hf| 65 | +| qwen2.5-14b-instruct-hf | 67 | +| deepseek-v2-lite-chat-hf | 35 | + +## CodeEval-pro results + +| model | pass@1 | +|:--------------------------:|---------:| +|qwen2.5-coder-7b-instruct-hf| 65 | +| qwen2.5-14b-instruct-hf | 65 | +| deepseek-v2-lite-chat-hf | 28 | \ No newline at end of file diff --git 
a/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py new file mode 100644 index 00000000..9bccdd66 --- /dev/null +++ b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .humaneval_pro_gen_3dc067 import humanevalpro_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen_3dc067.py b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen_3dc067.py new file mode 100644 index 00000000..e3ed8349 --- /dev/null +++ b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_gen_3dc067.py @@ -0,0 +1,46 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalevalProDataset, HumanevalProEvaluator, humaneval_postprocess_v2 + + +PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions. +Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution. +```python +{raw_problem} +{new_problem} +``` +Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content: +```python +``` +""" + + +humanevalpro_reader_cfg = dict( + input_columns=['raw_problem', 'new_problem'], output_column='test_code') + +humanevalpro_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=PROMPT_WRAPPER), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +humanevalpro_eval_cfg = dict( + evaluator=dict(type=HumanevalProEvaluator, + ip_address='https://opencompass-multiple-evaluator.hf.space') +) + +humanevalpro_datasets = [ + dict( + abbr='humaneval_pro', + type=HumanevalevalProDataset, + path='opencompass/humaneval_pro', + reader_cfg=humanevalpro_reader_cfg, + infer_cfg=humanevalpro_infer_cfg, + eval_cfg=humanevalpro_eval_cfg,) +] \ No newline at end of file diff --git a/opencompass/configs/datasets/humaneval_pro/humaneval_pro_repeat_gen_3dc067.py b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_repeat_gen_3dc067.py new file mode 100644 index 00000000..98320f78 --- /dev/null +++ b/opencompass/configs/datasets/humaneval_pro/humaneval_pro_repeat_gen_3dc067.py @@ -0,0 +1,48 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalevalProDataset, HumanevalProEvaluator, humaneval_postprocess_v2 + + +PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions. +Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution. 
+```python +{raw_problem} +{new_problem} +``` +Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content: +```python +``` +""" + + +humanevalpro_reader_cfg = dict( + input_columns=['raw_problem', 'new_problem'], output_column='test_code') + +humanevalpro_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=PROMPT_WRAPPER), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +humanevalpro_eval_cfg = dict( + evaluator=dict(type=HumanevalProEvaluator, + ip_address='https://opencompass-multiple-evaluator.hf.space') +) + +humanevalpro_datasets = [ + dict( + abbr='humaneval_pro', + type=HumanevalevalProDataset, + path='opencompass/humaneval_pro', + reader_cfg=humanevalpro_reader_cfg, + infer_cfg=humanevalpro_infer_cfg, + eval_cfg=humanevalpro_eval_cfg, + n=5, + k=3) +] \ No newline at end of file diff --git a/opencompass/configs/datasets/mbpp_pro/README.md b/opencompass/configs/datasets/mbpp_pro/README.md new file mode 100644 index 00000000..d34980e1 --- /dev/null +++ b/opencompass/configs/datasets/mbpp_pro/README.md @@ -0,0 +1,17 @@ +# MBPP pro + +## OC results + +| model | pass@1 | +|:--------------------------:|---------:| +|qwen2.5-coder-7b-instruct-hf| 66 | +| qwen2.5-14b-instruct-hf | 64 | +| deepseek-v2-lite-chat-hf | 36 | + +## CodeEval-pro results + +| model | pass@1 | +|:--------------------------:|---------:| +|qwen2.5-coder-7b-instruct-hf| 65 | +| qwen2.5-14b-instruct-hf | 65 | +| deepseek-v2-lite-chat-hf | 39 | \ No newline at end of file diff --git a/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py new file mode 100644 index 00000000..84d45d83 --- /dev/null +++ b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mbpp_pro_gen_3dc067 import mbpppro_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen_3dc067.py b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen_3dc067.py new file mode 100644 index 00000000..c14b05cb --- /dev/null +++ b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_gen_3dc067.py @@ -0,0 +1,46 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPProDataset, MBPPProEvaluator + + +PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions. +Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution. 
+```python +{raw_problem} +{new_problem} +``` +Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content: +```python +``` +""" + + +mbpppro_reader_cfg = dict( + input_columns=['raw_problem', 'new_problem'], output_column='test_code') + +mbpppro_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=PROMPT_WRAPPER), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +mbpppro_eval_cfg = dict( + evaluator=dict(type=MBPPProEvaluator, + ip_address='https://opencompass-multiple-evaluator.hf.space'), +) + +mbpppro_datasets = [ + dict( + abbr='mbpp_pro', + type=MBPPProDataset, + path='opencompass/mbpp_pro', + reader_cfg=mbpppro_reader_cfg, + infer_cfg=mbpppro_infer_cfg, + eval_cfg=mbpppro_eval_cfg) +] \ No newline at end of file diff --git a/opencompass/configs/datasets/mbpp_pro/mbpp_pro_repeat_gen_3dc067.py b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_repeat_gen_3dc067.py new file mode 100644 index 00000000..631713b8 --- /dev/null +++ b/opencompass/configs/datasets/mbpp_pro/mbpp_pro_repeat_gen_3dc067.py @@ -0,0 +1,48 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPProDataset, MBPPProEvaluator + + +PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions. +Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution. 
+```python +{raw_problem} +{new_problem} +``` +Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content: +```python +``` +""" + + +mbpppro_reader_cfg = dict( + input_columns=['raw_problem', 'new_problem'], output_column='test_code') + +mbpppro_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=PROMPT_WRAPPER), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +mbpppro_eval_cfg = dict( + evaluator=dict(type=MBPPProEvaluator, + ip_address='https://opencompass-multiple-evaluator.hf.space'), +) + +mbpppro_datasets = [ + dict( + abbr='mbpp_pro', + type=MBPPProDataset, + path='opencompass/mbpp_pro', + reader_cfg=mbpppro_reader_cfg, + infer_cfg=mbpppro_infer_cfg, + eval_cfg=mbpppro_eval_cfg, + n=5, + k=3) +] \ No newline at end of file diff --git a/opencompass/configs/datasets/multipl_e/multiple_gen.py b/opencompass/configs/datasets/multipl_e/multiple_gen.py new file mode 100644 index 00000000..b32af567 --- /dev/null +++ b/opencompass/configs/datasets/multipl_e/multiple_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .multiple_top_ten_gen_f44aaf import multiple_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen.py b/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen_f44aaf.py similarity index 97% rename from opencompass/configs/datasets/multipl_e/multiple_top_ten_gen.py rename to opencompass/configs/datasets/multipl_e/multiple_top_ten_gen_f44aaf.py index 93ab2962..040c5ba5 100644 --- a/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen.py +++ b/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen_f44aaf.py @@ -32,7 +32,6 @@ multiple_datasets = [ type=MultiplEDataset, abbr=f'humaneval-multiple-{lang}', language=lang, - num_repeats=1, path='opencompass/multipl_e', tag='humaneval', reader_cfg=multiple_reader_cfg, @@ -46,7 +45,6 @@ multiple_datasets += [ type=MultiplEDataset, abbr=f'mbpp-multiple-{lang}', language=lang, - num_repeats=1, path='opencompass/multipl_e', tag='mbpp', reader_cfg=multiple_reader_cfg, diff --git a/opencompass/configs/datasets/multipl_e/multiple_top_ten_repeat_gen_0cd6ce.py b/opencompass/configs/datasets/multipl_e/multiple_top_ten_repeat_gen_0cd6ce.py new file mode 100644 index 00000000..1a603d32 --- /dev/null +++ b/opencompass/configs/datasets/multipl_e/multiple_top_ten_repeat_gen_0cd6ce.py @@ -0,0 +1,58 @@ +# Select the 10 most popular programming languages from MultiPL-E to compose the test set. + +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MultiplEDataset, MultiplEEvaluator + + +_TOP_TEN_LANGUAGE_ = ['cpp'] + +multiple_reader_cfg = dict(input_columns=['language', 'prompt'], output_column='tests') + +multiple_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template='Based on the provided {language} code snippet, complete the subsequent content. 
The initial part of the completed code must match the provided code snippet exactly:\n{prompt}'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +multiple_eval_cfg = { + lang: dict( + evaluator=dict( + type=MultiplEEvaluator, + language=lang, + ip_address='https://opencompass-multiple-evaluator.hf.space', + ), + pred_role='BOT', + ) for lang in _TOP_TEN_LANGUAGE_ +} + +multiple_datasets = [ + dict( + type=MultiplEDataset, + abbr=f'humaneval-multiple-{lang}', + language=lang, + path='opencompass/multipl_e', + tag='humaneval', + reader_cfg=multiple_reader_cfg, + infer_cfg=multiple_infer_cfg, + eval_cfg=multiple_eval_cfg[lang], + n=5, + k=3 + ) for lang in _TOP_TEN_LANGUAGE_ +] + +multiple_datasets += [ + dict( + type=MultiplEDataset, + abbr=f'mbpp-multiple-{lang}', + language=lang, + path='opencompass/multipl_e', + tag='mbpp', + reader_cfg=multiple_reader_cfg, + infer_cfg=multiple_infer_cfg, + eval_cfg=multiple_eval_cfg[lang], + n=5, + k=3 + ) for lang in _TOP_TEN_LANGUAGE_ +] diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index c441a2d8..92cda579 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -64,6 +64,7 @@ from .hle import * # noqa: F401, F403 from .huggingface import * # noqa: F401, F403 from .humaneval import * # noqa: F401, F403 from .humaneval_multi import * # noqa: F401, F403 +from .humaneval_pro import * # noqa: F401, F403 from .humanevalx import * # noqa: F401, F403 from .hungarian_math import * # noqa: F401, F403 from .IFEval.ifeval import IFEvalDataset, IFEvaluator # noqa: F401, F403 @@ -96,6 +97,7 @@ from .math401 import * # noqa: F401, F403 from .math_intern import * # noqa: F401, F403 from .mathbench import * # noqa: F401, F403 from .mbpp import * # noqa: F401, F403 +from .mbpp_pro import * # noqa: F401, F403 from .medbench import * # noqa: F401, F403 from .MedCalc_Bench import MedCalc_BenchDataset # noqa: F401 from .MedCalc_Bench import MedCalcOfficial_Evaluator # noqa: F401 diff --git a/opencompass/datasets/humaneval_pro.py b/opencompass/datasets/humaneval_pro.py new file mode 100644 index 00000000..871b468f --- /dev/null +++ b/opencompass/datasets/humaneval_pro.py @@ -0,0 +1,81 @@ +# flake8: noqa: E501s + +import json +from typing import Dict, List + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator +from opencompass.utils import get_data_path + +from .base import BaseDataset + +PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions. +Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution. 
+```python +{raw_problem} +{new_problem} +``` +Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content: +```python +``` +""" + + +class HumanevalevalProDataset(BaseDataset): + + @staticmethod + def load(path, local_mode=False): + path = get_data_path(path, local_mode=local_mode) + dataset = [] + with open(path, encoding='utf-8') as f: + raw_data = json.load(f) + for data in raw_data: + dataset.append(data) + return Dataset.from_list(dataset) + + +class HumanevalProEvaluator(CodeEvaluator): + + def score(self, predictions: List, references: List, + test_set: Dataset) -> Dict: + if len(predictions) != len(references): + return { + 'error': + 'predictions and references have different ' + f'length. len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' + } + + test_set = test_set.to_pandas() + # Use the first column as the unique identifier + test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0]) + + # 1. Prepare data for all test cases + all_test_cases, prompts = [], [] + for i in range(len(test_set_origin)): + test_case = test_set_origin.iloc[i] + completion = predictions[i] + + # Process code completions + processed_completion = self._process_completions(completion) + code = processed_completion + '\n' + test_case['test_code'] + sub_data_dict = { + 'name': int(test_case['id']), + 'language': self.language, + 'code': code, + } + all_test_cases.append(sub_data_dict) + + prompt = PROMPT_WRAPPER.format( + raw_problem=test_case['raw_problem'], + new_problem=test_case['new_problem']) + prompts.append(prompt) + + # 2. Send all test cases to the evaluation service + success, outputs, error_message = self._evaluate(all_test_cases) + if not success: + return {'error': error_message} + + # 3. Process the returned results + return self._process_results(outputs, prompts, len(test_set_origin)) diff --git a/opencompass/datasets/mbpp_pro.py b/opencompass/datasets/mbpp_pro.py new file mode 100644 index 00000000..fe7d01a4 --- /dev/null +++ b/opencompass/datasets/mbpp_pro.py @@ -0,0 +1,81 @@ +# flake8: noqa: E501 + +import json +from typing import Dict, List + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator +from opencompass.utils import get_data_path + +from .base import BaseDataset + +PROMPT_WRAPPER = """You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions. +Write a solution of python file to the following problems, the solution of the second problem requires single or multiple calls to the first solution. +```python +{raw_problem} +{new_problem} +``` +Please put the two solutions within the Python code block provided below, and make sure that the block contains no other unrelated content: +```python +``` +""" + + +class MBPPProDataset(BaseDataset): + + @staticmethod + def load(path, local_mode=False): + path = get_data_path(path, local_mode=local_mode) + print(path) + dataset = [] + with open(path, encoding='utf-8') as f: + for line in f: + dataset.append(json.loads(line.strip())) + return Dataset.from_list(dataset) + + +class MBPPProEvaluator(CodeEvaluator): + + def score(self, predictions: List, references: List, + test_set: Dataset) -> Dict: + if len(predictions) != len(references): + return { + 'error': + 'predictions and references have different ' + f'length. 
len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' + } + + test_set = test_set.to_pandas() + # Use the first column as the unique identifier + test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0]) + + # 1. Prepare data for all test cases + all_test_cases, prompts = [], [] + for i in range(len(test_set_origin)): + test_case = test_set_origin.iloc[i] + completion = predictions[i] + + # Process code completions + processed_completion = self._process_completions(completion) + code = processed_completion + '\n' + test_case['test_code'] + sub_data_dict = { + 'name': int(test_case['id']), + 'language': self.language, + 'code': code, + } + all_test_cases.append(sub_data_dict) + + prompt = PROMPT_WRAPPER.format( + raw_problem=test_case['raw_problem'], + new_problem=test_case['new_problem']) + prompts.append(prompt) + + # 2. Send all test cases to the evaluation service + success, outputs, error_message = self._evaluate(all_test_cases) + if not success: + return {'error': error_message} + + # 3. Process the returned results + return self._process_results(outputs, prompts, len(test_set_origin)) diff --git a/opencompass/datasets/multipl_e.py b/opencompass/datasets/multipl_e.py index 657b52de..c8f80632 100644 --- a/opencompass/datasets/multipl_e.py +++ b/opencompass/datasets/multipl_e.py @@ -1,3 +1,4 @@ +import difflib import json import os.path as osp @@ -28,7 +29,6 @@ class MultiplEDataset(BaseDataset): @staticmethod def load(path: str, language: str, - num_repeats: int = 1, tag: str = 'humaneval', local_mode: bool = False): """Load dataset for pass k mode. @@ -56,8 +56,7 @@ class MultiplEDataset(BaseDataset): dataset = [] with open(file_path, 'r', encoding='utf-8') as f: for line in f: - dataset.extend( - [json.loads(line.strip()) for _ in range(num_repeats)]) + dataset.append(json.loads(line.strip())) return Dataset.from_list(dataset) @@ -84,20 +83,56 @@ class MultiplEEvaluator(CodeEvaluator): min_stop_index = stop_index return decoded_string[:min_stop_index] - def _process_completions(self, test_case, completions): + def _remove_prefix(self, + prompt: str, + completion: str, + threshold: float = 0.95) -> str: + """Determine the truncation point in the completion based on the last + line of the prompt, remove all content before that line in the + completion, and return the completion string after removing the prefix. + This is done to convert chatbot-style inference mode to completion + mode. + + Args: + prompt (str): The prompt text. + completion (str): The completion text. + threshold (float): Line similarity threshold. + + Returns: + str: The completion string after removing the prefix. + """ + prompt_lines = prompt.splitlines() + completion_lines = completion.splitlines() + + if not prompt_lines: + return completion + + last_prompt_line = prompt_lines[-1] + cut_index = -1 + + for i, completion_line in enumerate(completion_lines): + similarity = difflib.SequenceMatcher(None, last_prompt_line, + completion_line).ratio() + if similarity >= threshold: + cut_index = i + break + + if cut_index != -1: + return '\n'.join(completion_lines[cut_index + 1:]) + else: + return completion + + def _process_completions(self, test_case, completion): """Process completions with a test case. Args: - test_case: A test case. - completions: A list of completions. + test_case (dict): A test case containing prompt and stop tokens. + completion (str): The generated code completion. Returns: - A list of processed completions. + str: Processed code completion. 
""" - processed_completions = [] - for comp in completions: - comp = self._extract_code(comp) - post_comp = self._remove_prefix(test_case['prompt'], comp) - post_comp = self._stop_at_stop_token(post_comp, - test_case['stop_tokens']) - processed_completions.append(post_comp) - return processed_completions + post_comp = self._extract_code(completion) + post_comp = self._remove_prefix(test_case['prompt'], post_comp) + post_comp = self._stop_at_stop_token(post_comp, + test_case['stop_tokens']) + return post_comp diff --git a/opencompass/openicl/icl_evaluator/code_evaluator.py b/opencompass/openicl/icl_evaluator/code_evaluator.py index d586cd6e..a2804207 100644 --- a/opencompass/openicl/icl_evaluator/code_evaluator.py +++ b/opencompass/openicl/icl_evaluator/code_evaluator.py @@ -1,12 +1,12 @@ # flake8: noqa: E501 -import difflib import os import re import tempfile import time from typing import Any, Dict, List, Optional, Tuple, Union +import numpy as np from datasets import Dataset from gradio_client import Client @@ -24,9 +24,9 @@ class CodeEvaluator(BaseEvaluator): """ def __init__(self, - language: str, + language: str = 'py', ip_address: str = 'localhost', - retry: int = 3) -> None: + retry: int = 5) -> None: """Initialize the CodeEvaluator. Args: @@ -71,6 +71,7 @@ class CodeEvaluator(BaseEvaluator): - output (dict/list/str): Evaluation results or error message """ try: + import requests temp_file_path = None # Handle file path input if isinstance(input_data, str): @@ -83,7 +84,15 @@ class CodeEvaluator(BaseEvaluator): input_data = temp_file_path # Send to evaluation service - result = self.client.predict(input_data, api_name='/evaluate') + try: + result = self.client.predict(input_data, api_name='/evaluate') + except Exception as e: + # Catch timeout and other exceptions + if 'timed out' in str(e).lower() or 'timeout' in str( + e).lower(): + return False, f'Request to code eval service timed out: {e}' + else: + raise # Process the result if isinstance(result, (dict, list)): @@ -107,63 +116,16 @@ class CodeEvaluator(BaseEvaluator): except: # noqa: E722 pass - def _remove_prefix(self, - prompt: str, - completion: str, - threshold: float = 0.95) -> str: - """Determine the truncation point in the completion based on the last - line of the prompt, remove all content before that line in the - completion, and return the completion string after removing the prefix. - This is done to convert chatbot-style inference mode to completion - mode. + def _process_completions(self, completion: str) -> list: + """Process code completions to extract the relevant code. Args: - prompt (str): The prompt text. - completion (str): The completion text. - threshold (float): Line similarity threshold. - + completion (str): Code completion string. Returns: - str: The completion string after removing the prefix. + list: List of processed code completions. 
""" - prompt_lines = prompt.splitlines() - completion_lines = completion.splitlines() - - if not prompt_lines: - return completion - - last_prompt_line = prompt_lines[-1] - cut_index = -1 - - for i, completion_line in enumerate(completion_lines): - similarity = difflib.SequenceMatcher(None, last_prompt_line, - completion_line).ratio() - if similarity >= threshold: - cut_index = i - break - - if cut_index != -1: - return '\n'.join(completion_lines[cut_index + 1:]) - else: - return completion - - def _process_completions(self, test_case: dict, completions: list) -> list: - """Process code completion list, which typically involves extracting - code, removing repetitive prefixes caused by chatbot mode, and other - steps to ensure the model-generated code can be compiled successfully. - - Args: - test_case (dict): Dictionary containing test case information including: - completions (list): List of code completions generated by the model. - - Returns: - list: Processed code completion list. - """ - processed_completions = [] - for comp in completions: - comp = self._extract_code(comp) - post_comp = self._remove_prefix(test_case['prompt'], comp) - processed_completions.append(post_comp) - return processed_completions + post_comp = self._extract_code(completion) + return post_comp def _evaluate( self, input_data: Union[Dict, List] @@ -186,7 +148,7 @@ class CodeEvaluator(BaseEvaluator): succeed, output = self._code_eval_service(input_data) if not succeed: num_retry += 1 - time.sleep(10) + time.sleep(30) else: break @@ -195,6 +157,31 @@ class CodeEvaluator(BaseEvaluator): return True, output, None + def _process_results(self, outputs: List, prompts: List, + total_count: int) -> Dict: + """Process the evaluation results. + Args: + outputs (list): List of evaluation results for each test case. + prompts (list): List of prompts used for each test case. + total_count (int): Total number of test cases. + Returns: + dict: Processed results including: + - pass@1: Percentage of test cases passed + - details: Detailed results for each test case + """ + details = [] + correct = 0 + for output, prompt in zip(outputs, prompts): + output['prompt'] = prompt + if output.get('status') == 'OK': + output['correct'] = True + correct += 1 + else: + output['correct'] = False + details.append(output) + + return {f'pass@1': 100 * correct / total_count, 'details': details} + def score(self, predictions: List, references: List, test_set: Dataset) -> Dict: """Score code generation predictions against references. @@ -221,28 +208,25 @@ class CodeEvaluator(BaseEvaluator): test_set = test_set.to_pandas() # Use the first column as the unique identifier test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0]) - num_repeats = int(len(test_set) / len(test_set_origin)) # 1. 
Prepare data for all test cases - all_test_cases = [] + all_test_cases, prompts = [], [] for i in range(len(test_set_origin)): test_case = test_set_origin.iloc[i] - completions = predictions[i * num_repeats:(i + 1) * num_repeats] + completion = predictions[i] # Process code completions - processed_completions = self._process_completions( - test_case, completions) - - result_dict = { + processed_completion = self._process_completions( + test_case, completion) + code = test_case[ + 'prompt'] + processed_completion + '\n' + test_case['tests'] + sub_data_dict = { 'name': test_case['name'], 'language': test_case['language'], - 'prompt': test_case['prompt'], - 'tests': test_case['tests'], - 'processed_completions': processed_completions, - 'completions': completions + 'code': code } - - all_test_cases.append(result_dict) + all_test_cases.append(sub_data_dict) + prompts.append(test_case['prompt']) # 2. Send all test cases to the evaluation service success, outputs, error_message = self._evaluate(all_test_cases) @@ -250,18 +234,4 @@ class CodeEvaluator(BaseEvaluator): return {'error': error_message} # 3. Process the returned results - details = [] - correct = 0 - for output in outputs: - if output.get('status') == 'OK': - output['correct'] = True - correct += 1 - else: - output['correct'] = False - - details.append(output) - - return { - f'pass@{num_repeats}': 100 * correct / len(test_set_origin), - 'details': details - } + return self._process_results(outputs, prompts, len(test_set_origin)) diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 10ca4436..ce12af64 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -451,7 +451,16 @@ DATASETS_MAPPING = { "hf_id": "", "local": "./data/nejmaibench/NEJM_All_Questions_And_Answers.csv", }, - + "opencompass/humaneval_pro": { + "ms_id": "", + "hf_id": "", + "local": "./data/humaneval_pro/humaneval_pro.json", + }, + "opencompass/mbpp_pro": { + "ms_id": "", + "hf_id": "", + "local": "./data/mbpp_pro/mbpp_pro.json", + }, } DATASETS_URL = { @@ -808,6 +817,13 @@ DATASETS_URL = { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nejmaibench.zip", "md5": "e6082cae3596b3ebea73e23ba445b99e" - } - + }, + "humaneval_pro": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval_pro.zip", + "md5": "4c6fe556e84e905e4f0902d699e46de5", + }, + "mbpp_pro": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp_pro.zip", + "md5": "eac330b8a0a8687f006265c9383503ce", + }, } From c492e49e799ee4b88c8864cae90816e4c6277b84 Mon Sep 17 00:00:00 2001 From: yuehua-s Date: Mon, 12 May 2025 18:39:44 +0800 Subject: [PATCH 19/28] [Update] Add o4 in OpenaiSDK (#2083) * feature:1.add o4-mini;2.o3 or o4-mini only support temperature==1 * feature:change 4o-mini to 4o --------- Co-authored-by: yuehuazhang --- opencompass/models/openai_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index 692edcf1..9a67c87d 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -25,7 +25,7 @@ OPENAI_API_BASE = os.path.join( OPENAISDK_API_BASE = os.environ.get('OPENAI_BASE_URL', 'https://api.openai.com/v1/') -O1_MODEL_LIST = ['o1', 'o3'] +O1_MODEL_LIST = ['o1', 'o3', 'o4'] @MODELS.register_module() From d590f557bb017f13d2457fd0db16c25b6d791dee Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 12 May 2025 19:38:30 +0800 Subject: [PATCH 
20/28] [Update] OpenaiSDK handle empty content (#2096) --- opencompass/models/openai_api.py | 37 ++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index 9a67c87d..f48869c5 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -69,6 +69,8 @@ class OpenAI(BaseAPIModel): Defaults to None. extra_body (Dict, optional): Add additional JSON properties to the request + think_tag (str, optional): The tag to use for reasoning content. + Defaults to ''. """ is_api: bool = True @@ -92,6 +94,7 @@ class OpenAI(BaseAPIModel): tokenizer_path: Optional[str] = None, extra_body: Optional[Dict] = None, verbose: bool = False, + think_tag: str = '', ): super().__init__( @@ -114,6 +117,7 @@ class OpenAI(BaseAPIModel): self.tokenizer_path = tokenizer_path self.hf_tokenizer = None self.extra_body = extra_body + self.think_tag = think_tag if isinstance(key, str): if key == 'ENV': @@ -319,7 +323,28 @@ class OpenAI(BaseAPIModel): if self.logprobs: return response['choices'] else: - return response['choices'][0]['message']['content'].strip() + # Extract content and reasoning_content from response + message = response['choices'][0]['message'] + content = message.get('content', '') or '' + reasoning_content = message.get('reasoning_content', + '') or '' + + # Handle reasoning_content similar to OpenAISDK + if reasoning_content: + if self.verbose: + self.logger.info( + 'Extracting reasoning content and tags.' + 'Reasoning Content: %s, \n' + 'Tags: %s, \n' + 'Content: %s', reasoning_content, + self.think_tag, content) + + if content: + return reasoning_content + self.think_tag + content + else: + return reasoning_content + else: + return content.strip() except KeyError: if 'error' in response: if response['error']['code'] == 'rate_limit_exceeded': @@ -658,7 +683,8 @@ class OpenAISDK(OpenAI): **query_data, timeout=timeout) # timeout in seconds if self.verbose: self.logger.info( - 'Successfully get response from OpenAI API') + 'Successfully get response from OpenAI API ' + 'with query: %s', query_data) try: self.logger.info(responses) except Exception: @@ -672,6 +698,13 @@ class OpenAISDK(OpenAI): 'reasoning_content', '', ))): # noqa: E125 + # There is case that server does not return any content + if responses.choices[0].finish_reason == 'stop': + self.logger.info( + 'Server does not return any content ' + 'and stop reason is , ' + 'the input query is: %s', query_data) + return '' self.logger.error( 'Failed to extract content from the responses. ' 'Please check the API response for detail information.' 
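Note on the reasoning-content handling added in the hunk above: when the API response carries a reasoning_content field, the patched wrapper prepends it to the visible answer, joined by the configurable think_tag (which defaults to an empty string). The standalone sketch below is illustrative only and not part of the patch; the helper name merge_reasoning and the '</think>' tag value are hypothetical.

def merge_reasoning(content, reasoning_content, think_tag='</think>'):
    # Mirror the branch added in this commit: normalize missing fields to ''.
    content = content or ''
    reasoning_content = reasoning_content or ''
    if reasoning_content:
        if content:
            # Reasoning first, then the tag, then the final answer text.
            return reasoning_content + think_tag + content
        return reasoning_content
    # No reasoning returned: keep the previous behaviour and strip the answer.
    return content.strip()

# Expected behaviour of the sketch:
assert merge_reasoning('4', 'Compute 2 + 2.') == 'Compute 2 + 2.</think>4'
assert merge_reasoning('', 'only reasoning returned') == 'only reasoning returned'
assert merge_reasoning('  plain answer  ', '') == 'plain answer'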
From 9eaa1f6fec590e963f716c0c21a82e07f31449bc Mon Sep 17 00:00:00 2001 From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> Date: Tue, 13 May 2025 10:44:24 +0800 Subject: [PATCH 21/28] Update icl_judge_evaluator.py (#2095) --- opencompass/openicl/icl_evaluator/icl_judge_evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencompass/openicl/icl_evaluator/icl_judge_evaluator.py b/opencompass/openicl/icl_evaluator/icl_judge_evaluator.py index e59cdc12..1de520cf 100644 --- a/opencompass/openicl/icl_evaluator/icl_judge_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_judge_evaluator.py @@ -352,7 +352,7 @@ class Judgerbenchv2Evaluator(BaseEvaluator): total_normalized_diff = sum(normalized_diffs.values()) / len( normalized_diffs.values()) * 100 acc = 100 * correct / count - final_score = acc - total_normalized_diff + final_score = (acc - total_normalized_diff + 100) / 2 result = { 'accuracy': acc, 'rank_diff': total_rank_diff, From d60f59dcab3fce015d7db9682397ed8d1f7dcc2b Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Tue, 13 May 2025 14:01:47 +0800 Subject: [PATCH 22/28] [CI] update baseline and fix lmdeploy version (#2098) * update * update * update * update * update * update --- .github/scripts/oc_score_baseline.yaml | 6 +- .../scripts/oc_score_baseline_fullbench.yaml | 156 +++++++++--------- .../scripts/oc_score_baseline_testrange.yaml | 130 +++++++-------- .github/workflows/daily-run-test.yml | 10 +- 4 files changed, 151 insertions(+), 151 deletions(-) diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index e4567553..2ea7a5e5 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -9,7 +9,7 @@ internlm2_5-7b_hf: race-high_accuracy: 90.02 internlm2_5-7b-chat-lmdeploy: - demo_gsm8k_accuracy: 87.50 + demo_gsm8k_accuracy: 84.38 race-middle_accuracy: 92.76 race-high_accuracy: 90.54 @@ -24,7 +24,7 @@ internlm3-8b-instruct_hf-lmdeploy: race-high_accuracy: 90.34 internlm3-8b-instruct_hf-vllm: - demo_gsm8k_accuracy: 81.25 + demo_gsm8k_accuracy: 78.12 race-middle_accuracy: 92.20 race-high_accuracy: 89.88 @@ -34,6 +34,6 @@ internlm2_5-7b-chat_hf: race-high_accuracy: 90.48 lmdeploy-api-test: - gsm8k_accuracy: 56.25 + gsm8k_accuracy: 68.75 race-middle_accuracy: 93.75 race-high_accuracy: 93.75 diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index fd355c0e..471c6602 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench: college_knowledge_naive_average: 87.5 subjective: alignment_bench_v1_1_总分: 0.66 - alpaca_eval_total: 0 - arenahard_score: 50 + alpaca_eval_total: 20.00 + arenahard_score: 56.82 Followbench_naive_average: 1 CompassArena_naive_average: 43 - mtbench101_avg: 7.8 - wildbench_average: -15.56 - simpleqa_accuracy_given_attempted: 0 - chinese_simpleqa_given_attempted_accuracy: 1 - alignment_bench_v1_1_专业能力: 8.00 + mtbench101_avg: 7.60 + wildbench_average: -14.58 + simpleqa_accuracy_given_attempted: 1.00 + chinese_simpleqa_given_attempted_accuracy: 0.90 + alignment_bench_v1_1_专业能力: 7.90 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 @@ -55,11 +55,11 @@ internlm2_5-7b-chat-hf_fullbench: alignment_bench_v1_1_文本写作: 0 alignment_bench_v1_1_角色扮演: 0 alignment_bench_v1_1_综合问答: 0 - 
alpaca_eval_helpful_base: 0 + alpaca_eval_helpful_base: 20.00 compassarena_language_naive_average: 35 - compassarena_knowledge_naive_average: 55 + compassarena_knowledge_naive_average: 60.00 compassarena_reason_v2_naive_average: 40 - compassarena_math_v2_naive_average: 55 + compassarena_math_v2_naive_average: 50.00 compassarena_creationv2_zh_naive_average: 30 followbench_llmeval_en_HSR_AVG: 1 followbench_llmeval_en_SSR_AVG: 1 @@ -73,58 +73,58 @@ internlm2_5-7b-chat-hf_fullbench: followbench_llmeval_en_SSR_L3: 1 followbench_llmeval_en_SSR_L4: 1 followbench_llmeval_en_SSR_L5: 1 - simpleqa_f1: 0 + simpleqa_f1: 0.12 internlm2_5-7b-chat-turbomind_fullbench: objective: race-high_accuracy: 93.75 - ARC-c_accuracy: 87.50 - BoolQ_accuracy: 68.75 + ARC-c_accuracy: 93.75 + BoolQ_accuracy: 75.00 triviaqa_wiki_1shot_score: 50 nq_open_1shot_score: 25 IFEval_Prompt-level-strict-accuracy: 56.25 drop_accuracy: 75 - GPQA_diamond_accuracy: 31.25 - hellaswag_accuracy: 87.5 + GPQA_diamond_accuracy: 37.50 + hellaswag_accuracy: 81.25 TheoremQA_score: 12.5 musr_average_naive_average: 39.58 korbench_single_naive_average: 40 - gsm8k_accuracy: 62.5 - math_accuracy: 75 + gsm8k_accuracy: 68.75 + math_accuracy: 68.75 cmo_fib_accuracy: 6.25 aime2024_accuracy: 6.25 wikibench-wiki-single_choice_cncircular_perf_4: 25 sanitized_mbpp_score: 68.75 - ds1000_naive_average: 17.86 + ds1000_naive_average: 15.18 lcb_code_generation_pass@1: 12.5 lcb_code_execution_pass@1: 43.75 - lcb_test_output_pass@1: 18.75 - bbh-logical_deduction_seven_objects_score: 56.25 - bbh-multistep_arithmetic_two_score: 75 - mmlu-other_accuracy: 72.6 - cmmlu-china-specific_accuracy: 78.33 - mmlu_pro_math_accuracy: 31.25 - ds1000_Pandas_accuracy: 12.5 + lcb_test_output_pass@1: 0.00 + bbh-logical_deduction_seven_objects_score: 62.50 + bbh-multistep_arithmetic_two_score: 62.50 + mmlu-other_accuracy: 73.08 + cmmlu-china-specific_accuracy: 75.42 + mmlu_pro_math_accuracy: 25.00 + ds1000_Pandas_accuracy: 0.00 ds1000_Numpy_accuracy: 0 ds1000_Tensorflow_accuracy: 12.5 - ds1000_Scipy_accuracy: 25 + ds1000_Scipy_accuracy: 18.75 ds1000_Sklearn_accuracy: 18.75 - ds1000_Pytorch_accuracy: 6.25 - ds1000_Matplotlib_accuracy: 50.00 + ds1000_Pytorch_accuracy: 12.50 + ds1000_Matplotlib_accuracy: 43.75 openai_mmmlu_lite_AR-XY_accuracy: 37.5 college_naive_average: 12.50 college_knowledge_naive_average: 87.5 subjective: - alignment_bench_v1_1_总分: 0.66 - alpaca_eval_total: 0 - arenahard_score: 50 + alignment_bench_v1_1_总分: 0.72 + alpaca_eval_total: 20.00 + arenahard_score: 55.77 Followbench_naive_average: 1 - CompassArena_naive_average: 40 - mtbench101_avg: 8 - wildbench_average: -6.81 - simpleqa_accuracy_given_attempted: 0 + CompassArena_naive_average: 39.00 + mtbench101_avg: 7.90 + wildbench_average: 0.00 + simpleqa_accuracy_given_attempted: 1.00 chinese_simpleqa_given_attempted_accuracy: 1 - alignment_bench_v1_1_专业能力: 7.9 + alignment_bench_v1_1_专业能力: 8.70 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 @@ -132,12 +132,12 @@ internlm2_5-7b-chat-turbomind_fullbench: alignment_bench_v1_1_文本写作: 0 alignment_bench_v1_1_角色扮演: 0 alignment_bench_v1_1_综合问答: 0 - alpaca_eval_helpful_base: 0 - compassarena_language_naive_average: 35 - compassarena_knowledge_naive_average: 45 - compassarena_reason_v2_naive_average: 25 - compassarena_math_v2_naive_average: 60 - compassarena_creationv2_zh_naive_average: 35 + alpaca_eval_helpful_base: 20.00 + compassarena_language_naive_average: 25.00 + compassarena_knowledge_naive_average: 55.00 + 
compassarena_reason_v2_naive_average: 35.00 + compassarena_math_v2_naive_average: 55.00 + compassarena_creationv2_zh_naive_average: 25.00 followbench_llmeval_en_HSR_AVG: 1 followbench_llmeval_en_SSR_AVG: 1 followbench_llmeval_en_HSR_L1: 1 @@ -150,7 +150,7 @@ internlm2_5-7b-chat-turbomind_fullbench: followbench_llmeval_en_SSR_L3: 1 followbench_llmeval_en_SSR_L4: 1 followbench_llmeval_en_SSR_L5: 1 - simpleqa_f1: 0 + simpleqa_f1: 0.12 internlm2_5-7b-hf_fullbench: objective: @@ -162,7 +162,7 @@ internlm2_5-7b-hf_fullbench: drop_accuracy: 62.5 GPQA_diamond_accuracy: 62.5 hellaswag_accuracy: 93.75 - TheoremQA_score: 12.50 + TheoremQA_score: 18.75 winogrande_accuracy: 75 gsm8k_accuracy: 37.5 GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5 @@ -188,23 +188,23 @@ internlm2_5-7b-turbomind_fullbench: triviaqa_wiki_1shot_score: 43.75 nq_open_1shot_score: 43.75 drop_accuracy: 62.5 - GPQA_diamond_accuracy: 62.5 + GPQA_diamond_accuracy: 68.75 hellaswag_accuracy: 93.75 - TheoremQA_score: 12.50 + TheoremQA_score: 18.75 winogrande_accuracy: 87.5 - gsm8k_accuracy: 56.25 - GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75 + gsm8k_accuracy: 62.50 + GaokaoBench_2010-2022_Math_II_MCQs_score: 93.75 GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 - math_accuracy: 18.75 - wikibench-wiki-single_choice_cncircular_perf_4: 25 + math_accuracy: 6.25 + wikibench-wiki-single_choice_cncircular_perf_4: 0.00 sanitized_mbpp_score: 62.50 - dingo_en_192_score: 50.00 - dingo_zh_170_score: 93.75 - mmlu-other_accuracy: 76.92 - cmmlu-china-specific_accuracy: 84.17 + dingo_en_192_score: 37.50 + dingo_zh_170_score: 100.00 + mmlu-other_accuracy: 78.37 + cmmlu-china-specific_accuracy: 83.33 mmlu_pro_math_accuracy: 18.75 - bbh-logical_deduction_seven_objects_score: 43.75 - bbh-multistep_arithmetic_two_score: 56.25 + bbh-logical_deduction_seven_objects_score: 62.50 + bbh-multistep_arithmetic_two_score: 50.00 college_naive_average: 12.5 college_knowledge_naive_average: 87.5 @@ -230,7 +230,7 @@ internlm2_5-7b-turbomind: mmlu_naive_average: 71.44 mmlu_pro_naive_average: 38.18 openai_humaneval_humaneval_pass@1: 59.76 - openai_humaneval_v2_humaneval_pass@1: 51.22 + openai_humaneval_v2_humaneval_pass@1: 57.93 sanitized_mbpp_score: 55.25 dingo_en_192_score: 60.94 dingo_zh_170_score: 67.65 @@ -257,17 +257,17 @@ internlm2_5-7b-turbomind: mmlu_pro_physics_accuracy: 26.02 mmlu_pro_psychology_accuracy: 52.76 mmlu_pro_other_accuracy: 42.21 - college_naive_average: 10.67 + college_naive_average: 7.00 high_naive_average: 6.67 middle_naive_average: 26.67 - primary_naive_average: 60 + primary_naive_average: 64.00 arithmetic_naive_average: 55 mathbench-a (average)_naive_average: 31.8 - college_knowledge_naive_average: 62.34 - high_knowledge_naive_average: 59.83 + college_knowledge_naive_average: 58.23 + high_knowledge_naive_average: 52.51 middle_knowledge_naive_average: 71.15 - primary_knowledge_naive_average: 66.55 - mathbench-t (average)_naive_average: 64.97 + primary_knowledge_naive_average: 60.48 + mathbench-t (average)_naive_average: 60.19 long_context: Single-Needle-Retrieval(S-RT)-32000_naive_average: 100 Single-Needle-Retrieval-EN-32000_naive_average: 100 @@ -309,7 +309,7 @@ internlm2_5-7b-chat-turbomind: GaokaoBench_weighted_average: 78.6 math_accuracy: 61 cmo_fib_accuracy: 11 - aime2024_accuracy: 6.67 + aime2024_accuracy: 3.33 Mathbench_naive_average: 64.23 wikibench-wiki-single_choice_cncircular_perf_4: 31.32 cmmlu_naive_average: 74.3 @@ -322,7 +322,7 @@ internlm2_5-7b-chat-turbomind: lcb_code_generation_pass@1: 17.75 
lcb_code_execution_pass@1: 32.57 lcb_test_output_pass@1: 26.13 - bigcodebench_hard_instruct_pass@1: 8.45 + bigcodebench_hard_instruct_pass@1: 3.38 bigcodebench_hard_complete_pass@1: 5.06 teval_naive_average: 80 SciCode_sub_accuracy: 5.56 @@ -384,7 +384,7 @@ internlm2_5-7b-chat-turbomind: college_knowledge_naive_average: 67.1 high_knowledge_naive_average: 70 middle_knowledge_naive_average: 80 - primary_knowledge_naive_average: 87 + primary_knowledge_naive_average: 90.12 mathbench-t (average)_naive_average: 76 subjective: alignment_bench_v1_1_总分: 5.68 @@ -409,11 +409,11 @@ internlm2_5-7b-chat-turbomind: alpaca_eval_koala: 28.21 alpaca_eval_oasst: 23.4 alpaca_eval_selfinstruct: 30.95 - alpaca_eval_vicuna: 33.75 - compassarena_language_naive_average: 58.50 + alpaca_eval_vicuna: 25.00 + compassarena_language_naive_average: 53.00 compassarena_knowledge_naive_average: 36 compassarena_reason_v2_naive_average: 35 - compassarena_math_v2_naive_average: 25.95 + compassarena_math_v2_naive_average: 16.07 compassarena_creationv2_zh_naive_average: 43.64 fofo_test_prompts_overall: 0.35 fofo_test_prompts_cn_overall: 0.41 @@ -524,7 +524,7 @@ qwen2.5-7b-instruct-turbomind: humanevalx-python_pass@1: 50 humanevalx-cpp_pass@1: 42.07 humanevalx-go_pass@1: 0 - humanevalx-java_pass@1: 74.39 + humanevalx-java_pass@1: 53.05 humanevalx-js_pass@1: 75 ds1000_Pandas_accuracy: 14.09 ds1000_Numpy_accuracy: 8.18 @@ -548,7 +548,7 @@ qwen2.5-7b-instruct-turbomind: openai_mmmlu_lite_SW-KE_accuracy: 36.42 openai_mmmlu_lite_YO-NG_accuracy: 32.14 openai_mmmlu_lite_ZH-CN_accuracy: 69.61 - college_naive_average: 48 + college_naive_average: 44.33 high_naive_average: 59 middle_naive_average: 78 primary_naive_average: 85.67 @@ -658,7 +658,7 @@ internlm2_5-7b-chat-pytorch: college_naive_average: 21 high_naive_average: 47 middle_naive_average: 59.67 - primary_naive_average: 76 + primary_naive_average: 72.33 arithmetic_naive_average: 62 mathbench-a (average)_naive_average: 53.13 college_knowledge_naive_average: 68.99 @@ -688,7 +688,7 @@ qwen2.5-7b-instruct-pytorch: gsm8k_accuracy: 91.66 GaokaoBench_weighted_average: 80.02 math_accuracy: 73.74 - cmo_fib_accuracy: 26.44 + cmo_fib_accuracy: 22.60 aime2024_accuracy: 13.33 Mathbench_naive_average: 77.08 wikibench-wiki-single_choice_cncircular_perf_4: 34 @@ -793,8 +793,8 @@ internlm3-8b-instruct-turbomind: gsm8k_accuracy: 91.28 GaokaoBench_weighted_average: 86.59 math_accuracy: 76.96 - cmo_fib_accuracy: 35.1 - aime2024_accuracy: 16.67 + cmo_fib_accuracy: 38.46 + aime2024_accuracy: 13.33 Mathbench_naive_average: 78.96 wikibench-wiki-single_choice_cncircular_perf_4: 37.45 cmmlu_naive_average: 83.33 @@ -841,7 +841,7 @@ internlm3-8b-instruct-turbomind: humanevalx-python_pass@1: 43.9 humanevalx-cpp_pass@1: 20.12 humanevalx-go_pass@1: 0 - humanevalx-java_pass@1: 74.39 + humanevalx-java_pass@1: 40.85 humanevalx-js_pass@1: 65.24 ds1000_Pandas_accuracy: 16.49 ds1000_Numpy_accuracy: 34.09 @@ -907,7 +907,7 @@ internlm3-8b-instruct-pytorch: mmlu_pro_naive_average: 58.16 openai_humaneval_humaneval_pass@1: 82.32 sanitized_mbpp_score: 70.04 - humanevalx_naive_average: 39.76 + humanevalx_naive_average: 25.49 ds1000_naive_average: 27.84 lcb_code_generation_pass@1: 34.5 lcb_code_execution_pass@1: 48.02 @@ -946,7 +946,7 @@ internlm3-8b-instruct-pytorch: humanevalx-python_pass@1: 42.68 humanevalx-cpp_pass@1: 19.51 humanevalx-go_pass@1: 0 - humanevalx-java_pass@1: 72.56 + humanevalx-java_pass@1: 0.00 humanevalx-js_pass@1: 64.02 ds1000_Pandas_accuracy: 14.09 ds1000_Numpy_accuracy: 35 diff --git 
a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index 94a28d36..64ceccd1 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -12,13 +12,13 @@ chat: gsm8k_accuracy: 46.88 race-high_accuracy: 81.25 deepseek-r1-distill-llama-8b-turbomind: - gsm8k_accuracy: 31.25 + gsm8k_accuracy: 34.38 race-high_accuracy: 81.25 deepseek-r1-distill-qwen-1_5b-turbomind: - gsm8k_accuracy: 37.5 + gsm8k_accuracy: 28.12 race-high_accuracy: 53.12 deepseek-7b-chat-vllm: - gsm8k_accuracy: 43.75 + gsm8k_accuracy: 56.25 race-high_accuracy: 78.12 gemma2-2b-it-hf: gsm8k_accuracy: 50 @@ -33,13 +33,13 @@ chat: gsm8k_accuracy: 40.62 race-high_accuracy: 68.75 gemma-2-9b-it-turbomind: - gsm8k_accuracy: 71.88 + gsm8k_accuracy: 68.75 race-high_accuracy: 84.38 gemma-2-27b-it-turbomind: gsm8k_accuracy: 78.12 race-high_accuracy: 93.75 gemma-7b-it-vllm: - gsm8k_accuracy: 31.25 + gsm8k_accuracy: 28.12 race-high_accuracy: 68.75 internlm2_5-7b-chat-hf: gsm8k_accuracy: 84.38 @@ -48,25 +48,25 @@ chat: gsm8k_accuracy: 65.62 race-high_accuracy: 87.5 internlm2_5-7b-chat-turbomind: - gsm8k_accuracy: 84.38 + gsm8k_accuracy: 81.25 race-high_accuracy: 90.62 internlm2-chat-1.8b-turbomind: - gsm8k_accuracy: 28.12 + gsm8k_accuracy: 25.00 race-high_accuracy: 84.38 internlm2-chat-1.8b-sft-turbomind: - gsm8k_accuracy: 31.25 + gsm8k_accuracy: 34.38 race-high_accuracy: 84.38 internlm2-chat-7b-lmdeploy: gsm8k_accuracy: 59.38 - race-high_accuracy: 84.38 + race-high_accuracy: 87.50 internlm2-chat-7b-sft-turbomind: gsm8k_accuracy: 56.25 - race-high_accuracy: 90.62 + race-high_accuracy: 87.50 internlm3-8b-instruct-turbomind: - gsm8k_accuracy: 68.75 + gsm8k_accuracy: 65.62 race-high_accuracy: 87.5 internlm2-chat-7b-vllm: - gsm8k_accuracy: 59.38 + gsm8k_accuracy: 53.12 race-high_accuracy: 87.50 llama-3_1-8b-instruct-hf: gsm8k_accuracy: 84.38 @@ -81,13 +81,13 @@ chat: gsm8k_accuracy: 18.75 race-high_accuracy: 46.88 llama-3_1-8b-instruct-turbomind: - gsm8k_accuracy: 81.25 + gsm8k_accuracy: 84.38 race-high_accuracy: 90.62 llama-3_2-3b-instruct-turbomind: - gsm8k_accuracy: 68.75 + gsm8k_accuracy: 65.62 race-high_accuracy: 81.25 llama-3-8b-instruct-turbomind: - gsm8k_accuracy: 68.75 + gsm8k_accuracy: 65.62 race-high_accuracy: 84.38 mistral-7b-instruct-v0.2-hf: gsm8k_accuracy: 40.62 @@ -100,12 +100,12 @@ chat: race-high_accuracy: 81.25 mistral-nemo-instruct-2407-turbomind: gsm8k_accuracy: 71.88 - race-high_accuracy: 78.12 + race-high_accuracy: 75 mistral-7b-instruct-v0.1-vllm: gsm8k_accuracy: 34.38 race-high_accuracy: 65.62 mistral-7b-instruct-v0.2-vllm: - gsm8k_accuracy: 21.88 + gsm8k_accuracy: 28.12 race-high_accuracy: 78.12 qwen2.5-0.5b-instruct-hf: gsm8k_accuracy: 34.38 @@ -114,7 +114,7 @@ chat: gsm8k_accuracy: 53.12 race-high_accuracy: 90.62 qwen2.5-0.5b-instruct-turbomind: - gsm8k_accuracy: 31.25 + gsm8k_accuracy: 28.12 race-high_accuracy: 43.75 qwen2.5-3b-instruct-turbomind: gsm8k_accuracy: 56.25 @@ -132,10 +132,10 @@ chat: gsm8k_accuracy: 56.25 race-high_accuracy: 84.38 qwen2-7b-instruct-turbomind: - gsm8k_accuracy: 81.25 + gsm8k_accuracy: 75.00 race-high_accuracy: 87.50 qwen1.5-0.5b-chat-vllm: - gsm8k_accuracy: 3.12 + gsm8k_accuracy: 6.25 race-high_accuracy: 53.12 yi-1.5-6b-chat-hf: gsm8k_accuracy: 65.62 @@ -144,13 +144,13 @@ chat: gsm8k_accuracy: 75 race-high_accuracy: 93.75 yi-1.5-6b-chat-turbomind: - gsm8k_accuracy: 62.5 + gsm8k_accuracy: 59.38 race-high_accuracy: 84.38 yi-1.5-9b-chat-turbomind: - gsm8k_accuracy: 71.88 + 
gsm8k_accuracy: 78.12 race-high_accuracy: 93.75 deepseek-v2_lite-chat-turbomind: - gsm8k_accuracy: 37.5 + gsm8k_accuracy: 43.75 race-high_accuracy: 71.88 gemma2-27b-it-hf: gsm8k_accuracy: 71.88 @@ -165,7 +165,7 @@ chat: gsm8k_accuracy: 81.25 race-high_accuracy: 87.50 mistral-small-instruct-2409-turbomind: - gsm8k_accuracy: 81.25 + gsm8k_accuracy: 78.12 race-high_accuracy: 87.50 phi-4: gsm8k_accuracy: 81.25 @@ -174,16 +174,16 @@ chat: gsm8k_accuracy: 71.88 race-high_accuracy: 96.88 qwen2.5-14b-instruct-turbomind: - gsm8k_accuracy: 68.75 - race-high_accuracy: 93.75 + gsm8k_accuracy: 71.88 + race-high_accuracy: 96.88 yi-1.5-34b-chat-turbomind: - gsm8k_accuracy: 75.00 + gsm8k_accuracy: 71.88 race-high_accuracy: 93.75 deepseek-67b-chat-turbomind: - gsm8k_accuracy: 75.00 - race-high_accuracy: 78.12 + gsm8k_accuracy: 71.88 + race-high_accuracy: 75.00 deepseek-r1-distill-qwen-32b-turbomind: - gsm8k_accuracy: 25 + gsm8k_accuracy: 31.25 race-high_accuracy: 90.62 llama-3_3-70b-instruct-turbomind: gsm8k_accuracy: 93.75 @@ -192,19 +192,19 @@ chat: gsm8k_accuracy: 87.50 race-high_accuracy: 93.75 nvidia-3_1-Nemotron-70b-instruct-HF-turbomind: - gsm8k_accuracy: 93.75 - race-high_accuracy: 50.00 + gsm8k_accuracy: 90.62 + race-high_accuracy: 53.12 qwen2.5-72b-instruct-turbomind: - gsm8k_accuracy: 81.25 + gsm8k_accuracy: 78.12 race-high_accuracy: 90.62 deepseek-r1-distill-llama-70b-turbomind: - gsm8k_accuracy: 40.62 - race-high_accuracy: 90.62 + gsm8k_accuracy: 50.00 + race-high_accuracy: 87.50 deepseek-v2_5-1210-turbomind: gsm8k_accuracy: 90.62 race-high_accuracy: 84.38 mixtral-8x22b-instruct-v0.1-turbomind: - gsm8k_accuracy: 78.12 + gsm8k_accuracy: 75.00 race-high_accuracy: 78.12 mixtral-8x22b-instruct-v0.1-vllm: gsm8k_accuracy: 78.12 @@ -222,11 +222,11 @@ base: winogrande_accuracy: 71.88 deepseek-7b-base-turbomind: gsm8k_accuracy: 18.75 - GPQA_diamond_accuracy: 0 - race-high_accuracy: 43.75 + GPQA_diamond_accuracy: 3.12 + race-high_accuracy: 50.00 winogrande_accuracy: 84.38 deepseek-moe-16b-base-vllm: - gsm8k_accuracy: 21.88 + gsm8k_accuracy: 25.00 GPQA_diamond_accuracy: 0 race-high_accuracy: 25 winogrande_accuracy: 68.75 @@ -253,15 +253,15 @@ base: gemma-2-9b-turbomind: gsm8k_accuracy: 68.75 GPQA_diamond_accuracy: 0 - race-high_accuracy: 18.75 - winogrande_accuracy: 46.88 + race-high_accuracy: 84.38 + winogrande_accuracy: 81.25 gemma-2b-vllm: gsm8k_accuracy: 15.62 GPQA_diamond_accuracy: 3.12 race-high_accuracy: 28.12 winogrande_accuracy: 68.75 gemma-7b-vllm: - gsm8k_accuracy: 43.75 + gsm8k_accuracy: 59.38 GPQA_diamond_accuracy: 6.25 race-high_accuracy: 81.25 winogrande_accuracy: 81.25 @@ -276,8 +276,8 @@ base: race-high_accuracy: 62.5 winogrande_accuracy: 78.12 internlm2-1.8b-turbomind: - gsm8k_accuracy: 6.25 - GPQA_diamond_accuracy: 12.5 + gsm8k_accuracy: 12.50 + GPQA_diamond_accuracy: 9.38 race-high_accuracy: 71.88 winogrande_accuracy: 75 internlm2_5-7b-turbomind: @@ -286,13 +286,13 @@ base: race-high_accuracy: 93.75 winogrande_accuracy: 87.5 internlm2-7b-turbomind: - gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 34.38 + gsm8k_accuracy: 53.12 + GPQA_diamond_accuracy: 25.00 race-high_accuracy: 78.12 winogrande_accuracy: 71.88 internlm2-base-7b-turbomind: - gsm8k_accuracy: 28.12 - GPQA_diamond_accuracy: 31.25 + gsm8k_accuracy: 25.00 + GPQA_diamond_accuracy: 34.38 race-high_accuracy: 71.88 winogrande_accuracy: 62.50 llama-2-7b-hf: @@ -311,8 +311,8 @@ base: race-high_accuracy: 65.62 winogrande_accuracy: 65.62 llama-3.1-8b-turbomind: - gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 15.62 + gsm8k_accuracy: 
56.25 + GPQA_diamond_accuracy: 9.38 race-high_accuracy: 78.12 winogrande_accuracy: 78.12 llama-3-8b-turbomind: @@ -332,12 +332,12 @@ base: winogrande_accuracy: 71.88 qwen2.5-1.5b-turbomind: gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 18.75 - race-high_accuracy: 75 + GPQA_diamond_accuracy: 21.88 + race-high_accuracy: 78.12 winogrande_accuracy: 71.88 qwen2.5-7b-turbomind: - gsm8k_accuracy: 71.88 - GPQA_diamond_accuracy: 18.75 + gsm8k_accuracy: 78.12 + GPQA_diamond_accuracy: 21.88 race-high_accuracy: 87.5 winogrande_accuracy: 75.00 qwen1.5-moe-a2.7b-hf: @@ -361,18 +361,18 @@ base: race-high_accuracy: 87.5 winogrande_accuracy: 68.75 qwen2-1.5b-turbomind: - gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 6.25 + gsm8k_accuracy: 56.25 + GPQA_diamond_accuracy: 12.50 race-high_accuracy: 81.25 winogrande_accuracy: 75 qwen2-7b-turbomind: - gsm8k_accuracy: 62.5 + gsm8k_accuracy: 65.62 GPQA_diamond_accuracy: 12.5 race-high_accuracy: 87.5 winogrande_accuracy: 75 qwen1.5-0.5b-vllm: gsm8k_accuracy: 9.38 - GPQA_diamond_accuracy: 0 + GPQA_diamond_accuracy: 3.12 race-high_accuracy: 56.25 winogrande_accuracy: 59.38 yi-1.5-6b-hf: @@ -386,7 +386,7 @@ base: race-high_accuracy: 87.5 winogrande_accuracy: 59.38 yi-1.5-9b-turbomind: - gsm8k_accuracy: 78.12 + gsm8k_accuracy: 75.00 GPQA_diamond_accuracy: 40.62 race-high_accuracy: 87.5 winogrande_accuracy: 65.62 @@ -406,13 +406,13 @@ base: race-high_accuracy: 93.75 winogrande_accuracy: 78.12 qwen2.5-32b-turbomind: - gsm8k_accuracy: 84.38 - GPQA_diamond_accuracy: 28.12 + gsm8k_accuracy: 90.62 + GPQA_diamond_accuracy: 31.25 race-high_accuracy: 93.75 winogrande_accuracy: 81.25 deepseek-67b-base-turbomind: - gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 34.38 + gsm8k_accuracy: 62.50 + GPQA_diamond_accuracy: 31.25 race-high_accuracy: 78.12 winogrande_accuracy: 81.25 llama-3-70b-turbomind: @@ -422,11 +422,11 @@ base: winogrande_accuracy: 84.38 qwen2.5-72b-turbomind: gsm8k_accuracy: 84.38 - GPQA_diamond_accuracy: 31.25 + GPQA_diamond_accuracy: 40.62 race-high_accuracy: 93.75 winogrande_accuracy: 87.5 deepseek-v2-turbomind: gsm8k_accuracy: 65.62 - GPQA_diamond_accuracy: 9.38 + GPQA_diamond_accuracy: 3.12 race-high_accuracy: 93.75 winogrande_accuracy: 81.25 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index e6000c09..f755f56c 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -17,7 +17,7 @@ on: required: false description: 'whether to build lmdeploy' type: boolean - default: true + default: false repo_org_lmdeploy: required: false description: 'Tested repository organization name. Default is internlm/lmdeploy' @@ -146,7 +146,7 @@ jobs: - name: Prepare - create conda env and install torch - cu12 uses: nick-fields/retry@v3 with: - max_attempts: 1 + max_attempts: 3 timeout_minutes: 120 command: | . 
${{env.CONDA_PATH}}/bin/activate @@ -182,7 +182,7 @@ jobs: pip list daily_run_test_volc: - if: ${{!cancelled()}} + if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}} needs: prepare_env strategy: fail-fast: false @@ -222,7 +222,7 @@ jobs: daily_run_test_local: - if: ${{!cancelled()}} + if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}} needs: prepare_env strategy: fail-fast: false @@ -303,7 +303,7 @@ jobs: python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py fullbench_run_test: - if: ${{!cancelled()}} + if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}} needs: prepare_env strategy: fail-fast: false From b84518c656034a46cbe65d89c1819b92ca205ae2 Mon Sep 17 00:00:00 2001 From: Wei Li Date: Tue, 13 May 2025 17:10:50 +0800 Subject: [PATCH 23/28] [Dataset] Support MedMCQA and MedBullets benchmark (#2054) * support medmcqa and medbullets benchmark * Add Medbullets data folder for benchmark support * revise gen name * revise config file & remove csv file & add dataset info to dataset-index.yml * remove csv file * remove print in medbullets.py * revise class name * update_oss_info --------- Co-authored-by: MaiziXiao --- dataset-index.yml | 12 + .../datasets/Medbullets/medbullets_gen.py | 4 + .../Medbullets/medbullets_gen_60c8f5.py | 59 +++++ .../Medbullets/medbullets_llmjudge_gen.py | 4 + .../medbullets_llmjudge_gen_60c8f5.py | 106 ++++++++ .../configs/datasets/medmcqa/medmcqa_gen.py | 4 + .../datasets/medmcqa/medmcqa_gen_60c8f5.py | 58 ++++ .../datasets/medmcqa/medmcqa_llmjudge_gen.py | 4 + .../medmcqa/medmcqa_llmjudge_gen_60c8f5.py | 105 ++++++++ opencompass/datasets/Medbullets.py | 243 +++++++++++++++++ opencompass/datasets/__init__.py | 2 + opencompass/datasets/medmcqa.py | 247 ++++++++++++++++++ opencompass/utils/datasets_info.py | 10 + 13 files changed, 858 insertions(+) create mode 100644 opencompass/configs/datasets/Medbullets/medbullets_gen.py create mode 100644 opencompass/configs/datasets/Medbullets/medbullets_gen_60c8f5.py create mode 100644 opencompass/configs/datasets/Medbullets/medbullets_llmjudge_gen.py create mode 100644 opencompass/configs/datasets/Medbullets/medbullets_llmjudge_gen_60c8f5.py create mode 100644 opencompass/configs/datasets/medmcqa/medmcqa_gen.py create mode 100644 opencompass/configs/datasets/medmcqa/medmcqa_gen_60c8f5.py create mode 100644 opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen.py create mode 100644 opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen_60c8f5.py create mode 100644 opencompass/datasets/Medbullets.py create mode 100644 opencompass/datasets/medmcqa.py diff --git a/dataset-index.yml b/dataset-index.yml index 5ebad535..984d34a6 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -1095,3 +1095,15 @@ paper: https://arxiv.org/pdf/2308.04709 configpath: opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_gen.py configpath_llmjudge: opencompass/configs/datasets/nejm_ai_benchmark/nejmaibench_llmjudge_gen.py +- medbullets: + name: Medbullets + category: Science /Medicine + paper: https://arxiv.org/pdf/2402.18060 + configpath: opencompass/configs/datasets/Medbullets/medbullets_gen.py + configpath_llmjudge: opencompass/configs/datasets/Medbullets/medbullets_llmjudge_gen.py +- medmcqa: + name: medmcqa + category: Science /Medicine + paper: https://arxiv.org/pdf/2203.14371 + configpath: opencompass/configs/datasets/medmcqa/medmcqa_gen.py + configpath_llmjudge: opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen.py 
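For reference, the two config paths registered in dataset-index.yml above can be wired into a run in the same way as the other dataset configs in this series. The snippet below is a user-side sketch, not part of the patch; model, runner, and work_dir settings are omitted, and the imported variable names come from the config files added further down.

from mmengine.config import read_base

with read_base():
    # Dataset configs added by this patch (paths taken from dataset-index.yml).
    from opencompass.configs.datasets.Medbullets.medbullets_gen import \
        medbullets_datasets
    from opencompass.configs.datasets.medmcqa.medmcqa_gen import \
        medmcqa_datasets

# Combine both medical MCQ benchmarks into a single run.
datasets = [*medbullets_datasets, *medmcqa_datasets]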
diff --git a/opencompass/configs/datasets/Medbullets/medbullets_gen.py b/opencompass/configs/datasets/Medbullets/medbullets_gen.py new file mode 100644 index 00000000..ef90893c --- /dev/null +++ b/opencompass/configs/datasets/Medbullets/medbullets_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .medbullets_gen_60c8f5 import medbullets_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/Medbullets/medbullets_gen_60c8f5.py b/opencompass/configs/datasets/Medbullets/medbullets_gen_60c8f5.py new file mode 100644 index 00000000..c506934a --- /dev/null +++ b/opencompass/configs/datasets/Medbullets/medbullets_gen_60c8f5.py @@ -0,0 +1,59 @@ +from opencompass.datasets import MedbulletsDataset, MedbulletsEvaluator +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +import os + +SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this? +ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n' + +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'question', + 'options', + 'question_type', + 'prompt_mode', + + ], + output_column='label', +) + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT), + ], + round=[ + dict( + role='HUMAN', + prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict(type=MedbulletsEvaluator), + pred_role='BOT', +) +medbullets_dataset = dict( + type=MedbulletsDataset, + abbr='medbullets', + path='opencompass/medbullets', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + +) + +medbullets_datasets = [medbullets_dataset] diff --git a/opencompass/configs/datasets/Medbullets/medbullets_llmjudge_gen.py b/opencompass/configs/datasets/Medbullets/medbullets_llmjudge_gen.py new file mode 100644 index 00000000..d0f1cff9 --- /dev/null +++ b/opencompass/configs/datasets/Medbullets/medbullets_llmjudge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .medbullets_llmjudge_gen_60c8f5 import medbullets_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/Medbullets/medbullets_llmjudge_gen_60c8f5.py b/opencompass/configs/datasets/Medbullets/medbullets_llmjudge_gen_60c8f5.py new file mode 100644 index 00000000..3081ab74 --- /dev/null +++ b/opencompass/configs/datasets/Medbullets/medbullets_llmjudge_gen_60c8f5.py @@ -0,0 +1,106 @@ +from opencompass.datasets import MedbulletsDataset, medbullets_llmjudge_postprocess +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.evaluator import GenericLLMEvaluator +import os + +SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this? 
+ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n' +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + : Q: {question}\nPlease select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n\n\n\n + : \n{label}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'question', + 'options', + 'question_type', + 'prompt_mode', + + ], + output_column='label', +) + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT), + ], + round=[ + dict( + role='HUMAN', + prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MedbulletsDataset, + path='opencompass/medbullets', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=medbullets_llmjudge_postprocess), + ), +) + + +medbullets_dataset = dict( + type=MedbulletsDataset, + abbr='medbullets', + path='opencompass/medbullets', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + +) + +medbullets_datasets = [medbullets_dataset] diff --git a/opencompass/configs/datasets/medmcqa/medmcqa_gen.py b/opencompass/configs/datasets/medmcqa/medmcqa_gen.py new file mode 100644 index 00000000..68148ae7 --- /dev/null +++ b/opencompass/configs/datasets/medmcqa/medmcqa_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .medmcqa_gen_60c8f5 import medmcqa_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/medmcqa/medmcqa_gen_60c8f5.py b/opencompass/configs/datasets/medmcqa/medmcqa_gen_60c8f5.py new file mode 100644 index 00000000..a0d8bb43 --- /dev/null +++ b/opencompass/configs/datasets/medmcqa/medmcqa_gen_60c8f5.py @@ -0,0 +1,58 @@ +from opencompass.datasets import MedmcqaDataset, MedmcqaEvaluator +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this? 
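A note on the judge-based evaluation above (the MedMCQA judge config later in this patch reuses the same GRADER_TEMPLATE): GenericLLMEvaluator renders the template for the model given in judge_cfg, which is left as an empty dict to be filled with a concrete judge model, and the dict_postprocessor reduces the judge's single-letter verdict to correct/incorrect, counting 'A' as correct and defaulting to 'B' when no letter is found. A minimal sketch of that verdict handling, mirroring the _generic_llmjudge_postprocess helper added below:

import re

def verdict_is_correct(judgement: str) -> bool:
    match = re.search(r'(A|B)', judgement)
    return (match.group(0) if match else 'B') == 'A'  # no letter found counts as INCORRECT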
+ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n' + +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'question', + 'options', + 'subject_name', + 'choice_type', + 'prompt_mode', + 'topic_name', + ], + output_column='label', +) + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT), + ], + round=[ + dict( + role='HUMAN', + prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict(type=MedmcqaEvaluator), + pred_role='BOT', +) +medmcqa_dataset = dict( + type=MedmcqaDataset, + abbr='medmcqa', + path='openlifescienceai/medmcqa', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + +) + +medmcqa_datasets = [medmcqa_dataset] diff --git a/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen.py b/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen.py new file mode 100644 index 00000000..f9c1b806 --- /dev/null +++ b/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .medmcqa_llmjudge_gen_60c8f5 import medmcqa_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen_60c8f5.py b/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen_60c8f5.py new file mode 100644 index 00000000..e96cfa28 --- /dev/null +++ b/opencompass/configs/datasets/medmcqa/medmcqa_llmjudge_gen_60c8f5.py @@ -0,0 +1,105 @@ +from opencompass.datasets import MedmcqaDataset, medmcqa_llmjudge_postprocess +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.evaluator import GenericLLMEvaluator + +SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this? +ZERO_SHOT_PROMPT = 'Q: {question}\n Please select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n' +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. 
Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : Q: {question}\nPlease select the correct answer from the options above and output only the corresponding letter (A, B, C, D, or E) without any explanation or additional text.\n\n\n\n + : \n{label}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + + +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'question', + 'options', + 'subject_name', + 'choice_type', + 'prompt_mode', + 'topic_name', + ], + output_column='label', +) + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT), + ], + round=[ + dict( + role='HUMAN', + prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MedmcqaDataset, + path='openlifescienceai/medmcqa', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=medmcqa_llmjudge_postprocess), + ), +) +medmcqa_dataset = dict( + type=MedmcqaDataset, + abbr='medmcqa', + path='openlifescienceai/medmcqa', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + +) + +medmcqa_datasets = [medmcqa_dataset] diff --git a/opencompass/datasets/Medbullets.py b/opencompass/datasets/Medbullets.py new file mode 100644 index 00000000..1e7e9a63 --- /dev/null +++ b/opencompass/datasets/Medbullets.py @@ -0,0 +1,243 @@ +import re + +import pandas as pd +from datasets import Dataset + +from opencompass.openicl import BaseEvaluator +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_data_path, get_logger + +from .base import BaseDataset + + +def _parse(item: dict, prompt_mode: str) -> dict: + # 构建选项列表,忽略空字符串的 ope + options_keys = ['opa', 'opb', 'opc', 'opd'] + if item.get('ope', '') != '': + options_keys.append('ope') + options_list = [item.get(k, '') for k 
in options_keys] + item['options'] = options_list + + # 构建带标号的选项字符串 + options_str = '\n'.join( + [f'{chr(65 + i)}. {opt}' for i, opt in enumerate(options_list)]) + + # 将选项附加到问题末尾 + item['question'] = f"{item.get('question', '')}\n{options_str}" + + # 标签及其他字段 + item['label'] = item.get('answer_idx') + item['prompt_mode'] = prompt_mode + item['start'] = chr(65) + item['end'] = chr(65 + len(options_list) - 1) + return item + + +@LOAD_DATASET.register_module() +class MedbulletsDataset(BaseDataset): + + @staticmethod + def load(path: str, prompt_mode: str = 'zero-shot', **kwargs): + # 读取 CSV 文件为 DataFrame,并将 NaN 转为空字符串 + path = get_data_path(path) + df = pd.read_csv(path, encoding='utf-8') + df = df.fillna('') + + # 转换为字典列表 + data_list = df.to_dict(orient='records') + + # 将数据列表包装为 Dataset + dataset = Dataset.from_list(data_list) + + # 根据提示模式进行解析 + if prompt_mode == 'zero-shot': + dataset = dataset.map(lambda item: _parse(item, prompt_mode)) + elif prompt_mode == 'few-shot': + pass # TODO: Implement few-shot prompt handling + return dataset + + +class MedbulletsEvaluator(BaseEvaluator): + + def score(self, predictions, references, test_set): + method = test_set['prompt_mode'][0] + + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + correct = 0 + count = 0 + details = [] + for idx, (i, j) in enumerate(zip(predictions, references)): + i = answer_cleansing(method, i, test_set['options'][idx], + test_set['label'][idx]) + detail = { + 'pred': i, + 'answer': j, + 'correct': False, + 'question_type': test_set['question_type'][idx] + } + count += 1 + if i == j: + correct += 1 + detail['correct'] = True + details.append(detail) + result = {'accuracy': 100 * correct / count, 'details': details} + return result + + +@TEXT_POSTPROCESSORS.register_module() +def answer_cleansing( + method: str, + prediction: str, + options: list, + label: str, +) -> str: + + # Clean up unwanted phrases in the prediction + for unwanted_phrase in [ + 'I understand', + 'A through J', + 'A through E', + 'A through D', + ]: + prediction = prediction.replace(unwanted_phrase, '') + + options_num = len(options) + options = [chr(65 + i) for i in range(options_num)] + options_str = r'\b(' + '|'.join(options) + r')\b' + prediction = re.findall(options_str, prediction) + + if len(prediction) == 0: + prediction = [] + return prediction + else: + # If there is a "label" and its length is 1, + # process prediction accordingly + if len(label) == 1: + if method == 'few-shot': + answer_flag = True if len(prediction) > 1 else False + # choose the first or last element based on the answer_flag + if answer_flag: + prediction = [prediction[0]] + else: + prediction = [prediction[-1]] + elif method == 'zero-shot': + # choose the first element in list + prediction = [prediction[0]] + else: + raise ValueError('Method is not properly defined ...') + + # Remove trailing period if it exists + if prediction[0] and prediction[0].endswith('.'): + prediction[0] = prediction[0][:-1] + + return prediction[0] + + +def _generic_llmjudge_postprocess(judgement: str): + match = re.search(r'(A|B)', judgement) + grade_letter = (match.group(0) if match else 'B' + ) # Default to "INCORRECT" if no match + return grade_letter + + +def medbullets_llmjudge_postprocess( + output: dict, + output_path: str, + dataset: Dataset, +) -> dict: + original_dataset = dataset.reader.dataset['test'] + + judged_answers = [] + original_responses = [] + references = [] + details = [] + + # Initialize statistics dictionaries + stats = 
{'question_type': {}} + + total_correct = 0 + total_count = 0 + + # Process each sample + for k, v in output.items(): + idx = int(k) # Convert key to integer for indexing + original_responses.append(v['prediction']) + processed_judge = _generic_llmjudge_postprocess(v['prediction']) + + # Get category information from the dataset + sample = original_dataset[idx] + question_type = sample.get('question_type', 'unknown') + + # Initialize category stats if not exists + for level, key in [ + ('question_type', question_type), + ]: + if key not in stats[level]: + stats[level][key] = {'correct': 0, 'total': 0} + + # Record the judgment + if processed_judge is not None: + judged_answers.append(processed_judge) + try: + gold = v['gold'] + references.append(gold) + except KeyError: + get_logger().warning( + f'No gold answer for {k}, use empty string as reference!') + gold = '' + references.append('') + + # Check if the answer is correct (A means correct) + is_correct = processed_judge == 'A' + total_count += 1 + + if is_correct: + total_correct += 1 + # Update category stats + for level, key in [ + ('question_type', question_type), + ]: + stats[level][key]['correct'] += 1 + + # Update category totals + for level, key in [ + ('question_type', question_type), + ]: + stats[level][key]['total'] += 1 + # Add to details + details.append({ + 'id': k, + 'origin_prompt': v['origin_prompt'], + 'llm_judge': processed_judge, + 'gold': gold, + 'is_correct': is_correct, + 'question_type': question_type, + }) + + # Calculate overall accuracy with two decimal places + overall_accuracy = (round( + (total_correct / total_count * 100), 2) if total_count > 0 else 0.00) + + # Initialize results dictionary + results = { + 'accuracy': overall_accuracy, + 'total_correct': total_correct, + 'total_count': total_count, + 'details': details, + } + + # Calculate accuracy for each category and flatten into results + for level in stats: + for key, value in stats[level].items(): + if value['total'] > 0: + # Calculate accuracy with two decimal places + accuracy = round((value['correct'] / value['total'] * 100), 2) + + # Create a flattened key for the category + flat_key = f'Medbullets-{key}' + + # Add to results + results[flat_key] = accuracy + + return results diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 92cda579..dfbc20ca 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -99,8 +99,10 @@ from .mathbench import * # noqa: F401, F403 from .mbpp import * # noqa: F401, F403 from .mbpp_pro import * # noqa: F401, F403 from .medbench import * # noqa: F401, F403 +from .Medbullets import * # noqa: F401, F403 from .MedCalc_Bench import MedCalc_BenchDataset # noqa: F401 from .MedCalc_Bench import MedCalcOfficial_Evaluator # noqa: F401 +from .medmcqa import * # noqa: F401, F403 from .MedQA import * # noqa: F401, F403 from .MedXpertQA import * # noqa: F401, F403 from .mgsm import * # noqa: F401, F403 diff --git a/opencompass/datasets/medmcqa.py b/opencompass/datasets/medmcqa.py new file mode 100644 index 00000000..abb397a7 --- /dev/null +++ b/opencompass/datasets/medmcqa.py @@ -0,0 +1,247 @@ +import re + +from datasets import Dataset, load_dataset + +from opencompass.openicl import BaseEvaluator +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_logger + +from .base import BaseDataset + + +def _parse(item, prompt_mode): + options_list = [item['opa'], item['opb'], item['opc'], item['opd']] + item['options'] = 
options_list + + # 构建带标号的选项字符串 + options_str = '\n'.join( + [f'{chr(65 + i)}. {opt}' for i, opt in enumerate(options_list)]) + + # 将选项附加到问题末尾 + item['question'] = f"{item['question']}\n{options_str}" + + item['label'] = chr(65 + item['cop']) + item['subject_name'] = item['subject_name'].replace('_', ' ') + item['prompt_mode'] = prompt_mode + item['start'] = chr(65) + item['end'] = chr(65 + len(options_list) - 1) # 使用实际选项数量 + return item + + +@LOAD_DATASET.register_module() +class MedmcqaDataset(BaseDataset): + + @staticmethod + def load(path: str, prompt_mode: str = 'zero-shot', **kwargs): + dataset = load_dataset(path=path, + split='validation', + trust_remote_code=True) + + if prompt_mode == 'zero-shot': + dataset = dataset.map(lambda item: _parse(item, prompt_mode)) + elif prompt_mode == 'few-shot': + pass # TODO: Implement few-shot prompt + + return dataset + + +class MedmcqaEvaluator(BaseEvaluator): + + def score(self, predictions, references, test_set): + method = test_set['prompt_mode'][0] + + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + correct = 0 + count = 0 + details = [] + for idx, (i, j) in enumerate(zip(predictions, references)): + i = answer_cleansing(method, i, test_set['options'][idx], + test_set['label'][idx]) + detail = { + 'pred': i, + 'answer': j, + 'correct': False, + 'subject_name': test_set['subject_name'][idx], + 'topic_name': test_set['topic_name'][idx], + 'choice_type': test_set['choice_type'][idx] + } + count += 1 + if i == j: + correct += 1 + detail['correct'] = True + details.append(detail) + result = {'accuracy': 100 * correct / count, 'details': details} + return result + + +@TEXT_POSTPROCESSORS.register_module() +def answer_cleansing( + method: str, + prediction: str, + options: list, + label: str, +) -> str: + + # Clean up unwanted phrases in the prediction + for unwanted_phrase in [ + 'I understand', + 'A through J', + 'A through E', + 'A through D', + ]: + prediction = prediction.replace(unwanted_phrase, '') + + options_num = len(options) + options = [chr(65 + i) for i in range(options_num)] + options_str = r'\b(' + '|'.join(options) + r')\b' + prediction = re.findall(options_str, prediction) + + if len(prediction) == 0: + prediction = [] + return prediction + else: + # If there is a "label" and its length is 1, + # process prediction accordingly + if len(label) == 1: + if method == 'few-shot': + answer_flag = True if len(prediction) > 1 else False + # choose the first or last element based on the answer_flag + if answer_flag: + prediction = [prediction[0]] + else: + prediction = [prediction[-1]] + elif method == 'zero-shot': + # choose the first element in list + prediction = [prediction[0]] + else: + raise ValueError('Method is not properly defined ...') + + # Remove trailing period if it exists + if prediction[0] and prediction[0].endswith('.'): + prediction[0] = prediction[0][:-1] + + return prediction[0] + + +def _generic_llmjudge_postprocess(judgement: str): + match = re.search(r'(A|B)', judgement) + grade_letter = (match.group(0) if match else 'B' + ) # Default to "INCORRECT" if no match + return grade_letter + + +def medmcqa_llmjudge_postprocess( + output: dict, + output_path: str, + dataset: Dataset, +) -> dict: + # Get the original dataset + original_dataset = dataset.reader.dataset['test'] + + judged_answers = [] + original_responses = [] + references = [] + details = [] + + # Initialize statistics dictionaries + stats = {'subject': {}, 'topic': {}, 'question_type': {}} + + total_correct 
= 0 + total_count = 0 + + # Process each sample + for k, v in output.items(): + idx = int(k) # Convert key to integer for indexing + original_responses.append(v['prediction']) + + processed_judge = _generic_llmjudge_postprocess(v['prediction']) + + # Get category information from the dataset + sample = original_dataset[idx] + subject = sample.get('subject_name', 'unknown') + question_type = sample.get('choice_type', 'unknown') + topic = sample.get('topic_name', 'unknown') + + # Initialize category stats if not exists + for level, key in [ + ('subject', subject), + ('question_type', question_type), + ('topic', topic), + ]: + if key not in stats[level]: + stats[level][key] = {'correct': 0, 'total': 0} + + # Record the judgment + if processed_judge is not None: + judged_answers.append(processed_judge) + try: + gold = v['gold'] + references.append(gold) + except KeyError: + get_logger().warning( + f'No gold answer for {k}, use empty string as reference!') + gold = '' + references.append('') + + # Check if the answer is correct (A means correct) + is_correct = processed_judge == 'A' + total_count += 1 + + if is_correct: + total_correct += 1 + # Update category stats + for level, key in [ + ('subject', subject), + ('question_type', question_type), + ('topic', topic), + ]: + stats[level][key]['correct'] += 1 + + # Update category totals + for level, key in [ + ('subject', subject), + ('question_type', question_type), + ('topic', topic), + ]: + stats[level][key]['total'] += 1 + # Add to details + details.append({ + 'id': k, + 'question': sample['question'], + 'options': sample['options'], + 'origin_prompt': v['origin_prompt'], + 'llm_judge': processed_judge, + 'gold': gold, + 'is_correct': is_correct, + 'subject': subject, + 'question_type': question_type, + 'topic': topic, + }) + + # Calculate overall accuracy with two decimal places + overall_accuracy = (round( + (total_correct / total_count * 100), 2) if total_count > 0 else 0.00) + + # Initialize results dictionary + results = { + 'accuracy': overall_accuracy, + 'total_correct': total_correct, + 'total_count': total_count, + 'details': details, + } + + # Calculate accuracy for each category and flatten into results + for level in stats: + for key, value in stats[level].items(): + if value['total'] > 0: + # Calculate accuracy with two decimal places + accuracy = round((value['correct'] / value['total'] * 100), 2) + + # Create a flattened key for the category + flat_key = f'medmcqa-{key}' + + # Add to results + results[flat_key] = accuracy + + return results diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index ce12af64..af814eb8 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -461,6 +461,12 @@ DATASETS_MAPPING = { "hf_id": "", "local": "./data/mbpp_pro/mbpp_pro.json", }, + "opencompass/medbullets": { + "ms_id": "", + "hf_id": "", + "local": "./data/medbullets/medbullets.csv", + }, + } DATASETS_URL = { @@ -826,4 +832,8 @@ DATASETS_URL = { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp_pro.zip", "md5": "eac330b8a0a8687f006265c9383503ce", }, + "medbullets": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/medbullets.zip", + "md5": "b63130999c1f28d57acba1c7852639f8", + }, } From 3d1760aba23e81284210d085a7e75b8dfae4051d Mon Sep 17 00:00:00 2001 From: tcheng Date: Wed, 14 May 2025 10:25:03 +0800 Subject: [PATCH 24/28] [Dataset] Add Scieval (#2089) * style: pass all formatting hooks (yapf & quote fixer) * revise 
name:Add Lifescience Sub-set Support for MMLU & SciEval (datasets + configs + loader) * revise name:Add Lifescience SciEval (datasets + configs + loader+dataset-index.yml) * Add Lifescience SciEval (datasets + configs + loader+dataset-index.yml) * all categories of SciEval (datasets + configs + loader+dataset-index.yml) * revise name:Add Lifescience SciEval (datasets + configs + loader+dataset-index.yml) * revise :SciEval 5shot --------- Co-authored-by: root --- dataset-index.yml | 6 + .../SciEval/SciEval_5shot_gen_4043d4.py | 65 +++++++++ .../SciEval_5shot_llmjudge_gen_b7b684.py | 130 ++++++++++++++++++ .../SciEval/SciEval_lifescience_sets.py | 6 + opencompass/datasets/SciEval.py | 68 +++++++++ opencompass/datasets/__init__.py | 1 + 6 files changed, 276 insertions(+) create mode 100644 opencompass/configs/datasets/SciEval/SciEval_5shot_gen_4043d4.py create mode 100644 opencompass/configs/datasets/SciEval/SciEval_5shot_llmjudge_gen_b7b684.py create mode 100644 opencompass/configs/datasets/SciEval/SciEval_lifescience_sets.py create mode 100644 opencompass/datasets/SciEval.py diff --git a/dataset-index.yml b/dataset-index.yml index 984d34a6..05f7ed5e 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -719,6 +719,12 @@ paper: https://arxiv.org/pdf/2009.03300 configpath: opencompass/configs/datasets/mmlu/mmlu_gen.py configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py +- SciEval: + name: SciEval + category: Understanding + paper: https://arxiv.org/pdf/2308.13149 + configpath: opencompass/configs/datasets/SciEval/SciEval_gen.py + configpath_llmjudge: opencompass/configs/datasets/SciEval/SciEval_llm_judge_gen.py - mmlu_cf: name: MMLU-CF category: Understanding diff --git a/opencompass/configs/datasets/SciEval/SciEval_5shot_gen_4043d4.py b/opencompass/configs/datasets/SciEval/SciEval_5shot_gen_4043d4.py new file mode 100644 index 00000000..645e744b --- /dev/null +++ b/opencompass/configs/datasets/SciEval/SciEval_5shot_gen_4043d4.py @@ -0,0 +1,65 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.utils.text_postprocessors import first_option_postprocess +from opencompass.datasets import SciEvalDataset + +# 只评测 biology + multiple-choice 的 test split +_hint = ('Given a question and four options, please select the right answer. ' + "Your answer should be 'A', 'B', 'C' or 'D'.") +category = [ + 'biology', +] + +scieval_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='test', +) + +scieval_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ' + ), + dict(role='BOT', prompt='{target}\n') + ]), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict( + role='HUMAN', + prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. 
{{D}}\nAnswer: ' + ), + ], + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), +) + +scieval_eval_cfg = dict( + evaluator=dict(type=AccwithDetailsEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), +) + +scieval_datasets = [ + dict( + abbr='scieval_biology', + type=SciEvalDataset, + path='OpenDFM/SciEval', + name='default', + category=category, + reader_cfg=scieval_reader_cfg, + infer_cfg=scieval_infer_cfg, + eval_cfg=scieval_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/SciEval/SciEval_5shot_llmjudge_gen_b7b684.py b/opencompass/configs/datasets/SciEval/SciEval_5shot_llmjudge_gen_b7b684.py new file mode 100644 index 00000000..8899a031 --- /dev/null +++ b/opencompass/configs/datasets/SciEval/SciEval_5shot_llmjudge_gen_b7b684.py @@ -0,0 +1,130 @@ +# SciEval_lifescience_llmjudge_gen.py + +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.utils.text_postprocessors import match_answer_pattern +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import SciEvalDataset + +with read_base(): + from .SciEval_lifescience_sets import SciEval_lifescience_subsets + +category = [ + 'biology', +] + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. + +{input} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + +GRADER_TEMPLATE = """ +Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + +Here are some evaluation criteria: +1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. +2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. +3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. +4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + +Please judge whether the following answers are consistent with the standard answer based on the above criteria. 
Grade the predicted answer of this new question as one of: +A: CORRECT +B: INCORRECT +Just return the letters "A" or "B", with no text around it. + +Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + +: {input} +A) {A} +B) {B} +C) {C} +D) {D} + + +: +{target} + + +: +{prediction} + + +Judging the correctness of candidates' answers: +""".strip() + +scieval_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='test', +) + +scieval_datasets = [] +for name in SciEval_lifescience_subsets: + scieval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ] + ) + ), + retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]), + inferencer=dict(type=GenInferencer), + ) + + scieval_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt=( + 'You are a helpful assistant who evaluates the correctness ' + "and quality of models' outputs." + ), + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=SciEvalDataset, + path='OpenDFM/SciEval', + name='default', + reader_cfg=scieval_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', + ) + + scieval_datasets.append( + dict( + abbr=f'scieval_lifescience_{name}_llmjudge', + type=SciEvalDataset, + path='OpenDFM/SciEval', + name='default', + category=category, + reader_cfg=scieval_reader_cfg, + infer_cfg=scieval_infer_cfg, + eval_cfg=scieval_eval_cfg, + mode='singlescore', + ) + ) diff --git a/opencompass/configs/datasets/SciEval/SciEval_lifescience_sets.py b/opencompass/configs/datasets/SciEval/SciEval_lifescience_sets.py new file mode 100644 index 00000000..8cf9e540 --- /dev/null +++ b/opencompass/configs/datasets/SciEval/SciEval_lifescience_sets.py @@ -0,0 +1,6 @@ +SciEval_lifescience_subsets = [ + 'biology', # 大学生物学 + 'physics', + 'chemistry' + +] diff --git a/opencompass/datasets/SciEval.py b/opencompass/datasets/SciEval.py new file mode 100644 index 00000000..593e3183 --- /dev/null +++ b/opencompass/datasets/SciEval.py @@ -0,0 +1,68 @@ +import re +from typing import List + +from datasets import Dataset, DatasetDict, load_dataset + +from opencompass.datasets.base import BaseDataset +from opencompass.registry import LOAD_DATASET + +# 预编译的多选题正则,按 PEP-8 每行 < 79 字符 +_PATTERN_MC = ( + r'^(?P.*?)' # 题干 + r'(?:A\.)\s*(?P.*?)\s*' # 选项 A + r'B\.\s*(?P.*?)\s*' # 选项 B + r'C\.\s*(?P.*?)\s*' # 选项 C + r'D\.\s*(?P.*?)' # 选项 D + r'Answer:' # 答案分隔符 +) + + +@LOAD_DATASET.register_module() +class SciEvalDataset(BaseDataset): + """多选题子集,支持所有类别(可选指定 category 过滤)""" + + @staticmethod + def load(path: str, name: str, **kwargs) -> DatasetDict: + # 如果传入 category,则仅保留该类别,否则包含所有类别 + category = kwargs.get('category') + dataset: DatasetDict = DatasetDict() + + for split in ('test', ): + raw_iter = load_dataset( + path, + name=name, + split=split, + streaming=True, + ) + examples: List[dict] = [] + + for ex in raw_iter: + # 仅保留多选题 + if ex.get('type') != 'multiple-choice': + continue + # 如指定了 category,则进行过滤 + if category is not None \ + and ex.get('category') != category: + continue + + ans_list = (ex.get('answer') or ex.get('answers') or []) + if not ans_list: + continue + 
target = ans_list[0] + + match = re.search(_PATTERN_MC, ex.get('question', ''), re.S) + if not match: + continue + + examples.append({ + 'input': match.group('stem').strip(), + 'A': match.group('A').strip(), + 'B': match.group('B').strip(), + 'C': match.group('C').strip(), + 'D': match.group('D').strip(), + 'target': target, + }) + + dataset[split] = Dataset.from_list(examples) + + return dataset diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index dfbc20ca..b1753221 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -138,6 +138,7 @@ from .ruler import * # noqa: F401, F403 from .safety import * # noqa: F401, F403 from .scibench import ScibenchDataset, scibench_postprocess # noqa: F401, F403 from .scicode import * # noqa: F401, F403 +from .SciEval import SciEvalDataset # noqa: F401 from .SciKnowEval import * # noqa: F401, F403 from .simpleqa import * # noqa: F401, F403 from .siqa import * # noqa: F401, F403 From 6f3b6a5d12578226237d8e2cf3360e64bb2f71f9 Mon Sep 17 00:00:00 2001 From: kkscilife <126147887+kkscilife@users.noreply.github.com> Date: Fri, 16 May 2025 14:34:57 +0800 Subject: [PATCH 25/28] [CI] Add gitleaks check (#2101) --- .pre-commit-config-zh-cn.yaml | 8 +++++++- .pre-commit-config.yaml | 8 +++++++- opencompass/models/glm.py | 2 +- opencompass/models/interntrain.py | 2 +- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config-zh-cn.yaml b/.pre-commit-config-zh-cn.yaml index 20a7d30c..14ac1a71 100644 --- a/.pre-commit-config-zh-cn.yaml +++ b/.pre-commit-config-zh-cn.yaml @@ -115,9 +115,15 @@ repos: args: - --root_folder - opencompass/configs/datasets + - repo: https://github.com/gitleaks/gitleaks + rev: v8.23.1 + hooks: + - id: gitleaks + entry: "gitleaks dir" + args: ["--verbose", "--redact=50"] # - repo: https://github.com/open-mmlab/pre-commit-hooks # rev: v0.2.0 # Use the ref you want to point at # hooks: # - id: check-algo-readme # - id: check-copyright - # args: ["mmocr", "tests", "tools"] # these directories will be checked \ No newline at end of file + # args: ["mmocr", "tests", "tools"] # these directories will be checked diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 55eb17ea..69941d1b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -116,9 +116,15 @@ repos: args: - --root_folder - opencompass/configs/datasets + - repo: https://github.com/gitleaks/gitleaks + rev: v8.23.1 + hooks: + - id: gitleaks + entry: "gitleaks dir" + args: ["--verbose", "--redact=50"] # - repo: https://github.com/open-mmlab/pre-commit-hooks # rev: v0.2.0 # Use the ref you want to point at # hooks: # - id: check-algo-readme # - id: check-copyright - # args: ["mmocr", "tests", "tools"] # these directories will be checked \ No newline at end of file + # args: ["mmocr", "tests", "tools"] # these directories will be checked diff --git a/opencompass/models/glm.py b/opencompass/models/glm.py index c7882946..27c9d032 100644 --- a/opencompass/models/glm.py +++ b/opencompass/models/glm.py @@ -75,7 +75,7 @@ class GLM130B(BaseModel): ['--vocab-size', '150528'], ['--num-attention-heads', '96'], ['--max-sequence-length', '2048'], - ['--tokenizer-type', 'icetk-glm-130B'], + ['--tokenizer-type', 'icetk-glm-130B'], #gitleaks:allow ['--layernorm-order', 'post'], ['--load', self.ckpt_path], ['--skip-init'], diff --git a/opencompass/models/interntrain.py b/opencompass/models/interntrain.py index 31cdba1d..1a9d0657 100644 --- a/opencompass/models/interntrain.py +++ 
b/opencompass/models/interntrain.py @@ -358,7 +358,7 @@ class InternTrain(BaseModel): output_text = self.batch_decode( outputs, eos_token_ids=self.generator.eos_token_id, - stopping_criteria=stopping_criteria) + stopping_criteria=stopping_criteria) #gitleaks:allow return output_text From 8c0ccf9a6bc0ad7f55bc07acc58324110a496ad7 Mon Sep 17 00:00:00 2001 From: kkscilife <126147887+kkscilife@users.noreply.github.com> Date: Fri, 16 May 2025 15:36:45 +0800 Subject: [PATCH 26/28] [CI] Fix Lint error (#2103) --- opencompass/models/glm.py | 2 +- opencompass/models/interntrain.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/opencompass/models/glm.py b/opencompass/models/glm.py index 27c9d032..3da487a5 100644 --- a/opencompass/models/glm.py +++ b/opencompass/models/glm.py @@ -75,7 +75,7 @@ class GLM130B(BaseModel): ['--vocab-size', '150528'], ['--num-attention-heads', '96'], ['--max-sequence-length', '2048'], - ['--tokenizer-type', 'icetk-glm-130B'], #gitleaks:allow + ['--tokenizer-type', 'icetk-glm-130B'], # gitleaks:allow ['--layernorm-order', 'post'], ['--load', self.ckpt_path], ['--skip-init'], diff --git a/opencompass/models/interntrain.py b/opencompass/models/interntrain.py index 1a9d0657..bfe3c2e3 100644 --- a/opencompass/models/interntrain.py +++ b/opencompass/models/interntrain.py @@ -358,7 +358,7 @@ class InternTrain(BaseModel): output_text = self.batch_decode( outputs, eos_token_ids=self.generator.eos_token_id, - stopping_criteria=stopping_criteria) #gitleaks:allow + stopping_criteria=stopping_criteria) # gitleaks:allow return output_text From 7a7a4517abebde3ace85980350aea711bdd4ada7 Mon Sep 17 00:00:00 2001 From: Dongsheng Zhu <59612926+Zhudongsheng75@users.noreply.github.com> Date: Mon, 19 May 2025 17:03:33 +0800 Subject: [PATCH 27/28] [Update] History code bench pass@k update (#2102) * bigcodebench * humaneval * humanevalx * humanevalx * livecodebench * mbpp * humaneval_plus * fix bug * template * max_out fix * template update --- examples/eval_codebench_full.py | 155 ++++++++++++++++ ...debench_full_instruct_repeat_gen_c3d5ad.py | 44 +++++ ...debench_hard_instruct_repeat_gen_c3d5ad.py | 48 +++++ ...l_openai_sample_evals_repeat_gen_dcae0e.py | 37 ++++ .../humaneval_plus_repeat_gen_41b01c.py | 39 ++++ .../humanevalx_repeat_gen_3d84a3.py | 43 +++++ ...bench_code_generation_repeat_gen_b5b6c5.py | 166 ++++++++++++++++++ ...=> livecodebench_time_split_gen_a4f90b.py} | 0 .../datasets/mbpp/mbpp_repeat_gen_18dd1b.py | 44 +++++ .../configs/summarizers/groups/multipl_e.py | 6 + .../datasets/bigcodebench/bigcodebench.py | 6 +- opencompass/datasets/humaneval.py | 4 +- .../datasets/livecodebench/evaluator.py | 35 +++- 13 files changed, 617 insertions(+), 10 deletions(-) create mode 100644 examples/eval_codebench_full.py create mode 100644 opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_repeat_gen_c3d5ad.py create mode 100644 opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_repeat_gen_c3d5ad.py create mode 100644 opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_repeat_gen_dcae0e.py create mode 100644 opencompass/configs/datasets/humaneval_plus/humaneval_plus_repeat_gen_41b01c.py create mode 100644 opencompass/configs/datasets/humanevalx/humanevalx_repeat_gen_3d84a3.py create mode 100644 opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py rename opencompass/configs/datasets/livecodebench/{livecodebench_time_split_gen.py => livecodebench_time_split_gen_a4f90b.py} (100%) create 
mode 100644 opencompass/configs/datasets/mbpp/mbpp_repeat_gen_18dd1b.py create mode 100644 opencompass/configs/summarizers/groups/multipl_e.py diff --git a/examples/eval_codebench_full.py b/examples/eval_codebench_full.py new file mode 100644 index 00000000..a4a4e78b --- /dev/null +++ b/examples/eval_codebench_full.py @@ -0,0 +1,155 @@ +# This config is used to test all the code benchmarks +from mmengine.config import read_base +import os.path as osp +from opencompass.runners import LocalRunner, VOLCRunner +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + +with read_base(): + # Datasets Part + # bigcodebench + from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen import ( + bigcodebench_full_instruct_datasets + ) + from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen import ( + bigcodebench_hard_instruct_datasets + ) + # livecodebench code generation lite v5 + from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen_a4f90b import ( + LCB_datasets + ) + # huamneval series + from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import ( + humaneval_datasets + ) + from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import ( + humanevalpro_datasets + ) + from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import ( + humanevalx_datasets + ) + from opencompass.configs.datasets.humaneval_plus.humaneval_plus_gen import ( + humaneval_plus_datasets + ) + # mbpp series + from opencompass.configs.datasets.mbpp.mbpp_gen import ( + mbpp_datasets + ) + from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import ( + mbpppro_datasets + ) + # multipl-e + from opencompass.configs.datasets.multipl_e.multiple_gen import ( + multiple_datasets + ) + # ds1000 + from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import ( + ds1000_datasets + ) + + # Models Part + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import ( + models as lmdeploy_qwen2_5_7b_instruct_model, + ) + + # Summary Groups + from opencompass.configs.summarizers.groups.ds1000 import ( + ds1000_summary_groups, + ) + from opencompass.configs.summarizers.groups.multipl_e import ( + multiple_summary_groups, + ) + from opencompass.configs.summarizers.groups.humanevalx import ( + humanevalx_summary_groups, + ) + +# models config +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +for model in models: + model['max_seq_len'] = 16384 + model['max_out_len'] = 8192 + +# datasets config +datasets = sum( + (v for k, v in locals().items() if k.endswith('_datasets')), + [], +) + +for item in humanevalx_datasets: + item['eval_cfg']['evaluator'][ + 'ip_address' + ] = 'codeeval.opencompass.org.cn/humanevalx' + item['eval_cfg']['evaluator']['port'] = '' +for item in ds1000_datasets: + item['eval_cfg']['evaluator'][ + 'ip_address' + ] = 'codeeval.opencompass.org.cn/ds1000' + item['eval_cfg']['evaluator']['port'] = '' + + +for dataset in datasets: + dataset['infer_cfg']['inferencer']['max_out_len'] = 8192 + + +# summary +summary_groups = sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], [] +) +summary_groups.append( + {'name': 'humanevalx', + 'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-java', 'humanevalx-js']} +) +summarizer = dict( + dataset_abbrs = [ + ['bigcodebench_hard_instruct', 'pass@1'], + ['bigcodebench_full_instruct', 
'pass@1'], + ['lcb_code_generation', 'pass@1'], + ['openai_humaneval', 'humaneval_pass@1'], + ['mbpp', 'score'], + ['humaneval_pro', 'pass@1'], + ['mbpp_pro', 'pass@1'], + ['humaneval_plus', 'humaneval_plus_pass@1'], + ['multiple', 'naive_average'], + ['humanevalx', 'naive_average'], + ['ds1000', 'naive_average'], + '', + 'humanevalx-python', + 'humanevalx-cpp', + 'humanevalx-java', + 'humanevalx-js', + '', + 'ds1000_Pandas', + 'ds1000_Numpy', + 'ds1000_Tensorflow', + 'ds1000_Scipy', + 'ds1000_Sklearn', + 'ds1000_Pytorch', + 'ds1000_Matplotlib', + '', + 'humaneval-multiple-cpp', + 'humaneval-multiple-cs', + 'humaneval-multiple-go', + 'humaneval-multiple-java', + 'humaneval-multiple-rb', + 'humaneval-multiple-js', + 'humaneval-multiple-php', + 'humaneval-multiple-r', + 'humaneval-multiple-rs', + 'humaneval-multiple-sh', + '', + 'mbpp-multiple-cpp', + 'mbpp-multiple-cs', + 'mbpp-multiple-go', + 'mbpp-multiple-java', + 'mbpp-multiple-rb', + 'mbpp-multiple-js', + 'mbpp-multiple-php', + 'mbpp-multiple-r', + 'mbpp-multiple-rs', + 'mbpp-multiple-sh' + ], + summary_groups=summary_groups, +) + +work_dir = 'outputs/code' diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_repeat_gen_c3d5ad.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_repeat_gen_c3d5ad.py new file mode 100644 index 00000000..13b8ad12 --- /dev/null +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_repeat_gen_c3d5ad.py @@ -0,0 +1,44 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator) + +bigcodebench_full_reader_cfg = dict( + input_columns=['instruct_prompt'], + output_column='test', +) + +bigcodebench_full_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[dict(role='system', fallback_role='HUMAN', prompt='')], + round=[ + dict(role='HUMAN', prompt='{instruct_prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +bigcodebench_full_eval_cfg = dict( + evaluator=dict( + type=BigCodeBenchEvaluator, + release_version='v0.1.2', + eval_type='instruct', + # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + remote_execute_api= + 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501 + dataset_version='full', + ), + pred_role='BOT', +) + +bigcodebench_full_instruct_datasets = [ + dict(abbr='bigcodebench_full_instruct', + type=BigCodeBenchDataset, + path='opencompass/bigcodebench', + reader_cfg=bigcodebench_full_reader_cfg, + infer_cfg=bigcodebench_full_infer_cfg, + eval_cfg=bigcodebench_full_eval_cfg, + release_version='v0.1.2', + n=5, + k=3) +] diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_repeat_gen_c3d5ad.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_repeat_gen_c3d5ad.py new file mode 100644 index 00000000..5baa55c0 --- /dev/null +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_repeat_gen_c3d5ad.py @@ -0,0 +1,48 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator) + +bigcodebench_hard_reader_cfg = dict( + 
input_columns=['instruct_prompt'], + output_column='test', +) + +bigcodebench_hard_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[dict(role='system', fallback_role='HUMAN', prompt='')], + round=[ + dict(role='HUMAN', prompt='{instruct_prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +bigcodebench_hard_eval_cfg = dict( + evaluator=dict( + type=BigCodeBenchEvaluator, + release_version='v0.1.2', + eval_type='instruct', + # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + remote_execute_api= + 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501 + dataset_version='hard', + ), + pred_role='BOT', +) + +bigcodebench_hard_instruct_datasets = [ + dict( + abbr='bigcodebench_hard_instruct', + type=BigCodeBenchDataset, + path='opencompass/bigcodebench', + reader_cfg=bigcodebench_hard_reader_cfg, + infer_cfg=bigcodebench_hard_infer_cfg, + eval_cfg=bigcodebench_hard_eval_cfg, + release_version='v0.1.2', + dataset_version='hard', + n=5, + k=3 + ) +] diff --git a/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_repeat_gen_dcae0e.py b/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_repeat_gen_dcae0e.py new file mode 100644 index 00000000..e3cf117a --- /dev/null +++ b/opencompass/configs/datasets/humaneval/humaneval_openai_sample_evals_repeat_gen_dcae0e.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalEvaluator, humaneval_postprocess_v2 + +humaneval_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Read the following function signature and docstring, and fully implement the function described. 
Your response should only contain the code for this function.\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +humaneval_eval_cfg = dict( + evaluator=dict(type=HumanEvalEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_datasets = [ + dict( + abbr='openai_humaneval', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_reader_cfg, + infer_cfg=humaneval_infer_cfg, + eval_cfg=humaneval_eval_cfg, + n=5, + k=3) +] diff --git a/opencompass/configs/datasets/humaneval_plus/humaneval_plus_repeat_gen_41b01c.py b/opencompass/configs/datasets/humaneval_plus/humaneval_plus_repeat_gen_41b01c.py new file mode 100644 index 00000000..3d20e6a0 --- /dev/null +++ b/opencompass/configs/datasets/humaneval_plus/humaneval_plus_repeat_gen_41b01c.py @@ -0,0 +1,39 @@ +# THIS SHALL ALSO BE DEPRECATED +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalDataset, HumanEvalPlusEvaluator, humaneval_postprocess_v2 + +humaneval_plus_reader_cfg = dict( + input_columns=['prompt'], output_column='task_id', train_split='test') + +# TODO: allow empty output-column +humaneval_plus_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='Complete the following python code:\n{prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +humaneval_plus_eval_cfg = dict( + evaluator=dict(type=HumanEvalPlusEvaluator), + pred_role='BOT', + k=[1, 10, 100], # the parameter only for humaneval + pred_postprocessor=dict(type=humaneval_postprocess_v2), +) + +humaneval_plus_datasets = [ + dict( + abbr='humaneval_plus', + type=HumanevalDataset, + path='opencompass/humaneval', + reader_cfg=humaneval_plus_reader_cfg, + infer_cfg=humaneval_plus_infer_cfg, + eval_cfg=humaneval_plus_eval_cfg, + n=5, + k=3) +] diff --git a/opencompass/configs/datasets/humanevalx/humanevalx_repeat_gen_3d84a3.py b/opencompass/configs/datasets/humanevalx/humanevalx_repeat_gen_3d84a3.py new file mode 100644 index 00000000..59602788 --- /dev/null +++ b/opencompass/configs/datasets/humanevalx/humanevalx_repeat_gen_3d84a3.py @@ -0,0 +1,43 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator + +humanevalx_reader_cfg = dict( + input_columns=['prompt'], output_column='declaration', train_split='test') + +humanevalx_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='{prompt}'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +humanevalx_eval_cfg_dict = { + lang : dict( + evaluator=dict( + type=HumanevalXEvaluator, + language=lang, + ip_address= + 'localhost', # replace to your code_eval_server ip_address, port + port=5001), # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server + pred_role='BOT') + for lang in ['python', 'cpp', 'go', 'java', 'js'] # do not support rust now +} + +# Please download the needed `xx.jsonl.gz` from +# https://github.com/THUDM/CodeGeeX2/tree/main/benchmark/humanevalx +# and move them into `data/humanevalx/` folder 
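The repeat-generation configs in this patch only set the sampling parameters n=5 and k=3. If the evaluators aggregate the repeated samples with the standard unbiased pass@k estimator (Chen et al., 2021), the per-problem computation reduces to the sketch below; the aggregation code itself is not part of this diff, so treat this as an assumption rather than the project's exact implementation:

from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    # n generated samples, c of them passing, evaluated at budget k
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

pass_at_k(5, 2, 3)  # 5 samples, 2 correct -> pass@3 = 0.9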
+humanevalx_datasets = [ + dict( + type=HumanevalXDataset, + abbr=f'humanevalx-{lang}', + language=lang, + path='./data/humanevalx', + reader_cfg=humanevalx_reader_cfg, + infer_cfg=humanevalx_infer_cfg, + eval_cfg=humanevalx_eval_cfg_dict[lang], + n=5, + k=3) + for lang in ['python', 'cpp', 'go', 'java', 'js'] +] diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py b/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py new file mode 100644 index 00000000..b50b6ecc --- /dev/null +++ b/opencompass/configs/datasets/livecodebench/livecodebench_code_generation_repeat_gen_b5b6c5.py @@ -0,0 +1,166 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator +) +from opencompass.datasets.livecodebench import TestOutputPromptConstants + + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = f'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=prompt_template + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict( + type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + ), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg, + n=5, + k=3 +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='You are an expert at Python programming, code execution, test case generation, and fuzzing.' 
+ ), + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict( + type=LCBCodeExecutionEvaluator, + ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' + +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' + ) + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +lcb_test_output_eval_cfg = dict( + evaluator=dict( + type=LCBTestOutputEvaluator, + ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + # LCBCodeExecution_dataset, + # LCBTestOutput_dataset, +] diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py b/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen_a4f90b.py similarity index 100% rename from opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py rename to opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen_a4f90b.py diff --git a/opencompass/configs/datasets/mbpp/mbpp_repeat_gen_18dd1b.py b/opencompass/configs/datasets/mbpp/mbpp_repeat_gen_18dd1b.py new file mode 100644 index 00000000..98eee528 --- /dev/null +++ b/opencompass/configs/datasets/mbpp/mbpp_repeat_gen_18dd1b.py @@ -0,0 +1,44 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDataset, MBPPEvaluator + +mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. 
Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n'), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n'), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n'), + dict(role='BOT', prompt='[BEGIN]\n'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +mbpp_datasets = [ + dict( + type=MBPPDataset, + abbr='mbpp', + path='opencompass/mbpp', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg, + n=5, + k=3 + ) +] diff --git a/opencompass/configs/summarizers/groups/multipl_e.py b/opencompass/configs/summarizers/groups/multipl_e.py new file mode 100644 index 00000000..1d50c7b6 --- /dev/null +++ b/opencompass/configs/summarizers/groups/multipl_e.py @@ -0,0 +1,6 @@ +multiple_summary_groups = [] + +humaneval_multiple = ['humaneval-multiple-cpp', 'humaneval-multiple-cs', 'humaneval-multiple-go', 'humaneval-multiple-java', 'humaneval-multiple-rb', 'humaneval-multiple-js', 'humaneval-multiple-php', 'humaneval-multiple-r', 'humaneval-multiple-rs', 'humaneval-multiple-sh'] +mbpp_multiple = ['mbpp-multiple-cpp', 'mbpp-multiple-cs', 'mbpp-multiple-go', 'mbpp-multiple-java', 'mbpp-multiple-rb', 'mbpp-multiple-js', 'mbpp-multiple-php', 'mbpp-multiple-r', 'mbpp-multiple-rs', 'mbpp-multiple-sh'] +multiple_summary_groups.append({'name': 'multiple', 'subsets': humaneval_multiple}) +multiple_summary_groups.append({'name':'multiple','subsets': mbpp_multiple}) diff --git a/opencompass/datasets/bigcodebench/bigcodebench.py b/opencompass/datasets/bigcodebench/bigcodebench.py index 9ce3d196..59c030d4 100644 --- a/opencompass/datasets/bigcodebench/bigcodebench.py +++ b/opencompass/datasets/bigcodebench/bigcodebench.py @@ -188,7 +188,9 @@ class BigCodeBenchEvaluator(BaseEvaluator): while True: try: eval_client = Client(self.remote_execute_api, - httpx_kwargs=dict(proxies=proxies)) + httpx_kwargs=dict( + proxies=proxies, + 
timeout=httpx.Timeout(100.0))) results, pass_at_k = eval_client.predict( split=self.eval_type, samples=handle_file(submitted_contents_path), @@ -196,7 +198,7 @@ class BigCodeBenchEvaluator(BaseEvaluator): **self.eval_kwargs) break except (httpx.ReadTimeout, CancelledError): - logger.info('Read timeout error. Retrying in 4s...') + logger.info('Read timeout error. Retrying in 10s...') time.sleep(10) if 'pass@1' in pass_at_k.keys(): diff --git a/opencompass/datasets/humaneval.py b/opencompass/datasets/humaneval.py index 9788b638..af001716 100644 --- a/opencompass/datasets/humaneval.py +++ b/opencompass/datasets/humaneval.py @@ -183,13 +183,13 @@ def humaneval_postprocess_v2(text: str) -> str: blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL) if len(blocks) >= 1: text = blocks[0] - return text + return text.lstrip() def humaneval_postprocess_v3(text: str) -> str: blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL) if len(blocks) >= 1: text = blocks[-1] - return text + return text.lstrip() def humaneval_internal_v2_postprocess(text: str): if text.startswith(' ') and not text.startswith(' '): diff --git a/opencompass/datasets/livecodebench/evaluator.py b/opencompass/datasets/livecodebench/evaluator.py index 65867d47..e6afd838 100644 --- a/opencompass/datasets/livecodebench/evaluator.py +++ b/opencompass/datasets/livecodebench/evaluator.py @@ -248,6 +248,28 @@ class LCBCodeGenerationEvaluator(BaseEvaluator): end_date=end_date)['test'] self.extractor_version = extractor_version + def _build_results(self, extracted_predictions, metrics, eval_results, + final_metadata): + results = {} + results['pass@1'] = metrics.get('pass@1', 0.0) + details = [] + # Safely get the details list from metrics + r = metrics.get('details', {}).get('pass@1', []) + for i, (ep, er, fm) in enumerate( + zip(extracted_predictions.values(), eval_results.values(), + final_metadata)): + detail = { + 'extracted_prediction': + ep[0] if isinstance(ep, list) and ep else ep, + 'eval_result': er[0] if isinstance(er, list) and er else er, + 'final_metadata': fm[0] if isinstance(fm, list) and fm else fm + } + # Use r[i] if available, otherwise fallback to False + detail['correct'] = bool(r[i] == 100.0) if i < len(r) else False + details.append(detail) + results['details'] = details + return results + def score(self, predictions, references): if len(predictions) != len(references): return { @@ -295,13 +317,14 @@ class LCBCodeGenerationEvaluator(BaseEvaluator): num_process_evaluate=self.num_process_evaluate, timeout=self.timeout, ) - results = { - 'extracted_predictions': extracted_predictions, - 'eval_results': eval_results - } - results.update(metrics) + # results = { + # 'extracted_predictions': extracted_predictions, + # 'eval_results': eval_results + # } + # results.update(metrics) - return results + return self._build_results(extracted_predictions, metrics, + eval_results, final_metadata) def evaluate_score(args) -> list[bool]: From aa2b89b6f8b7c5448e47ed1aa3f12b04da1ff123 Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Tue, 20 May 2025 16:46:55 +0800 Subject: [PATCH 28/28] [Update] Add CascadeEvaluator with Data Replica (#2022) * Update CascadeEvaluator * Update CascadeEvaluator * Update CascadeEvaluator * Update Config * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update * Update --- README.md | 4 +- README_zh-CN.md | 4 +- dataset-index.yml | 2 +- docs/en/advanced_guides/llm_judge.md | 2 +- docs/en/advanced_guides/math_verify.md | 14 +- 
docs/zh_cn/advanced_guides/llm_judge.md | 2 +- docs/zh_cn/advanced_guides/math_verify.md | 14 +- examples/eval_cascade_evaluator.py | 9 +- examples/eval_qwen3.py | 142 ++++++++++++++++++ opencompass/cli/main.py | 23 ++- .../olymmath_cascade_eval_gen_97b203.py | 109 ++++++++++++++ ...piadBench_0shot_cascade_eval_gen_be8b13.py | 114 ++++++++++++++ ...py => aime2024_cascade_eval_gen_5e9f4f.py} | 77 ++++++---- ..._judge_gen.py => aime2024_llmjudge_gen.py} | 0 .../aime2025_cascade_eval_gen_5e9f4f.py | 115 ++++++++++++++ .../gpqa/gpqa_cascade_eval_gen_772ea0.py | 118 +++++++++++++++ ...h_single_0shot_cascade_eval_gen_56cf43.py} | 88 ++++++----- ...nch_hard_custom_cascade_eval_gen_4bce59.py | 120 +++++++++++++++ ...sonbench_llmverify_20250428_gen_0484cb.py} | 12 +- .../math/math_500_cascade_eval_gen_6ff468.py | 117 +++++++++++++++ .../configs/datasets/math/math_500_gen.py | 4 +- .../configs/datasets/math/math_gen_a58d9d.py | 4 +- .../math_prm800k_500_0shot_cot_gen_11c4b5.py | 4 +- ...mmlu_stem_0shot_cascade_eval_gen_216503.py | 127 ++++++++++++++++ .../omni_math_cascade_eval_gen_ccf9c0.py} | 74 ++++++--- opencompass/configs/summarizers/example.py | 27 ++-- opencompass/datasets/base.py | 3 + opencompass/datasets/korbench/korbench.py | 90 +++++++---- opencompass/datasets/math.py | 6 +- opencompass/datasets/musr/musr.py | 6 +- .../teval/evaluators/review_evaluator.py | 1 - opencompass/evaluator/__init__.py | 1 + opencompass/evaluator/cascade_evaluator.py | 48 +++++- .../evaluator/generic_llm_evaluator.py | 25 +-- .../math_evaluator.py | 4 +- opencompass/models/openai_api.py | 71 +++++---- opencompass/openicl/icl_evaluator/__init__.py | 1 - .../icl_evaluator/icl_base_evaluator.py | 55 +++++-- .../openicl/icl_evaluator/icl_hf_evaluator.py | 39 +++-- opencompass/partitioners/num_worker.py | 24 +-- opencompass/tasks/openicl_eval.py | 15 +- opencompass/utils/logging.py | 13 ++ opencompass/utils/run.py | 12 +- 43 files changed, 1471 insertions(+), 269 deletions(-) create mode 100644 examples/eval_qwen3.py create mode 100644 opencompass/configs/datasets/OlymMATH/olymmath_cascade_eval_gen_97b203.py create mode 100644 opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_cascade_eval_gen_be8b13.py rename opencompass/configs/datasets/aime2024/{aime2024_0shot_nocot_genericllmeval_xml_gen_2b9dc2.py => aime2024_cascade_eval_gen_5e9f4f.py} (74%) rename opencompass/configs/datasets/aime2024/{aime2024_llm_judge_gen.py => aime2024_llmjudge_gen.py} (100%) create mode 100644 opencompass/configs/datasets/aime2025/aime2025_cascade_eval_gen_5e9f4f.py create mode 100644 opencompass/configs/datasets/gpqa/gpqa_cascade_eval_gen_772ea0.py rename opencompass/configs/datasets/korbench/{korbench_single_0shot_genericllmeval_xml_gen_17854d.py => korbench_single_0shot_cascade_eval_gen_56cf43.py} (69%) create mode 100644 opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py rename opencompass/configs/datasets/livereasonbench/{livereasonbench_genericllmeval_xml_gen_f990de.py => livereasonbench_llmverify_20250428_gen_0484cb.py} (96%) create mode 100644 opencompass/configs/datasets/math/math_500_cascade_eval_gen_6ff468.py create mode 100644 opencompass/configs/datasets/mmlu/mmlu_stem_0shot_cascade_eval_gen_216503.py rename opencompass/configs/datasets/{aime2024/aime2024_0shot_nocot_llmjudge_gen_2b9dc2.py => omni_math/omni_math_cascade_eval_gen_ccf9c0.py} (71%) rename opencompass/{openicl/icl_evaluator => evaluator}/math_evaluator.py (98%) diff --git a/README.md b/README.md index 
28073c8f..5ba2ea1e 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through - **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, a flexible evaluation mechanism that allows multiple evaluators to work in sequence. This enables creating customized evaluation pipelines for complex assessment scenarios. Check out the [documentation](docs/en/advanced_guides/llm_judge.md) for more details! 🔥🔥🔥 - **\[2025.03.11\]** We have supported evaluation for `SuperGPQA` which is a great benchmark for measuring LLM knowledge ability 🔥🔥🔥 - **\[2025.02.28\]** We have added a tutorial for `DeepSeek-R1` series model, please check [Evaluating Reasoning Model](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥 -- **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥 +- **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHVerifyEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥 - **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks. - **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](examples/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it. - **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](examples/eval_musr.py) and give it a spin! 🔥🔥🔥 @@ -246,7 +246,7 @@ Currently, OpenCompass have provided standard recommended configurations for dat opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat # Recommended Evaluation Config based on LLM Judge -opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat +opencompass --datasets aime2024_llmjudge_gen --models hf_internlm2_5_1_8b_chat ``` If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`. 
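The `--max-num-worker` flag mentioned above roughly corresponds to the worker-based partitioning that the example configs in this series spell out in Python. A minimal sketch of that config-side setup, using the same building blocks as `examples/eval_qwen3.py` later in this patch (the worker count is illustrative):

```python
# Sketch: config-side counterpart of data-parallel inference.
from opencompass.partitioners import NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

infer = dict(
    # split every dataset into 8 shards so 8 inference workers can run in parallel
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
```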
diff --git a/README_zh-CN.md b/README_zh-CN.md index f70eb41e..c77f5f68 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -60,7 +60,7 @@ - **\[2025.04.01\]** OpenCompass 现已支持 `CascadeEvaluator`,允许多个评估器按顺序工作,可以为更复杂的评估场景创建自定义评估流程,查看[文档](docs/zh_cn/advanced_guides/llm_judge.md)了解具体用法!🔥🔥🔥 - **\[2025.03.11\]** 现已支持 `SuperGPQA` 覆盖285 个研究生学科的知识能力评测,欢迎尝试!🔥🔥🔥 - **\[2025.02.28\]** 我们为 `DeepSeek-R1` 系列模型添加了教程,请查看 [评估推理模型](docs/zh_cn/user_guides/deepseek_r1.md) 了解更多详情!🔥🔥🔥 -- **\[2025.02.15\]** 我们新增了两个实用的评测工具:用于LLM作为评判器的`GenericLLMEvaluator`和用于数学推理评估的`MATHEvaluator`。查看[LLM评判器](docs/zh_cn/advanced_guides/llm_judge.md)和[数学能力评测](docs/zh_cn/advanced_guides/general_math.md)文档了解更多详情!🔥🔥🔥 +- **\[2025.02.15\]** 我们新增了两个实用的评测工具:用于LLM作为评判器的`GenericLLMEvaluator`和用于数学推理评估的`MATHVerifyEvaluator`。查看[LLM评判器](docs/zh_cn/advanced_guides/llm_judge.md)和[数学能力评测](docs/zh_cn/advanced_guides/general_math.md)文档了解更多详情!🔥🔥🔥 - **\[2025.01.16\]** 我们现已支持 [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) 模型,该模型在推理、知识类任务上取得同量级最优性能,欢迎尝试。 - **\[2024.12.17\]** 我们提供了12月CompassAcademic学术榜单评估脚本 [CompassAcademic](configs/eval_academic_leaderboard_202412.py),你可以通过简单地配置复现官方评测结果。 - **\[2024.10.14\]** 现已支持OpenAI多语言问答数据集[MMMLU](https://huggingface.co/datasets/openai/MMMLU),欢迎尝试! 🔥🔥🔥 @@ -237,7 +237,7 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat # 基于LLM Judge的推荐配置 - opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat + opencompass --datasets aime2024_llmjudge_gen --models hf_internlm2_5_1_8b_chat ``` 此外,如果你想在多块 GPU 上使用模型进行推理,您可以使用 `--max-num-worker` 参数。 diff --git a/dataset-index.yml b/dataset-index.yml index 05f7ed5e..36e6847a 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -303,7 +303,7 @@ category: Examination paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024 configpath: opencompass/configs/datasets/aime2024/aime2024_gen.py - configpath_llmjudge: opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py + configpath_llmjudge: opencompass/configs/datasets/aime2024/aime2024_llmjudge_gen.py - anli: name: Adversarial NLI category: Reasoning diff --git a/docs/en/advanced_guides/llm_judge.md b/docs/en/advanced_guides/llm_judge.md index f7e09d78..f27b34da 100644 --- a/docs/en/advanced_guides/llm_judge.md +++ b/docs/en/advanced_guides/llm_judge.md @@ -278,7 +278,7 @@ Here's an example of how to configure the CascadeEvaluator: ```python # Define a rule-based evaluator -rule_evaluator = dict(type=MATHEvaluator) +rule_evaluator = dict(type=MATHVerifyEvaluator) # Define an LLM judge evaluator llm_judge_evaluator = dict( diff --git a/docs/en/advanced_guides/math_verify.md b/docs/en/advanced_guides/math_verify.md index da9cfd2f..1ddaacd1 100644 --- a/docs/en/advanced_guides/math_verify.md +++ b/docs/en/advanced_guides/math_verify.md @@ -2,7 +2,7 @@ ## Introduction -Mathematical reasoning is a crucial capability for large language models (LLMs). To evaluate a model's mathematical abilities, we need to test its capability to solve mathematical problems step by step and provide accurate final answers. OpenCompass provides a convenient way to evaluate mathematical reasoning through the CustomDataset and MATHEvaluator components. +Mathematical reasoning is a crucial capability for large language models (LLMs). To evaluate a model's mathematical abilities, we need to test its capability to solve mathematical problems step by step and provide accurate final answers. 
OpenCompass provides a convenient way to evaluate mathematical reasoning through the CustomDataset and MATHVerifyEvaluator components. ## Dataset Format @@ -61,7 +61,7 @@ math_infer_cfg = dict( ```python math_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator), + evaluator=dict(type=MATHVerifyEvaluator), ) ``` @@ -86,11 +86,11 @@ math_datasets = [ ] ``` -## MATHEvaluator +## MATHVerifyEvaluator -The MATHEvaluator is specifically designed to evaluate mathematical answers. It is developed based on the math_verify library, which provides mathematical expression parsing and verification capabilities, supporting extraction and equivalence verification for both LaTeX and general expressions. +The MATHVerifyEvaluator is specifically designed to evaluate mathematical answers. It is developed based on the math_verify library, which provides mathematical expression parsing and verification capabilities, supporting extraction and equivalence verification for both LaTeX and general expressions. -The MATHEvaluator implements: +The MATHVerifyEvaluator implements: 1. Extracts answers from both predictions and references using LaTeX extraction 2. Handles various LaTeX formats and environments @@ -133,7 +133,7 @@ Here's a complete example of how to set up math evaluation: from mmengine.config import read_base from opencompass.models import TurboMindModelwithChatTemplate from opencompass.datasets import CustomDataset -from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator +from opencompass.openicl.icl_evaluator.math_evaluator import MATHVerifyEvaluator from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer @@ -160,7 +160,7 @@ math_infer_cfg = dict( # Evaluation configuration math_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator), + evaluator=dict(type=MATHVerifyEvaluator), ) # Dataset configuration diff --git a/docs/zh_cn/advanced_guides/llm_judge.md b/docs/zh_cn/advanced_guides/llm_judge.md index 3cf9619b..80e0a111 100644 --- a/docs/zh_cn/advanced_guides/llm_judge.md +++ b/docs/zh_cn/advanced_guides/llm_judge.md @@ -277,7 +277,7 @@ OpenCompass还提供了级联评估器`CascadeEvaluator`,它结合了规则式 ```python # 定义规则式评估器 -rule_evaluator = dict(type=MATHEvaluator) +rule_evaluator = dict(type=MATHVerifyEvaluator) # 定义LLM评判器 llm_judge_evaluator = dict( diff --git a/docs/zh_cn/advanced_guides/math_verify.md b/docs/zh_cn/advanced_guides/math_verify.md index 8e8d2fa6..f93faf9a 100644 --- a/docs/zh_cn/advanced_guides/math_verify.md +++ b/docs/zh_cn/advanced_guides/math_verify.md @@ -2,7 +2,7 @@ ## 简介 -数学推理能力是大语言模型(LLMs)的一项关键能力。为了评估模型的数学能力,我们需要测试其逐步解决数学问题并提供准确最终答案的能力。OpenCompass 通过 CustomDataset 和 MATHEvaluator 组件提供了一种便捷的数学推理评测方式。 +数学推理能力是大语言模型(LLMs)的一项关键能力。为了评估模型的数学能力,我们需要测试其逐步解决数学问题并提供准确最终答案的能力。OpenCompass 通过 CustomDataset 和 MATHVerifyEvaluator 组件提供了一种便捷的数学推理评测方式。 ## 数据集格式 @@ -61,7 +61,7 @@ math_infer_cfg = dict( ```python math_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator), + evaluator=dict(type=MATHVerifyEvaluator), ) ``` @@ -86,11 +86,11 @@ math_datasets = [ ] ``` -## MATHEvaluator +## MATHVerifyEvaluator -MATHEvaluator 是专门设计用于评估数学答案的评测器。它基于 math_verify 库进行开发,该库提供了数学表达式解析和验证功能,支持 LaTeX 和一般表达式的提取与等价性验证。 +MATHVerifyEvaluator 是专门设计用于评估数学答案的评测器。它基于 math_verify 库进行开发,该库提供了数学表达式解析和验证功能,支持 LaTeX 和一般表达式的提取与等价性验证。 -MATHEvaluator 具有以下功能: +MATHVerifyEvaluator 具有以下功能: 1. 使用 LaTeX 提取器从预测和参考答案中提取答案 2. 
处理各种 LaTeX 格式和环境 @@ -133,7 +133,7 @@ MATHEvaluator 具有以下功能: from mmengine.config import read_base from opencompass.models import TurboMindModelwithChatTemplate from opencompass.datasets import CustomDataset -from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator +from opencompass.evaluator import MATHVerifyEvaluator from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer @@ -160,7 +160,7 @@ math_infer_cfg = dict( # 评测配置 math_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator), + evaluator=dict(type=MATHVerifyEvaluator), ) # 数据集配置 diff --git a/examples/eval_cascade_evaluator.py b/examples/eval_cascade_evaluator.py index 1c1b0980..ef11fbdd 100644 --- a/examples/eval_cascade_evaluator.py +++ b/examples/eval_cascade_evaluator.py @@ -7,9 +7,12 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator +from opencompass.evaluator import ( + GenericLLMEvaluator, + CascadeEvaluator, + MATHVerifyEvaluator, +) from opencompass.datasets import generic_llmjudge_postprocess -from opencompass.openicl.icl_evaluator import MATHEvaluator from opencompass.datasets import ( MATHDataset, math_postprocess_v2, @@ -94,7 +97,7 @@ llm_judge_evaluator = dict( judge_cfg=dict(), ) -rule_evaluator =dict(type=MATHEvaluator) +rule_evaluator =dict(type=MATHVerifyEvaluator) cascade_evaluator = dict(type=CascadeEvaluator, llm_evaluator=llm_judge_evaluator, rule_evaluator=rule_evaluator, diff --git a/examples/eval_qwen3.py b/examples/eval_qwen3.py new file mode 100644 index 00000000..eb600515 --- /dev/null +++ b/examples/eval_qwen3.py @@ -0,0 +1,142 @@ + +import os.path as osp +from opencompass.models import OpenAISDK +from mmengine.config import read_base +from opencompass.utils.text_postprocessors import extract_non_reasoning_content +from opencompass.runners import LocalRunner +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask + +with read_base(): + from opencompass.configs.datasets.aime2024.aime2024_cascade_eval_gen_5e9f4f import aime2024_datasets + from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import aime2025_datasets + from opencompass.configs.datasets.math.math_500_cascade_eval_gen_6ff468 import math_datasets + +####################################################################### +# PART 0 Meta Info # +####################################################################### + + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +], +) + + +judge_cfg = dict( + abbr='qwen2-5-32B-Instruct', + type=OpenAISDK, + path='Qwen/Qwen2.5-32B-Instruct', + key='sk-1234', + openai_api_base=[ + 'http://x.x.x.x:4000/v1', + ], + meta_template=api_meta_template, + query_per_second=8, + batch_size=256, + temperature=0.001, + # max_completion_tokens=32768, + tokenizer_path='gpt-4o-2024-05-13', + # verbose=True, + max_out_len=16384, + max_seq_len=32768, + # max_seq_len=49152, + mode='mid', + retry=10 +) + +####################################################################### +# PART 1 Datasets List # 
+####################################################################### + +repeated_info = [ + (math_datasets, 4), + (aime2024_datasets, 32), + (aime2025_datasets, 32), +] + +for datasets_, num in repeated_info: + for dataset_ in datasets_: + dataset_['n'] = num + +datasets = sum( + (v for k, v in locals().items() if k.endswith('_datasets')), + [], +) + +for item in datasets: + item['infer_cfg']['inferencer']['max_out_len'] = 32768 + try: + if 'judge_cfg' in item['eval_cfg']['evaluator']: + item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg + elif'judge_cfg' in item['eval_cfg']['evaluator']['llm_evaluator']: + item['eval_cfg']['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg + except: + pass +####################################################################### +# PART 2 Dataset Summarizer # +####################################################################### + +summarizer = dict( + dataset_abbrs=[ + 'MATH', + ['math_prm800k_500', 'accuracy (4 runs average)'], + ['aime2024', 'accuracy (32 runs average)'], + ['aime2025', 'accuracy (32 runs average)'], + ['livemathbench_hard', 'naive_average'], + ['OlympiadBenchMath', 'accuracy'], + ['olymmath', 'naive_average'], + ], + summary_groups = sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], [] + ), +) + +####################################################################### +# PART 3 Models List # +####################################################################### +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) +models += [ + + dict( + abbr='Qwen_Qwen3-235B-A22B', + type=OpenAISDK, + path='Qwen/Qwen3-235B-A22B', + key='sk-admin', + openai_api_base=[ + 'http://106.15.231.215:40007/v1/', + ], + meta_template=dict( + # begin=dict(role='SYSTEM', api_role='SYSTEM', prompt=''), + round=[ + dict(role='HUMAN', api_role='HUMAN'), + # XXX: all system roles are mapped to human in purpose + dict(role='BOT', api_role='BOT', generate=True), + ] + ), + query_per_second=16, + batch_size=128, + # batch_size=1, + temperature=0.6, + # max_completion_tokens=32768, + tokenizer_path='gpt-4', + # verbose=True, + max_out_len=32768, + max_seq_len=32768, + pred_postprocessor=dict(type=extract_non_reasoning_content) + ), +] + +infer = dict( + partitioner=dict(type=NumWorkerPartitioner, num_worker=8), + runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)), +) + +eval = dict( + partitioner=dict(type=NaivePartitioner, n=8), + runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask)), +) + +base_exp_dir = 'outputs/qwen3_reasoning' +work_dir = osp.join(base_exp_dir, 'chat_objective') \ No newline at end of file diff --git a/opencompass/cli/main.py b/opencompass/cli/main.py index d1f4b1dd..494c39fd 100644 --- a/opencompass/cli/main.py +++ b/opencompass/cli/main.py @@ -12,8 +12,8 @@ from mmengine.config import Config, DictAction from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg from opencompass.runners import SlurmRunner from opencompass.summarizers import DefaultSummarizer -from opencompass.utils import (LarkReporter, get_logger, read_from_station, - save_to_station) +from opencompass.utils import (LarkReporter, get_logger, pretty_print_config, + read_from_station, save_to_station) from opencompass.utils.run import (fill_eval_cfg, fill_infer_cfg, get_config_from_arg) @@ -94,6 +94,11 @@ def parse_args(): help='Use the custom config directory instead of config/ to ' 'search the configs for datasets, models and summarizers', type=str) + parser.add_argument( + 
'--config-verbose', + default=False, + action='store_true', + help='Whether to print the config in verbose mode.') parser.add_argument('-l', '--lark', help='Report the running status to lark bot', @@ -131,7 +136,7 @@ def parse_args(): 'correctness of each sample, bpb, etc.', action='store_true', ) - + # for the results persistence parser.add_argument('-sp', '--station-path', help='Path to your results station.', @@ -150,7 +155,12 @@ def parse_args(): 'data station.', action='store_true', ) - + # for evaluation with multiple runs + parser.add_argument('--dataset-num-runs', + help='How many runs for one dataset', + type=int, + default=1, + ) # set srun args slurm_parser = parser.add_argument_group('slurm_args') @@ -299,6 +309,11 @@ def main(): content = f'{getpass.getuser()}\'s task has been launched!' LarkReporter(cfg['lark_bot_url']).post(content) + + # print config if specified --config-verbose + if args.config_verbose: + pretty_print_config(cfg) + # infer if args.mode in ['all', 'infer']: # When user have specified --slurm or --dlc, or have not set diff --git a/opencompass/configs/datasets/OlymMATH/olymmath_cascade_eval_gen_97b203.py b/opencompass/configs/datasets/OlymMATH/olymmath_cascade_eval_gen_97b203.py new file mode 100644 index 00000000..855c9b2a --- /dev/null +++ b/opencompass/configs/datasets/OlymMATH/olymmath_cascade_eval_gen_97b203.py @@ -0,0 +1,109 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import OlymMATHDataset +from opencompass.evaluator import ( + CascadeEvaluator, + GenericLLMEvaluator, + MATHVerifyEvaluator +) + + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='answer', train_split='test') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +sub_sets = ['en-hard', 'zh-hard', 'en-easy', 'zh-easy'] + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. 
For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration + +olymmath_datasets = [] + +for sub_set in sub_sets: + math_eval_cfg = dict( + evaluator=dict( + type=CascadeEvaluator, + rule_evaluator=dict( + type=MATHVerifyEvaluator, + ), + llm_evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=OlymMATHDataset, + path='RUC-AIBOX/OlymMATH', + reader_cfg=math_reader_cfg, + subset=sub_set, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + parallel=False, + ), + ) + olymmath_datasets.append( + dict( + type=OlymMATHDataset, + abbr=f'olymmath_{sub_set}', + path='RUC-AIBOX/OlymMATH', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + subset=sub_set, + n=1 + ) + ) diff --git a/opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_cascade_eval_gen_be8b13.py b/opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_cascade_eval_gen_be8b13.py new file mode 100644 index 00000000..caedfbaa --- /dev/null +++ b/opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_cascade_eval_gen_be8b13.py @@ -0,0 +1,114 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import OlympiadBenchDataset, OlympiadBenchEvaluator, olympiadbench_postprocess_v2 +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.evaluator import ( + GenericLLMEvaluator, + CascadeEvaluator, + MATHVerifyEvaluator +) +from opencompass.datasets import generic_llmjudge_postprocess + +with read_base(): + from .OlympiadBench_categories import categories + +# Create prompter instance for problems +olympiadbench_prompter_cfg = dict( + type='OlympiadBenchPrompter' +) + +olympiadbench_reader_cfg = dict( + input_columns=[ + 'problem', 'language', 'subject', 'question_type', + 'answer_type', 'is_multiple_answer', 'unit', 'questions' + ], + output_column='solution' +) + +GRADER_TEMPLATE = """ + Please as a grading 
expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + + +olympiadbench_datasets = [] +for _name in categories: + olympiadbench_infer_cfg = dict( + prompt_template=dict( + type='OlympiadBenchTemplate' + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + # Evaluation configuration + olympiadbench_eval_cfg = dict( + evaluator=dict( + type=CascadeEvaluator, + rule_evaluator=dict( + type=MATHVerifyEvaluator, + ), + llm_evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=OlympiadBenchDataset, + path='opencompass/OlympiadBench', + name=_name, + reader_cfg=olympiadbench_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + parallel=False + ) + ) + + olympiadbench_datasets.append( + dict( + type=OlympiadBenchDataset, + abbr=f'OlympiadBench_{_name}', + path='opencompass/OlympiadBench', + name=_name, + reader_cfg=olympiadbench_reader_cfg, + infer_cfg=olympiadbench_infer_cfg, + eval_cfg=olympiadbench_eval_cfg, + n=1, + ) + ) diff --git a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_xml_gen_2b9dc2.py b/opencompass/configs/datasets/aime2024/aime2024_cascade_eval_gen_5e9f4f.py similarity index 74% rename from opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_xml_gen_2b9dc2.py rename to opencompass/configs/datasets/aime2024/aime2024_cascade_eval_gen_5e9f4f.py index 33d84265..64fb3565 100644 --- a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_xml_gen_2b9dc2.py +++ b/opencompass/configs/datasets/aime2024/aime2024_cascade_eval_gen_5e9f4f.py @@ -1,28 +1,44 @@ +""" +Summary: A config for AIME-2024 Evaluation. 
+Setting: + Shot: 0-shot + Evaluator: + - CascadeEvaluator + - MATHVerifyEvaluator + - GenericLLMEvaluator + Repeat: 1 +Avaliable Models: + - Instruct/Chat Models +""" from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2 -from opencompass.evaluator import GenericLLMEvaluator from opencompass.datasets import generic_llmjudge_postprocess -from opencompass.utils import xml_tag_postprocessor - -aime2024_reader_cfg = dict( - input_columns=['question'], - output_column='answer' +from opencompass.datasets import Aime2024Dataset +from opencompass.evaluator import ( + CascadeEvaluator, + GenericLLMEvaluator, + MATHVerifyEvaluator ) +aime2024_reader_cfg = dict(input_columns=['question'], output_column='answer') + + aime2024_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, template=dict( round=[ - dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'), + dict( + role='HUMAN', + prompt='{question}\nRemember to put your final answer within \\boxed{}.', + ), ], - ) + ), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048) + inferencer=dict(type=GenInferencer), ) @@ -51,24 +67,27 @@ GRADER_TEMPLATE = """ Judging the correctness of candidates' answers: """.strip() -aime2024_eval_cfg = dict( - evaluator=dict( +cascade_evaluator = dict( + type=CascadeEvaluator, + rule_evaluator=dict( + type=MATHVerifyEvaluator, + ), + llm_evaluator= dict( type=GenericLLMEvaluator, prompt_template=dict( type=PromptTemplate, template=dict( - begin=[ - dict( - role='SYSTEM', - fallback_role='HUMAN', - prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") - ], + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], round=[ - dict( - role='HUMAN', - prompt = GRADER_TEMPLATE - ), - ]), + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), ), dataset_cfg=dict( type=Aime2024Dataset, @@ -77,9 +96,13 @@ aime2024_eval_cfg = dict( ), judge_cfg=dict(), dict_postprocessor=dict(type=generic_llmjudge_postprocess), - pred_postprocessor=dict(type=xml_tag_postprocessor, tag=''), ), - pred_role='BOT', + parallel=False, +) + + +aime2024_eval_cfg = dict( + evaluator=cascade_evaluator, ) aime2024_datasets = [ @@ -90,6 +113,6 @@ aime2024_datasets = [ reader_cfg=aime2024_reader_cfg, infer_cfg=aime2024_infer_cfg, eval_cfg=aime2024_eval_cfg, - mode='singlescore', + n=1,# Evaluate the dataset with n times ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py b/opencompass/configs/datasets/aime2024/aime2024_llmjudge_gen.py similarity index 100% rename from opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py rename to opencompass/configs/datasets/aime2024/aime2024_llmjudge_gen.py diff --git a/opencompass/configs/datasets/aime2025/aime2025_cascade_eval_gen_5e9f4f.py b/opencompass/configs/datasets/aime2025/aime2025_cascade_eval_gen_5e9f4f.py new file mode 100644 index 00000000..9fe5f557 --- /dev/null +++ b/opencompass/configs/datasets/aime2025/aime2025_cascade_eval_gen_5e9f4f.py @@ -0,0 +1,115 @@ +""" +Summary: A config for AIME-2025 Evaluation. 
+Setting: + Shot: 0-shot + Evaluator: + - CascadeEvaluator + - MATHVerifyEvaluator + - GenericLLMEvaluator + Repeat: 1 +Avaliable Models: + - Instruct/Chat Models +""" +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import CustomDataset +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.evaluator import ( + CascadeEvaluator, + GenericLLMEvaluator, + MATHVerifyEvaluator +) + +aime2025_reader_cfg = dict(input_columns=['question'], output_column='answer') + + +aime2025_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{question}\nRemember to put your final answer within \\boxed{}.', + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + + : \n{question}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +cascade_evaluator = dict( + type=CascadeEvaluator, + rule_evaluator=dict( + type=MATHVerifyEvaluator, + ), + llm_evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=CustomDataset, + path='opencompass/aime2025', + reader_cfg=aime2025_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + parallel=False, +) +aime2025_eval_cfg = dict( + evaluator=cascade_evaluator, +) + +aime2025_datasets = [ + dict( + type=CustomDataset, + abbr='aime2025', + path='opencompass/aime2025', + reader_cfg=aime2025_reader_cfg, + infer_cfg=aime2025_infer_cfg, + eval_cfg=aime2025_eval_cfg, + n=1, + ) +] diff --git a/opencompass/configs/datasets/gpqa/gpqa_cascade_eval_gen_772ea0.py b/opencompass/configs/datasets/gpqa/gpqa_cascade_eval_gen_772ea0.py new file mode 100644 index 00000000..1a5d5735 --- /dev/null +++ b/opencompass/configs/datasets/gpqa/gpqa_cascade_eval_gen_772ea0.py @@ -0,0 +1,118 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GPQADataset, GPQA_Simple_Eval_postprocess +from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.utils.text_postprocessors import match_answer_pattern + +# openai_simple_eval prompt +align_prompt = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. + +{question} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. 
For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + + +gpqa_reader_cfg = dict( + input_columns=['question', 'A', 'B', 'C', 'D'], + output_column='answer') + +gpqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=align_prompt), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + + + +gpqa_datasets = [] +gpqa_subsets = { + # 'extended': 'gpqa_extended.csv', + # 'main': 'gpqa_main.csv', + 'diamond': 'gpqa_diamond.csv' +} + +for split in list(gpqa_subsets.keys()): + gpqa_eval_cfg = dict( + evaluator=dict( + type=CascadeEvaluator, + rule_evaluator=dict( + type=AccEvaluator, + pred_postprocessor=dict(type=match_answer_pattern, answer_pattern=r'(?i)ANSWER\s*:\s*([A-D])'), + ), + llm_evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=GPQADataset, + path='./data/gpqa/', + name=gpqa_subsets[split], + reader_cfg=gpqa_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + parallel=False, + ), + ) + gpqa_datasets.append( + dict( + abbr='GPQA_' + split, + type=GPQADataset, + path='./data/gpqa/', + name=gpqa_subsets[split], + reader_cfg=gpqa_reader_cfg, + infer_cfg=gpqa_infer_cfg, + eval_cfg=gpqa_eval_cfg, + mode='singlescore', + ) + ) diff --git a/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_xml_gen_17854d.py b/opencompass/configs/datasets/korbench/korbench_single_0shot_cascade_eval_gen_56cf43.py similarity index 69% rename from opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_xml_gen_17854d.py rename to opencompass/configs/datasets/korbench/korbench_single_0shot_cascade_eval_gen_56cf43.py index 24156c11..50f4f15f 100644 --- a/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_xml_gen_17854d.py +++ b/opencompass/configs/datasets/korbench/korbench_single_0shot_cascade_eval_gen_56cf43.py @@ -1,17 +1,28 @@ +""" +Summary: A config for KoR-Bench Evaluation. 
+Setting: + Shot: 0-shot + Evaluator: + - CascadeEvaluator + - korbenchEvaluator + - GenericLLMEvaluator + Repeat: 1 +Avaliable Models: + - Instruct/Chat Models +""" +from datasets import parallel from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.evaluator import GenericLLMEvaluator +from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator from opencompass.datasets import generic_llmjudge_postprocess -from opencompass.utils import xml_tag_postprocessor categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle'] - GRADER_TEMPLATE = """ Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. - + Here are some evaluation criteria: 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. @@ -30,7 +41,7 @@ GRADER_TEMPLATE = """ : \n{prompt}\n\n\n : \n{answer}\n\n\n : \n{prediction}\n\n\n - + Judging the correctness of candidates' answers: """.strip() @@ -50,7 +61,7 @@ for category in categories: round=[ dict( role='HUMAN', - prompt='{prompt}' # f-string + prompt='{prompt}' # f-string ) ] ) @@ -66,41 +77,46 @@ for category in categories: infer_cfg = dict( prompt_template=prompt_template, retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=1024), + inferencer=dict(type=GenInferencer), ) # Evaluation configuration eval_cfg = dict( evaluator=dict( - type=GenericLLMEvaluator, - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin=[ - dict( - role='SYSTEM', - fallback_role='HUMAN', - prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") - ], - round=[ - dict( - role='HUMAN', - prompt = GRADER_TEMPLATE - ), - ]), + type=CascadeEvaluator, + rule_evaluator=dict( + type=korbenchEvaluator, ), - dataset_cfg=dict( - type=korbenchDataset, - path='opencompass/korbench', - prompt_mode='0_shot', - category=category, - reader_cfg=reader_cfg, + llm_evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt=GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=korbenchDataset, + path='opencompass/korbench', + prompt_mode='0_shot', + category=category, + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), ), - judge_cfg=dict(), - dict_postprocessor=dict(type=generic_llmjudge_postprocess), 
- pred_postprocessor=dict(type=xml_tag_postprocessor, tag=''), - ), - pred_role='BOT', + parallel=False, + ) ) # Dataset @@ -113,7 +129,7 @@ for category in categories: reader_cfg=reader_cfg, infer_cfg=infer_cfg, eval_cfg=eval_cfg, - mode='singlescore', + n=1, ) - korbench_0shot_single_datasets.append(korbench_dataset) + korbench_0shot_single_datasets.append(korbench_dataset) \ No newline at end of file diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py b/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py new file mode 100644 index 00000000..dd0c4211 --- /dev/null +++ b/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_cascade_eval_gen_4bce59.py @@ -0,0 +1,120 @@ +""" +Summary: A config for LiveMathBench-Hard-202412 Dataset Evaluation. +Setting: + Shot: 0-shot + Evaluator: + - CascadeEvaluator + - MATHVerifyEvaluator + - GenericLLMEvaluator + Repeat: 32 +Avaliable Models: + - Instruct/Chat Models +""" +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import CustomDataset +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.evaluator import ( + CascadeEvaluator, + GenericLLMEvaluator, + MATHVerifyEvaluator, +) + +livemathbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + + +# Inference configuration +livemathbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{question}\nRemember to put your final answer within \\boxed{}.', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +# Template for the LLM judge +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. 
And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + : \n{question}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + + + +splits = ['hard_cn', 'hard_en'] +# Dataset configuration +livemathbench_datasets = [ + dict( + type=CustomDataset, + abbr=f'livemathbench_hard_custom_{split}', + path='data/LiveMathBench', + local_mode=True, + file_name=f'202412/{split}.jsonl', + reader_cfg=livemathbench_reader_cfg, + infer_cfg=livemathbench_infer_cfg, + eval_cfg=dict( + # Evaluation configuration using LLM as judge + evaluator=dict( + type=CascadeEvaluator, + rule_evaluator=dict( + type=MATHVerifyEvaluator, + ), + llm_evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=CustomDataset, + path='data/LiveMathBench', + local_mode=True, + file_name=f'202412/{split}.jsonl', + reader_cfg=livemathbench_reader_cfg, + ), + judge_cfg={}, + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + parallel=False + ), + ), + n=1, # repeat n times + ) for split in splits +] diff --git a/opencompass/configs/datasets/livereasonbench/livereasonbench_genericllmeval_xml_gen_f990de.py b/opencompass/configs/datasets/livereasonbench/livereasonbench_llmverify_20250428_gen_0484cb.py similarity index 96% rename from opencompass/configs/datasets/livereasonbench/livereasonbench_genericllmeval_xml_gen_f990de.py rename to opencompass/configs/datasets/livereasonbench/livereasonbench_llmverify_20250428_gen_0484cb.py index 6d3cc59d..4cf71096 100644 --- a/opencompass/configs/datasets/livereasonbench/livereasonbench_genericllmeval_xml_gen_f990de.py +++ b/opencompass/configs/datasets/livereasonbench/livereasonbench_llmverify_20250428_gen_0484cb.py @@ -4,7 +4,6 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.evaluator import GenericLLMEvaluator from opencompass.datasets import LiveReasonBenchDataset, livereasonbench_postprocess -from opencompass.utils import xml_tag_postprocessor GRADER_TEMPLATE = """ @@ -97,7 +96,7 @@ livereasonbench_infer_cfg = dict( ], )), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=16384)) + inferencer=dict(type=GenInferencer)) livereasonbench_eval_cfg = dict( evaluator=dict( @@ -122,23 +121,22 @@ livereasonbench_eval_cfg = dict( type=LiveReasonBenchDataset, path='opencompass/LiveReasonBench', reader_cfg=livereasonbench_reader_cfg, + version='livereasonbench-20250428', ), judge_cfg=dict(), dict_postprocessor=dict(type=livereasonbench_postprocess), - pred_postprocessor=dict(type=xml_tag_postprocessor, tag=''), ), - pred_role='BOT', ) 
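# Illustrative sketch (not part of this patch): the grading template above forces the judge to
# answer with a bare 'A' or 'B', and `dict_postprocessor` is what turns those verdicts into a
# score. The toy postprocessor below is a stand-in for that step, NOT the actual
# `livereasonbench_postprocess` / `generic_llmjudge_postprocess`; the `details` / `prediction`
# result layout is an assumption.
import re
from typing import Dict


def toy_llmjudge_postprocess(output: Dict) -> Dict:
    """Convert per-sample judge verdicts ('A' = correct) into an accuracy score."""
    details = output.get('details', {})
    correct = 0
    for item in details.values():
        match = re.search(r'\b([AB])\b', str(item.get('prediction', '')))
        if match and match.group(1) == 'A':
            correct += 1
    return {'accuracy': 100 * correct / max(len(details), 1),
            'details': details}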
livereasonbench_datasets = [ dict( - abbr='LiveReasonBench-20241202', + abbr='LiveReasonBench-20250428', type=LiveReasonBenchDataset, path='opencompass/LiveReasonBench', reader_cfg=livereasonbench_reader_cfg, infer_cfg=livereasonbench_infer_cfg, eval_cfg=livereasonbench_eval_cfg, - version='livereasonbench-20241202', - mode='singlescore', + version='livereasonbench-20250428', + n=1 ) ] diff --git a/opencompass/configs/datasets/math/math_500_cascade_eval_gen_6ff468.py b/opencompass/configs/datasets/math/math_500_cascade_eval_gen_6ff468.py new file mode 100644 index 00000000..8c18b47b --- /dev/null +++ b/opencompass/configs/datasets/math/math_500_cascade_eval_gen_6ff468.py @@ -0,0 +1,117 @@ +""" +Summary: A config for AIME-2024 Evaluation. +Setting: + Shot: 0-shot + Evaluator: + - CascadeEvaluator + - MATHVerifyEvaluator + - GenericLLMEvaluator +Avaliable Models: + - Instruct/Chat Models +""" + +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import MATHDataset +from opencompass.evaluator import ( + CascadeEvaluator, + GenericLLMEvaluator, + MATHVerifyEvaluator +) + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. 
+ + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + + +cascade_evaluator = dict( + type=CascadeEvaluator, + rule_evaluator=dict( + type=MATHVerifyEvaluator, + ), + llm_evaluator= dict( + dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + n=4, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ) + ), + parallel=False, +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr=f'math_prm800k_500', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=dict( + evaluator=cascade_evaluator, + ), + n=1, + ) +] diff --git a/opencompass/configs/datasets/math/math_500_gen.py b/opencompass/configs/datasets/math/math_500_gen.py index 79d2f3b0..232916fd 100644 --- a/opencompass/configs/datasets/math/math_500_gen.py +++ b/opencompass/configs/datasets/math/math_500_gen.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import CustomDataset -from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator +from opencompass.evaluator import MATHVerifyEvaluator math_reader_cfg = dict(input_columns=['problem'], output_column='solution') @@ -24,7 +24,7 @@ math_infer_cfg = dict( math_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator), + evaluator=dict(type=MATHVerifyEvaluator), ) math_datasets = [ diff --git a/opencompass/configs/datasets/math/math_gen_a58d9d.py b/opencompass/configs/datasets/math/math_gen_a58d9d.py index bf01e9bc..648ff0a5 100644 --- a/opencompass/configs/datasets/math/math_gen_a58d9d.py +++ b/opencompass/configs/datasets/math/math_gen_a58d9d.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import MATHDataset -from opencompass.openicl.icl_evaluator import MATHEvaluator +from opencompass.evaluator import MATHVerifyEvaluator math_reader_cfg = dict(input_columns=['problem'], output_column='solution') @@ -24,7 +24,7 @@ math_infer_cfg = dict( inferencer=dict(type=GenInferencer)) math_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator) + evaluator=dict(type=MATHVerifyEvaluator) ) math_datasets = [ diff --git a/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen_11c4b5.py b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen_11c4b5.py index 
d49a1ccc..0c2e516e 100644 --- a/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen_11c4b5.py +++ b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen_11c4b5.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import MATHEvaluator +from opencompass.evaluator import MATHVerifyEvaluator from opencompass.datasets import ( MATHDataset, math_postprocess_v2, @@ -28,7 +28,7 @@ math_infer_cfg = dict( # postprocess v2 math_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator) + evaluator=dict(type=MATHVerifyEvaluator) ) math_datasets = [ diff --git a/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_cascade_eval_gen_216503.py b/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_cascade_eval_gen_216503.py new file mode 100644 index 00000000..1f83098b --- /dev/null +++ b/opencompass/configs/datasets/mmlu/mmlu_stem_0shot_cascade_eval_gen_216503.py @@ -0,0 +1,127 @@ +""" +Setting: 0-shot No-CoT +Evaluator: GenericLLMEvaluator +""" +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import match_answer_pattern +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.evaluator import ( + CascadeEvaluator, + GenericLLMEvaluator, +) + +with read_base(): + # from .....configs.datasets.mmlu.mmlu_all_sets import mmlu_all_sets + from .mmlu_stem_sets import mmlu_all_sets +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. + +{input} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. 
For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {input}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n\n\n + : \n{target}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev') + +mmlu_datasets = [] +for name in mmlu_all_sets: + mmlu_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict( + type=CascadeEvaluator, + rule_evaluator=dict( + type=AccEvaluator, + pred_postprocessor=dict(type=match_answer_pattern, answer_pattern=r'(?i)ANSWER\s*:\s*([A-D])'), + ), + llm_evaluator = dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + abbr=f'lukaemon_mmlu_{name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + ), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + judge_cfg=dict(), + ), + parallel=False + ), + ) + + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + mode='singlescore', + )) diff --git a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_llmjudge_gen_2b9dc2.py b/opencompass/configs/datasets/omni_math/omni_math_cascade_eval_gen_ccf9c0.py similarity index 71% rename from opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_llmjudge_gen_2b9dc2.py rename to opencompass/configs/datasets/omni_math/omni_math_cascade_eval_gen_ccf9c0.py index 87c65f96..b823334b 100644 --- a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_llmjudge_gen_2b9dc2.py +++ b/opencompass/configs/datasets/omni_math/omni_math_cascade_eval_gen_ccf9c0.py @@ -1,30 +1,46 @@ +""" +Summary: A config for OmniMath Dataset Evaluation. 
+Setting: + Shot: 0-shot + Evaluator: + - CascadeEvaluator + - MATHVerifyEvaluator + - GenericLLMEvaluator + Repeat: 1 +Avaliable Models: + - Instruct/Chat Models +""" from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2 -from opencompass.openicl.icl_evaluator import LMEvaluator from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets.omni_math import OmniMathDataset +from opencompass.evaluator import ( + CascadeEvaluator, + GenericLLMEvaluator, + MATHVerifyEvaluator, +) -aime2024_reader_cfg = dict( - input_columns=['question'], +omnimath_reader_cfg = dict( + input_columns=['problem'], output_column='answer' ) - -aime2024_infer_cfg = dict( +omnimath_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, template=dict( round=[ - dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'), - ], + dict(role='HUMAN', prompt='please answer the following mathematical question, put your final answer in \\boxed{}.\n\n{problem}'), + ] ) ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048) + inferencer=dict(type=GenInferencer) ) + GRADER_TEMPLATE = """ Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. @@ -43,16 +59,20 @@ GRADER_TEMPLATE = """ Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
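# Illustrative sketch of the cascade control flow these configs rely on: with parallel=False,
# only samples that fail the rule check are sent to the LLM judge, and a sample counts as
# correct if either stage accepts it (compare the `cascade_correct` flag added in
# cascade_evaluator.py later in this patch). This is a simplified model, not the evaluator's
# actual implementation.
def cascade_score(preds, refs, rule_check, llm_judge, parallel=False):
    rule_ok = [rule_check(p, r) for p, r in zip(preds, refs)]
    to_judge = (range(len(preds)) if parallel
                else [i for i, ok in enumerate(rule_ok) if not ok])
    llm_ok = {i: llm_judge(preds[i], refs[i]) for i in to_judge}
    final = [rule_ok[i] or llm_ok.get(i, False) for i in range(len(preds))]
    return {'accuracy': 100 * sum(final) / max(len(final), 1)}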
- : \n{question}\n\n\n + : \n{problem}\n\n\n : \n{answer}\n\n\n : \n{prediction}\n\n\n Judging the correctness of candidates' answers: """.strip() -aime2024_eval_cfg = dict( - evaluator=dict( - type=LMEvaluator, +cascade_evaluator = dict( + type=CascadeEvaluator, + rule_evaluator=dict( + type=MATHVerifyEvaluator, + ), + llm_evaluator=dict( + type=GenericLLMEvaluator, prompt_template=dict( type=PromptTemplate, template=dict( @@ -69,19 +89,27 @@ aime2024_eval_cfg = dict( ), ]), ), - dict_postprocessor=dict(type=generic_llmjudge_postprocess), + dataset_cfg=dict( + type=OmniMathDataset, + reader_cfg=omnimath_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), ), - pred_role='BOT', + parallel=False, ) -aime2024_datasets = [ +omnimath_eval_cfg = dict( + evaluator=cascade_evaluator, +) + +omnimath_datasets = [ dict( - abbr='aime2024', - type=Aime2024Dataset, - path='opencompass/aime2024', - reader_cfg=aime2024_reader_cfg, - infer_cfg=aime2024_infer_cfg, - eval_cfg=aime2024_eval_cfg, - mode='singlescore', + type=OmniMathDataset, + abbr='OmniMath', + reader_cfg=omnimath_reader_cfg, + infer_cfg=omnimath_infer_cfg, + eval_cfg=omnimath_eval_cfg, + n=1, ) ] \ No newline at end of file diff --git a/opencompass/configs/summarizers/example.py b/opencompass/configs/summarizers/example.py index 937acfba..a059e4de 100644 --- a/opencompass/configs/summarizers/example.py +++ b/opencompass/configs/summarizers/example.py @@ -1,18 +1,19 @@ from mmengine.config import read_base -with read_base(): - from .groups.agieval import agieval_summary_groups - from .groups.mmlu import mmlu_summary_groups - from .groups.cmmlu import cmmlu_summary_groups - from .groups.ceval import ceval_summary_groups - from .groups.bbh import bbh_summary_groups - from .groups.GaokaoBench import GaokaoBench_summary_groups - from .groups.flores import flores_summary_groups - from .groups.tydiqa import tydiqa_summary_groups - from .groups.xiezhi import xiezhi_summary_groups - from .groups.scibench import scibench_summary_groups - from .groups.mgsm import mgsm_summary_groups - from .groups.longbench import longbench_summary_groups +# with read_base(): + # pass + # from .groups.agieval import agieval_summary_groups + # from .groups.mmlu import mmlu_summary_groups + # from .groups.cmmlu import cmmlu_summary_groups + # from .groups.ceval import ceval_summary_groups + # from .groups.bbh import bbh_summary_groups + # from .groups.GaokaoBench import GaokaoBench_summary_groups + # from .groups.flores import flores_summary_groups + # from .groups.tydiqa import tydiqa_summary_groups + # from .groups.xiezhi import xiezhi_summary_groups + # from .groups.scibench import scibench_summary_groups + # from .groups.mgsm import mgsm_summary_groups + # from .groups.longbench import longbench_summary_groups summarizer = dict( summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []), diff --git a/opencompass/datasets/base.py b/opencompass/datasets/base.py index 1ccbe9fd..75ac3164 100644 --- a/opencompass/datasets/base.py +++ b/opencompass/datasets/base.py @@ -3,6 +3,9 @@ from typing import Dict, List, Optional, Union from datasets import Dataset, DatasetDict, concatenate_datasets from opencompass.openicl import DatasetReader +from opencompass.utils import get_logger + +logger = get_logger() class BaseDataset: diff --git a/opencompass/datasets/korbench/korbench.py b/opencompass/datasets/korbench/korbench.py index 856c844e..3a8290cc 100644 --- a/opencompass/datasets/korbench/korbench.py 
+++ b/opencompass/datasets/korbench/korbench.py @@ -173,44 +173,76 @@ class korbenchEvaluator(BaseEvaluator): def __init__(self): super().__init__() - def score(self, predictions, references, test_set): - """Evaluate predictions for a single prompt_mode in KOR-Bench.""" - if not test_set: - raise ValueError('Test set is empty.') + def sample_score(self, prediction, reference, test_item=None): + """Evaluate a single sample. - prompt_mode = test_set[0][ - 'prompt_mode'] # Determine the prompt_mode from the first entry - data = {} + Args: + prediction: The model's prediction + reference: The reference answer + test_item: Additional information about the test sample - # Organize data for the given prompt_mode - for i in range(len(predictions)): - entry = { - 'prediction': predictions[i], - 'gold': references[i], - 'rule_id': test_set[i].get('rule_id', None), - 'category': test_set[i].get('category', None), - 'rule_list': test_set[i].get('rule_list', None), - 'question_list': test_set[i].get('question_list', None), - 'base_path': test_set[i].get('base_path', None), - } - data[i] = entry + Returns: + Dict: A dictionary containing evaluation results + """ + if test_item is None: + raise ValueError('Test item is required.') - if not data: - raise ValueError(f"No data found for prompt_mode '{prompt_mode}'") + prompt_mode = test_item.get('prompt_mode') - # Evaluate based on the prompt_mode + # Build data for a single sample + entry = { + 'prediction': prediction, + 'gold': reference, + 'rule_id': test_item.get('rule_id', None), + 'category': test_item.get('category', None), + 'rule_list': test_item.get('rule_list', None), + 'question_list': test_item.get('question_list', None), + 'base_path': test_item.get('base_path', None), + } + + # Evaluate the single sample + data = {0: entry} + + # Evaluate based on different prompt_mode if prompt_mode == '0_shot': evaluation_results = evaluate_responses(data, '0_shot') elif prompt_mode == '3_shot': evaluation_results = evaluate_responses(data, '3_shot') elif prompt_mode in ['Multi-Q', 'Multi-R', 'Multi-RQ', 'mixed']: evaluation_results = evaluate_responses(data, 'mixed', - test_set[0]['base_path']) + test_item.get('base_path')) else: - raise ValueError(f'Unsupported prompt_mode: {prompt_mode}') - # Calculate accuracy - correct_count = sum(res['is_correct'] for res in evaluation_results) - accuracy = (correct_count / len(evaluation_results)) * 100 + return { + 'is_correct': False, + 'pred': prediction, + 'answer': reference + } - # Return scores - return {'accuracy': accuracy} + # Return evaluation results + result = evaluation_results[0] + result['correct'] = result['is_correct'] + result.update({'pred': prediction, 'answer': reference}) + return result + + def score(self, predictions, references, test_set): + """Evaluate each sample using sample_score.""" + if not test_set: + raise ValueError('Test set is empty.') + + details = [] + correct_count = 0 + + # Call sample_score for each sample + for i in range(len(predictions)): + result = self.sample_score(predictions[i], references[i], + test_set[i]) + details.append(result) + if result.get('is_correct', False): + correct_count += 1 + + # Calculate accuracy + accuracy = (correct_count / + len(predictions)) * 100 if predictions else 0 + + # Return evaluation results + return {'accuracy': accuracy, 'details': details} diff --git a/opencompass/datasets/math.py b/opencompass/datasets/math.py index f558379b..674f1b39 100644 --- a/opencompass/datasets/math.py +++ b/opencompass/datasets/math.py @@ -204,7 +204,11 @@ 
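# Illustrative aside: the per-sample `sample_score` refactor above is what lets CascadeEvaluator
# re-score individual failures instead of whole batches. A dataset without a dedicated rule
# evaluator can pass a `sample_score_fn` instead; the fragment below is a hypothetical example
# (the exact-match function and the trimmed-down llm_evaluator are placeholders, not configs
# from this patch).
from opencompass.evaluator import CascadeEvaluator, GenericLLMEvaluator


def exact_match(prediction, reference, test_item=None):
    # Returning a dict with a 'correct' flag keeps per-sample details explicit;
    # CascadeEvaluator also wraps plain non-dict return values for you.
    return {'correct': str(prediction).strip() == str(reference).strip(),
            'pred': prediction, 'answer': reference}


my_cascade_eval_cfg = dict(
    evaluator=dict(
        type=CascadeEvaluator,
        sample_score_fn=exact_match,                   # used instead of rule_evaluator
        llm_evaluator=dict(type=GenericLLMEvaluator),  # prompt_template / judge_cfg omitted here
        parallel=False,
    ),
)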
def math_postprocess_v2(text: str) -> str: @ICL_EVALUATORS.register_module() class MATHEvaluator(BaseEvaluator): - def __init__(self, version='v1'): + def __init__(self, + version='v1', + pred_postprocessor=None): # may need to accept the parent class __init__'s arguments + super().__init__( + pred_postprocessor=pred_postprocessor) # call the parent class __init__ assert version in ['v1', 'v2'] self.version = version diff --git a/opencompass/datasets/musr/musr.py b/opencompass/datasets/musr/musr.py index d15e6831..96b84edf 100644 --- a/opencompass/datasets/musr/musr.py +++ b/opencompass/datasets/musr/musr.py @@ -280,7 +280,11 @@ class MusrDataset(BaseDataset): @ICL_EVALUATORS.register_module() class MusrEvaluator(BaseEvaluator): - def __init__(self, answer_index_modifier=1, self_consistency_n=1): + def __init__(self, + answer_index_modifier=1, + self_consistency_n=1, + pred_postprocessor=None): + super().__init__(pred_postprocessor=pred_postprocessor) self.answer_index_modifier = answer_index_modifier self.self_consistency_n = self_consistency_n diff --git a/opencompass/datasets/teval/evaluators/review_evaluator.py b/opencompass/datasets/teval/evaluators/review_evaluator.py index 68a14668..b68a76dc 100644 --- a/opencompass/datasets/teval/evaluators/review_evaluator.py +++ b/opencompass/datasets/teval/evaluators/review_evaluator.py @@ -76,7 +76,6 @@ class ReviewEvaluator: pred_data = data_sample.pred if pred_data is not None: - # import pdb; pdb.set_trace() metrics_result['review_quality'] = 1.0 if pred_data == \ data_sample.gt else 0.0 metrics_result['parse_rate'] = 1.0 diff --git a/opencompass/evaluator/__init__.py b/opencompass/evaluator/__init__.py index 77b89f29..c24ae9a4 100644 --- a/opencompass/evaluator/__init__.py +++ b/opencompass/evaluator/__init__.py @@ -1,2 +1,3 @@ from .cascade_evaluator import CascadeEvaluator # noqa from .generic_llm_evaluator import GenericLLMEvaluator # noqa +from .math_evaluator import MATHVerifyEvaluator # noqa \ No newline at end of file diff --git a/opencompass/evaluator/cascade_evaluator.py b/opencompass/evaluator/cascade_evaluator.py index e26b3d86..6a898546 100644 --- a/opencompass/evaluator/cascade_evaluator.py +++ b/opencompass/evaluator/cascade_evaluator.py @@ -34,7 +34,8 @@ class CascadeEvaluator(BaseEvaluator): sample_score_fn: Optional[Callable] = None, parallel: bool = True, ) -> None: - self.logger = get_logger() + super().__init__() + self.logger = get_logger(__name__) # Initialize the LLM evaluator llm_evaluator_type = llm_evaluator.pop('type') @@ -58,7 +59,10 @@ class CascadeEvaluator(BaseEvaluator): raise ValueError( 'Either rule_evaluator or sample_score_fn must be provided') - def sample_score(self, prediction: str, reference: str) -> Dict[str, Any]: + def sample_score(self, + prediction: str, + reference: str, + test_set=None) -> Dict[str, Any]: """Score a single sample using sample_score_fn or rule_evaluator.
Args: @@ -70,7 +74,7 @@ class CascadeEvaluator(BaseEvaluator): """ if self.sample_score_fn: # Use user-provided function to evaluate a single sample - result = self.sample_score_fn(prediction, reference) + result = self.sample_score_fn(prediction, reference, test_set) if not isinstance(result, dict): # Ensure result is a dictionary with at least 'correct' field result = { @@ -82,7 +86,8 @@ class CascadeEvaluator(BaseEvaluator): else: # Use rule_evaluator to evaluate a single sample by calling # the score method with single-element lists - result = self.rule_evaluator.score([prediction], [reference]) + result = self.rule_evaluator.score([prediction], [reference], + [test_set]) if 'details' in result and len(result['details']) > 0: return result['details'][0] else: @@ -137,7 +142,14 @@ class CascadeEvaluator(BaseEvaluator): failed_indices = [] for i, (pred, ref) in enumerate(zip(predictions, references)): - result = self.sample_score(pred, ref) + if test_set is not None: + test_item = test_set[i] + else: + test_item = None + # Apply prediction postprocessing for each sample + [pred] = self.rule_evaluator.pred_postprocess([pred]) + + result = self.sample_score(pred, ref, test_item) result['evaluation_method'] = 'rule' details.append({'rule_evaluation': result}) @@ -181,8 +193,11 @@ class CascadeEvaluator(BaseEvaluator): original_out_dir = getattr(self.llm_evaluator, '_out_dir', None) self.llm_evaluator._out_dir = f'{self._out_dir}_llm_judge' + # Generate random hash suffix + llm_results_path = f'{self.llm_evaluator._out_dir}_replica{self.dataset_replica_idx}.json' # noqa + self.logger.info(f'LLM evaluation results will be saved at ' + f'{llm_results_path}') # Check if results already exist to avoid re-evaluation - llm_results_path = f'{self.llm_evaluator._out_dir}.json' if os.path.exists(llm_results_path): self.logger.info( f'Loading existing LLM evaluation results from ' @@ -212,7 +227,15 @@ class CascadeEvaluator(BaseEvaluator): # Use GenericLLMEvaluator to evaluate samples # unset dataset_cfg for GenericLLMEvaluator to # directly use test_set + # self.llm_evaluator.output_path = llm_results_path + self.llm_evaluator._dataset_replica_idx = \ + self._dataset_replica_idx self.llm_evaluator.dataset_cfg = None + + # Apply prediction postprocessing to for LLM evaluator + failed_predictions = self.llm_evaluator.pred_postprocess( + failed_predictions) + llm_results = self.llm_evaluator.score( predictions=failed_predictions, references=failed_references, @@ -235,6 +258,9 @@ class CascadeEvaluator(BaseEvaluator): # Update the details for samples that were evaluated by LLM for i, llm_detail in enumerate(llm_details.values()): + # Add dataset replica index to LLM evaluation result + llm_detail['dataset_replica_idx'] = self.dataset_replica_idx + original_index = failed_indices[i] # Store original rule-based evaluation result rule_result = details[original_index].copy() @@ -283,6 +309,16 @@ class CascadeEvaluator(BaseEvaluator): f'LLM evaluation: {llm_correct}/{llm_evaluated} ' f'correct ({llm_accuracy:.2f}%)') + # Append cascade correctness flag to each sample + for item in details: + _rule_correct = item['rule_evaluation'].get('correct', False) + if 'llm_evaluation' in item: + _llm_correct = item['llm_evaluation'].get( + 'llm_correct', False) + else: + _llm_correct = False + item['cascade_correct'] = _rule_correct or _llm_correct + result = { 'accuracy': final_accuracy, 'cascade_stats': { diff --git a/opencompass/evaluator/generic_llm_evaluator.py b/opencompass/evaluator/generic_llm_evaluator.py index 
c205ec4b..9482950f 100644 --- a/opencompass/evaluator/generic_llm_evaluator.py +++ b/opencompass/evaluator/generic_llm_evaluator.py @@ -1,5 +1,6 @@ import os import os.path as osp +from copy import deepcopy from typing import Dict, List, Optional import mmengine @@ -14,6 +15,8 @@ from opencompass.registry import (DICT_POSTPROCESSORS, ICL_PROMPT_TEMPLATES, from opencompass.utils import build_dataset_from_cfg, build_model_from_cfg from opencompass.utils.logging import get_logger +logger = get_logger(__name__) + class GenericLLMEvaluator(BaseEvaluator): """Generic LLM evaluator. @@ -23,6 +26,7 @@ class GenericLLMEvaluator(BaseEvaluator): judge_cfg (ConfigDict): The config for Judge LLM. dataset_cfg (ConfigDict): The config for dataset. pred_postprocessor (ConfigDict): The config for postprocessor. + used for the prediction results. dict_postprocessor (ConfigDict): The config for postprocessor, used for evaluation results dict. """ @@ -36,8 +40,7 @@ class GenericLLMEvaluator(BaseEvaluator): dict_postprocessor: Optional[ConfigDict] = None, keep_predictions: bool = False, ) -> None: - - self.logger = get_logger() + super().__init__(pred_postprocessor=pred_postprocessor) # If judge_cfg is not provided, fall back to the default configuration if not judge_cfg: self.judge_cfg = self.default_judge_cfg @@ -54,14 +57,14 @@ class GenericLLMEvaluator(BaseEvaluator): self.dict_postprocessor = dict_postprocessor self.pred_postprocessor = pred_postprocessor - def build_inferencer(self, ): + def build_inferencer(self): """Build LLM Inference.""" - output_path = self._out_dir - self.output_path = f'{output_path}.json' - out_dir, out_name = osp.split(output_path) - out_name = f'{out_name}.json' - self.logger.info( + self.output_path = f'{self._out_dir}_replica{self.dataset_replica_idx}.json' # noqa + logger.info(f'LLM judge details will be saved at:{self.output_path}') + out_dir, out_name = osp.split(self.output_path) + + logger.info( f'Set self.output_path to {self.output_path} for current task') assert self.output_path is not None, 'output_path is None' @@ -98,7 +101,6 @@ class GenericLLMEvaluator(BaseEvaluator): # -------------- Build Inferencer ---------------- self.build_inferencer() - # ---------------- Process Predictions ------------------ predictions = self.pred_postprocess(predictions) @@ -178,7 +180,7 @@ class GenericLLMEvaluator(BaseEvaluator): if self.dict_postprocessor is None: return output else: - kwargs = self.dict_postprocessor + kwargs = deepcopy(self.dict_postprocessor) proc = DICT_POSTPROCESSORS.get(kwargs.pop('type')) sig = inspect.signature(proc) if 'dataset' in sig.parameters: @@ -192,7 +194,8 @@ class GenericLLMEvaluator(BaseEvaluator): @property def default_judge_cfg(self): from opencompass.models import OpenAISDK - + logger.info('Please set your judge model in `OC_JUDGE_MODEL`, \ + `OC_JUDGE_API_KEY`, `OC_JUDGE_API_BASE` environment variables.') DEFAULT_JUDGE_CFG = dict( type=OpenAISDK, path=os.environ['OC_JUDGE_MODEL'], diff --git a/opencompass/openicl/icl_evaluator/math_evaluator.py b/opencompass/evaluator/math_evaluator.py similarity index 98% rename from opencompass/openicl/icl_evaluator/math_evaluator.py rename to opencompass/evaluator/math_evaluator.py index 16db89f8..dfbc77dd 100644 --- a/opencompass/openicl/icl_evaluator/math_evaluator.py +++ b/opencompass/evaluator/math_evaluator.py @@ -3,9 +3,9 @@ from opencompass.registry import ICL_EVALUATORS @ICL_EVALUATORS.register_module() -class MATHEvaluator(BaseEvaluator): +class MATHVerifyEvaluator(BaseEvaluator): - def score(self, 
predictions, references): + def score(self, predictions, references, test_set=None): try: from latex2sympy2_extended import NormalizationConfig from math_verify import (ExprExtractionConfig, diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index f48869c5..e67e866e 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -556,28 +556,27 @@ class OpenAI(BaseAPIModel): class OpenAISDK(OpenAI): - def __init__( - self, - path: str = 'gpt-3.5-turbo', - max_seq_len: int = 16384, - query_per_second: int = 1, - rpm_verbose: bool = False, - retry: int = 2, - key: str | List[str] = 'ENV', - org: str | List[str] | None = None, - meta_template: Dict | None = None, - openai_api_base: str | List[str] = OPENAISDK_API_BASE, - openai_proxy_url: Optional[str] = None, - mode: str = 'none', - logprobs: bool | None = False, - top_logprobs: int | None = None, - temperature: float | None = None, - tokenizer_path: str | None = None, - extra_body: Dict | None = None, - verbose: bool = False, - status_code_mappings: dict = {}, - think_tag: str = '', - ): + def __init__(self, + path: str = 'gpt-3.5-turbo', + max_seq_len: int = 16384, + query_per_second: int = 1, + rpm_verbose: bool = False, + retry: int = 2, + key: str | List[str] = 'ENV', + org: str | List[str] | None = None, + meta_template: Dict | None = None, + openai_api_base: str | List[str] = OPENAISDK_API_BASE, + openai_proxy_url: Optional[str] = None, + mode: str = 'none', + logprobs: bool | None = False, + top_logprobs: int | None = None, + temperature: float | None = None, + tokenizer_path: str | None = None, + extra_body: Dict | None = None, + verbose: bool = False, + http_client_cfg: dict = {}, + status_code_mappings: dict = {}, + think_tag: str = ''): super().__init__( path, max_seq_len, @@ -605,20 +604,20 @@ class OpenAISDK(OpenAI): else: self.openai_api_base = openai_api_base - if self.proxy_url is None: - self.openai_client = OpenAI(base_url=self.openai_api_base, - api_key=key) - else: - proxies = { - 'http://': self.proxy_url, - 'https://': self.proxy_url, - } + if self.proxy_url or http_client_cfg: + if self.proxy_url: + http_client_cfg['proxies'] = { + 'http://': self.proxy_url, + 'https://': self.proxy_url, + } + + self.openai_client = OpenAI( + base_url=self.openai_api_base, + api_key=key, + http_client=httpx.Client( + **http_client_cfg) if http_client_cfg else None, + ) - self.openai_client = OpenAI( - base_url=self.openai_api_base, - api_key=key, - http_client=httpx.Client(proxies=proxies), - ) if self.verbose: self.logger.info(f'Used openai_client: {self.openai_client}') self.status_code_mappings = status_code_mappings @@ -679,6 +678,7 @@ class OpenAISDK(OpenAI): try: if self.verbose: self.logger.info('Start calling OpenAI API') + responses = self.openai_client.chat.completions.create( **query_data, timeout=timeout) # timeout in seconds if self.verbose: @@ -689,7 +689,6 @@ class OpenAISDK(OpenAI): self.logger.info(responses) except Exception: pass # noqa F841 - # Check if response is empty or content is empty if (not responses.choices or not responses.choices[0].message or diff --git a/opencompass/openicl/icl_evaluator/__init__.py b/opencompass/openicl/icl_evaluator/__init__.py index 0fb77db3..f043982b 100644 --- a/opencompass/openicl/icl_evaluator/__init__.py +++ b/opencompass/openicl/icl_evaluator/__init__.py @@ -14,4 +14,3 @@ from .icl_misc_evaluator import AveragePPLEvaluator # noqa from .icl_plugin_evaluator import TEvalEvaluator # noqa from .icl_toxic_evaluator import 
ToxicEvaluator # noqa from .lm_evaluator import LMEvaluator # noqa -from .math_evaluator import MATHEvaluator # noqa diff --git a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py index 10cc3fe4..40cdb6b3 100644 --- a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py @@ -8,6 +8,11 @@ import numpy as np from datasets import Dataset from scipy.stats import hypergeom +from opencompass.registry import TEXT_POSTPROCESSORS +from opencompass.utils.logging import get_logger + +logger = get_logger(__name__) + def compute_pass_at_k(n, c, k): if n - c < k: @@ -39,14 +44,19 @@ def compute_mg_pass_at_k(n, c, k): class BaseEvaluator: - def __init__(self) -> None: - pass + def __init__(self, pred_postprocessor=None) -> None: + self.pred_postprocessor = pred_postprocessor + self._dataset_replica_idx = 0 # Default value for dataset_replica_idx @property def output_dir(self): # please see opencompass/opencompass/tasks/openicl_eval.py Line 197-200 return self._out_dir + @property + def dataset_replica_idx(self): + return self._dataset_replica_idx + def group(self, n: int, details: List[Dict[str, Any]], test_set: Dataset) -> Dict[str, Any]: example2replications = {} @@ -82,6 +92,15 @@ class BaseEvaluator: [detail[metric] for detail in details]) return g_passk_details + def pred_postprocess(self, predictions: List) -> Dict: + if not hasattr( + self, 'pred_postprocessor') or self.pred_postprocessor is None: + return predictions + else: + kwargs = deepcopy(self.pred_postprocessor) + proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type')) + return [proc(pred, **kwargs) for pred in predictions] + def evaluate( self, k: Union[int, List[int]], @@ -98,10 +117,14 @@ class BaseEvaluator: raise ValueError( 'Predictions and references must have the same length') - real_size = len(original_dataset) // n + real_size = len(original_dataset) // n # dataset size of each replica all_details = [] all_results = [] + + # Run evaluation for each replica for i in range(n): + self._dataset_replica_idx = i + logger.info(f'Running {i}-th replica of evaluation') def select_fn(i, real_size, x): if isinstance(x, Dataset): @@ -111,11 +134,14 @@ class BaseEvaluator: else: return x - results = self.score( - **{ - key: select_fn(i, real_size, value) - for key, value in score_kwargs.items() - }) + current_params = { + key: select_fn(i, real_size, value) + for key, value in score_kwargs.items() + } + + current_params['predictions'] = self.pred_postprocess( + current_params['predictions']) + results = self.score(**current_params) details = results.pop('details', None) if details is not None: if isinstance(details, Dict): @@ -124,11 +150,11 @@ class BaseEvaluator: all_results.append(results) eval_results = {} - for single_results in all_results: - for key in single_results: + for single_replica_results in all_results: + for key in single_replica_results: if key not in eval_results: eval_results[key] = [] - eval_results[key].append(single_results[key]) + eval_results[key].append(single_replica_results[key]) for key in deepcopy(eval_results): if isinstance(eval_results[key][0], float) or isinstance( eval_results[key][0], int): @@ -138,9 +164,8 @@ class BaseEvaluator: eval_results.pop(key) else: eval_results[key] = np.mean(eval_results[key]) - else: - eval_results[key] = eval_results[key][0] + # Calculate the additional metrics grouped_examples = self.group(n, all_details, original_dataset) can_calculate = False if 
len(all_details) != 0: @@ -158,6 +183,10 @@ class BaseEvaluator: elif example['detail'].get('is_correct', None) is not None: can_calculate = True c += int(example['detail']['is_correct']) + elif example['detail'].get('cascade_correct', + None) is not None: + can_calculate = True + c += int(example['detail']['cascade_correct']) k_list = [k] if isinstance(k, int) else k if can_calculate and n > 1 and max(k_list) > 1: diff --git a/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py b/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py index 27270b95..8a6960df 100644 --- a/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py @@ -1,10 +1,11 @@ import os import random -from typing import List +from typing import List, Optional import evaluate import numpy as np from datasets import Dataset +from mmengine.config import ConfigDict from opencompass.registry import ICL_EVALUATORS @@ -19,12 +20,17 @@ class HuggingfaceEvaluator(BaseEvaluator): seed (int): There exists some randomness during the calculation of some metrics, thus we set a fixed random seed for reproducing. Defaults to 0. + pred_postprocessor (optional): Function or configuration for prediction + post-processing. """ - def __init__(self, metric: str, seed: int = 0) -> None: + def __init__(self, + metric: str, + seed: int = 0, + pred_postprocessor=None) -> None: self.metric = metric self.seed = seed - super().__init__() + super().__init__(pred_postprocessor=pred_postprocessor) def _preprocess(self, predictions: List, references: List) -> dict: """Preprocess the final predictions and references to needed format. @@ -52,7 +58,10 @@ class HuggingfaceEvaluator(BaseEvaluator): """ return scores - def score(self, predictions: List, references: List) -> dict: + def score(self, + predictions: List, + references: List, + test_set=None) -> dict: """Calculate scores. Args: @@ -92,10 +101,15 @@ class HuggingfaceEvaluator(BaseEvaluator): class AccEvaluator(HuggingfaceEvaluator): """Accuracy evaluator.""" - def __init__(self) -> None: - super().__init__(metric='accuracy') + def __init__(self, + pred_postprocessor: Optional[ConfigDict] = None) -> None: + super().__init__(metric='accuracy', + pred_postprocessor=pred_postprocessor) - def _preprocess(self, predictions: List, references: List) -> dict: + def _preprocess(self, + predictions: List, + references: List, + test_set=None) -> dict: """Preprocess the final predictions and references to needed format. Args: @@ -187,8 +201,9 @@ class RougeEvaluator(HuggingfaceEvaluator): Note: this evaluator is not suitable for chinese datasets. """ - def __init__(self) -> None: - super().__init__(metric='rouge') + def __init__(self, + pred_postprocessor: Optional[ConfigDict] = None) -> None: + super().__init__(metric='rouge', pred_postprocessor=pred_postprocessor) def _postprocess(self, scores: dict) -> dict: """Postprocess for final scores. 
@@ -206,8 +221,10 @@ class RougeEvaluator(HuggingfaceEvaluator): class BleuEvaluator(HuggingfaceEvaluator): """Bleu evaluator.""" - def __init__(self) -> None: - super().__init__(metric='sacrebleu') + def __init__(self, + pred_postprocessor: Optional[ConfigDict] = None) -> None: + super().__init__(metric='sacrebleu', + pred_postprocessor=pred_postprocessor) class BleuFloresEvaluator(HuggingfaceEvaluator): diff --git a/opencompass/partitioners/num_worker.py b/opencompass/partitioners/num_worker.py index e916a17d..f9ab4a89 100644 --- a/opencompass/partitioners/num_worker.py +++ b/opencompass/partitioners/num_worker.py @@ -26,6 +26,7 @@ class NumWorkerPartitioner(BasePartitioner): dataset_size_path (str): The path to the dataset size cache file. keep_keys (list[str]): The keys to be kept from the experiment config to the task config. + force_rebuild (bool): Whether to force rebuild dataset to get size. """ def __init__(self, @@ -35,7 +36,8 @@ class NumWorkerPartitioner(BasePartitioner): min_task_size: int = 16, strategy: str = 'heuristic', dataset_size_path: str = '.cache/dataset_size.json', - keep_keys: Optional[List[str]] = None): + keep_keys: Optional[List[str]] = None, + force_rebuild: bool = False): super().__init__(out_dir=out_dir, keep_keys=keep_keys) if strategy == 'split' and num_worker is not None: self.logger.warning('num_worker is ignored with split.') @@ -44,6 +46,7 @@ class NumWorkerPartitioner(BasePartitioner): self.num_split = num_split or num_worker self.min_task_size = min_task_size self.dataset_size_path = dataset_size_path + self.force_rebuild = force_rebuild assert strategy in ('heuristic', 'split'), \ f'Unsupported partition strategy: {strategy}. '\ 'Supported strategies are: `heuristic`, `split` .' @@ -106,7 +109,7 @@ class NumWorkerPartitioner(BasePartitioner): @property def dataset_size(self): if not hasattr(self, '_dataset_size'): - if osp.exists(self.dataset_size_path): + if not self.force_rebuild and osp.exists(self.dataset_size_path): self._dataset_size = mmengine.load(self.dataset_size_path) else: self._dataset_size = {} @@ -130,22 +133,25 @@ class NumWorkerPartitioner(BasePartitioner): def get_size(self, dataset: ConfigDict) -> int: dataset_abbr = dataset_abbr_from_cfg(dataset) - test_range = dataset.reader_cfg.get('test_range', '') - if dataset_abbr in self.dataset_size: + # If not forcing rebuild and data exists in cache, use the cache + if not self.force_rebuild and dataset_abbr in self.dataset_size: actual_size = eval('len(range(self.dataset_size[dataset_abbr])' f'{test_range})') return actual_size + # Otherwise, rebuild the dataset to get its size dataset = build_dataset_from_cfg(dataset) self.dataset_size[dataset_abbr] = len(dataset.test) - mmengine.mkdir_or_exist('.cache/') - mmengine.dump(self.dataset_size, - self.dataset_size_path, - indent=4, - ensure_ascii=False) + # Save to cache file + if self.dataset_size_path: + mmengine.mkdir_or_exist('.cache/') + mmengine.dump(self.dataset_size, + self.dataset_size_path, + indent=4, + ensure_ascii=False) actual_size = eval('len(range(self.dataset_size[dataset_abbr])' f'{test_range})') diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py index fd0a773c..ef3ca363 100644 --- a/opencompass/tasks/openicl_eval.py +++ b/opencompass/tasks/openicl_eval.py @@ -146,11 +146,16 @@ class OpenICLEvalTask(BaseTask): preds = [] i = 1 while osp.exists(osp.realpath(filename)): - sub_preds = mmengine.load(filename) - preds.extend( - [sub_preds[str(i)] for i in range(len(sub_preds))]) - filename = root 
+ f'_{i}' + ext - i += 1 + try: + sub_preds = mmengine.load(filename) + preds.extend( + [sub_preds[str(i)] for i in range(len(sub_preds))]) + filename = root + f'_{i}' + ext + i += 1 + except Exception as e: + self.logger.error( + f'Error loading prediction file {filename}: {e}') + break pred_dicts = copy.deepcopy(preds) preds = {k: [pred.get(k) for pred in preds] for k in preds[0]} diff --git a/opencompass/utils/logging.py b/opencompass/utils/logging.py index 5250c918..fb3db986 100644 --- a/opencompass/utils/logging.py +++ b/opencompass/utils/logging.py @@ -2,6 +2,8 @@ import logging import os from mmengine.logging import MMLogger +from rich.console import Console +from rich.syntax import Syntax _nameToLevel = { 'CRITICAL': logging.CRITICAL, @@ -79,3 +81,14 @@ class FilterDuplicateMessage(logging.Filter): self.seen.add(record.msg) return True return False + + +def pretty_print_config(cfg): + """Pretty print config using the rich library.""" + console = Console() + config_str = cfg.pretty_text + syntax = Syntax(config_str, + 'python', + theme='solarized-dark', + line_numbers=True) + console.print(syntax) diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index 772c0a8a..281ecdee 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -150,6 +150,13 @@ def get_config_from_arg(args) -> Config: dataset['meta_path'] = args.custom_dataset_meta_path dataset = make_custom_dataset_config(dataset) datasets.append(dataset) + ## apply the dataset repeat runs + if len(datasets) > 0 and args.dataset_num_runs > 1: + logger.warning(f'User has set the --dataset-num-runs, the datasets will be evaluated with {args.dataset_num_runs} runs.') + for _dataset in datasets: + logger.warning(f"The default num runs of {_dataset['abbr']} is: {_dataset['n']}, changed into: {args.dataset_num_runs}") + _dataset['n'] = args.dataset_num_runs + _dataset['k'] = args.dataset_num_runs # parse model args if not args.models and not args.hf_path: @@ -204,7 +211,6 @@ def get_config_from_arg(args) -> Config: summarizers_dir = [ os.path.join(args.config_dir, 'summarizers'), os.path.join(default_configs_dir, './summarizers'), - ] # Check if summarizer_arg contains '/' @@ -308,7 +314,7 @@ def change_accelerator(models, accelerator): model_kwargs=model_kwargs, max_seq_len=model.get('max_seq_len', None), max_out_len=model['max_out_len'], - batch_size=16, + batch_size=model.get('batch_size', 16), run_cfg=model['run_cfg'], stop_words=model.get('stop_words', []), ) @@ -335,7 +341,7 @@ def change_accelerator(models, accelerator): gen_config=gen_config, max_seq_len=model.get('max_seq_len', None), max_out_len=model['max_out_len'], - batch_size=16, + batch_size=model.get('batch_size', 16), run_cfg=model['run_cfg'], stop_words=model.get('stop_words', []), )
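# Closing worked example (not code from this patch): with `--dataset-num-runs N`, each dataset's
# `n` and `k` are overwritten, and BaseEvaluator aggregates the replicas with the standard
# unbiased pass@k estimator. The hypergeometric form in icl_base_evaluator.py is equivalent to
# the combinatorial one written out here.
from math import comb


def pass_at_k(n: int, c: int, k: int) -> float:
    """Probability that at least one of k sampled runs (out of n, with c correct) is correct."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)


# e.g. 32 runs per problem with 8 correct completions, reported as pass@4
print(round(pass_at_k(32, 8, 4), 4))  # ~0.7045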