From 75e7834b59c422d821f1299559c530a083e4abd8 Mon Sep 17 00:00:00 2001
From: Myhs_phz <demarcia2014@126.com>
Date: Mon, 14 Apr 2025 20:18:47 +0800
Subject: [PATCH 01/10] [Feature] Add Datasets: ClimateQA,Physics (#2017)

* feat ClimateQA

* feat PHYSICS

* fix

* fix

* fix

* fix
---
 dataset-index.yml                             |  14 ++
 .../ClimaQA/ClimaQA_Gold_llm_judge_gen.py     |   4 +
 .../ClimaQA_Gold_llm_judge_gen_f15343.py      | 164 ++++++++++++++++++
 .../ClimaQA/ClimaQA_Silver_llm_judge_gen.py   |   4 +
 .../ClimaQA_Silver_llm_judge_gen_f15343.py    | 160 +++++++++++++++++
 .../datasets/PHYSICS/PHYSICS_llm_judge_gen.py |   4 +
 .../PHYSICS/PHYSICS_llm_judge_gen_a133a2.py   | 131 ++++++++++++++
 .../configs/summarizers/groups/PHYSICS.py     |  14 ++
 opencompass/datasets/__init__.py              |   2 +
 opencompass/datasets/climaqa.py               |  30 ++++
 opencompass/datasets/physics.py               |  30 ++++
 opencompass/utils/datasets_info.py            |  31 ++++
 12 files changed, 588 insertions(+)
 create mode 100644 opencompass/configs/datasets/ClimaQA/ClimaQA_Gold_llm_judge_gen.py
 create mode 100644 opencompass/configs/datasets/ClimaQA/ClimaQA_Gold_llm_judge_gen_f15343.py
 create mode 100644 opencompass/configs/datasets/ClimaQA/ClimaQA_Silver_llm_judge_gen.py
 create mode 100644 opencompass/configs/datasets/ClimaQA/ClimaQA_Silver_llm_judge_gen_f15343.py
 create mode 100644 opencompass/configs/datasets/PHYSICS/PHYSICS_llm_judge_gen.py
 create mode 100644 opencompass/configs/datasets/PHYSICS/PHYSICS_llm_judge_gen_a133a2.py
 create mode 100644 opencompass/configs/summarizers/groups/PHYSICS.py
 create mode 100644 opencompass/datasets/climaqa.py
 create mode 100644 opencompass/datasets/physics.py

diff --git a/dataset-index.yml b/dataset-index.yml
index f1581c21..89fde388 100644
--- a/dataset-index.yml
+++ b/dataset-index.yml
@@ -997,3 +997,17 @@
     paper: https://arxiv.org/pdf/2502.14739
     configpath: opencompass/configs/datasets/supergpqa
     configpath_llmjudge: ''
+- climaqa:
+    name: ClimaQA
+    category: Science
+    paper: https://arxiv.org/pdf/2410.16701
+    configpath: ''
+    configpath_llmjudge:
+        - opencompass/configs/datasets/ClimaQA/ClimaQA_Gold_llm_judge.py
+        - opencompass/configs/datasets/ClimaQA/ClimaQA_Silver_llm_judge.py
+- physics:
+    name: PHYSICS
+    category: Science
+    paper: https://arxiv.org/pdf/2503.21821
+    configpath: ''
+    configpath_llmjudge: opencompass/configs/datasets/PHYSICS/PHYSICS_llm_judge_gen_a133a2.py
\ No newline at end of file
diff --git a/opencompass/configs/datasets/ClimaQA/ClimaQA_Gold_llm_judge_gen.py b/opencompass/configs/datasets/ClimaQA/ClimaQA_Gold_llm_judge_gen.py
new file mode 100644
index 00000000..599b6b82
--- /dev/null
+++ b/opencompass/configs/datasets/ClimaQA/ClimaQA_Gold_llm_judge_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .ClimaQA_Gold_llm_judge_gen_f15343 import climaqa_datasets  # noqa: F401, F403
\ No newline at end of file
diff --git a/opencompass/configs/datasets/ClimaQA/ClimaQA_Gold_llm_judge_gen_f15343.py b/opencompass/configs/datasets/ClimaQA/ClimaQA_Gold_llm_judge_gen_f15343.py
new file mode 100644
index 00000000..1ec9283d
--- /dev/null
+++ b/opencompass/configs/datasets/ClimaQA/ClimaQA_Gold_llm_judge_gen_f15343.py
@@ -0,0 +1,164 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import ClimaQADataset, generic_llmjudge_postprocess
+
+from opencompass.evaluator import GenericLLMEvaluator
+
+climaqa_gold_sets = [
+    'mcq',
+    'cloze',
+    'ffq'
+]
+
+GRADER_TEMPLATE_mcq = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
+
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. The answer may be one of the four options: a, b, c, or d. Only when the options given by prediction are strictly consistent with the answer, the prediction can be considered correct.
+    3. If the prediction is given with 'The answer is:', please ignore the 'The answer is:', and only judge whether the candidate's answer is consistent with the standard answer.
+    
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+
+    <Original Question Begin>: \n{input}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+
+    Judging the correctness of candidates' answers:
+""".strip()
+
+
+GRADER_TEMPLATE_cloze = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
+
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. The form of the answer is a word or a phrase. Please strictly compare the prediction and the answer. Only when the prediction and the answer are exactly the same, will the prediction be considered correct; otherwise, it will be considered incorrect.
+    3. If the prediction is given with 'The answer is:', please ignore the 'The answer is:' and only judge whether the candidate's answer is consistent with the standard answer.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+
+    <Original Question Begin>: \n{input}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+
+    Judging the correctness of candidates' answers:
+""".strip()
+
+
+GRADER_TEMPLATE_ffq = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
+
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. The type of question is open-ended Q&A. Please compare whether the prediction is close enough to the meaning of the answer and whether the prediction covers each key point in the answer. If the prediction meets the above requirements, it can be considered very close to the answer.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+    5. If the prediction is given with 'The answer is:', please ignore the 'The answer is:' and only judge whether the candidate's answer is very close to the standard answer.
+
+    Please judge whether the following answers are close to the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: very close to the answer
+    B: not very close to the answer
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either A or B. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+
+    <Original Question Begin>: \n{input}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+
+    Judging the correctness of candidates' answers:
+""".strip()
+
+
+climaqa_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+climaqa_datasets = []
+
+for _task in climaqa_gold_sets:
+
+    if _task == 'mcq':
+        GRADER_TEMPLATE = GRADER_TEMPLATE_mcq
+        infer_prompt = f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification. The question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: a\"\n\nQ: {{input}}\nA: "
+    if _task == 'ffq':
+        GRADER_TEMPLATE = GRADER_TEMPLATE_ffq
+        infer_prompt = f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\".\n\nQ: {{input}}\nA: "
+    if _task == 'cloze':
+        GRADER_TEMPLATE = GRADER_TEMPLATE_cloze
+        infer_prompt = f"Fill the <Mask> in the sentence. Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\".\n\nQ: {{input}}\nA: "
+
+    climaqa_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=infer_prompt,
+                    )
+                ]
+            ),
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    climaqa_eval_cfg = dict(
+        evaluator=dict(
+            type=GenericLLMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                        )
+                    ],
+                    round=[
+                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                    ],
+                ),
+            ),
+            dataset_cfg=dict(
+                type=ClimaQADataset,
+                path='opencompass/ClimaQA-Gold',
+                task=_task,
+                abbr='ClimaQA_Gold_' + _task,
+                reader_cfg=climaqa_reader_cfg,
+            ),
+            judge_cfg=dict(),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        ),
+        pred_role='BOT',
+    )
+
+    climaqa_datasets.append(
+        dict(
+            abbr='ClimaQA_Gold_' + _task,
+            type=ClimaQADataset,
+            path='opencompass/ClimaQA-Gold',
+            task=_task,
+            reader_cfg=climaqa_reader_cfg,
+            infer_cfg=climaqa_infer_cfg,
+            eval_cfg=climaqa_eval_cfg,
+        )
+    )
+
+
diff --git a/opencompass/configs/datasets/ClimaQA/ClimaQA_Silver_llm_judge_gen.py b/opencompass/configs/datasets/ClimaQA/ClimaQA_Silver_llm_judge_gen.py
new file mode 100644
index 00000000..958ca7d3
--- /dev/null
+++ b/opencompass/configs/datasets/ClimaQA/ClimaQA_Silver_llm_judge_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .ClimaQA_Silver_llm_judge_gen_f15343 import climaqa_datasets  # noqa: F401, F403
\ No newline at end of file
diff --git a/opencompass/configs/datasets/ClimaQA/ClimaQA_Silver_llm_judge_gen_f15343.py b/opencompass/configs/datasets/ClimaQA/ClimaQA_Silver_llm_judge_gen_f15343.py
new file mode 100644
index 00000000..afe64d9d
--- /dev/null
+++ b/opencompass/configs/datasets/ClimaQA/ClimaQA_Silver_llm_judge_gen_f15343.py
@@ -0,0 +1,160 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import ClimaQADataset, generic_llmjudge_postprocess
+
+from opencompass.evaluator import GenericLLMEvaluator
+
+climaqa_silver_sets = [
+    'mcq',
+    'cloze',
+    'ffq'
+]
+
+GRADER_TEMPLATE_mcq = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
+
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. The answer may be one of the four options: a, b, c, or d. Only when the options given by prediction are strictly consistent with the answer, the prediction can be considered correct.
+    3. If the prediction is given with 'The answer is:', please ignore the 'The answer is:', and only judge whether the candidate's answer is consistent with the standard answer.
+
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+
+    <Original Question Begin>: \n{input}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+
+    Judging the correctness of candidates' answers:
+""".strip()
+
+GRADER_TEMPLATE_cloze = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
+
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. The form of the answer is a word or a phrase. Please strictly compare the prediction and the answer. Only when the prediction and the answer are exactly the same, will the prediction be considered correct; otherwise, it will be considered incorrect.
+    3. If the prediction is given with 'The answer is:', please ignore the 'The answer is:' and only judge whether the candidate's answer is consistent with the standard answer.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+
+    <Original Question Begin>: \n{input}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+
+    Judging the correctness of candidates' answers:
+""".strip()
+
+GRADER_TEMPLATE_ffq = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
+
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. The type of question is open-ended Q&A. Please compare whether the prediction is close enough to the meaning of the answer and whether the prediction covers each key point in the answer. If the prediction meets the above requirements, it can be considered very close to the answer.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+    5. If the prediction is given with 'The answer is:', please ignore the 'The answer is:' and only judge whether the candidate's answer is very close to the standard answer.
+
+    Please judge whether the following answers are close to the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: very close to the answer
+    B: not very close to the answer
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either A or B. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+
+    <Original Question Begin>: \n{input}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+
+    Judging the correctness of candidates' answers:
+""".strip()
+
+climaqa_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+climaqa_datasets = []
+
+for _task in climaqa_silver_sets:
+
+    if _task == 'mcq':
+        GRADER_TEMPLATE = GRADER_TEMPLATE_mcq
+        infer_prompt = f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification. The question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: a\"\n\nQ: {{input}}\nA: "
+    if _task == 'ffq':
+        GRADER_TEMPLATE = GRADER_TEMPLATE_ffq
+        infer_prompt = f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\".\n\nQ: {{input}}\nA: "
+    if _task == 'cloze':
+        GRADER_TEMPLATE = GRADER_TEMPLATE_cloze
+        infer_prompt = f"Fill the <Mask> in the sentence. Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\".\n\nQ: {{input}}\nA: "
+
+    climaqa_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=infer_prompt,
+                    )
+                ]
+            ),
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    climaqa_eval_cfg = dict(
+        evaluator=dict(
+            type=GenericLLMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                        )
+                    ],
+                    round=[
+                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                    ],
+                ),
+            ),
+            dataset_cfg=dict(
+                type=ClimaQADataset,
+                path='opencompass/ClimaQA-Silver',
+                task=_task,
+                abbr='ClimaQA_Silver_' + _task,
+                reader_cfg=climaqa_reader_cfg,
+            ),
+            judge_cfg=dict(),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        ),
+        pred_role='BOT',
+    )
+
+    climaqa_datasets.append(
+        dict(
+            abbr='ClimaQA_Silver_' + _task,
+            type=ClimaQADataset,
+            path='opencompass/ClimaQA-Silver',
+            task=_task,
+            reader_cfg=climaqa_reader_cfg,
+            infer_cfg=climaqa_infer_cfg,
+            eval_cfg=climaqa_eval_cfg,
+        )
+    )
+
diff --git a/opencompass/configs/datasets/PHYSICS/PHYSICS_llm_judge_gen.py b/opencompass/configs/datasets/PHYSICS/PHYSICS_llm_judge_gen.py
new file mode 100644
index 00000000..3859ddde
--- /dev/null
+++ b/opencompass/configs/datasets/PHYSICS/PHYSICS_llm_judge_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .PHYSICS_llm_judge_gen_a133a2 import physics_datasets  # noqa: F401, F403
\ No newline at end of file
diff --git a/opencompass/configs/datasets/PHYSICS/PHYSICS_llm_judge_gen_a133a2.py b/opencompass/configs/datasets/PHYSICS/PHYSICS_llm_judge_gen_a133a2.py
new file mode 100644
index 00000000..79b8d023
--- /dev/null
+++ b/opencompass/configs/datasets/PHYSICS/PHYSICS_llm_judge_gen_a133a2.py
@@ -0,0 +1,131 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (
+    PHYSICSDataset,
+    generic_llmjudge_postprocess,
+)
+from opencompass.evaluator import GenericLLMEvaluator
+
+physics_sets = [
+    'atomic_dataset_textonly',
+    'electro_dataset_textonly',
+    'mechanics_dataset_textonly',
+    'optics_dataset_textonly',
+    'quantum_dataset_textonly',
+    'statistics_dataset_textonly',
+]
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
+
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some questions may include multiple sub questions and sub answers. Each sub answer is given after a guide character in the form of <Answer 1:> or <Answer 2:>, etc. Please note that only when all sub predictions given in prediction correspond one-to-one with the answer and are all correct, will the prediction be considered correct; otherwise, it will be considered incorrect.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+    5. The final answers in the prediction are generally given with \\boxed{}. If you cannot find sufficient \\boxed{} in the prediction, please try to find matching answers from other places within the prediction as much as possible.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: All Sub Predictions Are Correct
+    B: Not Every Sub Predictions is Correct
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either A, B. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+    <Original Question Begin>: \n{input}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+
+    Judging the correctness of candidates' answers:
+""".strip()
+
+# GRADER_TEMPLATE = """
+#     Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
+#
+#     Here are some evaluation criteria:
+#     1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+#     2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+#     3. Some questions may include multiple sub questions and sub answers. Each sub answer is given after a guide character in the form of <Answer 1:> or <Answer 2:>, etc. Please note that as long as at least one correct answer appears in the prediction, the prediction is considered correct.
+#     4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+#     5. The final answers in the prediction are generally given with \\boxed{}. If you cannot find sufficient \\boxed{} in the prediction, please try to find matching answers from other places within the prediction as much as possible.
+#
+#     Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+#     A: At Least One Sub Prediction is Correct
+#     B: All Sub Predictions are Incorrect
+#     Just return the letters "A" or "B", with no text around it.
+#
+#     Here is your task. Simply reply with either A, B. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+#
+#     <Original Question Begin>: \n{input}\n<Original Question End>\n\n
+#     <Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
+#     <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+#
+#     Judging the correctness of candidates' answers:
+# """.strip()
+
+physics_reader_cfg = dict(input_columns=['input'], output_column='target')
+
+physics_datasets = []
+
+for _name in physics_sets:
+
+    physics_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=f'Answer the given question step by step. Begin by explaining your reasoning process clearly. Conclude by providing the final answers at the end in LaTeX boxed format. Think step by step before answering. It should be noted that the question may include multiple sub questions, please ensure that each question is answered in order.\n\nQ: {{input}}\nA: ',
+                    )
+                ]
+            ),
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    physics_eval_cfg = dict(
+        evaluator=dict(
+            type=GenericLLMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                        )
+                    ],
+                    round=[
+                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                    ],
+                ),
+            ),
+            dataset_cfg=dict(
+                type=PHYSICSDataset,
+                path='opencompass/PHYSICS-textonly',
+                abbr='PHYSICS_' + _name,
+                name=_name,
+                reader_cfg=physics_reader_cfg,
+            ),
+            judge_cfg=dict(),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        ),
+        pred_role='BOT',
+    )
+
+    physics_datasets.append(
+        dict(
+            abbr='PHYSICS_' + _name,
+            type=PHYSICSDataset,
+            path='opencompass/PHYSICS-textonly',
+            name=_name,
+            reader_cfg=physics_reader_cfg,
+            infer_cfg=physics_infer_cfg,
+            eval_cfg=physics_eval_cfg,
+        )
+    )
+
diff --git a/opencompass/configs/summarizers/groups/PHYSICS.py b/opencompass/configs/summarizers/groups/PHYSICS.py
new file mode 100644
index 00000000..eff80721
--- /dev/null
+++ b/opencompass/configs/summarizers/groups/PHYSICS.py
@@ -0,0 +1,14 @@
+physics_summary_groups = []
+
+# bbh
+_physcis = [
+    'atomic_dataset_textonly',
+    'electro_dataset_textonly',
+    'mechanics_dataset_textonly',
+    'optics_dataset_textonly',
+    'quantum_dataset_textonly',
+    'statistics_dataset_textonly',
+]
+
+_physcis = ['PHYSICS_' + s for s in _physcis]
+physics_summary_groups.append({'name': 'PHYSICS', 'subsets': _physcis})
\ No newline at end of file
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 3e2d0eef..82314e9e 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -25,6 +25,7 @@ from .chinese_simpleqa import *  # noqa: F401, F403
 from .cibench import *  # noqa: F401, F403
 from .circular import *  # noqa: F401, F403
 from .civilcomments import *  # noqa: F401, F403
+from .climaqa import *  # noqa: F401, F403
 from .clozeTest_maxmin import *  # noqa: F401, F403
 from .cluewsc import *  # noqa: F401, F403
 from .cmb import *  # noqa: F401, F403
@@ -110,6 +111,7 @@ from .obqa import *  # noqa: F401, F403
 from .olymmath import *  # noqa: F401, F403
 from .OlympiadBench import *  # noqa: F401, F403
 from .OpenFinData import *  # noqa: F401, F403
+from .physics import *  # noqa: F401, F403
 from .piqa import *  # noqa: F401, F403
 from .py150 import *  # noqa: F401, F403
 from .qasper import *  # noqa: F401, F403
diff --git a/opencompass/datasets/climaqa.py b/opencompass/datasets/climaqa.py
new file mode 100644
index 00000000..b11988b7
--- /dev/null
+++ b/opencompass/datasets/climaqa.py
@@ -0,0 +1,30 @@
+import os
+
+from datasets import load_dataset
+
+from opencompass.datasets.base import BaseDataset
+from opencompass.registry import LOAD_DATASET
+from opencompass.utils import get_data_path
+
+
+@LOAD_DATASET.register_module()
+class ClimaQADataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str, task: str, **kwargs):
+
+        path = get_data_path(path)
+        path = os.path.join(path, task)
+        climateqa = load_dataset(path)['train']
+
+        input_column = []
+        for i in range(len(climateqa)):
+            if 'Options' in climateqa[i].keys(
+            ) and climateqa[i]['Options'] is not None:
+                input_column.append(climateqa[i]['Question'] + '\n' +
+                                    climateqa[i]['Options'])
+            else:
+                input_column.append(climateqa[i]['Question'])
+        climateqa = climateqa.add_column(name='input', column=input_column)
+        climateqa = climateqa.rename_column('Answer', 'target')
+        return climateqa
diff --git a/opencompass/datasets/physics.py b/opencompass/datasets/physics.py
new file mode 100644
index 00000000..2e5f878f
--- /dev/null
+++ b/opencompass/datasets/physics.py
@@ -0,0 +1,30 @@
+import os
+
+from datasets import load_dataset
+
+from opencompass.datasets.base import BaseDataset
+from opencompass.registry import LOAD_DATASET
+from opencompass.utils import get_data_path
+
+
+@LOAD_DATASET.register_module()
+class PHYSICSDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str, name: str, **kwargs):
+        path = get_data_path(path)
+        path = os.path.join(path, name)
+        physics = load_dataset(path)['train']
+        physics = physics.rename_column('questions', 'input')
+
+        target = []
+        for i in physics:
+            this_final_answer = ''
+            for j in range(len(i['final_answers'])):
+                this_final_answer += 'Answer ' + str(j + 1) + ': '
+                this_final_answer += i['final_answers'][j]
+                this_final_answer += '\n'
+            target.append(this_final_answer)
+        physics = physics.add_column(name='target', column=target)
+
+        return physics
diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py
index 00db25e8..e3690162 100644
--- a/opencompass/utils/datasets_info.py
+++ b/opencompass/utils/datasets_info.py
@@ -420,9 +420,40 @@ DATASETS_MAPPING = {
         "hf_id": "",
         "local": "./data/OlympiadBench",
     },
+    "opencompass/ClimaQA-Gold": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/climaqa_gold",
+    },
+    "opencompass/ClimaQA-Silver": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/climaqa_silver",
+    },
+    "opencompass/PHYSICS-textonly": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/PHYSICS-textonly",
+    },
+
 }
 
 DATASETS_URL = {
+    "/climaqa_gold": {
+        "url":
+        "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/climaqa_gold.zip",
+        "md5": "310cd0dc96db2bbbce798c40e2163ac2",
+    },
+    "/climaqa_silver": {
+        "url":
+        "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/climaqa_silver.zip",
+        "md5": "acdd955f1c170539c5233c12f7227c58",
+    },
+    "/PHYSICS-textonly": {
+        "url":
+        "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/PHYSICS-textonly.zip",
+        "md5": "92be6846a22dd4da942ca43f0638c709",
+    },
     "/OlympiadBench": {
         "url":
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/OlympiadBench.zip",

From 65ff602cf556f59a279cf7a81ded95a9b37322d4 Mon Sep 17 00:00:00 2001
From: Linchen Xiao <xxllcc1993@gmail.com>
Date: Tue, 15 Apr 2025 11:33:16 +0800
Subject: [PATCH 02/10] [Update] Fix LLM Judge metrics cacluation & Add
 reasoning content concat to OpenAI SDK

---
 opencompass/datasets/generic.py  |  3 +-
 opencompass/models/openai_api.py | 48 ++++++++++++++++++--------------
 2 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/opencompass/datasets/generic.py b/opencompass/datasets/generic.py
index 07b6a0bb..deca2486 100644
--- a/opencompass/datasets/generic.py
+++ b/opencompass/datasets/generic.py
@@ -37,7 +37,6 @@ def get_final_results(judged_answers,
     is_correct = is_correct_count / count
     is_incorrect = is_incorrect_count / count
     is_given_attempted = is_correct + is_incorrect
-    loose_accuracy = is_correct / count
     accuracy_given_attempted = (is_correct / is_given_attempted
                                 if is_given_attempted > 0 else 0)
     attempted_judge_ratio = attempted_judge_count / count
@@ -46,7 +45,7 @@ def get_final_results(judged_answers,
           (accuracy_given_attempted + is_correct) if
           (accuracy_given_attempted + is_correct) > 0 else 0)
     result = {
-        metric_name: loose_accuracy * 100,
+        metric_name: is_correct * 100,
         f'{metric_name}_given_attempted': accuracy_given_attempted * 100,
         'f1': f1,
         'attempted_ratio': attempted_judge_ratio * 100,
diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py
index 7b2c2c53..6ef11b8f 100644
--- a/opencompass/models/openai_api.py
+++ b/opencompass/models/openai_api.py
@@ -531,27 +531,26 @@ class OpenAI(BaseAPIModel):
 
 class OpenAISDK(OpenAI):
 
-    def __init__(
-        self,
-        path: str = 'gpt-3.5-turbo',
-        max_seq_len: int = 16384,
-        query_per_second: int = 1,
-        rpm_verbose: bool = False,
-        retry: int = 2,
-        key: str | List[str] = 'ENV',
-        org: str | List[str] | None = None,
-        meta_template: Dict | None = None,
-        openai_api_base: str | List[str] = OPENAISDK_API_BASE,
-        openai_proxy_url: Optional[str] = None,
-        mode: str = 'none',
-        logprobs: bool | None = False,
-        top_logprobs: int | None = None,
-        temperature: float | None = None,
-        tokenizer_path: str | None = None,
-        extra_body: Dict | None = None,
-        verbose: bool = False,
-        status_code_mappings: dict = {},
-    ):
+    def __init__(self,
+                 path: str = 'gpt-3.5-turbo',
+                 max_seq_len: int = 16384,
+                 query_per_second: int = 1,
+                 rpm_verbose: bool = False,
+                 retry: int = 2,
+                 key: str | List[str] = 'ENV',
+                 org: str | List[str] | None = None,
+                 meta_template: Dict | None = None,
+                 openai_api_base: str | List[str] = OPENAISDK_API_BASE,
+                 openai_proxy_url: Optional[str] = None,
+                 mode: str = 'none',
+                 logprobs: bool | None = False,
+                 top_logprobs: int | None = None,
+                 temperature: float | None = None,
+                 tokenizer_path: str | None = None,
+                 extra_body: Dict | None = None,
+                 verbose: bool = False,
+                 status_code_mappings: dict = {},
+                 think_tag: str = '</think>'):
         super().__init__(
             path,
             max_seq_len,
@@ -596,6 +595,7 @@ class OpenAISDK(OpenAI):
         if self.verbose:
             self.logger.info(f'Used openai_client: {self.openai_client}')
         self.status_code_mappings = status_code_mappings
+        self.think_tag = think_tag
 
     def _generate(self,
                   input: PromptList | str,
@@ -670,6 +670,12 @@ class OpenAISDK(OpenAI):
                     num_retries += 1
                     # Continue to retry instead of returning empty response
                     continue
+                # If the model has reasoning_content, concat it
+                # with the content
+                if hasattr(responses.choices[0].message, 'reasoning_content'):
+                    return (responses.choices[0].message.reasoning_content +
+                            self.think_tag +
+                            responses.choices[0].message.content)
 
                 return responses.choices[0].message.content
 

From b2da1c08a803982ab828558f8c55d05d8cd35a6d Mon Sep 17 00:00:00 2001
From: Linchen Xiao <xxllcc1993@gmail.com>
Date: Fri, 18 Apr 2025 17:21:29 +0800
Subject: [PATCH 03/10] [Dataset] Add SmolInstruct, Update Chembench (#2025)

* [Dataset] Add SmolInstruct, Update Chembench

* Add dataset metadata

* update

* update

* update
---
 dataset-index.yml                             |   8 +-
 .../datasets/ChemBench/ChemBench_gen.py       |  79 +---
 .../ChemBench/ChemBench_gen_a9f753.py         |  77 ++++
 .../ChemBench/ChemBench_llmjudge_gen.py       |   4 +
 .../ChemBench_llmjudge_gen_c584cf.py          | 108 +++++
 .../smolinstruct_fts_gen_5774b5.py            |  73 +++
 .../datasets/SmolInstruct/smolinstruct_gen.py |  10 +
 .../smolinstruct_meteor_gen_065150.py         |  67 +++
 .../smolinstruct_nc_gen_c84c18.py             |  93 ++++
 .../smolinstruct_pp_acc_gen_8607a3.py         |  79 ++++
 .../smolinstruct_rmse_gen_0fcc6b.py           |  70 +++
 .../livemathbench_hard_llmjudge_gen_71eaf5.py |  97 ++++
 opencompass/datasets/__init__.py              |   1 +
 opencompass/datasets/chembench.py             |   2 +
 opencompass/datasets/smolinstruct.py          | 426 ++++++++++++++++++
 opencompass/utils/datasets_info.py            |  11 +
 requirements/extra.txt                        |   2 +
 17 files changed, 1130 insertions(+), 77 deletions(-)
 create mode 100644 opencompass/configs/datasets/ChemBench/ChemBench_gen_a9f753.py
 create mode 100644 opencompass/configs/datasets/ChemBench/ChemBench_llmjudge_gen.py
 create mode 100644 opencompass/configs/datasets/ChemBench/ChemBench_llmjudge_gen_c584cf.py
 create mode 100644 opencompass/configs/datasets/SmolInstruct/smolinstruct_fts_gen_5774b5.py
 create mode 100644 opencompass/configs/datasets/SmolInstruct/smolinstruct_gen.py
 create mode 100644 opencompass/configs/datasets/SmolInstruct/smolinstruct_meteor_gen_065150.py
 create mode 100644 opencompass/configs/datasets/SmolInstruct/smolinstruct_nc_gen_c84c18.py
 create mode 100644 opencompass/configs/datasets/SmolInstruct/smolinstruct_pp_acc_gen_8607a3.py
 create mode 100644 opencompass/configs/datasets/SmolInstruct/smolinstruct_rmse_gen_0fcc6b.py
 create mode 100644 opencompass/configs/datasets/livemathbench/livemathbench_hard_llmjudge_gen_71eaf5.py
 create mode 100644 opencompass/datasets/smolinstruct.py

diff --git a/dataset-index.yml b/dataset-index.yml
index 89fde388..6e2c3fc9 100644
--- a/dataset-index.yml
+++ b/dataset-index.yml
@@ -1010,4 +1010,10 @@
     category: Science
     paper: https://arxiv.org/pdf/2503.21821
     configpath: ''
-    configpath_llmjudge: opencompass/configs/datasets/PHYSICS/PHYSICS_llm_judge_gen_a133a2.py
\ No newline at end of file
+    configpath_llmjudge: opencompass/configs/datasets/PHYSICS/PHYSICS_llm_judge_gen_a133a2.py
+- smolinstruct:
+    name: SmolInstruct
+    category: Science /Chemistry
+    paper: https://arxiv.org/pdf/2402.09391
+    configpath: opencompass/configs/datasets/SmolInstruct/smolinstruct_gen.py
+    configpath_llmjudge: ''
diff --git a/opencompass/configs/datasets/ChemBench/ChemBench_gen.py b/opencompass/configs/datasets/ChemBench/ChemBench_gen.py
index 67fff5c9..66923794 100644
--- a/opencompass/configs/datasets/ChemBench/ChemBench_gen.py
+++ b/opencompass/configs/datasets/ChemBench/ChemBench_gen.py
@@ -1,77 +1,4 @@
-from opencompass.openicl.icl_prompt_template import PromptTemplate
-from opencompass.openicl.icl_retriever import FixKRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import ChemBenchDataset
-from opencompass.utils.text_postprocessors import first_capital_postprocess
+from mmengine.config import read_base
 
-
-chembench_reader_cfg = dict(
-    input_columns=['input', 'A', 'B', 'C', 'D'],
-    output_column='target',
-    train_split='dev')
-
-chembench_all_sets = [
-    'Name_Conversion',
-    'Property_Prediction',
-    'Mol2caption',
-    'Caption2mol',
-    'Product_Prediction',
-    'Retrosynthesis',
-    'Yield_Prediction',
-    'Temperature_Prediction',
-    'Solvent_Prediction'
-]
-
-
-chembench_datasets = []
-for _name in chembench_all_sets:
-    # _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
-    _hint = f'There is a single choice question about chemistry. Answer the question by replying A, B, C or D.'
-
-    chembench_infer_cfg = dict(
-        ice_template=dict(
-            type=PromptTemplate,
-            template=dict(round=[
-                dict(
-                    role='HUMAN',
-                    prompt=
-                    f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
-                ),
-                dict(role='BOT', prompt='{target}\n')
-            ]),
-        ),
-        prompt_template=dict(
-            type=PromptTemplate,
-            template=dict(
-                begin='</E>',
-                round=[
-                    dict(
-                        role='HUMAN',
-                        prompt=
-                        f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
-                    ),
-                ],
-            ),
-            ice_token='</E>',
-        ),
-        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
-        inferencer=dict(type=GenInferencer),
-    )
-
-    chembench_eval_cfg = dict(
-        evaluator=dict(type=AccEvaluator),
-        pred_postprocessor=dict(type=first_capital_postprocess))
-
-    chembench_datasets.append(
-        dict(
-            abbr=f'ChemBench_{_name}',
-            type=ChemBenchDataset,
-            path='opencompass/ChemBench',
-            name=_name,
-            reader_cfg=chembench_reader_cfg,
-            infer_cfg=chembench_infer_cfg,
-            eval_cfg=chembench_eval_cfg,
-        ))
-
-del _name, _hint
+with read_base():
+    from .ChemBench_gen_a9f753 import chembench_datasets  # noqa: F401, F403
\ No newline at end of file
diff --git a/opencompass/configs/datasets/ChemBench/ChemBench_gen_a9f753.py b/opencompass/configs/datasets/ChemBench/ChemBench_gen_a9f753.py
new file mode 100644
index 00000000..efc9df48
--- /dev/null
+++ b/opencompass/configs/datasets/ChemBench/ChemBench_gen_a9f753.py
@@ -0,0 +1,77 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import ChemBenchDataset
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+
+chembench_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='dev')
+
+chembench_all_sets = [
+    'Name_Conversion',
+    'Property_Prediction',
+    'Mol2caption',
+    'Caption2mol',
+    'Product_Prediction',
+    'Retrosynthesis',
+    'Yield_Prediction',
+    'Temperature_Prediction',
+    'Solvent_Prediction'
+]
+
+
+chembench_datasets = []
+for _name in chembench_all_sets:
+    # _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
+    _hint = f'There is a single choice question about chemistry. Answer the question by replying A, B, C or D.'
+
+    chembench_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=
+                    f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+                ),
+                dict(role='BOT', prompt='{target}\n')
+            ]),
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin='</E>',
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=
+                        f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+                    ),
+                ],
+            ),
+            ice_token='</E>',
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    chembench_eval_cfg = dict(
+        evaluator=dict(type=AccEvaluator),
+        pred_postprocessor=dict(type=first_capital_postprocess))
+
+    chembench_datasets.append(
+        dict(
+            abbr=f'ChemBench_{_name}',
+            type=ChemBenchDataset,
+            path='opencompass/ChemBench4K',
+            name=_name,
+            reader_cfg=chembench_reader_cfg,
+            infer_cfg=chembench_infer_cfg,
+            eval_cfg=chembench_eval_cfg,
+        ))
+
+del _name, _hint
diff --git a/opencompass/configs/datasets/ChemBench/ChemBench_llmjudge_gen.py b/opencompass/configs/datasets/ChemBench/ChemBench_llmjudge_gen.py
new file mode 100644
index 00000000..969cbf04
--- /dev/null
+++ b/opencompass/configs/datasets/ChemBench/ChemBench_llmjudge_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .ChemBench_llmjudge_gen_c584cf import chembench_datasets  # noqa: F401, F403
\ No newline at end of file
diff --git a/opencompass/configs/datasets/ChemBench/ChemBench_llmjudge_gen_c584cf.py b/opencompass/configs/datasets/ChemBench/ChemBench_llmjudge_gen_c584cf.py
new file mode 100644
index 00000000..d6fc7e46
--- /dev/null
+++ b/opencompass/configs/datasets/ChemBench/ChemBench_llmjudge_gen_c584cf.py
@@ -0,0 +1,108 @@
+from opencompass.datasets.math import MATHDataset
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.datasets import ChemBenchDataset
+
+
+chembench_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='dev')
+
+
+
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+
+    <Original Question Begin>: \n {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+
+    Judging the correctness of candidates' answers:
+""".strip()
+
+chembench_all_sets = [
+    'Name_Conversion',
+    'Property_Prediction',
+    'Mol2caption',
+    'Caption2mol',
+    'Product_Prediction',
+    'Retrosynthesis',
+    'Yield_Prediction',
+    'Temperature_Prediction',
+    'Solvent_Prediction'
+]
+_hint = f'There is a single choice question about chemistry. Answer the question by replying A, B, C or D.'
+
+chembench_datasets = []
+for _name in chembench_all_sets:
+    chembench_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(role='HUMAN', prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ')
+            ])),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer)
+    )
+
+    # Evaluation configuration
+    chembench_eval_cfg = dict(
+        evaluator=dict(
+            type=GenericLLMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                begin=[
+                    dict(
+                        role='SYSTEM',
+                        fallback_role='HUMAN',
+                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
+                ],
+                    round=[
+                    dict(
+                        role='HUMAN',
+                        prompt = GRADER_TEMPLATE
+                    ),
+                ]),
+            ),
+            dataset_cfg=dict(
+                type=ChemBenchDataset,
+                path='/fs-computility/llm/xiaolinchen/opencompass_fork/data/ChemBench4K',
+                name=_name,
+                reader_cfg=chembench_reader_cfg,
+            ),
+            judge_cfg=dict(),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        ),
+        pred_role='BOT',
+        )
+
+    chembench_datasets.append(
+    dict(
+        abbr=f'ChemBench_{_name}',
+        type=ChemBenchDataset,
+        path='opencompass/ChemBench4K',
+        name=_name,
+        reader_cfg=chembench_reader_cfg,
+        infer_cfg=chembench_infer_cfg,
+        eval_cfg=chembench_eval_cfg,
+    ))
\ No newline at end of file
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_fts_gen_5774b5.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_fts_gen_5774b5.py
new file mode 100644
index 00000000..60c65f93
--- /dev/null
+++ b/opencompass/configs/datasets/SmolInstruct/smolinstruct_fts_gen_5774b5.py
@@ -0,0 +1,73 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.smolinstruct import FTSEvaluator
+from opencompass.datasets import SmolInstructDataset
+
+fts_reader_cfg = dict(
+    input_columns=['input'],
+    output_column='output',
+    train_split='validation')
+
+fts_hint_dict = {
+    'MG': """You are an expert chemist. Given the description of a molecule, your task is to generate the potential SMILES representation of the molecule.
+    The input contains the description of the molecule. Your reply should contain the potential SMILES representation of the molecule wrapped in <SMILES> and </SMILES> tags. Your reply must be valid and chemically reasonable.""",
+    'FS': """You are an expert chemist. Given the SMILES representation of reactants and reagents, your task is to predict the potential product using your chemical reaction knowledge.
+    The input contains both reactants and reagents, and different reactants and reagents are separated by ".". Your reply should contain only the SMILES representation of the predicted product and no other text. Your reply must be valid and chemically reasonable.""",
+    'RS': """You are an expert chemist. Given the SMILES representation of the product, your task is to predict the potential reactants and reagents using your chemical reaction knowledge.
+    The input contains the SMILES representation of the product. Your reply should contain the SMILES representation of both reactants and reagents, and different reactants and reagents should be separated by ".". Your reply must be valid and chemically reasonable.""",
+}
+
+name_dict = {
+    'MG': 'molecule_generation',
+    'FS': 'forward_synthesis',
+    'RS': 'retrosynthesis'
+}
+
+fts_datasets = []
+for _name in fts_hint_dict:
+    _hint = fts_hint_dict[_name]
+    fts_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
+                ),
+                dict(role='BOT', prompt='{output}\n')
+            ]),
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin='</E>',
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
+                    ),
+                ],
+            ),
+            ice_token='</E>',
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0]),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    fts_eval_cfg = dict(
+        evaluator=dict(type=FTSEvaluator),
+    )
+
+    fts_datasets.append(
+        dict(
+            abbr=f'{_name}',
+            type=SmolInstructDataset,
+            path='osunlp/SMolInstruct',
+            name=name_dict[_name],
+            reader_cfg=fts_reader_cfg,
+            infer_cfg=fts_infer_cfg,
+            eval_cfg=fts_eval_cfg,
+        ))
+
+del _name, _hint
\ No newline at end of file
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_gen.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_gen.py
new file mode 100644
index 00000000..8b02c1e6
--- /dev/null
+++ b/opencompass/configs/datasets/SmolInstruct/smolinstruct_gen.py
@@ -0,0 +1,10 @@
+from mmengine.config import read_base
+
+with read_base():
+    from opencompass.configs.datasets.SmolInstruct.smolinstruct_nc_gen_c84c18 import nc_datasets
+    from opencompass.configs.datasets.SmolInstruct.smolinstruct_pp_acc_gen_8607a3 import pp_acc_datasets
+    from opencompass.configs.datasets.SmolInstruct.smolinstruct_rmse_gen_0fcc6b import pp_rmse_datasets
+    from opencompass.configs.datasets.SmolInstruct.smolinstruct_fts_gen_5774b5 import fts_datasets
+    from opencompass.configs.datasets.SmolInstruct.smolinstruct_meteor_gen_065150 import meteor_datasets
+
+smolinstruct_datasets = nc_datasets + pp_rmse_datasets + pp_acc_datasets + meteor_datasets + fts_datasets
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_meteor_gen_065150.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_meteor_gen_065150.py
new file mode 100644
index 00000000..7e08ced1
--- /dev/null
+++ b/opencompass/configs/datasets/SmolInstruct/smolinstruct_meteor_gen_065150.py
@@ -0,0 +1,67 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.smolinstruct import MeteorEvaluator
+from opencompass.datasets import SmolInstructDataset
+
+meteor_reader_cfg = dict(
+    input_columns=['input'],
+    output_column='output',
+    train_split='validation')
+
+meteor_hint_dict = {
+    'MC': """You are an expert chemist. Given the SMILES representation of a molecule, your task is to describe the molecule in natural language.
+    The input contains the SMILES representation of the molecule. Your reply should contain a natural language description of the molecule. Your reply must be valid and chemically reasonable.""",
+}
+
+name_dict = {
+    'MC': 'molecule_captioning',
+}
+
+meteor_datasets = []
+for _name in meteor_hint_dict:
+    _hint = meteor_hint_dict[_name]
+    meteor_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
+                ),
+                dict(role='BOT', prompt='{output}\n')
+            ]),
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin='</E>',
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
+                    ),
+                ],
+            ),
+            ice_token='</E>',
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0]),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    meteor_eval_cfg = dict(
+        evaluator=dict(type=MeteorEvaluator),
+    )
+
+    meteor_datasets.append(
+        dict(
+            abbr=f'{_name}',
+            type=SmolInstructDataset,
+            path='osunlp/SMolInstruct',
+            name=name_dict[_name],
+            reader_cfg=meteor_reader_cfg,
+            infer_cfg=meteor_infer_cfg,
+            eval_cfg=meteor_eval_cfg,
+        ))
+
+del _name, _hint
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_nc_gen_c84c18.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_nc_gen_c84c18.py
new file mode 100644
index 00000000..01dbe555
--- /dev/null
+++ b/opencompass/configs/datasets/SmolInstruct/smolinstruct_nc_gen_c84c18.py
@@ -0,0 +1,93 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.smolinstruct import NCExactMatchEvaluator, NCElementMatchEvaluator
+from opencompass.datasets import SmolInstructDataset
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+nc_reader_cfg = dict(
+    input_columns=['input'],
+    output_column='output',
+    train_split='validation')
+
+nc_hint_dict = {
+    'I2F': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the molecular formula of the compound.
+    The input contains the IUPAC representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
+    'I2S': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the SMILES representation of the compound.
+    The input contains the IUPAC representation of the compound. Your reply should contain only the SMILES representation of the compound wrapped in <SMILES> and </SMILES> tags and no other text. Your reply must be valid and chemically reasonable.""",
+    'S2F': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the molecular formula of the compound.
+    The input contains the SMILES representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
+    'S2I': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the IUPAC representation of the compound.
+    The input contains the SMILES representation of the compound. Your reply should contain only the IUPAC representation of the compound wrapped in <IUPAC> and </IUPAC> tags and no other text. Your reply must be valid and chemically reasonable.""",
+}
+
+name_dict = {
+    'I2F': 'name_conversion-i2f',
+    'I2S': 'name_conversion-i2s',
+    'S2F': 'name_conversion-s2f',
+    'S2I': 'name_conversion-s2i',
+}
+
+nc_datasets = []
+for _name in nc_hint_dict:
+    _hint = nc_hint_dict[_name]
+    nc_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
+                ),
+                dict(role='BOT', prompt='{output}\n')
+            ]),
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin='</E>',
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
+                    ),
+                ],
+            ),
+            ice_token='</E>',
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0]),
+        inferencer=dict(type=GenInferencer),
+    )
+    # nc_infer_cfg = dict(
+    #     prompt_template=dict(
+    #         type=PromptTemplate,
+    #         template=dict(
+    #             round=[
+    #                 dict(role='HUMAN', prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '),
+    #             ],
+    #         ),
+    #     ),
+    #     retriever=dict(type=ZeroRetriever),
+    #     inferencer=dict(type=GenInferencer),
+    # )
+    if _name in ['I2F', 'S2F']:
+        nc_eval_cfg = dict(
+            evaluator=dict(type=NCElementMatchEvaluator),
+        )
+    else:
+        nc_eval_cfg = dict(
+            evaluator=dict(type=NCExactMatchEvaluator),
+        )
+
+    nc_datasets.append(
+        dict(
+            abbr=f'NC-{_name}',
+            type=SmolInstructDataset,
+            path='osunlp/SMolInstruct',
+            name=name_dict[_name],
+            reader_cfg=nc_reader_cfg,
+            infer_cfg=nc_infer_cfg,
+            eval_cfg=nc_eval_cfg,
+        ))
+
+del _name, _hint
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_pp_acc_gen_8607a3.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_pp_acc_gen_8607a3.py
new file mode 100644
index 00000000..f32594f2
--- /dev/null
+++ b/opencompass/configs/datasets/SmolInstruct/smolinstruct_pp_acc_gen_8607a3.py
@@ -0,0 +1,79 @@
+from opencompass.openicl import AccEvaluator
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import SmolInstructDataset
+from opencompass.datasets.smolinstruct import smolinstruct_acc_postprocess
+
+pp_acc_reader_cfg = dict(
+    input_columns=['input'],
+    output_column='output',
+    train_split='validation')
+
+pp_acc_hint_dict = {
+    'BBBP': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether blood-brain barrier permeability (BBBP) is a property of the compound.
+    The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
+    'ClinTox': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound is toxic.
+    The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
+    'HIV': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound serve as an inhibitor of HIV replication.
+    The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
+    'SIDER': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound has any side effects.
+    The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
+}
+
+name_dict = {
+    'BBBP': 'property_prediction-bbbp',
+    'ClinTox': 'property_prediction-clintox',
+    'HIV': 'property_prediction-hiv',
+    'SIDER': 'property_prediction-sider',
+}
+
+pp_acc_datasets = []
+for _name in pp_acc_hint_dict:
+    _hint = pp_acc_hint_dict[_name]
+
+    pp_acc_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
+                ),
+                dict(role='BOT', prompt='{output}\n')
+            ]),
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin='</E>',
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
+                    ),
+                ],
+            ),
+            ice_token='</E>',
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0]),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    pp_acc_eval_cfg = dict(
+        evaluator=dict(type=AccEvaluator),
+        pred_postprocessor=dict(type=smolinstruct_acc_postprocess)
+    )
+
+    pp_acc_datasets.append(
+        dict(
+            abbr=f'PP-{_name}',
+            type=SmolInstructDataset,
+            path='osunlp/SMolInstruct',
+            name=name_dict[_name],
+            reader_cfg=pp_acc_reader_cfg,
+            infer_cfg=pp_acc_infer_cfg,
+            eval_cfg=pp_acc_eval_cfg,
+        ))
+
+del _name, _hint
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_rmse_gen_0fcc6b.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_rmse_gen_0fcc6b.py
new file mode 100644
index 00000000..ceeccdc7
--- /dev/null
+++ b/opencompass/configs/datasets/SmolInstruct/smolinstruct_rmse_gen_0fcc6b.py
@@ -0,0 +1,70 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.smolinstruct import RMSEEvaluator
+from opencompass.datasets import SmolInstructDataset
+
+pp_rmse_reader_cfg = dict(
+    input_columns=['input'],
+    output_column='output',
+    train_split='validation')
+
+pp_rmse_hint_dict = {
+    'ESOL': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the log solubility of the compound.
+    The input contains the SMILES representation of the compound. Your reply should contain the log solubility of the compound wrapped in <NUMBER> and </NUMBER> tags. Your reply must be valid and chemically reasonable.""",
+    'Lipo': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the octanol/water partition coefficient of the compound.
+    The input contains the SMILES representation of the compound. Your reply should contain the octanol/water partition coefficient of the compound wrapped in <NUMBER> and </NUMBER> tags. Your reply must be valid and chemically reasonable."""
+}
+
+name_dict = {
+    'ESOL': 'property_prediction-esol',
+    'Lipo': 'property_prediction-lipo'
+}
+
+pp_rmse_datasets = []
+for _name in pp_rmse_hint_dict:
+    _hint = pp_rmse_hint_dict[_name]
+    pp_rmse_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
+                ),
+                dict(role='BOT', prompt='{output}\n')
+            ]),
+        ),
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin='</E>',
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
+                    ),
+                ],
+            ),
+            ice_token='</E>',
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0]),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    pp_rmse_eval_cfg = dict(
+        evaluator=dict(type=RMSEEvaluator),
+    )
+
+    pp_rmse_datasets.append(
+        dict(
+            abbr=f'PP-{_name}',
+            type=SmolInstructDataset,
+            path='osunlp/SMolInstruct',
+            name=name_dict[_name],
+            reader_cfg=pp_rmse_reader_cfg,
+            infer_cfg=pp_rmse_infer_cfg,
+            eval_cfg=pp_rmse_eval_cfg,
+        ))
+
+del _name, _hint
\ No newline at end of file
diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_hard_llmjudge_gen_71eaf5.py b/opencompass/configs/datasets/livemathbench/livemathbench_hard_llmjudge_gen_71eaf5.py
new file mode 100644
index 00000000..6a513847
--- /dev/null
+++ b/opencompass/configs/datasets/livemathbench/livemathbench_hard_llmjudge_gen_71eaf5.py
@@ -0,0 +1,97 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.datasets.livemathbench import LiveMathBenchDataset
+from opencompass.datasets import generic_llmjudge_postprocess
+
+livemathbench_reader_cfg = dict(
+    input_columns=['question'], output_column='answer'
+)
+
+
+# Inference configuration
+livemathbench_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{question}\n',
+                ),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+
+# Template for the LLM judge
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+    5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+    <Original Question Begin>: \n{question}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    
+    Judging the correctness of candidates' answers:
+""".strip()
+
+
+splits = ['hard']
+livemathbench_datasets = []
+for split in splits:
+    # Dataset configuration
+    livemathbench_datasets.append(
+        dict(
+            type=LiveMathBenchDataset,
+            abbr=f'livemathbench_{split}',
+            path='opencompass/LiveMathBench',
+            dataset_splits = [split],
+            dataset_languages= ['cn', 'en'],
+            reader_cfg=livemathbench_reader_cfg,
+            infer_cfg=livemathbench_infer_cfg,
+            eval_cfg=dict(
+                # # Evaluation configuration using LLM as judge
+                evaluator=dict(
+                    type=GenericLLMEvaluator,
+                    prompt_template=dict(
+                        type=PromptTemplate,
+                        template=dict(
+                            begin=[
+                                dict(
+                                    role='SYSTEM',
+                                    fallback_role='HUMAN',
+                                    prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                                )
+                            ],
+                            round=[
+                                dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                            ],
+                        ),
+                    ),
+                    dataset_cfg=dict(
+                        type=LiveMathBenchDataset,
+                        path='opencompass/LiveMathBench202412',
+                        dataset_splits = [split],
+                        reader_cfg=livemathbench_reader_cfg,
+                    ),
+                    judge_cfg={},
+                    dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+                ),
+            ),
+        )
+    )
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 82314e9e..6f7be89a 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -127,6 +127,7 @@ from .scibench import ScibenchDataset, scibench_postprocess  # noqa: F401, F403
 from .scicode import *  # noqa: F401, F403
 from .simpleqa import *  # noqa: F401, F403
 from .siqa import *  # noqa: F401, F403
+from .smolinstruct import *  # noqa: F401, F403
 from .squad20 import SQuAD20Dataset, SQuAD20Evaluator  # noqa: F401, F403
 from .storycloze import *  # noqa: F401, F403
 from .strategyqa import *  # noqa: F401, F403
diff --git a/opencompass/datasets/chembench.py b/opencompass/datasets/chembench.py
index 887c11c9..b6173823 100644
--- a/opencompass/datasets/chembench.py
+++ b/opencompass/datasets/chembench.py
@@ -4,6 +4,7 @@ import os.path as osp
 from datasets import Dataset, DatasetDict
 
 from opencompass.registry import LOAD_DATASET
+from opencompass.utils import get_data_path
 
 from .base import BaseDataset
 
@@ -14,6 +15,7 @@ class ChemBenchDataset(BaseDataset):
     @staticmethod
     def load(path: str, name: str):
         dataset = DatasetDict()
+        path = get_data_path(path)
         for split in ['dev', 'test']:
             raw_data = []
             filename = osp.join(path, split, f'{name}_benchmark.json')
diff --git a/opencompass/datasets/smolinstruct.py b/opencompass/datasets/smolinstruct.py
new file mode 100644
index 00000000..4589d606
--- /dev/null
+++ b/opencompass/datasets/smolinstruct.py
@@ -0,0 +1,426 @@
+# flake8: noqa: W605
+import re
+from collections import defaultdict
+
+import numpy as np
+from datasets import Dataset, DatasetDict, load_dataset
+
+from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
+from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
+                                  TEXT_POSTPROCESSORS)
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class SmolInstructDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str, name: str):
+        dataset = DatasetDict()
+        raw_dataset = load_dataset(path)
+        for split in ['validation', 'test']:
+            raw_data = []
+            for data in raw_dataset[split]:
+                if data['task'] == name:
+                    raw_data.append(data)
+            dataset[split] = Dataset.from_list(raw_data)
+        return dataset
+
+
+def extract_chemical_data(text):
+    pattern = re.compile(r'<(MOLFORMULA|SMILES|IUPAC)>(.*?)</\1>', re.DOTALL)
+    matches = pattern.findall(text)
+    if not matches:
+        return []
+    return [match[1].strip() for match in matches]
+
+
+def parse_molecule(molecular_formula):
+    valid = re.match('([A-Za-z]\d*)+([\+\-]\d*)*$', molecular_formula)
+    if valid is None:
+        raise ValueError("Molecular formula \"%s\" is not valid." %
+                         molecular_formula)
+
+    stack = [defaultdict(int)]
+
+    def _parse_formula(formula, _stack):
+
+        # Set remainder equal to 'None'
+        r = None
+
+        # Regular expression matching for each of the three cases:
+        atom = re.match(r'([A-Z][a-z]?)(\d+)?', formula)
+        opening = re.match(r'[\(\[\{]', formula)
+        closing = re.match(r'[\)\]\}](\d+)?', formula)
+
+        # If atom is identified:
+        if atom:
+            r = formula[len(atom.group()):]
+            _stack[-1][atom.group(1)] += int(atom.group(2) or 1)
+
+        # If opening brackets encountered:
+        elif opening:
+            r = formula[len(
+                opening.group()
+            ):]  # this sets the remainder equal to everything after the opening brackets
+            _stack.append(defaultdict(int))
+
+            # If closing brackets encountered:
+        elif closing:
+            r = formula[len(
+                closing.group()
+            ):]  # this sets the remainder equal to everything after the closing brackets
+            for k, v in _stack.pop().items():
+                _stack[-1][k] += v * int(
+                    closing.group(1)
+                    or 1)  # v times amount of molecule k, depending on nesting
+
+        # If anything remains, process remainders recursively as nested formulas:
+        if r:
+            _parse_formula(r, _stack)
+
+        return dict(_stack[0])
+
+    result = _parse_formula(molecular_formula, stack)
+
+    charge = re.search('[\+\-]\d*', molecular_formula)
+    if charge is not None:
+        charge_str = charge.group()
+        charge_type = charge_str[0]
+        if len(charge_str) == 1:
+            charge_num = 1
+        else:
+            charge_num = int(charge_str[1:])
+        result[charge_type] = charge_num
+
+    return result
+
+
+def calculate_single_element_match_for_list(predictions, references):
+    # 抽取SMILES里的化学式
+    predictions = [
+        extract_chemical_data(prediction) for prediction in predictions
+    ]
+    references = [extract_chemical_data(reference) for reference in references]
+
+    ele_match_labels = []
+    ele_invalid_labels = []
+    details = []
+    for pred_formula, gold_formula in zip(predictions, references):
+        gold_formula = gold_formula[0]
+        if pred_formula:
+            pred_formula = pred_formula[0]
+        detail = {'pred': [pred_formula], 'answer': gold_formula}
+        if not pred_formula or not pred_formula:
+            ele_invalid_labels.append(False)
+            ele_match_labels.append(False)
+            detail['score'] = [False]
+            details.append(detail)
+            continue
+        try:
+            pred_ele = parse_molecule(pred_formula)
+        except KeyboardInterrupt:
+            raise
+        except:
+            # print(pred_formula)
+            # print('=====')
+            ele_invalid_labels.append(True)
+            ele_match_labels.append(False)
+            detail['score'] = [False]
+            details.append(detail)
+            continue
+        ele_invalid_labels.append(False)
+        ele_match = False
+        gold_ele = parse_molecule(gold_formula)
+        if pred_ele == gold_ele:
+            ele_match = True
+        ele_match_labels.append(ele_match)
+        detail['score'] = [ele_match]
+        details.append(detail)
+
+    score = sum(ele_match_labels) / len(predictions) * 100
+    valid_score = 100 - sum(ele_invalid_labels) / len(predictions) * 100
+
+    return {'score': score, 'valid_score': valid_score, 'details': details}
+
+
+def calculate_single_element_match(predictions, references):
+    # 抽取SMILES里的化学式
+    predictions = [
+        extract_chemical_data(prediction) for prediction in predictions
+    ]
+    references = [extract_chemical_data(reference) for reference in references]
+
+    ele_match_labels = []
+    ele_invalid_labels = []
+    details = []
+    for pred_formula, gold_formula in zip(predictions, references):
+        gold_formula = gold_formula[0]
+        if pred_formula:
+            pred_formula = pred_formula[0]
+        detail = {'pred': pred_formula, 'answer': gold_formula}
+        if not pred_formula or not pred_formula:
+            ele_invalid_labels.append(False)
+            ele_match_labels.append(False)
+            detail['score'] = False
+            details.append(detail)
+            continue
+        try:
+            pred_ele = parse_molecule(pred_formula)
+        except KeyboardInterrupt:
+            raise
+        except:
+            # print(pred_formula)
+            # print('=====')
+            ele_invalid_labels.append(True)
+            ele_match_labels.append(False)
+            detail['score'] = False
+            details.append(detail)
+            continue
+        ele_invalid_labels.append(False)
+        ele_match = False
+        gold_ele = parse_molecule(gold_formula)
+        if pred_ele == gold_ele:
+            ele_match = True
+        ele_match_labels.append(ele_match)
+        detail['score'] = ele_match
+        details.append(detail)
+
+    score = sum(ele_match_labels) / len(predictions) * 100
+    valid_score = 100 - sum(ele_invalid_labels) / len(predictions) * 100
+
+    return {'score': score, 'valid_score': valid_score, 'details': details}
+
+
+@ICL_EVALUATORS.register_module()
+class NCElementMatchEvaluator(BaseEvaluator):
+    """Element match evaluator for name conversion."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def score(self, predictions, references):
+        print('len(predictions):', len(predictions))
+        print('len(references):', len(references))
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+
+        # topk的prediction，要拆开
+        if isinstance(predictions[0], str):
+            return calculate_single_element_match(predictions, references)
+        else:
+            num_k = len(predictions[0])
+            scores = []
+            for i in range(num_k):
+                pred = [prediction[i] for prediction in predictions]
+                ref = references
+                score = calculate_single_element_match_for_list(pred, ref)
+                scores.append(score)
+            # 按照instance合并成一个完整的dict
+            final_details = scores[0]['details']
+            final_scores = [scores[0]['score']]
+            final_valid_scores = [scores[0]['valid_score']]
+            for _k in scores[1:]:
+                for i, _d in enumerate(_k['details']):
+                    # print(_d)
+                    final_details[i]['pred'].extend(_d['pred'])
+                    final_details[i]['score'].extend(_d['score'])
+                final_scores.append(_k['score'])
+                final_valid_scores.append(_k['valid_score'])
+            avg_score = []
+            for _d in final_details:
+                if True in _d['score']:
+                    avg_score.append(1)
+                else:
+                    avg_score.append(0)
+            max_score = sum(avg_score) / len(avg_score) * 100
+            return {
+                'score': max_score,
+                'all_score': final_scores,
+                'valid_score': final_valid_scores,
+                'details': final_details,
+            }
+
+
+@ICL_EVALUATORS.register_module()
+class NCExactMatchEvaluator(BaseEvaluator):
+    """Exact match evaluator for name conversion."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        predictions = [
+            extract_chemical_data(prediction) for prediction in predictions
+        ]
+        references = [
+            extract_chemical_data(reference) for reference in references
+        ]
+
+        cnt = 0
+        valid_cnt = 0
+        details = []
+        for pred, ans in zip(predictions, references):
+            ans = ans[0]
+            if pred:
+                pred = pred[0]
+                valid_cnt += 1
+            detail = {'pred': pred, 'answer': ans}
+            if pred and pred.strip() == ans.strip():
+                cnt += 1
+                detail['correct'] = True
+            else:
+                detail['correct'] = False
+            details.append(detail)
+
+        score = cnt / len(predictions) * 100
+        valid_score = valid_cnt / len(predictions) * 100
+
+        return {'score': score, 'valid_score': valid_score, 'details': details}
+
+
+def extract_number(text):
+    pattern = re.compile(r'<NUMBER>\s*(-?\d*\.?\d+)\s*</NUMBER>')
+    matches = pattern.findall(text)
+    return [float(match) for match in matches]
+
+
+@ICL_EVALUATORS.register_module()
+class RMSEEvaluator(BaseEvaluator):
+    """Exact match evaluator for name conversion."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+
+        avg_score = 0
+        details = []
+        for prediction, reference in zip(predictions, references):
+            pred = extract_number(prediction)
+            ans = extract_number(reference)
+            if not pred:
+                pred = 0
+            else:
+                pred = pred[0]
+            try:
+                ans = ans[0]
+            except:
+                raise ValueError(f'ans: {reference}')
+            detail = {'pred': pred, 'answer': ans}
+            rmse_score = np.sqrt(np.mean((np.array(pred) - np.array(ans))**2))
+            detail['score'] = rmse_score
+            avg_score += rmse_score
+            details.append(detail)
+
+        score = avg_score / len(predictions)
+
+        return {'score': score, 'details': details}
+
+
+@ICL_EVALUATORS.register_module()
+class FTSEvaluator(BaseEvaluator):
+    """Exact match evaluator for name conversion."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+
+        predictions = [
+            extract_chemical_data(prediction) for prediction in predictions
+        ]
+        references = [
+            extract_chemical_data(reference) for reference in references
+        ]
+
+        avg_score = 0
+        valid_cnt = 0
+        details = []
+        for pred, ans in zip(predictions, references):
+            ans = ans[0]
+            if not pred:
+                detail = {'pred': '', 'answer': ans, 'score': 0}
+                details.append(detail)
+                continue
+            pred = pred[0]
+            detail = {'pred': pred, 'answer': ans}
+            # 将 SMILES 转换为 RDKit 分子对象
+            from rdkit import Chem
+            mol1 = Chem.MolFromSmiles(pred)
+            mol2 = Chem.MolFromSmiles(ans)
+            if mol1 is None or mol2 is None:
+                detail['score'] = 0
+                details.append(detail)
+                continue
+            valid_cnt += 1
+            # 生成 Morgan 指纹（等同于 ECFP4）
+            # fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, radius=2, nBits=2048)
+            # fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, radius=2, nBits=2048)
+            from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
+            generator = GetMorganGenerator(radius=2, fpSize=2048)
+            fp1 = generator.GetFingerprint(mol1)
+            fp2 = generator.GetFingerprint(mol2)
+            from rdkit.Chem import DataStructs
+            similarity = DataStructs.TanimotoSimilarity(fp1, fp2) * 100
+            detail['score'] = similarity
+            avg_score += similarity
+            details.append(detail)
+
+        score = avg_score / len(predictions)
+        valid_score = valid_cnt / len(predictions) * 100
+
+        return {'score': score, 'valid_score': valid_score, 'details': details}
+
+
+@ICL_EVALUATORS.register_module()
+class MeteorEvaluator(BaseEvaluator):
+    """Exact match evaluator for name conversion."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        avg_score = 0
+        details = []
+        for pred, ans in zip(predictions, references):
+            score = meteor_score([ans.split()], pred.split())
+            avg_score += score
+            detail = {'pred': pred, 'answer': ans, 'score': score}
+            details.append(detail)
+
+        score = avg_score / len(predictions)
+
+        return {'score': score, 'details': details}
+
+
+@TEXT_POSTPROCESSORS.register_module('smolinstruct-acc')
+def smolinstruct_acc_postprocess(text: str) -> str:
+    if 'yes' in text.lower():
+        return '<BOOLEAN> Yes </BOOLEAN>'
+    elif 'no' in text.lower():
+        return '<BOOLEAN> No </BOOLEAN>'
diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py
index e3690162..8492b0df 100644
--- a/opencompass/utils/datasets_info.py
+++ b/opencompass/utils/datasets_info.py
@@ -435,6 +435,11 @@ DATASETS_MAPPING = {
         "hf_id": "",
         "local": "./data/PHYSICS-textonly",
     },
+     "opencompass/ChemBench4K": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/ChemBench4K",
+    },
 
 }
 
@@ -777,5 +782,11 @@ DATASETS_URL = {
         "url":
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bigcodebench.zip",
         "md5": "270f399f4142b74f47ecff116cc3b21d"
+    },
+    "ChemBench4K": {
+        "url":
+        "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ChemBench4K.zip",
+        "md5": "fc23fd21b2566a5dbbebfa4601d7779c"
     }
+    
 }
diff --git a/requirements/extra.txt b/requirements/extra.txt
index f81d410f..57ca9c88 100644
--- a/requirements/extra.txt
+++ b/requirements/extra.txt
@@ -19,5 +19,7 @@ math-verify[antlr4_11_0]
 pyext
 # Law Bench
 pypinyin
+# Smolinstruct
+rdkit
 # RULER
 wonderwords

From a2093a81eff7869b417474bc86a745e4fbae9854 Mon Sep 17 00:00:00 2001
From: JuchengHu <36852779+smgjch@users.noreply.github.com>
Date: Mon, 21 Apr 2025 15:50:47 +0800
Subject: [PATCH 04/10] [Dataset] Matbench (#2021)

* add support for matbench

* fix dataset path

* fix data load

* fix

* fix lint

---------

Co-authored-by: Jucheng Hu <jucheng.hu.20@ucl.ac.uk>
Co-authored-by: Myhs-phz <demarcia2014@126.com>
---
 .pre-commit-config.yaml                       |  1 +
 dataset-index.yml                             |  6 ++
 .../configs/datasets/matbench/matbench_gen.py |  4 +
 .../datasets/matbench/matbench_gen_f71840.py  | 55 ++++++++++++
 opencompass/datasets/__init__.py              |  1 +
 opencompass/datasets/matbench/__init__.py     |  3 +
 opencompass/datasets/matbench/matbench.py     | 87 +++++++++++++++++++
 opencompass/datasets/matbench/post_process.py | 25 ++++++
 opencompass/utils/datasets_info.py            | 11 +++
 9 files changed, 193 insertions(+)
 create mode 100644 opencompass/configs/datasets/matbench/matbench_gen.py
 create mode 100644 opencompass/configs/datasets/matbench/matbench_gen_f71840.py
 create mode 100644 opencompass/datasets/matbench/__init__.py
 create mode 100644 opencompass/datasets/matbench/matbench.py
 create mode 100644 opencompass/datasets/matbench/post_process.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b464115e..55eb17ea 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -8,6 +8,7 @@ exclude: |
       opencompass/datasets/lawbench/utils|
       opencompass/datasets/lawbench/evaluation_functions/|
       opencompass/datasets/medbench/|
+      opencompass/datasets/matbench/|
       opencompass/datasets/teval/|
       opencompass/datasets/NPHardEval/|
       opencompass/datasets/TheoremQA|
diff --git a/dataset-index.yml b/dataset-index.yml
index 6e2c3fc9..9585f97c 100644
--- a/dataset-index.yml
+++ b/dataset-index.yml
@@ -110,6 +110,12 @@
     paper: ''
     configpath: opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py
     configpath_llmjudge: ''
+- matbench:
+    name: matbench
+    category: Science / Material
+    paper: 'https://www.nature.com/articles/s41524-020-00406-3'
+    configpath: opencompass/configs/datasets/matbench/matbench_gen_f71840.py
+    configpath_llmjudge: ''
 - medbench:
     name: MedBench
     category: Knowledge / Medicine
diff --git a/opencompass/configs/datasets/matbench/matbench_gen.py b/opencompass/configs/datasets/matbench/matbench_gen.py
new file mode 100644
index 00000000..d3212435
--- /dev/null
+++ b/opencompass/configs/datasets/matbench/matbench_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .matbench_gen_f71840 import matbench_datasets  # noqa: F401, F403
diff --git a/opencompass/configs/datasets/matbench/matbench_gen_f71840.py b/opencompass/configs/datasets/matbench/matbench_gen_f71840.py
new file mode 100644
index 00000000..8b8a676b
--- /dev/null
+++ b/opencompass/configs/datasets/matbench/matbench_gen_f71840.py
@@ -0,0 +1,55 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets.matbench.matbench import MatbenchDataset, MatbenchEvaluator_regression, MatbenchEvaluator_classification
+
+
+
+matbench_reader_cfg = dict(
+    input_columns=['problem'], output_column='answer')
+
+
+matbench_tasks =  ['matbench_steels','matbench_expt_gap', 'matbench_expt_is_metal','matbench_glass']
+
+matbench_datasets = []
+
+for task in matbench_tasks:
+    if task in ['matbench_expt_is_metal','matbench_glass']:
+        matbench_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                round=[dict(role='HUMAN', prompt=f'{{problem}} Please present your answer by yes or no, do not output anything else.')])),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer))
+
+        matbench_eval_cfg = dict(
+            evaluator=dict(type=MatbenchEvaluator_classification),
+            pred_role='BOT')
+
+    elif task in ['matbench_steels','matbench_expt_gap']:
+        matbench_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                round=[dict(role='HUMAN', prompt=f'{{problem}} Please present your answer by one float number, do not output anything else.')])),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer))
+
+
+        matbench_eval_cfg = dict(
+            evaluator=dict(type=MatbenchEvaluator_regression),
+            pred_role='BOT')
+
+
+    matbench_datasets.append(
+        dict(
+            type=MatbenchDataset,
+            path=f'opencompass/Matbench',
+            task=task,
+            abbr=task,
+            reader_cfg=matbench_reader_cfg,
+            infer_cfg=matbench_infer_cfg,
+            eval_cfg=matbench_eval_cfg))
+
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 6f7be89a..3e4cc6fc 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -87,6 +87,7 @@ from .longbench import *  # noqa: F401, F403
 from .longbenchv2 import *  # noqa: F401, F403
 from .lveval import *  # noqa: F401, F403
 from .mastermath2024v1 import *  # noqa: F401, F403
+from .matbench import *  # noqa: F401, F403
 from .math import *  # noqa: F401, F403
 from .math401 import *  # noqa: F401, F403
 from .math_intern import *  # noqa: F401, F403
diff --git a/opencompass/datasets/matbench/__init__.py b/opencompass/datasets/matbench/__init__.py
new file mode 100644
index 00000000..7fb9a7f7
--- /dev/null
+++ b/opencompass/datasets/matbench/__init__.py
@@ -0,0 +1,3 @@
+# flake8: noqa
+
+from .matbench import *  # noqa: F401, F403
diff --git a/opencompass/datasets/matbench/matbench.py b/opencompass/datasets/matbench/matbench.py
new file mode 100644
index 00000000..bd456998
--- /dev/null
+++ b/opencompass/datasets/matbench/matbench.py
@@ -0,0 +1,87 @@
+import json
+import os
+
+from datasets import Dataset
+from sklearn.metrics import (accuracy_score, f1_score, precision_score,
+                             recall_score)
+
+from opencompass.datasets.matbench.post_process import (parse_float_answer,
+                                                        parse_true_false_answer
+                                                        )
+from opencompass.openicl.icl_evaluator import BaseEvaluator
+from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
+from opencompass.utils import get_data_path
+
+from ..base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class MatbenchDataset(BaseDataset):
+
+    @staticmethod
+    def load(path, task):
+        path = get_data_path(path)
+        path = os.path.join(path,
+                            'matbench_base_fold_0_' + task + '_test.json')
+        dataset = []
+        with open(path, 'r', encoding='utf-8') as file:
+            data = json.load(file)
+            for item in data:
+                dataset.append({
+                    'problem': item['problem'],
+                    'answer': item['answer'],
+                })
+        dataset = Dataset.from_list(dataset)
+        return dataset
+
+
+@ICL_EVALUATORS.register_module()
+class MatbenchEvaluator_regression(BaseEvaluator):
+
+    def score(self, predictions, references):
+        mae_sum = 0
+        count = 0
+        details = []
+        for pred, ref in zip(predictions, references):
+            pred = parse_float_answer(pred)
+            detail = {'pred': pred, 'answer': ref, 'error': None}
+            count += 1
+            try:
+                error = abs(float(pred) - float(ref))
+                mae_sum += error
+                detail['error'] = error
+            except Exception as e:
+                detail['error'] = str(e)
+            details.append(detail)
+        mae = mae_sum / count if count > 0 else 0
+        result = {'mae': mae, 'details': details}
+        return result
+
+
+@ICL_EVALUATORS.register_module()
+class MatbenchEvaluator_classification(BaseEvaluator):
+
+    def score(self, predictions, references):
+        details = []
+        predictions_parsed = []
+        for pred, ref in zip(predictions, references):
+            pred = parse_true_false_answer(pred)
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
+            if pred == ref:
+                detail['correct'] = True
+            details.append(detail)
+            predictions_parsed.append(pred)
+        accuracy = accuracy_score(references, predictions_parsed)
+        precision = precision_score(references,
+                                    predictions_parsed,
+                                    average='binary')
+        recall = recall_score(references, predictions_parsed, average='binary')
+        f1 = f1_score(references, predictions_parsed, average='binary')
+
+        return {
+            'accuracy': accuracy,
+            'precision': precision,
+            'recall': recall,
+            'f1_score': f1,
+            'details': details
+        }
diff --git a/opencompass/datasets/matbench/post_process.py b/opencompass/datasets/matbench/post_process.py
new file mode 100644
index 00000000..f252a35f
--- /dev/null
+++ b/opencompass/datasets/matbench/post_process.py
@@ -0,0 +1,25 @@
+# flake8: noqa
+
+import re
+
+
+def parse_float_answer(raw_string, option=''):
+    number_pattern = re.compile(r'[-+]?\d+(\.\d+)?([eE][-+]?\d+)?')
+
+    # Search for the first match
+    match = number_pattern.search(raw_string)
+    if match:
+        # Extract the matched number and convert it to float
+        return float(match.group())
+    else:
+        # Return None if no number is found
+        return 0
+
+
+def parse_true_false_answer(raw_string, option=''):
+    if 'yes' in raw_string.lower():
+        return True
+    elif 'no' in raw_string.lower():
+        return False
+    else:
+        return True
diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py
index 8492b0df..5048a496 100644
--- a/opencompass/utils/datasets_info.py
+++ b/opencompass/utils/datasets_info.py
@@ -27,6 +27,12 @@ DATASETS_MAPPING = {
         "hf_id": "opencompass/ai2_arc",
         "local": "./data/ARC/ARC-e/ARC-Easy-Dev.jsonl",
     },
+    # Matbench
+    "opencompass/Matbench": {
+    # "ms_id": "opencompass/Matbench",
+    "hf_id": "opencompass/Matbench",
+    "local": "./data/Matbench",
+    },
     # BBH
     "opencompass/bbh": {
         "ms_id": "opencompass/bbh",
@@ -664,6 +670,11 @@ DATASETS_URL = {
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SQuAD2.0.zip",
         "md5": "1321cbf9349e1102a57d31d1b2bfdd7e",
     },
+    "/Matbench":{
+        "url":
+        "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/Matbench.zip",
+        "md5": "99f9457f54f4f419da9556af56ac4c24",
+    },
     "mmlu_pro": {
         "url":
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu_pro.zip",

From c69110361b24f04973eb107bb86d5c01f4bf9e53 Mon Sep 17 00:00:00 2001
From: Taolin Zhang <55646471+taolinzhang@users.noreply.github.com>
Date: Mon, 21 Apr 2025 17:18:51 +0800
Subject: [PATCH 05/10] [Add] add rewardbench (#2029)

* add rewardbench

* add rewardbench
---
 examples/eval_rewardbench.py                  | 53 ++++++++++++++
 .../configs/datasets/judge/rewardbench.py     | 71 +++++++++++++++++++
 .../configs/summarizers/rewardbench.py        | 11 +++
 opencompass/datasets/__init__.py              |  1 +
 opencompass/datasets/judge/__init__.py        |  1 +
 opencompass/datasets/judge/rewardbench.py     | 56 +++++++++++++++
 opencompass/openicl/icl_evaluator/__init__.py |  1 +
 .../icl_evaluator/icl_judge_evaluator.py      | 33 +++++++++
 8 files changed, 227 insertions(+)
 create mode 100644 examples/eval_rewardbench.py
 create mode 100644 opencompass/configs/datasets/judge/rewardbench.py
 create mode 100644 opencompass/configs/summarizers/rewardbench.py
 create mode 100644 opencompass/datasets/judge/__init__.py
 create mode 100644 opencompass/datasets/judge/rewardbench.py
 create mode 100644 opencompass/openicl/icl_evaluator/icl_judge_evaluator.py

diff --git a/examples/eval_rewardbench.py b/examples/eval_rewardbench.py
new file mode 100644
index 00000000..9a3a6efc
--- /dev/null
+++ b/examples/eval_rewardbench.py
@@ -0,0 +1,53 @@
+from mmengine.config import read_base
+with read_base():
+    from opencompass.configs.datasets.judge.rewardbench import get_rewardbench_datasets
+    from opencompass.configs.summarizers.rewardbench import summarizer
+
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
+from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
+from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
+from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
+from opencompass.runners import SlurmSequentialRunner
+from opencompass.tasks import OpenICLInferTask
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask
+
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ]
+)
+datasets = [*get_rewardbench_datasets]
+
+from opencompass.models import TurboMindModelwithChatTemplate
+
+models = [
+    dict(
+        type=TurboMindModelwithChatTemplate,
+        abbr='qwen-7b-hf',
+        path='Qwen/Qwen-7B',
+        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
+        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
+        max_seq_len=16384,
+        max_out_len=2048,
+        batch_size=16,
+        run_cfg=dict(num_gpus=1),
+    ),
+]
+
+
+infer = dict(
+    partitioner=dict(type=NaivePartitioner),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=72,
+        task=dict(type=OpenICLInferTask),
+    ),
+)
+
+
+
+work_dir = './outputs/rewardbench/'
diff --git a/opencompass/configs/datasets/judge/rewardbench.py b/opencompass/configs/datasets/judge/rewardbench.py
new file mode 100644
index 00000000..a77e4e2d
--- /dev/null
+++ b/opencompass/configs/datasets/judge/rewardbench.py
@@ -0,0 +1,71 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import JudgeEvaluator
+from opencompass.datasets import RewardBenchDataset
+
+
+subjective_reader_cfg = dict(
+    input_columns=['prompt'],
+    output_column='judge',
+    )
+
+data_path = './data/judgeeval/rewardbench'
+subjective_all_sets = ['llmbar-natural.json', 'llmbar-adver-GPTInst.json', 'hep-go.json', 'refusals-dangerous.json', 'hep-cpp.json', 'mt-bench-easy.json', 'alpacaeval-length.json', 'llmbar-adver-neighbor.json', 'alpacaeval-easy.json', 'hep-java.json', 'llmbar-adver-GPTOut.json', 'mt-bench-hard.json', 'xstest-should-respond.json', 'xstest-should-refuse.json', 'hep-python.json', 'refusals-offensive.json', 'alpacaeval-hard.json', 'llmbar-adver-manual.json', 'hep-js.json', 'math-prm.json', 'hep-rust.json', 'mt-bench-med.json', 'donotanswer.json']
+get_rewardbench_datasets = []
+
+
+
+prompt_choice_prefix = """
+Please act as an impartial judge to evaluate the responses provided by two AI assistants to the user question below. Your evaluation should focus on the following criteria: helpfulness, relevance, accuracy, depth, creativity, and level of detail.
+
+- Do not let the order of presentation, response length, or assistant names influence your judgment.
+- Base your decision solely on how well each response addresses the user’s question and adheres to the instructions.
+
+Your final reply must be structured in the following format:
+{
+  "Choice": "[Model A or Model B]"
+}
+"""
+
+prompt_choice_en = """User Question: {question}
+
+Model A's Response: {answerA}
+
+Model B's Response: {answerB}
+
+Now it's your turn. Please provide selection result as required:
+"""
+
+for _name in subjective_all_sets:
+    subjective_infer_cfg = dict(
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=prompt_choice_prefix + prompt_choice_en
+                    ),
+                ]),
+            ),
+            retriever=dict(type=ZeroRetriever),
+            inferencer=dict(type=GenInferencer, max_out_len=4096),
+        )
+
+    rewardbench_eval_cfg = dict(
+        evaluator=dict(
+            type=JudgeEvaluator,
+        ),
+    )
+
+    get_rewardbench_datasets.append(
+        dict(
+            abbr=f'{_name.split(".")[0]}',
+            type=RewardBenchDataset,
+            path=data_path,
+            name=_name,
+            reader_cfg=subjective_reader_cfg,
+            infer_cfg=subjective_infer_cfg,
+            eval_cfg=rewardbench_eval_cfg,
+            mode='singlescore',
+        ))
diff --git a/opencompass/configs/summarizers/rewardbench.py b/opencompass/configs/summarizers/rewardbench.py
new file mode 100644
index 00000000..477f1a56
--- /dev/null
+++ b/opencompass/configs/summarizers/rewardbench.py
@@ -0,0 +1,11 @@
+RewardBench_summary_groups = []
+
+_RewardBench_weights = {'alpacaeval-easy': 0.08088826366559486,'alpacaeval-length': 0.08088826366559486,'alpacaeval-hard': 0.08088826366559486,'mt-bench-easy': 0.0028135048231511255,'mt-bench-med': 0.004521704180064309,'mt-bench-hard': 0.024245689655172414,'llmbar-natural': 0.05387931034482758,'llmbar-adver-neighbor': 0.07219827586206896,'llmbar-adver-GPTInst': 0.04956896551724138,'llmbar-adver-GPTOut': 0.025323275862068964,'llmbar-adver-manual': 0.02478448275862069,'refusals-dangerous': 0.033783783783783786,'refusals-offensive': 0.033783783783783786,'xstest-should-refuse': 0.05202702702702703,'xstest-should-respond': 0.08445945945945946,'donotanswer': 0.04594594594594595,'math-prm': 0.07809224318658281,'hep-cpp': 0.0286512928022362,'hep-go': 0.0286512928022362,'hep-java': 0.0286512928022362,'hep-js': 0.0286512928022362,'hep-python': 0.0286512928022362,'hep-rust': 0.0286512928022362,}
+RewardBench_summary_groups.append({'name': 'RewardBench', 'subsets': list(_RewardBench_weights.keys()), 'weights': _RewardBench_weights})
+
+summarizer = dict(
+    dataset_abbrs=[
+        'RewardBench'
+    ],
+    summary_groups=RewardBench_summary_groups,
+)
\ No newline at end of file
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 3e4cc6fc..b00162d1 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -71,6 +71,7 @@ from .infinitebench import *  # noqa: F401, F403
 from .iwslt2017 import *  # noqa: F401, F403
 from .jigsawmultilingual import *  # noqa: F401, F403
 from .jsonl import JsonlDataset  # noqa: F401, F403
+from .judge import *  # noqa: F401, F403
 from .kaoshi import KaoshiDataset, KaoshiEvaluator  # noqa: F401, F403
 from .korbench import *  # noqa: F401, F403
 from .lambada import *  # noqa: F401, F403
diff --git a/opencompass/datasets/judge/__init__.py b/opencompass/datasets/judge/__init__.py
new file mode 100644
index 00000000..be6a7ee9
--- /dev/null
+++ b/opencompass/datasets/judge/__init__.py
@@ -0,0 +1 @@
+from .rewardbench import RewardBenchDataset  # noqa: F401, F403
diff --git a/opencompass/datasets/judge/rewardbench.py b/opencompass/datasets/judge/rewardbench.py
new file mode 100644
index 00000000..9533ae17
--- /dev/null
+++ b/opencompass/datasets/judge/rewardbench.py
@@ -0,0 +1,56 @@
+# flake8: noqa
+import json
+import os.path as osp
+import re
+
+import numpy as np
+import pandas as pd
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator import BaseEvaluator
+from opencompass.registry import (DICT_POSTPROCESSORS, ICL_EVALUATORS,
+                                  LOAD_DATASET)
+from opencompass.utils import get_data_path
+
+from ..base import BaseDataset
+
+@LOAD_DATASET.register_module()
+class RewardBenchDataset(BaseDataset):
+
+    def load(self, path: str, name: str, *args, **kwargs):
+
+        path = get_data_path(path, local_mode=True)
+        filename = osp.join(path, f'{name}')
+        raw_data = []
+        with open(filename, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+            for item in data:
+                conversation_a = item['chosen']
+                conversation_b = item['rejected']
+                model_a = item['chosen_model']
+                model_b = item['rejected_model']
+                question = item['prompt']
+                winner = item['winner']
+                if winner == 'B':
+                    conversation_a, conversation_b = conversation_b, conversation_a
+                    model_a, model_b = model_b, model_a
+                subset = item['subset']
+                lan = 'en'
+                raw_data.append({
+                    'question': question,
+                    'answerA': conversation_a,
+                    'answerB': conversation_b,
+                    'judge': {
+                        'prompt': item['prompt'],
+                        'Answer_A': conversation_a,
+                        'Answer_B': conversation_b,
+                        'subset': subset,
+                        'winner': winner,
+                        'model_a': model_a,
+                        'model_b': model_b,
+                        'dataset_name': 'rewardbench',
+                        'lan': lan
+                    }
+                })
+        dataset = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/openicl/icl_evaluator/__init__.py b/opencompass/openicl/icl_evaluator/__init__.py
index fa8f25ab..47e2ae27 100644
--- a/opencompass/openicl/icl_evaluator/__init__.py
+++ b/opencompass/openicl/icl_evaluator/__init__.py
@@ -6,6 +6,7 @@ from .icl_circular_evaluator import CircularEvaluator  # noqa
 from .icl_em_evaluator import EMEvaluator  # noqa
 from .icl_hf_evaluator import *  # noqa
 from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator  # noqa
+from .icl_judge_evaluator import JudgeEvaluator  # noqa
 from .icl_misc_evaluator import AverageInferencePPLEvaluator  # noqa
 from .icl_misc_evaluator import AverageMinKEvaluator  # noqa
 from .icl_misc_evaluator import AveragePPLEvaluator  # noqa
diff --git a/opencompass/openicl/icl_evaluator/icl_judge_evaluator.py b/opencompass/openicl/icl_evaluator/icl_judge_evaluator.py
new file mode 100644
index 00000000..e50afae8
--- /dev/null
+++ b/opencompass/openicl/icl_evaluator/icl_judge_evaluator.py
@@ -0,0 +1,33 @@
+# flake8: noqa
+"""KOR-Bench Evaluator."""
+
+import json
+import os
+import re
+
+from .icl_base_evaluator import BaseEvaluator
+
+
+class JudgeEvaluator(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {'error': 'preds and refrs have different length'}
+        correct = 0
+        count = 0
+        details = []
+        for prediction, reference in zip(predictions, references):
+            choice = prediction.split("\"Choice\": \"Model ")[-1][0]
+            gold_winner = reference.get('winner', '')
+            detail = {
+                'pred': prediction,
+                'answer': gold_winner,
+                'correct': False
+            }
+            count += 1
+            if choice == gold_winner:
+                correct += 1
+                detail['correct'] = True
+            details.append(detail)
+        result = {'accuracy': 100 * correct / count, 'details': details}
+        return result

From 455bb05d1b959bcdf240ddc656798545aef1136c Mon Sep 17 00:00:00 2001
From: Linchen Xiao <xxllcc1993@gmail.com>
Date: Mon, 21 Apr 2025 18:55:06 +0800
Subject: [PATCH 06/10] [Update] Update dataset configs (#2030)

* [Update] Update dataset configs

* Fix lint
---
 .../datasets/ChemBench/ChemBench_llmjudge_gen_c584cf.py    | 2 +-
 opencompass/datasets/TheoremQA/utils.py                    | 7 ++++++-
 opencompass/datasets/judge/rewardbench.py                  | 1 +
 opencompass/datasets/smolinstruct.py                       | 1 +
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/opencompass/configs/datasets/ChemBench/ChemBench_llmjudge_gen_c584cf.py b/opencompass/configs/datasets/ChemBench/ChemBench_llmjudge_gen_c584cf.py
index d6fc7e46..86e9739c 100644
--- a/opencompass/configs/datasets/ChemBench/ChemBench_llmjudge_gen_c584cf.py
+++ b/opencompass/configs/datasets/ChemBench/ChemBench_llmjudge_gen_c584cf.py
@@ -86,7 +86,7 @@ for _name in chembench_all_sets:
             ),
             dataset_cfg=dict(
                 type=ChemBenchDataset,
-                path='/fs-computility/llm/xiaolinchen/opencompass_fork/data/ChemBench4K',
+                path='opencompass/ChemBench4K',
                 name=_name,
                 reader_cfg=chembench_reader_cfg,
             ),
diff --git a/opencompass/datasets/TheoremQA/utils.py b/opencompass/datasets/TheoremQA/utils.py
index ca9c2661..e6a35e4f 100644
--- a/opencompass/datasets/TheoremQA/utils.py
+++ b/opencompass/datasets/TheoremQA/utils.py
@@ -33,7 +33,12 @@ def extract_theoremqa_answer(pred: str, answer_flag: bool = True):
             try:
                 with time_limit(1):
                     tmp = str(latex2sympy(pred))
-                    pred = str(eval(tmp))
+                    pred = eval(tmp)
+                    if isinstance(pred, tuple):
+                        pred = str(list(pred))
+                    else:
+                        pred = str(pred)
+
             except Exception:
                 if re.match(r'-?[\d\.]+\s\D+$', pred):
                     pred = pred.split(' ')[0]
diff --git a/opencompass/datasets/judge/rewardbench.py b/opencompass/datasets/judge/rewardbench.py
index 9533ae17..e951dc22 100644
--- a/opencompass/datasets/judge/rewardbench.py
+++ b/opencompass/datasets/judge/rewardbench.py
@@ -14,6 +14,7 @@ from opencompass.utils import get_data_path
 
 from ..base import BaseDataset
 
+
 @LOAD_DATASET.register_module()
 class RewardBenchDataset(BaseDataset):
 
diff --git a/opencompass/datasets/smolinstruct.py b/opencompass/datasets/smolinstruct.py
index 4589d606..54c58a6b 100644
--- a/opencompass/datasets/smolinstruct.py
+++ b/opencompass/datasets/smolinstruct.py
@@ -4,6 +4,7 @@ from collections import defaultdict
 
 import numpy as np
 from datasets import Dataset, DatasetDict, load_dataset
+from nltk.translate.meteor_score import meteor_score
 
 from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
 from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,

From bf74f26603ed7b27ded518930ff882f23c421361 Mon Sep 17 00:00:00 2001
From: Linchen Xiao <xxllcc1993@gmail.com>
Date: Tue, 22 Apr 2025 18:27:48 +0800
Subject: [PATCH 07/10] [Update] Safe SmolInstruct meteor calculation (#2033)

---
 opencompass/datasets/smolinstruct.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/opencompass/datasets/smolinstruct.py b/opencompass/datasets/smolinstruct.py
index 54c58a6b..cd3b21ed 100644
--- a/opencompass/datasets/smolinstruct.py
+++ b/opencompass/datasets/smolinstruct.py
@@ -409,7 +409,13 @@ class MeteorEvaluator(BaseEvaluator):
         avg_score = 0
         details = []
         for pred, ans in zip(predictions, references):
-            score = meteor_score([ans.split()], pred.split())
+            try:
+                score = (meteor_score([ans.split()], pred.split())
+                         if ans and pred else 0.0)
+            except AttributeError:
+                logging.error(f'Failed to compute METEOR'
+                              f"score:\npred='{pred}'\nans='{ans}'")
+                score = 0.0
             avg_score += score
             detail = {'pred': pred, 'answer': ans, 'score': score}
             details.append(detail)

From dcbf899369dae31af8f1ecb9a59eb2c50cf2aad4 Mon Sep 17 00:00:00 2001
From: Linchen Xiao <xxllcc1993@gmail.com>
Date: Wed, 23 Apr 2025 11:10:30 +0800
Subject: [PATCH 08/10] [Bug] Fix SmolInsturct logger import (#2036)

---
 opencompass/datasets/smolinstruct.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/opencompass/datasets/smolinstruct.py b/opencompass/datasets/smolinstruct.py
index cd3b21ed..e9577335 100644
--- a/opencompass/datasets/smolinstruct.py
+++ b/opencompass/datasets/smolinstruct.py
@@ -9,6 +9,7 @@ from nltk.translate.meteor_score import meteor_score
 from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
 from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
                                   TEXT_POSTPROCESSORS)
+from opencompass.utils import get_logger
 
 from .base import BaseDataset
 
@@ -413,8 +414,9 @@ class MeteorEvaluator(BaseEvaluator):
                 score = (meteor_score([ans.split()], pred.split())
                          if ans and pred else 0.0)
             except AttributeError:
-                logging.error(f'Failed to compute METEOR'
-                              f"score:\npred='{pred}'\nans='{ans}'")
+                self.logger = get_logger()
+                self.logger.warning(f'Failed to compute METEOR'
+                                    f"score:\npred='{pred}'\nans='{ans}'")
                 score = 0.0
             avg_score += score
             detail = {'pred': pred, 'answer': ans, 'score': score}

From 97010dc4ce9ee0e0684431125d84e3e132312f8e Mon Sep 17 00:00:00 2001
From: Junnan Liu <to.liujn@outlook.com>
Date: Wed, 23 Apr 2025 16:16:28 +0800
Subject: [PATCH 09/10] [Update] Update dataset repeat concatenation (#2039)

---
 opencompass/datasets/base.py | 38 ++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/opencompass/datasets/base.py b/opencompass/datasets/base.py
index 7099b0c6..ac6c4570 100644
--- a/opencompass/datasets/base.py
+++ b/opencompass/datasets/base.py
@@ -1,7 +1,6 @@
-from copy import deepcopy
 from typing import Dict, List, Optional, Union
 
-from datasets import Dataset, DatasetDict
+from datasets import Dataset, DatasetDict, concatenate_datasets
 
 from opencompass.openicl import DatasetReader
 
@@ -19,28 +18,25 @@ class BaseDataset:
         assert (max(k) if isinstance(k, List) else
                 k) <= n, 'Maximum value of `k` must less than or equal to `n`'
         if isinstance(dataset, Dataset):
-            examples = []
-            for idx, example in enumerate(dataset):
-                if 'subdivision' not in example:
-                    example['subdivision'] = abbr
-                if 'idx' not in example:
-                    example['idx'] = idx
-                examples.append(example)
-            examples = sum([deepcopy(examples) for _ in range(n)], [])
-            self.dataset = Dataset.from_list(examples)
+            dataset = dataset.map(lambda x, idx: {
+                'subdivision': abbr,
+                'idx': idx
+            },
+                                  with_indices=True,
+                                  writer_batch_size=16)
+            dataset = concatenate_datasets([dataset] * n)
+            self.dataset = dataset
         else:
             self.dataset = DatasetDict()
             for key in dataset:
-                examples = []
-                for idx, example in enumerate(dataset[key]):
-                    if 'subdivision' not in example:
-                        example['subdivision'] = f'{abbr}_{key}'
-                    if 'idx' not in example:
-                        example['idx'] = idx
-                    examples.append(example)
-                print(abbr, key, len(examples))
-                examples = sum([deepcopy(examples) for _ in range(n)], [])
-                self.dataset[key] = Dataset.from_list(examples)
+                dataset[key] = dataset[key].map(lambda x, idx: {
+                    'subdivision': f'{abbr}_{key}',
+                    'idx': idx
+                },
+                                                with_indices=True,
+                                                writer_batch_size=16)
+                dataset[key] = concatenate_datasets([dataset[key]] * n)
+                self.dataset[key] = dataset[key]
         self._init_reader(**reader_cfg)
 
     def _init_reader(self, **kwargs):

From e8bc8c1e8c29aff1c1ab8a2a647da0864affe8ef Mon Sep 17 00:00:00 2001
From: Linchen Xiao <xxllcc1993@gmail.com>
Date: Fri, 25 Apr 2025 14:10:33 +0800
Subject: [PATCH 10/10] [Bug] Concat OpenaiSDK reasoning content (#2041)

* [Bug] Concat OpenaiSDK reasoning content

* [Bug] Concat OpenaiSDK reasoning content

* update

* update
---
 opencompass/models/openai_api.py | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py
index 6ef11b8f..9c2baed1 100644
--- a/opencompass/models/openai_api.py
+++ b/opencompass/models/openai_api.py
@@ -661,18 +661,32 @@ class OpenAISDK(OpenAI):
                         pass  # noqa F841
 
                 # Check if response is empty or content is empty
-                if not responses.choices or not responses.choices[
-                        0].message.content:
+                if (not responses.choices or not responses.choices[0].message
+                        or not responses.choices[0].message.content):
                     self.logger.error(
-                        'API response is empty, it might be due to excessive '
-                        'input length or an internal server error '
-                        'from your API provider.')
+                        'Failed to extract content from the responses. '
+                        'Please check the API response for detail information.'
+                        'API responses: %s',
+                        responses,
+                    )
                     num_retries += 1
                     # Continue to retry instead of returning empty response
                     continue
-                # If the model has reasoning_content, concat it
-                # with the content
-                if hasattr(responses.choices[0].message, 'reasoning_content'):
+
+                # Concat Reasoning Content and tags to content
+                if (hasattr(responses.choices[0].message, 'reasoning_content')
+                        and responses.choices[0].message.reasoning_content):
+                    if self.verbose:
+                        self.logger.info(
+                            'Follow'
+                            'vllm/reasoning/deepseek_r1_reasoning_parser'
+                            'to parse the reasoning content and tags'
+                            'Reasoning Content: %s, \n'
+                            'Tags: %s, \n'
+                            'Content: %s',
+                            responses.choices[0].message.reasoning_content,
+                            self.think_tag,
+                            responses.choices[0].message.content)
                     return (responses.choices[0].message.reasoning_content +
                             self.think_tag +
                             responses.choices[0].message.content)