Mirror of https://github.com/open-compass/opencompass.git, synced 2025-05-30 16:03:24 +08:00

Commit b0b209e443: Merge branch 'open-compass:main' into main
@@ -8,6 +8,7 @@ exclude: |
         opencompass/datasets/lawbench/utils|
         opencompass/datasets/lawbench/evaluation_functions/|
         opencompass/datasets/medbench/|
+        opencompass/datasets/matbench/|
         opencompass/datasets/teval/|
         opencompass/datasets/NPHardEval/|
         opencompass/datasets/TheoremQA|
@@ -110,6 +110,12 @@
     paper: ''
     configpath: opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py
     configpath_llmjudge: ''
+- matbench:
+    name: matbench
+    category: Science / Material
+    paper: 'https://www.nature.com/articles/s41524-020-00406-3'
+    configpath: opencompass/configs/datasets/matbench/matbench_gen_f71840.py
+    configpath_llmjudge: ''
 - medbench:
     name: MedBench
     category: Knowledge / Medicine
@@ -1010,4 +1016,10 @@
     category: Science
     paper: https://arxiv.org/pdf/2503.21821
     configpath: ''
     configpath_llmjudge: opencompass/configs/datasets/PHYSICS/PHYSICS_llm_judge_gen_a133a2.py
+- smolinstruct:
+    name: SmolInstruct
+    category: Science /Chemistry
+    paper: https://arxiv.org/pdf/2402.09391
+    configpath: opencompass/configs/datasets/SmolInstruct/smolinstruct_gen.py
+    configpath_llmjudge: ''
examples/eval_rewardbench.py (new file, 53 lines)
@@ -0,0 +1,53 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.judge.rewardbench import get_rewardbench_datasets
    from opencompass.configs.summarizers.rewardbench import summarizer

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
datasets = [*get_rewardbench_datasets]

from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-7b-hf',
        path='Qwen/Qwen-7B',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    ),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=72,
        task=dict(type=OpenICLInferTask),
    ),
)

work_dir = './outputs/rewardbench/'
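Assuming a standard OpenCompass checkout, an example config like the one above is typically launched with `python run.py examples/eval_rewardbench.py`; predictions, evaluation results, and the weighted RewardBench summary then land under the `work_dir` set at the bottom of the file.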
examples/eval_rmb.py (new file, 53 lines)
@@ -0,0 +1,53 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.judge.rmb import get_rmb_dataset

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner, NumWorkerPartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner, DLCRunner, VOLCRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
datasets = [*get_rmb_dataset]

from opencompass.models import TurboMindModelwithChatTemplate

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='qwen-7b-hf',
        path='Qwen/Qwen-7B',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    ),
]

infer = dict(
    # partitioner=dict(type=NaivePartitioner),
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=72,
        task=dict(type=OpenICLInferTask),
    ),
)

work_dir = './outputs/rmb/'
@@ -1,77 +1,4 @@
-from opencompass.openicl.icl_prompt_template import PromptTemplate
-from opencompass.openicl.icl_retriever import FixKRetriever
-from opencompass.openicl.icl_inferencer import GenInferencer
-from opencompass.openicl.icl_evaluator import AccEvaluator
-from opencompass.datasets import ChemBenchDataset
-from opencompass.utils.text_postprocessors import first_capital_postprocess
-
-
-chembench_reader_cfg = dict(
-    input_columns=['input', 'A', 'B', 'C', 'D'],
-    output_column='target',
-    train_split='dev')
-
-chembench_all_sets = [
-    'Name_Conversion',
-    'Property_Prediction',
-    'Mol2caption',
-    'Caption2mol',
-    'Product_Prediction',
-    'Retrosynthesis',
-    'Yield_Prediction',
-    'Temperature_Prediction',
-    'Solvent_Prediction'
-]
-
-
-chembench_datasets = []
-for _name in chembench_all_sets:
-    # _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
-    _hint = f'There is a single choice question about chemistry. Answer the question by replying A, B, C or D.'
-
-    chembench_infer_cfg = dict(
-        ice_template=dict(
-            type=PromptTemplate,
-            template=dict(round=[
-                dict(
-                    role='HUMAN',
-                    prompt=
-                    f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
-                ),
-                dict(role='BOT', prompt='{target}\n')
-            ]),
-        ),
-        prompt_template=dict(
-            type=PromptTemplate,
-            template=dict(
-                begin='</E>',
-                round=[
-                    dict(
-                        role='HUMAN',
-                        prompt=
-                        f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
-                    ),
-                ],
-            ),
-            ice_token='</E>',
-        ),
-        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
-        inferencer=dict(type=GenInferencer),
-    )
-
-    chembench_eval_cfg = dict(
-        evaluator=dict(type=AccEvaluator),
-        pred_postprocessor=dict(type=first_capital_postprocess))
-
-    chembench_datasets.append(
-        dict(
-            abbr=f'ChemBench_{_name}',
-            type=ChemBenchDataset,
-            path='opencompass/ChemBench',
-            name=_name,
-            reader_cfg=chembench_reader_cfg,
-            infer_cfg=chembench_infer_cfg,
-            eval_cfg=chembench_eval_cfg,
-        ))
-
-del _name, _hint
+from mmengine.config import read_base
+
+with read_base():
+    from .ChemBench_gen_a9f753 import chembench_datasets  # noqa: F401, F403
@@ -0,0 +1,77 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ChemBenchDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess


chembench_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')

chembench_all_sets = [
    'Name_Conversion',
    'Property_Prediction',
    'Mol2caption',
    'Caption2mol',
    'Product_Prediction',
    'Retrosynthesis',
    'Yield_Prediction',
    'Temperature_Prediction',
    'Solvent_Prediction'
]


chembench_datasets = []
for _name in chembench_all_sets:
    # _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
    _hint = f'There is a single choice question about chemistry. Answer the question by replying A, B, C or D.'

    chembench_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=
                    f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
                ),
                dict(role='BOT', prompt='{target}\n')
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=
                        f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
                    ),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    chembench_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess))

    chembench_datasets.append(
        dict(
            abbr=f'ChemBench_{_name}',
            type=ChemBenchDataset,
            path='opencompass/ChemBench4K',
            name=_name,
            reader_cfg=chembench_reader_cfg,
            infer_cfg=chembench_infer_cfg,
            eval_cfg=chembench_eval_cfg,
        ))

del _name, _hint
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .ChemBench_llmjudge_gen_c584cf import chembench_datasets  # noqa: F401, F403
@@ -0,0 +1,108 @@
from opencompass.datasets.math import MATHDataset
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import ChemBenchDataset


chembench_reader_cfg = dict(
    input_columns=['input', 'A', 'B', 'C', 'D'],
    output_column='target',
    train_split='dev')


GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.


<Original Question Begin>: \n {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n<Original Question End>\n\n
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()

chembench_all_sets = [
    'Name_Conversion',
    'Property_Prediction',
    'Mol2caption',
    'Caption2mol',
    'Product_Prediction',
    'Retrosynthesis',
    'Yield_Prediction',
    'Temperature_Prediction',
    'Solvent_Prediction'
]
_hint = f'There is a single choice question about chemistry. Answer the question by replying A, B, C or D.'

chembench_datasets = []
for _name in chembench_all_sets:
    chembench_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(role='HUMAN', prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ')
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer)
    )

    # Evaluation configuration
    chembench_eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt = GRADER_TEMPLATE
                        ),
                    ]),
            ),
            dataset_cfg=dict(
                type=ChemBenchDataset,
                path='opencompass/ChemBench4K',
                name=_name,
                reader_cfg=chembench_reader_cfg,
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
        pred_role='BOT',
    )

    chembench_datasets.append(
        dict(
            abbr=f'ChemBench_{_name}',
            type=ChemBenchDataset,
            path='opencompass/ChemBench4K',
            name=_name,
            reader_cfg=chembench_reader_cfg,
            infer_cfg=chembench_infer_cfg,
            eval_cfg=chembench_eval_cfg,
        ))
@@ -0,0 +1,73 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import FTSEvaluator
from opencompass.datasets import SmolInstructDataset

fts_reader_cfg = dict(
    input_columns=['input'],
    output_column='output',
    train_split='validation')

fts_hint_dict = {
    'MG': """You are an expert chemist. Given the description of a molecule, your task is to generate the potential SMILES representation of the molecule.
The input contains the description of the molecule. Your reply should contain the potential SMILES representation of the molecule wrapped in <SMILES> and </SMILES> tags. Your reply must be valid and chemically reasonable.""",
    'FS': """You are an expert chemist. Given the SMILES representation of reactants and reagents, your task is to predict the potential product using your chemical reaction knowledge.
The input contains both reactants and reagents, and different reactants and reagents are separated by ".". Your reply should contain only the SMILES representation of the predicted product and no other text. Your reply must be valid and chemically reasonable.""",
    'RS': """You are an expert chemist. Given the SMILES representation of the product, your task is to predict the potential reactants and reagents using your chemical reaction knowledge.
The input contains the SMILES representation of the product. Your reply should contain the SMILES representation of both reactants and reagents, and different reactants and reagents should be separated by ".". Your reply must be valid and chemically reasonable.""",
}

name_dict = {
    'MG': 'molecule_generation',
    'FS': 'forward_synthesis',
    'RS': 'retrosynthesis'
}

fts_datasets = []
for _name in fts_hint_dict:
    _hint = fts_hint_dict[_name]
    fts_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
                ),
                dict(role='BOT', prompt='{output}\n')
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
                    ),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0]),
        inferencer=dict(type=GenInferencer),
    )

    fts_eval_cfg = dict(
        evaluator=dict(type=FTSEvaluator),
    )

    fts_datasets.append(
        dict(
            abbr=f'{_name}',
            type=SmolInstructDataset,
            path='osunlp/SMolInstruct',
            name=name_dict[_name],
            reader_cfg=fts_reader_cfg,
            infer_cfg=fts_infer_cfg,
            eval_cfg=fts_eval_cfg,
        ))

del _name, _hint
@@ -0,0 +1,10 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_nc_gen_c84c18 import nc_datasets
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_pp_acc_gen_8607a3 import pp_acc_datasets
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_rmse_gen_0fcc6b import pp_rmse_datasets
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_fts_gen_5774b5 import fts_datasets
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_meteor_gen_065150 import meteor_datasets

smolinstruct_datasets = nc_datasets + pp_rmse_datasets + pp_acc_datasets + meteor_datasets + fts_datasets
@@ -0,0 +1,67 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import MeteorEvaluator
from opencompass.datasets import SmolInstructDataset

meteor_reader_cfg = dict(
    input_columns=['input'],
    output_column='output',
    train_split='validation')

meteor_hint_dict = {
    'MC': """You are an expert chemist. Given the SMILES representation of a molecule, your task is to describe the molecule in natural language.
The input contains the SMILES representation of the molecule. Your reply should contain a natural language description of the molecule. Your reply must be valid and chemically reasonable.""",
}

name_dict = {
    'MC': 'molecule_captioning',
}

meteor_datasets = []
for _name in meteor_hint_dict:
    _hint = meteor_hint_dict[_name]
    meteor_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
                ),
                dict(role='BOT', prompt='{output}\n')
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
                    ),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0]),
        inferencer=dict(type=GenInferencer),
    )

    meteor_eval_cfg = dict(
        evaluator=dict(type=MeteorEvaluator),
    )

    meteor_datasets.append(
        dict(
            abbr=f'{_name}',
            type=SmolInstructDataset,
            path='osunlp/SMolInstruct',
            name=name_dict[_name],
            reader_cfg=meteor_reader_cfg,
            infer_cfg=meteor_infer_cfg,
            eval_cfg=meteor_eval_cfg,
        ))

del _name, _hint
@@ -0,0 +1,93 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import NCExactMatchEvaluator, NCElementMatchEvaluator
from opencompass.datasets import SmolInstructDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess

nc_reader_cfg = dict(
    input_columns=['input'],
    output_column='output',
    train_split='validation')

nc_hint_dict = {
    'I2F': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the molecular formula of the compound.
The input contains the IUPAC representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
    'I2S': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the SMILES representation of the compound.
The input contains the IUPAC representation of the compound. Your reply should contain only the SMILES representation of the compound wrapped in <SMILES> and </SMILES> tags and no other text. Your reply must be valid and chemically reasonable.""",
    'S2F': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the molecular formula of the compound.
The input contains the SMILES representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
    'S2I': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the IUPAC representation of the compound.
The input contains the SMILES representation of the compound. Your reply should contain only the IUPAC representation of the compound wrapped in <IUPAC> and </IUPAC> tags and no other text. Your reply must be valid and chemically reasonable.""",
}

name_dict = {
    'I2F': 'name_conversion-i2f',
    'I2S': 'name_conversion-i2s',
    'S2F': 'name_conversion-s2f',
    'S2I': 'name_conversion-s2i',
}

nc_datasets = []
for _name in nc_hint_dict:
    _hint = nc_hint_dict[_name]
    nc_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
                ),
                dict(role='BOT', prompt='{output}\n')
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
                    ),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0]),
        inferencer=dict(type=GenInferencer),
    )
    # nc_infer_cfg = dict(
    #     prompt_template=dict(
    #         type=PromptTemplate,
    #         template=dict(
    #             round=[
    #                 dict(role='HUMAN', prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '),
    #             ],
    #         ),
    #     ),
    #     retriever=dict(type=ZeroRetriever),
    #     inferencer=dict(type=GenInferencer),
    # )
    if _name in ['I2F', 'S2F']:
        nc_eval_cfg = dict(
            evaluator=dict(type=NCElementMatchEvaluator),
        )
    else:
        nc_eval_cfg = dict(
            evaluator=dict(type=NCExactMatchEvaluator),
        )

    nc_datasets.append(
        dict(
            abbr=f'NC-{_name}',
            type=SmolInstructDataset,
            path='osunlp/SMolInstruct',
            name=name_dict[_name],
            reader_cfg=nc_reader_cfg,
            infer_cfg=nc_infer_cfg,
            eval_cfg=nc_eval_cfg,
        ))

del _name, _hint
@@ -0,0 +1,79 @@
from opencompass.openicl import AccEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import SmolInstructDataset
from opencompass.datasets.smolinstruct import smolinstruct_acc_postprocess

pp_acc_reader_cfg = dict(
    input_columns=['input'],
    output_column='output',
    train_split='validation')

pp_acc_hint_dict = {
    'BBBP': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether blood-brain barrier permeability (BBBP) is a property of the compound.
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
    'ClinTox': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound is toxic.
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
    'HIV': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound serve as an inhibitor of HIV replication.
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
    'SIDER': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound has any side effects.
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
}

name_dict = {
    'BBBP': 'property_prediction-bbbp',
    'ClinTox': 'property_prediction-clintox',
    'HIV': 'property_prediction-hiv',
    'SIDER': 'property_prediction-sider',
}

pp_acc_datasets = []
for _name in pp_acc_hint_dict:
    _hint = pp_acc_hint_dict[_name]

    pp_acc_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
                ),
                dict(role='BOT', prompt='{output}\n')
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
                    ),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0]),
        inferencer=dict(type=GenInferencer),
    )

    pp_acc_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=smolinstruct_acc_postprocess)
    )

    pp_acc_datasets.append(
        dict(
            abbr=f'PP-{_name}',
            type=SmolInstructDataset,
            path='osunlp/SMolInstruct',
            name=name_dict[_name],
            reader_cfg=pp_acc_reader_cfg,
            infer_cfg=pp_acc_infer_cfg,
            eval_cfg=pp_acc_eval_cfg,
        ))

del _name, _hint
@@ -0,0 +1,70 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import RMSEEvaluator
from opencompass.datasets import SmolInstructDataset

pp_rmse_reader_cfg = dict(
    input_columns=['input'],
    output_column='output',
    train_split='validation')

pp_rmse_hint_dict = {
    'ESOL': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the log solubility of the compound.
The input contains the SMILES representation of the compound. Your reply should contain the log solubility of the compound wrapped in <NUMBER> and </NUMBER> tags. Your reply must be valid and chemically reasonable.""",
    'Lipo': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the octanol/water partition coefficient of the compound.
The input contains the SMILES representation of the compound. Your reply should contain the octanol/water partition coefficient of the compound wrapped in <NUMBER> and </NUMBER> tags. Your reply must be valid and chemically reasonable."""
}

name_dict = {
    'ESOL': 'property_prediction-esol',
    'Lipo': 'property_prediction-lipo'
}

pp_rmse_datasets = []
for _name in pp_rmse_hint_dict:
    _hint = pp_rmse_hint_dict[_name]
    pp_rmse_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
                ),
                dict(role='BOT', prompt='{output}\n')
            ]),
        ),
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin='</E>',
                round=[
                    dict(
                        role='HUMAN',
                        prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
                    ),
                ],
            ),
            ice_token='</E>',
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0]),
        inferencer=dict(type=GenInferencer),
    )

    pp_rmse_eval_cfg = dict(
        evaluator=dict(type=RMSEEvaluator),
    )

    pp_rmse_datasets.append(
        dict(
            abbr=f'PP-{_name}',
            type=SmolInstructDataset,
            path='osunlp/SMolInstruct',
            name=name_dict[_name],
            reader_cfg=pp_rmse_reader_cfg,
            infer_cfg=pp_rmse_infer_cfg,
            eval_cfg=pp_rmse_eval_cfg,
        ))

del _name, _hint
opencompass/configs/datasets/judge/rewardbench.py (new file, 71 lines)
@@ -0,0 +1,71 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import JudgeEvaluator
from opencompass.datasets import RewardBenchDataset


subjective_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='judge',
)

data_path = './data/judgeeval/rewardbench'
subjective_all_sets = ['llmbar-natural.json', 'llmbar-adver-GPTInst.json', 'hep-go.json', 'refusals-dangerous.json', 'hep-cpp.json', 'mt-bench-easy.json', 'alpacaeval-length.json', 'llmbar-adver-neighbor.json', 'alpacaeval-easy.json', 'hep-java.json', 'llmbar-adver-GPTOut.json', 'mt-bench-hard.json', 'xstest-should-respond.json', 'xstest-should-refuse.json', 'hep-python.json', 'refusals-offensive.json', 'alpacaeval-hard.json', 'llmbar-adver-manual.json', 'hep-js.json', 'math-prm.json', 'hep-rust.json', 'mt-bench-med.json', 'donotanswer.json']
get_rewardbench_datasets = []


prompt_choice_prefix = """
Please act as an impartial judge to evaluate the responses provided by two AI assistants to the user question below. Your evaluation should focus on the following criteria: helpfulness, relevance, accuracy, depth, creativity, and level of detail.

- Do not let the order of presentation, response length, or assistant names influence your judgment.
- Base your decision solely on how well each response addresses the user’s question and adheres to the instructions.

Your final reply must be structured in the following format:
{
"Choice": "[Model A or Model B]"
}
"""

prompt_choice_en = """User Question: {question}

Model A's Response: {answerA}

Model B's Response: {answerB}

Now it's your turn. Please provide selection result as required:
"""

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_choice_prefix + prompt_choice_en
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=4096),
    )

    rewardbench_eval_cfg = dict(
        evaluator=dict(
            type=JudgeEvaluator,
        ),
    )

    get_rewardbench_datasets.append(
        dict(
            abbr=f'{_name.split(".")[0]}',
            type=RewardBenchDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=rewardbench_eval_cfg,
            mode='singlescore',
        ))
opencompass/configs/datasets/judge/rmb.py (new file, 70 lines)
@@ -0,0 +1,70 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import RMBEvaluator
from opencompass.datasets import RMBDataset


subjective_reader_cfg = dict(
    input_columns=['prompt'],
    output_column='judge',
)

data_path = './data/judgeeval/rmb_dataset'
subjective_all_sets = ['rmb_dataset.json']
get_rmb_dataset = []


prompt_choice_prefix = """
Please act as an impartial judge to evaluate the responses provided by two AI assistants to the user question below. Your evaluation should focus on the following criteria: helpfulness, relevance, accuracy, depth, creativity, and level of detail.

- Do not let the order of presentation, response length, or assistant names influence your judgment.
- Base your decision solely on how well each response addresses the user’s question and adheres to the instructions.

Your final reply must be structured in the following format:
{
"Choice": "[Model A or Model B]"
}
"""

prompt_choice_en = """User Question: {question}

Model A's Response: {answerA}

Model B's Response: {answerB}

Now it's your turn. Please provide selection result as required:
"""

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=prompt_choice_prefix + prompt_choice_en
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=4096),
    )

    rmb_eval_cfg = dict(
        evaluator=dict(
            type=RMBEvaluator,
        ),
    )

    get_rmb_dataset.append(
        dict(
            abbr=f'{_name.split(".")[0]}',
            type=RMBDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=rmb_eval_cfg,
            mode='singlescore',
        ))
@@ -0,0 +1,97 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets.livemathbench import LiveMathBenchDataset
from opencompass.datasets import generic_llmjudge_postprocess

livemathbench_reader_cfg = dict(
    input_columns=['question'], output_column='answer'
)


# Inference configuration
livemathbench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{question}\n',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


# Template for the LLM judge
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()


splits = ['hard']
livemathbench_datasets = []
for split in splits:
    # Dataset configuration
    livemathbench_datasets.append(
        dict(
            type=LiveMathBenchDataset,
            abbr=f'livemathbench_{split}',
            path='opencompass/LiveMathBench',
            dataset_splits = [split],
            dataset_languages= ['cn', 'en'],
            reader_cfg=livemathbench_reader_cfg,
            infer_cfg=livemathbench_infer_cfg,
            eval_cfg=dict(
                # # Evaluation configuration using LLM as judge
                evaluator=dict(
                    type=GenericLLMEvaluator,
                    prompt_template=dict(
                        type=PromptTemplate,
                        template=dict(
                            begin=[
                                dict(
                                    role='SYSTEM',
                                    fallback_role='HUMAN',
                                    prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                                )
                            ],
                            round=[
                                dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                            ],
                        ),
                    ),
                    dataset_cfg=dict(
                        type=LiveMathBenchDataset,
                        path='opencompass/LiveMathBench202412',
                        dataset_splits = [split],
                        reader_cfg=livemathbench_reader_cfg,
                    ),
                    judge_cfg={},
                    dict_postprocessor=dict(type=generic_llmjudge_postprocess),
                ),
            ),
        )
    )
opencompass/configs/datasets/matbench/matbench_gen.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .matbench_gen_f71840 import matbench_datasets  # noqa: F401, F403
opencompass/configs/datasets/matbench/matbench_gen_f71840.py (new file, 55 lines)
@@ -0,0 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets.matbench.matbench import MatbenchDataset, MatbenchEvaluator_regression, MatbenchEvaluator_classification


matbench_reader_cfg = dict(
    input_columns=['problem'], output_column='answer')

matbench_tasks = ['matbench_steels','matbench_expt_gap', 'matbench_expt_is_metal','matbench_glass']

matbench_datasets = []

for task in matbench_tasks:
    if task in ['matbench_expt_is_metal','matbench_glass']:
        matbench_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[dict(role='HUMAN', prompt=f'{{problem}} Please present your answer by yes or no, do not output anything else.')])),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer))

        matbench_eval_cfg = dict(
            evaluator=dict(type=MatbenchEvaluator_classification),
            pred_role='BOT')

    elif task in ['matbench_steels','matbench_expt_gap']:
        matbench_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[dict(role='HUMAN', prompt=f'{{problem}} Please present your answer by one float number, do not output anything else.')])),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer))

        matbench_eval_cfg = dict(
            evaluator=dict(type=MatbenchEvaluator_regression),
            pred_role='BOT')

    matbench_datasets.append(
        dict(
            type=MatbenchDataset,
            path=f'opencompass/Matbench',
            task=task,
            abbr=task,
            reader_cfg=matbench_reader_cfg,
            infer_cfg=matbench_infer_cfg,
            eval_cfg=matbench_eval_cfg))
@@ -0,0 +1,69 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WritingBenchDataset, writingbench_postprocess
from mmengine.config import read_base

subjective_reader_cfg = dict(
    input_columns=['question'],
    output_column='judge',
)

subjective_all_sets = [
    'writingbench'
]

writingbench_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt='{question}'
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer,),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            multi_eval=True,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt='You are an expert evaluator with extensive experience in evaluating response of given query.')
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt = '{prediction}'
                        ),
                    ]),
            ),
            dict_postprocessor=dict(type=writingbench_postprocess),
        ),
        pred_role='BOT',
    )

    writingbench_datasets.append(
        dict(
            abbr=f'{_name}',
            type=WritingBenchDataset,
            path='./data/subjective/writingbench',
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
            mode='singlescore',
        ))
opencompass/configs/summarizers/rewardbench.py (new file, 11 lines)
@@ -0,0 +1,11 @@
RewardBench_summary_groups = []

_RewardBench_weights = {'alpacaeval-easy': 0.08088826366559486,'alpacaeval-length': 0.08088826366559486,'alpacaeval-hard': 0.08088826366559486,'mt-bench-easy': 0.0028135048231511255,'mt-bench-med': 0.004521704180064309,'mt-bench-hard': 0.024245689655172414,'llmbar-natural': 0.05387931034482758,'llmbar-adver-neighbor': 0.07219827586206896,'llmbar-adver-GPTInst': 0.04956896551724138,'llmbar-adver-GPTOut': 0.025323275862068964,'llmbar-adver-manual': 0.02478448275862069,'refusals-dangerous': 0.033783783783783786,'refusals-offensive': 0.033783783783783786,'xstest-should-refuse': 0.05202702702702703,'xstest-should-respond': 0.08445945945945946,'donotanswer': 0.04594594594594595,'math-prm': 0.07809224318658281,'hep-cpp': 0.0286512928022362,'hep-go': 0.0286512928022362,'hep-java': 0.0286512928022362,'hep-js': 0.0286512928022362,'hep-python': 0.0286512928022362,'hep-rust': 0.0286512928022362,}
RewardBench_summary_groups.append({'name': 'RewardBench', 'subsets': list(_RewardBench_weights.keys()), 'weights': _RewardBench_weights})

summarizer = dict(
    dataset_abbrs=[
        'RewardBench'
    ],
    summary_groups=RewardBench_summary_groups,
)
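The summary group above folds the per-subset accuracies into a single RewardBench number. A minimal sketch of the aggregation implied by such a weighted summary group (an illustration only, not the actual OpenCompass summarizer code):

def weighted_rewardbench_score(subset_scores: dict, weights: dict) -> float:
    # Weighted mean of per-subset scores; the weights above sum to roughly 1,
    # so the division is mostly a normalization safeguard.
    total_weight = sum(weights.values())
    return sum(weights[name] * subset_scores[name] for name in weights) / total_weight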
@@ -33,7 +33,12 @@ def extract_theoremqa_answer(pred: str, answer_flag: bool = True):
         try:
             with time_limit(1):
                 tmp = str(latex2sympy(pred))
-                pred = str(eval(tmp))
+                pred = eval(tmp)
+                if isinstance(pred, tuple):
+                    pred = str(list(pred))
+                else:
+                    pred = str(pred)
+
         except Exception:
             if re.match(r'-?[\d\.]+\s\D+$', pred):
                 pred = pred.split(' ')[0]
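The added branch only changes how tuple-valued results are stringified; a small illustration with assumed example values:

tmp = (1, 2)        # e.g. a pair of solutions returned by eval() of the sympy expression
str(tmp)            # '(1, 2)'  -- old behaviour via str(eval(tmp))
str(list(tmp))      # '[1, 2]'  -- new behaviour: tuples are rendered in list style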
@@ -71,6 +71,7 @@ from .infinitebench import *  # noqa: F401, F403
 from .iwslt2017 import *  # noqa: F401, F403
 from .jigsawmultilingual import *  # noqa: F401, F403
 from .jsonl import JsonlDataset  # noqa: F401, F403
+from .judge import *  # noqa: F401, F403
 from .kaoshi import KaoshiDataset, KaoshiEvaluator  # noqa: F401, F403
 from .korbench import *  # noqa: F401, F403
 from .lambada import *  # noqa: F401, F403
@@ -87,6 +88,7 @@ from .longbench import *  # noqa: F401, F403
 from .longbenchv2 import *  # noqa: F401, F403
 from .lveval import *  # noqa: F401, F403
 from .mastermath2024v1 import *  # noqa: F401, F403
+from .matbench import *  # noqa: F401, F403
 from .math import *  # noqa: F401, F403
 from .math401 import *  # noqa: F401, F403
 from .math_intern import *  # noqa: F401, F403
@@ -127,6 +129,7 @@ from .scibench import ScibenchDataset, scibench_postprocess  # noqa: F401, F403
 from .scicode import *  # noqa: F401, F403
 from .simpleqa import *  # noqa: F401, F403
 from .siqa import *  # noqa: F401, F403
+from .smolinstruct import *  # noqa: F401, F403
 from .squad20 import SQuAD20Dataset, SQuAD20Evaluator  # noqa: F401, F403
 from .storycloze import *  # noqa: F401, F403
 from .strategyqa import *  # noqa: F401, F403
@ -1,7 +1,6 @@
-from copy import deepcopy
 from typing import Dict, List, Optional, Union

-from datasets import Dataset, DatasetDict
+from datasets import Dataset, DatasetDict, concatenate_datasets

 from opencompass.openicl import DatasetReader

@ -19,28 +18,25 @@ class BaseDataset:
         assert (max(k) if isinstance(k, List) else
                 k) <= n, 'Maximum value of `k` must less than or equal to `n`'
         if isinstance(dataset, Dataset):
-            examples = []
-            for idx, example in enumerate(dataset):
-                if 'subdivision' not in example:
-                    example['subdivision'] = abbr
-                if 'idx' not in example:
-                    example['idx'] = idx
-                examples.append(example)
-            examples = sum([deepcopy(examples) for _ in range(n)], [])
-            self.dataset = Dataset.from_list(examples)
+            dataset = dataset.map(lambda x, idx: {
+                'subdivision': abbr,
+                'idx': idx
+            },
+                                  with_indices=True,
+                                  writer_batch_size=16)
+            dataset = concatenate_datasets([dataset] * n)
+            self.dataset = dataset
         else:
             self.dataset = DatasetDict()
             for key in dataset:
-                examples = []
-                for idx, example in enumerate(dataset[key]):
-                    if 'subdivision' not in example:
-                        example['subdivision'] = f'{abbr}_{key}'
-                    if 'idx' not in example:
-                        example['idx'] = idx
-                    examples.append(example)
-                print(abbr, key, len(examples))
-                examples = sum([deepcopy(examples) for _ in range(n)], [])
-                self.dataset[key] = Dataset.from_list(examples)
+                dataset[key] = dataset[key].map(lambda x, idx: {
+                    'subdivision': f'{abbr}_{key}',
+                    'idx': idx
+                },
+                                                with_indices=True,
+                                                writer_batch_size=16)
+                dataset[key] = concatenate_datasets([dataset[key]] * n)
+                self.dataset[key] = dataset[key]

         self._init_reader(**reader_cfg)

     def _init_reader(self, **kwargs):
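The rewritten branches above tag every example with `subdivision` and `idx` through `Dataset.map(..., with_indices=True)` and then repeat the whole split n times with `concatenate_datasets`, instead of building Python lists and deep copies row by row. A small self-contained illustration of that behavior on toy data (independent of OpenCompass):

from datasets import Dataset, concatenate_datasets

ds = Dataset.from_list([{'question': 'Q1'}, {'question': 'Q2'}])
# Tag each row with a split name and its original index, as the new code does.
ds = ds.map(lambda x, idx: {'subdivision': 'demo', 'idx': idx},
            with_indices=True)
n = 3
repeated = concatenate_datasets([ds] * n)
print(len(repeated))               # 6: every example now appears n times
print(repeated[0] == repeated[2])  # True: index 2 is the first row of the second copy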
@ -4,6 +4,7 @@ import os.path as osp
 from datasets import Dataset, DatasetDict

 from opencompass.registry import LOAD_DATASET
+from opencompass.utils import get_data_path

 from .base import BaseDataset

@ -14,6 +15,7 @@ class ChemBenchDataset(BaseDataset):
     @staticmethod
     def load(path: str, name: str):
         dataset = DatasetDict()
+        path = get_data_path(path)
         for split in ['dev', 'test']:
             raw_data = []
             filename = osp.join(path, split, f'{name}_benchmark.json')
opencompass/datasets/judge/__init__.py (new file)
@ -0,0 +1,2 @@
from .rewardbench import RewardBenchDataset  # noqa: F401, F403
from .rmb import RMBDataset  # noqa: F401, F403
opencompass/datasets/judge/rewardbench.py (new file)
@ -0,0 +1,57 @@
|
# flake8: noqa
|
||||||
|
import json
|
||||||
|
import os.path as osp
|
||||||
|
import re
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from datasets import Dataset
|
||||||
|
|
||||||
|
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
||||||
|
from opencompass.registry import (DICT_POSTPROCESSORS, ICL_EVALUATORS,
|
||||||
|
LOAD_DATASET)
|
||||||
|
from opencompass.utils import get_data_path
|
||||||
|
|
||||||
|
from ..base import BaseDataset
|
||||||
|
|
||||||
|
|
||||||
|
@LOAD_DATASET.register_module()
|
||||||
|
class RewardBenchDataset(BaseDataset):
|
||||||
|
|
||||||
|
def load(self, path: str, name: str, *args, **kwargs):
|
||||||
|
|
||||||
|
path = get_data_path(path, local_mode=True)
|
||||||
|
filename = osp.join(path, f'{name}')
|
||||||
|
raw_data = []
|
||||||
|
with open(filename, 'r', encoding='utf-8') as f:
|
||||||
|
data = json.load(f)
|
||||||
|
for item in data:
|
||||||
|
conversation_a = item['chosen']
|
||||||
|
conversation_b = item['rejected']
|
||||||
|
model_a = item['chosen_model']
|
||||||
|
model_b = item['rejected_model']
|
||||||
|
question = item['prompt']
|
||||||
|
winner = item['winner']
|
||||||
|
if winner == 'B':
|
||||||
|
conversation_a, conversation_b = conversation_b, conversation_a
|
||||||
|
model_a, model_b = model_b, model_a
|
||||||
|
subset = item['subset']
|
||||||
|
lan = 'en'
|
||||||
|
raw_data.append({
|
||||||
|
'question': question,
|
||||||
|
'answerA': conversation_a,
|
||||||
|
'answerB': conversation_b,
|
||||||
|
'judge': {
|
||||||
|
'prompt': item['prompt'],
|
||||||
|
'Answer_A': conversation_a,
|
||||||
|
'Answer_B': conversation_b,
|
||||||
|
'subset': subset,
|
||||||
|
'winner': winner,
|
||||||
|
'model_a': model_a,
|
||||||
|
'model_b': model_b,
|
||||||
|
'dataset_name': 'rewardbench',
|
||||||
|
'lan': lan
|
||||||
|
}
|
||||||
|
})
|
||||||
|
dataset = Dataset.from_list(raw_data)
|
||||||
|
return dataset
|
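RewardBenchDataset.load above keeps the preferred conversation in the slot named by the record's `winner` field: each record starts with the chosen answer in slot A, and when `winner` is 'B' the two conversations and model names are swapped. A standalone sketch of that normalization on a made-up record (field values are hypothetical):

item = {'prompt': 'Which is heavier, 1 kg of iron or 1 kg of cotton?',
        'chosen': 'They weigh the same.', 'rejected': 'Iron is heavier.',
        'chosen_model': 'model-x', 'rejected_model': 'model-y',
        'winner': 'B', 'subset': 'llmbar-natural'}

conversation_a, conversation_b = item['chosen'], item['rejected']
model_a, model_b = item['chosen_model'], item['rejected_model']
if item['winner'] == 'B':
    # Swap so the chosen answer ends up in slot B, matching the recorded label.
    conversation_a, conversation_b = conversation_b, conversation_a
    model_a, model_b = model_b, model_a
print(conversation_b)  # 'They weigh the same.' -- the preferred answer now sits in slot B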
opencompass/datasets/judge/rmb.py (new file)
@ -0,0 +1,99 @@
|
# flake8: noqa
|
||||||
|
import json
|
||||||
|
import os.path as osp
|
||||||
|
import re
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from datasets import Dataset
|
||||||
|
|
||||||
|
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
||||||
|
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
|
||||||
|
from opencompass.utils import get_data_path
|
||||||
|
|
||||||
|
from ..base import BaseDataset
|
||||||
|
|
||||||
|
|
||||||
|
@LOAD_DATASET.register_module()
|
||||||
|
class RMBDataset(BaseDataset):
|
||||||
|
|
||||||
|
def load(self, path: str, name: str, *args, **kwargs):
|
||||||
|
path = get_data_path(path, local_mode=True)
|
||||||
|
filename = osp.join(path, f'{name}')
|
||||||
|
raw_data = []
|
||||||
|
with open(filename, 'r', encoding='utf-8') as f:
|
||||||
|
data = json.load(f)
|
||||||
|
for item in data:
|
||||||
|
if item['subset'] == 'pair':
|
||||||
|
raw_data.extend(self.load_pair(item))
|
||||||
|
elif item['subset'] == 'bon':
|
||||||
|
raw_data.extend(self.loadbon(item))
|
||||||
|
else:
|
||||||
|
raise NotImplementedError
|
||||||
|
dataset = Dataset.from_list(raw_data)
|
||||||
|
return dataset
|
||||||
|
|
||||||
|
def load_pair(self, item):
|
||||||
|
raw_item_list = []
|
||||||
|
conversation_a = item['chosen']['answer']
|
||||||
|
conversation_b = item['reject']['answer']
|
||||||
|
question = ''
|
||||||
|
for line in item['conversation_input']:
|
||||||
|
if line['role'] == 'user':
|
||||||
|
question += '\n\n ### User:' + line['content']
|
||||||
|
else:
|
||||||
|
question += '\n\n ### Assistant:' + line['content']
|
||||||
|
question += '\n\n ### Assistant:'
|
||||||
|
winner = 'A'
|
||||||
|
pair_uid = item['pair_uid']
|
||||||
|
subset = item['subset']
|
||||||
|
goal = item['goal']
|
||||||
|
raw_item = {
|
||||||
|
'question': question,
|
||||||
|
'answerA': conversation_a,
|
||||||
|
'answerB': conversation_b,
|
||||||
|
'judge': {
|
||||||
|
'question': question,
|
||||||
|
'Answer_A': conversation_a,
|
||||||
|
'Answer_B': conversation_b,
|
||||||
|
'winner': winner,
|
||||||
|
'pair_uid': pair_uid,
|
||||||
|
'subset': subset,
|
||||||
|
'goal': goal,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
raw_item_list.append(raw_item)
|
||||||
|
return raw_item_list
|
||||||
|
|
||||||
|
def loadbon(self, item):
|
||||||
|
raw_item_list = []
|
||||||
|
conversation_a = item['bon_best']['answer']
|
||||||
|
question = ''
|
||||||
|
for line in item['conversation_input']:
|
||||||
|
if line['role'] == 'user':
|
||||||
|
question += '\n\n ### User:' + line['content']
|
||||||
|
else:
|
||||||
|
question += '\n\n ### Assistant:' + line['content']
|
||||||
|
question += '\n\n ### Assistant:'
|
||||||
|
bon_uid = item['bon_uid']
|
||||||
|
subset = item['subset']
|
||||||
|
goal = item['goal']
|
||||||
|
for loser in item['loser_list']:
|
||||||
|
conversation_b = loser['answer']
|
||||||
|
winner = 'A'
|
||||||
|
raw_item = {
|
||||||
|
'question': question,
|
||||||
|
'answerA': conversation_a,
|
||||||
|
'answerB': conversation_b,
|
||||||
|
'judge': {
|
||||||
|
'question': question,
|
||||||
|
'Answer_A': conversation_a,
|
||||||
|
'Answer_B': conversation_b,
|
||||||
|
'winner': winner,
|
||||||
|
'bon_uid': bon_uid,
|
||||||
|
'subset': subset,
|
||||||
|
'goal': goal,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
raw_item_list.append(raw_item)
|
||||||
|
return raw_item_list
|
opencompass/datasets/matbench/__init__.py (new file)
@ -0,0 +1,3 @@
# flake8: noqa

from .matbench import *  # noqa: F401, F403
opencompass/datasets/matbench/matbench.py (new file)
@ -0,0 +1,87 @@
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
from datasets import Dataset
|
||||||
|
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
|
||||||
|
recall_score)
|
||||||
|
|
||||||
|
from opencompass.datasets.matbench.post_process import (parse_float_answer,
|
||||||
|
parse_true_false_answer
|
||||||
|
)
|
||||||
|
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
||||||
|
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
|
||||||
|
from opencompass.utils import get_data_path
|
||||||
|
|
||||||
|
from ..base import BaseDataset
|
||||||
|
|
||||||
|
|
||||||
|
@LOAD_DATASET.register_module()
|
||||||
|
class MatbenchDataset(BaseDataset):
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def load(path, task):
|
||||||
|
path = get_data_path(path)
|
||||||
|
path = os.path.join(path,
|
||||||
|
'matbench_base_fold_0_' + task + '_test.json')
|
||||||
|
dataset = []
|
||||||
|
with open(path, 'r', encoding='utf-8') as file:
|
||||||
|
data = json.load(file)
|
||||||
|
for item in data:
|
||||||
|
dataset.append({
|
||||||
|
'problem': item['problem'],
|
||||||
|
'answer': item['answer'],
|
||||||
|
})
|
||||||
|
dataset = Dataset.from_list(dataset)
|
||||||
|
return dataset
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
|
||||||
|
class MatbenchEvaluator_regression(BaseEvaluator):
|
||||||
|
|
||||||
|
def score(self, predictions, references):
|
||||||
|
mae_sum = 0
|
||||||
|
count = 0
|
||||||
|
details = []
|
||||||
|
for pred, ref in zip(predictions, references):
|
||||||
|
pred = parse_float_answer(pred)
|
||||||
|
detail = {'pred': pred, 'answer': ref, 'error': None}
|
||||||
|
count += 1
|
||||||
|
try:
|
||||||
|
error = abs(float(pred) - float(ref))
|
||||||
|
mae_sum += error
|
||||||
|
detail['error'] = error
|
||||||
|
except Exception as e:
|
||||||
|
detail['error'] = str(e)
|
||||||
|
details.append(detail)
|
||||||
|
mae = mae_sum / count if count > 0 else 0
|
||||||
|
result = {'mae': mae, 'details': details}
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
|
||||||
|
class MatbenchEvaluator_classification(BaseEvaluator):
|
||||||
|
|
||||||
|
def score(self, predictions, references):
|
||||||
|
details = []
|
||||||
|
predictions_parsed = []
|
||||||
|
for pred, ref in zip(predictions, references):
|
||||||
|
pred = parse_true_false_answer(pred)
|
||||||
|
detail = {'pred': pred, 'answer': ref, 'correct': False}
|
||||||
|
if pred == ref:
|
||||||
|
detail['correct'] = True
|
||||||
|
details.append(detail)
|
||||||
|
predictions_parsed.append(pred)
|
||||||
|
accuracy = accuracy_score(references, predictions_parsed)
|
||||||
|
precision = precision_score(references,
|
||||||
|
predictions_parsed,
|
||||||
|
average='binary')
|
||||||
|
recall = recall_score(references, predictions_parsed, average='binary')
|
||||||
|
f1 = f1_score(references, predictions_parsed, average='binary')
|
||||||
|
|
||||||
|
return {
|
||||||
|
'accuracy': accuracy,
|
||||||
|
'precision': precision,
|
||||||
|
'recall': recall,
|
||||||
|
'f1_score': f1,
|
||||||
|
'details': details
|
||||||
|
}
|
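MatbenchEvaluator_regression above reports the mean absolute error over all predictions, with unparseable outputs defaulting to 0 via parse_float_answer. The loop below is a toy re-implementation of that scoring, not an import from OpenCompass; the prediction strings are invented:

import re

def first_float(text, default=0.0):
    # Grab the first number in the text, mirroring parse_float_answer's fallback to 0.
    m = re.search(r'[-+]?\d+(\.\d+)?([eE][-+]?\d+)?', text)
    return float(m.group()) if m else default

preds = ['The band gap is about 1.2 eV', '0.8', 'no number here']
refs = [1.0, 1.0, 1.0]
errors = [abs(first_float(p) - r) for p, r in zip(preds, refs)]
print(sum(errors) / len(errors))  # MAE over all three predictions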
opencompass/datasets/matbench/post_process.py (new file)
@ -0,0 +1,25 @@
# flake8: noqa

import re


def parse_float_answer(raw_string, option=''):
    number_pattern = re.compile(r'[-+]?\d+(\.\d+)?([eE][-+]?\d+)?')

    # Search for the first match
    match = number_pattern.search(raw_string)
    if match:
        # Extract the matched number and convert it to float
        return float(match.group())
    else:
        # Default to 0 when no number is found
        return 0


def parse_true_false_answer(raw_string, option=''):
    if 'yes' in raw_string.lower():
        return True
    elif 'no' in raw_string.lower():
        return False
    else:
        # Neither keyword found; default to True
        return True
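A quick usage sketch for the two parsers above. Note that parse_true_false_answer falls back to True when neither keyword appears, so ambiguous generations count as positive predictions; the example strings are invented:

from opencompass.datasets.matbench.post_process import (parse_float_answer,
                                                        parse_true_false_answer)

print(parse_float_answer('The formation energy is about -0.35 eV/atom'))  # -0.35
print(parse_float_answer('I am not sure.'))             # 0 (fallback)
print(parse_true_false_answer('Yes, it is metallic.'))  # True
print(parse_true_false_answer('No.'))                   # False
print(parse_true_false_answer('Hard to say.'))          # True (fallback)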
opencompass/datasets/smolinstruct.py (new file)
@ -0,0 +1,435 @@
|
# flake8: noqa: W605
|
||||||
|
import re
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from datasets import Dataset, DatasetDict, load_dataset
|
||||||
|
from nltk.translate.meteor_score import meteor_score
|
||||||
|
|
||||||
|
from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
|
||||||
|
from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
|
||||||
|
TEXT_POSTPROCESSORS)
|
||||||
|
from opencompass.utils import get_logger
|
||||||
|
|
||||||
|
from .base import BaseDataset
|
||||||
|
|
||||||
|
|
||||||
|
@LOAD_DATASET.register_module()
|
||||||
|
class SmolInstructDataset(BaseDataset):
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def load(path: str, name: str):
|
||||||
|
dataset = DatasetDict()
|
||||||
|
raw_dataset = load_dataset(path)
|
||||||
|
for split in ['validation', 'test']:
|
||||||
|
raw_data = []
|
||||||
|
for data in raw_dataset[split]:
|
||||||
|
if data['task'] == name:
|
||||||
|
raw_data.append(data)
|
||||||
|
dataset[split] = Dataset.from_list(raw_data)
|
||||||
|
return dataset
|
||||||
|
|
||||||
|
|
||||||
|
def extract_chemical_data(text):
|
||||||
|
pattern = re.compile(r'<(MOLFORMULA|SMILES|IUPAC)>(.*?)</\1>', re.DOTALL)
|
||||||
|
matches = pattern.findall(text)
|
||||||
|
if not matches:
|
||||||
|
return []
|
||||||
|
return [match[1].strip() for match in matches]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_molecule(molecular_formula):
|
||||||
|
valid = re.match('([A-Za-z]\d*)+([\+\-]\d*)*$', molecular_formula)
|
||||||
|
if valid is None:
|
||||||
|
raise ValueError("Molecular formula \"%s\" is not valid." %
|
||||||
|
molecular_formula)
|
||||||
|
|
||||||
|
stack = [defaultdict(int)]
|
||||||
|
|
||||||
|
def _parse_formula(formula, _stack):
|
||||||
|
|
||||||
|
# Set remainder equal to 'None'
|
||||||
|
r = None
|
||||||
|
|
||||||
|
# Regular expression matching for each of the three cases:
|
||||||
|
atom = re.match(r'([A-Z][a-z]?)(\d+)?', formula)
|
||||||
|
opening = re.match(r'[\(\[\{]', formula)
|
||||||
|
closing = re.match(r'[\)\]\}](\d+)?', formula)
|
||||||
|
|
||||||
|
# If atom is identified:
|
||||||
|
if atom:
|
||||||
|
r = formula[len(atom.group()):]
|
||||||
|
_stack[-1][atom.group(1)] += int(atom.group(2) or 1)
|
||||||
|
|
||||||
|
# If opening brackets encountered:
|
||||||
|
elif opening:
|
||||||
|
r = formula[len(
|
||||||
|
opening.group()
|
||||||
|
):] # this sets the remainder equal to everything after the opening brackets
|
||||||
|
_stack.append(defaultdict(int))
|
||||||
|
|
||||||
|
# If closing brackets encountered:
|
||||||
|
elif closing:
|
||||||
|
r = formula[len(
|
||||||
|
closing.group()
|
||||||
|
):] # this sets the remainder equal to everything after the closing brackets
|
||||||
|
for k, v in _stack.pop().items():
|
||||||
|
_stack[-1][k] += v * int(
|
||||||
|
closing.group(1)
|
||||||
|
or 1) # v times amount of molecule k, depending on nesting
|
||||||
|
|
||||||
|
# If anything remains, process remainders recursively as nested formulas:
|
||||||
|
if r:
|
||||||
|
_parse_formula(r, _stack)
|
||||||
|
|
||||||
|
return dict(_stack[0])
|
||||||
|
|
||||||
|
result = _parse_formula(molecular_formula, stack)
|
||||||
|
|
||||||
|
charge = re.search('[\+\-]\d*', molecular_formula)
|
||||||
|
if charge is not None:
|
||||||
|
charge_str = charge.group()
|
||||||
|
charge_type = charge_str[0]
|
||||||
|
if len(charge_str) == 1:
|
||||||
|
charge_num = 1
|
||||||
|
else:
|
||||||
|
charge_num = int(charge_str[1:])
|
||||||
|
result[charge_type] = charge_num
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_single_element_match_for_list(predictions, references):
|
||||||
|
# Extract the chemical formulas from the tagged outputs
|
||||||
|
predictions = [
|
||||||
|
extract_chemical_data(prediction) for prediction in predictions
|
||||||
|
]
|
||||||
|
references = [extract_chemical_data(reference) for reference in references]
|
||||||
|
|
||||||
|
ele_match_labels = []
|
||||||
|
ele_invalid_labels = []
|
||||||
|
details = []
|
||||||
|
for pred_formula, gold_formula in zip(predictions, references):
|
||||||
|
gold_formula = gold_formula[0]
|
||||||
|
if pred_formula:
|
||||||
|
pred_formula = pred_formula[0]
|
||||||
|
detail = {'pred': [pred_formula], 'answer': gold_formula}
|
||||||
|
if not pred_formula:  # no formula could be extracted from the prediction
|
||||||
|
ele_invalid_labels.append(False)
|
||||||
|
ele_match_labels.append(False)
|
||||||
|
detail['score'] = [False]
|
||||||
|
details.append(detail)
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
pred_ele = parse_molecule(pred_formula)
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
raise
|
||||||
|
except:
|
||||||
|
# print(pred_formula)
|
||||||
|
# print('=====')
|
||||||
|
ele_invalid_labels.append(True)
|
||||||
|
ele_match_labels.append(False)
|
||||||
|
detail['score'] = [False]
|
||||||
|
details.append(detail)
|
||||||
|
continue
|
||||||
|
ele_invalid_labels.append(False)
|
||||||
|
ele_match = False
|
||||||
|
gold_ele = parse_molecule(gold_formula)
|
||||||
|
if pred_ele == gold_ele:
|
||||||
|
ele_match = True
|
||||||
|
ele_match_labels.append(ele_match)
|
||||||
|
detail['score'] = [ele_match]
|
||||||
|
details.append(detail)
|
||||||
|
|
||||||
|
score = sum(ele_match_labels) / len(predictions) * 100
|
||||||
|
valid_score = 100 - sum(ele_invalid_labels) / len(predictions) * 100
|
||||||
|
|
||||||
|
return {'score': score, 'valid_score': valid_score, 'details': details}
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_single_element_match(predictions, references):
|
||||||
|
# Extract the chemical formulas from the tagged outputs
|
||||||
|
predictions = [
|
||||||
|
extract_chemical_data(prediction) for prediction in predictions
|
||||||
|
]
|
||||||
|
references = [extract_chemical_data(reference) for reference in references]
|
||||||
|
|
||||||
|
ele_match_labels = []
|
||||||
|
ele_invalid_labels = []
|
||||||
|
details = []
|
||||||
|
for pred_formula, gold_formula in zip(predictions, references):
|
||||||
|
gold_formula = gold_formula[0]
|
||||||
|
if pred_formula:
|
||||||
|
pred_formula = pred_formula[0]
|
||||||
|
detail = {'pred': pred_formula, 'answer': gold_formula}
|
||||||
|
if not pred_formula:  # no formula could be extracted from the prediction
|
||||||
|
ele_invalid_labels.append(False)
|
||||||
|
ele_match_labels.append(False)
|
||||||
|
detail['score'] = False
|
||||||
|
details.append(detail)
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
pred_ele = parse_molecule(pred_formula)
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
raise
|
||||||
|
except:
|
||||||
|
# print(pred_formula)
|
||||||
|
# print('=====')
|
||||||
|
ele_invalid_labels.append(True)
|
||||||
|
ele_match_labels.append(False)
|
||||||
|
detail['score'] = False
|
||||||
|
details.append(detail)
|
||||||
|
continue
|
||||||
|
ele_invalid_labels.append(False)
|
||||||
|
ele_match = False
|
||||||
|
gold_ele = parse_molecule(gold_formula)
|
||||||
|
if pred_ele == gold_ele:
|
||||||
|
ele_match = True
|
||||||
|
ele_match_labels.append(ele_match)
|
||||||
|
detail['score'] = ele_match
|
||||||
|
details.append(detail)
|
||||||
|
|
||||||
|
score = sum(ele_match_labels) / len(predictions) * 100
|
||||||
|
valid_score = 100 - sum(ele_invalid_labels) / len(predictions) * 100
|
||||||
|
|
||||||
|
return {'score': score, 'valid_score': valid_score, 'details': details}
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
|
||||||
|
class NCElementMatchEvaluator(BaseEvaluator):
|
||||||
|
"""Element match evaluator for name conversion."""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
def score(self, predictions, references):
|
||||||
|
print('len(predictions):', len(predictions))
|
||||||
|
print('len(references):', len(references))
|
||||||
|
if len(predictions) != len(references):
|
||||||
|
return {
|
||||||
|
'error': 'predictions and references have different '
|
||||||
|
'length'
|
||||||
|
}
|
||||||
|
|
||||||
|
# Top-k predictions arrive as a list per sample and need to be split apart
|
||||||
|
if isinstance(predictions[0], str):
|
||||||
|
return calculate_single_element_match(predictions, references)
|
||||||
|
else:
|
||||||
|
num_k = len(predictions[0])
|
||||||
|
scores = []
|
||||||
|
for i in range(num_k):
|
||||||
|
pred = [prediction[i] for prediction in predictions]
|
||||||
|
ref = references
|
||||||
|
score = calculate_single_element_match_for_list(pred, ref)
|
||||||
|
scores.append(score)
|
||||||
|
# Merge the per-k results back into one complete dict per instance
|
||||||
|
final_details = scores[0]['details']
|
||||||
|
final_scores = [scores[0]['score']]
|
||||||
|
final_valid_scores = [scores[0]['valid_score']]
|
||||||
|
for _k in scores[1:]:
|
||||||
|
for i, _d in enumerate(_k['details']):
|
||||||
|
# print(_d)
|
||||||
|
final_details[i]['pred'].extend(_d['pred'])
|
||||||
|
final_details[i]['score'].extend(_d['score'])
|
||||||
|
final_scores.append(_k['score'])
|
||||||
|
final_valid_scores.append(_k['valid_score'])
|
||||||
|
avg_score = []
|
||||||
|
for _d in final_details:
|
||||||
|
if True in _d['score']:
|
||||||
|
avg_score.append(1)
|
||||||
|
else:
|
||||||
|
avg_score.append(0)
|
||||||
|
max_score = sum(avg_score) / len(avg_score) * 100
|
||||||
|
return {
|
||||||
|
'score': max_score,
|
||||||
|
'all_score': final_scores,
|
||||||
|
'valid_score': final_valid_scores,
|
||||||
|
'details': final_details,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
|
||||||
|
class NCExactMatchEvaluator(BaseEvaluator):
|
||||||
|
"""Exact match evaluator for name conversion."""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
def score(self, predictions, references):
|
||||||
|
if len(predictions) != len(references):
|
||||||
|
return {
|
||||||
|
'error': 'predictions and references have different '
|
||||||
|
'length'
|
||||||
|
}
|
||||||
|
predictions = [
|
||||||
|
extract_chemical_data(prediction) for prediction in predictions
|
||||||
|
]
|
||||||
|
references = [
|
||||||
|
extract_chemical_data(reference) for reference in references
|
||||||
|
]
|
||||||
|
|
||||||
|
cnt = 0
|
||||||
|
valid_cnt = 0
|
||||||
|
details = []
|
||||||
|
for pred, ans in zip(predictions, references):
|
||||||
|
ans = ans[0]
|
||||||
|
if pred:
|
||||||
|
pred = pred[0]
|
||||||
|
valid_cnt += 1
|
||||||
|
detail = {'pred': pred, 'answer': ans}
|
||||||
|
if pred and pred.strip() == ans.strip():
|
||||||
|
cnt += 1
|
||||||
|
detail['correct'] = True
|
||||||
|
else:
|
||||||
|
detail['correct'] = False
|
||||||
|
details.append(detail)
|
||||||
|
|
||||||
|
score = cnt / len(predictions) * 100
|
||||||
|
valid_score = valid_cnt / len(predictions) * 100
|
||||||
|
|
||||||
|
return {'score': score, 'valid_score': valid_score, 'details': details}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_number(text):
|
||||||
|
pattern = re.compile(r'<NUMBER>\s*(-?\d*\.?\d+)\s*</NUMBER>')
|
||||||
|
matches = pattern.findall(text)
|
||||||
|
return [float(match) for match in matches]
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
|
||||||
|
class RMSEEvaluator(BaseEvaluator):
|
||||||
|
"""Exact match evaluator for name conversion."""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
def score(self, predictions, references):
|
||||||
|
if len(predictions) != len(references):
|
||||||
|
return {
|
||||||
|
'error': 'predictions and references have different '
|
||||||
|
'length'
|
||||||
|
}
|
||||||
|
|
||||||
|
avg_score = 0
|
||||||
|
details = []
|
||||||
|
for prediction, reference in zip(predictions, references):
|
||||||
|
pred = extract_number(prediction)
|
||||||
|
ans = extract_number(reference)
|
||||||
|
if not pred:
|
||||||
|
pred = 0
|
||||||
|
else:
|
||||||
|
pred = pred[0]
|
||||||
|
try:
|
||||||
|
ans = ans[0]
|
||||||
|
except:
|
||||||
|
raise ValueError(f'ans: {reference}')
|
||||||
|
detail = {'pred': pred, 'answer': ans}
|
||||||
|
rmse_score = np.sqrt(np.mean((np.array(pred) - np.array(ans))**2))
|
||||||
|
detail['score'] = rmse_score
|
||||||
|
avg_score += rmse_score
|
||||||
|
details.append(detail)
|
||||||
|
|
||||||
|
score = avg_score / len(predictions)
|
||||||
|
|
||||||
|
return {'score': score, 'details': details}
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
|
||||||
|
class FTSEvaluator(BaseEvaluator):
|
||||||
|
"""Exact match evaluator for name conversion."""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
def score(self, predictions, references):
|
||||||
|
if len(predictions) != len(references):
|
||||||
|
return {
|
||||||
|
'error': 'predictions and references have different '
|
||||||
|
'length'
|
||||||
|
}
|
||||||
|
|
||||||
|
predictions = [
|
||||||
|
extract_chemical_data(prediction) for prediction in predictions
|
||||||
|
]
|
||||||
|
references = [
|
||||||
|
extract_chemical_data(reference) for reference in references
|
||||||
|
]
|
||||||
|
|
||||||
|
avg_score = 0
|
||||||
|
valid_cnt = 0
|
||||||
|
details = []
|
||||||
|
for pred, ans in zip(predictions, references):
|
||||||
|
ans = ans[0]
|
||||||
|
if not pred:
|
||||||
|
detail = {'pred': '', 'answer': ans, 'score': 0}
|
||||||
|
details.append(detail)
|
||||||
|
continue
|
||||||
|
pred = pred[0]
|
||||||
|
detail = {'pred': pred, 'answer': ans}
|
||||||
|
# Convert the SMILES strings into RDKit molecule objects
|
||||||
|
from rdkit import Chem
|
||||||
|
mol1 = Chem.MolFromSmiles(pred)
|
||||||
|
mol2 = Chem.MolFromSmiles(ans)
|
||||||
|
if mol1 is None or mol2 is None:
|
||||||
|
detail['score'] = 0
|
||||||
|
details.append(detail)
|
||||||
|
continue
|
||||||
|
valid_cnt += 1
|
||||||
|
# Generate Morgan fingerprints (equivalent to ECFP4)
|
||||||
|
# fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, radius=2, nBits=2048)
|
||||||
|
# fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, radius=2, nBits=2048)
|
||||||
|
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
|
||||||
|
generator = GetMorganGenerator(radius=2, fpSize=2048)
|
||||||
|
fp1 = generator.GetFingerprint(mol1)
|
||||||
|
fp2 = generator.GetFingerprint(mol2)
|
||||||
|
from rdkit.Chem import DataStructs
|
||||||
|
similarity = DataStructs.TanimotoSimilarity(fp1, fp2) * 100
|
||||||
|
detail['score'] = similarity
|
||||||
|
avg_score += similarity
|
||||||
|
details.append(detail)
|
||||||
|
|
||||||
|
score = avg_score / len(predictions)
|
||||||
|
valid_score = valid_cnt / len(predictions) * 100
|
||||||
|
|
||||||
|
return {'score': score, 'valid_score': valid_score, 'details': details}
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
|
||||||
|
class MeteorEvaluator(BaseEvaluator):
|
||||||
|
"""Exact match evaluator for name conversion."""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
super().__init__()
|
||||||
|
|
||||||
|
def score(self, predictions, references):
|
||||||
|
if len(predictions) != len(references):
|
||||||
|
return {
|
||||||
|
'error': 'predictions and references have different '
|
||||||
|
'length'
|
||||||
|
}
|
||||||
|
avg_score = 0
|
||||||
|
details = []
|
||||||
|
for pred, ans in zip(predictions, references):
|
||||||
|
try:
|
||||||
|
score = (meteor_score([ans.split()], pred.split())
|
||||||
|
if ans and pred else 0.0)
|
||||||
|
except AttributeError:
|
||||||
|
self.logger = get_logger()
|
||||||
|
self.logger.warning(f'Failed to compute METEOR'
|
||||||
|
f"score:\npred='{pred}'\nans='{ans}'")
|
||||||
|
score = 0.0
|
||||||
|
avg_score += score
|
||||||
|
detail = {'pred': pred, 'answer': ans, 'score': score}
|
||||||
|
details.append(detail)
|
||||||
|
|
||||||
|
score = avg_score / len(predictions)
|
||||||
|
|
||||||
|
return {'score': score, 'details': details}
|
||||||
|
|
||||||
|
|
||||||
|
@TEXT_POSTPROCESSORS.register_module('smolinstruct-acc')
|
||||||
|
def smolinstruct_acc_postprocess(text: str) -> str:
|
||||||
|
if 'yes' in text.lower():
|
||||||
|
return '<BOOLEAN> Yes </BOOLEAN>'
|
||||||
|
elif 'no' in text.lower():
|
||||||
|
return '<BOOLEAN> No </BOOLEAN>'
|
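The element-match metric above compares element counts parsed out of the tagged formulas rather than the raw strings, so two formulas that list the same atoms in a different order still count as a match. The sketch below mirrors that idea with a deliberately simplified parser (flat formulas only, no brackets or charges); it is a stand-in, not the parse_molecule defined above:

import re
from collections import Counter

def simple_parse(formula):
    # Count atoms in a flat formula, e.g. 'C6H12O6' -> {'C': 6, 'H': 12, 'O': 6}.
    counts = Counter()
    for element, num in re.findall(r'([A-Z][a-z]?)(\d*)', formula):
        counts[element] += int(num or 1)
    return counts

def extract(tagged):
    return re.search(r'<MOLFORMULA>(.*?)</MOLFORMULA>', tagged,
                     re.DOTALL).group(1).strip()

pred = '<MOLFORMULA> C6H12O6 </MOLFORMULA>'
gold = '<MOLFORMULA> H12C6O6 </MOLFORMULA>'
print(simple_parse(extract(pred)) == simple_parse(extract(gold)))  # True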
@ -35,3 +35,4 @@ from .subjective_cmp import SubjectiveCmpDataset  # noqa: F401, F403
 from .wildbench import WildBenchDataset  # noqa: F401, F403
 from .wildbench import wildbench_bradleyterry_postprocess  # noqa: F401, F403
 from .wildbench import wildbench_postprocess  # noqa: F401, F403
+from .writingbench import *
|
116
opencompass/datasets/subjective/writingbench.py
Normal file
116
opencompass/datasets/subjective/writingbench.py
Normal file
@ -0,0 +1,116 @@
|
|||||||
|
# flake8: noqa
|
||||||
|
import json
|
||||||
|
import os.path as osp
|
||||||
|
import re
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from datasets import Dataset
|
||||||
|
|
||||||
|
from opencompass.registry import DICT_POSTPROCESSORS, LOAD_DATASET
|
||||||
|
from opencompass.utils import get_data_path
|
||||||
|
|
||||||
|
from ..base import BaseDataset
|
||||||
|
from .utils import get_judgeanswer_and_reference
|
||||||
|
|
||||||
|
base_prompt = """Evaluate the Response based on the Query and criteria provided.
|
||||||
|
|
||||||
|
** Criteria **
|
||||||
|
```{criteria}```
|
||||||
|
|
||||||
|
** Query **
|
||||||
|
```{question}```
|
||||||
|
|
||||||
|
** Response **
|
||||||
|
```{prediction}```
|
||||||
|
|
||||||
|
Provide your evaluation based on the criteria:
|
||||||
|
|
||||||
|
```{criteria}```
|
||||||
|
|
||||||
|
Provide reasons for each score, indicating where and why any strengths or deficiencies occur within the Response. Reference specific passages or elements from the text to support your justification.
|
||||||
|
Ensure that each reason is concrete, with explicit references to the text that aligns with the criteria requirements.
|
||||||
|
|
||||||
|
Scoring Range: Assign an integer score between 1 to 10
|
||||||
|
|
||||||
|
** Output format **
|
||||||
|
Return the results in the following JSON format, Only output this JSON format and nothing else:
|
||||||
|
```json
|
||||||
|
{{
|
||||||
|
"score": an integer score between 1 to 10,
|
||||||
|
"reason": "Specific and detailed justification for the score using text elements."
|
||||||
|
}}
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
@LOAD_DATASET.register_module()
|
||||||
|
class WritingBenchDataset(BaseDataset):
|
||||||
|
|
||||||
|
def load(self, path: str, name: str, *args, **kwargs):
|
||||||
|
path = get_data_path(path, local_mode=True)
|
||||||
|
filename = osp.join(path, f'{name}.jsonl')
|
||||||
|
raw_data = []
|
||||||
|
with open(filename, 'r', encoding='utf-8') as f:
|
||||||
|
for line in f:
|
||||||
|
data = json.loads(line)
|
||||||
|
domain1 = data['domain1']
|
||||||
|
domain2 = data['domain2']
|
||||||
|
query = data['query']
|
||||||
|
criteria = data['criteria']
|
||||||
|
judge_prompt_list = []
|
||||||
|
for criteria_item in criteria:
|
||||||
|
temp_prompt = base_prompt.format(question=query,
|
||||||
|
criteria=criteria_item,
|
||||||
|
prediction='{prediction}')
|
||||||
|
judge_prompt_list.append(temp_prompt)
|
||||||
|
idx = data['index']
|
||||||
|
raw_data.append({
|
||||||
|
'question': query,
|
||||||
|
'judge': {
|
||||||
|
'index': idx,
|
||||||
|
'domain1': domain1,
|
||||||
|
'domain2': domain2,
|
||||||
|
'query': query,
|
||||||
|
'judge_prompt_list': judge_prompt_list
|
||||||
|
}
|
||||||
|
})
|
||||||
|
dataset = Dataset.from_list(raw_data)
|
||||||
|
return dataset
|
||||||
|
|
||||||
|
|
||||||
|
def post_process_writingbench(judgement: dict):
|
||||||
|
"""Input a string like below:
|
||||||
|
|
||||||
|
{"score": 9, "reason": "The response provides..."}, and extract the score
|
||||||
|
"""
|
||||||
|
match = re.search(r"[\"']score[\"']:\s*([0-9]+)", judgement['prediction'])
|
||||||
|
if match:
|
||||||
|
score = int(match.group(1))
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return {'score': score}
|
||||||
|
|
||||||
|
|
||||||
|
@DICT_POSTPROCESSORS.register_module('writingbench')
|
||||||
|
def writingbench_postprocess(output: dict, output_path: str) -> dict:
|
||||||
|
judged_answers, references = get_judgeanswer_and_reference(
|
||||||
|
output, output_path, post_process_writingbench)
|
||||||
|
|
||||||
|
if len(judged_answers) == 0:
    # Nothing could be parsed from the judge outputs; return the raw output
    # only, which is what the code below would produce with an empty table.
    return {'details': output}

scores = defaultdict(list)
|
||||||
|
for ans, ref in zip(judged_answers, references):
|
||||||
|
domain = ref['domain1']
|
||||||
|
score = ans['score']
|
||||||
|
if score is not None:
|
||||||
|
scores['overall'].append(score)
|
||||||
|
scores[domain].append(score)
|
||||||
|
single_model_scores = {
|
||||||
|
task: sum(score) / len(score)
|
||||||
|
for task, score in scores.items()
|
||||||
|
}
|
||||||
|
results = single_model_scores
|
||||||
|
results['details'] = output
|
||||||
|
return results
|
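post_process_writingbench above pulls the integer score out of the judge reply with a single regex, so it tolerates replies that are close to, but not strictly, valid JSON. A quick standalone check of that behavior with invented judge outputs:

import re

def extract_score(prediction):
    # Same pattern as post_process_writingbench; None when no score is present.
    match = re.search(r"[\"']score[\"']:\s*([0-9]+)", prediction)
    return int(match.group(1)) if match else None

print(extract_score('```json\n{"score": 9, "reason": "well structured"}\n```'))  # 9
print(extract_score("{'score': 7, 'reason': 'ok'}"))  # 7, single quotes also match
print(extract_score('no structured output'))          # None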
@ -661,18 +661,32 @@ class OpenAISDK(OpenAI):
                        pass  # noqa F841

            # Check if response is empty or content is empty
-            if not responses.choices or not responses.choices[
-                    0].message.content:
+            if (not responses.choices or not responses.choices[0].message
+                    or not responses.choices[0].message.content):
                self.logger.error(
-                    'API response is empty, it might be due to excessive '
-                    'input length or an internal server error '
-                    'from your API provider.')
+                    'Failed to extract content from the responses. '
+                    'Please check the API response for detail information.'
+                    'API responses: %s',
+                    responses,
+                )
                num_retries += 1
                # Continue to retry instead of returning empty response
                continue
-            # If the model has reasoning_content, concat it
-            # with the content
-            if hasattr(responses.choices[0].message, 'reasoning_content'):
+
+            # Concat Reasoning Content and tags to content
+            if (hasattr(responses.choices[0].message, 'reasoning_content')
+                    and responses.choices[0].message.reasoning_content):
+                if self.verbose:
+                    self.logger.info(
+                        'Follow'
+                        'vllm/reasoning/deepseek_r1_reasoning_parser'
+                        'to parse the reasoning content and tags'
+                        'Reasoning Content: %s, \n'
+                        'Tags: %s, \n'
+                        'Content: %s',
+                        responses.choices[0].message.reasoning_content,
+                        self.think_tag,
+                        responses.choices[0].message.content)
                return (responses.choices[0].message.reasoning_content +
                        self.think_tag +
                        responses.choices[0].message.content)
@ -6,6 +6,7 @@ from .icl_circular_evaluator import CircularEvaluator  # noqa
 from .icl_em_evaluator import EMEvaluator  # noqa
 from .icl_hf_evaluator import *  # noqa
 from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator  # noqa
+from .icl_judge_evaluator import JudgeEvaluator, RMBEvaluator  # noqa
 from .icl_misc_evaluator import AverageInferencePPLEvaluator  # noqa
 from .icl_misc_evaluator import AverageMinKEvaluator  # noqa
 from .icl_misc_evaluator import AveragePPLEvaluator  # noqa
opencompass/openicl/icl_evaluator/icl_judge_evaluator.py (new file)
@ -0,0 +1,135 @@
|
# flake8: noqa
|
||||||
|
"""KOR-Bench Evaluator."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from .icl_base_evaluator import BaseEvaluator
|
||||||
|
|
||||||
|
|
||||||
|
class JudgeEvaluator(BaseEvaluator):
|
||||||
|
|
||||||
|
def score(self, predictions, references):
|
||||||
|
if len(predictions) != len(references):
|
||||||
|
return {'error': 'preds and refs have different length'}
|
||||||
|
correct = 0
|
||||||
|
count = 0
|
||||||
|
details = []
|
||||||
|
for prediction, reference in zip(predictions, references):
|
||||||
|
choice = prediction.split("\"Choice\": \"Model ")[-1][0]
|
||||||
|
gold_winner = reference.get('winner', '')
|
||||||
|
detail = {
|
||||||
|
'pred': prediction,
|
||||||
|
'answer': gold_winner,
|
||||||
|
'correct': False
|
||||||
|
}
|
||||||
|
count += 1
|
||||||
|
if choice == gold_winner:
|
||||||
|
correct += 1
|
||||||
|
detail['correct'] = True
|
||||||
|
details.append(detail)
|
||||||
|
result = {'accuracy': 100 * correct / count, 'details': details}
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
class RMBEvaluator(BaseEvaluator):
|
||||||
|
|
||||||
|
def calculate_pair_accuracy(self, data):
|
||||||
|
correct = 0
|
||||||
|
total = 0
|
||||||
|
for item in data:
|
||||||
|
choice = item['choice']
|
||||||
|
gold_winner = item['gold_winner']
|
||||||
|
if choice and gold_winner:
|
||||||
|
total += 1
|
||||||
|
if gold_winner == choice:
|
||||||
|
correct += 1
|
||||||
|
|
||||||
|
return correct / total if total > 0 else 0
|
||||||
|
|
||||||
|
def calculate_bon_accuracy(self, data):
|
||||||
|
bon_groups = defaultdict(list)
|
||||||
|
"""计算bon指标的准确率"""
|
||||||
|
|
||||||
|
for item in data:
|
||||||
|
bon_uid = item['bon_uid']
|
||||||
|
if bon_uid:
|
||||||
|
choice = item['choice']
|
||||||
|
gold_winner = item['gold_winner']
|
||||||
|
if choice and gold_winner:
|
||||||
|
bon_groups[bon_uid].append(gold_winner == choice)
|
||||||
|
|
||||||
|
# A bon_uid only counts as correct if every comparison in its group is correct
|
||||||
|
correct_bons = 0
|
||||||
|
for bon_uid, matches in bon_groups.items():
|
||||||
|
if all(matches):
|
||||||
|
correct_bons += 1
|
||||||
|
|
||||||
|
return correct_bons / len(bon_groups) if bon_groups else 0
|
||||||
|
|
||||||
|
def score(self, predictions, references):
|
||||||
|
if len(predictions) != len(references):
|
||||||
|
return {'error': 'preds and refs have different length'}
|
||||||
|
|
||||||
|
# Four data buckets, one for each subset / goal combination
|
||||||
|
bon_help_list = []
|
||||||
|
bon_harm_list = []
|
||||||
|
pair_help_list = []
|
||||||
|
pair_harm_list = []
|
||||||
|
|
||||||
|
# Categorize the data by subset and goal
|
||||||
|
for prediction, reference in zip(predictions, references):
|
||||||
|
choice = prediction.split("\"Choice\": \"Model ")[-1][0]
|
||||||
|
gold_winner = reference.get('winner', '')
|
||||||
|
subset = reference.get('subset', '')
|
||||||
|
goal = reference.get('goal', '')
|
||||||
|
|
||||||
|
data_item = {
|
||||||
|
'choice': choice,
|
||||||
|
'gold_winner': gold_winner,
|
||||||
|
'bon_uid': reference.get('bon_uid', ''),
|
||||||
|
'pair_uid': reference.get('pair_uid', ''),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Route each item into the matching bucket by subset and goal
|
||||||
|
if subset == 'bon':
|
||||||
|
if goal == 'Helpfulness':
|
||||||
|
bon_help_list.append(data_item)
|
||||||
|
elif goal == 'Harmlessness':
|
||||||
|
bon_harm_list.append(data_item)
|
||||||
|
elif subset == 'pair':
|
||||||
|
if goal == 'Helpfulness':
|
||||||
|
pair_help_list.append(data_item)
|
||||||
|
elif goal == 'Harmlessness':
|
||||||
|
pair_harm_list.append(data_item)
|
||||||
|
|
||||||
|
# Accuracy for each of the four combinations
|
||||||
|
bon_help_acc = self.calculate_bon_accuracy(
|
||||||
|
bon_help_list) if bon_help_list else 0
|
||||||
|
bon_harm_acc = self.calculate_bon_accuracy(
|
||||||
|
bon_harm_list) if bon_harm_list else 0
|
||||||
|
pair_help_acc = self.calculate_pair_accuracy(
|
||||||
|
pair_help_list) if pair_help_list else 0
|
||||||
|
pair_harm_acc = self.calculate_pair_accuracy(
|
||||||
|
pair_harm_list) if pair_harm_list else 0
|
||||||
|
|
||||||
|
# Return all results
|
||||||
|
result = {
|
||||||
|
'bon_helpfulness_accuracy':
|
||||||
|
bon_help_acc * 100,
|
||||||
|
'bon_harmlessness_accuracy':
|
||||||
|
bon_harm_acc * 100,
|
||||||
|
'pair_helpfulness_accuracy':
|
||||||
|
pair_help_acc * 100,
|
||||||
|
'pair_harmlessness_accuracy':
|
||||||
|
pair_harm_acc * 100,
|
||||||
|
'bon_average': ((bon_help_acc + bon_harm_acc) / 2) * 100,
|
||||||
|
'pair_average': ((pair_help_acc + pair_harm_acc) / 2) * 100,
|
||||||
|
'total_accuracy':
|
||||||
|
((bon_help_acc + bon_harm_acc + pair_help_acc + pair_harm_acc) / 4)
|
||||||
|
* 100
|
||||||
|
}
|
||||||
|
|
||||||
|
return result
|
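JudgeEvaluator above recovers the verdict by splitting the judge output on the literal marker "Choice": "Model  and taking the next character, so it assumes prompts that ask the judge to reply with Model A or Model B. A standalone sketch of that parsing and the accuracy bookkeeping, with invented judge outputs:

predictions = ['{"Choice": "Model A", "Reason": "more faithful to the prompt"}',
               'I lean towards B. {"Choice": "Model B"}']
references = [{'winner': 'A'}, {'winner': 'A'}]

correct = 0
for prediction, reference in zip(predictions, references):
    choice = prediction.split('"Choice": "Model ')[-1][0]  # 'A' or 'B'
    correct += int(choice == reference.get('winner', ''))
print(100 * correct / len(predictions))  # 50.0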
@ -116,6 +116,7 @@ class LMEvaluator:
        pred_postprocessor (ConfigDict): The model prediction's postprocessor
            config.
        keep_predictions (bool): Whether to save model predictions in references. Useful when postprocessor requires model predictions as input to calculate additional features (e.g. response length, markdown list counts, ...). Defaults to False.
+        multi_eval (bool): Whether to do multiple evaluation with different prompt settings.
    """

    def __init__(
@ -129,7 +130,9 @@ class LMEvaluator:
        pred_postprocessor: Optional[ConfigDict] = None,
        dict_postprocessor: Optional[ConfigDict] = None,
        keep_predictions: bool = False,
+        multi_eval: bool = False,
    ) -> None:
+        self.multi_eval = multi_eval
        self.output_path = output_path
        out_dir, out_name = osp.split(output_path)
        if not out_dir:
@ -209,6 +212,33 @@ class LMEvaluator:
                references = [
                    {} for _ in range(len(predictions[0]['model_preds']))
                ]
+            if self.multi_eval:
+                assert references is not None
+                assert 'judge_prompt_list' in references[0]
+                self.multi_eval_times = len(references[0]['judge_prompt_list'])
+                temp_predictions_save_list = []
+                for idx, pred in enumerate(predictions['model_preds']):
+                    for judge_prompt in references[idx]['judge_prompt_list']:
+                        temp_prediction = judge_prompt.replace(
+                            '{prediction}', pred)
+                        temp_predictions_save_list.append(temp_prediction)
+                predictions['model_preds'] = temp_predictions_save_list
+
+                temp_references_save_list = []
+                for item in references:
+                    new_item = {
+                        key: value
+                        for key, value in item.items()
+                        if key != 'judge_prompt_list'
+                    }
+                    if 'judge_prompt_list' in item:
+                        for prompt in item['judge_prompt_list']:
+                            temp_item = new_item.copy()
+                            temp_item['judge_prompt'] = prompt
+                            temp_references_save_list.append(temp_item)
+                    else:
+                        temp_references_save_list.append(item)
+                references = temp_references_save_list
            predictions = [predictions['model_preds']]

        # Due to the rarity of identical predictions, we have temporarily disabled the plagiarism detection feature.
@ -268,7 +298,12 @@ class LMEvaluator:

        if self.dataset_cfg:
            dataset = build_dataset_from_cfg(self.dataset_cfg)
+            if self.multi_eval:
+                new_ds = {
+                    k: dataset.test[k] * self.multi_eval_times
+                    for k in dataset.test.column_names
+                }
+                dataset.reader.dataset['test'] = Dataset.from_dict(new_ds)
            if infer_order == 'double':
                new_ds = {
                    k: dataset.test[k] * 2
@ -329,4 +364,4 @@ class LMEvaluator:
        else:
            kwargs = self.dict_postprocessor
        proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
        return proc(output, self.output_path, **kwargs)
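With multi_eval enabled, LMEvaluator substitutes each model prediction into every entry of its judge_prompt_list, so one response produces multi_eval_times judge inputs and the references are flattened in the same order. A toy sketch of that expansion, independent of the class above (data shapes are assumed, not taken from a real run):

model_preds = ['Draft essay 1', 'Draft essay 2']
references = [{'judge_prompt_list': ['Grade the style: {prediction}',
                                     'Grade the logic: {prediction}']},
              {'judge_prompt_list': ['Grade the style: {prediction}',
                                     'Grade the logic: {prediction}']}]

expanded_preds, expanded_refs = [], []
for pred, ref in zip(model_preds, references):
    for judge_prompt in ref['judge_prompt_list']:
        expanded_preds.append(judge_prompt.replace('{prediction}', pred))
        expanded_refs.append({'judge_prompt': judge_prompt})
print(len(expanded_preds))  # 4 = 2 predictions x 2 judge prompts each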
@ -27,6 +27,12 @@ DATASETS_MAPPING = {
         "hf_id": "opencompass/ai2_arc",
         "local": "./data/ARC/ARC-e/ARC-Easy-Dev.jsonl",
     },
+    # Matbench
+    "opencompass/Matbench": {
+        # "ms_id": "opencompass/Matbench",
+        "hf_id": "opencompass/Matbench",
+        "local": "./data/Matbench",
+    },
     # BBH
     "opencompass/bbh": {
         "ms_id": "opencompass/bbh",
@ -435,6 +441,11 @@ DATASETS_MAPPING = {
         "hf_id": "",
         "local": "./data/PHYSICS-textonly",
     },
+    "opencompass/ChemBench4K": {
+        "ms_id": "",
+        "hf_id": "",
+        "local": "./data/ChemBench4K",
+    },
 }

@ -659,6 +670,11 @@ DATASETS_URL = {
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SQuAD2.0.zip",
         "md5": "1321cbf9349e1102a57d31d1b2bfdd7e",
     },
+    "/Matbench": {
+        "url":
+        "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/Matbench.zip",
+        "md5": "99f9457f54f4f419da9556af56ac4c24",
+    },
     "mmlu_pro": {
         "url":
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu_pro.zip",
@ -777,5 +793,11 @@ DATASETS_URL = {
         "url":
         "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bigcodebench.zip",
         "md5": "270f399f4142b74f47ecff116cc3b21d"
-    }
+    },
+    "ChemBench4K": {
+        "url":
+        "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ChemBench4K.zip",
+        "md5": "fc23fd21b2566a5dbbebfa4601d7779c"
+    }
 }
@ -19,5 +19,7 @@ math-verify[antlr4_11_0]
 pyext
 # Law Bench
 pypinyin
+# Smolinstruct
+rdkit
 # RULER
 wonderwords