[Dataset] Add SmolInstruct, Update Chembench (#2025)

* [Dataset] Add SmolInstruct, Update Chembench

* Add dataset metadata

* update

* update

* update
Linchen Xiao 2025-04-18 17:21:29 +08:00 committed by GitHub
parent 65ff602cf5
commit b2da1c08a8
17 changed files with 1130 additions and 77 deletions

View File

@ -1010,4 +1010,10 @@
category: Science
paper: https://arxiv.org/pdf/2503.21821
configpath: ''
configpath_llmjudge: opencompass/configs/datasets/PHYSICS/PHYSICS_llm_judge_gen_a133a2.py
- smolinstruct:
name: SmolInstruct
    category: Science / Chemistry
paper: https://arxiv.org/pdf/2402.09391
configpath: opencompass/configs/datasets/SmolInstruct/smolinstruct_gen.py
configpath_llmjudge: ''

View File

@ -1,77 +1,4 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ChemBenchDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
from mmengine.config import read_base
chembench_reader_cfg = dict(
input_columns=['input', 'A', 'B', 'C', 'D'],
output_column='target',
train_split='dev')
chembench_all_sets = [
'Name_Conversion',
'Property_Prediction',
'Mol2caption',
'Caption2mol',
'Product_Prediction',
'Retrosynthesis',
'Yield_Prediction',
'Temperature_Prediction',
'Solvent_Prediction'
]
chembench_datasets = []
for _name in chembench_all_sets:
# _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
_hint = f'There is a single choice question about chemistry. Answer the question by replying A, B, C or D.'
chembench_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
),
dict(role='BOT', prompt='{target}\n')
]),
),
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=
f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
),
],
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=GenInferencer),
)
chembench_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_capital_postprocess))
chembench_datasets.append(
dict(
abbr=f'ChemBench_{_name}',
type=ChemBenchDataset,
path='opencompass/ChemBench',
name=_name,
reader_cfg=chembench_reader_cfg,
infer_cfg=chembench_infer_cfg,
eval_cfg=chembench_eval_cfg,
))
del _name, _hint
with read_base():
from .ChemBench_gen_a9f753 import chembench_datasets # noqa: F401, F403

View File

@ -0,0 +1,77 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ChemBenchDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
chembench_reader_cfg = dict(
input_columns=['input', 'A', 'B', 'C', 'D'],
output_column='target',
train_split='dev')
chembench_all_sets = [
'Name_Conversion',
'Property_Prediction',
'Mol2caption',
'Caption2mol',
'Product_Prediction',
'Retrosynthesis',
'Yield_Prediction',
'Temperature_Prediction',
'Solvent_Prediction'
]
chembench_datasets = []
for _name in chembench_all_sets:
# _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
_hint = f'There is a single choice question about chemistry. Answer the question by replying A, B, C or D.'
chembench_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
),
dict(role='BOT', prompt='{target}\n')
]),
),
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=
f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
),
],
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=GenInferencer),
)
chembench_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_capital_postprocess))
chembench_datasets.append(
dict(
abbr=f'ChemBench_{_name}',
type=ChemBenchDataset,
path='opencompass/ChemBench4K',
name=_name,
reader_cfg=chembench_reader_cfg,
infer_cfg=chembench_infer_cfg,
eval_cfg=chembench_eval_cfg,
))
del _name, _hint

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .ChemBench_llmjudge_gen_c584cf import chembench_datasets # noqa: F401, F403

View File

@ -0,0 +1,108 @@
from opencompass.datasets.math import MATHDataset
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import ChemBenchDataset
chembench_reader_cfg = dict(
input_columns=['input', 'A', 'B', 'C', 'D'],
output_column='target',
train_split='dev')
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n<Original Question End>\n\n
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
chembench_all_sets = [
'Name_Conversion',
'Property_Prediction',
'Mol2caption',
'Caption2mol',
'Product_Prediction',
'Retrosynthesis',
'Yield_Prediction',
'Temperature_Prediction',
'Solvent_Prediction'
]
_hint = f'There is a single choice question about chemistry. Answer the question by replying A, B, C or D.'
chembench_datasets = []
for _name in chembench_all_sets:
chembench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(role='HUMAN', prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: ')
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer)
)
# Evaluation configuration
chembench_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=ChemBenchDataset,
path='opencompass/ChemBench4K',
name=_name,
reader_cfg=chembench_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
chembench_datasets.append(
dict(
abbr=f'ChemBench_{_name}',
type=ChemBenchDataset,
path='opencompass/ChemBench4K',
name=_name,
reader_cfg=chembench_reader_cfg,
infer_cfg=chembench_infer_cfg,
eval_cfg=chembench_eval_cfg,
))
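The judge_cfg in the config above is left empty; below is a hedged sketch of how a judge model is commonly attached at evaluation time. The OpenAISDK settings are placeholders (model name, key, and endpoint are assumptions), not part of this commit.

from opencompass.models import OpenAISDK

# Placeholder judge-model settings (assumption): point these at a real OpenAI-compatible endpoint.
judge_cfg = dict(
    type=OpenAISDK,
    path='your-judge-model',
    key='YOUR_API_KEY',
    openai_api_base='http://localhost:8000/v1',
    max_out_len=1024,
)

for _dataset in chembench_datasets:
    _dataset['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg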

View File

@ -0,0 +1,73 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import FTSEvaluator
from opencompass.datasets import SmolInstructDataset
fts_reader_cfg = dict(
input_columns=['input'],
output_column='output',
train_split='validation')
fts_hint_dict = {
'MG': """You are an expert chemist. Given the description of a molecule, your task is to generate the potential SMILES representation of the molecule.
The input contains the description of the molecule. Your reply should contain the potential SMILES representation of the molecule wrapped in <SMILES> and </SMILES> tags. Your reply must be valid and chemically reasonable.""",
'FS': """You are an expert chemist. Given the SMILES representation of reactants and reagents, your task is to predict the potential product using your chemical reaction knowledge.
The input contains both reactants and reagents, and different reactants and reagents are separated by ".". Your reply should contain only the SMILES representation of the predicted product and no other text. Your reply must be valid and chemically reasonable.""",
'RS': """You are an expert chemist. Given the SMILES representation of the product, your task is to predict the potential reactants and reagents using your chemical reaction knowledge.
The input contains the SMILES representation of the product. Your reply should contain the SMILES representation of both reactants and reagents, and different reactants and reagents should be separated by ".". Your reply must be valid and chemically reasonable.""",
}
name_dict = {
'MG': 'molecule_generation',
'FS': 'forward_synthesis',
'RS': 'retrosynthesis'
}
fts_datasets = []
for _name in fts_hint_dict:
_hint = fts_hint_dict[_name]
fts_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
),
dict(role='BOT', prompt='{output}\n')
]),
),
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
),
],
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0]),
inferencer=dict(type=GenInferencer),
)
fts_eval_cfg = dict(
evaluator=dict(type=FTSEvaluator),
)
fts_datasets.append(
dict(
abbr=f'{_name}',
type=SmolInstructDataset,
path='osunlp/SMolInstruct',
name=name_dict[_name],
reader_cfg=fts_reader_cfg,
infer_cfg=fts_infer_cfg,
eval_cfg=fts_eval_cfg,
))
del _name, _hint

View File

@ -0,0 +1,10 @@
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.SmolInstruct.smolinstruct_nc_gen_c84c18 import nc_datasets
from opencompass.configs.datasets.SmolInstruct.smolinstruct_pp_acc_gen_8607a3 import pp_acc_datasets
from opencompass.configs.datasets.SmolInstruct.smolinstruct_rmse_gen_0fcc6b import pp_rmse_datasets
from opencompass.configs.datasets.SmolInstruct.smolinstruct_fts_gen_5774b5 import fts_datasets
from opencompass.configs.datasets.SmolInstruct.smolinstruct_meteor_gen_065150 import meteor_datasets
smolinstruct_datasets = nc_datasets + pp_rmse_datasets + pp_acc_datasets + meteor_datasets + fts_datasets
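A minimal sketch of how the combined SmolInstruct datasets can be pulled into an evaluation config; the model config imported below is a placeholder and should be swapped for one that exists in your OpenCompass installation.

from mmengine.config import read_base

with read_base():
    # Combined SmolInstruct subsets defined in this file
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_gen import smolinstruct_datasets
    # Placeholder model config (assumption); substitute any available model config
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import models

datasets = smolinstruct_datasets

The resulting config can then be launched with the usual OpenCompass runner.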

View File

@ -0,0 +1,67 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import MeteorEvaluator
from opencompass.datasets import SmolInstructDataset
meteor_reader_cfg = dict(
input_columns=['input'],
output_column='output',
train_split='validation')
meteor_hint_dict = {
'MC': """You are an expert chemist. Given the SMILES representation of a molecule, your task is to describe the molecule in natural language.
The input contains the SMILES representation of the molecule. Your reply should contain a natural language description of the molecule. Your reply must be valid and chemically reasonable.""",
}
name_dict = {
'MC': 'molecule_captioning',
}
meteor_datasets = []
for _name in meteor_hint_dict:
_hint = meteor_hint_dict[_name]
meteor_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
),
dict(role='BOT', prompt='{output}\n')
]),
),
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
),
],
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0]),
inferencer=dict(type=GenInferencer),
)
meteor_eval_cfg = dict(
evaluator=dict(type=MeteorEvaluator),
)
meteor_datasets.append(
dict(
abbr=f'{_name}',
type=SmolInstructDataset,
path='osunlp/SMolInstruct',
name=name_dict[_name],
reader_cfg=meteor_reader_cfg,
infer_cfg=meteor_infer_cfg,
eval_cfg=meteor_eval_cfg,
))
del _name, _hint

View File

@ -0,0 +1,93 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import NCExactMatchEvaluator, NCElementMatchEvaluator
from opencompass.datasets import SmolInstructDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess
nc_reader_cfg = dict(
input_columns=['input'],
output_column='output',
train_split='validation')
nc_hint_dict = {
'I2F': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the molecular formula of the compound.
The input contains the IUPAC representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
'I2S': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the SMILES representation of the compound.
The input contains the IUPAC representation of the compound. Your reply should contain only the SMILES representation of the compound wrapped in <SMILES> and </SMILES> tags and no other text. Your reply must be valid and chemically reasonable.""",
'S2F': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the molecular formula of the compound.
The input contains the SMILES representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
'S2I': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the IUPAC representation of the compound.
The input contains the SMILES representation of the compound. Your reply should contain only the IUPAC representation of the compound wrapped in <IUPAC> and </IUPAC> tags and no other text. Your reply must be valid and chemically reasonable.""",
}
name_dict = {
'I2F': 'name_conversion-i2f',
'I2S': 'name_conversion-i2s',
'S2F': 'name_conversion-s2f',
'S2I': 'name_conversion-s2i',
}
nc_datasets = []
for _name in nc_hint_dict:
_hint = nc_hint_dict[_name]
nc_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
),
dict(role='BOT', prompt='{output}\n')
]),
),
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
),
],
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0]),
inferencer=dict(type=GenInferencer),
)
# nc_infer_cfg = dict(
# prompt_template=dict(
# type=PromptTemplate,
# template=dict(
# round=[
# dict(role='HUMAN', prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '),
# ],
# ),
# ),
# retriever=dict(type=ZeroRetriever),
# inferencer=dict(type=GenInferencer),
# )
if _name in ['I2F', 'S2F']:
nc_eval_cfg = dict(
evaluator=dict(type=NCElementMatchEvaluator),
)
else:
nc_eval_cfg = dict(
evaluator=dict(type=NCExactMatchEvaluator),
)
nc_datasets.append(
dict(
abbr=f'NC-{_name}',
type=SmolInstructDataset,
path='osunlp/SMolInstruct',
name=name_dict[_name],
reader_cfg=nc_reader_cfg,
infer_cfg=nc_infer_cfg,
eval_cfg=nc_eval_cfg,
))
del _name, _hint

View File

@ -0,0 +1,79 @@
from opencompass.openicl import AccEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import SmolInstructDataset
from opencompass.datasets.smolinstruct import smolinstruct_acc_postprocess
pp_acc_reader_cfg = dict(
input_columns=['input'],
output_column='output',
train_split='validation')
pp_acc_hint_dict = {
'BBBP': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether blood-brain barrier permeability (BBBP) is a property of the compound.
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
'ClinTox': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound is toxic.
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
'HIV': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound serve as an inhibitor of HIV replication.
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
'SIDER': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound has any side effects.
The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
}
name_dict = {
'BBBP': 'property_prediction-bbbp',
'ClinTox': 'property_prediction-clintox',
'HIV': 'property_prediction-hiv',
'SIDER': 'property_prediction-sider',
}
pp_acc_datasets = []
for _name in pp_acc_hint_dict:
_hint = pp_acc_hint_dict[_name]
pp_acc_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
),
dict(role='BOT', prompt='{output}\n')
]),
),
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
),
],
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0]),
inferencer=dict(type=GenInferencer),
)
pp_acc_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=smolinstruct_acc_postprocess)
)
pp_acc_datasets.append(
dict(
abbr=f'PP-{_name}',
type=SmolInstructDataset,
path='osunlp/SMolInstruct',
name=name_dict[_name],
reader_cfg=pp_acc_reader_cfg,
infer_cfg=pp_acc_infer_cfg,
eval_cfg=pp_acc_eval_cfg,
))
del _name, _hint

View File

@ -0,0 +1,70 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.smolinstruct import RMSEEvaluator
from opencompass.datasets import SmolInstructDataset
pp_rmse_reader_cfg = dict(
input_columns=['input'],
output_column='output',
train_split='validation')
pp_rmse_hint_dict = {
'ESOL': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the log solubility of the compound.
The input contains the SMILES representation of the compound. Your reply should contain the log solubility of the compound wrapped in <NUMBER> and </NUMBER> tags. Your reply must be valid and chemically reasonable.""",
'Lipo': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the octanol/water partition coefficient of the compound.
The input contains the SMILES representation of the compound. Your reply should contain the octanol/water partition coefficient of the compound wrapped in <NUMBER> and </NUMBER> tags. Your reply must be valid and chemically reasonable."""
}
name_dict = {
'ESOL': 'property_prediction-esol',
'Lipo': 'property_prediction-lipo'
}
pp_rmse_datasets = []
for _name in pp_rmse_hint_dict:
_hint = pp_rmse_hint_dict[_name]
pp_rmse_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
),
dict(role='BOT', prompt='{output}\n')
]),
),
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=f'{_hint}\nQuestion: {{input}}\nAnswer: '
),
],
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=[0]),
inferencer=dict(type=GenInferencer),
)
pp_rmse_eval_cfg = dict(
evaluator=dict(type=RMSEEvaluator),
)
pp_rmse_datasets.append(
dict(
abbr=f'PP-{_name}',
type=SmolInstructDataset,
path='osunlp/SMolInstruct',
name=name_dict[_name],
reader_cfg=pp_rmse_reader_cfg,
infer_cfg=pp_rmse_infer_cfg,
eval_cfg=pp_rmse_eval_cfg,
))
del _name, _hint

View File

@ -0,0 +1,97 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets.livemathbench import LiveMathBenchDataset
from opencompass.datasets import generic_llmjudge_postprocess
livemathbench_reader_cfg = dict(
input_columns=['question'], output_column='answer'
)
# Inference configuration
livemathbench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{question}\n',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# Template for the LLM judge
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
splits = ['hard']
livemathbench_datasets = []
for split in splits:
# Dataset configuration
livemathbench_datasets.append(
dict(
type=LiveMathBenchDataset,
abbr=f'livemathbench_{split}',
path='opencompass/LiveMathBench',
dataset_splits = [split],
dataset_languages= ['cn', 'en'],
reader_cfg=livemathbench_reader_cfg,
infer_cfg=livemathbench_infer_cfg,
eval_cfg=dict(
# Evaluation configuration using LLM as judge
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=LiveMathBenchDataset,
path='opencompass/LiveMathBench202412',
dataset_splits = [split],
reader_cfg=livemathbench_reader_cfg,
),
judge_cfg={},
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
),
)
)

View File

@ -127,6 +127,7 @@ from .scibench import ScibenchDataset, scibench_postprocess # noqa: F401, F403
from .scicode import * # noqa: F401, F403
from .simpleqa import * # noqa: F401, F403
from .siqa import * # noqa: F401, F403
from .smolinstruct import * # noqa: F401, F403
from .squad20 import SQuAD20Dataset, SQuAD20Evaluator # noqa: F401, F403
from .storycloze import * # noqa: F401, F403
from .strategyqa import * # noqa: F401, F403

View File

@ -4,6 +4,7 @@ import os.path as osp
from datasets import Dataset, DatasetDict
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path
from .base import BaseDataset
@ -14,6 +15,7 @@ class ChemBenchDataset(BaseDataset):
@staticmethod
def load(path: str, name: str):
dataset = DatasetDict()
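# Resolve 'opencompass/ChemBench4K'-style paths to a local directory via the dataset mapping (with auto-download support).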
path = get_data_path(path)
for split in ['dev', 'test']:
raw_data = []
filename = osp.join(path, split, f'{name}_benchmark.json')

View File

@ -0,0 +1,426 @@
# flake8: noqa: W605
import re
from collections import defaultdict
import numpy as np
from datasets import Dataset, DatasetDict, load_dataset
from opencompass.openicl.icl_evaluator.icl_base_evaluator import BaseEvaluator
from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
TEXT_POSTPROCESSORS)
from .base import BaseDataset
@LOAD_DATASET.register_module()
class SmolInstructDataset(BaseDataset):
@staticmethod
def load(path: str, name: str):
dataset = DatasetDict()
raw_dataset = load_dataset(path)
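# Each SMolInstruct split mixes all tasks; keep only rows whose 'task' field matches the requested subset name.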
for split in ['validation', 'test']:
raw_data = []
for data in raw_dataset[split]:
if data['task'] == name:
raw_data.append(data)
dataset[split] = Dataset.from_list(raw_data)
return dataset
def extract_chemical_data(text):
pattern = re.compile(r'<(MOLFORMULA|SMILES|IUPAC)>(.*?)</\1>', re.DOTALL)
matches = pattern.findall(text)
if not matches:
return []
return [match[1].strip() for match in matches]
def parse_molecule(molecular_formula):
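# Expand nested brackets and an optional trailing charge into an element -> count dict, e.g. 'C6H12O6' -> {'C': 6, 'H': 12, 'O': 6}.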
valid = re.match('([A-Za-z]\d*)+([\+\-]\d*)*$', molecular_formula)
if valid is None:
raise ValueError("Molecular formula \"%s\" is not valid." %
molecular_formula)
stack = [defaultdict(int)]
def _parse_formula(formula, _stack):
# Set remainder equal to 'None'
r = None
# Regular expression matching for each of the three cases:
atom = re.match(r'([A-Z][a-z]?)(\d+)?', formula)
opening = re.match(r'[\(\[\{]', formula)
closing = re.match(r'[\)\]\}](\d+)?', formula)
# If atom is identified:
if atom:
r = formula[len(atom.group()):]
_stack[-1][atom.group(1)] += int(atom.group(2) or 1)
# If opening brackets encountered:
elif opening:
r = formula[len(
opening.group()
):] # this sets the remainder equal to everything after the opening brackets
_stack.append(defaultdict(int))
# If closing brackets encountered:
elif closing:
r = formula[len(
closing.group()
):] # this sets the remainder equal to everything after the closing brackets
for k, v in _stack.pop().items():
_stack[-1][k] += v * int(
closing.group(1)
or 1) # v times amount of molecule k, depending on nesting
# If anything remains, process remainders recursively as nested formulas:
if r:
_parse_formula(r, _stack)
return dict(_stack[0])
result = _parse_formula(molecular_formula, stack)
charge = re.search('[\+\-]\d*', molecular_formula)
if charge is not None:
charge_str = charge.group()
charge_type = charge_str[0]
if len(charge_str) == 1:
charge_num = 1
else:
charge_num = int(charge_str[1:])
result[charge_type] = charge_num
return result
def calculate_single_element_match_for_list(predictions, references):
# Extract the tagged chemical formulas from predictions and references
predictions = [
extract_chemical_data(prediction) for prediction in predictions
]
references = [extract_chemical_data(reference) for reference in references]
ele_match_labels = []
ele_invalid_labels = []
details = []
for pred_formula, gold_formula in zip(predictions, references):
gold_formula = gold_formula[0]
if pred_formula:
pred_formula = pred_formula[0]
detail = {'pred': [pred_formula], 'answer': gold_formula}
if not pred_formula or not gold_formula:
ele_invalid_labels.append(False)
ele_match_labels.append(False)
detail['score'] = [False]
details.append(detail)
continue
try:
pred_ele = parse_molecule(pred_formula)
except KeyboardInterrupt:
raise
except:
# print(pred_formula)
# print('=====')
ele_invalid_labels.append(True)
ele_match_labels.append(False)
detail['score'] = [False]
details.append(detail)
continue
ele_invalid_labels.append(False)
ele_match = False
gold_ele = parse_molecule(gold_formula)
if pred_ele == gold_ele:
ele_match = True
ele_match_labels.append(ele_match)
detail['score'] = [ele_match]
details.append(detail)
score = sum(ele_match_labels) / len(predictions) * 100
valid_score = 100 - sum(ele_invalid_labels) / len(predictions) * 100
return {'score': score, 'valid_score': valid_score, 'details': details}
def calculate_single_element_match(predictions, references):
# Extract the tagged chemical formulas from predictions and references
predictions = [
extract_chemical_data(prediction) for prediction in predictions
]
references = [extract_chemical_data(reference) for reference in references]
ele_match_labels = []
ele_invalid_labels = []
details = []
for pred_formula, gold_formula in zip(predictions, references):
gold_formula = gold_formula[0]
if pred_formula:
pred_formula = pred_formula[0]
detail = {'pred': pred_formula, 'answer': gold_formula}
if not pred_formula or not gold_formula:
ele_invalid_labels.append(False)
ele_match_labels.append(False)
detail['score'] = False
details.append(detail)
continue
try:
pred_ele = parse_molecule(pred_formula)
except KeyboardInterrupt:
raise
except:
# print(pred_formula)
# print('=====')
ele_invalid_labels.append(True)
ele_match_labels.append(False)
detail['score'] = False
details.append(detail)
continue
ele_invalid_labels.append(False)
ele_match = False
gold_ele = parse_molecule(gold_formula)
if pred_ele == gold_ele:
ele_match = True
ele_match_labels.append(ele_match)
detail['score'] = ele_match
details.append(detail)
score = sum(ele_match_labels) / len(predictions) * 100
valid_score = 100 - sum(ele_invalid_labels) / len(predictions) * 100
return {'score': score, 'valid_score': valid_score, 'details': details}
@ICL_EVALUATORS.register_module()
class NCElementMatchEvaluator(BaseEvaluator):
"""Element match evaluator for name conversion."""
def __init__(self) -> None:
super().__init__()
def score(self, predictions, references):
print('len(predictions):', len(predictions))
print('len(references):', len(references))
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
# Top-k predictions need to be split and each candidate scored separately
if isinstance(predictions[0], str):
return calculate_single_element_match(predictions, references)
else:
num_k = len(predictions[0])
scores = []
for i in range(num_k):
pred = [prediction[i] for prediction in predictions]
ref = references
score = calculate_single_element_match_for_list(pred, ref)
scores.append(score)
# Merge the per-k results back into one record per instance
final_details = scores[0]['details']
final_scores = [scores[0]['score']]
final_valid_scores = [scores[0]['valid_score']]
for _k in scores[1:]:
for i, _d in enumerate(_k['details']):
# print(_d)
final_details[i]['pred'].extend(_d['pred'])
final_details[i]['score'].extend(_d['score'])
final_scores.append(_k['score'])
final_valid_scores.append(_k['valid_score'])
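# Best-of-k: an instance counts as correct if any of the k candidates matches the gold formula.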
avg_score = []
for _d in final_details:
if True in _d['score']:
avg_score.append(1)
else:
avg_score.append(0)
max_score = sum(avg_score) / len(avg_score) * 100
return {
'score': max_score,
'all_score': final_scores,
'valid_score': final_valid_scores,
'details': final_details,
}
@ICL_EVALUATORS.register_module()
class NCExactMatchEvaluator(BaseEvaluator):
"""Exact match evaluator for name conversion."""
def __init__(self) -> None:
super().__init__()
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
predictions = [
extract_chemical_data(prediction) for prediction in predictions
]
references = [
extract_chemical_data(reference) for reference in references
]
cnt = 0
valid_cnt = 0
details = []
for pred, ans in zip(predictions, references):
ans = ans[0]
if pred:
pred = pred[0]
valid_cnt += 1
detail = {'pred': pred, 'answer': ans}
if pred and pred.strip() == ans.strip():
cnt += 1
detail['correct'] = True
else:
detail['correct'] = False
details.append(detail)
score = cnt / len(predictions) * 100
valid_score = valid_cnt / len(predictions) * 100
return {'score': score, 'valid_score': valid_score, 'details': details}
def extract_number(text):
pattern = re.compile(r'<NUMBER>\s*(-?\d*\.?\d+)\s*</NUMBER>')
matches = pattern.findall(text)
return [float(match) for match in matches]
@ICL_EVALUATORS.register_module()
class RMSEEvaluator(BaseEvaluator):
"""Exact match evaluator for name conversion."""
def __init__(self) -> None:
super().__init__()
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
avg_score = 0
details = []
for prediction, reference in zip(predictions, references):
pred = extract_number(prediction)
ans = extract_number(reference)
if not pred:
pred = 0
else:
pred = pred[0]
try:
ans = ans[0]
except:
raise ValueError(f'ans: {reference}')
detail = {'pred': pred, 'answer': ans}
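# With a single scalar per sample this reduces to the absolute error; per-sample values are averaged below.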
rmse_score = np.sqrt(np.mean((np.array(pred) - np.array(ans))**2))
detail['score'] = rmse_score
avg_score += rmse_score
details.append(detail)
score = avg_score / len(predictions)
return {'score': score, 'details': details}
@ICL_EVALUATORS.register_module()
class FTSEvaluator(BaseEvaluator):
"""Exact match evaluator for name conversion."""
def __init__(self) -> None:
super().__init__()
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
predictions = [
extract_chemical_data(prediction) for prediction in predictions
]
references = [
extract_chemical_data(reference) for reference in references
]
avg_score = 0
valid_cnt = 0
details = []
for pred, ans in zip(predictions, references):
ans = ans[0]
if not pred:
detail = {'pred': '', 'answer': ans, 'score': 0}
details.append(detail)
continue
pred = pred[0]
detail = {'pred': pred, 'answer': ans}
# Convert the SMILES strings to RDKit molecule objects
from rdkit import Chem
mol1 = Chem.MolFromSmiles(pred)
mol2 = Chem.MolFromSmiles(ans)
if mol1 is None or mol2 is None:
detail['score'] = 0
details.append(detail)
continue
valid_cnt += 1
# Generate Morgan fingerprints (equivalent to ECFP4)
# fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, radius=2, nBits=2048)
# fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, radius=2, nBits=2048)
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
generator = GetMorganGenerator(radius=2, fpSize=2048)
fp1 = generator.GetFingerprint(mol1)
fp2 = generator.GetFingerprint(mol2)
from rdkit.Chem import DataStructs
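# Tanimoto similarity of the 2048-bit fingerprints, reported on a 0-100 scale to match the other evaluators.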
similarity = DataStructs.TanimotoSimilarity(fp1, fp2) * 100
detail['score'] = similarity
avg_score += similarity
details.append(detail)
score = avg_score / len(predictions)
valid_score = valid_cnt / len(predictions) * 100
return {'score': score, 'valid_score': valid_score, 'details': details}
@ICL_EVALUATORS.register_module()
class MeteorEvaluator(BaseEvaluator):
"""Exact match evaluator for name conversion."""
def __init__(self) -> None:
super().__init__()
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
# meteor_score comes from NLTK; imported lazily (mirroring the in-function rdkit imports) so nltk is only needed when METEOR is scored
from nltk.translate.meteor_score import meteor_score
avg_score = 0
details = []
for pred, ans in zip(predictions, references):
score = meteor_score([ans.split()], pred.split())
avg_score += score
detail = {'pred': pred, 'answer': ans, 'score': score}
details.append(detail)
score = avg_score / len(predictions)
return {'score': score, 'details': details}
@TEXT_POSTPROCESSORS.register_module('smolinstruct-acc')
def smolinstruct_acc_postprocess(text: str) -> str:
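# Normalize a free-form Yes/No reply into the tagged form used by the gold outputs.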
if 'yes' in text.lower():
return '<BOOLEAN> Yes </BOOLEAN>'
elif 'no' in text.lower():
return '<BOOLEAN> No </BOOLEAN>'

View File

@ -435,6 +435,11 @@ DATASETS_MAPPING = {
"hf_id": "",
"local": "./data/PHYSICS-textonly",
},
"opencompass/ChemBench4K": {
"ms_id": "",
"hf_id": "",
"local": "./data/ChemBench4K",
},
}
@ -777,5 +782,11 @@ DATASETS_URL = {
"url":
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bigcodebench.zip",
"md5": "270f399f4142b74f47ecff116cc3b21d"
},
"ChemBench4K": {
"url":
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ChemBench4K.zip",
"md5": "fc23fd21b2566a5dbbebfa4601d7779c"
}
}

View File

@ -19,5 +19,7 @@ math-verify[antlr4_11_0]
pyext
# Law Bench
pypinyin
# SmolInstruct
rdkit
# RULER
wonderwords