From d572761cef1e376e443365fd1562353a04d6b5a0 Mon Sep 17 00:00:00 2001
From: Yu Sun
Date: Thu, 29 May 2025 14:09:08 +0800
Subject: [PATCH] [Dataset] Add Smolinstruct configs (#2127)

* 0-shot Smolinstruct

Add 0-shot evaluation and postprocess functions for Smolinstruct

* fix acc postprocessor

* update 0-shot acc postprocessor

* rename 0-shot
---
Review notes (ignored by git-am, not part of the commit message):
- The reviewed copy of this patch had its line breaks collapsed and its
  angle-bracket literals missing; indentation and the tag literals below
  were reconstructed and should be confirmed against the repository.
- smolinstruct_acc_0shot_postprocess: the split separator was an empty
  string (str.split raises ValueError on it); '[Th]he' corrected to '[Tt]he'.
- fts/meteor/nc/rmse configs now delete `_hint` as the pp_acc config does.

 .../smolinstruct_0shot_instruct_gen.py        | 10 +++
 .../smolinstruct_fts_0shot_instruct.py        | 55 ++++++++++++++
 .../smolinstruct_meteor_0shot_instruct.py     | 49 ++++++++++++
 .../smolinstruct_nc_0shot_instruct.py         | 63 ++++++++++++++++
 .../smolinstruct_pp_acc_0_shot_instruct.py    | 61 +++++++++++++++
 .../smolinstruct_rmse_0shot_instruct.py       | 52 +++++++++++++
 opencompass/datasets/smolinstruct.py          | 84 ++++++++++++++++---
 7 files changed, 362 insertions(+), 12 deletions(-)
 create mode 100644 opencompass/configs/datasets/SmolInstruct/smolinstruct_0shot_instruct_gen.py
 create mode 100644 opencompass/configs/datasets/SmolInstruct/smolinstruct_fts_0shot_instruct.py
 create mode 100644 opencompass/configs/datasets/SmolInstruct/smolinstruct_meteor_0shot_instruct.py
 create mode 100644 opencompass/configs/datasets/SmolInstruct/smolinstruct_nc_0shot_instruct.py
 create mode 100644 opencompass/configs/datasets/SmolInstruct/smolinstruct_pp_acc_0_shot_instruct.py
 create mode 100644 opencompass/configs/datasets/SmolInstruct/smolinstruct_rmse_0shot_instruct.py

diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_0shot_instruct_gen.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_0shot_instruct_gen.py
new file mode 100644
index 00000000..f0ac4780
--- /dev/null
+++ b/opencompass/configs/datasets/SmolInstruct/smolinstruct_0shot_instruct_gen.py
@@ -0,0 +1,10 @@
+from mmengine.config import read_base
+
+with read_base():
+    from opencompass.configs.datasets.SmolInstruct.smolinstruct_nc_0shot_instruct import nc_0shot_instruct_datasets
+    from opencompass.configs.datasets.SmolInstruct.smolinstruct_pp_acc_0_shot_instruct import pp_acc_datasets_0shot_instruct
+    from opencompass.configs.datasets.SmolInstruct.smolinstruct_rmse_0shot_instruct import pp_rmse_0shot_instruct_datasets
+    from opencompass.configs.datasets.SmolInstruct.smolinstruct_fts_0shot_instruct import fts_0shot_instruct_datasets
+    from opencompass.configs.datasets.SmolInstruct.smolinstruct_meteor_0shot_instruct import meteor_0shot_instruct_datasets
+
+smolinstruct_datasets_0shot_instruct = nc_0shot_instruct_datasets + pp_rmse_0shot_instruct_datasets + pp_acc_datasets_0shot_instruct + meteor_0shot_instruct_datasets + fts_0shot_instruct_datasets
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_fts_0shot_instruct.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_fts_0shot_instruct.py
new file mode 100644
index 00000000..c07c4f67
--- /dev/null
+++ b/opencompass/configs/datasets/SmolInstruct/smolinstruct_fts_0shot_instruct.py
@@ -0,0 +1,55 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.smolinstruct import FTSEvaluator
+from opencompass.datasets import SmolInstructDataset
+
+fts_0shot_reader_cfg = dict(
+    input_columns=['input'],
+    output_column='output',
+    train_split='validation')
+
+fts_hint_dict = {
+    'MG': """You are an expert chemist. Given the description of a molecule, your task is to generate the potential SMILES representation of the molecule.
+    The input contains the description of the molecule. Your reply should contain the potential SMILES representation of the molecule wrapped in <SMILES> and </SMILES> tags. Your reply must be valid and chemically reasonable.""",
+    'FS': """You are an expert chemist. Given the SMILES representation of reactants and reagents, your task is to predict the potential product using your chemical reaction knowledge.
+    The input contains both reactants and reagents, and different reactants and reagents are separated by ".". Your reply should contain the SMILES representation of the predicted product wrapped in <SMILES> and </SMILES> tags. Your reply must be valid and chemically reasonable.""",
+    'RS': """You are an expert chemist. Given the SMILES representation of the product, your task is to predict the potential reactants and reagents using your chemical reaction knowledge.
+    The input contains the SMILES representation of the product. Your reply should contain the SMILES representation of both reactants and reagents, and all reactants and reagents should be enclosed **together** within a single pair of <SMILES> and </SMILES> tags, separated by ".". Your reply must be valid and chemically reasonable.""",
+}
+
+name_dict = {
+    'MG': 'molecule_generation',
+    'FS': 'forward_synthesis',
+    'RS': 'retrosynthesis'
+}
+
+fts_0shot_instruct_datasets = []
+for _name in name_dict:
+    _hint = fts_hint_dict[_name]
+    fts_0shot_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
+            # template=f'[INST] {{input}} [/INST]',
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    fts_0shot_eval_cfg = dict(
+        evaluator=dict(type=FTSEvaluator),
+    )
+
+    fts_0shot_instruct_datasets.append(
+        dict(
+            abbr=f'{_name}-0shot-instruct',
+            type=SmolInstructDataset,
+            path='osunlp/SMolInstruct',
+            name=name_dict[_name],
+            reader_cfg=fts_0shot_reader_cfg,
+            infer_cfg=fts_0shot_infer_cfg,
+            eval_cfg=fts_0shot_eval_cfg,
+        ))
+
+del _name, _hint
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_meteor_0shot_instruct.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_meteor_0shot_instruct.py
new file mode 100644
index 00000000..c07d98cb
--- /dev/null
+++ b/opencompass/configs/datasets/SmolInstruct/smolinstruct_meteor_0shot_instruct.py
@@ -0,0 +1,49 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.smolinstruct import MeteorEvaluator
+from opencompass.datasets import SmolInstructDataset
+
+meteor_0shot_reader_cfg = dict(
+    input_columns=['input'],
+    output_column='output',
+    train_split='validation')
+
+meteor_hint_dict = {
+    'MC': """You are an expert chemist. Given the SMILES representation of a molecule, your task is to describe the molecule in natural language.
+    The input contains the SMILES representation of the molecule. Your reply should contain a natural language description of the molecule. Your reply must be valid and chemically reasonable.""",
+}
+
+name_dict = {
+    'MC': 'molecule_captioning',
+}
+
+meteor_0shot_instruct_datasets = []
+for _name in name_dict:
+    _hint = meteor_hint_dict[_name]
+    meteor_0shot_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
+            # template=f'[INST] {{input}} [/INST]',
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    meteor_0shot_eval_cfg = dict(
+        evaluator=dict(type=MeteorEvaluator),
+    )
+
+    meteor_0shot_instruct_datasets.append(
+        dict(
+            abbr=f'{_name}-0shot-instruct',
+            type=SmolInstructDataset,
+            path='osunlp/SMolInstruct',
+            name=name_dict[_name],
+            reader_cfg=meteor_0shot_reader_cfg,
+            infer_cfg=meteor_0shot_infer_cfg,
+            eval_cfg=meteor_0shot_eval_cfg,
+        ))
+
+del _name, _hint
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_nc_0shot_instruct.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_nc_0shot_instruct.py
new file mode 100644
index 00000000..ad80efc9
--- /dev/null
+++ b/opencompass/configs/datasets/SmolInstruct/smolinstruct_nc_0shot_instruct.py
@@ -0,0 +1,63 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.smolinstruct import NCExactMatchEvaluator, NCElementMatchEvaluator
+from opencompass.datasets import SmolInstructDataset
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+nc_0shot_reader_cfg = dict(
+    input_columns=['input'],
+    output_column='output',
+    train_split='validation')
+
+nc_hint_dict = {
+    'I2F': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the molecular formula of the compound.
+    The input contains the IUPAC representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
+    'I2S': """You are an expert chemist. Given the IUPAC representation of compounds, your task is to predict the SMILES representation of the compound.
+    The input contains the IUPAC representation of the compound. Your reply should contain only the SMILES representation of the compound wrapped in <SMILES> and </SMILES> tags and no other text. Your reply must be valid and chemically reasonable.""",
+    'S2F': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the molecular formula of the compound.
+    The input contains the SMILES representation of the compound. Your reply should contain only the molecular formula of the compound wrapped in <MOLFORMULA> and </MOLFORMULA> tags and no other text. Your reply must be valid and chemically reasonable.""",
+    'S2I': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the IUPAC representation of the compound.
+    The input contains the SMILES representation of the compound. Your reply should contain only the IUPAC representation of the compound wrapped in <IUPAC> and </IUPAC> tags and no other text. Your reply must be valid and chemically reasonable.""",
+}
+
+name_dict = {
+    'I2F': 'name_conversion-i2f',
+    'I2S': 'name_conversion-i2s',
+    'S2F': 'name_conversion-s2f',
+    'S2I': 'name_conversion-s2i',
+}
+
+nc_0shot_instruct_datasets = []
+for _name in name_dict:
+    _hint = nc_hint_dict[_name]
+    nc_0shot_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
+            # template=f'[INST] {{input}} [/INST]',
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+    if _name in ['I2F', 'S2F']:
+        nc_0shot_eval_cfg = dict(
+            evaluator=dict(type=NCElementMatchEvaluator),
+        )
+    else:
+        nc_0shot_eval_cfg = dict(
+            evaluator=dict(type=NCExactMatchEvaluator),
+        )
+
+    nc_0shot_instruct_datasets.append(
+        dict(
+            abbr=f'NC-{_name}-0shot-instruct',
+            type=SmolInstructDataset,
+            path='osunlp/SMolInstruct',
+            name=name_dict[_name],
+            reader_cfg=nc_0shot_reader_cfg,
+            infer_cfg=nc_0shot_infer_cfg,
+            eval_cfg=nc_0shot_eval_cfg,
+        ))
+
+del _name, _hint
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_pp_acc_0_shot_instruct.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_pp_acc_0_shot_instruct.py
new file mode 100644
index 00000000..4747117a
--- /dev/null
+++ b/opencompass/configs/datasets/SmolInstruct/smolinstruct_pp_acc_0_shot_instruct.py
@@ -0,0 +1,61 @@
+from opencompass.openicl import AccEvaluator
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import SmolInstructDataset
+from opencompass.datasets.smolinstruct import smolinstruct_acc_0shot_postprocess
+
+pp_acc_reader_cfg = dict(
+    input_columns=['input'],
+    output_column='output',
+    train_split='validation')
+
+pp_acc_hint_dict = {
+    'BBBP': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether blood-brain barrier permeability (BBBP) is a property of the compound.
+    The input contains the compound. Your reply should only contain Yes or No. Your reply must be valid and chemically reasonable.""",
+    'ClinTox': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound is toxic.
+    The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
+    'HIV': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound serve as an inhibitor of HIV replication.
+    The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
+    'SIDER': """You are an expert chemist. Given the smiles representation of the compound, your task is to predict whether the compound has any side effects.
+    The input contains the compound. Your reply should contain only Yes or No. Your reply must be valid and chemically reasonable.""",
+}
+
+name_dict = {
+    'BBBP': 'property_prediction-bbbp',
+    'ClinTox': 'property_prediction-clintox',
+    'HIV': 'property_prediction-hiv',
+    'SIDER': 'property_prediction-sider',
+}
+
+pp_acc_datasets_0shot_instruct = []
+for _name in pp_acc_hint_dict:
+    _hint = pp_acc_hint_dict[_name]
+
+    pp_acc_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
+            # template=f'[INST] {{input}} [/INST]',
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    pp_acc_eval_cfg = dict(
+        evaluator=dict(type=AccEvaluator),
+        pred_postprocessor=dict(type=smolinstruct_acc_0shot_postprocess)
+    )
+
+    pp_acc_datasets_0shot_instruct.append(
+        dict(
+            abbr=f'PP-{_name}-0shot-instruct',
+            type=SmolInstructDataset,
+            path='osunlp/SMolInstruct',
+            name=name_dict[_name],
+            reader_cfg=pp_acc_reader_cfg,
+            infer_cfg=pp_acc_infer_cfg,
+            eval_cfg=pp_acc_eval_cfg,
+        ))
+
+del _name, _hint
diff --git a/opencompass/configs/datasets/SmolInstruct/smolinstruct_rmse_0shot_instruct.py b/opencompass/configs/datasets/SmolInstruct/smolinstruct_rmse_0shot_instruct.py
new file mode 100644
index 00000000..b0734ca1
--- /dev/null
+++ b/opencompass/configs/datasets/SmolInstruct/smolinstruct_rmse_0shot_instruct.py
@@ -0,0 +1,52 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets.smolinstruct import RMSEEvaluator
+from opencompass.datasets import SmolInstructDataset
+
+pp_rmse_0shot_reader_cfg = dict(
+    input_columns=['input'],
+    output_column='output',
+    train_split='validation')
+
+pp_rmse_hint_dict = {
+    'ESOL': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the log solubility of the compound.
+    The input contains the SMILES representation of the compound. Your reply should contain the log solubility of the compound wrapped in \\boxed{}. Your reply must be valid and chemically reasonable.""",
+    'Lipo': """You are an expert chemist. Given the SMILES representation of compounds, your task is to predict the octanol/water partition coefficient of the compound.
+    The input contains the SMILES representation of the compound. Your reply should contain the octanol/water partition coefficient of the compound wrapped in \\boxed{}. Your reply must be valid and chemically reasonable."""
+}
+
+name_dict = {
+    'ESOL': 'property_prediction-esol',
+    'Lipo': 'property_prediction-lipo'
+}
+
+pp_rmse_0shot_instruct_datasets = []
+for _name in name_dict:
+    _hint = pp_rmse_hint_dict[_name]
+    pp_rmse_0shot_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=f'{_hint}\nQuestion: {{input}}\nAnswer: ',
+            # template=f'[INST] {{input}} [/INST]',
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    pp_rmse_0shot_eval_cfg = dict(
+        evaluator=dict(type=RMSEEvaluator),
+    )
+
+    pp_rmse_0shot_instruct_datasets.append(
+        dict(
+            abbr=f'PP-{_name}-0shot-instruct',
+            type=SmolInstructDataset,
+            path='osunlp/SMolInstruct',
+            name=name_dict[_name],
+            reader_cfg=pp_rmse_0shot_reader_cfg,
+            infer_cfg=pp_rmse_0shot_infer_cfg,
+            eval_cfg=pp_rmse_0shot_eval_cfg,
+        ))
+
+del _name, _hint
diff --git a/opencompass/datasets/smolinstruct.py b/opencompass/datasets/smolinstruct.py
index e9577335..6cd850cb 100644
--- a/opencompass/datasets/smolinstruct.py
+++ b/opencompass/datasets/smolinstruct.py
@@ -20,7 +20,7 @@ class SmolInstructDataset(BaseDataset):
     @staticmethod
     def load(path: str, name: str):
         dataset = DatasetDict()
-        raw_dataset = load_dataset(path)
+        raw_dataset = load_dataset(path, trust_remote_code=True)
         for split in ['validation', 'test']:
             raw_data = []
             for data in raw_dataset[split]:
@@ -31,10 +31,21 @@ class SmolInstructDataset(BaseDataset):
 
 
 def extract_chemical_data(text):
+    other_patterns = [
+        'reactants and reagents are:\n```\n', 'reactants and reagents:\n```\n',
+        'Reactants and Reagents:**\n```\n',
+        'Reactants and Reagents SMILES:**\n```\n',
+        'Reactants and Reagents:** \n`'
+    ]
+
     pattern = re.compile(r'<(MOLFORMULA|SMILES|IUPAC)>(.*?)</\1>', re.DOTALL)
     matches = pattern.findall(text)
     if not matches:
-        return []
+        for other_pattern in other_patterns:
+            if other_pattern in text:
+                text = text.split(other_pattern)[-1].split('\n')[0]
+                break
+        return [text]
     return [match[1].strip() for match in matches]
 
 
@@ -110,9 +121,9 @@ def calculate_single_element_match_for_list(predictions, references):
     ele_invalid_labels = []
     details = []
     for pred_formula, gold_formula in zip(predictions, references):
-        gold_formula = gold_formula[0]
+        gold_formula = gold_formula[-1]
         if pred_formula:
-            pred_formula = pred_formula[0]
+            pred_formula = pred_formula[-1]
         detail = {'pred': [pred_formula], 'answer': gold_formula}
         if not pred_formula or not pred_formula:
             ele_invalid_labels.append(False)
@@ -158,9 +169,9 @@ def calculate_single_element_match(predictions, references):
     ele_invalid_labels = []
     details = []
     for pred_formula, gold_formula in zip(predictions, references):
-        gold_formula = gold_formula[0]
+        gold_formula = gold_formula[-1]
         if pred_formula:
-            pred_formula = pred_formula[0]
+            pred_formula = pred_formula[-1]
         detail = {'pred': pred_formula, 'answer': gold_formula}
         if not pred_formula or not pred_formula:
             ele_invalid_labels.append(False)
@@ -272,9 +283,9 @@ class NCExactMatchEvaluator(BaseEvaluator):
         valid_cnt = 0
         details = []
         for pred, ans in zip(predictions, references):
-            ans = ans[0]
+            ans = ans[-1]
             if pred:
-                pred = pred[0]
+                pred = pred[-1]
                 valid_cnt += 1
             detail = {'pred': pred, 'answer': ans}
             if pred and pred.strip() == ans.strip():
@@ -291,7 +302,8 @@ class NCExactMatchEvaluator(BaseEvaluator):
 
 
 def extract_number(text):
-    pattern = re.compile(r'\s*(-?\d*\.?\d+)\s*')
+    pattern = re.compile(
+        r'(?:\s*|\\boxed\{)\s*(-?\d*\.?\d+)\s*(?:|\})')
     matches = pattern.findall(text)
     return [float(match) for match in matches]
 
@@ -359,12 +371,12 @@ class FTSEvaluator(BaseEvaluator):
         valid_cnt = 0
         details = []
         for pred, ans in zip(predictions, references):
-            ans = ans[0]
+            ans = ans[-1]
             if not pred:
                 detail = {'pred': '', 'answer': ans, 'score': 0}
                 details.append(detail)
                 continue
-            pred = pred[0]
+            pred = pred[-1]
             detail = {'pred': pred, 'answer': ans}
-            # 将 SMILES 转换为 RDKit 分子对象
+            # Convert the SMILES strings to RDKit molecule objects
             from rdkit import Chem
@@ -433,3 +445,51 @@ def smolinstruct_acc_postprocess(text: str) -> str:
         return ' Yes '
     elif 'no' in text.lower():
         return ' No '
+
+
+@TEXT_POSTPROCESSORS.register_module('smolinstruct-acc-0shot')
+def smolinstruct_acc_0shot_postprocess(text: str) -> str:
+    """Map a free-form 0-shot reply to ' Yes ', ' No ' or ''.
+
+    Checks, in order: an exact yes/no reply, a list of answer-format
+    regexes, and finally a yes/no prefix or suffix.
+    """
+    # Keep only what follows the last closing reasoning tag.
+    # NOTE(review): the tag literal is empty in the reviewed text, which
+    # makes str.split raise ValueError; '</think>' is assumed - confirm.
+    if '</think>' in text:
+        text = text.split('</think>')[-1].strip()
+
+    # Check for exact "yes" or "no" responses
+    if text.strip().lower() == 'yes':
+        return ' Yes '
+    elif text.strip().lower() == 'no':
+        return ' No '
+
+    # Define regex patterns to match various formats of "yes" or "no"
+    patterns = [
+        r'\\boxed\{\s*(yes|no)\s*\}',
+        r'[Tt]he\s+answer\s+is\s*[\.:\'"“‘’\-]*\s*(yes|no)[\s\.,!?:;\'"”’\-]*',
+        r'[Aa]nswer:\s*(yes|no)\b', r'\*\*[Aa]nswer:\*\*\s*(yes|no)\b',
+        r'\*\*[Aa]nswer\*\*:\s*(yes|no)\b',
+        r'\s*(yes|no)\s*', r'^\s*(yes|no)[\.\?!]?\s*$'
+    ]
+    text = text.strip()
+    for pattern in patterns:
+        match = re.search(pattern, text, flags=re.IGNORECASE)
+        if match:
+            answer = match.group(1)
+            if answer.lower() == 'yes':
+                return ' Yes '
+            elif answer.lower() == 'no':
+                return ' No '
+
+    # If no patterns matched, check for simple "yes" or "no"
+    text = text.strip().lower()
+    if text.startswith('yes') or text.endswith('yes'):
+        return ' Yes '
+    elif text.startswith('no') or text.endswith('no'):
+        return ' No '
+
+    # If no patterns matched, return an empty string
+    return ''