From c0acd06b052404b3737350ef3a4524c45701634d Mon Sep 17 00:00:00 2001
From: Raymond Zhang
Date: Thu, 16 Nov 2023 17:47:57 +0800
Subject: [PATCH] [Feature] Add FinanceIQ dataset (#596)

---
 configs/datasets/FinanceIQ/FinanceIQ_gen.py   |  4 +
 .../FinanceIQ/FinanceIQ_gen_e0e6b5.py         | 79 +++++++++++++++++++
 configs/datasets/FinanceIQ/FinanceIQ_ppl.py   |  4 +
 .../FinanceIQ/FinanceIQ_ppl_42b9bd.py         | 78 ++++++++++++++++++
 opencompass/datasets/FinanceIQ.py             | 57 ++++++++++++++
 opencompass/datasets/__init__.py              |  1 +
 6 files changed, 223 insertions(+)
 create mode 100644 configs/datasets/FinanceIQ/FinanceIQ_gen.py
 create mode 100644 configs/datasets/FinanceIQ/FinanceIQ_gen_e0e6b5.py
 create mode 100644 configs/datasets/FinanceIQ/FinanceIQ_ppl.py
 create mode 100644 configs/datasets/FinanceIQ/FinanceIQ_ppl_42b9bd.py
 create mode 100644 opencompass/datasets/FinanceIQ.py

diff --git a/configs/datasets/FinanceIQ/FinanceIQ_gen.py b/configs/datasets/FinanceIQ/FinanceIQ_gen.py
new file mode 100644
--- /dev/null
+++ b/configs/datasets/FinanceIQ/FinanceIQ_gen.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .FinanceIQ_gen_e0e6b5 import financeIQ_datasets  # noqa: F401, F403
diff --git a/configs/datasets/FinanceIQ/FinanceIQ_gen_e0e6b5.py b/configs/datasets/FinanceIQ/FinanceIQ_gen_e0e6b5.py
new file mode 100644
--- /dev/null
+++ b/configs/datasets/FinanceIQ/FinanceIQ_gen_e0e6b5.py
@@ -0,0 +1,79 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import FinanceIQDataset
+from opencompass.utils.text_postprocessors import first_capital_postprocess
+
+# English aliases of the subjects below; not referenced in this file.
+financeIQ_subject_mapping_en = {
+    'certified_public_accountant': '注册会计师(CPA)',
+    'banking_qualification': '银行从业资格',
+    'securities_qualification': '证券从业资格',
+    'fund_qualification': '基金从业资格',
+    'insurance_qualification': '保险从业资格CICE',
+    'economic_analyst': '经济师',
+    'taxation_practitioner': '税务师',
+    'futures_qualification': '期货从业资格',
+    'certified_fin_planner': '理财规划师',
+    'actuary_fin_math': '精算师-金融数学',
+}
+
+# Subject name (used as the dataset name) -> name inserted in the prompt.
+financeIQ_subject_mapping = {
+    '注册会计师(CPA)': '注册会计师(CPA)',
+    '银行从业资格': '银行从业资格',
+    '证券从业资格': '证券从业资格',
+    '基金从业资格': '基金从业资格',
+    '保险从业资格CICE': '保险从业资格CICE',
+    '经济师': '经济师',
+    '税务师': '税务师',
+    '期货从业资格': '期货从业资格',
+    '理财规划师': '理财规划师',
+    '精算师-金融数学': '精算师-金融数学',
+}
+
+financeIQ_all_sets = list(financeIQ_subject_mapping)
+
+financeIQ_datasets = []
+for _name in financeIQ_all_sets:
+    _ch_name = financeIQ_subject_mapping[_name]
+    financeIQ_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin="</E>",
+                round=[
+                    dict(
+                        role="HUMAN",
+                        prompt=
+                        f"以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}"
+                    ),
+                    dict(role="BOT", prompt='答案是: {answer}'),
+                ]),
+            ice_token="</E>",
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    financeIQ_eval_cfg = dict(
+        evaluator=dict(type=AccEvaluator),
+        pred_postprocessor=dict(type=first_capital_postprocess))
+
+    financeIQ_datasets.append(
+        dict(
+            type=FinanceIQDataset,
+            path="./data/FinanceIQ/",
+            name=_name,
+            abbr=f"FinanceIQ-{_name}",
+            reader_cfg=dict(
+                input_columns=["question", "A", "B", "C", "D"],
+                output_column="answer",
+                train_split="dev",
+                test_split='test'),
+            infer_cfg=financeIQ_infer_cfg,
+            eval_cfg=financeIQ_eval_cfg,
+        ))
+
+del _name, _ch_name
diff --git a/configs/datasets/FinanceIQ/FinanceIQ_ppl.py b/configs/datasets/FinanceIQ/FinanceIQ_ppl.py
new file mode 100644
--- /dev/null
+++ b/configs/datasets/FinanceIQ/FinanceIQ_ppl.py
@@ -0,0 +1,4 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .FinanceIQ_ppl_42b9bd import financeIQ_datasets  # noqa: F401, F403
diff --git a/configs/datasets/FinanceIQ/FinanceIQ_ppl_42b9bd.py b/configs/datasets/FinanceIQ/FinanceIQ_ppl_42b9bd.py
new file mode 100644
--- /dev/null
+++ b/configs/datasets/FinanceIQ/FinanceIQ_ppl_42b9bd.py
@@ -0,0 +1,78 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import PPLInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import FinanceIQDataset
+
+# English aliases of the subjects below; not referenced in this file.
+financeIQ_subject_mapping_en = {
+    'certified_public_accountant': '注册会计师(CPA)',
+    'banking_qualification': '银行从业资格',
+    'securities_qualification': '证券从业资格',
+    'fund_qualification': '基金从业资格',
+    'insurance_qualification': '保险从业资格CICE',
+    'economic_analyst': '经济师',
+    'taxation_practitioner': '税务师',
+    'futures_qualification': '期货从业资格',
+    'certified_fin_planner': '理财规划师',
+    'actuary_fin_math': '精算师-金融数学',
+}
+
+# Subject name (used as the dataset name) -> name inserted in the prompt.
+financeIQ_subject_mapping = {
+    '注册会计师(CPA)': '注册会计师(CPA)',
+    '银行从业资格': '银行从业资格',
+    '证券从业资格': '证券从业资格',
+    '基金从业资格': '基金从业资格',
+    '保险从业资格CICE': '保险从业资格CICE',
+    '经济师': '经济师',
+    '税务师': '税务师',
+    '期货从业资格': '期货从业资格',
+    '理财规划师': '理财规划师',
+    '精算师-金融数学': '精算师-金融数学',
+}
+
+financeIQ_all_sets = list(financeIQ_subject_mapping)
+
+financeIQ_datasets = []
+for _name in financeIQ_all_sets:
+    _ch_name = financeIQ_subject_mapping[_name]
+    financeIQ_infer_cfg = dict(
+        ice_template=dict(
+            type=PromptTemplate,
+            template={
+                answer: dict(
+                    begin="</E>",
+                    round=[
+                        dict(
+                            role="HUMAN",
+                            prompt=f"以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}"
+                        ),
+                        dict(role="BOT", prompt=f'答案是: {answer}'),
+                    ])
+                for answer in ["A", "B", "C", "D"]
+            },
+            ice_token="</E>",
+        ),
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=PPLInferencer),
+    )
+
+    financeIQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+    financeIQ_datasets.append(
+        dict(
+            type=FinanceIQDataset,
+            path="./data/FinanceIQ/",
+            name=_name,
+            abbr=f"FinanceIQ-{_name}",
+            reader_cfg=dict(
+                input_columns=["question", "A", "B", "C", "D"],
+                output_column="answer",
+                train_split="dev",
+                test_split='test'),
+            infer_cfg=financeIQ_infer_cfg,
+            eval_cfg=financeIQ_eval_cfg,
+        ))
+
+del _name, _ch_name
diff --git a/opencompass/datasets/FinanceIQ.py b/opencompass/datasets/FinanceIQ.py
new file mode 100644
--- /dev/null
+++ b/opencompass/datasets/FinanceIQ.py
@@ -0,0 +1,57 @@
+import csv
+import os.path as osp
+
+from datasets import Dataset, DatasetDict
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class FinanceIQDataset(BaseDataset):
+    """Load one FinanceIQ subject from its per-split CSV files."""
+
+    @staticmethod
+    def load(path: str, name: str):
+        """Read the ``dev`` and ``test`` CSV files of one subject.
+
+        Args:
+            path (str): Directory containing the ``dev`` and ``test``
+                sub-directories.
+            name (str): Subject name; ``<path>/<split>/<name>.csv`` is
+                read for each split.
+
+        Returns:
+            DatasetDict: A ``Dataset`` per split whose records have the
+            keys ``question``, ``A``, ``B``, ``C``, ``D`` and ``answer``.
+
+        Raises:
+            ValueError: If a data row does not have exactly 7 columns.
+        """
+        dataset = DatasetDict()
+        for split in ['dev', 'test']:
+            raw_data = []
+            filename = osp.join(path, split, f'{name}.csv')
+            # newline='' leaves line-ending handling to the csv module, as
+            # its documentation requires for file objects.
+            with open(filename, encoding='utf-8', newline='') as f:
+                reader = csv.reader(f)
+                next(reader)  # skip the header
+                for row in reader:
+                    # Explicit check: an ``assert`` is dropped under -O.
+                    if len(row) != 7:
+                        raise ValueError(
+                            f'{filename}, line {reader.line_num}: '
+                            f'expected 7 columns, got {len(row)}')
+                    # row[0] is not read; the record starts at row[1].
+                    raw_data.append({
+                        'question': row[1],
+                        'A': row[2],
+                        'B': row[3],
+                        'C': row[4],
+                        'D': row[5],
+                        'answer': row[6],
+                    })
+            dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index 729a8192..8b113c2f 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -30,6 +30,7 @@ from .drop import *  # noqa: F401, F403
 from .ds1000 import *  # noqa: F401, F403
 from .ds1000_interpreter import *  # noqa: F401, F403
 from .eprstmt import *  # noqa: F401, F403
+from .FinanceIQ import *  # noqa: F401, F403
 from .flores import *  # noqa: F401, F403
 from .game24 import *  # noqa: F401, F403
 from .GaokaoBench import *  # noqa: F401, F403