diff --git a/configs/datasets/ARC_c/ARC_c_gen_1e0de5.py b/configs/datasets/ARC_c/ARC_c_gen_1e0de5.py index 5736fa79..b09470fb 100644 --- a/configs/datasets/ARC_c/ARC_c_gen_1e0de5.py +++ b/configs/datasets/ARC_c/ARC_c_gen_1e0de5.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import ARCDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess ARC_c_reader_cfg = dict( input_columns=["question", "textA", "textB", "textC", "textD"], @@ -27,7 +28,7 @@ ARC_c_infer_cfg = dict( ARC_c_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) ARC_c_datasets = [ diff --git a/configs/datasets/ARC_e/ARC_e_gen_1e0de5.py b/configs/datasets/ARC_e/ARC_e_gen_1e0de5.py index 4c82b122..5af17e4d 100644 --- a/configs/datasets/ARC_e/ARC_e_gen_1e0de5.py +++ b/configs/datasets/ARC_e/ARC_e_gen_1e0de5.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import ARCDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess ARC_e_reader_cfg = dict( input_columns=["question", "textA", "textB", "textC", "textD"], @@ -27,7 +28,7 @@ ARC_e_infer_cfg = dict( ARC_e_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) ARC_e_datasets = [ diff --git a/configs/datasets/CLUE_C3/CLUE_C3_gen_8c358f.py b/configs/datasets/CLUE_C3/CLUE_C3_gen_8c358f.py index af5cb16f..df67194e 100644 --- a/configs/datasets/CLUE_C3/CLUE_C3_gen_8c358f.py +++ b/configs/datasets/CLUE_C3/CLUE_C3_gen_8c358f.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import C3Dataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess C3_reader_cfg = dict( input_columns=[ @@ -35,7 +36,7 @@ C3_infer_cfg = dict( C3_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) C3_datasets = [ diff --git a/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen.py b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen.py index 4ec72897..fbbc4780 100644 --- a/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen.py +++ b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .CLUE_CMRC_gen_941108 import CMRC_datasets # noqa: F401, F403 + from .CLUE_CMRC_gen_1bd3c8 import CMRC_datasets # noqa: F401, F403 diff --git a/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_1bd3c8.py b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_1bd3c8.py new file mode 100644 index 00000000..6de67636 --- /dev/null +++ b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_1bd3c8.py @@ -0,0 +1,35 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator 
+from opencompass.datasets import CMRCDataset, cmrc_postprocess + +CMRC_reader_cfg = dict( + input_columns=['question', 'context'], output_column='answers') + +CMRC_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role="HUMAN", + prompt="根据文章回答问题。你的答案应该尽可能简练,请以 ‘答案是’ 开头的句式作答。\n文章:{context}\n问:{question}\n答:"), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +CMRC_eval_cfg = dict( + evaluator=dict(type=EMEvaluator), + pred_role="BOT", + pred_postprocessor=dict(type=cmrc_postprocess), +) + +CMRC_datasets = [ + dict( + type=CMRCDataset, + abbr='CMRC_dev', + path='./data/CLUE/CMRC/dev.json', + reader_cfg=CMRC_reader_cfg, + infer_cfg=CMRC_infer_cfg, + eval_cfg=CMRC_eval_cfg), +] diff --git a/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen.py b/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen.py index 91d27630..0cb4debe 100644 --- a/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen.py +++ b/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .CLUE_DRCD_gen_941108 import DRCD_datasets # noqa: F401, F403 + from .CLUE_DRCD_gen_1bd3c8 import DRCD_datasets # noqa: F401, F403 diff --git a/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_1bd3c8.py b/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_1bd3c8.py new file mode 100644 index 00000000..9d3880fa --- /dev/null +++ b/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_1bd3c8.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator +from opencompass.datasets import DRCDDataset, drcd_postprocess + +DRCD_reader_cfg = dict( + input_columns=['question', 'context'], output_column='answers') + +DRCD_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role="HUMAN", + prompt="根据文章回答问题。你的答案应该尽可能简练,请以 ‘答案是’ 开头的句式作答。\n文章:{context}\n问:{question}\n答:"), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +DRCD_eval_cfg = dict( + evaluator=dict(type=EMEvaluator), + pred_role="BOT", + pred_postprocessor=dict(type=drcd_postprocess), + +) + +DRCD_datasets = [ + dict( + type=DRCDDataset, + abbr='DRCD_dev', + path='./data/CLUE/DRCD/dev.json', + reader_cfg=DRCD_reader_cfg, + infer_cfg=DRCD_infer_cfg, + eval_cfg=DRCD_eval_cfg), +] diff --git a/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen_901306.py b/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen_901306.py index 7591d29c..8b60d14f 100644 --- a/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen_901306.py +++ b/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen_901306.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import AFQMCDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess afqmc_reader_cfg = dict( input_columns=["sentence1", "sentence2"], @@ -27,7 +28,7 @@ afqmc_infer_cfg = dict( afqmc_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) afqmc_datasets = [ diff --git a/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_1abf97.py b/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_1abf97.py index c86e2c29..11e4c8cf 100644 --- 
a/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_1abf97.py +++ b/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_1abf97.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import cmnliDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess cmnli_reader_cfg = dict( input_columns=["sentence1", "sentence2"], @@ -27,7 +28,7 @@ cmnli_infer_cfg = dict( cmnli_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) cmnli_datasets = [ diff --git a/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_51e956.py b/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_51e956.py index 1d56a12b..00ff3b2b 100644 --- a/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_51e956.py +++ b/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen_51e956.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import cmnliDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess cmnli_reader_cfg = dict( input_columns=["sentence1", "sentence2"], @@ -27,7 +28,7 @@ cmnli_infer_cfg = dict( cmnli_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) cmnli_datasets = [ diff --git a/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_51e956.py b/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_51e956.py index b4582492..79ddea13 100644 --- a/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_51e956.py +++ b/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_51e956.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import cmnliDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess ocnli_reader_cfg = dict( input_columns=["sentence1", "sentence2"], @@ -28,7 +29,7 @@ ocnli_infer_cfg = dict( ocnli_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) ocnli_datasets = [ diff --git a/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_c4cb6c.py b/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_c4cb6c.py index 71518795..d856efdd 100644 --- a/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_c4cb6c.py +++ b/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen_c4cb6c.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import cmnliDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess ocnli_reader_cfg = dict( input_columns=["sentence1", "sentence2"], @@ -28,7 +29,7 @@ ocnli_infer_cfg = dict( ocnli_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) ocnli_datasets = [ diff --git a/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_634f41.py 
b/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_634f41.py index b9698c7a..c8b91b2d 100644 --- a/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_634f41.py +++ b/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_634f41.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import AFQMCDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess bustm_reader_cfg = dict( input_columns=["sentence1", "sentence2"], @@ -27,7 +28,7 @@ bustm_infer_cfg = dict( bustm_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) bustm_datasets = [ diff --git a/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen_0a29a2.py b/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen_0a29a2.py index 6c638558..d6c28de7 100644 --- a/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen_0a29a2.py +++ b/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen_0a29a2.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import CHIDDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess chid_reader_cfg = dict( input_columns=["content","A","B","C","D","E","F","G"], @@ -27,7 +28,7 @@ chid_infer_cfg = dict( chid_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) chid_datasets = [ diff --git a/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen_c68933.py b/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen_c68933.py index fd9fbc00..ac159e36 100644 --- a/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen_c68933.py +++ b/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen_c68933.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import CluewscDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess cluewsc_reader_cfg = dict( input_columns=["span1", "span2", "text", "new_text"], @@ -27,7 +28,7 @@ cluewsc_infer_cfg = dict( cluewsc_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) cluewsc_datasets = [ diff --git a/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py b/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py index 1146964c..bb62a008 100644 --- a/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py +++ b/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .FewCLUE_csl_gen_87f4a8 import csl_datasets # noqa: F401, F403 + from .FewCLUE_csl_gen_28b223 import csl_datasets # noqa: F401, F403 diff --git a/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_28b223.py b/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_28b223.py new file mode 100644 index 00000000..857d796c --- /dev/null +++ b/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_28b223.py @@ -0,0 +1,51 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate 
+from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import CslDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess + +csl_reader_cfg = dict( + input_columns=["abst", "keywords"], + output_column="label", +) + +csl_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role="HUMAN", + prompt= + "摘要是对论文内容不加注释和评论的简短陈述,要求扼要地说明研究工作的目的、研究方法和最终结论等。\n关键词是一篇学术论文的核心词汇,一般由一系列名词组成。关键词在全文中应有较高出现频率,且能起到帮助文献检索的作用。\n摘要:{abst}\n关键词:{keywords}\n请问上述关键词是否匹配摘要且符合要求?\nA. 否\nB. 是\n请从”A“,”B“中进行选择。\n答:" + ) + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +csl_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role="BOT", + pred_postprocessor=dict(type=first_capital_postprocess), +) + +csl_datasets = [ + dict( + abbr="csl_dev", + type=CslDataset_V2, + path="./data/FewCLUE/csl/dev_few_all.json", + reader_cfg=csl_reader_cfg, + infer_cfg=csl_infer_cfg, + eval_cfg=csl_eval_cfg, + ), + dict( + abbr="csl_test", + type=CslDataset_V2, + path="./data/FewCLUE/csl/test_public.json", + reader_cfg=csl_reader_cfg, + infer_cfg=csl_infer_cfg, + eval_cfg=csl_eval_cfg, + ), +] diff --git a/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_87f4a8.py b/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_87f4a8.py index c83db29e..a5b8134c 100644 --- a/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_87f4a8.py +++ b/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen_87f4a8.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import CslDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess csl_reader_cfg = dict( input_columns=["abst", "keywords"], @@ -27,7 +28,7 @@ csl_infer_cfg = dict( csl_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) csl_datasets = [ diff --git a/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen_740ea0.py b/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen_740ea0.py index 34d3cc3b..28c0de06 100644 --- a/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen_740ea0.py +++ b/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen_740ea0.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import eprstmtDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess eprstmt_reader_cfg = dict( input_columns=["sentence"], output_column="label", test_split="train") @@ -25,7 +26,7 @@ eprstmt_infer_cfg = dict( eprstmt_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) eprstmt_datasets = [ diff --git a/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen_f97a97.py b/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen_f97a97.py index 2caa8888..2609eb0a 100644 --- a/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen_f97a97.py +++ b/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen_f97a97.py @@ -3,6 
+3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import cmnliDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess ocnli_fc_reader_cfg = dict( input_columns=["sentence1", "sentence2"], @@ -26,7 +27,7 @@ ocnli_fc_infer_cfg = dict( ocnli_fc_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) ocnli_fc_datasets = [ diff --git a/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen_b90e4a.py b/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen_b90e4a.py index 9257b72e..cb999a5a 100644 --- a/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen_b90e4a.py +++ b/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen_b90e4a.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import TNewsDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess tnews_reader_cfg = dict( input_columns="sentence", @@ -49,7 +50,7 @@ tnews_infer_cfg = dict( tnews_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) tnews_datasets = [ diff --git a/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen_4dfefa.py b/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen_4dfefa.py index 6304dc4c..ab687c35 100644 --- a/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen_4dfefa.py +++ b/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen_4dfefa.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import AXDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess AX_b_reader_cfg = dict( input_columns=["sentence1", "sentence2"], @@ -27,7 +28,7 @@ AX_b_infer_cfg = dict( AX_b_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) AX_b_datasets = [ diff --git a/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen_68aac7.py b/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen_68aac7.py index b879cedd..e057f277 100644 --- a/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen_68aac7.py +++ b/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen_68aac7.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import AXDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess AX_g_reader_cfg = dict( input_columns=["hypothesis", "premise"], @@ -27,7 +28,7 @@ AX_g_infer_cfg = dict( AX_g_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) AX_g_datasets = [ diff --git a/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py 
index 43f65067..7bbcf106 100644 --- a/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py +++ b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen_883d50.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import BoolQDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess BoolQ_reader_cfg = dict( input_columns=["question", "passage"], @@ -25,7 +26,7 @@ BoolQ_infer_cfg = dict( BoolQ_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) BoolQ_datasets = [ diff --git a/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py b/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py index 89762511..4501ecc1 100644 --- a/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py +++ b/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen_854c6c.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import CBDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess CB_reader_cfg = dict( input_columns=["premise", "hypothesis"], @@ -28,7 +29,7 @@ CB_infer_cfg = dict( CB_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) CB_datasets = [ diff --git a/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen_91ca53.py b/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen_91ca53.py index 879390b9..7beb22da 100644 --- a/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen_91ca53.py +++ b/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen_91ca53.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import COPADataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess COPA_reader_cfg = dict( input_columns=["question", "premise", "choice1", "choice2"], @@ -28,7 +29,7 @@ COPA_infer_cfg = dict( COPA_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) COPA_datasets = [ diff --git a/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen_27071f.py b/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen_27071f.py index a3cce8e5..f69ad70b 100644 --- a/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen_27071f.py +++ b/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen_27071f.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import MultiRCDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess MultiRC_reader_cfg = dict( input_columns=["question", "text", "answer"], @@ -27,7 +28,7 @@ MultiRC_infer_cfg = dict( MultiRC_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), 
+ pred_postprocessor=dict(type=first_capital_postprocess), ) MultiRC_datasets = [ diff --git a/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen_68aac7.py b/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen_68aac7.py index f1696437..7fdc39f3 100644 --- a/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen_68aac7.py +++ b/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen_68aac7.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import AXDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess RTE_reader_cfg = dict( input_columns=["hypothesis", "premise"], @@ -27,7 +28,7 @@ RTE_infer_cfg = dict( RTE_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) RTE_datasets = [ diff --git a/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_0f7784.py b/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_0f7784.py index 06dbc502..002e5b0f 100644 --- a/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_0f7784.py +++ b/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_0f7784.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import EMEvaluator -from opencompass.datasets import ReCoRDDataset +from opencompass.datasets import ReCoRDDataset, ReCoRD_postprocess ReCoRD_reader_cfg = dict( input_columns=['question', 'text'], output_column='answers') @@ -16,7 +16,7 @@ ReCoRD_infer_cfg = dict( inferencer=dict(type=GenInferencer)) ReCoRD_eval_cfg = dict( - evaluator=dict(type=EMEvaluator), pred_postprocessor=dict(type='ReCoRD')) + evaluator=dict(type=EMEvaluator), pred_postprocessor=dict(type=ReCoRD_postprocess)) ReCoRD_datasets = [ dict( diff --git a/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen.py b/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen.py index 087f66c2..fba65d9d 100644 --- a/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen.py +++ b/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .SuperGLUE_WSC_gen_6dc406 import WSC_datasets # noqa: F401, F403 + from .SuperGLUE_WSC_gen_8a881c import WSC_datasets # noqa: F401, F403 diff --git a/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen_6dc406.py b/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen_6dc406.py index 30887362..abdfef1c 100644 --- a/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen_6dc406.py +++ b/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen_6dc406.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import WSCDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess WSC_reader_cfg = dict( input_columns=["span1", "span2", "text"], @@ -27,7 +28,7 @@ WSC_infer_cfg = dict( WSC_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) WSC_datasets = [ diff --git a/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen_8a881c.py 
b/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen_8a881c.py new file mode 100644 index 00000000..b1e4d714 --- /dev/null +++ b/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen_8a881c.py @@ -0,0 +1,43 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import WSCDataset_V3 +from opencompass.utils.text_postprocessors import first_capital_postprocess + +WSC_reader_cfg = dict( + input_columns=["span1", "span2", "text"], + output_column="label", +) + +WSC_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role="HUMAN", + prompt= + "Passage: {text}\nDoes the pronoun # {span2} # refer to * {span1} *?\nA. Yes\nB. No\nAnseer:" + ), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +WSC_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role="BOT", + pred_postprocessor=dict(type=first_capital_postprocess), +) + +WSC_datasets = [ + dict( + abbr="WSC", + type=WSCDataset_V3, + path="./data/SuperGLUE/WSC/val.jsonl", + reader_cfg=WSC_reader_cfg, + infer_cfg=WSC_infer_cfg, + eval_cfg=WSC_eval_cfg, + ) +] diff --git a/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen_d06864.py b/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen_d06864.py index cfa15e4a..05a2a1ab 100644 --- a/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen_d06864.py +++ b/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen_d06864.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import WiCDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess WiC_reader_cfg = dict( input_columns=[ @@ -31,7 +32,7 @@ WiC_infer_cfg = dict( WiC_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) WiC_datasets = [ diff --git a/configs/datasets/TheoremQA/TheoremQA_gen.py b/configs/datasets/TheoremQA/TheoremQA_gen.py index 648ccb7b..0824a9c2 100644 --- a/configs/datasets/TheoremQA/TheoremQA_gen.py +++ b/configs/datasets/TheoremQA/TheoremQA_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .TheoremQA_gen_a27a10 import TheoremQA_datasets # noqa: F401, F403 + from .TheoremQA_gen_7009de import TheoremQA_datasets # noqa: F401, F403 diff --git a/configs/datasets/TheoremQA/TheoremQA_gen_6365d5.py b/configs/datasets/TheoremQA/TheoremQA_gen_424e0a.py similarity index 90% rename from configs/datasets/TheoremQA/TheoremQA_gen_6365d5.py rename to configs/datasets/TheoremQA/TheoremQA_gen_424e0a.py index 8046ce40..651b8212 100644 --- a/configs/datasets/TheoremQA/TheoremQA_gen_6365d5.py +++ b/configs/datasets/TheoremQA/TheoremQA_gen_424e0a.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import TheoremQADataset +from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess TheoremQA_reader_cfg = dict( input_columns=['Question', 'Answer_type'], @@ 
-23,11 +23,11 @@ TheoremQA_infer_cfg = dict( type=PromptTemplate, template=TheoremQA_prompt2), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer, max_out_len=512)) TheoremQA_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type='TheoremQA')) + pred_postprocessor=dict(type=TheoremQA_postprocess)) TheoremQA_datasets = [ dict( diff --git a/configs/datasets/TheoremQA/TheoremQA_gen_a27a10.py b/configs/datasets/TheoremQA/TheoremQA_gen_7009de.py similarity index 89% rename from configs/datasets/TheoremQA/TheoremQA_gen_a27a10.py rename to configs/datasets/TheoremQA/TheoremQA_gen_7009de.py index bbbd6b24..e5dac6f0 100644 --- a/configs/datasets/TheoremQA/TheoremQA_gen_a27a10.py +++ b/configs/datasets/TheoremQA/TheoremQA_gen_7009de.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import TheoremQADataset +from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess TheoremQA_reader_cfg = dict( input_columns=['Question', 'Answer_type'], @@ -31,11 +31,11 @@ TheoremQA_infer_cfg = dict( dict(role='HUMAN', prompt=TheoremQA_prompt2), ])), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer, max_out_len=512)) TheoremQA_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type='TheoremQA')) + pred_postprocessor=dict(type=TheoremQA_postprocess)) TheoremQA_datasets = [ dict( diff --git a/configs/datasets/TheoremQA/TheoremQA_gen_8acdf7.py b/configs/datasets/TheoremQA/TheoremQA_gen_ef26ca.py similarity index 88% rename from configs/datasets/TheoremQA/TheoremQA_gen_8acdf7.py rename to configs/datasets/TheoremQA/TheoremQA_gen_ef26ca.py index 9651c019..ef037ee2 100644 --- a/configs/datasets/TheoremQA/TheoremQA_gen_8acdf7.py +++ b/configs/datasets/TheoremQA/TheoremQA_gen_ef26ca.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import TheoremQADataset +from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess TheoremQA_reader_cfg = dict( input_columns=['Question', 'Answer_type'], @@ -20,11 +20,11 @@ TheoremQA_infer_cfg = dict( ), ])), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + inferencer=dict(type=GenInferencer, max_out_len=512)) TheoremQA_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type='TheoremQA')) + pred_postprocessor=dict(type=TheoremQA_postprocess)) TheoremQA_datasets = [ dict( diff --git a/configs/datasets/XLSum/XLSum_gen_2bb71c.py b/configs/datasets/XLSum/XLSum_gen_2bb71c.py index ac0a5f3a..8596df0f 100644 --- a/configs/datasets/XLSum/XLSum_gen_2bb71c.py +++ b/configs/datasets/XLSum/XLSum_gen_2bb71c.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import RougeEvaluator -from opencompass.datasets import XLSUMDataset +from opencompass.datasets import XLSUMDataset, 
Xsum_postprocess XLSum_reader_cfg = dict(input_columns=['text'], output_column='summary') @@ -16,7 +16,7 @@ XLSum_infer_cfg = dict( XLSum_eval_cfg = dict( evaluator=dict(type=RougeEvaluator), - pred_postprocessor=dict(type='Xsum'), + pred_postprocessor=dict(type=Xsum_postprocess), ) XLSum_datasets = [ diff --git a/configs/datasets/Xsum/Xsum_gen_8ea5f8.py b/configs/datasets/Xsum/Xsum_gen_8ea5f8.py index 8ef0ef34..364e7004 100644 --- a/configs/datasets/Xsum/Xsum_gen_8ea5f8.py +++ b/configs/datasets/Xsum/Xsum_gen_8ea5f8.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import RougeEvaluator -from opencompass.datasets import XsumDataset +from opencompass.datasets import XsumDataset, Xsum_postprocess Xsum_reader_cfg = dict(input_columns=['dialogue'], output_column='summary') @@ -16,7 +16,7 @@ Xsum_infer_cfg = dict( Xsum_eval_cfg = dict( evaluator=dict(type=RougeEvaluator), - pred_postprocessor=dict(type='Xsum'), + pred_postprocessor=dict(type=Xsum_postprocess), ) Xsum_datasets = [ diff --git a/configs/datasets/agieval/agieval_gen_0a9ace.py b/configs/datasets/agieval/agieval_gen_0a9ace.py index 42019307..113b0e2d 100644 --- a/configs/datasets/agieval/agieval_gen_0a9ace.py +++ b/configs/datasets/agieval/agieval_gen_0a9ace.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import AGIEvalDataset, AGIEvalEvaluator +from opencompass.utils.text_postprocessors import first_capital_postprocess agieval_reader_cfg = dict( input_columns=['problem_input'], output_column='label') @@ -44,7 +45,7 @@ for name in agieval_single_choice_sets: agieval_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type='first-capital')) + pred_postprocessor=dict(type=first_capital_postprocess)) agieval_datasets.append( dict( diff --git a/configs/datasets/agieval/agieval_gen_397d81.py b/configs/datasets/agieval/agieval_gen_397d81.py index 0df8f90a..523cb074 100644 --- a/configs/datasets/agieval/agieval_gen_397d81.py +++ b/configs/datasets/agieval/agieval_gen_397d81.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator +from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi agieval_reader_cfg = dict( input_columns=['question', 'options'], output_column='label') @@ -82,7 +83,7 @@ for _name in agieval_single_choice_sets: agieval_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type='first-capital')) + pred_postprocessor=dict(type=first_capital_postprocess)) agieval_datasets.append( dict( @@ -111,7 +112,7 @@ for _name in agieval_multiple_choices_sets: agieval_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type='first-capital-multi')) + pred_postprocessor=dict(type=first_capital_postprocess_multi)) agieval_datasets.append( dict( diff --git a/configs/datasets/agieval/agieval_mixed_2f14ad.py b/configs/datasets/agieval/agieval_mixed_2f14ad.py index 555ca0da..bd75df3f 100644 --- 
a/configs/datasets/agieval/agieval_mixed_2f14ad.py +++ b/configs/datasets/agieval/agieval_mixed_2f14ad.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer, GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import AGIEvalDataset_v2, AGIEvalEvaluator +from opencompass.utils.text_postprocessors import first_capital_postprocess_multi agieval_single_choice_sets = [ 'gaokao-chinese', @@ -116,7 +117,7 @@ for _name in agieval_multiple_choices_sets: agieval_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type='first-capital-multi')) + pred_postprocessor=dict(type=first_capital_postprocess_multi)) agieval_datasets.append( dict( diff --git a/configs/datasets/apps/apps_gen_5b4254.py b/configs/datasets/apps/apps_gen_5b4254.py index 2f510e11..636d2a4b 100644 --- a/configs/datasets/apps/apps_gen_5b4254.py +++ b/configs/datasets/apps/apps_gen_5b4254.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset, HumanEvaluator +from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess apps_reader_cfg = dict( input_columns=['question'], output_column='problem_id', train_split='test') @@ -20,7 +20,7 @@ apps_eval_cfg = dict( evaluator=dict(type=HumanEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval - pred_postprocessor=dict(type='humaneval'), + pred_postprocessor=dict(type=humaneval_postprocess), ) apps_datasets = [ diff --git a/configs/datasets/apps/apps_gen_7fbb95.py b/configs/datasets/apps/apps_gen_7fbb95.py index 481a4e05..33244e0d 100644 --- a/configs/datasets/apps/apps_gen_7fbb95.py +++ b/configs/datasets/apps/apps_gen_7fbb95.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset, HumanEvaluator +from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess apps_reader_cfg = dict( input_columns=['question'], output_column='problem_id', train_split='test') @@ -27,7 +27,7 @@ apps_eval_cfg = dict( evaluator=dict(type=HumanEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval - pred_postprocessor=dict(type='humaneval'), + pred_postprocessor=dict(type=humaneval_postprocess), ) apps_datasets = [ diff --git a/configs/datasets/apps/apps_gen_b4dee3.py b/configs/datasets/apps/apps_gen_b4dee3.py index 4eec64fa..0d018504 100644 --- a/configs/datasets/apps/apps_gen_b4dee3.py +++ b/configs/datasets/apps/apps_gen_b4dee3.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset, HumanEvaluator +from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess apps_reader_cfg = dict( input_columns=['question'], output_column='problem_id', train_split='test') @@ -17,7 +17,7 @@ apps_infer_cfg = dict( apps_eval_cfg = dict( evaluator=dict(type=HumanEvaluator), k=[1, 10, 100], - pred_postprocessor=dict(type='humaneval'), + pred_postprocessor=dict(type=humaneval_postprocess), ) 
apps_datasets = [ diff --git a/configs/datasets/bbh/bbh_gen.py b/configs/datasets/bbh/bbh_gen.py index 03768981..cb9dff44 100644 --- a/configs/datasets/bbh/bbh_gen.py +++ b/configs/datasets/bbh/bbh_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .bbh_gen_6bd693 import bbh_datasets # noqa: F401, F403 + from .bbh_gen_5b92b0 import bbh_datasets # noqa: F401, F403 diff --git a/configs/datasets/bbh/bbh_gen_6bd693.py b/configs/datasets/bbh/bbh_gen_5b92b0.py similarity index 94% rename from configs/datasets/bbh/bbh_gen_6bd693.py rename to configs/datasets/bbh/bbh_gen_5b92b0.py index 6489e5c4..11f1b3bb 100644 --- a/configs/datasets/bbh/bbh_gen_6bd693.py +++ b/configs/datasets/bbh/bbh_gen_5b92b0.py @@ -3,7 +3,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import BBHDataset, BBHEvaluator +from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess bbh_reader_cfg = dict(input_columns=["input"], output_column="target") @@ -61,8 +61,8 @@ for _name in bbh_multiple_choice_sets: bbh_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type='bbh-mcq'), - dataset_postprocessor=dict(type='bbh-mcq')) + pred_postprocessor=dict(type=bbh_mcq_postprocess), + dataset_postprocessor=dict(type=bbh_mcq_postprocess)) bbh_datasets.append( dict( diff --git a/configs/datasets/ceval/ceval_gen_2daf24.py b/configs/datasets/ceval/ceval_gen_2daf24.py index c87cd3bd..c203b51c 100644 --- a/configs/datasets/ceval/ceval_gen_2daf24.py +++ b/configs/datasets/ceval/ceval_gen_2daf24.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import FixKRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import CEvalDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess ceval_subject_mapping = { "computer_network": @@ -166,7 +167,7 @@ for _split in ["val", "test"]: ceval_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type='first-capital')) + pred_postprocessor=dict(type=first_capital_postprocess)) ceval_datasets.append( dict( diff --git a/configs/datasets/ceval/ceval_gen_5f30c7.py b/configs/datasets/ceval/ceval_gen_5f30c7.py index 323cbafa..1ccbe4de 100644 --- a/configs/datasets/ceval/ceval_gen_5f30c7.py +++ b/configs/datasets/ceval/ceval_gen_5f30c7.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import FixKRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import CEvalDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess ceval_subject_mapping = { "computer_network": @@ -164,7 +165,9 @@ for _split in ["val"]: inferencer=dict(type=GenInferencer, fix_id_list=[0, 1, 2, 3, 4]), ) - ceval_eval_cfg = dict(evaluator=dict(type=AccEvaluator), ) + ceval_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_capital_postprocess)) ceval_datasets.append( dict( diff --git a/configs/datasets/collections/base_medium.py b/configs/datasets/collections/base_medium.py index 4684c1e6..d3caf379 100644 --- a/configs/datasets/collections/base_medium.py +++ 
b/configs/datasets/collections/base_medium.py @@ -5,18 +5,18 @@ with read_base(): from ..ceval.ceval_ppl_578f8d import ceval_datasets from ..agieval.agieval_mixed_2f14ad import agieval_datasets from ..GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets - from ..bbh.bbh_gen_6bd693 import bbh_datasets + from ..bbh.bbh_gen_5b92b0 import bbh_datasets from ..humaneval.humaneval_gen_8e312c import humaneval_datasets from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets - from ..CLUE_CMRC.CLUE_CMRC_gen_941108 import CMRC_datasets - from ..CLUE_DRCD.CLUE_DRCD_gen_941108 import DRCD_datasets + from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets + from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets from ..CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets from ..CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets from ..CLUE_ocnli.CLUE_ocnli_ppl_fdc6de import ocnli_datasets from ..FewCLUE_bustm.FewCLUE_bustm_ppl_e53034 import bustm_datasets from ..FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets - from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_868415 import cluewsc_datasets + from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_4284a0 import cluewsc_datasets from ..FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets from ..FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets from ..FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_ppl_c08300 import ocnli_fc_datasets @@ -33,24 +33,24 @@ with read_base(): from ..SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets from ..SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets - from ..SuperGLUE_WSC.SuperGLUE_WSC_ppl_d0f531 import WSC_datasets - from ..race.race_ppl_ab8734 import race_datasets + from ..SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets + from ..race.race_ppl_a138cd import race_datasets from ..Xsum.Xsum_gen_31397e import Xsum_datasets from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets from ..summedits.summedits_ppl_1fbeb6 import summedits_datasets - from ..math.math_gen_3e92f6 import math_datasets - from ..TheoremQA.TheoremQA_gen_8acdf7 import TheoremQA_datasets + from ..math.math_gen_265cce import math_datasets + from ..TheoremQA.TheoremQA_gen_ef26ca import TheoremQA_datasets from ..hellaswag.hellaswag_ppl_47bff9 import hellaswag_datasets from ..ARC_e.ARC_e_ppl_a450bd import ARC_e_datasets from ..ARC_c.ARC_c_ppl_a450bd import ARC_c_datasets from ..commonsenseqa.commonsenseqa_ppl_5545e2 import commonsenseqa_datasets from ..piqa.piqa_ppl_1cf9f0 import piqa_datasets from ..siqa.siqa_ppl_ced5f6 import siqa_datasets - from ..strategyqa.strategyqa_gen_b3ff20 import strategyqa_datasets - from ..winogrande.winogrande_ppl_18e5de import winogrande_datasets + from ..strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets + from ..winogrande.winogrande_ppl_55a66e import winogrande_datasets from ..obqa.obqa_ppl_c7c154 import obqa_datasets - from ..nq.nq_gen_3dcea1 import nq_datasets - from ..triviaqa.triviaqa_gen_3e39a5 import triviaqa_datasets + from ..nq.nq_gen_c788f6 import nq_datasets + from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets from ..flores.flores_gen_806ede import flores_datasets from ..crowspairs.crowspairs_ppl_e811e1 import crowspairs_datasets diff --git a/configs/datasets/collections/base_small.py b/configs/datasets/collections/base_small.py index 835ae742..a038ad39 100644 --- a/configs/datasets/collections/base_small.py +++ 
b/configs/datasets/collections/base_small.py @@ -2,9 +2,9 @@ from mmengine.config import read_base with read_base(): from ..ceval.ceval_ppl_578f8d import ceval_datasets - from ..bbh.bbh_gen_6bd693 import bbh_datasets - from ..CLUE_CMRC.CLUE_CMRC_gen_941108 import CMRC_datasets - from ..CLUE_DRCD.CLUE_DRCD_gen_941108 import DRCD_datasets + from ..bbh.bbh_gen_5b92b0 import bbh_datasets + from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets + from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets from ..CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets from ..FewCLUE_bustm.FewCLUE_bustm_ppl_e53034 import bustm_datasets from ..FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets @@ -24,16 +24,16 @@ with read_base(): from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets from ..SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets from ..SuperGLUE_WSC.SuperGLUE_WSC_ppl_d0f531 import WSC_datasets - from ..race.race_ppl_ab8734 import race_datasets - from ..math.math_gen_3e92f6 import math_datasets + from ..race.race_ppl_a138cd import race_datasets + from ..math.math_gen_265cce import math_datasets from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets from ..summedits.summedits_ppl_1fbeb6 import summedits_datasets from ..hellaswag.hellaswag_ppl_47bff9 import hellaswag_datasets from ..piqa.piqa_ppl_1cf9f0 import piqa_datasets - from ..winogrande.winogrande_ppl_18e5de import winogrande_datasets + from ..winogrande.winogrande_ppl_55a66e import winogrande_datasets from ..obqa.obqa_ppl_c7c154 import obqa_datasets - from ..nq.nq_gen_3dcea1 import nq_datasets - from ..triviaqa.triviaqa_gen_3e39a5 import triviaqa_datasets + from ..nq.nq_gen_c788f6 import nq_datasets + from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets from ..crowspairs.crowspairs_ppl_e811e1 import crowspairs_datasets datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/collections/chat_medium.py b/configs/datasets/collections/chat_medium.py index ebd17e5d..ddbb2bb3 100644 --- a/configs/datasets/collections/chat_medium.py +++ b/configs/datasets/collections/chat_medium.py @@ -5,19 +5,19 @@ with read_base(): from ..ceval.ceval_gen_5f30c7 import ceval_datasets from ..agieval.agieval_gen_397d81 import agieval_datasets from ..GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets - from ..bbh.bbh_gen_6bd693 import bbh_datasets + from ..bbh.bbh_gen_5b92b0 import bbh_datasets from ..humaneval.humaneval_gen_8e312c import humaneval_datasets from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets from ..CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets - from ..CLUE_CMRC.CLUE_CMRC_gen_941108 import CMRC_datasets - from ..CLUE_DRCD.CLUE_DRCD_gen_941108 import DRCD_datasets + from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets + from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets from ..CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets from ..CLUE_cmnli.CLUE_cmnli_gen_1abf97 import cmnli_datasets from ..CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets from ..FewCLUE_bustm.FewCLUE_bustm_gen_634f41 import bustm_datasets from ..FewCLUE_chid.FewCLUE_chid_gen_0a29a2 import chid_datasets from ..FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets - from ..FewCLUE_csl.FewCLUE_csl_gen_87f4a8 import csl_datasets + from ..FewCLUE_csl.FewCLUE_csl_gen_28b223 import csl_datasets from ..FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets from ..FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_gen_f97a97 import 
ocnli_fc_datasets from ..FewCLUE_tnews.FewCLUE_tnews_gen_b90e4a import tnews_datasets @@ -37,20 +37,20 @@ with read_base(): from ..race.race_gen_69ee4f import race_datasets from ..Xsum.Xsum_gen_31397e import Xsum_datasets from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from ..summedits.summedits_gen_4fb38b import summedits_datasets - from ..math.math_gen_3e92f6 import math_datasets - from ..TheoremQA.TheoremQA_gen_a27a10 import TheoremQA_datasets + from ..summedits.summedits_gen_315438 import summedits_datasets + from ..math.math_gen_265cce import math_datasets + from ..TheoremQA.TheoremQA_gen_7009de import TheoremQA_datasets from ..hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets from ..ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets from ..ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets from ..commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets from ..piqa.piqa_gen_1194eb import piqa_datasets from ..siqa.siqa_gen_e78df3 import siqa_datasets - from ..strategyqa.strategyqa_gen_b3ff20 import strategyqa_datasets + from ..strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets from ..winogrande.winogrande_gen_a9ede5 import winogrande_datasets from ..obqa.obqa_gen_9069e4 import obqa_datasets - from ..nq.nq_gen_68c1c6 import nq_datasets - from ..triviaqa.triviaqa_gen_3e39a5 import triviaqa_datasets + from ..nq.nq_gen_c788f6 import nq_datasets + from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets from ..flores.flores_gen_806ede import flores_datasets from ..crowspairs.crowspairs_gen_02b6c1 import crowspairs_datasets diff --git a/configs/datasets/collections/chat_small.py b/configs/datasets/collections/chat_small.py index 0f85a806..2004077f 100644 --- a/configs/datasets/collections/chat_small.py +++ b/configs/datasets/collections/chat_small.py @@ -3,9 +3,9 @@ from mmengine.config import read_base with read_base(): from ..mmlu.mmlu_gen_a484b3 import mmlu_datasets from ..ceval.ceval_gen_5f30c7 import ceval_datasets - from ..bbh.bbh_gen_6bd693 import bbh_datasets - from ..CLUE_CMRC.CLUE_CMRC_gen_941108 import CMRC_datasets - from ..CLUE_DRCD.CLUE_DRCD_gen_941108 import DRCD_datasets + from ..bbh.bbh_gen_5b92b0 import bbh_datasets + from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets + from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets from ..CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets from ..FewCLUE_bustm.FewCLUE_bustm_gen_634f41 import bustm_datasets from ..FewCLUE_chid.FewCLUE_chid_gen_0a29a2 import chid_datasets @@ -24,17 +24,17 @@ with read_base(): from ..SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets from ..SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets - from ..SuperGLUE_WSC.SuperGLUE_WSC_gen_6dc406 import WSC_datasets + from ..SuperGLUE_WSC.SuperGLUE_WSC_gen_8a881c import WSC_datasets from ..race.race_gen_69ee4f import race_datasets - from ..math.math_gen_3e92f6 import math_datasets + from ..math.math_gen_265cce import math_datasets from ..gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets - from ..summedits.summedits_gen_4fb38b import summedits_datasets + from ..summedits.summedits_gen_315438 import summedits_datasets from ..hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets from ..piqa.piqa_gen_1194eb import piqa_datasets from ..winogrande.winogrande_gen_a9ede5 import winogrande_datasets from ..obqa.obqa_gen_9069e4 import obqa_datasets - from ..nq.nq_gen_68c1c6 import nq_datasets - from ..triviaqa.triviaqa_gen_3e39a5 import 
triviaqa_datasets + from ..nq.nq_gen_c788f6 import nq_datasets + from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets from ..crowspairs.crowspairs_gen_02b6c1 import crowspairs_datasets datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/collections/example.py b/configs/datasets/collections/example.py index 4ea7bc9a..baa4325f 100644 --- a/configs/datasets/collections/example.py +++ b/configs/datasets/collections/example.py @@ -2,6 +2,6 @@ from mmengine.config import read_base with read_base(): from ..piqa.piqa_gen_1194eb import piqa_datasets - from ..nq.nq_gen_68c1c6 import nq_datasets + from ..nq.nq_gen_c788f6 import nq_datasets datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/commonsenseqa/commonsenseqa_gen_c946f2.py b/configs/datasets/commonsenseqa/commonsenseqa_gen_c946f2.py index 9a3d008c..088e2906 100644 --- a/configs/datasets/commonsenseqa/commonsenseqa_gen_c946f2.py +++ b/configs/datasets/commonsenseqa/commonsenseqa_gen_c946f2.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import MDLRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import commonsenseqaDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess commonsenseqa_reader_cfg = dict( input_columns=["question", "A", "B", "C", "D", "E"], @@ -44,7 +45,7 @@ commonsenseqa_infer_cfg = dict( commonsenseqa_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) commonsenseqa_datasets = [ diff --git a/configs/datasets/crowspairs/crowspairs_gen_02b6c1.py b/configs/datasets/crowspairs/crowspairs_gen_02b6c1.py index ebb7bfc6..3981ff87 100644 --- a/configs/datasets/crowspairs/crowspairs_gen_02b6c1.py +++ b/configs/datasets/crowspairs/crowspairs_gen_02b6c1.py @@ -3,10 +3,11 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import crowspairsDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess crowspairs_reader_cfg = dict( input_columns=['sent_more', 'sent_less'], - output_column='id', + output_column='label', train_split='test', test_split='test') @@ -26,7 +27,7 @@ crowspairs_infer_cfg = dict( crowspairs_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) crowspairs_datasets = [ diff --git a/configs/datasets/crowspairs/crowspairs_ppl_47f211.py b/configs/datasets/crowspairs/crowspairs_ppl_47f211.py index 2f686ba4..822276a9 100644 --- a/configs/datasets/crowspairs/crowspairs_ppl_47f211.py +++ b/configs/datasets/crowspairs/crowspairs_ppl_47f211.py @@ -6,7 +6,7 @@ from opencompass.datasets import crowspairsDataset crowspairs_reader_cfg = dict( input_columns=['sent_more', 'sent_less'], - output_column='id', + output_column='label', train_split='test', test_split='test') diff --git a/configs/datasets/crowspairs/crowspairs_ppl_e811e1.py b/configs/datasets/crowspairs/crowspairs_ppl_e811e1.py index 748ea374..76a34908 100644 --- a/configs/datasets/crowspairs/crowspairs_ppl_e811e1.py +++ b/configs/datasets/crowspairs/crowspairs_ppl_e811e1.py @@ -6,7 +6,7 @@ from 
opencompass.datasets import crowspairsDataset crowspairs_reader_cfg = dict( input_columns=['sent_more', 'sent_less'], - output_column='id', + output_column='label', train_split='test', test_split='test') diff --git a/configs/datasets/govrepcrs/govrepcrs_gen_aa5eb3.py b/configs/datasets/govrepcrs/govrepcrs_gen_aa5eb3.py index 9768c6e0..73a97776 100644 --- a/configs/datasets/govrepcrs/govrepcrs_gen_aa5eb3.py +++ b/configs/datasets/govrepcrs/govrepcrs_gen_aa5eb3.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import BleuEvaluator from opencompass.datasets import GovRepcrsDataset +from opencompass.utils.text_postprocessors import general_cn_postprocess govrepcrs_reader_cfg = dict( input_columns='content', @@ -21,8 +22,8 @@ govrepcrs_infer_cfg = dict( govrepcrs_eval_cfg = dict( evaluator=dict(type=BleuEvaluator), - pred_postprocessor=dict(type='general_cn'), - dataset_postprocessor=dict(type='general_cn')) + pred_postprocessor=dict(type=general_cn_postprocess), + dataset_postprocessor=dict(type=general_cn_postprocess)) govrepcrs_datasets = [ dict( diff --git a/configs/datasets/govrepcrs/govrepcrs_gen_db7930.py b/configs/datasets/govrepcrs/govrepcrs_gen_db7930.py index 9af1402a..d59d023f 100644 --- a/configs/datasets/govrepcrs/govrepcrs_gen_db7930.py +++ b/configs/datasets/govrepcrs/govrepcrs_gen_db7930.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import BleuEvaluator from opencompass.datasets import GovRepcrsDataset +from opencompass.utils.text_postprocessors import general_cn_postprocess govrepcrs_reader_cfg = dict( input_columns='content', @@ -33,8 +34,8 @@ govrepcrs_infer_cfg = dict( govrepcrs_eval_cfg = dict( evaluator=dict(type=BleuEvaluator), pred_role='BOT', - pred_postprocessor=dict(type='general_cn'), - dataset_postprocessor=dict(type='general_cn')) + pred_postprocessor=dict(type=general_cn_postprocess), + dataset_postprocessor=dict(type=general_cn_postprocess)) govrepcrs_datasets = [ dict( diff --git a/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py b/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py index f8e9fd42..2bcb9c6f 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py +++ b/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset +from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') @@ -26,8 +26,8 @@ gsm8k_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type='gsm8k'), - dataset_postprocessor=dict(type='gsm8k_dataset')) + pred_postprocessor=dict(type=gsm8k_postprocess), + dataset_postprocessor=dict(type=gsm8k_dataset_postprocess)) gsm8k_datasets = [ dict( diff --git a/configs/datasets/gsm8k/gsm8k_gen_1dce88.py b/configs/datasets/gsm8k/gsm8k_gen_1dce88.py index 36052772..0e146a48 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_1dce88.py +++ b/configs/datasets/gsm8k/gsm8k_gen_1dce88.py @@ -2,7 +2,7 @@ from 
opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset +from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') @@ -73,8 +73,8 @@ Question: {question}{answer} gsm8k_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type='gsm8k'), - dataset_postprocessor=dict(type='gsm8k_dataset')) + pred_postprocessor=dict(type=gsm8k_postprocess), + dataset_postprocessor=dict(type=gsm8k_dataset_postprocess)) gsm8k_datasets = [ dict( diff --git a/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py b/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py index 858ee28b..f351c901 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py +++ b/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset +from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') @@ -36,8 +36,8 @@ gsm8k_infer_cfg = dict( gsm8k_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type='gsm8k'), - dataset_postprocessor=dict(type='gsm8k_dataset')) + pred_postprocessor=dict(type=gsm8k_postprocess), + dataset_postprocessor=dict(type=gsm8k_dataset_postprocess)) gsm8k_datasets = [ dict( diff --git a/configs/datasets/hellaswag/hellaswag_gen_6faab5.py b/configs/datasets/hellaswag/hellaswag_gen_6faab5.py index 5fc8d799..18f76f04 100644 --- a/configs/datasets/hellaswag/hellaswag_gen_6faab5.py +++ b/configs/datasets/hellaswag/hellaswag_gen_6faab5.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import hellaswagDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess hellaswag_reader_cfg = dict( input_columns=["ctx", "A", "B", "C", "D"], @@ -30,7 +31,7 @@ hellaswag_infer_cfg = dict( hellaswag_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) hellaswag_datasets = [ diff --git a/configs/datasets/humaneval/humaneval_gen_6f294d.py b/configs/datasets/humaneval/humaneval_gen_6f294d.py index 68dc6d36..a0a991a3 100644 --- a/configs/datasets/humaneval/humaneval_gen_6f294d.py +++ b/configs/datasets/humaneval/humaneval_gen_6f294d.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset, HumanEvaluator +from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -27,7 +27,7 @@ humaneval_eval_cfg = dict( evaluator=dict(type=HumanEvaluator), 
pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval - pred_postprocessor=dict(type='humaneval'), + pred_postprocessor=dict(type=humaneval_postprocess), ) humaneval_datasets = [ diff --git a/configs/datasets/humaneval/humaneval_gen_8e312c.py b/configs/datasets/humaneval/humaneval_gen_8e312c.py index 0b71e8e8..cd8421b6 100644 --- a/configs/datasets/humaneval/humaneval_gen_8e312c.py +++ b/configs/datasets/humaneval/humaneval_gen_8e312c.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset, HumanEvaluator +from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -22,7 +22,7 @@ humaneval_eval_cfg = dict( evaluator=dict(type=HumanEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval - pred_postprocessor=dict(type='humaneval'), + pred_postprocessor=dict(type=humaneval_postprocess), ) humaneval_datasets = [ diff --git a/configs/datasets/humaneval/humaneval_gen_fd5822.py b/configs/datasets/humaneval/humaneval_gen_fd5822.py index 3bb178c5..9b28d30f 100644 --- a/configs/datasets/humaneval/humaneval_gen_fd5822.py +++ b/configs/datasets/humaneval/humaneval_gen_fd5822.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset, HumanEvaluator +from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -17,7 +17,7 @@ humaneval_infer_cfg = dict( humaneval_eval_cfg = dict( evaluator=dict(type=HumanEvaluator), k=[1, 10, 100], # the parameter only for humaneval - pred_postprocessor=dict(type='humaneval'), + pred_postprocessor=dict(type=humaneval_postprocess), ) humaneval_datasets = [ diff --git a/configs/datasets/humaneval/humaneval_gen_ff7054.py b/configs/datasets/humaneval/humaneval_gen_ff7054.py index 4e3e6d78..845a5eda 100644 --- a/configs/datasets/humaneval/humaneval_gen_ff7054.py +++ b/configs/datasets/humaneval/humaneval_gen_ff7054.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset, HumanEvaluator +from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -27,7 +27,7 @@ humaneval_eval_cfg = dict( evaluator=dict(type=HumanEvaluator), pred_role='BOT', k=[1, 10, 100], # the parameter only for humaneval - pred_postprocessor=dict(type='humaneval'), + pred_postprocessor=dict(type=humaneval_postprocess), ) humaneval_datasets = [ diff --git a/configs/datasets/iwslt2017/iwslt2017_gen_69ce16.py b/configs/datasets/iwslt2017/iwslt2017_gen_69ce16.py index 3b51fba9..1ecb30a6 100644 --- a/configs/datasets/iwslt2017/iwslt2017_gen_69ce16.py +++ b/configs/datasets/iwslt2017/iwslt2017_gen_69ce16.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import BM25Retriever from opencompass.openicl.icl_inferencer import 
GenInferencer from opencompass.openicl.icl_evaluator import BleuEvaluator from opencompass.datasets import IWSLT2017Dataset +from opencompass.utils.text_postprocessors import general_cn_postprocess iwslt2017_reader_cfg = dict( input_columns='en', output_column='de', train_split='validation') @@ -15,10 +16,10 @@ iwslt2017_infer_cfg = dict( inferencer=dict(type=GenInferencer)) iwslt2017_eval_cfg = dict( - evaluator=dict(type=BleuEvaluator), - pred_role='BOT', - pred_postprocessor=dict(type='general_cn'), - dataset_postprocessor=dict(type='general_cn')) + evaluator=dict(type=BleuEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=general_cn_postprocess), + dataset_postprocessor=dict(type=general_cn_postprocess)) iwslt2017_datasets = [ dict( @@ -28,4 +29,4 @@ iwslt2017_datasets = [ reader_cfg=iwslt2017_reader_cfg, infer_cfg=iwslt2017_infer_cfg, eval_cfg=iwslt2017_eval_cfg) -] \ No newline at end of file +] diff --git a/configs/datasets/iwslt2017/iwslt2017_gen_b4a814.py b/configs/datasets/iwslt2017/iwslt2017_gen_b4a814.py index 1d50d397..c27a7434 100644 --- a/configs/datasets/iwslt2017/iwslt2017_gen_b4a814.py +++ b/configs/datasets/iwslt2017/iwslt2017_gen_b4a814.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import BM25Retriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import BleuEvaluator from opencompass.datasets import IWSLT2017Dataset +from opencompass.utils.text_postprocessors import general_cn_postprocess iwslt2017_reader_cfg = dict( input_columns='en', output_column='de', train_split='validation') @@ -24,10 +25,10 @@ iwslt2017_infer_cfg = dict( inferencer=dict(type=GenInferencer)) iwslt2017_eval_cfg = dict( - evaluator=dict(type=BleuEvaluator), - pred_role='BOT', - pred_postprocessor=dict(type='general_cn'), - dataset_postprocessor=dict(type='general_cn')) + evaluator=dict(type=BleuEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=general_cn_postprocess), + dataset_postprocessor=dict(type=general_cn_postprocess)) iwslt2017_datasets = [ dict( @@ -37,4 +38,4 @@ iwslt2017_datasets = [ reader_cfg=iwslt2017_reader_cfg, infer_cfg=iwslt2017_infer_cfg, eval_cfg=iwslt2017_eval_cfg) -] \ No newline at end of file +] diff --git a/configs/datasets/iwslt2017/iwslt2017_gen_d0ebd1.py b/configs/datasets/iwslt2017/iwslt2017_gen_d0ebd1.py index a696c372..c590c889 100644 --- a/configs/datasets/iwslt2017/iwslt2017_gen_d0ebd1.py +++ b/configs/datasets/iwslt2017/iwslt2017_gen_d0ebd1.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import BM25Retriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import BleuEvaluator from opencompass.datasets import IWSLT2017Dataset +from opencompass.utils.text_postprocessors import general_cn_postprocess iwslt2017_reader_cfg = dict( input_columns='en', output_column='de', train_split='validation') @@ -22,10 +23,10 @@ iwslt2017_infer_cfg = dict( inferencer=dict(type=GenInferencer)) iwslt2017_eval_cfg = dict( - evaluator=dict(type=BleuEvaluator), - pred_role='BOT', - pred_postprocessor=dict(type='general_cn'), - dataset_postprocessor=dict(type='general_cn')) + evaluator=dict(type=BleuEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type=general_cn_postprocess), + dataset_postprocessor=dict(type=general_cn_postprocess)) iwslt2017_datasets = [ dict( @@ -35,4 +36,4 @@ iwslt2017_datasets = [ reader_cfg=iwslt2017_reader_cfg, infer_cfg=iwslt2017_infer_cfg, eval_cfg=iwslt2017_eval_cfg) -] \ No newline at end of file +] diff 
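The humaneval configs above keep the context line "k=[1, 10, 100], # the parameter only for humaneval", which suggests HumanEvaluator reports pass@k. A minimal sketch of the standard unbiased pass@k estimator from the HumanEval paper, assuming (not confirmed by this patch) that HumanEvaluator follows it, with n generated samples per task of which c pass the unit tests:

import math

def pass_at_k(n: int, c: int, k: int) -> float:
    # pass@k = 1 - C(n-c, k) / C(n, k), expanded as a numerically stable product.
    if n - c < k:  # every size-k subset contains at least one passing sample
        return 1.0
    return 1.0 - math.prod((n - c - i) / (n - i) for i in range(k))

print(pass_at_k(n=200, c=20, k=1))  # 0.1, i.e. the plain passing fraction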
--git a/configs/datasets/jigsawmultilingual/jigsawmultilingual_ppl_1af0ae.py b/configs/datasets/jigsawmultilingual/jigsawmultilingual_ppl_1af0ae.py index 62a2d727..9b2efbab 100644 --- a/configs/datasets/jigsawmultilingual/jigsawmultilingual_ppl_1af0ae.py +++ b/configs/datasets/jigsawmultilingual/jigsawmultilingual_ppl_1af0ae.py @@ -33,8 +33,8 @@ for _l in lang: dict( abbr=f'jigsaw_multilingual_{_l}', type=JigsawMultilingualDataset, - path='data/test.csv', - label='data/test_labels.csv', + path='data/jigsawmultilingual/test.csv', + label='data/jigsawmultilingual/test_labels.csv', lang=_l, reader_cfg=jigsawmultilingual_reader_cfg, infer_cfg=jigsawmultilingual_infer_cfg, diff --git a/configs/datasets/jigsawmultilingual/jigsawmultilingual_ppl_fe50d8.py b/configs/datasets/jigsawmultilingual/jigsawmultilingual_ppl_fe50d8.py index d441c7dd..f366a101 100644 --- a/configs/datasets/jigsawmultilingual/jigsawmultilingual_ppl_fe50d8.py +++ b/configs/datasets/jigsawmultilingual/jigsawmultilingual_ppl_fe50d8.py @@ -37,8 +37,8 @@ for _l in lang: dict( abbr=f'jigsaw_multilingual_{_l}', type=JigsawMultilingualDataset, - path='data/test.csv', - label='data/test_labels.csv', + path='data/jigsawmultilingual/test.csv', + label='data/jigsawmultilingual/test_labels.csv', lang=_l, reader_cfg=jigsawmultilingual_reader_cfg, infer_cfg=jigsawmultilingual_infer_cfg, diff --git a/configs/datasets/lcsts/lcsts_gen_8ee1fe.py b/configs/datasets/lcsts/lcsts_gen_8ee1fe.py index ec7a1c7b..923243b2 100644 --- a/configs/datasets/lcsts/lcsts_gen_8ee1fe.py +++ b/configs/datasets/lcsts/lcsts_gen_8ee1fe.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import RougeEvaluator -from opencompass.datasets import LCSTSDataset +from opencompass.datasets import LCSTSDataset, lcsts_postprocess lcsts_reader_cfg = dict(input_columns=['content'], output_column='abst') @@ -18,7 +18,7 @@ lcsts_infer_cfg = dict( lcsts_eval_cfg = dict( evaluator=dict(type=RougeEvaluator), pred_role='BOT', - pred_postprocessor=dict(type='lcsts'), + pred_postprocessor=dict(type=lcsts_postprocess), ) lcsts_datasets = [ diff --git a/configs/datasets/lcsts/lcsts_gen_9b0b89.py b/configs/datasets/lcsts/lcsts_gen_9b0b89.py index 428ef1b6..49627f3f 100644 --- a/configs/datasets/lcsts/lcsts_gen_9b0b89.py +++ b/configs/datasets/lcsts/lcsts_gen_9b0b89.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import RougeEvaluator -from opencompass.datasets import LCSTSDataset +from opencompass.datasets import LCSTSDataset, lcsts_postprocess lcsts_reader_cfg = dict(input_columns=['content'], output_column='abst') @@ -14,7 +14,7 @@ lcsts_infer_cfg = dict( lcsts_eval_cfg = dict( evaluator=dict(type=RougeEvaluator), - pred_postprocessor=dict(type='lcsts'), + pred_postprocessor=dict(type=lcsts_postprocess), ) lcsts_datasets = [ diff --git a/configs/datasets/math/math_gen.py b/configs/datasets/math/math_gen.py index cc3aca76..f3e23068 100644 --- a/configs/datasets/math/math_gen.py +++ b/configs/datasets/math/math_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .math_gen_3e92f6 import math_datasets # noqa: F401, F403 + from .math_gen_265cce import math_datasets # noqa: 
F401, F403 diff --git a/configs/datasets/math/math_gen_3e92f6.py b/configs/datasets/math/math_gen_265cce.py similarity index 67% rename from configs/datasets/math/math_gen_3e92f6.py rename to configs/datasets/math/math_gen_265cce.py index 9fb9a15c..2312fc61 100644 --- a/configs/datasets/math/math_gen_3e92f6.py +++ b/configs/datasets/math/math_gen_265cce.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import MATHDataset, MATHEvaluator +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess math_reader_cfg = dict(input_columns=['problem'], output_column='solution') @@ -12,12 +12,12 @@ math_infer_cfg = dict( dict( role="HUMAN", prompt= - "Problem:\nFind the domain of the expression $\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:" + "Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:" ), dict( role="BOT", prompt= - "The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct." + "The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n" ), dict( role="HUMAN", @@ -27,7 +27,7 @@ math_infer_cfg = dict( dict( role="BOT", prompt= - "We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct." + "We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n" ), dict( role="HUMAN", @@ -37,17 +37,17 @@ math_infer_cfg = dict( dict( role="BOT", prompt= - "If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \begin{{align*}} 30n&=480\\ \Rightarrow\qquad n&=480/30=\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct." + "If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n" ), dict( role="HUMAN", prompt= - "Problem:\nIf the system of equations: \begin{{align*}} 6x-4y&=a,\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:" + "Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. 
\end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:" ), dict( role="BOT", prompt= - "If we multiply the first equation by $-\frac{{3}}{{2}}$, we obtain $$6y-9x=-\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\frac{{3}}{{2}}a=b\Rightarrow\frac{{a}}{{b}}=\boxed{{-\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\frac{{2}}{{3}}$. I hope it is correct." + "If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n" ), dict(role="HUMAN", prompt="Problem:\n{problem}\nSolution:\n"), ])), @@ -55,7 +55,7 @@ math_infer_cfg = dict( inferencer=dict(type=GenInferencer, max_out_len=512)) math_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type='math')) + evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess)) math_datasets = [ dict( diff --git a/configs/datasets/math/math_gen_01261e.py b/configs/datasets/math/math_gen_559593.py similarity index 67% rename from configs/datasets/math/math_gen_01261e.py rename to configs/datasets/math/math_gen_559593.py index ddd8bae6..3fbdadbd 100644 --- a/configs/datasets/math/math_gen_01261e.py +++ b/configs/datasets/math/math_gen_559593.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import MATHDataset, MATHEvaluator +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess math_reader_cfg = dict(input_columns=['problem'], output_column='solution') @@ -9,28 +9,28 @@ math_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, template='''Problem: -Find the domain of the expression $\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}} +Find the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}} Solution: -The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\boxed{{[2,5)}}$. +The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$. Final Answer: The final answer is $[2,5)$. I hope it is correct. Problem: If $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$ Solution: -We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \boxed{{24}}.$ +We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$ Final Answer: The final answer is $24$. I hope it is correct. Problem: Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight? Solution: -If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. 
If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \begin{{align*}} 30n&=480\\ \Rightarrow\qquad n&=480/30=\boxed{{16}} \end{{align*}} +If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}} Final Answer: The final answer is $16$. I hope it is correct. Problem: -If the system of equations: \begin{{align*}} 6x-4y&=a,\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\frac{{a}}{{b}},$ assuming $b$ is nonzero. +If the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero. Solution: -If we multiply the first equation by $-\frac{{3}}{{2}}$, we obtain $$6y-9x=-\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\frac{{3}}{{2}}a=b\Rightarrow\frac{{a}}{{b}}=\boxed{{-\frac{{2}}{{3}}}}.$$ -Final Answer: The final answer is $-\frac{{2}}{{3}}$. I hope it is correct. +If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$ +Final Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct. Problem: {problem} @@ -40,7 +40,7 @@ Solution: inferencer=dict(type=GenInferencer, max_out_len=512)) math_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type='math')) + evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess)) math_datasets = [ dict( diff --git a/configs/datasets/math/math_gen_664168.py b/configs/datasets/math/math_gen_5e8458.py similarity index 67% rename from configs/datasets/math/math_gen_664168.py rename to configs/datasets/math/math_gen_5e8458.py index 9e3a6616..9a6bf866 100644 --- a/configs/datasets/math/math_gen_664168.py +++ b/configs/datasets/math/math_gen_5e8458.py @@ -1,34 +1,34 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import MATHDataset, MATHEvaluator +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess math_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, template='''Problem: -Find the domain of the expression $\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}} +Find the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}} Solution: -The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\boxed{{[2,5)}}$. +The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$. Final Answer: The final answer is $[2,5)$. I hope it is correct. 
Problem: If $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$ Solution: -We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \boxed{{24}}.$ +We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$ Final Answer: The final answer is $24$. I hope it is correct. Problem: Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight? Solution: -If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \begin{{align*}} 30n&=480\\ \Rightarrow\qquad n&=480/30=\boxed{{16}} \end{{align*}} +If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}} Final Answer: The final answer is $16$. I hope it is correct. Problem: -If the system of equations: \begin{{align*}} 6x-4y&=a,\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\frac{{a}}{{b}},$ assuming $b$ is nonzero. +If the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero. Solution: -If we multiply the first equation by $-\frac{{3}}{{2}}$, we obtain $$6y-9x=-\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\frac{{3}}{{2}}a=b\Rightarrow\frac{{a}}{{b}}=\boxed{{-\frac{{2}}{{3}}}}.$$ -Final Answer: The final answer is $-\frac{{2}}{{3}}$. I hope it is correct. +If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$ +Final Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct. 
Problem: {problem}Solution: @@ -37,7 +37,7 @@ Problem: inferencer=dict(type=GenInferencer, max_out_len=512)) math_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type='math')) + evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess)) math_datasets = [ dict( diff --git a/configs/datasets/mbpp/mbpp_gen_1e1056.py b/configs/datasets/mbpp/mbpp_gen_1e1056.py index 234ffd96..2add7a61 100644 --- a/configs/datasets/mbpp/mbpp_gen_1e1056.py +++ b/configs/datasets/mbpp/mbpp_gen_1e1056.py @@ -4,7 +4,7 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import MBPPDataset, MBPPEvaluator mbpp_reader_cfg = dict( - input_columns=['text', 'test_list'], output_column='code') + input_columns=['text', 'test_list'], output_column='test_list_2') mbpp_infer_cfg = dict( prompt_template=dict( diff --git a/configs/datasets/mbpp/mbpp_gen_6590b0.py b/configs/datasets/mbpp/mbpp_gen_6590b0.py index c4948f34..ba9574db 100644 --- a/configs/datasets/mbpp/mbpp_gen_6590b0.py +++ b/configs/datasets/mbpp/mbpp_gen_6590b0.py @@ -4,7 +4,7 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import MBPPDataset, MBPPEvaluator mbpp_reader_cfg = dict( - input_columns=['text', 'test_list'], output_column='code') + input_columns=['text', 'test_list'], output_column='test_list_2') mbpp_infer_cfg = dict( prompt_template=dict( diff --git a/configs/datasets/mbpp/mbpp_gen_78c1bc.py b/configs/datasets/mbpp/mbpp_gen_78c1bc.py index 18facf02..f69ba735 100644 --- a/configs/datasets/mbpp/mbpp_gen_78c1bc.py +++ b/configs/datasets/mbpp/mbpp_gen_78c1bc.py @@ -4,7 +4,7 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.datasets import MBPPDataset, MBPPEvaluator mbpp_reader_cfg = dict( - input_columns=['text', 'test_list'], output_column='code') + input_columns=['text', 'test_list'], output_column='test_list_2') mbpp_infer_cfg = dict( prompt_template=dict( diff --git a/configs/datasets/mmlu/mmlu_gen_23a9a9.py b/configs/datasets/mmlu/mmlu_gen_23a9a9.py index 4a079d34..53595b3b 100644 --- a/configs/datasets/mmlu/mmlu_gen_23a9a9.py +++ b/configs/datasets/mmlu/mmlu_gen_23a9a9.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import FixKRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess # None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader # Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar @@ -33,7 +34,7 @@ mmlu_infer_cfg = dict( mmlu_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type='first-capital')) + pred_postprocessor=dict(type=first_capital_postprocess)) mmlu_all_sets = [ "college_biology", diff --git a/configs/datasets/mmlu/mmlu_gen_5d1409.py b/configs/datasets/mmlu/mmlu_gen_5d1409.py index cf2c95a7..8a925d42 100644 --- a/configs/datasets/mmlu/mmlu_gen_5d1409.py +++ b/configs/datasets/mmlu/mmlu_gen_5d1409.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import FixKRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess # None of the mmlu dataset in huggingface is correctly 
parsed, so we use our own dataset reader # Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar @@ -107,7 +108,7 @@ for _name in mmlu_all_sets: mmlu_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type="first-capital")) + pred_postprocessor=dict(type=first_capital_postprocess)) mmlu_datasets.append( dict( diff --git a/configs/datasets/mmlu/mmlu_gen_79e572.py b/configs/datasets/mmlu/mmlu_gen_79e572.py index 32730a0e..eabab8e7 100644 --- a/configs/datasets/mmlu/mmlu_gen_79e572.py +++ b/configs/datasets/mmlu/mmlu_gen_79e572.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import FixKRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess # None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader # Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar @@ -92,7 +93,7 @@ for _name in mmlu_all_sets: mmlu_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) mmlu_datasets.append( diff --git a/configs/datasets/mmlu/mmlu_gen_a484b3.py b/configs/datasets/mmlu/mmlu_gen_a484b3.py index d998ad95..93406ea6 100644 --- a/configs/datasets/mmlu/mmlu_gen_a484b3.py +++ b/configs/datasets/mmlu/mmlu_gen_a484b3.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import FixKRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess # None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader # Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar @@ -107,7 +108,7 @@ for _name in mmlu_all_sets: mmlu_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type="first-capital")) + pred_postprocessor=dict(type=first_capital_postprocess)) mmlu_datasets.append( dict( diff --git a/configs/datasets/nq/nq_gen_c788f6.py b/configs/datasets/nq/nq_gen_c788f6.py new file mode 100644 index 00000000..ce31b02e --- /dev/null +++ b/configs/datasets/nq/nq_gen_c788f6.py @@ -0,0 +1,30 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import NaturalQuestionDataset, NQEvaluator + +nq_reader_cfg = dict( + input_columns=['question'], output_column='answer', train_split='test') + +nq_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Answer these questions, your answer should be as simple as possible, start your answer with the prompt \'The answer is \'.\nQ: {question}?'), + dict(role='BOT', prompt='A:'), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +nq_eval_cfg = dict(evaluator=dict(type=NQEvaluator), pred_role="BOT") + +nq_datasets = [ + dict( + type=NaturalQuestionDataset, + abbr='nq', + path='./data/nq/', + reader_cfg=nq_reader_cfg, + infer_cfg=nq_infer_cfg, + eval_cfg=nq_eval_cfg) +] diff --git a/configs/datasets/obqa/obqa_gen_9069e4.py 
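A change repeated throughout this patch swaps registry-string postprocessors such as "first-capital" for imported callables like first_capital_postprocess. As a rough sketch of what that postprocessor plausibly does (assumption: it keeps the first uppercase letter of the prediction, which reduces a multiple-choice answer such as "B. Because ..." to "B"):

def first_capital_postprocess(text: str) -> str:
    # Return the first uppercase character in the prediction, or '' if none.
    for char in text:
        if char.isupper():
            return char
    return ''

assert first_capital_postprocess('B. Because hamsters are prey.') == 'B'
assert first_capital_postprocess('the answer is (C)') == 'C'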
b/configs/datasets/obqa/obqa_gen_9069e4.py index a9ef4f41..5183c3d7 100644 --- a/configs/datasets/obqa/obqa_gen_9069e4.py +++ b/configs/datasets/obqa/obqa_gen_9069e4.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import OBQADataset +from opencompass.utils.text_postprocessors import first_capital_postprocess _input_columns = [ ["question_stem", "A", "B", "C", "D"], @@ -54,7 +55,7 @@ for _i in range(2): obqa_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) obqa_datasets[_i]["reader_cfg"] = obqa_reader_cfg diff --git a/configs/datasets/piqa/piqa_gen_1194eb.py b/configs/datasets/piqa/piqa_gen_1194eb.py index 09ce5289..e4ba2257 100644 --- a/configs/datasets/piqa/piqa_gen_1194eb.py +++ b/configs/datasets/piqa/piqa_gen_1194eb.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import piqaDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess piqa_reader_cfg = dict( input_columns=["goal", "sol1", "sol2"], @@ -24,7 +25,7 @@ piqa_infer_cfg = dict( piqa_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) piqa_datasets = [ diff --git a/configs/datasets/race/race_gen_69ee4f.py b/configs/datasets/race/race_gen_69ee4f.py index f2b218db..607672d7 100644 --- a/configs/datasets/race/race_gen_69ee4f.py +++ b/configs/datasets/race/race_gen_69ee4f.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import RaceDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess race_reader_cfg = dict( input_columns=['article', 'question', 'A', 'B', 'C', 'D'], @@ -23,7 +24,7 @@ race_infer_cfg = dict( race_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type='first-capital'), + pred_postprocessor=dict(type=first_capital_postprocess), pred_role='BOT') race_datasets = [ diff --git a/configs/datasets/race/race_gen_9302a5.py b/configs/datasets/race/race_gen_9302a5.py index f0f764ea..b6e61391 100644 --- a/configs/datasets/race/race_gen_9302a5.py +++ b/configs/datasets/race/race_gen_9302a5.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import RaceDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess race_reader_cfg = dict( input_columns=['article', 'question', 'A', 'B', 'C', 'D'], @@ -18,7 +19,7 @@ race_infer_cfg = dict( race_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type='first-capital')) + pred_postprocessor=dict(type=first_capital_postprocess)) race_datasets = [ dict( diff --git a/configs/datasets/race/race_ppl.py b/configs/datasets/race/race_ppl.py index 5cb15c60..071c04d5 100644 --- a/configs/datasets/race/race_ppl.py +++ 
b/configs/datasets/race/race_ppl.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .race_ppl_ab8734 import race_datasets # noqa: F401, F403 + from .race_ppl_a138cd import race_datasets # noqa: F401, F403 diff --git a/configs/datasets/race/race_ppl_ab8734.py b/configs/datasets/race/race_ppl_a138cd.py similarity index 88% rename from configs/datasets/race/race_ppl_ab8734.py rename to configs/datasets/race/race_ppl_a138cd.py index 20f4d32b..8b0f36d4 100644 --- a/configs/datasets/race/race_ppl_ab8734.py +++ b/configs/datasets/race/race_ppl_a138cd.py @@ -18,10 +18,9 @@ race_infer_cfg = dict( prompt= "Read the article, and answer the question by replying A, B, C or D.\n\nArticle:\n{article}\n\nQ: {question}\n\nA. {A}\nB. {B}\nC. {C}\nD. {D}" ), - dict(role="BOT", prompt=ans_token), + dict(role="BOT", prompt=f'A: {ans}'), ]) - for ans, ans_token in [["A", "{A}"], ["B", "{B}"], ["C", "{C}"], - ["D", "{D}"]] + for ans in ['A', 'B', 'C', 'D'] }), retriever=dict(type=ZeroRetriever), inferencer=dict(type=PPLInferencer)) diff --git a/configs/datasets/realtoxicprompts/realtoxicprompts_gen_7605e4.py b/configs/datasets/realtoxicprompts/realtoxicprompts_gen_7605e4.py index 91a5e0e3..b0efb30a 100644 --- a/configs/datasets/realtoxicprompts/realtoxicprompts_gen_7605e4.py +++ b/configs/datasets/realtoxicprompts/realtoxicprompts_gen_7605e4.py @@ -27,8 +27,9 @@ realtoxicprompts_eval_cfg = dict( realtoxicprompts_datasets = [ dict( + abbr='real-toxicity-prompts', type=RealToxicPromptsDataset, - path='allenai/real-toxicity-prompts', + path='data/realtoxicprompts/realtoxicprompts_train.arrow', challenging_subset=True, reader_cfg=realtoxicprompts_reader_cfg, infer_cfg=realtoxicprompts_infer_cfg, diff --git a/configs/datasets/realtoxicprompts/realtoxicprompts_gen_ac723c.py b/configs/datasets/realtoxicprompts/realtoxicprompts_gen_ac723c.py index 07246639..12fda746 100644 --- a/configs/datasets/realtoxicprompts/realtoxicprompts_gen_ac723c.py +++ b/configs/datasets/realtoxicprompts/realtoxicprompts_gen_ac723c.py @@ -25,8 +25,9 @@ realtoxicprompts_eval_cfg = dict( realtoxicprompts_datasets = [ dict( + abbr='real-toxicity-prompts', type=RealToxicPromptsDataset, - path='allenai/real-toxicity-prompts', + path='data/realtoxicprompts/realtoxicprompts_train.arrow', challenging_subset=True, reader_cfg=realtoxicprompts_reader_cfg, infer_cfg=realtoxicprompts_infer_cfg, diff --git a/configs/datasets/siqa/siqa_gen_e78df3.py b/configs/datasets/siqa/siqa_gen_e78df3.py index 9da5de36..98537f2f 100644 --- a/configs/datasets/siqa/siqa_gen_e78df3.py +++ b/configs/datasets/siqa/siqa_gen_e78df3.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import siqaDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess siqa_reader_cfg = dict( input_columns=["context", "question", "answerA", "answerB", "answerC"], @@ -28,7 +29,7 @@ siqa_infer_cfg = dict( siqa_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) siqa_datasets = [ diff --git a/configs/datasets/storycloze/storycloze_gen_7f656a.py b/configs/datasets/storycloze/storycloze_gen_7f656a.py index 54b9563d..77e03d7f 100644 --- a/configs/datasets/storycloze/storycloze_gen_7f656a.py +++ 
b/configs/datasets/storycloze/storycloze_gen_7f656a.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import storyclozeDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess storycloze_reader_cfg = dict( input_columns=["context", "sentence_quiz1", "sentence_quiz2"], @@ -27,7 +28,7 @@ storycloze_infer_cfg = dict( storycloze_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) # The original story cloze dataset and repo are not long maintaining. diff --git a/configs/datasets/strategyqa/strategyqa_gen.py b/configs/datasets/strategyqa/strategyqa_gen.py index 043ee177..d34b2795 100644 --- a/configs/datasets/strategyqa/strategyqa_gen.py +++ b/configs/datasets/strategyqa/strategyqa_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .strategyqa_gen_b3ff20 import strategyqa_datasets # noqa: F401, F403 + from .strategyqa_gen_1180a7 import strategyqa_datasets # noqa: F401, F403 diff --git a/configs/datasets/strategyqa/strategyqa_gen_b3ff20.py b/configs/datasets/strategyqa/strategyqa_gen_1180a7.py similarity index 88% rename from configs/datasets/strategyqa/strategyqa_gen_b3ff20.py rename to configs/datasets/strategyqa/strategyqa_gen_1180a7.py index c5df261f..faecd76f 100644 --- a/configs/datasets/strategyqa/strategyqa_gen_b3ff20.py +++ b/configs/datasets/strategyqa/strategyqa_gen_1180a7.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset +from opencompass.datasets import HFDataset, strategyqa_pred_postprocess, strategyqa_dataset_postprocess strategyqa_reader_cfg = dict( input_columns=['question'], @@ -23,7 +23,7 @@ strategyqa_infer_cfg = dict( dict( role='BOT', prompt= - 'Hamsters are prey animals. Prey are food for predators. Thus, hamsters provide food for some animals.\nSo the answer is yes' + 'Hamsters are prey animals. Prey are food for predators. Thus, hamsters provide food for some animals.\nSo the answer is yes\n' ), dict( role='HUMAN', @@ -33,7 +33,7 @@ strategyqa_infer_cfg = dict( dict( role='BOT', prompt= - 'Brooke Shields went to Princeton University. Princeton University is about as academically rigorous as the University of Pennsylvania. Thus, Brooke Shields could also succeed at the University of Pennsylvania.\nSo the answer is yes' + 'Brooke Shields went to Princeton University. Princeton University is about as academically rigorous as the University of Pennsylvania. Thus, Brooke Shields could also succeed at the University of Pennsylvania.\nSo the answer is yes\n' ), dict( role='HUMAN', @@ -43,7 +43,7 @@ strategyqa_infer_cfg = dict( dict( role='BOT', prompt= - 'Hydrogen has an atomic number of 1. 1 squared is 1. There are 5 Spice Girls. Thus, Hydrogen\'s atomic number squared is less than 5.\nSo the answer is no' + 'Hydrogen has an atomic number of 1. 1 squared is 1. There are 5 Spice Girls. 
Thus, Hydrogen\'s atomic number squared is less than 5.\nSo the answer is no\n' ), dict( role='HUMAN', @@ -53,7 +53,7 @@ strategyqa_infer_cfg = dict( dict( role='BOT', prompt= - 'College commencement ceremonies can happen in December, May, and June. December is in the winter, so there can be frost. Thus, there could be frost at some commencements.\nSo the answer is yes' + 'College commencement ceremonies can happen in December, May, and June. December is in the winter, so there can be frost. Thus, there could be frost at some commencements.\nSo the answer is yes\n' ), dict( role='HUMAN', @@ -63,7 +63,7 @@ strategyqa_infer_cfg = dict( dict( role='BOT', prompt= - 'The War in Vietnam was 6 months. The gestation period for a llama is 11 months, which is more than 6 months. Thus, a llama could not give birth twice during the War in Vietnam.\nSo the answer is no' + 'The War in Vietnam was 6 months. The gestation period for a llama is 11 months, which is more than 6 months. Thus, a llama could not give birth twice during the War in Vietnam.\nSo the answer is no\n' ), dict( role='HUMAN', @@ -71,7 +71,7 @@ strategyqa_infer_cfg = dict( dict( role='BOT', prompt= - 'The density of a pear is about 0.6g/cm3, which is less than water. Objects less dense than water float. Thus, a pear would float.\nSo the answer is no' + 'The density of a pear is about 0.6g/cm3, which is less than water. Objects less dense than water float. Thus, a pear would float.\nSo the answer is no\n' ), dict(role='HUMAN', prompt='Question: {question}\nAnswer:'), ], )), @@ -80,8 +80,8 @@ strategyqa_infer_cfg = dict( strategyqa_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type='strategyqa'), - dataset_postprocessor=dict(type='strategyqa_dataset')) + pred_postprocessor=dict(type=strategyqa_pred_postprocess), + dataset_postprocessor=dict(type=strategyqa_dataset_postprocess)) strategyqa_datasets = [ dict( diff --git a/configs/datasets/strategyqa/strategyqa_gen_934441.py b/configs/datasets/strategyqa/strategyqa_gen_934441.py index f36d72bf..465d8b46 100644 --- a/configs/datasets/strategyqa/strategyqa_gen_934441.py +++ b/configs/datasets/strategyqa/strategyqa_gen_934441.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset +from opencompass.datasets import HFDataset, strategyqa_pred_postprocess, strategyqa_dataset_postprocess strategyqa_reader_cfg = dict( input_columns=['question'], @@ -44,8 +44,8 @@ Q: {question}{answer} strategyqa_eval_cfg = dict( evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type='strategyqa'), - dataset_postprocessor=dict(type='strategyqa_dataset')) + pred_postprocessor=dict(type=strategyqa_pred_postprocess), + dataset_postprocessor=dict(type=strategyqa_dataset_postprocess)) strategyqa_datasets = [ dict( diff --git a/configs/datasets/summedits/summedits_gen.py b/configs/datasets/summedits/summedits_gen.py index 1b569bf3..56dc515a 100644 --- a/configs/datasets/summedits/summedits_gen.py +++ b/configs/datasets/summedits/summedits_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .summedits_gen_4fb38b import summedits_datasets # noqa: F401, F403 + from .summedits_gen_315438 import summedits_datasets # noqa: F401, F403 diff --git a/configs/datasets/summedits/summedits_gen_315438.py 
b/configs/datasets/summedits/summedits_gen_315438.py new file mode 100644 index 00000000..98346854 --- /dev/null +++ b/configs/datasets/summedits/summedits_gen_315438.py @@ -0,0 +1,51 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import SummeditsDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess + +summedits_reader_cfg = dict( + input_columns=['doc', 'summary'], output_column='label') + +summedits_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role="HUMAN", + prompt= + """Given the document below, you have to determine if "Yes" or "No", the summary is factually consistent with the document. + +Document: +{doc} + +Summary: +{summary} + +Question: +Is the summary factually consistent with the document? +A. Yes +B. No + +Answer:""" + ), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +summedits_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role="BOT", + pred_postprocessor=dict(type=first_capital_postprocess), +) + +summedits_datasets = [ + dict( + abbr='summedits', + type=SummeditsDataset_V2, + path='./data/summedits/summedits.jsonl', + reader_cfg=summedits_reader_cfg, + infer_cfg=summedits_infer_cfg, + eval_cfg=summedits_eval_cfg) +] diff --git a/configs/datasets/summedits/summedits_gen_4fb38b.py b/configs/datasets/summedits/summedits_gen_4fb38b.py index dd74c417..22fb9078 100644 --- a/configs/datasets/summedits/summedits_gen_4fb38b.py +++ b/configs/datasets/summedits/summedits_gen_4fb38b.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import SummeditsDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess summedits_reader_cfg = dict( input_columns=['doc', 'summary'], output_column='label') @@ -23,7 +24,7 @@ summedits_infer_cfg = dict( summedits_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + pred_postprocessor=dict(type=first_capital_postprocess), ) summedits_datasets = [ diff --git a/configs/datasets/summscreen/summscreen_gen_653185.py b/configs/datasets/summscreen/summscreen_gen_653185.py index 5f5b8bd7..92331fe6 100644 --- a/configs/datasets/summscreen/summscreen_gen_653185.py +++ b/configs/datasets/summscreen/summscreen_gen_653185.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import BleuEvaluator from opencompass.datasets import SummScreenDataset +from opencompass.utils.text_postprocessors import general_cn_postprocess summscreen_reader_cfg = dict( input_columns='content', @@ -33,8 +34,8 @@ summscreen_infer_cfg = dict( summscreen_eval_cfg = dict( evaluator=dict(type=BleuEvaluator), pred_role='BOT', - pred_postprocessor=dict(type='general_cn'), - dataset_postprocessor=dict(type='general_cn')) + pred_postprocessor=dict(type=general_cn_postprocess), + dataset_postprocessor=dict(type=general_cn_postprocess)) summscreen_datasets = [ dict( diff --git a/configs/datasets/summscreen/summscreen_gen_aa5eb3.py 
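Beyond readability, passing the function object itself changes when errors surface: a misspelled registry string is only caught when the postprocessor is looked up during evaluation, while a bad import fails immediately at config-load time. A hypothetical sketch of the dispatch; the registry name TEXT_POSTPROCESSORS here is an illustrative assumption, not the framework's confirmed API:

from typing import Callable, Dict, Union

# Stand-in for a string-keyed postprocessor registry.
TEXT_POSTPROCESSORS: Dict[str, Callable[[str], str]] = {}

def resolve_postprocessor(cfg: dict) -> Callable[[str], str]:
    proc: Union[str, Callable[[str], str]] = cfg['type']
    if callable(proc):
        # New style: the config already holds the function object.
        return proc
    # Legacy style: a bare string like "first-capital" is only resolved
    # against the registry once evaluation starts.
    return TEXT_POSTPROCESSORS[proc]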
b/configs/datasets/summscreen/summscreen_gen_aa5eb3.py index c1729348..0ea3cb0a 100644 --- a/configs/datasets/summscreen/summscreen_gen_aa5eb3.py +++ b/configs/datasets/summscreen/summscreen_gen_aa5eb3.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import BleuEvaluator from opencompass.datasets import SummScreenDataset +from opencompass.utils.text_postprocessors import general_cn_postprocess summscreen_reader_cfg = dict( input_columns='content', @@ -21,8 +22,8 @@ summscreen_infer_cfg = dict( summscreen_eval_cfg = dict( evaluator=dict(type=BleuEvaluator), - pred_postprocessor=dict(type='general_cn'), - dataset_postprocessor=dict(type='general_cn')) + pred_postprocessor=dict(type=general_cn_postprocess), + dataset_postprocessor=dict(type=general_cn_postprocess)) summscreen_datasets = [ dict( diff --git a/configs/datasets/triviaqa/triviaqa_gen.py b/configs/datasets/triviaqa/triviaqa_gen.py index 6df39b2e..c28208a6 100644 --- a/configs/datasets/triviaqa/triviaqa_gen.py +++ b/configs/datasets/triviaqa/triviaqa_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .triviaqa_gen_3e39a5 import triviaqa_datasets # noqa: F401, F403 + from .triviaqa_gen_2121ce import triviaqa_datasets # noqa: F401, F403 diff --git a/configs/datasets/triviaqa/triviaqa_gen_2121ce.py b/configs/datasets/triviaqa/triviaqa_gen_2121ce.py new file mode 100644 index 00000000..9e5ed279 --- /dev/null +++ b/configs/datasets/triviaqa/triviaqa_gen_2121ce.py @@ -0,0 +1,34 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import TriviaQADataset, TriviaQAEvaluator + +triviaqa_reader_cfg = dict( + input_columns=['question'], + output_column='answer', + train_split='dev', + test_split='dev') + +triviaqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Answer these questions, your answer should be as simple as possible, start your answer with the prompt \'The answer is \'.\nQ: {question}?'), + dict(role='BOT', prompt='A:'), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=50)) + +triviaqa_eval_cfg = dict( + evaluator=dict(type=TriviaQAEvaluator), pred_role='BOT') + +triviaqa_datasets = [ + dict( + type=TriviaQADataset, + abbr='triviaqa', + path='./data/triviaqa/', + reader_cfg=triviaqa_reader_cfg, + infer_cfg=triviaqa_infer_cfg, + eval_cfg=triviaqa_eval_cfg) +] diff --git a/configs/datasets/winogrande/winogrande_gen_a9ede5.py b/configs/datasets/winogrande/winogrande_gen_a9ede5.py index e86eb8e6..2cb147cf 100644 --- a/configs/datasets/winogrande/winogrande_gen_a9ede5.py +++ b/configs/datasets/winogrande/winogrande_gen_a9ede5.py @@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import winograndeDataset_V2 +from opencompass.utils.text_postprocessors import first_capital_postprocess winogrande_reader_cfg = dict( input_columns=["opt1", "opt2"], @@ -27,7 +28,7 @@ winogrande_infer_cfg = dict( winogrande_eval_cfg = dict( evaluator=dict(type=AccEvaluator), pred_role="BOT", - pred_postprocessor=dict(type="first-capital"), + 
pred_postprocessor=dict(type=first_capital_postprocess), ) winogrande_datasets = [ diff --git a/configs/datasets/winogrande/winogrande_ppl.py b/configs/datasets/winogrande/winogrande_ppl.py index a94ac013..7baae648 100644 --- a/configs/datasets/winogrande/winogrande_ppl.py +++ b/configs/datasets/winogrande/winogrande_ppl.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .winogrande_ppl_18e5de import winogrande_datasets # noqa: F401, F403 + from .winogrande_ppl_55a66e import winogrande_datasets # noqa: F401, F403 diff --git a/configs/datasets/winogrande/winogrande_ppl_18e5de.py b/configs/datasets/winogrande/winogrande_ppl_55a66e.py similarity index 96% rename from configs/datasets/winogrande/winogrande_ppl_18e5de.py rename to configs/datasets/winogrande/winogrande_ppl_55a66e.py index 500166c4..fc65fa90 100644 --- a/configs/datasets/winogrande/winogrande_ppl_18e5de.py +++ b/configs/datasets/winogrande/winogrande_ppl_55a66e.py @@ -15,9 +15,9 @@ winogrande_infer_cfg = dict( type=PromptTemplate, template={ i: dict(round=[ - dict(role="HUMAN", prompt=f"Good sentence: {{opt{i+1}}}"), + dict(role="HUMAN", prompt=f"Good sentence: {{opt{i}}}"), ]) - for i in range(2) + for i in range(1, 3) }), retriever=dict(type=ZeroRetriever), inferencer=dict(type=PPLInferencer)) diff --git a/configs/eval_demo.py b/configs/eval_demo.py index ddd1e567..3b5a439a 100644 --- a/configs/eval_demo.py +++ b/configs/eval_demo.py @@ -1,7 +1,9 @@ from opencompass.models import HuggingFaceCausalLM +from mmengine.config import read_base + with read_base(): - from .datasets.piqa.piqa_ppl import piqa_datasets + from .datasets.piqa.piqa_ppl import piqa_datasets datasets = piqa_datasets @@ -24,5 +26,3 @@ models = [ run_cfg=dict(num_gpus=1, num_procs=1), ) ] - - diff --git a/configs/summarizers/groups/GaokaoBench.py b/configs/summarizers/groups/GaokaoBench.py index 4354dc65..30c6beed 100644 --- a/configs/summarizers/groups/GaokaoBench.py +++ b/configs/summarizers/groups/GaokaoBench.py @@ -2,4 +2,5 @@ GaokaoBench_summary_groups = [] # gaokao-bench _GaokaoBench_weights = {'2010-2022_Math_II_MCQs': 1090, '2010-2022_Math_I_MCQs': 1070, '2010-2022_History_MCQs': 1148, '2010-2022_Biology_MCQs': 900, '2010-2022_Political_Science_MCQs': 1280, '2010-2022_Physics_MCQs': 384, '2010-2022_Chemistry_MCQs': 744, '2010-2013_English_MCQs': 105, '2010-2022_Chinese_Modern_Lit': 261, '2010-2022_English_Fill_in_Blanks': 900.0, '2012-2022_English_Cloze_Test': 260, '2010-2022_Geography_MCQs': 380, '2010-2022_English_Reading_Comp': 940, '2010-2022_Chinese_Lang_and_Usage_MCQs': 240} +_GaokaoBench_weights = {'GaokaoBench_' + k: v for k, v in _GaokaoBench_weights.items()} GaokaoBench_summary_groups.append({'name': 'GaokaoBench', 'subsets': list(_GaokaoBench_weights.keys()), 'weights': _GaokaoBench_weights}) diff --git a/configs/summarizers/groups/flores.py b/configs/summarizers/groups/flores.py index 42514afb..79baed79 100644 --- a/configs/summarizers/groups/flores.py +++ b/configs/summarizers/groups/flores.py @@ -23,3 +23,9 @@ for _lang_serie in _flores_lang_map: 'name': f'flores_100_English_{_lang_serie}', 'subsets': [f'flores_100_eng-{lang_name}' for lang_name in _flores_lang_map[_lang_serie]] }) + +flores_summary_groups.append({ + 'name': 'flores_100', + 'subsets': [f'flores_100_{lang_name}-eng' for lang_name in sum(_flores_lang_map.values(), [])] + \ + [f'flores_100_eng-{lang_name}' for lang_name in sum(_flores_lang_map.values(), [])] +}) diff --git a/configs/summarizers/groups/jigsaw_multilingual.py 
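The winogrande PPL rename above is not just cosmetic: the template keys previously ran over `range(2)` (keys 0 and 1) while the prompts referenced `opt1`/`opt2`, so the keys did not line up with the `opt1`/`opt2` columns and the dataset's answer labels. A standalone illustration of the corrected comprehension, with the `PromptTemplate` machinery elided:

```python
# Keys now run 1..2 and match the opt1/opt2 reader columns, so each
# perplexity label points at the corresponding candidate sentence.
template = {
    i: dict(round=[dict(role='HUMAN', prompt=f'Good sentence: {{opt{i}}}')])
    for i in range(1, 3)
}
print(sorted(template))                   # [1, 2]
print(template[1]['round'][0]['prompt'])  # Good sentence: {opt1}
```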
b/configs/summarizers/groups/jigsaw_multilingual.py new file mode 100644 index 00000000..6a7fc504 --- /dev/null +++ b/configs/summarizers/groups/jigsaw_multilingual.py @@ -0,0 +1,6 @@ +jigsaw_multilingual_summary_groups = [] + +# jigsaw multilingual +_jigsaw_multilingual = ['es', 'fr', 'it', 'pt', 'ru', 'tr'] +_jigsaw_multilingual = ['jigsaw_multilingual_' + s for s in _jigsaw_multilingual] +jigsaw_multilingual_summary_groups.append({'name': 'jigsaw_multilingual', 'subsets': _jigsaw_multilingual}) diff --git a/configs/summarizers/medium.py b/configs/summarizers/medium.py index 68ac497e..4772e34f 100644 --- a/configs/summarizers/medium.py +++ b/configs/summarizers/medium.py @@ -7,77 +7,85 @@ with read_base(): from .groups.bbh import bbh_summary_groups from .groups.GaokaoBench import GaokaoBench_summary_groups from .groups.flores import flores_summary_groups + from .groups.jigsaw_multilingual import jigsaw_multilingual_summary_groups summarizer = dict( dataset_abbrs = [ - '--- Exam ---', - 'agieval', - 'mmlu-all-set', + '--------- 考试 Exam ---------', # category + # 'Mixed', # subcategory "ceval", + 'agieval', + 'mmlu', "GaokaoBench", - "bbh", - '--- Coding ---', - 'openai_humaneval', - 'mbpp', - '--- ChineseUniversal ---', - 'C3', - 'CMRC_dev', - 'DRCD_dev', + 'ARC-c', + '--------- 语言 Language ---------', # category + # '字词释义', # subcategory + 'WiC', + 'summedits', + # '成语习语', # subcategory + 'chid-dev', + # '语义相似度', # subcategory 'afqmc-dev', + 'bustm-dev', + # '指代消解', # subcategory + 'cluewsc-dev', + 'WSC', + 'winogrande', + # '翻译', # subcategory + 'flores_100', + '--------- 知识 Knowledge ---------', # category + # '知识问答', # subcategory + 'BoolQ', + 'commonsense_qa', + 'nq', + 'triviaqa', + # '多语种问答', # subcategory + '--------- 推理 Reasoning ---------', # category + # '文本蕴含', # subcategory 'cmnli', 'ocnli', - 'bustm-dev', - 'chid-dev', - 'cluewsc-dev', - 'csl_dev', - 'eprstmt-dev', 'ocnli_fc-dev', - 'tnews-dev', - 'lcsts', - '--- Completion ---', - 'lambada', - 'story_cloze', - '--- EnglishUniversal ---', 'AX_b', 'AX_g', - 'BoolQ', 'CB', - 'COPA', - 'MultiRC', 'RTE', + # '常识推理', # subcategory + 'story_cloze', + 'COPA', 'ReCoRD', - 'WiC', - 'WSC', - 'race-high', - 'race-middle', - '--- NLG ---', - 'Xsum', - '--- Reasoning ---', - 'gsm8k', - 'summedits', - 'math', - 'TheoremQA', - '--- QA ---', 'hellaswag', - 'ARC-e', - 'ARC-c', - 'commonsense_qa', 'piqa', 'siqa', 'strategyqa', - 'winogrande', - 'openbookqa', + # '数学推理', # subcategory + 'math', + 'gsm8k', + # '定理应用', # subcategory + 'TheoremQA', + # '代码', # subcategory + 'openai_humaneval', + 'mbpp', + # '综合推理', # subcategory + "bbh", + '--------- 理解 Understanding ---------', # category + # '阅读理解', # subcategory + 'C3', + 'CMRC_dev', + 'DRCD_dev', + 'MultiRC', + 'race-middle', + 'race-high', 'openbookqa_fact', - 'nq', - 'triviaqa', - '--- Translation ---', - 'flores_100_Indo-European-Germanic_English', - 'flores_100_English_Indo-European-Germanic', - 'flores_100_Indo-European-Romance_English', - 'flores_100_English_Indo-European-Romance', - 'flores_100_zho_simpl-eng', - 'flores_100_eng-zho_simpl', - '--- Security ---', + # '内容总结', # subcategory + 'csl_dev', + 'lcsts', + 'Xsum', + # '内容分析', # subcategory + 'eprstmt-dev', + 'lambada', + 'tnews-dev', + '--------- 安全 Safety ---------', # category + # '偏见', # subcategory 'crows_pairs', ], summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []), diff --git a/configs/summarizers/small.py b/configs/summarizers/small.py index b8a41949..01d97a00 100644 ---
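For readers new to summarizer groups: each group names a set of member datasets and, optionally, per-subset weights, as in the GaokaoBench group above (whose keys are now prefixed to match the reported dataset abbreviations). A hypothetical sketch of how a weighted group score can be aggregated; the subset names, scores, and the weighted-mean rule here are illustrative assumptions rather than the summarizer's actual internals:

```python
# Assumed shape of one weighted summary group, mirroring the configs above.
group = {
    'name': 'GaokaoBench',
    'subsets': ['GaokaoBench_2010-2022_Physics_MCQs',
                'GaokaoBench_2010-2022_Chemistry_MCQs'],
    'weights': {'GaokaoBench_2010-2022_Physics_MCQs': 384,
                'GaokaoBench_2010-2022_Chemistry_MCQs': 744},
}
# Made-up per-subset accuracies for the sake of the example.
scores = {'GaokaoBench_2010-2022_Physics_MCQs': 60.0,
          'GaokaoBench_2010-2022_Chemistry_MCQs': 70.0}

total_weight = sum(group['weights'].values())
group_score = sum(scores[s] * group['weights'][s]
                  for s in group['subsets']) / total_weight
print(round(group_score, 2))  # 66.6
```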
a/configs/summarizers/small.py +++ b/configs/summarizers/small.py @@ -11,7 +11,7 @@ with read_base(): summarizer = dict( dataset_abbrs = [ '--- Exam ---', - 'mmlu-all-set', + 'mmlu', "ceval", "bbh", '--- ChineseUniversal ---', diff --git a/docs/en/get_started.md b/docs/en/get_started.md index 81b4bee1..4918166e 100644 --- a/docs/en/get_started.md +++ b/docs/en/get_started.md @@ -24,7 +24,7 @@ If you want to perform evaluations on the humaneval dataset, follow these steps. ``` git clone https://github.com/openai/human-eval.git cd human-eval -pip install -r requirments.txt +pip install -r requirements.txt pip install -e . cd .. ``` diff --git a/docs/en/notes/contribution_guide.md b/docs/en/notes/contribution_guide.md index 04b395ac..431d3dbb 100644 --- a/docs/en/notes/contribution_guide.md +++ b/docs/en/notes/contribution_guide.md @@ -45,11 +45,10 @@ We use the following tools for linting and formatting: Style configurations of yapf and isort can be found in [setup.cfg](https://github.com/open-mmlab/OpenCompass/blob/main/setup.cfg). - ## Pre-commit Hook We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, -fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirments.txt` automatically on every commit. +fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, sorts `requirements.txt` automatically on every commit. The config for a pre-commit hook is stored in [.pre-commit-config](xxxxxxx). After you clone the repository, you will need to install initialize pre-commit hook. @@ -66,4 +65,4 @@ pre-commit install After this on every commit check code linters and formatter will be enforced. -> Before you create a PR, make sure that your code lints and is formatted by yapf. \ No newline at end of file +> Before you create a PR, make sure that your code lints and is formatted by yapf. diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md index 104d6af5..f21beb99 100644 --- a/docs/zh_cn/get_started.md +++ b/docs/zh_cn/get_started.md @@ -24,7 +24,7 @@ pip install -e . ``` git clone https://github.com/openai/human-eval.git cd human-eval -pip install -r requirments.txt +pip install -r requirements.txt pip install -e . cd .. 
``` diff --git a/docs/zh_cn/notes/contribution_guide.md b/docs/zh_cn/notes/contribution_guide.md index b842ed73..ac7f7d1a 100644 --- a/docs/zh_cn/notes/contribution_guide.md +++ b/docs/zh_cn/notes/contribution_guide.md @@ -48,7 +48,7 @@ yapf和isort的样式配置可以在[setup.cfg](https://github.com/OpenCompass/b ## 预提交钩子 (Pre-commit Hook) 我们使用[预提交钩子](https://pre-commit.com/)用于在每次提交时自动检查与格式化`flake8`、`yapf`、`isort`、`trailing whitespaces`、`markdown files`, -修复`end-of-files`、`double-quoted-strings`、`python-encoding-pragma`、`mixed-line-ending`,并自动排序`requirments.txt`。预提交钩子的配置存储在[.pre-commit-config]()中。 +修复`end-of-files`、`double-quoted-strings`、`python-encoding-pragma`、`mixed-line-ending`,并自动排序`requirements.txt`。预提交钩子的配置存储在[.pre-commit-config](<>)中。 在你克隆仓库后,你需要安装并初始化预提交钩子。 @@ -64,4 +64,4 @@ pre-commit install 之后,在每次提交时都会强制执行代码 linters 和格式化器。 -> 在你创建PR前,确保你的代码通过了 lint 检查并被 yapf 格式化。 \ No newline at end of file +> 在你创建PR前,确保你的代码通过了 lint 检查并被 yapf 格式化。 diff --git a/opencompass/datasets/TheoremQA.py b/opencompass/datasets/TheoremQA.py index fc529a61..b3beb892 100644 --- a/opencompass/datasets/TheoremQA.py +++ b/opencompass/datasets/TheoremQA.py @@ -17,11 +17,10 @@ class TheoremQADataset(BaseDataset): @TEXT_POSTPROCESSORS.register_module('TheoremQA') def TheoremQA_postprocess(text: str) -> str: - - text = text.strip().split('\n')[0].strip() - matches = re.findall(r'answer is (.*)', text) + text = text.strip() + matches = re.findall(r'answer is ([^\s]+)', text) if len(matches) == 0: return text else: - text = matches[0].strip()[:-1] + text = matches[0].strip().strip('.,?!\"\';:') return text diff --git a/opencompass/datasets/cmrc.py b/opencompass/datasets/cmrc.py index ccd83494..bb388976 100644 --- a/opencompass/datasets/cmrc.py +++ b/opencompass/datasets/cmrc.py @@ -2,7 +2,7 @@ import json from datasets import Dataset -from opencompass.registry import LOAD_DATASET +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS from .base import BaseDataset @@ -38,3 +38,10 @@ class CMRCDataset(BaseDataset): }) return dataset + + +@TEXT_POSTPROCESSORS.register_module('cmrc') +def cmrc_postprocess(text: str) -> str: + if '答案是' in text: + text = text.split('答案是')[1] + return text diff --git a/opencompass/datasets/drcd.py b/opencompass/datasets/drcd.py index 55893ac9..44466242 100644 --- a/opencompass/datasets/drcd.py +++ b/opencompass/datasets/drcd.py @@ -2,7 +2,7 @@ import json from datasets import Dataset -from opencompass.registry import LOAD_DATASET +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS from .base import BaseDataset @@ -38,3 +38,10 @@ class DRCDDataset(BaseDataset): }) return dataset + + +@TEXT_POSTPROCESSORS.register_module('drcd') +def drcd_postprocess(text: str) -> str: + if '答案是' in text: + text = text.split('答案是')[1] + return text diff --git a/opencompass/datasets/humaneval.py b/opencompass/datasets/humaneval.py index 71b6b209..9b1ec10a 100644 --- a/opencompass/datasets/humaneval.py +++ b/opencompass/datasets/humaneval.py @@ -33,12 +33,11 @@ class HumanEvaluator(BaseEvaluator): with tempfile.TemporaryDirectory() as tmp_dir: out_dir = osp.join(tmp_dir, 'human_eval.json') self.write_jsonl(out_dir, predictions) - score = self.eval( - out_dir, - self.k, - n_workers=4, - timeout=3.0, - problem_file=self.HUMAN_EVAL) + score = self.eval(out_dir, + self.k, + n_workers=4, + timeout=3.0, + problem_file=self.HUMAN_EVAL) return {f'humaneval_{k}': score[k] * 100 for k in score} @@ -47,7 +46,7 @@ def humaneval_postprocess(text: str) -> str: text = text.split('\n\n')[0] if '```' in text: 
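Returning to the TheoremQA postprocessor above: the patched version scans the whole reply instead of only its first line, and strips trailing punctuation rather than unconditionally dropping the last character. A self-contained approximation of the new behavior:

```python
import re


def theoremqa_postprocess(text: str) -> str:
    # Patched logic: regex over the full reply, then trim punctuation
    # instead of always removing the final character.
    text = text.strip()
    matches = re.findall(r'answer is ([^\s]+)', text)
    if len(matches) == 0:
        return text
    return matches[0].strip().strip('.,?!"\';:')


print(theoremqa_postprocess('Thus the answer is 42.'))  # 42
print(theoremqa_postprocess('So the answer is 42'))     # 42 (the old [:-1] slice returned '4')
```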
text = text.split('```')[1] - if text.startswith('def'): + if text.strip().startswith('def'): text = '\n'.join(text.split('\n')[1:]) if not text.startswith(' '): if text.startswith(' '): diff --git a/opencompass/datasets/math.py b/opencompass/datasets/math.py index d640a3cb..bdd02650 100644 --- a/opencompass/datasets/math.py +++ b/opencompass/datasets/math.py @@ -3,7 +3,8 @@ import json from datasets import Dataset, DatasetDict from opencompass.openicl.icl_evaluator import BaseEvaluator -from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET, + TEXT_POSTPROCESSORS) from .base import BaseDataset @@ -65,12 +66,12 @@ class MATHDataset(BaseDataset): return dataset -@TEXT_POSTPROCESSORS.register_module('math') +@TEXT_POSTPROCESSORS.register_module('math_postprocess') def math_postprocess(text: str) -> str: SUBSTITUTIONS = [('an ', ''), ('a ', ''), ('.$', '$'), ('\\$', ''), (r'\ ', ''), (' ', ''), ('mbox', 'text'), (',\\text{and}', ','), ('\\text{and}', ','), - ('\\text{m}', '\\text{}'), ('\le', '<')] + ('\\text{m}', '\\text{}'), ('\\le', '<')] REMOVED_EXPRESSIONS = [ 'square', 'ways', 'integers', 'dollars', 'mph', 'inches', 'ft', 'hours', 'km', 'units', '\\ldots', 'sue', 'points', 'feet', 'minutes', @@ -96,7 +97,9 @@ def math_postprocess(text: str) -> str: final_answer = re.sub(r'(\\textbf\{)(.*?)(\})', '\\2', final_answer) final_answer = re.sub(r'(\\overline\{)(.*?)(\})', '\\2', final_answer) final_answer = re.sub(r'(\\boxed\{)(.*)(\})', '\\2', final_answer) - assert '\n' not in final_answer and '\r' not in final_answer and '\f' not in final_answer + assert '\n' not in final_answer + assert '\r' not in final_answer + assert '\f' not in final_answer if len(re.findall(r'finalansweris(.*)', final_answer)) > 0: final_answer = re.findall(r'finalansweris(.*)', final_answer)[-1] diff --git a/opencompass/datasets/mbpp.py b/opencompass/datasets/mbpp.py index 06cd7204..2c129bfa 100644 --- a/opencompass/datasets/mbpp.py +++ b/opencompass/datasets/mbpp.py @@ -20,13 +20,13 @@ class MBPPDataset(BaseDataset): def processing_test(example): example['test_case'] = example['test_list'] example['test_list'] = '\n'.join(example['test_list']) + example['test_list_2'] = example['test_list'] return example - train = load_dataset( - 'json', data_files=path, split='train[:10]').map(processing_test) - test = load_dataset( - 'json', data_files=path, - split='train[10:510]').map(processing_test) + train = load_dataset('json', data_files=path, + split='train[:10]').map(processing_test) + test = load_dataset('json', data_files=path, + split='train[10:510]').map(processing_test) return DatasetDict({'train': train, 'test': test}) diff --git a/opencompass/datasets/natural_question.py b/opencompass/datasets/natural_question.py index 3ed49de4..1fea3713 100644 --- a/opencompass/datasets/natural_question.py +++ b/opencompass/datasets/natural_question.py @@ -1,6 +1,5 @@ import csv import os.path as osp -import re from datasets import Dataset, DatasetDict @@ -43,16 +42,19 @@ class NQEvaluator(BaseEvaluator): 'error': 'predictions and references have different ' 'length' } - predictions = [ - re.split(r'[\n]', prediction, 1)[0].lower() - for prediction in predictions - ] + processed_predictions = [] + for prediction in predictions: + prediction = prediction.split('\n')[0].lower() + if 'answer is' in prediction: + prediction = prediction.split('answer is')[-1] + prediction = general_postprocess(prediction) + 
processed_predictions.append(prediction) processed_answers = [[general_postprocess(j).lower() for j in i] for i in references] cnt = 0 - for pred, cand_ans in zip(predictions, processed_answers): - cnt += int(any([cand in pred for cand in cand_ans])) + for pred, cand_ans in zip(processed_predictions, processed_answers): + cnt += int(any([cand == pred for cand in cand_ans])) score = cnt / len(predictions) * 100 return {'score': score} diff --git a/opencompass/datasets/realtoxicprompts.py b/opencompass/datasets/realtoxicprompts.py index 4098bb3a..9e27834f 100644 --- a/opencompass/datasets/realtoxicprompts.py +++ b/opencompass/datasets/realtoxicprompts.py @@ -1,4 +1,4 @@ -from datasets import load_dataset +from datasets import Dataset, DatasetDict, load_dataset from opencompass.registry import LOAD_DATASET @@ -11,7 +11,17 @@ class RealToxicPromptsDataset(BaseDataset): @staticmethod def load(**kwargs): challenging_subset = kwargs.pop('challenging_subset', False) - dataset = load_dataset(**kwargs) + if kwargs['path'] == 'allenai/real-toxicity-prompts': + try: + dataset = load_dataset(**kwargs) + except ConnectionError as e: + raise ConnectionError( + f'{e} Something wrong with this dataset, ' + 'cannot track it online or use offline mode, ' + 'please set local file path directly.') + else: + dataset = Dataset.from_file(kwargs.pop('path')) + dataset = DatasetDict(train=dataset) def preprocess(example): diff --git a/opencompass/datasets/strategyqa.py b/opencompass/datasets/strategyqa.py index f0f56ec2..ae8a155f 100644 --- a/opencompass/datasets/strategyqa.py +++ b/opencompass/datasets/strategyqa.py @@ -1,12 +1,16 @@ +import re + from opencompass.registry import TEXT_POSTPROCESSORS @TEXT_POSTPROCESSORS.register_module('strategyqa') def strategyqa_pred_postprocess(text: str) -> str: text = text.split('\n\n')[0] - strategyqa_pre = text.split('So the answer is ')[-1].strip().replace( - '.', '') - return strategyqa_pre + text = text.split('answer is ')[-1] + match = re.search(r'(yes|no)', text.lower()) + if match: + return match.group(1) + return '' @TEXT_POSTPROCESSORS.register_module('strategyqa_dataset') diff --git a/opencompass/datasets/triviaqa.py b/opencompass/datasets/triviaqa.py index c7f7a757..a4a675c6 100644 --- a/opencompass/datasets/triviaqa.py +++ b/opencompass/datasets/triviaqa.py @@ -1,6 +1,5 @@ import csv import os.path as osp -import re from datasets import Dataset, DatasetDict @@ -36,25 +35,25 @@ class TriviaQADataset(BaseDataset): @ICL_EVALUATORS.register_module() class TriviaQAEvaluator(BaseEvaluator): - def __init__(self) -> None: - super().__init__() - def score(self, predictions, references): if len(predictions) != len(references): return { 'error': 'predictions and references have different ' 'length' } - predictions = [ - re.split(r'[\n]', prediction, 1)[0].lower() - for prediction in predictions - ] + processed_predictions = [] + for prediction in predictions: + prediction = prediction.split('\n')[0].lower() + if 'answer is' in prediction: + prediction = prediction.split('answer is')[-1] + prediction = general_postprocess(prediction) + processed_predictions.append(prediction) processed_answers = [[general_postprocess(j).lower() for j in i] for i in references] cnt = 0 - for pred, cand_ans in zip(predictions, processed_answers): - cnt += int(any([cand in pred for cand in cand_ans])) + for pred, cand_ans in zip(processed_predictions, processed_answers): + cnt += int(any([cand == pred for cand in cand_ans])) score = cnt / len(predictions) * 100 return {'score': score} diff --git 
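The NQEvaluator and TriviaQAEvaluator rewrites above apply the same two tightenings: predictions are funneled through the 'answer is' split plus `general_postprocess`, and candidate answers must now match the processed prediction exactly instead of merely appearing inside it. A toy illustration of why substring containment over-credited verbose replies (`general_postprocess` is approximated by simple lowercasing here):

```python
def normalize(pred: str) -> str:
    # Rough stand-in for the evaluators' pipeline: keep the first line,
    # lowercase it, and keep only what follows 'answer is'.
    pred = pred.split('\n')[0].lower()
    if 'answer is' in pred:
        pred = pred.split('answer is')[-1]
    return pred.strip()


pred = normalize('The answer is no one knows\nExtra chatter...')
candidates = ['no']
print(any(c in pred for c in candidates))  # True  -> old substring check over-credits
print(any(c == pred for c in candidates))  # False -> new exact match rejects it
```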
a/opencompass/datasets/wsc.py b/opencompass/datasets/wsc.py index 398ce4d7..31f54948 100644 --- a/opencompass/datasets/wsc.py +++ b/opencompass/datasets/wsc.py @@ -56,3 +56,47 @@ class WSCDataset_V2(BaseDataset): } data.append(item) return Dataset.from_list(data) + + +@LOAD_DATASET.register_module() +class WSCDataset_V3(BaseDataset): + + @staticmethod + def load(path): + data = [] + with open(path, 'r') as f: + for line in f: + line = json.loads(line) + + text_list = line['text'].split(' ') + span_text1_len = len(line['target']['span1_text'].split(' ')) + span_text2_len = len(line['target']['span2_text'].split(' ')) + span1_start = line['target']['span1_index'] + span1_end = span1_start + span_text1_len + span2_start = line['target']['span2_index'] + span2_end = span2_start + span_text2_len + new_text_list = [] + for i, t in enumerate(text_list): + if span1_start <= i < span1_end: + if i == span1_start: + new_text_list.append('* ' + + line['target']['span1_text'] + + ' *') + elif span2_start <= i < span2_end: + if i == span2_start: + new_text_list.append('# ' + + line['target']['span2_text'] + + ' #') + else: + new_text_list.append(t) + item = { + 'span1': line['target']['span1_text'], + 'span2': line['target']['span2_text'], + 'text': ' '.join(new_text_list), + 'label': { + 'true': 'A', + 'false': 'B' + }[line['label']], + } + data.append(item) + return Dataset.from_list(data) diff --git a/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py b/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py index ad2f7506..f88b7571 100644 --- a/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_toxic_evaluator.py @@ -1,6 +1,7 @@ -import os import json +import os import threading +import time from typing import List import numpy as np @@ -44,11 +45,11 @@ class PerspectiveAPIClient: """Build resource when request in real.""" if self.client is None: self.client = discovery.build( - "commentanalyzer", - "v1alpha1", + 'commentanalyzer', + 'v1alpha1', developerKey=self.key, discoveryServiceUrl= # noqa - "https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1", # noqa + 'https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1', # noqa static_discovery=False, ) @@ -63,13 +64,13 @@ class PerspectiveAPIClient: dict: Request body dictionary. """ return { - "comment": { - "text": text + 'comment': { + 'text': text }, - "requestedAttributes": { + 'requestedAttributes': { 'TOXICITY': {} }, - "languages": ["en", "zh"], # This does not affect the result score + 'languages': ['en', 'zh'], # This does not affect the result score } def extract_toxicity_attributes(self, response: dict) -> dict: @@ -83,19 +84,19 @@ class PerspectiveAPIClient: """ if response: all_scores = { - f"{attribute.lower()}_score": - scores["spanScores"][0]["score"]["value"] - for attribute, scores in response["attributeScores"].items() + f'{attribute.lower()}_score': + scores['spanScores'][0]['score']['value'] + for attribute, scores in response['attributeScores'].items() } else: all_scores = {'toxicity_score': -1} return all_scores - def get_toxicity_scores(self, predicitons: List) -> dict: + def get_toxicity_scores(self, predictions: List) -> dict: """Request toxicity scores for PerspectiveAPI. Args: - predicitons: Text to perform evaluation. + predictions: Text to perform evaluation. Returns: dict: output scores dictionary for all samples. 
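The PerspectiveAPI client changes continue below; besides the `predicitons` to `predictions` typo fix, the notable behavioral addition is a `time.sleep(1)` after each executed batch, a crude client-side rate limit. The pattern in isolation, with `execute` standing in for the googleapiclient batch request:

```python
import time


def run_in_batches(items, batch_size, execute):
    # Send one full batch, then pause briefly before assembling the
    # next, making a strict per-second API quota harder to trip.
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            execute(batch)
            time.sleep(1)
            batch = []
    if batch:  # flush any remainder
        execute(batch)


run_in_batches(list(range(10)), 4, lambda b: print('sending', b))
```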
@@ -104,12 +105,13 @@ class PerspectiveAPIClient: batch_results = dict() - pbar = ProgressBar(len(predicitons)) + pbar = ProgressBar(len(predictions)) def callback(request_id: str, response: dict, error): if error: content = json.loads(error.args[1]) - error_type = content['error']['details'][0]['errorType'] + error_type = content['error']['details'][0].get( + 'errorType', '') if error_type != 'COMMENT_EMPTY': raise error batch_results[request_id] = response @@ -123,7 +125,7 @@ class PerspectiveAPIClient: # Add individual request to the batch request. Deduplicate since we # use the text as request keys. - for idx, text in enumerate(predicitons): + for idx, text in enumerate(predictions): batch_request.add( request=self.client.comments().analyze( body=PerspectiveAPIClient.create_request_body( @@ -134,6 +136,7 @@ class PerspectiveAPIClient: if (idx + 1) % bs == 0: batch_request.execute() + time.sleep(1) batch_request = self.client.new_batch_http_request() with self.request_lock: @@ -198,11 +201,10 @@ class ToxicEvaluator(BaseEvaluator): length = np.count_nonzero(~np.isnan(toxicity_scores)) max_toxicity_score = max(toxicity_scores) - return dict( - expected_max_toxicity=round(max_toxicity_score, 4), - max_toxicity_probability=max_toxicity_score >= self.thr, - toxic_frac=round(num_toxic_completions / length, 4), - avg_toxicity_score=round(np.nanmean(toxicity_scores), 4)) + return dict(expected_max_toxicity=round(max_toxicity_score, 4), + max_toxicity_probability=max_toxicity_score >= self.thr, + toxic_frac=round(num_toxic_completions / length, 4), + avg_toxicity_score=round(np.nanmean(toxicity_scores), 4)) def score(self, predictions: List, references: List) -> dict: """Calculate scores. Reference is not needed. diff --git a/opencompass/openicl/icl_inferencer/icl_clp_inferencer.py b/opencompass/openicl/icl_inferencer/icl_clp_inferencer.py index 85d7d210..9b266125 100644 --- a/opencompass/openicl/icl_inferencer/icl_clp_inferencer.py +++ b/opencompass/openicl/icl_inferencer/icl_clp_inferencer.py @@ -93,7 +93,7 @@ class CLPInferencer: output_json_filename: Optional[str] = None, normalizing_str: Optional[str] = None) -> List: # 1. Preparation for output logs - output_handler = PPLInferencerOutputHandler(self.accelerator) + output_handler = PPLInferencerOutputHandler() ice = [] @@ -122,9 +122,17 @@ class CLPInferencer: choice_target_ids = [] # TODO: Hard code temperaily, need to modified here choices = retriever.test_ds[0]['choices'] - choice_ids = [ - self.model.tokenizer.encode(c, False, False) for c in choices - ] + try: + choice_ids = [ + self.model.tokenizer.encode(c, False, False) + for c in choices + ] + except ValueError: + choice_ids = [self.model.tokenizer.encode(c) for c in choices] + if self.model.tokenizer.add_bos_token: + choice_ids = [c[1:] for c in choice_ids] + if self.model.tokenizer.add_eos_token: + choice_ids = [c[:-1] for c in choice_ids] if isinstance(choice_ids[0], list): # in case tokenizer returns list for single token choice_ids = list(itertools.chain(*choice_ids)) @@ -185,15 +193,10 @@ class CLPInferencer: index = index + 1 # 5. 
Output - os.makedirs(output_json_filepath, exist_ok=True) - output_handler.subprocess_write_to_json(output_json_filepath, - output_json_filename) - if self.accelerator is not None: - self.accelerator.wait_for_everyone() - output_handler.merge_to_main_process(output_json_filepath, - output_json_filename) - output_handler.write_to_json(output_json_filepath, - output_json_filename) + if self.is_main_process: + os.makedirs(output_json_filepath, exist_ok=True) + output_handler.write_to_json(output_json_filepath, + output_json_filename) return [ sample['prediction'] @@ -206,8 +209,10 @@ class CLPInferencer: choice_ids, mask_length=None): # TODO: support multiple tokens - outputs, _ = self.model.generator.get_logits(input_texts) - + try: + outputs, _ = self.model.generator.get_logits(input_texts) + except AttributeError: + outputs, _ = self.model.get_logits(input_texts) shift_logits = outputs[..., :-1, :].contiguous() shift_logits = F.log_softmax(shift_logits, dim=-1) diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index 35ee9990..a2d2c607 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -111,7 +111,6 @@ class DLCRunner(BaseRunner): f' --worker_gpu {num_gpus}' f' --worker_memory {max(num_gpus * 32, 48)}' f" --worker_image {self.aliyun_cfg['worker_image']}" - ' --priority 3' ' --interactive') logger = get_logger() diff --git a/opencompass/utils/summarizer.py b/opencompass/utils/summarizer.py index 3e05bb00..a6704b68 100644 --- a/opencompass/utils/summarizer.py +++ b/opencompass/utils/summarizer.py @@ -13,7 +13,7 @@ from opencompass.utils import (LarkReporter, dataset_abbr_from_cfg, model_abbr_from_cfg) from opencompass.utils.prompt import get_prompt_hash -METRIC_WHITELIST = ['score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth'] +METRIC_WHITELIST = ['score', 'auc_score', 'accuracy', 'humaneval_pass@1', 'rouge1', 'avg_toxicity_score', 'bleurt_diff', 'matthews_correlation', 'truth'] METRIC_BLACKLIST = ['bp', 'sys_len', 'ref_len'] class Summarizer: diff --git a/requirements.txt b/requirements.txt index e2950dbf..50a5ca52 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,6 @@ jieba mmengine>0.8.0 nltk==3.8 numpy==1.23.4 -openai==0.27.1 openai pandas<2.0.0 rank_bm25==0.2.2 @@ -18,7 +17,6 @@ requests==2.28.1 scikit_learn==1.2.1 sentence_transformers==2.2.2 tabulate -tabulate tiktoken tokenizers>=0.13.3 torch>=1.13.1 diff --git a/run.py b/run.py index ea7e82ad..f671a3e8 100644 --- a/run.py +++ b/run.py @@ -149,9 +149,12 @@ def main(): cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S') if args.reuse: if args.reuse == 'latest': - dirs = os.listdir(cfg.work_dir) - assert len(dirs) > 0, 'No previous results to reuse!' 
- dir_time_str = sorted(dirs)[-1] + if not os.path.exists(cfg.work_dir) or not os.listdir( + cfg.work_dir): + logger.warning('No previous results to reuse!') + else: + dirs = os.listdir(cfg.work_dir) + dir_time_str = sorted(dirs)[-1] else: dir_time_str = args.reuse logger.info(f'Reusing experiements from {dir_time_str}') diff --git a/tools/prompt_viewer.py b/tools/prompt_viewer.py index 65bff78d..31543b7a 100644 --- a/tools/prompt_viewer.py +++ b/tools/prompt_viewer.py @@ -4,7 +4,8 @@ from typing import Dict from mmengine.config import Config, ConfigDict -from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer +from opencompass.openicl.icl_inferencer import (CLPInferencer, GenInferencer, + PPLInferencer) from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS from opencompass.utils import (Menu, build_dataset_from_cfg, build_model_from_cfg, dataset_abbr_from_cfg, @@ -126,7 +127,7 @@ def print_prompts(model_cfg, dataset_cfg): print('-' * 100) print(prompt) print('-' * 100) - elif infer_cfg.inferencer.type == GenInferencer: + elif infer_cfg.inferencer.type in [GenInferencer, CLPInferencer]: idx, ice_idx = 0, ice_idx_list[0] ice = retriever.generate_ice(ice_idx, ice_template=ice_template) prompt = retriever.generate_prompt_for_generate_task(
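One behavioral note on the `run.py` change above: `--reuse latest` no longer asserts when the work directory is missing or empty; it warns and falls back to a fresh timestamped run. A condensed sketch of that control flow (standalone, with `print` standing in for the logger; the function name is illustrative, not OpenCompass's actual API):

```python
import os
from datetime import datetime
from typing import Optional


def resolve_run_dir(work_dir: str, reuse: Optional[str]) -> str:
    # Default to a fresh timestamped directory name.
    dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
    if reuse == 'latest':
        if not os.path.exists(work_dir) or not os.listdir(work_dir):
            print('No previous results to reuse!')  # warn instead of crashing
        else:
            dir_time_str = sorted(os.listdir(work_dir))[-1]
    elif reuse:
        dir_time_str = reuse
    return dir_time_str


print(resolve_run_dir('outputs/default', 'latest'))
```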