diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..3dfeef9c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,69 @@ +exclude: | + (?x)^( + tests/data/| + opencompass/models/internal/| + opencompass/utils/internal/| + configs/ + ) +repos: + - repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 + hooks: + - id: flake8 + - repo: https://github.com/PyCQA/isort + rev: 5.11.5 + hooks: + - id: isort + - repo: https://github.com/pre-commit/mirrors-yapf + rev: v0.32.0 + hooks: + - id: yapf + - repo: https://github.com/codespell-project/codespell + rev: v2.2.1 + hooks: + - id: codespell + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + exclude: | + (?x)^( + dicts/| + projects/.*?/dicts/ + ) + - id: check-yaml + - id: end-of-file-fixer + exclude: | + (?x)^( + dicts/| + projects/.*?/dicts/ + ) + - id: requirements-txt-fixer + - id: double-quote-string-fixer + - id: check-merge-conflict + - id: fix-encoding-pragma + args: ["--remove"] + - id: mixed-line-ending + args: ["--fix=lf"] + - repo: https://github.com/executablebooks/mdformat + rev: 0.7.9 + hooks: + - id: mdformat + args: ["--number", "--table-width", "200"] + additional_dependencies: + - mdformat-openmmlab + - mdformat_frontmatter + - linkify-it-py + - repo: https://github.com/myint/docformatter + rev: v1.3.1 + hooks: + - id: docformatter + args: ["--in-place", "--wrap-descriptions", "79"] + # - repo: https://github.com/open-mmlab/pre-commit-hooks + # rev: v0.2.0 # Use the ref you want to point at + # hooks: + # - id: check-algo-readme + # - id: check-copyright + # args: ["mmocr", "tests", "tools"] # these directories will be checked diff --git a/configs/datasets/ARC_e/ARC_e_ppl.py b/configs/datasets/ARC_e/ARC_e_ppl.py new file mode 100644 index 00000000..fab5d48b --- /dev/null +++ b/configs/datasets/ARC_e/ARC_e_ppl.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .ARC_e_ppl_f86898 import ARC_e_datasets # noqa: F401, F403 diff --git a/configs/datasets/CLUE_C3/CLUE_C3_ppl_588820.py b/configs/datasets/CLUE_C3/CLUE_C3_ppl_588820.py new file mode 100644 index 00000000..e9e9d137 --- /dev/null +++ b/configs/datasets/CLUE_C3/CLUE_C3_ppl_588820.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import C3Dataset + +C3_reader_cfg = dict( + input_columns=[ + 'question', 'content', 'choice0', 'choice1', 'choice2', 'choice3', + 'choices' + ], + output_column='label') + +C3_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + i: dict(round=[ + dict(role="HUMAN", prompt="文章:{content}\n问题:{question}"), + dict(role="BOT", prompt=f"答案:{{choice{i}}}") + ]) + for i in range(4) + }), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer)) + +C3_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +C3_datasets = [ + dict( + type=C3Dataset, + abbr='C3', + path='./data/CLUE/C3/dev_0.json', + reader_cfg=C3_reader_cfg, + infer_cfg=C3_infer_cfg, + eval_cfg=C3_eval_cfg) +] diff --git a/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_72a8d5.py b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_72a8d5.py new file mode 100644 index 00000000..43a6aab1 --- /dev/null +++ 
b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_72a8d5.py @@ -0,0 +1,34 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator +from opencompass.datasets import CMRCDataset + +CMRC_reader_cfg = dict( + input_columns=['question', 'context'], output_column='answers') + +CMRC_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role="HUMAN", + prompt="文章:{context}\n根据上文,回答如下问题:\n{question}\n答:"), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +CMRC_eval_cfg = dict( + evaluator=dict(type=EMEvaluator), + pred_role="BOT", +) + +CMRC_datasets = [ + dict( + type=CMRCDataset, + abbr='CMRC_dev', + path='./data/CLUE/CMRC/dev.json', + reader_cfg=CMRC_reader_cfg, + infer_cfg=CMRC_infer_cfg, + eval_cfg=CMRC_eval_cfg), +] diff --git a/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_d7096f.py b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_d7096f.py new file mode 100644 index 00000000..eff1b8b5 --- /dev/null +++ b/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen_d7096f.py @@ -0,0 +1,33 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator +from opencompass.datasets import CMRCDataset + +CMRC_reader_cfg = dict( + input_columns=['question', 'context'], output_column='answers') + +CMRC_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role="HUMAN", prompt="文章:{context}\n根据上文,回答如下问题:{question}"), + dict(role="BOT", prompt="答:"), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +CMRC_eval_cfg = dict( + evaluator=dict(type=EMEvaluator), + pred_role="BOT", +) + +CMRC_datasets = [ + dict( + type=CMRCDataset, + abbr='CMRC_dev', + path='./data/CLUE/CMRC/dev.json', + reader_cfg=CMRC_reader_cfg, + infer_cfg=CMRC_infer_cfg, + eval_cfg=CMRC_eval_cfg), +] diff --git a/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_03b96b.py b/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_03b96b.py new file mode 100644 index 00000000..d0b9ec4f --- /dev/null +++ b/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen_03b96b.py @@ -0,0 +1,34 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator +from opencompass.datasets import DRCDDataset + +DRCD_reader_cfg = dict( + input_columns=['question', 'context'], output_column='answers') + +DRCD_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role="HUMAN", + prompt="文章:{context}\n根据上文,回答如下问题:\n{question}\n答:"), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +DRCD_eval_cfg = dict( + evaluator=dict(type=EMEvaluator), + pred_role="BOT", +) + +DRCD_datasets = [ + dict( + type=DRCDDataset, + abbr='DRCD_dev', + path='./data/CLUE/DRCD/dev.json', + reader_cfg=DRCD_reader_cfg, + infer_cfg=DRCD_infer_cfg, + eval_cfg=DRCD_eval_cfg), +] diff --git a/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_305431.py b/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_305431.py new file mode 100644 index 00000000..b9698c7a --- 
/dev/null +++ b/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen_305431.py @@ -0,0 +1,50 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import AFQMCDataset_V2 + +bustm_reader_cfg = dict( + input_columns=["sentence1", "sentence2"], + output_column="label", + test_split="train") + +bustm_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role="HUMAN", + prompt= + "语句一:“{sentence1}”\n语句二:“{sentence2}”\n请判断语句一和语句二说的是否是一个意思?\nA. 无关\nB. 相关\n请从“A”,“B”中进行选择。\n答:", + ), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +bustm_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role="BOT", + pred_postprocessor=dict(type="first-capital"), +) + +bustm_datasets = [ + dict( + abbr="bustm-dev", + type=AFQMCDataset_V2,  # bustm shares the same format as AFQMC + path="./data/FewCLUE/bustm/dev_few_all.json", + reader_cfg=bustm_reader_cfg, + infer_cfg=bustm_infer_cfg, + eval_cfg=bustm_eval_cfg, + ), + dict( + abbr="bustm-test", + type=AFQMCDataset_V2,  # bustm shares the same format as AFQMC + path="./data/FewCLUE/bustm/test_public.json", + reader_cfg=bustm_reader_cfg, + infer_cfg=bustm_infer_cfg, + eval_cfg=bustm_eval_cfg, + ), +] diff --git a/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_b6cd88.py b/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_b6cd88.py new file mode 100644 index 00000000..0a30b0fb --- /dev/null +++ b/configs/datasets/FewCLUE_chid/FewCLUE_chid_ppl_b6cd88.py @@ -0,0 +1,45 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import CHIDDataset + +chid_reader_cfg = dict( + input_columns=[f'content{i}' for i in range(7)], output_column='answer') + +chid_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + i: dict( + round=[ + dict(role="HUMAN", prompt=f"以下句子是否通顺?\n{{content{i}}}"), + dict(role="BOT", prompt="这个句子是通顺的。"), + ], ) + for i in range(7) + }), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer)) + +chid_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_role="BOT") + +chid_datasets = [ + dict( + type=CHIDDataset, + path='json', + abbr='chid-dev', + data_files='./data/FewCLUE/chid/dev_few_all.json', + split='train', + reader_cfg=chid_reader_cfg, + infer_cfg=chid_infer_cfg, + eval_cfg=chid_eval_cfg), + dict( + type=CHIDDataset, + path='json', + abbr='chid-test', + data_files='./data/FewCLUE/chid/test_public.json', + split='train', + reader_cfg=chid_reader_cfg, + infer_cfg=chid_infer_cfg, + eval_cfg=chid_eval_cfg), +] diff --git a/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_ppl_fc45f0.py b/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_ppl_fc45f0.py new file mode 100644 index 00000000..7f6308a6 --- /dev/null +++ b/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_ppl_fc45f0.py @@ -0,0 +1,58 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets 
import CluewscDataset + +cluewsc_reader_cfg = dict( + input_columns=['span1', 'span2', 'text', 'new_text'], + output_column='answer') + +cluewsc_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + 0: + dict(round=[ + dict( + role="HUMAN", + prompt= + "{text}\nHere, is the pronoun \"{span2}\" used to mean \"{span1}\"?" + ), + dict(role="BOT", prompt="No.") + ]), + 1: + dict(round=[ + dict( + role="HUMAN", + prompt= + "{text}\nHere, is the pronoun \"{span2}\" used to mean \"{span1}\"?" + ), + dict(role="BOT", prompt="Yes.") + ]), + }), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer)) + +cluewsc_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +cluewsc_datasets = [ + dict( + type=CluewscDataset, + path='json', + abbr='cluewsc-dev', + data_files='./data/FewCLUE/cluewsc/dev_few_all.json', + split='train', + reader_cfg=cluewsc_reader_cfg, + infer_cfg=cluewsc_infer_cfg, + eval_cfg=cluewsc_eval_cfg), + dict( + type=CluewscDataset, + path='json', + abbr='cluewsc-test', + data_files='./data/FewCLUE/cluewsc/test_public.json', + split='train', + reader_cfg=cluewsc_reader_cfg, + infer_cfg=cluewsc_infer_cfg, + eval_cfg=cluewsc_eval_cfg), +] diff --git a/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl_784b9e.py b/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl_784b9e.py new file mode 100644 index 00000000..6ca84485 --- /dev/null +++ b/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_ppl_784b9e.py @@ -0,0 +1,48 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import TNewsDataset + +tnews_reader_cfg = dict(input_columns='sentence', output_column='label_desc2') + +tnews_labels = [ + '农业新闻', '旅游新闻', '游戏新闻', '科技类别公司新闻', '体育类别新闻', '初升高教育新闻', '娱乐圈新闻', '投资资讯', + '军事类别常识', '车辆新闻', '楼市新闻', '环球不含中国类别新闻', '书籍文化历史类别新闻', '故事类别新闻', '股票市场类别新闻' +] + +tnews_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + lb: dict(round=[ + dict(role='HUMAN', prompt='{sentence}\n上述内容属于什么新闻?'), + dict(role='BOT', prompt=lb) + ]) + for lb in tnews_labels + }), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer)) + +tnews_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +tnews_datasets = [ + dict( + type=TNewsDataset, + path='json', + abbr='tnews-dev', + data_files='./data/FewCLUE/tnews/dev_few_all.json', + split='train', + reader_cfg=tnews_reader_cfg, + infer_cfg=tnews_infer_cfg, + eval_cfg=tnews_eval_cfg), + dict( + type=TNewsDataset, + path='json', + abbr='tnews-test', + data_files='./data/FewCLUE/tnews/test_public.json', + split='train', + reader_cfg=tnews_reader_cfg, + infer_cfg=tnews_infer_cfg, + eval_cfg=tnews_eval_cfg) +] diff --git a/configs/datasets/GaokaoBench/GaokaoBench_gen.py b/configs/datasets/GaokaoBench/GaokaoBench_gen.py new file mode 100644 index 00000000..26aa3fd2 --- /dev/null +++ b/configs/datasets/GaokaoBench/GaokaoBench_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .GaokaoBench_gen_aed980 import GaokaoBench_datasets # noqa: F401, F403 diff --git a/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen_7a5dee.py b/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen_7a5dee.py new file mode 100644 index 00000000..b879cedd --- /dev/null +++ b/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen_7a5dee.py @@ -0,0 +1,42 @@ +from 
opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import AXDataset_V2 + +AX_g_reader_cfg = dict( + input_columns=["hypothesis", "premise"], + output_column="label", +) + +AX_g_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role="HUMAN", + prompt= + "{premise}\n{hypothesis}\nIs the sentence below entailed by the sentence above?\nA. Yes\nB. No\nAnswer:" + ), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +AX_g_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role="BOT", + pred_postprocessor=dict(type="first-capital"), +) + +AX_g_datasets = [ + dict( + abbr="AX_g", + type=AXDataset_V2, + path="./data/SuperGLUE/AX-g/AX-g.jsonl", + reader_cfg=AX_g_reader_cfg, + infer_cfg=AX_g_infer_cfg, + eval_cfg=AX_g_eval_cfg, + ) +] diff --git a/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_ppl_8d9bf9.py b/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_ppl_8d9bf9.py new file mode 100644 index 00000000..24609587 --- /dev/null +++ b/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_ppl_8d9bf9.py @@ -0,0 +1,53 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import HFDataset + +AX_g_reader_cfg = dict( + input_columns=["hypothesis", "premise"], + output_column="label", + test_split="train") + +AX_g_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + "entailment": + dict(round=[ + dict( + role="HUMAN", + prompt= + "{premise}\n{hypothesis}\nIs the sentence below entailed by the sentence above?" + ), + dict(role="BOT", prompt="Yes"), + ]), + "not_entailment": + dict(round=[ + dict( + role="HUMAN", + prompt= + "{premise}\n{hypothesis}\nIs the sentence below entailed by the sentence above?" 
+ ), + dict(role="BOT", prompt="No"), + ]) + }, + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), +) + +AX_g_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +AX_g_datasets = [ + dict( + type=HFDataset, + abbr="AX_g", + path="json", + data_files="./data/SuperGLUE/AX-g/AX-g.jsonl", + split="train", + reader_cfg=AX_g_reader_cfg, + infer_cfg=AX_g_infer_cfg, + eval_cfg=AX_g_eval_cfg, + ) +] diff --git a/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl.py b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl.py new file mode 100644 index 00000000..d2f2d5ed --- /dev/null +++ b/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_ppl.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .SuperGLUE_BoolQ_ppl_f80fb0 import BoolQ_datasets # noqa: F401, F403 diff --git a/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py b/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py new file mode 100644 index 00000000..7fe6c83d --- /dev/null +++ b/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .SuperGLUE_CB_gen_bb97e1 import CB_datasets # noqa: F401, F403 diff --git a/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen_6d5e67.py b/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen_6d5e67.py new file mode 100644 index 00000000..879390b9 --- /dev/null +++ b/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen_6d5e67.py @@ -0,0 +1,43 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import COPADataset_V2 + +COPA_reader_cfg = dict( + input_columns=["question", "premise", "choice1", "choice2"], + output_column="label", +) + +COPA_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role="HUMAN", + prompt= + "{premise}\nQuestion: Which may be the {question}?\nA. {choice1}\nB. {choice2}\nAnswer:" + ), + ], ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +COPA_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role="BOT", + pred_postprocessor=dict(type="first-capital"), +) + +COPA_datasets = [ + dict( + abbr="COPA", + type=COPADataset_V2, + path="./data/SuperGLUE/COPA/val.jsonl", + reader_cfg=COPA_reader_cfg, + infer_cfg=COPA_infer_cfg, + eval_cfg=COPA_eval_cfg, + ) +] diff --git a/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen_26c9dc.py b/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen_26c9dc.py new file mode 100644 index 00000000..a3cce8e5 --- /dev/null +++ b/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen_26c9dc.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MultiRCDataset_V2 + +MultiRC_reader_cfg = dict( + input_columns=["question", "text", "answer"], + output_column="label", +) + +MultiRC_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role="HUMAN", + prompt= + "{text}\nQuestion: {question}\nAnswer: {answer}\nIs it true?\nA. Yes\nB. 
No\nAnswer:" + ), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +MultiRC_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role="BOT", + pred_postprocessor=dict(type="first-capital"), +) + +MultiRC_datasets = [ + dict( + abbr="MultiRC", + type=MultiRCDataset_V2, + path="./data/SuperGLUE/MultiRC/val.jsonl", + reader_cfg=MultiRC_reader_cfg, + infer_cfg=MultiRC_infer_cfg, + eval_cfg=MultiRC_eval_cfg, + ) +] diff --git a/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen_c39367.py b/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen_c39367.py new file mode 100644 index 00000000..cfa15e4a --- /dev/null +++ b/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen_c39367.py @@ -0,0 +1,46 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import WiCDataset_V2 + +WiC_reader_cfg = dict( + input_columns=[ + "word", + "sentence1", + "sentence2", + ], + output_column="label", +) + +WiC_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role="HUMAN", + prompt= + "Sentence 1: {sentence1}\nSentence 2: {sentence2}\nAre '{word}' in the above two sentences the same?\nA. Yes\nB. No\nAnswer:" + ), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +WiC_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role="BOT", + pred_postprocessor=dict(type="first-capital"), +) + +WiC_datasets = [ + dict( + abbr="WiC", + type=WiCDataset_V2, + path="./data/SuperGLUE/WiC/val.jsonl", + reader_cfg=WiC_reader_cfg, + infer_cfg=WiC_infer_cfg, + eval_cfg=WiC_eval_cfg, + ) +] diff --git a/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_ppl_4118db.py b/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_ppl_4118db.py new file mode 100644 index 00000000..e7e59441 --- /dev/null +++ b/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_ppl_4118db.py @@ -0,0 +1,55 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import WiCDataset + +WiC_reader_cfg = dict( + input_columns=[ + "word", + "sentence1", + "sentence2", + ], + output_column="answer", + test_split="train") + +WiC_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + 0: + dict(round=[ + dict( + role="HUMAN", + prompt= + "Sentence 1: {sentence1}\nSentence 2: {sentence2}\n'{word}' in the above two sentences are different." + ), + ]), + 1: + dict(round=[ + dict( + role="HUMAN", + prompt= + "Sentence 1: {sentence1}\nSentence 2: {sentence2}\n'{word}' in the above two sentences are the same." 
+ ), + ]), + }, + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), +) + +WiC_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +WiC_datasets = [ + dict( + type=WiCDataset, + abbr="WiC", + path="json", + data_files="./data/SuperGLUE/WiC/val.jsonl", + split="train", + reader_cfg=WiC_reader_cfg, + infer_cfg=WiC_infer_cfg, + eval_cfg=WiC_eval_cfg, + ) +] diff --git a/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_ppl_d316eb.py b/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_ppl_d316eb.py new file mode 100644 index 00000000..1af5a93d --- /dev/null +++ b/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_ppl_d316eb.py @@ -0,0 +1,49 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import WiCDataset + +WiC_reader_cfg = dict( + input_columns=[ + "word", + "sentence1", + "sentence2", + ], + output_column="answer", + test_split="train") + +WiC_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + 0: + dict(round=[ + dict( + role="HUMAN", + prompt="{word} in {sentence1} and {sentence2} is different."), + ]), + 1: + dict(round=[ + dict(role="HUMAN", prompt="{word} in {sentence1} and {sentence2} is same."), + ]), + }, + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), +) + +WiC_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +WiC_datasets = [ + dict( + type=WiCDataset, + abbr="WiC", + path="json", + data_files="./data/SuperGLUE/WiC/val.jsonl", + split="train", + reader_cfg=WiC_reader_cfg, + infer_cfg=WiC_infer_cfg, + eval_cfg=WiC_eval_cfg, + ) +] diff --git a/configs/datasets/TheoremQA/TheoremQA_gen_74abc9.py b/configs/datasets/TheoremQA/TheoremQA_gen_74abc9.py new file mode 100644 index 00000000..8046ce40 --- /dev/null +++ b/configs/datasets/TheoremQA/TheoremQA_gen_74abc9.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import TheoremQADataset + +TheoremQA_reader_cfg = dict( + input_columns=['Question', 'Answer_type'], + output_column='Answer', + train_split='test') + +TheoremQA_prompt1 = "Please read a math problem, and then think step by step to derive the answer. The answer is decided by Answer Type. " \ + "If the Answer type in [bool], the answer needs to be True or False. " \ + "Else if the Answer type in [integer, float] , The answer needs to be in numerical form. " \ + "Else if the Answer type in [list of integer, list of float] , the answer needs to be a list of number like [2, 3, 4]. " \ + "Else if the Answer type in [option], the answer needs to be an option like (a), (b), (c), (d)." \ + "You need to output the answer in your final sentence like 'Therefore, the answer is ...'." +TheoremQA_prompt2 = f"Below is an instruction that describes a task, paired with an input that provides further context. 
" \ + f"Write a response that appropriately completes the request.\n\n### Instruction:\n{TheoremQA_prompt1}\n\n### Input:\n{{Question}}\nAnswer_type:{{Answer_type}}\n### Response:\n" + +TheoremQA_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=TheoremQA_prompt2), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +TheoremQA_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type='TheoremQA')) + +TheoremQA_datasets = [ + dict( + abbr='TheoremQA', + type=TheoremQADataset, + path="./data/TheoremQA/test.csv", + reader_cfg=TheoremQA_reader_cfg, + infer_cfg=TheoremQA_infer_cfg, + eval_cfg=TheoremQA_eval_cfg) +] diff --git a/configs/datasets/ceval/ceval_gen.py b/configs/datasets/ceval/ceval_gen.py new file mode 100644 index 00000000..cc33b7ea --- /dev/null +++ b/configs/datasets/ceval/ceval_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .ceval_gen_ee2cb0 import ceval_datasets # noqa: F401, F403 diff --git a/configs/datasets/civilcomments/civilcomments_ppl_e01497.py b/configs/datasets/civilcomments/civilcomments_ppl_e01497.py new file mode 100644 index 00000000..02e30c01 --- /dev/null +++ b/configs/datasets/civilcomments/civilcomments_ppl_e01497.py @@ -0,0 +1,31 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import CLPInferencer +from opencompass.openicl.icl_evaluator import AUCROCEvaluator +from opencompass.datasets import CivilCommentsDataset + +civilcomments_reader_cfg = dict( + input_columns=['text'], + output_column='label', + train_split='test', + test_split='test') + +civilcomments_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template='Text: {text}\nQuestion: ' + 'Does the above text contain rude, hateful, aggressive, disrespectful ' + 'or unreasonable language?\nAnswer:'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=CLPInferencer)) + +civilcomments_eval_cfg = dict(evaluator=dict(type=AUCROCEvaluator), ) + +civilcomments_datasets = [ + dict( + type=CivilCommentsDataset, + path='civil_comments', + reader_cfg=civilcomments_reader_cfg, + infer_cfg=civilcomments_infer_cfg, + eval_cfg=civilcomments_eval_cfg) +] diff --git a/configs/datasets/collections/chat_medium.py b/configs/datasets/collections/chat_medium.py new file mode 100644 index 00000000..5bfb05f3 --- /dev/null +++ b/configs/datasets/collections/chat_medium.py @@ -0,0 +1,57 @@ +from mmengine.config import read_base + +with read_base(): + from ..mmlu.mmlu_gen_a568f1 import mmlu_datasets + from ..ceval.ceval_gen_ee2cb0 import ceval_datasets + from ..agieval.agieval_gen_dc7dae import agieval_datasets + from ..GaokaoBench.GaokaoBench_gen_aed980 import GaokaoBench_datasets + from ..bbh.bbh_gen_58abc3 import bbh_datasets + from ..humaneval.humaneval_gen_d428f1 import humaneval_datasets + from ..mbpp.mbpp_gen_4104e4 import mbpp_datasets + from ..CLUE_C3.CLUE_C3_gen_9e3de9 import C3_datasets + from ..CLUE_CMRC.CLUE_CMRC_gen_72a8d5 import CMRC_datasets + from ..CLUE_DRCD.CLUE_DRCD_gen_03b96b import DRCD_datasets + from ..CLUE_afqmc.CLUE_afqmc_gen_db509b import afqmc_datasets + from ..CLUE_cmnli.CLUE_cmnli_gen_316313 import cmnli_datasets + from ..CLUE_ocnli.CLUE_ocnli_gen_7c44b0 import ocnli_datasets + from ..FewCLUE_bustm.FewCLUE_bustm_gen_305431 import bustm_datasets + from ..FewCLUE_chid.FewCLUE_chid_gen_686c63 import chid_datasets + from 
..FewCLUE_cluewsc.FewCLUE_cluewsc_gen_276956 import cluewsc_datasets + from ..FewCLUE_csl.FewCLUE_csl_gen_1b0c02 import csl_datasets + from ..FewCLUE_eprstmt.FewCLUE_eprstmt_gen_d6d06d import eprstmt_datasets + from ..FewCLUE_ocnli_fc.FewCLUE_ocnli_fc_gen_bef37f import ocnli_fc_datasets + from ..FewCLUE_tnews.FewCLUE_tnews_gen_8d59ba import tnews_datasets + from ..lcsts.lcsts_gen_427fde import lcsts_datasets + from ..lambada.lambada_gen_7ffe3d import lambada_datasets + from ..storycloze.storycloze_gen_c5a230 import storycloze_datasets + from ..SuperGLUE_AX_b.SuperGLUE_AX_b_gen_477186 import AX_b_datasets + from ..SuperGLUE_AX_g.SuperGLUE_AX_g_gen_7a5dee import AX_g_datasets + from ..SuperGLUE_BoolQ.SuperGLUE_BoolQ_gen_8525d1 import BoolQ_datasets + from ..SuperGLUE_CB.SuperGLUE_CB_gen_bb97e1 import CB_datasets + from ..SuperGLUE_COPA.SuperGLUE_COPA_gen_6d5e67 import COPA_datasets + from ..SuperGLUE_MultiRC.SuperGLUE_MultiRC_gen_26c9dc import MultiRC_datasets + from ..SuperGLUE_RTE.SuperGLUE_RTE_gen_ce346a import RTE_datasets + from ..SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_d8f19c import ReCoRD_datasets + from ..SuperGLUE_WiC.SuperGLUE_WiC_gen_c39367 import WiC_datasets + from ..SuperGLUE_WSC.SuperGLUE_WSC_gen_d8d441 import WSC_datasets + from ..race.race_gen_12de48 import race_datasets + from ..Xsum.Xsum_gen_d2126e import Xsum_datasets + from ..gsm8k.gsm8k_gen_2dd372 import gsm8k_datasets + from ..summedits.summedits_gen_4f35b5 import summedits_datasets + from ..math.math_gen_78bcba import math_datasets + from ..TheoremQA.TheoremQA_gen_891fcf import TheoremQA_datasets + from ..hellaswag.hellaswag_gen_cae9cb import hellaswag_datasets + from ..ARC_e.ARC_e_gen_0a29bf import ARC_e_datasets + from ..ARC_c.ARC_c_gen_3f3039 import ARC_c_datasets + from ..commonsenseqa.commonsenseqa_gen_a58dbd import commonsenseqa_datasets + from ..piqa.piqa_gen_8287ae import piqa_datasets + from ..siqa.siqa_gen_a3c714 import siqa_datasets + from ..strategyqa.strategyqa_gen_be3f8d import strategyqa_datasets + from ..winogrande.winogrande_gen_c19d87 import winogrande_datasets + from ..obqa.obqa_gen_b2cde9 import obqa_datasets + from ..nq.nq_gen_a6ffca import nq_datasets + from ..triviaqa.triviaqa_gen_cc3cbf import triviaqa_datasets + from ..flores.flores_gen_8eb9ca import flores_datasets + from ..crowspairs.crowspairs_gen_dd110a import crowspairs_datasets + +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/commonsenseqa/commonsenseqa_gen_a58dbd.py b/configs/datasets/commonsenseqa/commonsenseqa_gen_a58dbd.py new file mode 100644 index 00000000..9a3d008c --- /dev/null +++ b/configs/datasets/commonsenseqa/commonsenseqa_gen_a58dbd.py @@ -0,0 +1,60 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import MDLRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import commonsenseqaDataset + +commonsenseqa_reader_cfg = dict( + input_columns=["question", "A", "B", "C", "D", "E"], + output_column="answerKey", + test_split="validation") + +_ice_template = dict( + type=PromptTemplate, + template=dict( + begin="</E>", + round=[ + dict( + role="HUMAN", + prompt= + "{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nE. 
{E}\nAnswer:", + ), + dict( + role="BOT", + prompt="{answerKey}", + ), + ], + ), + ice_token="</E>", +) + +commonsenseqa_infer_cfg = dict( + ice_template=_ice_template, + retriever=dict( + type=MDLRetriever, + ice_num=8, + candidate_num=30, + select_time=10, + seed=1, + batch_size=12, + ice_template=_ice_template, + ), + inferencer=dict(type=GenInferencer), +) + +commonsenseqa_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type="first-capital"), +) + +commonsenseqa_datasets = [ + dict( + type=commonsenseqaDataset, + path="commonsense_qa", + reader_cfg=commonsenseqa_reader_cfg, + infer_cfg=commonsenseqa_infer_cfg, + eval_cfg=commonsenseqa_eval_cfg, + ) +] + +del _ice_template diff --git a/configs/datasets/glm/C3.py b/configs/datasets/glm/C3.py new file mode 100644 index 00000000..36c70e04 --- /dev/null +++ b/configs/datasets/glm/C3.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import C3Dataset + +C3_reader_cfg = dict( + input_columns=[ + 'question', 'content', 'choice0', 'choice1', 'choice2', 'choice3', + 'choices' + ], + output_column='label') + +C3_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + 0: + "阅读以下内容,选择合适的选项回答: {content} 问题:{question}\n 选项: -{choice0} -{choice1} -{choice2} -{choice3} 答: [MASK]-{choice0}", + 1: + "阅读以下内容,选择合适的选项回答: {content} 问题:{question}\n 选项: -{choice0} -{choice1} -{choice2} -{choice3} 答: [MASK]-{choice1}", + 2: + "阅读以下内容,选择合适的选项回答: {content} 问题:{question}\n 选项: -{choice0} -{choice1} -{choice2} -{choice3} 答: [MASK]-{choice2}", + 3: + "阅读以下内容,选择合适的选项回答: {content} 问题:{question}\n 选项: -{choice0} -{choice1} -{choice2} -{choice3} 答: [MASK]-{choice3}", + }), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer)) + +C3_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +C3_datasets = [ + dict( + type=C3Dataset, + abbr='C3', + path='./data/CLUE/C3/dev_0.json', + reader_cfg=C3_reader_cfg, + infer_cfg=C3_infer_cfg, + eval_cfg=C3_eval_cfg) +] diff --git a/configs/datasets/glm/tnews.py b/configs/datasets/glm/tnews.py new file mode 100644 index 00000000..7b4e651e --- /dev/null +++ b/configs/datasets/glm/tnews.py @@ -0,0 +1,41 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GLMChoiceInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import TNewsDataset + +tnews_reader_cfg = dict(input_columns='sentence', output_column='label_desc2') + +tnews_labels = [ + '农业新闻', '旅游新闻', '游戏新闻', '科技类别公司新闻', '体育类别新闻', '初升高教育新闻', '娱乐圈新闻', '投资资讯', + '军事类别常识', '车辆新闻', '楼市新闻', '环球不含中国类别新闻', '书籍文化历史类别新闻', '故事类别新闻', '股票市场类别新闻' +] + +tnews_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template={lb: f'</S>这篇新闻属于:{lb}' + for lb in tnews_labels}, + column_token_map={'sentence': '</S>'}, + ice_token='</E>'), + prompt_template=dict( + type=PromptTemplate, + template='</E></S>\n以上这篇新闻属于', + column_token_map={'sentence': '</S>'}, + ice_token='</E>'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GLMChoiceInferencer, choices=tnews_labels)) + +tnews_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +tnews_datasets = [ + dict( + type=TNewsDataset, + path='json', + abbr='tnews', + 
data_files='./data/FewCLUE/tnews/test_public.json', + split='train', + reader_cfg=tnews_reader_cfg, + infer_cfg=tnews_infer_cfg, + eval_cfg=tnews_eval_cfg) +] diff --git a/configs/datasets/govrepcrs/govrepcrs_gen.py b/configs/datasets/govrepcrs/govrepcrs_gen.py new file mode 100644 index 00000000..8ff35942 --- /dev/null +++ b/configs/datasets/govrepcrs/govrepcrs_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .govrepcrs_gen_455586 import govrepcrs_datasets # noqa: F401, F403 diff --git a/configs/datasets/govrepcrs/govrepcrs_gen_455586.py b/configs/datasets/govrepcrs/govrepcrs_gen_455586.py new file mode 100644 index 00000000..9af1402a --- /dev/null +++ b/configs/datasets/govrepcrs/govrepcrs_gen_455586.py @@ -0,0 +1,47 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import BleuEvaluator +from opencompass.datasets import GovRepcrsDataset + +govrepcrs_reader_cfg = dict( + input_columns='content', + output_column='summary', + train_split='test', + test_split='test') + +govrepcrs_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role="HUMAN", + prompt= + 'Please summarize the following English report in English:' + ), + ], + round=[ + dict(role='HUMAN', prompt='{content}'), + dict(role='BOT', prompt='{summary}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer, batch_size=4, max_out_len=500, max_seq_len=8192)) + +govrepcrs_eval_cfg = dict( + evaluator=dict(type=BleuEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type='general_cn'), + dataset_postprocessor=dict(type='general_cn')) + +govrepcrs_datasets = [ + dict( + type=GovRepcrsDataset, + path='./data/govrep/', + abbr='GovRepcrs', + reader_cfg=govrepcrs_reader_cfg, + infer_cfg=govrepcrs_infer_cfg, + eval_cfg=govrepcrs_eval_cfg) +] diff --git a/configs/datasets/hellaswag/hellaswag_gen_cae9cb.py b/configs/datasets/hellaswag/hellaswag_gen_cae9cb.py new file mode 100644 index 00000000..5fc8d799 --- /dev/null +++ b/configs/datasets/hellaswag/hellaswag_gen_cae9cb.py @@ -0,0 +1,43 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import hellaswagDataset_V2 + +hellaswag_reader_cfg = dict( + input_columns=["ctx", "A", "B", "C", "D"], + output_column="label", + test_split="validation") + +hellaswag_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role="HUMAN", + prompt=( + "{ctx}\nQuestion: Which ending makes the most sense?\n" + "A. {A}\nB. {B}\nC. {C}\nD. 
{D}\n" + "You may choose from 'A', 'B', 'C', 'D'.\n" + "Answer:"), + ), + ]), + ), + retriever=dict(type=ZeroRetriever, ), + inferencer=dict(type=GenInferencer), +) + +hellaswag_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_role="BOT", + pred_postprocessor=dict(type="first-capital"), +) + +hellaswag_datasets = [ + dict( + type=hellaswagDataset_V2, + path="hellaswag", + reader_cfg=hellaswag_reader_cfg, + infer_cfg=hellaswag_infer_cfg, + eval_cfg=hellaswag_eval_cfg) +] diff --git a/configs/datasets/iwslt2017/iwslt2017_gen_02ea0b.py b/configs/datasets/iwslt2017/iwslt2017_gen_02ea0b.py new file mode 100644 index 00000000..a696c372 --- /dev/null +++ b/configs/datasets/iwslt2017/iwslt2017_gen_02ea0b.py @@ -0,0 +1,38 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import BM25Retriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import BleuEvaluator +from opencompass.datasets import IWSLT2017Dataset + +iwslt2017_reader_cfg = dict( + input_columns='en', output_column='de', train_split='validation') + +iwslt2017_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict( + begin='</E>', + round=[ + dict(role='HUMAN', prompt='Please translate the following English statements to German:\n{en}'), + dict(role='BOT', prompt='{de}'), + ] + ), + ice_token='</E>'), + retriever=dict(type=BM25Retriever, ice_num=1), + inferencer=dict(type=GenInferencer)) + +iwslt2017_eval_cfg = dict( + evaluator=dict(type=BleuEvaluator), + pred_role='BOT', + pred_postprocessor=dict(type='general_cn'), + dataset_postprocessor=dict(type='general_cn')) + +iwslt2017_datasets = [ + dict( + type=IWSLT2017Dataset, + path='iwslt2017', + name='iwslt2017-en-de', + reader_cfg=iwslt2017_reader_cfg, + infer_cfg=iwslt2017_infer_cfg, + eval_cfg=iwslt2017_eval_cfg) +] \ No newline at end of file diff --git a/configs/datasets/lcsts/lcsts_gen.py b/configs/datasets/lcsts/lcsts_gen.py new file mode 100644 index 00000000..d3cc71b9 --- /dev/null +++ b/configs/datasets/lcsts/lcsts_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .lcsts_gen_427fde import lcsts_datasets # noqa: F401, F403 diff --git a/configs/datasets/mbpp/mbpp_gen_b60b47.py b/configs/datasets/mbpp/mbpp_gen_b60b47.py new file mode 100644 index 00000000..18facf02 --- /dev/null +++ b/configs/datasets/mbpp/mbpp_gen_b60b47.py @@ -0,0 +1,64 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDataset, MBPPEvaluator + +mbpp_reader_cfg = dict( + input_columns=['text', 'test_list'], output_column='code') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. 
Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n" + ), + dict( + role="BOT", + prompt= + "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n " + ), + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n" + ), + dict( + role="BOT", + prompt= + "[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n " + ), + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n" + ), + dict( + role="BOT", + prompt= + "[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n " + ), + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n" + ), + dict(role="BOT", prompt="[BEGIN]\n"), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT") + +mbpp_datasets = [ + dict( + type=MBPPDataset, + abbr='mbpp', + path='./data/mbpp/mbpp.jsonl', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg) +] diff --git a/configs/datasets/narrativeqa/narrativeqa_gen_ca4b64.py b/configs/datasets/narrativeqa/narrativeqa_gen_ca4b64.py new file mode 100644 index 00000000..37de9e64 --- /dev/null +++ b/configs/datasets/narrativeqa/narrativeqa_gen_ca4b64.py @@ -0,0 +1,30 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import NarrativeQADataset, TriviaQAEvaluator + +narrativeqa_reader_cfg = dict( + input_columns=['question', 'evidence'], + output_column='answer', + train_split='valid', + test_split='valid') + +narrativeqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template="{evidence}\nAnswer these questions:\nQ: {question}?\nA:"), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer, max_out_len=50, max_seq_len=8192, batch_size=4)) + +narrativeqa_eval_cfg = dict(evaluator=dict(type=TriviaQAEvaluator)) + +narrativeqa_datasets = [ + dict( + type=NarrativeQADataset, + abbr='NarrativeQA', + path='./data/narrativeqa/', + reader_cfg=narrativeqa_reader_cfg, + infer_cfg=narrativeqa_infer_cfg, + eval_cfg=narrativeqa_eval_cfg) +] diff --git a/configs/datasets/obqa/obqa_ppl.py b/configs/datasets/obqa/obqa_ppl.py new file mode 100644 index 
00000000..82a03738 --- /dev/null +++ b/configs/datasets/obqa/obqa_ppl.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .obqa_ppl_2b5b12 import obqa_datasets # noqa: F401, F403 diff --git a/configs/datasets/obqa/obqa_ppl_2b5b12.py b/configs/datasets/obqa/obqa_ppl_2b5b12.py new file mode 100644 index 00000000..9a4c8546 --- /dev/null +++ b/configs/datasets/obqa/obqa_ppl_2b5b12.py @@ -0,0 +1,66 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import OBQADataset + +_input_columns = [ + ['question_stem', 'A', 'B', 'C', 'D'], + ['question_stem', 'A', 'B', 'C', 'D', 'fact1'], +] +_template = [ + { + ans: dict( + round=[ + dict( + role="HUMAN", + prompt= + "Question: {question_stem}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer:" + ), + dict(role="BOT", prompt=ans), + ], ) + for ans in ['A', 'B', 'C', 'D'] + }, + { + ans: dict( + round=[ + dict( + role="HUMAN", + prompt= + "Given the fact: {fact1}\nQuestion: {question_stem}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nAnswer:" + ), + dict(role="BOT", prompt=ans), + ], ) + for ans in ['A', 'B', 'C', 'D'] + } +] + +obqa_datasets = [ + dict( + type=OBQADataset, + path='openbookqa', + split='test', + ), + dict( + abbr='openbookqa_fact', + type=OBQADataset, + path='openbookqa', + name='additional', + split='test', + ), +] +for _i in range(2): + obqa_reader_cfg = dict( + input_columns=_input_columns[_i], output_column="answerKey") + obqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=_template[_i]), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + obqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator), ) + + obqa_datasets[_i]["reader_cfg"] = obqa_reader_cfg + obqa_datasets[_i]["infer_cfg"] = obqa_infer_cfg + obqa_datasets[_i]["eval_cfg"] = obqa_eval_cfg diff --git a/configs/datasets/obqa/obqa_ppl_3609cc.py b/configs/datasets/obqa/obqa_ppl_3609cc.py new file mode 100644 index 00000000..bb07200f --- /dev/null +++ b/configs/datasets/obqa/obqa_ppl_3609cc.py @@ -0,0 +1,52 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import OBQADataset + +_input_columns = [ + ['question_stem', 'A', 'B', 'C', 'D'], + ['question_stem', 'A', 'B', 'C', 'D', 'fact1'], +] +_template = [{ + 'A': "{question_stem} {A}", + 'B': "{question_stem} {B}", + 'C': "{question_stem} {C}", + 'D': "{question_stem} {D}", +}, { + 'A': "Given the fact {fact1}, we know that {question_stem} {A}", + 'B': "Given the fact {fact1}, we know that {question_stem} {B}", + 'C': "Given the fact {fact1}, we know that {question_stem} {C}", + 'D': "Given the fact {fact1}, we know that {question_stem} {D}", +}] + +obqa_datasets = [ + dict( + abbr="openbookqa", + type=OBQADataset, + path="openbookqa", + split="test", + ), + dict( + abbr="openbookqa_fact", + type=OBQADataset, + path="openbookqa", + name="additional", + split="test", + ), +] +for _i in range(2): + obqa_reader_cfg = dict( + input_columns=_input_columns[_i], output_column="answerKey") + obqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=_template[_i]), + 
retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + obqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + + obqa_datasets[_i]["reader_cfg"] = obqa_reader_cfg + obqa_datasets[_i]["infer_cfg"] = obqa_infer_cfg + obqa_datasets[_i]["eval_cfg"] = obqa_eval_cfg diff --git a/configs/datasets/realtoxicprompts/realtoxicprompts_gen_82a2f2.py b/configs/datasets/realtoxicprompts/realtoxicprompts_gen_82a2f2.py new file mode 100644 index 00000000..91a5e0e3 --- /dev/null +++ b/configs/datasets/realtoxicprompts/realtoxicprompts_gen_82a2f2.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import ToxicEvaluator +from opencompass.datasets import RealToxicPromptsDataset + +realtoxicprompts_reader_cfg = dict( + input_columns=['prompt_text'], + output_column='filename', + train_split='train', + test_split='train') + +# TODO: allow empty output-column +realtoxicprompts_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[dict(role="HUMAN", prompt="{prompt_text}")])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +# When key is set to "ENV", the key will be fetched from the environment +# variable $PerspectiveAPIkey. Otherwise, set the key here directly. +realtoxicprompts_eval_cfg = dict( + evaluator=dict(type=ToxicEvaluator, key='ENV'), + pred_role='BOT', +) + +realtoxicprompts_datasets = [ + dict( + type=RealToxicPromptsDataset, + path='allenai/real-toxicity-prompts', + challenging_subset=True, + reader_cfg=realtoxicprompts_reader_cfg, + infer_cfg=realtoxicprompts_infer_cfg, + eval_cfg=realtoxicprompts_eval_cfg) +] diff --git a/configs/datasets/storycloze/storycloze_ppl_c1912d.py b/configs/datasets/storycloze/storycloze_ppl_c1912d.py new file mode 100644 index 00000000..05aea9ba --- /dev/null +++ b/configs/datasets/storycloze/storycloze_ppl_c1912d.py @@ -0,0 +1,39 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import storyclozeDataset + +storycloze_reader_cfg = dict( + input_columns=['context', 'sentence_quiz1', 'sentence_quiz2'], + output_column='answer_right_ending', + train_split='test', + test_split='test') + +storycloze_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + i: dict(round=[ + dict(role="HUMAN", prompt="{context}"), + dict(role="BOT", prompt=f"{{sentence_quiz{i}}}"), + ]) + for i in range(1, 3) + }), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer)) + +storycloze_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +# The original story cloze dataset and repo are no longer maintained. +# We use the multilingual version of this dataset instead. 
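+# xstory_cloze provides the story cloze evaluation data in multiple languages; +# the 'en' subset selected below corresponds to the original English split. 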
+storycloze_datasets = [ + dict( + abbr='story_cloze', + type=storyclozeDataset, + path='juletxara/xstory_cloze', + name='en', + reader_cfg=storycloze_reader_cfg, + infer_cfg=storycloze_infer_cfg, + eval_cfg=storycloze_eval_cfg) +] diff --git a/configs/datasets/summedits/summedits_ppl_163352.py b/configs/datasets/summedits/summedits_ppl_163352.py new file mode 100644 index 00000000..57171a98 --- /dev/null +++ b/configs/datasets/summedits/summedits_ppl_163352.py @@ -0,0 +1,50 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import PPLInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import HFDataset + +summedits_reader_cfg = dict( + input_columns=['doc', 'summary'], + output_column='label', + test_split='train') + +summedits_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template={ + 0: + dict(round=[ + dict( + role="HUMAN", + prompt= + """\nDocument:\n{doc}\nSummary:\n{summary}\nIs the summary factually consistent with the document? """ + ), + dict(role="BOT", prompt="No") + ]), + 1: + dict(round=[ + dict( + role="HUMAN", + prompt= + """Document:\n{doc}\nSummary:\n{summary}\nIs the summary factually consistent with the document? """ + ), + dict(role="BOT", prompt="Yes") + ]), + }), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer)) + +summedits_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) + +summedits_datasets = [ + dict( + type=HFDataset, + abbr='summedits', + path='json', + split='train', + data_files='./data/summedits/summedits.jsonl', + reader_cfg=summedits_reader_cfg, + infer_cfg=summedits_infer_cfg, + eval_cfg=summedits_eval_cfg) +] diff --git a/configs/datasets/summscreen/summscreen_gen_997ee2.py b/configs/datasets/summscreen/summscreen_gen_997ee2.py new file mode 100644 index 00000000..c1729348 --- /dev/null +++ b/configs/datasets/summscreen/summscreen_gen_997ee2.py @@ -0,0 +1,35 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import BleuEvaluator +from opencompass.datasets import SummScreenDataset + +summscreen_reader_cfg = dict( + input_columns='content', + output_column='summary', + train_split='dev', + test_split='dev') + +summscreen_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template= + "Please summarize the following English report in English:{content}\n{summary}."), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer, batch_size=4, max_out_len=500, max_seq_len=8192)) + +summscreen_eval_cfg = dict( + evaluator=dict(type=BleuEvaluator), + pred_postprocessor=dict(type='general_cn'), + dataset_postprocessor=dict(type='general_cn')) + +summscreen_datasets = [ + dict( + type=SummScreenDataset, + path='./data/SummScreen/', + abbr='SummScreen', + reader_cfg=summscreen_reader_cfg, + infer_cfg=summscreen_infer_cfg, + eval_cfg=summscreen_eval_cfg) +] diff --git a/configs/datasets/winograd/winograd_ppl.py b/configs/datasets/winograd/winograd_ppl.py new file mode 100644 index 00000000..a3dbf68b --- /dev/null +++ b/configs/datasets/winograd/winograd_ppl.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .winograd_ppl_c1c427 import winograd_datasets # noqa: F401, F403 diff --git 
a/configs/datasets/z_bench/z_bench_gen_61db0a.py b/configs/datasets/z_bench/z_bench_gen_61db0a.py new file mode 100644 index 00000000..63cfded8 --- /dev/null +++ b/configs/datasets/z_bench/z_bench_gen_61db0a.py @@ -0,0 +1,28 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import HFDataset + +z_bench_reader_cfg = dict( + ds_size=4, + input_columns=['text'], + output_column='category', + train_split='test') + +z_bench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[dict(role="HUMAN", prompt="{text}")]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +z_bench_dataset = dict( + type=HFDataset, + path= + '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench', + data_dir= + '/mnt/petrelfs/gaotong/llm_eval/openagieval_dataset/eval_datasets/z_bench', + name='question', + reader_cfg=z_bench_reader_cfg, + infer_cfg=z_bench_infer_cfg) diff --git a/docs/en/Makefile b/docs/en/Makefile new file mode 100644 index 00000000..d4bb2cbb --- /dev/null +++ b/docs/en/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
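+# For example, "make html" builds the HTML documentation into $(BUILDDIR)/html. 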
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/en/prompt/prompt_template.md b/docs/en/prompt/prompt_template.md new file mode 100644 index 00000000..468f5524 --- /dev/null +++ b/docs/en/prompt/prompt_template.md @@ -0,0 +1 @@ +# Prompt Template \ No newline at end of file diff --git a/docs/en/user_guides/models.md b/docs/en/user_guides/models.md new file mode 100644 index 00000000..c93550a7 --- /dev/null +++ b/docs/en/user_guides/models.md @@ -0,0 +1 @@ +# Prepare Models diff --git a/docs/zh_cn/_static/css/readthedocs.css b/docs/zh_cn/_static/css/readthedocs.css new file mode 100644 index 00000000..1891c912 --- /dev/null +++ b/docs/zh_cn/_static/css/readthedocs.css @@ -0,0 +1,62 @@ +.header-logo { + background-image: url("../image/logo.png"); + background-size: 183px 50px; + height: 50px; + width: 183px; +} + +@media screen and (min-width: 1100px) { + .header-logo { + top: -12px; + } +} + +pre { + white-space: pre; +} + +@media screen and (min-width: 2000px) { + .pytorch-content-left { + width: 1200px; + margin-left: 30px; + } + article.pytorch-article { + max-width: 1200px; + } + .pytorch-breadcrumbs-wrapper { + width: 1200px; + } + .pytorch-right-menu.scrolling-fixed { + position: fixed; + top: 45px; + left: 1580px; + } +} + + +article.pytorch-article section code { + padding: .2em .4em; + background-color: #f3f4f7; + border-radius: 5px; +} + +/* Disable the change in tables */ +article.pytorch-article section table code { + padding: unset; + background-color: unset; + border-radius: unset; +} + +table.autosummary td { + width: 50% +} + +img.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +article.pytorch-article p.rubric { + font-weight: bold; +} diff --git a/docs/zh_cn/_static/image/logo.png b/docs/zh_cn/_static/image/logo.png new file mode 100644 index 00000000..8b5efee6 Binary files /dev/null and b/docs/zh_cn/_static/image/logo.png differ diff --git a/docs/zh_cn/_templates/callable.rst b/docs/zh_cn/_templates/callable.rst new file mode 100644 index 00000000..3a7b9d2b --- /dev/null +++ b/docs/zh_cn/_templates/callable.rst @@ -0,0 +1,14 @@ +.. role:: hidden + :class: hidden-section +.. currentmodule:: {{ module }} + + +{{ name | underline}} + +.. autoclass:: {{ name }} + :members: + :special-members: __call__ + +.. + autogenerated from _templates/callable.rst + note it does not have :inherited-members: diff --git a/docs/zh_cn/notes/contribution_guide.md b/docs/zh_cn/notes/contribution_guide.md new file mode 100644 index 00000000..b842ed73 --- /dev/null +++ b/docs/zh_cn/notes/contribution_guide.md @@ -0,0 +1,67 @@ +# 为 OpenCompass 做贡献 + +- [为OpenCompass做贡献](#为opencompass做贡献) + - [工作流程](#工作流程) + - [代码风格](#代码风格) + - [Python](#python) + - [预提交钩子 (Pre-commit Hook)](#预提交钩子-pre-commit-hook) + +感谢你对于OpenCompass的贡献!我们欢迎各种形式的贡献,包括但不限于以下几点。 + +- 修改错别字或修复bug +- 添加文档或将文档翻译成其它语言 +- 添加新功能和组件 + +## 工作流程 + +我们建议潜在的贡献者遵循以下的贡献工作流程。 + +1. Fork并拉取最新的OpenCompass仓库,按照[开始使用](https://OpenCompass.readthedocs.io/en/latest/get_started.html)来设置环境。 +2. 检出一个新的分支(**不要使用master或dev分支来创建PR**) + +```bash +git checkout -b xxxx # xxxx 是新分支的名称 +``` + +3. 编辑相关文件,并且遵循下面提到的代码风格 +4. 使用[预提交钩子](https://pre-commit.com/)来检查和格式化你的更改。 +5. 提交你的更改 +6. 
创建一个带有相关信息的PR
+
+## 代码风格
+
+### Python
+
+我们采用[PEP8](https://www.python.org/dev/peps/pep-0008/)作为首选的代码风格。
+
+我们使用以下工具进行linting和格式化:
+
+- [flake8](https://github.com/PyCQA/flake8): 一个围绕一些linter工具的封装器。
+- [isort](https://github.com/timothycrosley/isort): 一个用于排序Python导入的实用程序。
+- [yapf](https://github.com/google/yapf): 一个Python文件的格式化器。
+- [codespell](https://github.com/codespell-project/codespell): 一个Python实用程序,用于修复文本文件中常见的拼写错误。
+- [mdformat](https://github.com/executablebooks/mdformat): mdformat是一个有明确定义的Markdown格式化程序,可以用来在Markdown文件中强制执行一致的样式。
+- [docformatter](https://github.com/myint/docformatter): 一个格式化docstring的工具。
+
+yapf和isort的样式配置可以在[setup.cfg](https://github.com/OpenCompass/blob/main/setup.cfg)中找到。
+
+## 预提交钩子 (Pre-commit Hook)
+
+我们使用[预提交钩子](https://pre-commit.com/)在每次提交时自动检查与格式化`flake8`、`yapf`、`isort`、`trailing whitespaces`、`markdown files`,
+修复`end-of-files`、`double-quoted-strings`、`python-encoding-pragma`、`mixed-line-ending`,并自动排序`requirements.txt`。预提交钩子的配置存储在[.pre-commit-config]()中。
+
+在你克隆仓库后,你需要安装并初始化预提交钩子。
+
+```shell
+pip install -U pre-commit
+```
+
+从仓库文件夹运行
+
+```shell
+pre-commit install
+```
+
+之后,在每次提交时都会强制执行代码 linters 和格式化器。
+
+> 在你创建PR前,确保你的代码通过了 lint 检查并被 yapf 格式化。
\ No newline at end of file
diff --git a/opencompass/datasets/arc.py b/opencompass/datasets/arc.py
new file mode 100644
index 00000000..62e8cbea
--- /dev/null
+++ b/opencompass/datasets/arc.py
@@ -0,0 +1,45 @@
+import json
+
+from datasets import Dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class ARCDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str):
+        with open(path, 'r', errors='ignore') as in_f:
+            rows = []
+            for line in in_f:
+                sample = json.loads(line.strip())
+                answerKey = sample['answerKey']
+                sample = sample['question']
+                question = sample['stem']
+                choices = sample['choices']
+                if len(choices) != 4:
+                    continue
+                rows.append({
+                    'question': question,
+                    'answerKey': answerKey,
+                    'textA': choices[0]['text'],
+                    'textB': choices[1]['text'],
+                    'textC': choices[2]['text'],
+                    'textD': choices[3]['text']
+                })
+            return Dataset.from_list(rows)
diff --git a/opencompass/datasets/flores.py b/opencompass/datasets/flores.py
new file mode 100644
index 00000000..a2292fec
--- /dev/null
+++ b/opencompass/datasets/flores.py
@@ -0,0 +1,36 @@
+import re
+
+from datasets import DatasetDict, load_dataset
+
+from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class FloresFirst100Dataset(BaseDataset):
+
+    @staticmethod
+    def load(name):
+        return DatasetDict({
+            'dev':
+            load_dataset(path='facebook/flores', name=name, split='dev'),
+            'devtest':
+            load_dataset(
+                path='facebook/flores', name=name, split='devtest[:100]')
+        })
+
+
+@TEXT_POSTPROCESSORS.register_module('flores')
+def flores_postprocess(text: str) -> str:
+    text = text.strip().split('\n')[0]
+    return text
+
+
+@TEXT_POSTPROCESSORS.register_module('flores-chinese')
+def flores_postprocess_chinese(text: str) -> str:
+    import jieba
+    truncated_text = text.strip().split('\n')[0]
+    cleaned_text = re.sub(r'\s+', ' ', truncated_text).strip()
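+    # Collapse runs of whitespace first, then re-tokenize with jieba so that
+    # BLEU for Chinese is computed over segmented words rather than raw text.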
+    cleaned_text = ' '.join(jieba.cut(cleaned_text))
+    return cleaned_text
diff --git a/opencompass/datasets/qasper.py b/opencompass/datasets/qasper.py
new file mode 100644
index 00000000..b860c612
--- /dev/null
+++ b/opencompass/datasets/qasper.py
@@ -0,0 +1,43 @@
+from datasets import Dataset, DatasetDict
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class QASPERDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str):
+        import json
+        import os
+        dataset_dict = DatasetDict()
+        split = 'dev'
+        dev_list = []
+
+        dev = os.path.join(path, 'qasper-dev-v0.3.json')
+        with open(dev, 'r') as f:
+            dev_json = json.load(f)
+
+        for article_id in dev_json.keys():
+            full_article = '\n'.join([
+                (x['section_name'] if x['section_name'] else '') + '\n' +
+                '\n'.join(x['paragraphs']) + '\n'
+                for x in dev_json[article_id]['full_text']
+            ])
+            for qa in dev_json[article_id]['qas']:
+                question = qa['question']
+                answers = []
+                for x in qa['answers']:
+                    answers.extend(x['answer']['extractive_spans'])
+                if answers:
+                    dev_list.append({
+                        'answer': answers,
+                        'question': question,
+                        'evidence': full_article,
+                    })
+        dataset_dict[split] = Dataset.from_list(dev_list)
+        return dataset_dict
diff --git a/opencompass/datasets/qaspercut.py b/opencompass/datasets/qaspercut.py
new file mode 100644
index 00000000..d892dea9
--- /dev/null
+++ b/opencompass/datasets/qaspercut.py
@@ -0,0 +1,53 @@
+from datasets import Dataset, DatasetDict
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class QASPERCUTDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str):
+        import json
+        import os
+        dataset_dict = DatasetDict()
+        split = 'dev'
+        dev_list = []
+
+        dev = os.path.join(path, 'qasper-dev-v0.3.json')
+        with open(dev, 'r') as f:
+            dev_json = json.load(f)
+
+        for article_id in dev_json.keys():
+            full_article = '\n'.join([
+                (x['section_name'] if x['section_name'] else '') + '\n' +
+                '\n'.join(x['paragraphs']) + '\n'
+                for x in dev_json[article_id]['full_text']
+            ])
+            for qa in dev_json[article_id]['qas']:
+                question = qa['question']
+                answers = []
+                clues = []
+                for x in qa['answers']:
+                    answers.extend(x['answer']['extractive_spans'])
+                    clues.extend(x['answer']['evidence'])
+
+                evis = [full_article.find(clue)
+                        for clue in clues] + [100000000]
+                evi = min(evis)
+                if evi == -1 or evi == 100000000:
+                    evi = 0
+
+                if answers:
+                    dev_list.append({
+                        'answer': answers,
+                        'question': question,
+                        'evidence': full_article[evi:],
+                    })
+
+        dataset_dict[split] = Dataset.from_list(dev_list)
+        return dataset_dict
diff --git a/opencompass/datasets/safety.py b/opencompass/datasets/safety.py
new file mode 100644
index 00000000..1cd9550e
--- /dev/null
+++ b/opencompass/datasets/safety.py
@@ -0,0 +1,23 @@
+from datasets import Dataset, DatasetDict
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class SafetyDataset(BaseDataset):
+
+    @staticmethod
+    def load(path):
+        dataset = DatasetDict()
+
+        data_list = list()
+        idx = 0
+        with open(path, 'r') as f:
+            for line in f:
+                if line.strip():
+                    data_list.append({'idx': idx, 'prompt': line.strip()})
+                    idx += 1
+
+        dataset['test'] = Dataset.from_list(data_list)
+
+        return dataset
diff --git a/opencompass/datasets/triviaqarc.py b/opencompass/datasets/triviaqarc.py
new file mode 100644
index 00000000..ccdf69d8
--- /dev/null
+++
b/opencompass/datasets/triviaqarc.py @@ -0,0 +1,58 @@ +from datasets import Dataset, DatasetDict + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class TriviaQArcDataset(BaseDataset): + + @staticmethod + def load(path: str): + import json + import os + dataset_dict = DatasetDict() + split = 'dev' + dev_list = [] + + web_dev = os.path.join(path, 'qa', 'verified-web-dev.json') + with open(web_dev, 'r') as f: + web_dev_json = json.load(f) + + for x in web_dev_json['Data']: + cand_answers = x['Answer']['Aliases'] + x['Answer']['HumanAnswers'] + question = x['Question'] + evidence = '' + if x['SearchResults']: + x_path = os.path.join(path, 'evidence', 'web', + x['SearchResults'][0]['Filename']) + with open(x_path, 'r') as f: + evidence = f.read(100000) + dev_list.append({ + 'answer': cand_answers, + 'question': question, + 'evidence': evidence, + }) + + wiki_dev = os.path.join(path, 'qa', 'verified-wikipedia-dev.json') + with open(wiki_dev, 'r') as f: + wiki_dev_json = json.load(f) + + for x in wiki_dev_json['Data']: + cand_answers = x['Answer']['Aliases'] + question = x['Question'] + evidence = '' + if x['EntityPages']: + x_path = os.path.join(path, 'evidence', 'wikipedia', + x['EntityPages'][0]['Filename']) + with open(x_path, 'r') as f: + evidence = f.read(100000) + dev_list.append({ + 'answer': cand_answers, + 'question': question, + 'evidence': evidence, + }) + + dataset_dict[split] = Dataset.from_list(dev_list) + return dataset_dict diff --git a/opencompass/datasets/winogrande.py b/opencompass/datasets/winogrande.py new file mode 100644 index 00000000..90d19910 --- /dev/null +++ b/opencompass/datasets/winogrande.py @@ -0,0 +1,44 @@ +from datasets import load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class winograndeDataset(BaseDataset): + + @staticmethod + def load(**kwargs): + + dataset = load_dataset(**kwargs) + + def preprocess(example): + prompt = example.pop('sentence') + example['opt1'] = prompt.replace('_', example.pop('option1')) + example['opt2'] = prompt.replace('_', example.pop('option2')) + return example + + return dataset.map(preprocess) + + +@LOAD_DATASET.register_module() +class winograndeDataset_V2(BaseDataset): + + @staticmethod + def load(**kwargs): + + dataset = load_dataset(**kwargs) + + def preprocess(example): + prompt = example.pop('sentence') + example['opt1'] = prompt.replace('_', example.pop('option1')) + example['opt2'] = prompt.replace('_', example.pop('option2')) + answer = example.pop('answer') + if answer == '': + example['label'] = 'NULL' + else: + example['label'] = ' AB'[int(answer)] + return example + + return dataset.map(preprocess) diff --git a/opencompass/datasets/xcopa.py b/opencompass/datasets/xcopa.py new file mode 100644 index 00000000..542f56c3 --- /dev/null +++ b/opencompass/datasets/xcopa.py @@ -0,0 +1,29 @@ +from datasets import concatenate_datasets, load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class XCOPADataset(BaseDataset): + + @staticmethod + def load(**kwargs): + path = kwargs.get('path', None) + lans = [ + 'et', 'ht', 'it', 'id', 'qu', 'sw', 'zh', 'ta', 'th', 'tr', 'vi', + 'translation-et', 'translation-ht', 'translation-it', + 'translation-id', 'translation-sw', 'translation-zh', + 'translation-ta', 'translation-th', 'translation-tr', + 'translation-vi' + ] + + datasets = [] + for lan in lans: + 
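# Each config contributes only its validation split; the per-language
+            # datasets are concatenated into one combined set afterwards.
+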
dataset = load_dataset(path, lan)['validation']
+            datasets.append(dataset)
+
+        combined_dataset = concatenate_datasets(datasets)
+
+        return combined_dataset
diff --git a/opencompass/openicl/icl_evaluator/icl_aucroc_evaluator.py b/opencompass/openicl/icl_evaluator/icl_aucroc_evaluator.py
new file mode 100644
index 00000000..827a3bbe
--- /dev/null
+++ b/opencompass/openicl/icl_evaluator/icl_aucroc_evaluator.py
@@ -0,0 +1,41 @@
+from typing import List
+
+import numpy as np
+from sklearn.metrics import roc_auc_score
+
+from opencompass.registry import ICL_EVALUATORS
+
+from .icl_base_evaluator import BaseEvaluator
+
+
+@ICL_EVALUATORS.register_module()
+class AUCROCEvaluator(BaseEvaluator):
+    """Calculate AUC-ROC scores and accuracy according to the predictions.
+
+    For some datasets, accuracy alone cannot reveal the differences between
+    models because of saturation. AUC-ROC scores can further examine a
+    model's ability to distinguish between labels. More details can be found
+    at
+    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_auc_score.html
+    """  # noqa
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def score(self, predictions: List, references: List) -> dict:
+        """Calculate scores and accuracy.
+
+        Args:
+            predictions (List): List of probabilities for each class of each
+                sample.
+            references (List): List of target labels for each sample.
+
+        Returns:
+            dict: The calculated scores.
+        """
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different length.'
+            }
+        auc_score = roc_auc_score(references, np.array(predictions)[:, 1])
+        accuracy = sum(
+            references == np.argmax(predictions, axis=1)) / len(references)
+        return dict(auc_score=auc_score * 100, accuracy=accuracy * 100)
diff --git a/opencompass/openicl/icl_inferencer/__init__.py b/opencompass/openicl/icl_inferencer/__init__.py
new file mode 100644
index 00000000..fe36bed5
--- /dev/null
+++ b/opencompass/openicl/icl_inferencer/__init__.py
@@ -0,0 +1,4 @@
+from .icl_base_inferencer import BaseInferencer
+from .icl_clp_inferencer import CLPInferencer
+from .icl_gen_inferencer import GenInferencer
+from .icl_ppl_inferencer import PPLInferencer
diff --git a/opencompass/openicl/icl_retriever/icl_bm25_retriever.py b/opencompass/openicl/icl_retriever/icl_bm25_retriever.py
new file mode 100644
index 00000000..ff2a8a61
--- /dev/null
+++ b/opencompass/openicl/icl_retriever/icl_bm25_retriever.py
@@ -0,0 +1,74 @@
+"""BM25 Retriever."""
+
+from typing import List, Optional
+
+import numpy as np
+from nltk.tokenize import word_tokenize
+from rank_bm25 import BM25Okapi
+from tqdm import trange
+
+from opencompass.openicl.icl_retriever import BaseRetriever
+from opencompass.openicl.utils.logging import get_logger
+from opencompass.registry import ICL_RETRIEVERS
+
+logger = get_logger(__name__)
+
+
+@ICL_RETRIEVERS.register_module()
+class BM25Retriever(BaseRetriever):
+    """BM25 Retriever. In information retrieval, Okapi BM25 (BM is an
+    abbreviation of best matching) is a ranking function used by search
+    engines to estimate the relevance of documents to a given search query.
+    You can find more details at https://en.wikipedia.org/wiki/Okapi_BM25.
+    Each in-context example of the test prompts is retrieved by the BM25
+    algorithm.
+
+    Args:
+        dataset (`BaseDataset`): Any BaseDataset instances.
+            Attributes of ``reader``, ``train`` and ``test`` will be used.
+        ice_separator (`Optional[str]`): The separator between each in-context
+            example template when origin `PromptTemplate` is provided.
Defaults + to '\n'. + ice_eos_token (`Optional[str]`): The end of sentence token for + in-context example template when origin `PromptTemplate` is + provided. Defaults to '\n'. + ice_num (`Optional[int]`): The number of in-context example template + when origin `PromptTemplate` is provided. Defaults to 1. + index_split (`Optional[str]`): The split of the dataset to retrieve the + in-context example index, used when `dataset_reader.dataset` is an + instance of `datasets.Dataset`. Defaults to 'train'. + test_split (`Optional[str]`): The split of the dataset to retrieve the + in-context example, used when `dataset_reader.dataset` is an + instance of `datasets.Dataset`. Defaults to 'test'. + """ + bm25 = None + index_corpus = None + test_corpus = None + + def __init__(self, + dataset, + ice_separator: Optional[str] = '\n', + ice_eos_token: Optional[str] = '\n', + ice_num: Optional[int] = 1) -> None: + super().__init__(dataset, ice_separator, ice_eos_token, ice_num) + self.index_corpus = [ + word_tokenize(data) for data in + self.dataset_reader.generate_input_field_corpus(self.index_ds) + ] + self.bm25 = BM25Okapi(self.index_corpus) + self.test_corpus = [ + word_tokenize(data) for data in + self.dataset_reader.generate_input_field_corpus(self.test_ds) + ] + + def retrieve(self) -> List[List]: + """Retrieve the in-context example index for each test example.""" + rtr_idx_list = [] + logger.info('Retrieving data for test set...') + for idx in trange(len(self.test_corpus), + disable=not self.is_main_process): + query = self.test_corpus[idx] + scores = self.bm25.get_scores(query) + near_ids = list(np.argsort(scores)[::-1][:self.ice_num]) + near_ids = [int(a) for a in near_ids] + rtr_idx_list.append(near_ids) + return rtr_idx_list diff --git a/opencompass/openicl/icl_retriever/icl_random_retriever.py b/opencompass/openicl/icl_retriever/icl_random_retriever.py new file mode 100644 index 00000000..077111be --- /dev/null +++ b/opencompass/openicl/icl_retriever/icl_random_retriever.py @@ -0,0 +1,40 @@ +"""Random Retriever.""" + +from typing import Optional + +import numpy as np +from tqdm import trange + +from opencompass.openicl.icl_retriever import BaseRetriever +from opencompass.openicl.utils.logging import get_logger + +logger = get_logger(__name__) + + +class RandomRetriever(BaseRetriever): + """Random Retriever. Each in-context example of the test prompts is + retrieved in a random way. + + **WARNING**: This class has not been tested thoroughly. Please use it with + caution. 
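+    Retrieval is reproducible for a fixed ``seed``: ``retrieve`` seeds
+    NumPy's global random state before sampling (default seed: 43).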
+ """ + + def __init__(self, + dataset, + ice_separator: Optional[str] = '\n', + ice_eos_token: Optional[str] = '\n', + ice_num: Optional[int] = 1, + seed: Optional[int] = 43) -> None: + super().__init__(dataset, ice_separator, ice_eos_token, ice_num) + self.seed = seed + + def retrieve(self): + np.random.seed(self.seed) + num_idx = len(self.index_ds) + rtr_idx_list = [] + logger.info('Retrieving data for test set...') + for _ in trange(len(self.test_ds), disable=not self.is_main_process): + idx_list = np.random.choice(num_idx, self.ice_num, + replace=False).tolist() + rtr_idx_list.append(idx_list) + return rtr_idx_list diff --git a/opencompass/openicl/icl_retriever/icl_zero_retriever.py b/opencompass/openicl/icl_retriever/icl_zero_retriever.py new file mode 100644 index 00000000..7ffb01c8 --- /dev/null +++ b/opencompass/openicl/icl_retriever/icl_zero_retriever.py @@ -0,0 +1,26 @@ +"""Zeroshot Retriever.""" + +from typing import List, Optional + +from opencompass.openicl.icl_retriever import BaseRetriever +from opencompass.registry import ICL_RETRIEVERS + + +@ICL_RETRIEVERS.register_module() +class ZeroRetriever(BaseRetriever): + """Zeroshot Retriever. The retriever returns empty list for all queries. + + Args: + dataset (`BaseDataset`): Any BaseDataset instances. + Attributes of ``reader``, ``train`` and ``test`` will be used. + ice_eos_token (`Optional[str]`): The end of sentence token for + in-context example template when origin `PromptTemplate` is + provided. Defaults to ''. + """ + + def __init__(self, dataset, ice_eos_token: Optional[str] = '') -> None: + super().__init__(dataset, '', ice_eos_token, 0) + + def retrieve(self) -> List[List]: + rtr_idx_list = [[] for _ in range(len(self.test_ds))] + return rtr_idx_list diff --git a/opencompass/openicl/utils/__init__.py b/opencompass/openicl/utils/__init__.py new file mode 100644 index 00000000..e060b377 --- /dev/null +++ b/opencompass/openicl/utils/__init__.py @@ -0,0 +1 @@ +from .logging import * diff --git a/opencompass/utils/logging.py b/opencompass/utils/logging.py new file mode 100644 index 00000000..0631fa7c --- /dev/null +++ b/opencompass/utils/logging.py @@ -0,0 +1,13 @@ +from mmengine.logging import MMLogger + + +def get_logger(log_level='INFO') -> MMLogger: + """Get the logger for OpenCompass. + + Args: + log_level (str): The log level. Default: 'INFO'. Choices are 'DEBUG', + 'INFO', 'WARNING', 'ERROR', 'CRITICAL'. + """ + return MMLogger.get_instance('OpenCompass', + logger_name='OpenCompass', + log_level=log_level)