From bf79ff1c6d4271cc6cf8d957ef309af46dd4ef42 Mon Sep 17 00:00:00 2001 From: Tong Gao Date: Fri, 11 Aug 2023 17:38:31 +0800 Subject: [PATCH] [Feature] Add LEval datasets Co-authored-by: kennymckormick --- .../LEvalCoursera/LEval_coursera_gen.py | 4 ++ .../LEval_coursera_gen_5c84a9.py | 42 ++++++++++++++ .../LEvalFinancialQA/LEval_financialqa_gen.py | 4 ++ .../LEval_financialqa_gen_9f5404.py | 40 +++++++++++++ .../datasets/LEvalGSM100/LEval_gsm100_gen.py | 4 ++ .../LEvalGSM100/LEval_gsm100_gen_a4d1f8.py | 43 ++++++++++++++ .../LEval_gov_report_summ_gen.py | 4 ++ .../LEval_gov_report_summ_gen_c68a56.py | 40 +++++++++++++ .../LEval_legalcontractqa_gen.py | 4 ++ .../LEval_legalcontractqa_gen_f0bb20.py | 40 +++++++++++++ .../LEvalMeetingSumm/LEval_meetingsumm_gen.py | 4 ++ .../LEval_meetingsumm_gen_6c03d0.py | 40 +++++++++++++ .../LEvalMultidocQA/LEval_multidocqa_gen.py | 4 ++ .../LEval_multidocqa_gen_87dc85.py | 40 +++++++++++++ .../LEvalNarrativeQA/LEval_narrativeqa_gen.py | 4 ++ .../LEval_narrativeqa_gen_9fec98.py | 40 +++++++++++++ .../LEval_naturalquestion_gen.py | 4 ++ .../LEval_naturalquestion_gen_9fec98.py | 40 +++++++++++++ .../LEvalNewsSumm/LEval_newssumm_gen.py | 4 ++ .../LEval_newssumm_gen_db3565.py | 40 +++++++++++++ .../LEval_paper_assistant_gen.py | 4 ++ .../LEval_paper_assistant_gen_6c03d0.py | 40 +++++++++++++ .../LEvalPatentSumm/LEval_patent_summ_gen.py | 4 ++ .../LEval_patent_summ_gen_db3565.py | 40 +++++++++++++ .../LEvalQuality/LEval_quality_gen.py | 4 ++ .../LEvalQuality/LEval_quality_gen_bd35f4.py | 42 ++++++++++++++ .../LEvalReviewSumm/LEval_review_summ_gen.py | 4 ++ .../LEval_review_summ_gen_6c03d0.py | 40 +++++++++++++ .../LEval_scientificqa_gen.py | 4 ++ .../LEval_scientificqa_gen_0c6e71.py | 40 +++++++++++++ configs/datasets/LEvalTPO/LEval_tpo_gen.py | 4 ++ .../datasets/LEvalTPO/LEval_tpo_gen_bd35f4.py | 42 ++++++++++++++ .../LEval_tvshow_summ_gen-checkpoint.py | 4 ++ .../LEval_tvshow_summ_gen_rouge-checkpoint.py | 40 +++++++++++++ .../LEvalTVShowSumm/LEval_tvshow_summ_gen.py | 4 ++ .../LEval_tvshow_summ_gen_049a5c.py | 40 +++++++++++++ .../LEval_topic_retrieval_gen.py | 4 ++ .../LEval_topic_retrieval_gen_af0562.py | 42 ++++++++++++++ configs/datasets/agieval/agieval_gen.py | 2 +- ...al_gen_397d81.py => agieval_gen_64afd3.py} | 0 ...al_gen_0a9ace.py => agieval_gen_a0c741.py} | 0 configs/datasets/bbh/bbh_gen.py | 2 +- .../{bbh_gen_5b92b0.py => bbh_gen_6bd693.py} | 0 configs/datasets/cmmlu/cmmlu_gen.py | 2 +- ...mmlu_gen_ffe7c0.py => cmmlu_gen_c13365.py} | 0 configs/datasets/cmmlu/cmmlu_ppl.py | 2 +- ...mmlu_ppl_fd1f2f.py => cmmlu_ppl_8b9c76.py} | 0 configs/datasets/collections/base_medium.py | 2 +- configs/datasets/collections/base_small.py | 2 +- configs/datasets/collections/chat_medium.py | 4 +- configs/datasets/collections/chat_small.py | 4 +- configs/datasets/crowspairs/crowspairs_gen.py | 2 +- ...gen_21f7cb.py => crowspairs_gen_381af0.py} | 0 .../cvalues/cvalues_responsibility_gen.py | 2 +- ...y => cvalues_responsibility_gen_543378.py} | 0 configs/eval_LEval.py | 48 +++++++++++++++ configs/summarizers/LEval.py | 29 ++++++++++ opencompass/datasets/LEval_coursera.py | 27 +++++++++ opencompass/datasets/LEval_financial_qa.py | 27 +++++++++ opencompass/datasets/LEval_gov_report_summ.py | 27 +++++++++ opencompass/datasets/LEval_gsm100.py | 58 +++++++++++++++++++ .../datasets/LEval_legal_contract_qa.py | 27 +++++++++ opencompass/datasets/LEval_meeting_summ.py | 27 +++++++++ opencompass/datasets/LEval_multidoc_qa.py | 27 +++++++++ 
opencompass/datasets/LEval_narrattive_qa.py | 27 +++++++++ .../datasets/LEval_natural_question.py | 27 +++++++++ opencompass/datasets/LEval_news_summ.py | 27 +++++++++ opencompass/datasets/LEval_paper_assistant.py | 27 +++++++++ opencompass/datasets/LEval_patent_summ.py | 27 +++++++++ opencompass/datasets/LEval_quality.py | 27 +++++++++ opencompass/datasets/LEval_review_summ.py | 27 +++++++++ opencompass/datasets/LEval_scientific_qa.py | 27 +++++++++ opencompass/datasets/LEval_topic_retrieval.py | 27 +++++++++ opencompass/datasets/LEval_tpo.py | 27 +++++++++ opencompass/datasets/LEval_tvshow_summ.py | 27 +++++++++ opencompass/datasets/__init__.py | 18 ++++++ 76 files changed, 1471 insertions(+), 12 deletions(-) create mode 100644 configs/datasets/LEvalCoursera/LEval_coursera_gen.py create mode 100644 configs/datasets/LEvalCoursera/LEval_coursera_gen_5c84a9.py create mode 100644 configs/datasets/LEvalFinancialQA/LEval_financialqa_gen.py create mode 100644 configs/datasets/LEvalFinancialQA/LEval_financialqa_gen_9f5404.py create mode 100644 configs/datasets/LEvalGSM100/LEval_gsm100_gen.py create mode 100644 configs/datasets/LEvalGSM100/LEval_gsm100_gen_a4d1f8.py create mode 100644 configs/datasets/LEvalGovReportSumm/LEval_gov_report_summ_gen.py create mode 100644 configs/datasets/LEvalGovReportSumm/LEval_gov_report_summ_gen_c68a56.py create mode 100644 configs/datasets/LEvalLegalContractQA/LEval_legalcontractqa_gen.py create mode 100644 configs/datasets/LEvalLegalContractQA/LEval_legalcontractqa_gen_f0bb20.py create mode 100644 configs/datasets/LEvalMeetingSumm/LEval_meetingsumm_gen.py create mode 100644 configs/datasets/LEvalMeetingSumm/LEval_meetingsumm_gen_6c03d0.py create mode 100644 configs/datasets/LEvalMultidocQA/LEval_multidocqa_gen.py create mode 100644 configs/datasets/LEvalMultidocQA/LEval_multidocqa_gen_87dc85.py create mode 100644 configs/datasets/LEvalNarrativeQA/LEval_narrativeqa_gen.py create mode 100644 configs/datasets/LEvalNarrativeQA/LEval_narrativeqa_gen_9fec98.py create mode 100644 configs/datasets/LEvalNaturalQuestion/LEval_naturalquestion_gen.py create mode 100644 configs/datasets/LEvalNaturalQuestion/LEval_naturalquestion_gen_9fec98.py create mode 100644 configs/datasets/LEvalNewsSumm/LEval_newssumm_gen.py create mode 100644 configs/datasets/LEvalNewsSumm/LEval_newssumm_gen_db3565.py create mode 100644 configs/datasets/LEvalPaperAssistant/LEval_paper_assistant_gen.py create mode 100644 configs/datasets/LEvalPaperAssistant/LEval_paper_assistant_gen_6c03d0.py create mode 100644 configs/datasets/LEvalPatentSumm/LEval_patent_summ_gen.py create mode 100644 configs/datasets/LEvalPatentSumm/LEval_patent_summ_gen_db3565.py create mode 100644 configs/datasets/LEvalQuality/LEval_quality_gen.py create mode 100644 configs/datasets/LEvalQuality/LEval_quality_gen_bd35f4.py create mode 100644 configs/datasets/LEvalReviewSumm/LEval_review_summ_gen.py create mode 100644 configs/datasets/LEvalReviewSumm/LEval_review_summ_gen_6c03d0.py create mode 100644 configs/datasets/LEvalScientificQA/LEval_scientificqa_gen.py create mode 100644 configs/datasets/LEvalScientificQA/LEval_scientificqa_gen_0c6e71.py create mode 100644 configs/datasets/LEvalTPO/LEval_tpo_gen.py create mode 100644 configs/datasets/LEvalTPO/LEval_tpo_gen_bd35f4.py create mode 100644 configs/datasets/LEvalTVShowSumm/.ipynb_checkpoints/LEval_tvshow_summ_gen-checkpoint.py create mode 100644 configs/datasets/LEvalTVShowSumm/.ipynb_checkpoints/LEval_tvshow_summ_gen_rouge-checkpoint.py create mode 100644 
configs/datasets/LEvalTVShowSumm/LEval_tvshow_summ_gen.py create mode 100644 configs/datasets/LEvalTVShowSumm/LEval_tvshow_summ_gen_049a5c.py create mode 100644 configs/datasets/LEvalTopicRetrieval/LEval_topic_retrieval_gen.py create mode 100644 configs/datasets/LEvalTopicRetrieval/LEval_topic_retrieval_gen_af0562.py rename configs/datasets/agieval/{agieval_gen_397d81.py => agieval_gen_64afd3.py} (100%) rename configs/datasets/agieval/{agieval_gen_0a9ace.py => agieval_gen_a0c741.py} (100%) rename configs/datasets/bbh/{bbh_gen_5b92b0.py => bbh_gen_6bd693.py} (100%) rename configs/datasets/cmmlu/{cmmlu_gen_ffe7c0.py => cmmlu_gen_c13365.py} (100%) rename configs/datasets/cmmlu/{cmmlu_ppl_fd1f2f.py => cmmlu_ppl_8b9c76.py} (100%) rename configs/datasets/crowspairs/{crowspairs_gen_21f7cb.py => crowspairs_gen_381af0.py} (100%) rename configs/datasets/cvalues/{cvalues_responsibility_gen_4aec9f.py => cvalues_responsibility_gen_543378.py} (100%) create mode 100644 configs/eval_LEval.py create mode 100644 configs/summarizers/LEval.py create mode 100644 opencompass/datasets/LEval_coursera.py create mode 100644 opencompass/datasets/LEval_financial_qa.py create mode 100644 opencompass/datasets/LEval_gov_report_summ.py create mode 100644 opencompass/datasets/LEval_gsm100.py create mode 100644 opencompass/datasets/LEval_legal_contract_qa.py create mode 100644 opencompass/datasets/LEval_meeting_summ.py create mode 100644 opencompass/datasets/LEval_multidoc_qa.py create mode 100644 opencompass/datasets/LEval_narrattive_qa.py create mode 100644 opencompass/datasets/LEval_natural_question.py create mode 100644 opencompass/datasets/LEval_news_summ.py create mode 100644 opencompass/datasets/LEval_paper_assistant.py create mode 100644 opencompass/datasets/LEval_patent_summ.py create mode 100644 opencompass/datasets/LEval_quality.py create mode 100644 opencompass/datasets/LEval_review_summ.py create mode 100644 opencompass/datasets/LEval_scientific_qa.py create mode 100644 opencompass/datasets/LEval_topic_retrieval.py create mode 100644 opencompass/datasets/LEval_tpo.py create mode 100644 opencompass/datasets/LEval_tvshow_summ.py diff --git a/configs/datasets/LEvalCoursera/LEval_coursera_gen.py b/configs/datasets/LEvalCoursera/LEval_coursera_gen.py new file mode 100644 index 00000000..ce2f62eb --- /dev/null +++ b/configs/datasets/LEvalCoursera/LEval_coursera_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_coursera_gen_5c84a9 import LEval_coursera_datasets # noqa: F401, F403 diff --git a/configs/datasets/LEvalCoursera/LEval_coursera_gen_5c84a9.py b/configs/datasets/LEvalCoursera/LEval_coursera_gen_5c84a9.py new file mode 100644 index 00000000..516738d3 --- /dev/null +++ b/configs/datasets/LEvalCoursera/LEval_coursera_gen_5c84a9.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator, AccEvaluator +from opencompass.datasets import LEvalCourseraDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi + +LEval_coursera_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_coursera_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + 
dict(role='HUMAN', prompt='{context}\n{question}\nAnswer:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=10) +) + +LEval_coursera_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_capital_postprocess_multi), + pred_role='BOT' +) + +LEval_coursera_datasets = [ + dict( + type=LEvalCourseraDataset, + abbr='LEval_coursera', + path='L4NLP/LEval', + name='coursera', + reader_cfg=LEval_coursera_reader_cfg, + infer_cfg=LEval_coursera_infer_cfg, + eval_cfg=LEval_coursera_eval_cfg) +] diff --git a/configs/datasets/LEvalFinancialQA/LEval_financialqa_gen.py b/configs/datasets/LEvalFinancialQA/LEval_financialqa_gen.py new file mode 100644 index 00000000..d559a2e5 --- /dev/null +++ b/configs/datasets/LEvalFinancialQA/LEval_financialqa_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_financialqa_gen_9f5404 import LEval_financialqa_datasets # noqa: F401, F403 diff --git a/configs/datasets/LEvalFinancialQA/LEval_financialqa_gen_9f5404.py b/configs/datasets/LEvalFinancialQA/LEval_financialqa_gen_9f5404.py new file mode 100644 index 00000000..7ce984f4 --- /dev/null +++ b/configs/datasets/LEvalFinancialQA/LEval_financialqa_gen_9f5404.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator +from opencompass.datasets import LEvalFinancialQADataset + +LEval_financialqa_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_financialqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\n{question}\nAnswer:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512) +) + +LEval_financialqa_eval_cfg = dict( + evaluator=dict(type=RougeEvaluator), + pred_role='BOT' +) + +LEval_financialqa_datasets = [ + dict( + type=LEvalFinancialQADataset, + abbr='LEval_financialqa', + path='L4NLP/LEval', + name='financial_qa', + reader_cfg=LEval_financialqa_reader_cfg, + infer_cfg=LEval_financialqa_infer_cfg, + eval_cfg=LEval_financialqa_eval_cfg) +] diff --git a/configs/datasets/LEvalGSM100/LEval_gsm100_gen.py b/configs/datasets/LEvalGSM100/LEval_gsm100_gen.py new file mode 100644 index 00000000..24e63948 --- /dev/null +++ b/configs/datasets/LEvalGSM100/LEval_gsm100_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_gsm100_gen_a4d1f8 import LEval_gsm100_datasets diff --git a/configs/datasets/LEvalGSM100/LEval_gsm100_gen_a4d1f8.py b/configs/datasets/LEvalGSM100/LEval_gsm100_gen_a4d1f8.py new file mode 100644 index 00000000..7db6e21a --- /dev/null +++ b/configs/datasets/LEvalGSM100/LEval_gsm100_gen_a4d1f8.py @@ -0,0 +1,43 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator, AccEvaluator +from opencompass.datasets import LEvalGSM100Dataset +from opencompass.utils.text_postprocessors import first_capital_postprocess, 
first_capital_postprocess_multi +from opencompass.registry import TEXT_POSTPROCESSORS +from opencompass.datasets import gsm100_dataset_postprocess, gsm100_postprocess + +LEval_gsm100_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_gsm100_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\n'), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512) +) + + +LEval_gsm100_eval_cfg = dict(evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=gsm100_postprocess), + dataset_postprocessor=dict(type=gsm100_dataset_postprocess) +) + +LEval_gsm100_datasets = [ + dict( + type=LEvalGSM100Dataset, + abbr='LEval_gsm100', + path='L4NLP/LEval', + name='gsm100', + reader_cfg=LEval_gsm100_reader_cfg, + infer_cfg=LEval_gsm100_infer_cfg, + eval_cfg=LEval_gsm100_eval_cfg) +] diff --git a/configs/datasets/LEvalGovReportSumm/LEval_gov_report_summ_gen.py b/configs/datasets/LEvalGovReportSumm/LEval_gov_report_summ_gen.py new file mode 100644 index 00000000..1ad57ac7 --- /dev/null +++ b/configs/datasets/LEvalGovReportSumm/LEval_gov_report_summ_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_gov_report_summ_gen_c68a56 import LEval_govreport_summ_datasets # noqa: F401, F403 diff --git a/configs/datasets/LEvalGovReportSumm/LEval_gov_report_summ_gen_c68a56.py b/configs/datasets/LEvalGovReportSumm/LEval_gov_report_summ_gen_c68a56.py new file mode 100644 index 00000000..bd78389c --- /dev/null +++ b/configs/datasets/LEvalGovReportSumm/LEval_gov_report_summ_gen_c68a56.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator +from opencompass.datasets import LEvalGovReportSummDataset + +LEval_govreport_summ_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_govreport_summ_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Government report: {context}\n{question}\nTL;DR:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512) +) + +LEval_govreport_summ_eval_cfg = dict( + evaluator=dict(type=RougeEvaluator), + pred_role='BOT' +) + +LEval_govreport_summ_datasets = [ + dict( + type=LEvalGovReportSummDataset, + abbr='LEval_gov_report_summ', + path='L4NLP/LEval', + name='gov_report_summ', + reader_cfg=LEval_govreport_summ_reader_cfg, + infer_cfg=LEval_govreport_summ_infer_cfg, + eval_cfg=LEval_govreport_summ_eval_cfg) +] diff --git a/configs/datasets/LEvalLegalContractQA/LEval_legalcontractqa_gen.py b/configs/datasets/LEvalLegalContractQA/LEval_legalcontractqa_gen.py new file mode 100644 index 00000000..30f5ee2b --- /dev/null +++ b/configs/datasets/LEvalLegalContractQA/LEval_legalcontractqa_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_legalcontractqa_gen_f0bb20 import LEval_legalqa_datasets # noqa: F401, F403 diff --git a/configs/datasets/LEvalLegalContractQA/LEval_legalcontractqa_gen_f0bb20.py 
b/configs/datasets/LEvalLegalContractQA/LEval_legalcontractqa_gen_f0bb20.py new file mode 100644 index 00000000..d4bb5fa1 --- /dev/null +++ b/configs/datasets/LEvalLegalContractQA/LEval_legalcontractqa_gen_f0bb20.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator +from opencompass.datasets import LEvalLegalContractQADataset + +LEval_legalqa_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_legalqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=128) +) + +LEval_legalqa_eval_cfg = dict( + evaluator=dict(type=RougeEvaluator), + pred_role='BOT' +) + +LEval_legalqa_datasets = [ + dict( + type=LEvalLegalContractQADataset, + abbr='LEval_legal_contract_qa', + path='L4NLP/LEval', + name='legal_contract_qa', + reader_cfg=LEval_legalqa_reader_cfg, + infer_cfg=LEval_legalqa_infer_cfg, + eval_cfg=LEval_legalqa_eval_cfg) +] diff --git a/configs/datasets/LEvalMeetingSumm/LEval_meetingsumm_gen.py b/configs/datasets/LEvalMeetingSumm/LEval_meetingsumm_gen.py new file mode 100644 index 00000000..8f4f345d --- /dev/null +++ b/configs/datasets/LEvalMeetingSumm/LEval_meetingsumm_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_meetingsumm_gen_6c03d0 import LEval_meetingsumm_datasets # noqa: F401, F403 diff --git a/configs/datasets/LEvalMeetingSumm/LEval_meetingsumm_gen_6c03d0.py b/configs/datasets/LEvalMeetingSumm/LEval_meetingsumm_gen_6c03d0.py new file mode 100644 index 00000000..9582d5fc --- /dev/null +++ b/configs/datasets/LEvalMeetingSumm/LEval_meetingsumm_gen_6c03d0.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator +from opencompass.datasets import LEvalMeetingSummDataset + +LEval_meetingsumm_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_meetingsumm_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512) +) + +LEval_meetingsumm_eval_cfg = dict( + evaluator=dict(type=RougeEvaluator), + pred_role='BOT' +) + +LEval_meetingsumm_datasets = [ + dict( + type=LEvalMeetingSummDataset, + abbr='LEval_meeting_summ', + path='L4NLP/LEval', + name='meeting_summ', + reader_cfg=LEval_meetingsumm_reader_cfg, + infer_cfg=LEval_meetingsumm_infer_cfg, + eval_cfg=LEval_meetingsumm_eval_cfg) +] diff --git a/configs/datasets/LEvalMultidocQA/LEval_multidocqa_gen.py b/configs/datasets/LEvalMultidocQA/LEval_multidocqa_gen.py new file mode 100644 index 00000000..e86cf38c --- /dev/null +++ b/configs/datasets/LEvalMultidocQA/LEval_multidocqa_gen.py @@ 
-0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_multidocqa_gen_87dc85 import LEval_multidocqa_datasets # noqa: F401, F403 diff --git a/configs/datasets/LEvalMultidocQA/LEval_multidocqa_gen_87dc85.py b/configs/datasets/LEvalMultidocQA/LEval_multidocqa_gen_87dc85.py new file mode 100644 index 00000000..63ab7404 --- /dev/null +++ b/configs/datasets/LEvalMultidocQA/LEval_multidocqa_gen_87dc85.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator +from opencompass.datasets import LEvalMultidocQADataset + +LEval_multidocqa_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_multidocqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\nQuestion: {question}?\nAnswer:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=64) +) + +LEval_multidocqa_eval_cfg = dict( + evaluator=dict(type=RougeEvaluator), + pred_role='BOT' +) + +LEval_multidocqa_datasets = [ + dict( + type=LEvalMultidocQADataset, + abbr='LEval_multidocqa', + path='L4NLP/LEval', + name='multidoc_qa', + reader_cfg=LEval_multidocqa_reader_cfg, + infer_cfg=LEval_multidocqa_infer_cfg, + eval_cfg=LEval_multidocqa_eval_cfg) +] diff --git a/configs/datasets/LEvalNarrativeQA/LEval_narrativeqa_gen.py b/configs/datasets/LEvalNarrativeQA/LEval_narrativeqa_gen.py new file mode 100644 index 00000000..e4b04c23 --- /dev/null +++ b/configs/datasets/LEvalNarrativeQA/LEval_narrativeqa_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_narrativeqa_gen_9fec98 import LEval_narrativeqa_datasets # noqa: F401, F403 diff --git a/configs/datasets/LEvalNarrativeQA/LEval_narrativeqa_gen_9fec98.py b/configs/datasets/LEvalNarrativeQA/LEval_narrativeqa_gen_9fec98.py new file mode 100644 index 00000000..49bc152d --- /dev/null +++ b/configs/datasets/LEvalNarrativeQA/LEval_narrativeqa_gen_9fec98.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator +from opencompass.datasets import LEvalNarrativeQADataset + +LEval_narrativeqa_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_narrativeqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\nQuestion: {question}?\nAnswer:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=50) +) + +LEval_narrativeqa_eval_cfg = dict( + evaluator=dict(type=RougeEvaluator), + pred_role='BOT' +) + +LEval_narrativeqa_datasets = [ + dict( + type=LEvalNarrativeQADataset, + abbr='LEval_narrativeqa', + path='L4NLP/LEval', + name='narrative_qa', + reader_cfg=LEval_narrativeqa_reader_cfg, + infer_cfg=LEval_narrativeqa_infer_cfg, + eval_cfg=LEval_narrativeqa_eval_cfg) +] diff --git 
a/configs/datasets/LEvalNaturalQuestion/LEval_naturalquestion_gen.py b/configs/datasets/LEvalNaturalQuestion/LEval_naturalquestion_gen.py new file mode 100644 index 00000000..19da74e3 --- /dev/null +++ b/configs/datasets/LEvalNaturalQuestion/LEval_naturalquestion_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_naturalquestion_gen_9fec98 import LEval_nq_datasets # noqa: F401, F403 diff --git a/configs/datasets/LEvalNaturalQuestion/LEval_naturalquestion_gen_9fec98.py b/configs/datasets/LEvalNaturalQuestion/LEval_naturalquestion_gen_9fec98.py new file mode 100644 index 00000000..e262e5ca --- /dev/null +++ b/configs/datasets/LEvalNaturalQuestion/LEval_naturalquestion_gen_9fec98.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator +from opencompass.datasets import LEvalNaturalQuestionDataset + +LEval_nq_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_nq_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\nQuestion: {question}?\nAnswer:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=50) +) + +LEval_nq_eval_cfg = dict( + evaluator=dict(type=RougeEvaluator), + pred_role='BOT' +) + +LEval_nq_datasets = [ + dict( + type=LEvalNaturalQuestionDataset, + abbr='LEval_nq', + path='L4NLP/LEval', + name='natural_question', + reader_cfg=LEval_nq_reader_cfg, + infer_cfg=LEval_nq_infer_cfg, + eval_cfg=LEval_nq_eval_cfg) +] diff --git a/configs/datasets/LEvalNewsSumm/LEval_newssumm_gen.py b/configs/datasets/LEvalNewsSumm/LEval_newssumm_gen.py new file mode 100644 index 00000000..2faab902 --- /dev/null +++ b/configs/datasets/LEvalNewsSumm/LEval_newssumm_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_newssumm_gen_db3565 import LEval_newssumm_datasets # noqa: F401, F403 diff --git a/configs/datasets/LEvalNewsSumm/LEval_newssumm_gen_db3565.py b/configs/datasets/LEvalNewsSumm/LEval_newssumm_gen_db3565.py new file mode 100644 index 00000000..3033c5f4 --- /dev/null +++ b/configs/datasets/LEvalNewsSumm/LEval_newssumm_gen_db3565.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator +from opencompass.datasets import LEvalNewsSummDataset + +LEval_newssumm_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_newssumm_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\n{question}\nTL;DR:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512) +) + +LEval_newssumm_eval_cfg = dict( + evaluator=dict(type=RougeEvaluator), + pred_role='BOT' +) + +LEval_newssumm_datasets = [ + dict( + type=LEvalNewsSummDataset, + abbr='LEval_news_summ', + 
path='L4NLP/LEval', + name='news_summ', + reader_cfg=LEval_newssumm_reader_cfg, + infer_cfg=LEval_newssumm_infer_cfg, + eval_cfg=LEval_newssumm_eval_cfg) +] diff --git a/configs/datasets/LEvalPaperAssistant/LEval_paper_assistant_gen.py b/configs/datasets/LEvalPaperAssistant/LEval_paper_assistant_gen.py new file mode 100644 index 00000000..f0a42825 --- /dev/null +++ b/configs/datasets/LEvalPaperAssistant/LEval_paper_assistant_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_paper_assistant_gen_6c03d0 import LEval_ps_summ_datasets # noqa: F401, F403 diff --git a/configs/datasets/LEvalPaperAssistant/LEval_paper_assistant_gen_6c03d0.py b/configs/datasets/LEvalPaperAssistant/LEval_paper_assistant_gen_6c03d0.py new file mode 100644 index 00000000..5d59acce --- /dev/null +++ b/configs/datasets/LEvalPaperAssistant/LEval_paper_assistant_gen_6c03d0.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator +from opencompass.datasets import LEvalPaperAssistantDataset + +LEval_ps_summ_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_ps_summ_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512) +) + +LEval_ps_summ_eval_cfg = dict( + evaluator=dict(type=RougeEvaluator), + pred_role='BOT' +) + +LEval_ps_summ_datasets = [ + dict( + type=LEvalPaperAssistantDataset, + abbr='LEval_paper_assistant', + path='L4NLP/LEval', + name='paper_assistant', + reader_cfg=LEval_ps_summ_reader_cfg, + infer_cfg=LEval_ps_summ_infer_cfg, + eval_cfg=LEval_ps_summ_eval_cfg) +] diff --git a/configs/datasets/LEvalPatentSumm/LEval_patent_summ_gen.py b/configs/datasets/LEvalPatentSumm/LEval_patent_summ_gen.py new file mode 100644 index 00000000..ac78849c --- /dev/null +++ b/configs/datasets/LEvalPatentSumm/LEval_patent_summ_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_patent_summ_gen_db3565 import LEval_patent_summ_datasets # noqa: F401, F403 diff --git a/configs/datasets/LEvalPatentSumm/LEval_patent_summ_gen_db3565.py b/configs/datasets/LEvalPatentSumm/LEval_patent_summ_gen_db3565.py new file mode 100644 index 00000000..156e08cd --- /dev/null +++ b/configs/datasets/LEvalPatentSumm/LEval_patent_summ_gen_db3565.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator +from opencompass.datasets import LEvalPatentSummDataset + +LEval_patent_summ_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_patent_summ_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\n{question}\nTL;DR:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + 
inferencer=dict(type=GenInferencer, max_out_len=512) +) + +LEval_patent_summ_eval_cfg = dict( + evaluator=dict(type=RougeEvaluator), + pred_role='BOT' +) + +LEval_patent_summ_datasets = [ + dict( + type=LEvalPatentSummDataset, + abbr='LEval_patent_summ', + path='L4NLP/LEval', + name='patent_summ', + reader_cfg=LEval_patent_summ_reader_cfg, + infer_cfg=LEval_patent_summ_infer_cfg, + eval_cfg=LEval_patent_summ_eval_cfg) +] diff --git a/configs/datasets/LEvalQuality/LEval_quality_gen.py b/configs/datasets/LEvalQuality/LEval_quality_gen.py new file mode 100644 index 00000000..576ea8fc --- /dev/null +++ b/configs/datasets/LEvalQuality/LEval_quality_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_quality_gen_bd35f4 import LEval_quality_datasets # noqa: F401, F403 diff --git a/configs/datasets/LEvalQuality/LEval_quality_gen_bd35f4.py b/configs/datasets/LEvalQuality/LEval_quality_gen_bd35f4.py new file mode 100644 index 00000000..914d13d2 --- /dev/null +++ b/configs/datasets/LEvalQuality/LEval_quality_gen_bd35f4.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator, AccEvaluator +from opencompass.datasets import LEvalQualityDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi + +LEval_quality_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_quality_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=10) +) + +LEval_quality_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_capital_postprocess), + pred_role='BOT' +) + +LEval_quality_datasets = [ + dict( + type=LEvalQualityDataset, + abbr='LEval_quality', + path='L4NLP/LEval', + name='quality', + reader_cfg=LEval_quality_reader_cfg, + infer_cfg=LEval_quality_infer_cfg, + eval_cfg=LEval_quality_eval_cfg) +] diff --git a/configs/datasets/LEvalReviewSumm/LEval_review_summ_gen.py b/configs/datasets/LEvalReviewSumm/LEval_review_summ_gen.py new file mode 100644 index 00000000..c472db9d --- /dev/null +++ b/configs/datasets/LEvalReviewSumm/LEval_review_summ_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_review_summ_gen_6c03d0 import LEval_review_summ_datasets # noqa: F401, F403 diff --git a/configs/datasets/LEvalReviewSumm/LEval_review_summ_gen_6c03d0.py b/configs/datasets/LEvalReviewSumm/LEval_review_summ_gen_6c03d0.py new file mode 100644 index 00000000..bb54015c --- /dev/null +++ b/configs/datasets/LEvalReviewSumm/LEval_review_summ_gen_6c03d0.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator +from opencompass.datasets import LEvalReviewSummDataset + +LEval_review_summ_reader_cfg = dict( + input_columns=['context', 'question'], + 
output_column='answer', + train_split='test', + test_split='test' +) + +LEval_review_summ_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512) +) + +LEval_review_summ_eval_cfg = dict( + evaluator=dict(type=RougeEvaluator), + pred_role='BOT' +) + +LEval_review_summ_datasets = [ + dict( + type=LEvalReviewSummDataset, + abbr='LEval_review_summ', + path='L4NLP/LEval', + name='review_summ', + reader_cfg=LEval_review_summ_reader_cfg, + infer_cfg=LEval_review_summ_infer_cfg, + eval_cfg=LEval_review_summ_eval_cfg) +] diff --git a/configs/datasets/LEvalScientificQA/LEval_scientificqa_gen.py b/configs/datasets/LEvalScientificQA/LEval_scientificqa_gen.py new file mode 100644 index 00000000..df00c568 --- /dev/null +++ b/configs/datasets/LEvalScientificQA/LEval_scientificqa_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_scientificqa_gen_0c6e71 import LEval_scientificqa_datasets # noqa: F401, F403 diff --git a/configs/datasets/LEvalScientificQA/LEval_scientificqa_gen_0c6e71.py b/configs/datasets/LEvalScientificQA/LEval_scientificqa_gen_0c6e71.py new file mode 100644 index 00000000..05243a6f --- /dev/null +++ b/configs/datasets/LEvalScientificQA/LEval_scientificqa_gen_0c6e71.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator +from opencompass.datasets import LEvalScientificQADataset + +LEval_scientificqa_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_scientificqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=64) +) + +LEval_scientificqa_eval_cfg = dict( + evaluator=dict(type=RougeEvaluator), + pred_role='BOT' +) + +LEval_scientificqa_datasets = [ + dict( + type=LEvalScientificQADataset, + abbr='LEval_scientificqa', + path='L4NLP/LEval', + name='scientific_qa', + reader_cfg=LEval_scientificqa_reader_cfg, + infer_cfg=LEval_scientificqa_infer_cfg, + eval_cfg=LEval_scientificqa_eval_cfg) +] diff --git a/configs/datasets/LEvalTPO/LEval_tpo_gen.py b/configs/datasets/LEvalTPO/LEval_tpo_gen.py new file mode 100644 index 00000000..82787d06 --- /dev/null +++ b/configs/datasets/LEvalTPO/LEval_tpo_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_tpo_gen_bd35f4 import LEval_tpo_datasets diff --git a/configs/datasets/LEvalTPO/LEval_tpo_gen_bd35f4.py b/configs/datasets/LEvalTPO/LEval_tpo_gen_bd35f4.py new file mode 100644 index 00000000..3bdd3acf --- /dev/null +++ b/configs/datasets/LEvalTPO/LEval_tpo_gen_bd35f4.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator, 
AccEvaluator +from opencompass.datasets import LEvalTPODataset +from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi + +LEval_tpo_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_tpo_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=10) +) + +LEval_tpo_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), + pred_postprocessor=dict(type=first_capital_postprocess), + pred_role='BOT' +) + +LEval_tpo_datasets = [ + dict( + type=LEvalTPODataset, + abbr='LEval_tpo', + path='L4NLP/LEval', + name='tpo', + reader_cfg=LEval_tpo_reader_cfg, + infer_cfg=LEval_tpo_infer_cfg, + eval_cfg=LEval_tpo_eval_cfg) +] diff --git a/configs/datasets/LEvalTVShowSumm/.ipynb_checkpoints/LEval_tvshow_summ_gen-checkpoint.py b/configs/datasets/LEvalTVShowSumm/.ipynb_checkpoints/LEval_tvshow_summ_gen-checkpoint.py new file mode 100644 index 00000000..23a1d529 --- /dev/null +++ b/configs/datasets/LEvalTVShowSumm/.ipynb_checkpoints/LEval_tvshow_summ_gen-checkpoint.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_tvshow_summ_gen_rouge import LEval_tvshow_summ_datasets # noqa: F401, F403 diff --git a/configs/datasets/LEvalTVShowSumm/.ipynb_checkpoints/LEval_tvshow_summ_gen_rouge-checkpoint.py b/configs/datasets/LEvalTVShowSumm/.ipynb_checkpoints/LEval_tvshow_summ_gen_rouge-checkpoint.py new file mode 100644 index 00000000..a915900f --- /dev/null +++ b/configs/datasets/LEvalTVShowSumm/.ipynb_checkpoints/LEval_tvshow_summ_gen_rouge-checkpoint.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator +from opencompass.datasets import LEvalTVShowSummDataset + +LEval_tvshow_summ_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_tvshow_summ_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\nQuestion: {question}'), + dict(role='BOT', prompt='TL;DR:'), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512) +) + +LEval_tvshow_summ_eval_cfg = dict( + evaluator=dict(type=RougeEvaluator), + pred_role='BOT' +) + +LEval_tvshow_summ_datasets = [ + dict( + type=LEvalTVShowSummDataset, + abbr='LEval_tvshow_summ', + path='L4NLP/LEval', + name='tv_show_summ', + reader_cfg=LEval_tvshow_summ_reader_cfg, + infer_cfg=LEval_tvshow_summ_infer_cfg, + eval_cfg=LEval_tvshow_summ_eval_cfg) +] diff --git a/configs/datasets/LEvalTVShowSumm/LEval_tvshow_summ_gen.py b/configs/datasets/LEvalTVShowSumm/LEval_tvshow_summ_gen.py new file mode 100644 index 00000000..3b815ad5 --- /dev/null +++ b/configs/datasets/LEvalTVShowSumm/LEval_tvshow_summ_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_tvshow_summ_gen_049a5c import LEval_tvshow_summ_datasets # noqa: F401, F403 diff --git 
a/configs/datasets/LEvalTVShowSumm/LEval_tvshow_summ_gen_049a5c.py b/configs/datasets/LEvalTVShowSumm/LEval_tvshow_summ_gen_049a5c.py new file mode 100644 index 00000000..86c36677 --- /dev/null +++ b/configs/datasets/LEvalTVShowSumm/LEval_tvshow_summ_gen_049a5c.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator +from opencompass.datasets import LEvalTVShowSummDataset + +LEval_tvshow_summ_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_tvshow_summ_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nTL;DR:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512) +) + +LEval_tvshow_summ_eval_cfg = dict( + evaluator=dict(type=RougeEvaluator), + pred_role='BOT' +) + +LEval_tvshow_summ_datasets = [ + dict( + type=LEvalTVShowSummDataset, + abbr='LEval_tvshow_summ', + path='L4NLP/LEval', + name='tv_show_summ', + reader_cfg=LEval_tvshow_summ_reader_cfg, + infer_cfg=LEval_tvshow_summ_infer_cfg, + eval_cfg=LEval_tvshow_summ_eval_cfg) +] diff --git a/configs/datasets/LEvalTopicRetrieval/LEval_topic_retrieval_gen.py b/configs/datasets/LEvalTopicRetrieval/LEval_topic_retrieval_gen.py new file mode 100644 index 00000000..b5b2977f --- /dev/null +++ b/configs/datasets/LEvalTopicRetrieval/LEval_topic_retrieval_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .LEval_topic_retrieval_gen_af0562 import LEval_tr_datasets diff --git a/configs/datasets/LEvalTopicRetrieval/LEval_topic_retrieval_gen_af0562.py b/configs/datasets/LEvalTopicRetrieval/LEval_topic_retrieval_gen_af0562.py new file mode 100644 index 00000000..e4870a50 --- /dev/null +++ b/configs/datasets/LEvalTopicRetrieval/LEval_topic_retrieval_gen_af0562.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator, AccEvaluator +from opencompass.datasets import LEvalTopicRetrievalDataset +from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi, general_postprocess + +LEval_tr_reader_cfg = dict( + input_columns=['context', 'question'], + output_column='answer', + train_split='test', + test_split='test' +) + +LEval_tr_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'), + dict(role='BOT', prompt=''), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=30) +) + +LEval_tr_eval_cfg = dict( + evaluator=dict(type=EMEvaluator), + pred_postprocessor=dict(type=general_postprocess), + pred_role='BOT' +) + +LEval_tr_datasets = [ + dict( + type=LEvalTopicRetrievalDataset, + abbr='LEval_topic_retrieval', + path='L4NLP/LEval', + name='topic_retrieval_longchat', + reader_cfg=LEval_tr_reader_cfg, + infer_cfg=LEval_tr_infer_cfg, + eval_cfg=LEval_tr_eval_cfg) +] diff --git 
a/configs/datasets/agieval/agieval_gen.py b/configs/datasets/agieval/agieval_gen.py index 7cf48417..6af2dc6a 100644 --- a/configs/datasets/agieval/agieval_gen.py +++ b/configs/datasets/agieval/agieval_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .agieval_gen_397d81 import agieval_datasets # noqa: F401, F403 + from .agieval_gen_64afd3 import agieval_datasets # noqa: F401, F403 diff --git a/configs/datasets/agieval/agieval_gen_397d81.py b/configs/datasets/agieval/agieval_gen_64afd3.py similarity index 100% rename from configs/datasets/agieval/agieval_gen_397d81.py rename to configs/datasets/agieval/agieval_gen_64afd3.py diff --git a/configs/datasets/agieval/agieval_gen_0a9ace.py b/configs/datasets/agieval/agieval_gen_a0c741.py similarity index 100% rename from configs/datasets/agieval/agieval_gen_0a9ace.py rename to configs/datasets/agieval/agieval_gen_a0c741.py diff --git a/configs/datasets/bbh/bbh_gen.py b/configs/datasets/bbh/bbh_gen.py index cb9dff44..03768981 100644 --- a/configs/datasets/bbh/bbh_gen.py +++ b/configs/datasets/bbh/bbh_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .bbh_gen_5b92b0 import bbh_datasets # noqa: F401, F403 + from .bbh_gen_6bd693 import bbh_datasets # noqa: F401, F403 diff --git a/configs/datasets/bbh/bbh_gen_5b92b0.py b/configs/datasets/bbh/bbh_gen_6bd693.py similarity index 100% rename from configs/datasets/bbh/bbh_gen_5b92b0.py rename to configs/datasets/bbh/bbh_gen_6bd693.py diff --git a/configs/datasets/cmmlu/cmmlu_gen.py b/configs/datasets/cmmlu/cmmlu_gen.py index 0245f871..7f3baa9f 100644 --- a/configs/datasets/cmmlu/cmmlu_gen.py +++ b/configs/datasets/cmmlu/cmmlu_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .cmmlu_gen_ffe7c0 import cmmlu_datasets # noqa: F401, F403 + from .cmmlu_gen_c13365 import cmmlu_datasets # noqa: F401, F403 diff --git a/configs/datasets/cmmlu/cmmlu_gen_ffe7c0.py b/configs/datasets/cmmlu/cmmlu_gen_c13365.py similarity index 100% rename from configs/datasets/cmmlu/cmmlu_gen_ffe7c0.py rename to configs/datasets/cmmlu/cmmlu_gen_c13365.py diff --git a/configs/datasets/cmmlu/cmmlu_ppl.py b/configs/datasets/cmmlu/cmmlu_ppl.py index 645494f8..65811c1b 100644 --- a/configs/datasets/cmmlu/cmmlu_ppl.py +++ b/configs/datasets/cmmlu/cmmlu_ppl.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .cmmlu_ppl_fd1f2f import cmmlu_datasets # noqa: F401, F403 + from .cmmlu_ppl_8b9c76 import cmmlu_datasets # noqa: F401, F403 diff --git a/configs/datasets/cmmlu/cmmlu_ppl_fd1f2f.py b/configs/datasets/cmmlu/cmmlu_ppl_8b9c76.py similarity index 100% rename from configs/datasets/cmmlu/cmmlu_ppl_fd1f2f.py rename to configs/datasets/cmmlu/cmmlu_ppl_8b9c76.py diff --git a/configs/datasets/collections/base_medium.py b/configs/datasets/collections/base_medium.py index 9a9962f3..63954833 100644 --- a/configs/datasets/collections/base_medium.py +++ b/configs/datasets/collections/base_medium.py @@ -5,7 +5,7 @@ with read_base(): from ..ceval.ceval_ppl_578f8d import ceval_datasets from ..agieval.agieval_mixed_2f14ad import agieval_datasets from ..GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets - from ..bbh.bbh_gen_5b92b0 import bbh_datasets + from ..bbh.bbh_gen_6bd693 import bbh_datasets from ..humaneval.humaneval_gen_8e312c import humaneval_datasets from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets diff --git 
a/configs/datasets/collections/base_small.py b/configs/datasets/collections/base_small.py index a038ad39..4b762073 100644 --- a/configs/datasets/collections/base_small.py +++ b/configs/datasets/collections/base_small.py @@ -2,7 +2,7 @@ from mmengine.config import read_base with read_base(): from ..ceval.ceval_ppl_578f8d import ceval_datasets - from ..bbh.bbh_gen_5b92b0 import bbh_datasets + from ..bbh.bbh_gen_6bd693 import bbh_datasets from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets from ..CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets diff --git a/configs/datasets/collections/chat_medium.py b/configs/datasets/collections/chat_medium.py index 6b63538d..577e3b0b 100644 --- a/configs/datasets/collections/chat_medium.py +++ b/configs/datasets/collections/chat_medium.py @@ -3,9 +3,9 @@ from mmengine.config import read_base with read_base(): from ..mmlu.mmlu_gen_a484b3 import mmlu_datasets from ..ceval.ceval_gen_5f30c7 import ceval_datasets - from ..agieval.agieval_gen_397d81 import agieval_datasets + from ..agieval.agieval_gen_64afd3 import agieval_datasets from ..GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets - from ..bbh.bbh_gen_5b92b0 import bbh_datasets + from ..bbh.bbh_gen_6bd693 import bbh_datasets from ..humaneval.humaneval_gen_8e312c import humaneval_datasets from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets from ..CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets diff --git a/configs/datasets/collections/chat_small.py b/configs/datasets/collections/chat_small.py index 89fe4b8b..b8ab4dd7 100644 --- a/configs/datasets/collections/chat_small.py +++ b/configs/datasets/collections/chat_small.py @@ -3,7 +3,7 @@ from mmengine.config import read_base with read_base(): from ..mmlu.mmlu_gen_a484b3 import mmlu_datasets from ..ceval.ceval_gen_5f30c7 import ceval_datasets - from ..bbh.bbh_gen_5b92b0 import bbh_datasets + from ..bbh.bbh_gen_6bd693 import bbh_datasets from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets from ..CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets @@ -35,6 +35,6 @@ with read_base(): from ..obqa.obqa_gen_9069e4 import obqa_datasets from ..nq.nq_gen_c788f6 import nq_datasets from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from ..crowspairs.crowspairs_gen_21f7cb import crowspairs_datasets + from ..crowspairs.crowspairs_gen_381af0 import crowspairs_datasets datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/crowspairs/crowspairs_gen.py b/configs/datasets/crowspairs/crowspairs_gen.py index 912f56bf..eea2eaf6 100644 --- a/configs/datasets/crowspairs/crowspairs_gen.py +++ b/configs/datasets/crowspairs/crowspairs_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .crowspairs_gen_21f7cb import crowspairs_datasets # noqa: F401, F403 + from .crowspairs_gen_381af0 import crowspairs_datasets # noqa: F401, F403 diff --git a/configs/datasets/crowspairs/crowspairs_gen_21f7cb.py b/configs/datasets/crowspairs/crowspairs_gen_381af0.py similarity index 100% rename from configs/datasets/crowspairs/crowspairs_gen_21f7cb.py rename to configs/datasets/crowspairs/crowspairs_gen_381af0.py diff --git a/configs/datasets/cvalues/cvalues_responsibility_gen.py b/configs/datasets/cvalues/cvalues_responsibility_gen.py index 5cd4ee07..613edbda 100644 --- a/configs/datasets/cvalues/cvalues_responsibility_gen.py +++ 
b/configs/datasets/cvalues/cvalues_responsibility_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .cvalues_responsibility_gen_4aec9f import cvalues_datasets # noqa: F401, F403 + from .cvalues_responsibility_gen_543378 import cvalues_datasets # noqa: F401, F403 diff --git a/configs/datasets/cvalues/cvalues_responsibility_gen_4aec9f.py b/configs/datasets/cvalues/cvalues_responsibility_gen_543378.py similarity index 100% rename from configs/datasets/cvalues/cvalues_responsibility_gen_4aec9f.py rename to configs/datasets/cvalues/cvalues_responsibility_gen_543378.py diff --git a/configs/eval_LEval.py b/configs/eval_LEval.py new file mode 100644 index 00000000..16b4696f --- /dev/null +++ b/configs/eval_LEval.py @@ -0,0 +1,48 @@ +from mmengine.config import read_base +from opencompass.models import HuggingFaceCausalLM +import torch + +# long context evaluation tasks +with read_base(): + from .datasets.LEvalNaturalQuestion.LEval_naturalquestion_gen import LEval_nq_datasets + from .datasets.LEvalNarrativeQA.LEval_narrativeqa_gen import LEval_narrativeqa_datasets + from .datasets.LEvalMultidocQA.LEval_multidocqa_gen import LEval_multidocqa_datasets + from .datasets.LEvalCoursera.LEval_coursera_gen import LEval_coursera_datasets + from .datasets.LEvalTPO.LEval_tpo_gen import LEval_tpo_datasets + from .datasets.LEvalQuality.LEval_quality_gen import LEval_quality_datasets + from .datasets.LEvalGSM100.LEval_gsm100_gen import LEval_gsm100_datasets + from .datasets.LEvalTopicRetrieval.LEval_topic_retrieval_gen import LEval_tr_datasets + from .datasets.LEvalFinancialQA.LEval_financialqa_gen import LEval_financialqa_datasets + from .datasets.LEvalGovReportSumm.LEval_gov_report_summ_gen import LEval_govreport_summ_datasets + from .datasets.LEvalLegalContractQA.LEval_legalcontractqa_gen import LEval_legalqa_datasets + from .datasets.LEvalMeetingSumm.LEval_meetingsumm_gen import LEval_meetingsumm_datasets + from .datasets.LEvalNewsSumm.LEval_newssumm_gen import LEval_newssumm_datasets + from .datasets.LEvalPaperAssistant.LEval_paper_assistant_gen import LEval_ps_summ_datasets + from .datasets.LEvalPatentSumm.LEval_patent_summ_gen import LEval_patent_summ_datasets + from .datasets.LEvalTVShowSumm.LEval_tvshow_summ_gen import LEval_tvshow_summ_datasets + from .datasets.LEvalScientificQA.LEval_scientificqa_gen import LEval_scientificqa_datasets + from .datasets.LEvalReviewSumm.LEval_review_summ_gen import LEval_review_summ_datasets + # choose a model of interest + # internlm as an example + from .models.hf_internlm_7b import models + # and output the results in a chosen format + from .summarizers.LEval import summarizer +
+datasets = [*LEval_coursera_datasets, + *LEval_tpo_datasets, + *LEval_quality_datasets, + *LEval_gsm100_datasets, + *LEval_tr_datasets, + *LEval_financialqa_datasets, + *LEval_govreport_summ_datasets, + *LEval_legalqa_datasets, + *LEval_meetingsumm_datasets, + *LEval_multidocqa_datasets, + *LEval_narrativeqa_datasets, + *LEval_nq_datasets, + *LEval_newssumm_datasets, + *LEval_patent_summ_datasets, + *LEval_tvshow_summ_datasets, + *LEval_scientificqa_datasets, + *LEval_review_summ_datasets, + *LEval_ps_summ_datasets] diff --git a/configs/summarizers/LEval.py b/configs/summarizers/LEval.py new file mode 100644 index 00000000..2c925318 --- /dev/null +++ b/configs/summarizers/LEval.py @@ -0,0 +1,29 @@ +summarizer = dict( + dataset_abbrs = [ + '--------- LEval Exact Match (Acc) ---------', # category + "LEval_coursera", + 'LEval_gsm100', + 'LEval_quality', +
"LEval_tpo", + 'LEval_topic_retrieval', + '--------- LEval Gen (ROUGE) ---------', # category + 'LEval_financialqa', + 'LEval_gov_report_summ', + 'LEval_legal_contract_qa', + 'LEval_meeting_summ', + 'LEval_multidocqa', + 'LEval_narrativeqa', + 'LEval_nq', + 'LEval_news_summ', + 'LEval_paper_assistant', + 'LEval_patent_summ', + 'LEval_review_summ', + 'LEval_scientificqa', + 'LEval_tvshow_summ' + ], + summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []), + prompt_db=dict( + database_path='configs/datasets/log.json', + config_dir='configs/datasets', + blacklist='.promptignore'), +) diff --git a/opencompass/datasets/LEval_coursera.py b/opencompass/datasets/LEval_coursera.py new file mode 100644 index 00000000..1182795d --- /dev/null +++ b/opencompass/datasets/LEval_coursera.py @@ -0,0 +1,27 @@ +from datasets import Dataset, load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class LEvalCourseraDataset(BaseDataset): + + @staticmethod + def load(**kwargs): + dataset = load_dataset(**kwargs) + split = 'test' + raw_data = [] + for i in range(len(dataset[split])): + instructions = dataset[split]['instructions'][i] + outputs = dataset[split]['outputs'][i] + context = dataset[split]['input'][i] + for question, answer in zip(instructions, outputs): + raw_data.append({ + 'question': question, + 'context': context, + 'answer': answer + }) + dataset[split] = Dataset.from_list(raw_data) + return dataset diff --git a/opencompass/datasets/LEval_financial_qa.py b/opencompass/datasets/LEval_financial_qa.py new file mode 100644 index 00000000..b5bd0b0a --- /dev/null +++ b/opencompass/datasets/LEval_financial_qa.py @@ -0,0 +1,27 @@ +from datasets import Dataset, load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class LEvalFinancialQADataset(BaseDataset): + + @staticmethod + def load(**kwargs): + dataset = load_dataset(**kwargs) + split = 'test' + raw_data = [] + for i in range(len(dataset[split])): + instructions = dataset[split]['instructions'][i] + outputs = dataset[split]['outputs'][i] + context = dataset[split]['input'][i] + for question, answer in zip(instructions, outputs): + raw_data.append({ + 'question': question, + 'context': context, + 'answer': answer + }) + dataset[split] = Dataset.from_list(raw_data) + return dataset diff --git a/opencompass/datasets/LEval_gov_report_summ.py b/opencompass/datasets/LEval_gov_report_summ.py new file mode 100644 index 00000000..4eac58c8 --- /dev/null +++ b/opencompass/datasets/LEval_gov_report_summ.py @@ -0,0 +1,27 @@ +from datasets import Dataset, load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class LEvalGovReportSummDataset(BaseDataset): + + @staticmethod + def load(**kwargs): + dataset = load_dataset(**kwargs) + split = 'test' + raw_data = [] + for i in range(len(dataset[split])): + instructions = dataset[split]['instructions'][i] + outputs = dataset[split]['outputs'][i] + context = dataset[split]['input'][i] + for question, answer in zip(instructions, outputs): + raw_data.append({ + 'question': question, + 'context': context, + 'answer': answer + }) + dataset[split] = Dataset.from_list(raw_data) + return dataset diff --git a/opencompass/datasets/LEval_gsm100.py b/opencompass/datasets/LEval_gsm100.py new file mode 100644 index 00000000..086347d2 --- /dev/null +++ 
+++ b/opencompass/datasets/LEval_gsm100.py
@@ -0,0 +1,58 @@
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
+
+from .base import BaseDataset
+
+
+@TEXT_POSTPROCESSORS.register_module('gsm100_dataset')
+def gsm100_dataset_postprocess(text: str) -> str:
+    return text.replace(',', '')
+
+
+@TEXT_POSTPROCESSORS.register_module('gsm100')
+def gsm100_postprocess(text: str) -> str:
+    # text = text.split('\n\n')[0]
+    segs = text.split('The answer is')
+    if len(segs) < 2:
+        return ''
+    text = segs[1]
+    words = text.split(' ')
+    flag = False
+    ret = ''
+    for i in range(len(words)):
+        s = words[i]
+        for j in range(len(s)):
+            if s[j].isdigit():
+                flag = True
+                ret = s
+                break
+        if flag:
+            break
+    ret1 = ''
+    for i in range(len(ret)):
+        if ret[i].isdigit():
+            ret1 += ret[i]
+    return ret1
+
+
+@LOAD_DATASET.register_module()
+class LEvalGSM100Dataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            instructions = dataset[split]['instructions'][i]
+            outputs = dataset[split]['outputs'][i]
+            context = dataset[split]['input'][i]
+            for question, answer in zip(instructions, outputs):
+                raw_data.append({
+                    'question': question,
+                    'context': context,
+                    'answer': answer
+                })
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/LEval_legal_contract_qa.py b/opencompass/datasets/LEval_legal_contract_qa.py
new file mode 100644
index 00000000..872c4a4a
--- /dev/null
+++ b/opencompass/datasets/LEval_legal_contract_qa.py
@@ -0,0 +1,27 @@
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LEvalLegalContractQADataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            instructions = dataset[split]['instructions'][i]
+            outputs = dataset[split]['outputs'][i]
+            context = dataset[split]['input'][i]
+            for question, answer in zip(instructions, outputs):
+                raw_data.append({
+                    'question': question,
+                    'context': context,
+                    'answer': answer
+                })
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/LEval_meeting_summ.py b/opencompass/datasets/LEval_meeting_summ.py
new file mode 100644
index 00000000..626636bd
--- /dev/null
+++ b/opencompass/datasets/LEval_meeting_summ.py
@@ -0,0 +1,27 @@
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LEvalMeetingSummDataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            instructions = dataset[split]['instructions'][i]
+            outputs = dataset[split]['outputs'][i]
+            context = dataset[split]['input'][i]
+            for question, answer in zip(instructions, outputs):
+                raw_data.append({
+                    'question': question,
+                    'context': context,
+                    'answer': answer
+                })
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/LEval_multidoc_qa.py b/opencompass/datasets/LEval_multidoc_qa.py
new file mode 100644
index 00000000..1eaed353
--- /dev/null
+++ b/opencompass/datasets/LEval_multidoc_qa.py
@@ -0,0 +1,27 @@
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LEvalMultidocQADataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            instructions = dataset[split]['instructions'][i]
+            outputs = dataset[split]['outputs'][i]
+            context = dataset[split]['input'][i]
+            for question, answer in zip(instructions, outputs):
+                raw_data.append({
+                    'question': question,
+                    'context': context,
+                    'answer': answer
+                })
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/LEval_narrattive_qa.py b/opencompass/datasets/LEval_narrattive_qa.py
new file mode 100644
index 00000000..39dea745
--- /dev/null
+++ b/opencompass/datasets/LEval_narrattive_qa.py
@@ -0,0 +1,27 @@
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LEvalNarrativeQADataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            instructions = dataset[split]['instructions'][i]
+            outputs = dataset[split]['outputs'][i]
+            context = dataset[split]['input'][i]
+            for question, answer in zip(instructions, outputs):
+                raw_data.append({
+                    'question': question,
+                    'context': context,
+                    'answer': answer
+                })
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/LEval_natural_question.py b/opencompass/datasets/LEval_natural_question.py
new file mode 100644
index 00000000..d5be60e7
--- /dev/null
+++ b/opencompass/datasets/LEval_natural_question.py
@@ -0,0 +1,27 @@
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LEvalNaturalQuestionDataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            instructions = dataset[split]['instructions'][i]
+            outputs = dataset[split]['outputs'][i]
+            context = dataset[split]['input'][i]
+            for question, answer in zip(instructions, outputs):
+                raw_data.append({
+                    'question': question,
+                    'context': context,
+                    'answer': answer
+                })
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/LEval_news_summ.py b/opencompass/datasets/LEval_news_summ.py
new file mode 100644
index 00000000..5558d456
--- /dev/null
+++ b/opencompass/datasets/LEval_news_summ.py
@@ -0,0 +1,27 @@
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LEvalNewsSummDataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            instructions = dataset[split]['instructions'][i]
+            outputs = dataset[split]['outputs'][i]
+            context = dataset[split]['input'][i]
+            for question, answer in zip(instructions, outputs):
+                raw_data.append({
+                    'question': question,
+                    'context': context,
+                    'answer': answer
+                })
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/LEval_paper_assistant.py b/opencompass/datasets/LEval_paper_assistant.py
new file mode 100644
index 00000000..a42d335a
--- /dev/null
+++ b/opencompass/datasets/LEval_paper_assistant.py
@@ -0,0 +1,27 @@
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LEvalPaperAssistantDataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            instructions = dataset[split]['instructions'][i]
+            outputs = dataset[split]['outputs'][i]
+            context = dataset[split]['input'][i]
+            for question, answer in zip(instructions, outputs):
+                raw_data.append({
+                    'question': question,
+                    'context': context,
+                    'answer': answer
+                })
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/LEval_patent_summ.py b/opencompass/datasets/LEval_patent_summ.py
new file mode 100644
index 00000000..28a364b3
--- /dev/null
+++ b/opencompass/datasets/LEval_patent_summ.py
@@ -0,0 +1,27 @@
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LEvalPatentSummDataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            instructions = dataset[split]['instructions'][i]
+            outputs = dataset[split]['outputs'][i]
+            context = dataset[split]['input'][i]
+            for question, answer in zip(instructions, outputs):
+                raw_data.append({
+                    'question': question,
+                    'context': context,
+                    'answer': answer
+                })
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/LEval_quality.py b/opencompass/datasets/LEval_quality.py
new file mode 100644
index 00000000..5b3f7e97
--- /dev/null
+++ b/opencompass/datasets/LEval_quality.py
@@ -0,0 +1,27 @@
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LEvalQualityDataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            instructions = dataset[split]['instructions'][i]
+            outputs = dataset[split]['outputs'][i]
+            context = dataset[split]['input'][i]
+            for question, answer in zip(instructions, outputs):
+                raw_data.append({
+                    'question': question,
+                    'context': context,
+                    'answer': answer[1]
+                })
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/LEval_review_summ.py b/opencompass/datasets/LEval_review_summ.py
new file mode 100644
index 00000000..8fb50da5
--- /dev/null
+++ b/opencompass/datasets/LEval_review_summ.py
@@ -0,0 +1,27 @@
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LEvalReviewSummDataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            instructions = dataset[split]['instructions'][i]
+            outputs = dataset[split]['outputs'][i]
+            context = dataset[split]['input'][i]
+            for question, answer in zip(instructions, outputs):
+                raw_data.append({
+                    'question': question,
+                    'context': context,
+                    'answer': answer
+                })
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/LEval_scientific_qa.py b/opencompass/datasets/LEval_scientific_qa.py
new file mode 100644
index 00000000..e4fe5f33
--- /dev/null
+++ b/opencompass/datasets/LEval_scientific_qa.py
@@ -0,0 +1,27 @@
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LEvalScientificQADataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            instructions = dataset[split]['instructions'][i]
+            outputs = dataset[split]['outputs'][i]
+            context = dataset[split]['input'][i]
+            for question, answer in zip(instructions, outputs):
+                raw_data.append({
+                    'question': question,
+                    'context': context,
+                    'answer': answer
+                })
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/LEval_topic_retrieval.py b/opencompass/datasets/LEval_topic_retrieval.py
new file mode 100644
index 00000000..80043fc9
--- /dev/null
+++ b/opencompass/datasets/LEval_topic_retrieval.py
@@ -0,0 +1,27 @@
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LEvalTopicRetrievalDataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            instructions = dataset[split]['instructions'][i]
+            outputs = dataset[split]['outputs'][i]
+            context = dataset[split]['input'][i]
+            for question, answer in zip(instructions, outputs):
+                raw_data.append({
+                    'question': question,
+                    'context': context,
+                    'answer': answer
+                })
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/LEval_tpo.py b/opencompass/datasets/LEval_tpo.py
new file mode 100644
index 00000000..0d2342ae
--- /dev/null
+++ b/opencompass/datasets/LEval_tpo.py
@@ -0,0 +1,27 @@
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LEvalTPODataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            instructions = dataset[split]['instructions'][i]
+            outputs = dataset[split]['outputs'][i]
+            context = dataset[split]['input'][i]
+            for question, answer in zip(instructions, outputs):
+                raw_data.append({
+                    'question': question,
+                    'context': context,
+                    'answer': answer
+                })
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/LEval_tvshow_summ.py b/opencompass/datasets/LEval_tvshow_summ.py
new file mode 100644
index 00000000..9036362e
--- /dev/null
+++ b/opencompass/datasets/LEval_tvshow_summ.py
@@ -0,0 +1,27 @@
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class LEvalTVShowSummDataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+        split = 'test'
+        raw_data = []
+        for i in range(len(dataset[split])):
+            instructions = dataset[split]['instructions'][i]
+            outputs = dataset[split]['outputs'][i]
+            context = dataset[split]['input'][i]
+            for question, answer in zip(instructions, outputs):
+                raw_data.append({
+                    'question': question,
+                    'context': context,
+                    'answer': answer
+                })
+        dataset[split] = Dataset.from_list(raw_data)
+        return dataset
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index e925f26e..175e8538 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -34,6 +34,24 @@ from .iwslt2017 import *  # noqa: F401, F403
 from .jigsawmultilingual import *  # noqa: F401, F403
 from .lambada import *  # noqa: F401, F403
 from .lcsts import *  # noqa: F401, F403
+from .LEval_coursera import *  # noqa: F401, F403
+from .LEval_financial_qa import *  # noqa: F401, F403
+from .LEval_gov_report_summ import *  # noqa: F401, F403
+from .LEval_gsm100 import *  # noqa: F401, F403
+from .LEval_legal_contract_qa import *  # noqa: F401, F403
+from .LEval_meeting_summ import *  # noqa: F401, F403
+from .LEval_multidoc_qa import *  # noqa: F401, F403
+from .LEval_narrattive_qa import *  # noqa: F401, F403
+from .LEval_natural_question import *  # noqa: F401, F403
+from .LEval_news_summ import *  # noqa: F401, F403
+from .LEval_paper_assistant import *  # noqa: F401, F403
+from .LEval_patent_summ import *  # noqa: F401, F403
+from .LEval_quality import *  # noqa: F401, F403
+from .LEval_review_summ import *  # noqa: F401, F403
+from .LEval_scientific_qa import *  # noqa: F401, F403
+from .LEval_topic_retrieval import *  # noqa: F401, F403
+from .LEval_tpo import *  # noqa: F401, F403
+from .LEval_tvshow_summ import *  # noqa: F401, F403
 from .math import *  # noqa: F401, F403
 from .mbpp import *  # noqa: F401, F403
 from .mmlu import *  # noqa: F401, F403