mirror of https://github.com/open-compass/opencompass.git
[Feature] Add LEval datasets
Co-authored-by: kennymckormick <dhd@pku.edu.cn>
parent 8d9cee060f
commit bf79ff1c6d
configs/datasets/LEvalCoursera/LEval_coursera_gen.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_coursera_gen_5c84a9 import LEval_coursera_datasets # noqa: F401, F403

configs/datasets/LEvalCoursera/LEval_coursera_gen_5c84a9.py (new file, 42 lines)
@@ -0,0 +1,42 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator, AccEvaluator
from opencompass.datasets import LEvalCourseraDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi

LEval_coursera_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_coursera_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\n{question}\nAnswer:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=10)
)

LEval_coursera_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=first_capital_postprocess_multi),
    pred_role='BOT'
)

LEval_coursera_datasets = [
    dict(
        type=LEvalCourseraDataset,
        abbr='LEval_coursera',
        path='L4NLP/LEval',
        name='coursera',
        reader_cfg=LEval_coursera_reader_cfg,
        infer_cfg=LEval_coursera_infer_cfg,
        eval_cfg=LEval_coursera_eval_cfg)
]
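For reference, the HUMAN turn defined above simply concatenates the course material, the multiple-choice question and an 'Answer:' cue; a minimal sketch with placeholder values (illustrative only, not data from L4NLP/LEval):

    # Illustrative sketch of how the prompt template above is filled for one row.
    row = dict(context='<long Coursera lecture transcript>',
               question='Which statements are correct? (A) ... (B) ... (C) ... (D) ...')
    prompt = '{context}\n{question}\nAnswer:'.format(**row)
    # With max_out_len=10 the completion is expected to be a short option string
    # (e.g. 'AC'); first_capital_postprocess_multi presumably keeps only the capital
    # option letters before AccEvaluator compares them with the reference.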
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_financialqa_gen_9f5404 import LEval_financialqa_datasets # noqa: F401, F403

@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator
from opencompass.datasets import LEvalFinancialQADataset

LEval_financialqa_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_financialqa_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\n{question}\nAnswer:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512)
)

LEval_financialqa_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT'
)

LEval_financialqa_datasets = [
    dict(
        type=LEvalFinancialQADataset,
        abbr='LEval_financialqa',
        path='L4NLP/LEval',
        name='financial_qa',
        reader_cfg=LEval_financialqa_reader_cfg,
        infer_cfg=LEval_financialqa_infer_cfg,
        eval_cfg=LEval_financialqa_eval_cfg)
]
configs/datasets/LEvalGSM100/LEval_gsm100_gen.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_gsm100_gen_a4d1f8 import LEval_gsm100_datasets

configs/datasets/LEvalGSM100/LEval_gsm100_gen_a4d1f8.py (new file, 43 lines)
@@ -0,0 +1,43 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator, AccEvaluator
from opencompass.datasets import LEvalGSM100Dataset
from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi
from opencompass.registry import TEXT_POSTPROCESSORS
from opencompass.datasets import gsm100_dataset_postprocess, gsm100_postprocess

LEval_gsm100_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_gsm100_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{question}\n'),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512)
)


LEval_gsm100_eval_cfg = dict(evaluator=dict(type=AccEvaluator),
                             pred_postprocessor=dict(type=gsm100_postprocess),
                             dataset_postprocessor=dict(type=gsm100_dataset_postprocess)
)

LEval_gsm100_datasets = [
    dict(
        type=LEvalGSM100Dataset,
        abbr='LEval_gsm100',
        path='L4NLP/LEval',
        name='gsm100',
        reader_cfg=LEval_gsm100_reader_cfg,
        infer_cfg=LEval_gsm100_infer_cfg,
        eval_cfg=LEval_gsm100_eval_cfg)
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_gov_report_summ_gen_c68a56 import LEval_govreport_summ_datasets # noqa: F401, F403

@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator
from opencompass.datasets import LEvalGovReportSummDataset

LEval_govreport_summ_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_govreport_summ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='Government report: {context}\n{question}\nTL;DR:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512)
)

LEval_govreport_summ_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT'
)

LEval_govreport_summ_datasets = [
    dict(
        type=LEvalGovReportSummDataset,
        abbr='LEval_gov_report_summ',
        path='L4NLP/LEval',
        name='gov_report_summ',
        reader_cfg=LEval_govreport_summ_reader_cfg,
        infer_cfg=LEval_govreport_summ_infer_cfg,
        eval_cfg=LEval_govreport_summ_eval_cfg)
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_legalcontractqa_gen_f0bb20 import LEval_legalqa_datasets # noqa: F401, F403

@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator
from opencompass.datasets import LEvalLegalContractQADataset

LEval_legalqa_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_legalqa_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=128)
)

LEval_legalqa_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT'
)

LEval_legalqa_datasets = [
    dict(
        type=LEvalLegalContractQADataset,
        abbr='LEval_legal_contract_qa',
        path='L4NLP/LEval',
        name='legal_contract_qa',
        reader_cfg=LEval_legalqa_reader_cfg,
        infer_cfg=LEval_legalqa_infer_cfg,
        eval_cfg=LEval_legalqa_eval_cfg)
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_meetingsumm_gen_6c03d0 import LEval_meetingsumm_datasets # noqa: F401, F403

@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator
from opencompass.datasets import LEvalMeetingSummDataset

LEval_meetingsumm_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_meetingsumm_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512)
)

LEval_meetingsumm_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT'
)

LEval_meetingsumm_datasets = [
    dict(
        type=LEvalMeetingSummDataset,
        abbr='LEval_meeting_summ',
        path='L4NLP/LEval',
        name='meeting_summ',
        reader_cfg=LEval_meetingsumm_reader_cfg,
        infer_cfg=LEval_meetingsumm_infer_cfg,
        eval_cfg=LEval_meetingsumm_eval_cfg)
]
configs/datasets/LEvalMultidocQA/LEval_multidocqa_gen.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_multidocqa_gen_87dc85 import LEval_multidocqa_datasets # noqa: F401, F403

@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator
from opencompass.datasets import LEvalMultidocQADataset

LEval_multidocqa_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_multidocqa_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\nQuestion: {question}?\nAnswer:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=64)
)

LEval_multidocqa_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT'
)

LEval_multidocqa_datasets = [
    dict(
        type=LEvalMultidocQADataset,
        abbr='LEval_multidocqa',
        path='L4NLP/LEval',
        name='multidoc_qa',
        reader_cfg=LEval_multidocqa_reader_cfg,
        infer_cfg=LEval_multidocqa_infer_cfg,
        eval_cfg=LEval_multidocqa_eval_cfg)
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_narrativeqa_gen_9fec98 import LEval_narrativeqa_datasets # noqa: F401, F403

@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator
from opencompass.datasets import LEvalNarrativeQADataset

LEval_narrativeqa_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_narrativeqa_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\nQuestion: {question}?\nAnswer:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=50)
)

LEval_narrativeqa_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT'
)

LEval_narrativeqa_datasets = [
    dict(
        type=LEvalNarrativeQADataset,
        abbr='LEval_narrativeqa',
        path='L4NLP/LEval',
        name='narrative_qa',
        reader_cfg=LEval_narrativeqa_reader_cfg,
        infer_cfg=LEval_narrativeqa_infer_cfg,
        eval_cfg=LEval_narrativeqa_eval_cfg)
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_naturalquestion_gen_9fec98 import LEval_nq_datasets # noqa: F401, F403

@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator
from opencompass.datasets import LEvalNaturalQuestionDataset

LEval_nq_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_nq_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\nQuestion: {question}?\nAnswer:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=50)
)

LEval_nq_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT'
)

LEval_nq_datasets = [
    dict(
        type=LEvalNaturalQuestionDataset,
        abbr='LEval_nq',
        path='L4NLP/LEval',
        name='natural_question',
        reader_cfg=LEval_nq_reader_cfg,
        infer_cfg=LEval_nq_infer_cfg,
        eval_cfg=LEval_nq_eval_cfg)
]
configs/datasets/LEvalNewsSumm/LEval_newssumm_gen.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_newssumm_gen_db3565 import LEval_newssumm_datasets # noqa: F401, F403

configs/datasets/LEvalNewsSumm/LEval_newssumm_gen_db3565.py (new file, 40 lines)
@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator
from opencompass.datasets import LEvalNewsSummDataset

LEval_newssumm_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_newssumm_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\n{question}\nTL;DR:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512)
)

LEval_newssumm_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT'
)

LEval_newssumm_datasets = [
    dict(
        type=LEvalNewsSummDataset,
        abbr='LEval_news_summ',
        path='L4NLP/LEval',
        name='news_summ',
        reader_cfg=LEval_newssumm_reader_cfg,
        infer_cfg=LEval_newssumm_infer_cfg,
        eval_cfg=LEval_newssumm_eval_cfg)
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_paper_assistant_gen_6c03d0 import LEval_ps_summ_datasets # noqa: F401, F403

@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator
from opencompass.datasets import LEvalPaperAssistantDataset

LEval_ps_summ_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_ps_summ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512)
)

LEval_ps_summ_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT'
)

LEval_ps_summ_datasets = [
    dict(
        type=LEvalPaperAssistantDataset,
        abbr='LEval_paper_assistant',
        path='L4NLP/LEval',
        name='paper_assistant',
        reader_cfg=LEval_ps_summ_reader_cfg,
        infer_cfg=LEval_ps_summ_infer_cfg,
        eval_cfg=LEval_ps_summ_eval_cfg)
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_patent_summ_gen_db3565 import LEval_patent_summ_datasets # noqa: F401, F403

@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator
from opencompass.datasets import LEvalPatentSummDataset

LEval_patent_summ_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_patent_summ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\n{question}\nTL;DR:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512)
)

LEval_patent_summ_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT'
)

LEval_patent_summ_datasets = [
    dict(
        type=LEvalPatentSummDataset,
        abbr='LEval_patent_summ',
        path='L4NLP/LEval',
        name='patent_summ',
        reader_cfg=LEval_patent_summ_reader_cfg,
        infer_cfg=LEval_patent_summ_infer_cfg,
        eval_cfg=LEval_patent_summ_eval_cfg)
]
configs/datasets/LEvalQuality/LEval_quality_gen.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_quality_gen_bd35f4 import LEval_quality_datasets # noqa: F401, F403

configs/datasets/LEvalQuality/LEval_quality_gen_bd35f4.py (new file, 42 lines)
@@ -0,0 +1,42 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator, AccEvaluator
from opencompass.datasets import LEvalQualityDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi

LEval_quality_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_quality_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=10)
)

LEval_quality_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=first_capital_postprocess),
    pred_role='BOT'
)

LEval_quality_datasets = [
    dict(
        type=LEvalQualityDataset,
        abbr='LEval_quality',
        path='L4NLP/LEval',
        name='quality',
        reader_cfg=LEval_quality_reader_cfg,
        infer_cfg=LEval_quality_infer_cfg,
        eval_cfg=LEval_quality_eval_cfg)
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_review_summ_gen_6c03d0 import LEval_review_summ_datasets # noqa: F401, F403

@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator
from opencompass.datasets import LEvalReviewSummDataset

LEval_review_summ_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_review_summ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512)
)

LEval_review_summ_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT'
)

LEval_review_summ_datasets = [
    dict(
        type=LEvalReviewSummDataset,
        abbr='LEval_review_summ',
        path='L4NLP/LEval',
        name='review_summ',
        reader_cfg=LEval_review_summ_reader_cfg,
        infer_cfg=LEval_review_summ_infer_cfg,
        eval_cfg=LEval_review_summ_eval_cfg)
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_scientificqa_gen_0c6e71 import LEval_scientificqa_datasets # noqa: F401, F403

@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator
from opencompass.datasets import LEvalScientificQADataset

LEval_scientificqa_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_scientificqa_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=64)
)

LEval_scientificqa_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT'
)

LEval_scientificqa_datasets = [
    dict(
        type=LEvalScientificQADataset,
        abbr='LEval_scientificqa',
        path='L4NLP/LEval',
        name='scientific_qa',
        reader_cfg=LEval_scientificqa_reader_cfg,
        infer_cfg=LEval_scientificqa_infer_cfg,
        eval_cfg=LEval_scientificqa_eval_cfg)
]
configs/datasets/LEvalTPO/LEval_tpo_gen.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_tpo_gen_bd35f4 import LEval_tpo_datasets

configs/datasets/LEvalTPO/LEval_tpo_gen_bd35f4.py (new file, 42 lines)
@@ -0,0 +1,42 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator, AccEvaluator
from opencompass.datasets import LEvalTPODataset
from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi

LEval_tpo_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_tpo_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=10)
)

LEval_tpo_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=first_capital_postprocess),
    pred_role='BOT'
)

LEval_tpo_datasets = [
    dict(
        type=LEvalTPODataset,
        abbr='LEval_tpo',
        path='L4NLP/LEval',
        name='tpo',
        reader_cfg=LEval_tpo_reader_cfg,
        infer_cfg=LEval_tpo_infer_cfg,
        eval_cfg=LEval_tpo_eval_cfg)
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_tvshow_summ_gen_rouge import LEval_tvshow_summ_datasets # noqa: F401, F403

@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator
from opencompass.datasets import LEvalTVShowSummDataset

LEval_tvshow_summ_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_tvshow_summ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\nQuestion: {question}'),
                dict(role='BOT', prompt='TL;DR:'),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512)
)

LEval_tvshow_summ_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT'
)

LEval_tvshow_summ_datasets = [
    dict(
        type=LEvalTVShowSummDataset,
        abbr='LEval_tvshow_summ',
        path='L4NLP/LEval',
        name='tv_show_summ',
        reader_cfg=LEval_tvshow_summ_reader_cfg,
        infer_cfg=LEval_tvshow_summ_infer_cfg,
        eval_cfg=LEval_tvshow_summ_eval_cfg)
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_tvshow_summ_gen_049a5c import LEval_tvshow_summ_datasets # noqa: F401, F403

@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator
from opencompass.datasets import LEvalTVShowSummDataset

LEval_tvshow_summ_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_tvshow_summ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nTL;DR:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512)
)

LEval_tvshow_summ_eval_cfg = dict(
    evaluator=dict(type=RougeEvaluator),
    pred_role='BOT'
)

LEval_tvshow_summ_datasets = [
    dict(
        type=LEvalTVShowSummDataset,
        abbr='LEval_tvshow_summ',
        path='L4NLP/LEval',
        name='tv_show_summ',
        reader_cfg=LEval_tvshow_summ_reader_cfg,
        infer_cfg=LEval_tvshow_summ_infer_cfg,
        eval_cfg=LEval_tvshow_summ_eval_cfg)
]
@@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .LEval_topic_retrieval_gen_af0562 import LEval_tr_datasets

@@ -0,0 +1,42 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator, RougeEvaluator, SquadEvaluator, AccEvaluator
from opencompass.datasets import LEvalTopicRetrievalDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess, first_capital_postprocess_multi, general_postprocess

LEval_tr_reader_cfg = dict(
    input_columns=['context', 'question'],
    output_column='answer',
    train_split='test',
    test_split='test'
)

LEval_tr_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{context}\nQuestion: {question}\nAnswer:'),
                dict(role='BOT', prompt=''),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=30)
)

LEval_tr_eval_cfg = dict(
    evaluator=dict(type=EMEvaluator),
    pred_postprocessor=dict(type=general_postprocess),
    pred_role='BOT'
)

LEval_tr_datasets = [
    dict(
        type=LEvalTopicRetrievalDataset,
        abbr='LEval_topic_retrieval',
        path='L4NLP/LEval',
        name='topic_retrieval_longchat',
        reader_cfg=LEval_tr_reader_cfg,
        infer_cfg=LEval_tr_infer_cfg,
        eval_cfg=LEval_tr_eval_cfg)
]
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .agieval_gen_397d81 import agieval_datasets # noqa: F401, F403
+    from .agieval_gen_64afd3 import agieval_datasets # noqa: F401, F403

@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .bbh_gen_5b92b0 import bbh_datasets # noqa: F401, F403
+    from .bbh_gen_6bd693 import bbh_datasets # noqa: F401, F403

@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .cmmlu_gen_ffe7c0 import cmmlu_datasets # noqa: F401, F403
+    from .cmmlu_gen_c13365 import cmmlu_datasets # noqa: F401, F403

@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .cmmlu_ppl_fd1f2f import cmmlu_datasets # noqa: F401, F403
+    from .cmmlu_ppl_8b9c76 import cmmlu_datasets # noqa: F401, F403

@@ -5,7 +5,7 @@ with read_base():
     from ..ceval.ceval_ppl_578f8d import ceval_datasets
     from ..agieval.agieval_mixed_2f14ad import agieval_datasets
     from ..GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
-    from ..bbh.bbh_gen_5b92b0 import bbh_datasets
+    from ..bbh.bbh_gen_6bd693 import bbh_datasets
     from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
     from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
     from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets

@@ -2,7 +2,7 @@ from mmengine.config import read_base

 with read_base():
     from ..ceval.ceval_ppl_578f8d import ceval_datasets
-    from ..bbh.bbh_gen_5b92b0 import bbh_datasets
+    from ..bbh.bbh_gen_6bd693 import bbh_datasets
     from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
     from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
     from ..CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets

@@ -3,9 +3,9 @@ from mmengine.config import read_base
 with read_base():
     from ..mmlu.mmlu_gen_a484b3 import mmlu_datasets
     from ..ceval.ceval_gen_5f30c7 import ceval_datasets
-    from ..agieval.agieval_gen_397d81 import agieval_datasets
+    from ..agieval.agieval_gen_64afd3 import agieval_datasets
     from ..GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
-    from ..bbh.bbh_gen_5b92b0 import bbh_datasets
+    from ..bbh.bbh_gen_6bd693 import bbh_datasets
     from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
     from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
     from ..CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets

@@ -3,7 +3,7 @@ from mmengine.config import read_base
 with read_base():
     from ..mmlu.mmlu_gen_a484b3 import mmlu_datasets
     from ..ceval.ceval_gen_5f30c7 import ceval_datasets
-    from ..bbh.bbh_gen_5b92b0 import bbh_datasets
+    from ..bbh.bbh_gen_6bd693 import bbh_datasets
     from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
     from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
     from ..CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets

@@ -35,6 +35,6 @@ with read_base():
     from ..obqa.obqa_gen_9069e4 import obqa_datasets
     from ..nq.nq_gen_c788f6 import nq_datasets
     from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
-    from ..crowspairs.crowspairs_gen_21f7cb import crowspairs_datasets
+    from ..crowspairs.crowspairs_gen_381af0 import crowspairs_datasets

 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .crowspairs_gen_21f7cb import crowspairs_datasets # noqa: F401, F403
+    from .crowspairs_gen_381af0 import crowspairs_datasets # noqa: F401, F403

@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .cvalues_responsibility_gen_4aec9f import cvalues_datasets # noqa: F401, F403
+    from .cvalues_responsibility_gen_543378 import cvalues_datasets # noqa: F401, F403
configs/eval_LEval.py (new file, 48 lines)
@@ -0,0 +1,48 @@
from opencompass.models import HuggingFaceCausalLM
import torch
from mmengine.config import read_base

# long context evaluation tasks
with read_base():
    from .datasets.LEvalNaturalQuestion.LEval_naturalquestion_gen import LEval_nq_datasets
    from .datasets.LEvalNarrativeQA.LEval_narrativeqa_gen import LEval_narrativeqa_datasets
    from .datasets.LEvalMultidocQA.LEval_multidocqa_gen import LEval_multidocqa_datasets
    from .datasets.LEvalCoursera.LEval_coursera_gen import LEval_coursera_datasets
    from .datasets.LEvalTPO.LEval_tpo_gen import LEval_tpo_datasets
    from .datasets.LEvalQuality.LEval_quality_gen import LEval_quality_datasets
    from .datasets.LEvalGSM100.LEval_gsm100_gen import LEval_gsm100_datasets
    from .datasets.LEvalTopicRetrieval.LEval_topic_retrieval_gen import LEval_tr_datasets
    from .datasets.LEvalFinancialQA.LEval_financialqa_gen import LEval_financialqa_datasets
    from .datasets.LEvalGovReportSumm.LEval_gov_report_summ_gen import LEval_govreport_summ_datasets
    from .datasets.LEvalLegalContractQA.LEval_legalcontractqa_gen import LEval_legalqa_datasets
    from .datasets.LEvalMeetingSumm.LEval_meetingsumm_gen import LEval_meetingsumm_datasets
    from .datasets.LEvalNewsSumm.LEval_newssumm_gen import LEval_newssumm_datasets
    from .datasets.LEvalPaperAssistant.LEval_paper_assistant_gen import LEval_ps_summ_datasets
    from .datasets.LEvalPatentSumm.LEval_patent_summ_gen import LEval_patent_summ_datasets
    from .datasets.LEvalTVShowSumm.LEval_tvshow_summ_gen import LEval_tvshow_summ_datasets
    from .datasets.LEvalScientificQA.LEval_scientificqa_gen import LEval_scientificqa_datasets
    from .datasets.LEvalReviewSumm.LEval_review_summ_gen import LEval_review_summ_datasets
    # choose a model of interest
    # internlm as an example
    from .models.hf_internlm_7b import models
    # and output the results in a chosen format
    from .summarizers.LEval import summarizer

datasets = [*LEval_coursera_datasets,
            *LEval_tpo_datasets,
            *LEval_quality_datasets,
            *LEval_gsm100_datasets,
            *LEval_tr_datasets,
            *LEval_financialqa_datasets,
            *LEval_govreport_summ_datasets,
            *LEval_legalqa_datasets,
            *LEval_meetingsumm_datasets,
            *LEval_multidocqa_datasets,
            *LEval_narrativeqa_datasets,
            *LEval_nq_datasets,
            *LEval_newssumm_datasets,
            *LEval_patent_summ_datasets,
            *LEval_tvshow_summ_datasets,
            *LEval_scientificqa_datasets,
            *LEval_review_summ_datasets,
            *LEval_ps_summ_datasets]
configs/summarizers/LEval.py (new file, 29 lines)
@@ -0,0 +1,29 @@
summarizer = dict(
    dataset_abbrs = [
        '--------- LEval Exact Match (Acc) ---------', # category
        "LEval_coursera",
        'LEval_gsm100',
        'LEval_quality',
        "LEval_tpo",
        'LEval_topic_retrieval',
        '--------- LEval Gen (ROUGE) ---------', # category
        'LEval_financialqa',
        'LEval_gov_report_summ',
        'LEval_legal_contract_qa',
        'LEval_meeting_summ',
        'LEval_multidocqa',
        'LEval_narrativeqa',
        'LEval_nq',
        'LEval_news_summ',
        'LEval_paper_assistant',
        'LEval_patent_summ',
        'LEval_review_summ',
        'LEval_scientificqa',
        'LEval_tvshow_summ'
    ],
    summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
    prompt_db=dict(
        database_path='configs/datasets/log.json',
        config_dir='configs/datasets',
        blacklist='.promptignore'),
)
opencompass/datasets/LEval_coursera.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalCourseraDataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
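A minimal usage sketch of the loader above (an assumption for illustration: the 'L4NLP/LEval' dataset is reachable on the Hugging Face Hub). The `path` and `name` fields of the dataset config are forwarded as keyword arguments to `datasets.load_dataset`, and every (instruction, output) pair of a document becomes one row that shares the document as its context:

    # Sketch only; mirrors how the config's path/name kwargs reach load(**kwargs).
    ds = LEvalCourseraDataset.load(path='L4NLP/LEval', name='coursera')
    sample = ds['test'][0]
    print(sample['question'])      # one instruction from the document
    print(sample['answer'])        # the matching reference answer
    print(len(sample['context']))  # the full long input shared by the document's questions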
opencompass/datasets/LEval_financial_qa.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalFinancialQADataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/LEval_gov_report_summ.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalGovReportSummDataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/LEval_gsm100.py (new file, 58 lines)
@@ -0,0 +1,58 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS

from .base import BaseDataset


@TEXT_POSTPROCESSORS.register_module('gsm100_dataset')
def gsm100_dataset_postprocess(text: str) -> str:
    return text.replace(',', '')


@TEXT_POSTPROCESSORS.register_module('gsm100')
def gsm100_postprocess(text: str) -> str:
    # text = text.split('\n\n')[0]
    segs = text.split('The answer is')
    if len(segs) < 2:
        return ''
    text = segs[1]
    text = text.split(' ')
    flag = False
    ret = ''
    for i in range(len(text)):
        s = text[i]
        for i in range(len(s)):
            if s[i].isdigit():
                flag = True
                ret = s
                break
        if flag:
            break
    ret1 = ''
    for i in range(len(ret)):
        if ret[i].isdigit():
            ret1 += ret[i]
    return ret1


@LOAD_DATASET.register_module()
class LEvalGSM100Dataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
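The behaviour of the two post-processors above follows directly from the code: the prediction post-processor keeps only the digits of the first whitespace-separated token containing a digit after 'The answer is', and the dataset post-processor merely strips thousands separators from the reference answers. A few illustrative checks:

    # Illustrative assertions implied by the implementation above.
    assert gsm100_postprocess('Step by step ... The answer is 1,234 dollars.') == '1234'
    assert gsm100_postprocess('No sentence in the expected final-answer format.') == ''
    assert gsm100_dataset_postprocess('1,234') == '1234'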
opencompass/datasets/LEval_legal_contract_qa.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalLegalContractQADataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/LEval_meeting_summ.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalMeetingSummDataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/LEval_multidoc_qa.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalMultidocQADataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/LEval_narrattive_qa.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalNarrativeQADataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/LEval_natural_question.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalNaturalQuestionDataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/LEval_news_summ.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalNewsSummDataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/LEval_paper_assistant.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalPaperAssistantDataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/LEval_patent_summ.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalPatentSummDataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/LEval_quality.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalQualityDataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer[1]
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/LEval_review_summ.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalReviewSummDataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/LEval_scientific_qa.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalScientificQADataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/LEval_topic_retrieval.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalTopicRetrievalDataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/LEval_tpo.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalTPODataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
opencompass/datasets/LEval_tvshow_summ.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from datasets import Dataset, load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class LEvalTVShowSummDataset(BaseDataset):

    @staticmethod
    def load(**kwargs):
        dataset = load_dataset(**kwargs)
        split = 'test'
        raw_data = []
        for i in range(len(dataset[split])):
            instructions = dataset[split]['instructions'][i]
            outputs = dataset[split]['outputs'][i]
            context = dataset[split]['input'][i]
            for question, answer in zip(instructions, outputs):
                raw_data.append({
                    'question': question,
                    'context': context,
                    'answer': answer
                })
        dataset[split] = Dataset.from_list(raw_data)
        return dataset
@@ -34,6 +34,24 @@ from .iwslt2017 import * # noqa: F401, F403
 from .jigsawmultilingual import * # noqa: F401, F403
 from .lambada import * # noqa: F401, F403
 from .lcsts import * # noqa: F401, F403
+from .LEval_coursera import * # noqa: F401, F403
+from .LEval_financial_qa import * # noqa: F401, F403
+from .LEval_gov_report_summ import * # noqa: F401, F403
+from .LEval_gsm100 import * # noqa: F401, F403
+from .LEval_legal_contract_qa import * # noqa: F401, F403
+from .LEval_meeting_summ import * # noqa: F401, F403
+from .LEval_multidoc_qa import * # noqa: F401, F403
+from .LEval_narrattive_qa import * # noqa: F401, F403
+from .LEval_natural_question import * # noqa: F401, F403
+from .LEval_news_summ import * # noqa: F401, F403
+from .LEval_paper_assistant import * # noqa: F401, F403
+from .LEval_patent_summ import * # noqa: F401, F403
+from .LEval_quality import * # noqa: F401, F403
+from .LEval_review_summ import * # noqa: F401, F403
+from .LEval_scientific_qa import * # noqa: F401, F403
+from .LEval_topic_retrieval import * # noqa: F401, F403
+from .LEval_tpo import * # noqa: F401, F403
+from .LEval_tvshow_summ import * # noqa: F401, F403
 from .math import * # noqa: F401, F403
 from .mbpp import * # noqa: F401, F403
 from .mmlu import * # noqa: F401, F403