diff --git a/README.md b/README.md
index 80901b7c..7c63af55 100644
--- a/README.md
+++ b/README.md
@@ -83,8 +83,8 @@ git clone https://github.com/open-compass/opencompass opencompass
 cd opencompass
 pip install -e .
 # Download dataset to data/ folder
-wget https://github.com/open-compass/opencompass/releases/download/0.1.1/OpenCompassData.zip
-unzip OpenCompassData.zip
+wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
+unzip OpenCompassData-core-20231110.zip
 ```
 
 Some third-party features, like Humaneval and Llama, may require additional steps to work properly, for detailed steps please refer to the [Installation Guide](https://opencompass.readthedocs.io/en/latest/get_started/installation.html).
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 3f308fa1..56118e34 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -85,8 +85,8 @@ git clone https://github.com/open-compass/opencompass opencompass
 cd opencompass
 pip install -e .
 # 下载数据集到 data/ 处
-wget https://github.com/open-compass/opencompass/releases/download/0.1.1/OpenCompassData.zip
-unzip OpenCompassData.zip
+wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
+unzip OpenCompassData-core-20231110.zip
 ```
 
 有部分第三方功能,如 Humaneval 以及 Llama,可能需要额外步骤才能正常运行,详细步骤请参考[安装指南](https://opencompass.readthedocs.io/zh_CN/latest/get_started/installation.html)。
diff --git a/configs/datasets/collections/base_medium.py b/configs/datasets/collections/base_medium.py
index 9a9962f3..2c62cc46 100644
--- a/configs/datasets/collections/base_medium.py
+++ b/configs/datasets/collections/base_medium.py
@@ -52,10 +52,5 @@ with read_base():
     from ..nq.nq_gen_c788f6 import nq_datasets
     from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
     from ..flores.flores_gen_806ede import flores_datasets
-    from ..crowspairs.crowspairs_ppl_e811e1 import crowspairs_datasets
-    from ..civilcomments.civilcomments_clp_a3c5fd import civilcomments_datasets
-    from ..jigsawmultilingual.jigsawmultilingual_clp_fe50d8 import jigsawmultilingual_datasets
-    from ..realtoxicprompts.realtoxicprompts_gen_7605e4 import realtoxicprompts_datasets
-    from ..truthfulqa.truthfulqa_gen_5ddc62 import truthfulqa_datasets
 
 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
diff --git a/configs/datasets/collections/base_medium_llama.py b/configs/datasets/collections/base_medium_llama.py
index 1066b94b..f5381a9b 100644
--- a/configs/datasets/collections/base_medium_llama.py
+++ b/configs/datasets/collections/base_medium_llama.py
@@ -52,6 +52,5 @@ with read_base():
     from ..nq.nq_gen_0356ec import nq_datasets
     from ..triviaqa.triviaqa_gen_0356ec import triviaqa_datasets
     from ..flores.flores_gen_806ede import flores_datasets
-    from ..crowspairs.crowspairs_ppl_e811e1 import crowspairs_datasets
 
 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
diff --git a/configs/datasets/collections/base_small.py b/configs/datasets/collections/base_small.py
index a038ad39..a1bbc61c 100644
--- a/configs/datasets/collections/base_small.py
+++ b/configs/datasets/collections/base_small.py
@@ -34,6 +34,5 @@ with read_base():
     from ..obqa.obqa_ppl_c7c154 import obqa_datasets
     from ..nq.nq_gen_c788f6 import nq_datasets
     from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
-    from ..crowspairs.crowspairs_ppl_e811e1 import crowspairs_datasets
 
 datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
diff --git
a/configs/datasets/collections/chat_medium.py b/configs/datasets/collections/chat_medium.py index 4ece6907..909cd21f 100644 --- a/configs/datasets/collections/chat_medium.py +++ b/configs/datasets/collections/chat_medium.py @@ -52,10 +52,5 @@ with read_base(): from ..nq.nq_gen_c788f6 import nq_datasets from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets from ..flores.flores_gen_806ede import flores_datasets - from ..crowspairs.crowspairs_gen_381af0 import crowspairs_datasets - from ..civilcomments.civilcomments_clp_a3c5fd import civilcomments_datasets - from ..jigsawmultilingual.jigsawmultilingual_clp_fe50d8 import jigsawmultilingual_datasets - from ..realtoxicprompts.realtoxicprompts_gen_7605e4 import realtoxicprompts_datasets - from ..truthfulqa.truthfulqa_gen_5ddc62 import truthfulqa_datasets datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/collections/chat_small.py b/configs/datasets/collections/chat_small.py index 91eec528..11ac216c 100644 --- a/configs/datasets/collections/chat_small.py +++ b/configs/datasets/collections/chat_small.py @@ -35,6 +35,5 @@ with read_base(): from ..obqa.obqa_gen_9069e4 import obqa_datasets from ..nq.nq_gen_c788f6 import nq_datasets from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets - from ..crowspairs.crowspairs_gen_381af0 import crowspairs_datasets datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/commonsenseqa/commonsenseqa_gen_c946f2.py b/configs/datasets/commonsenseqa/commonsenseqa_gen_c946f2.py index 088e2906..9f64303b 100644 --- a/configs/datasets/commonsenseqa/commonsenseqa_gen_c946f2.py +++ b/configs/datasets/commonsenseqa/commonsenseqa_gen_c946f2.py @@ -50,8 +50,9 @@ commonsenseqa_eval_cfg = dict( commonsenseqa_datasets = [ dict( + abbr='commonsense_qa', type=commonsenseqaDataset, - path="commonsense_qa", + path='./data/commonsenseqa', reader_cfg=commonsenseqa_reader_cfg, infer_cfg=commonsenseqa_infer_cfg, eval_cfg=commonsenseqa_eval_cfg, diff --git a/configs/datasets/commonsenseqa/commonsenseqa_ppl_3e9f2d.py b/configs/datasets/commonsenseqa/commonsenseqa_ppl_3e9f2d.py index 8f119355..0646b978 100644 --- a/configs/datasets/commonsenseqa/commonsenseqa_ppl_3e9f2d.py +++ b/configs/datasets/commonsenseqa/commonsenseqa_ppl_3e9f2d.py @@ -45,8 +45,9 @@ commonsenseqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) commonsenseqa_datasets = [ dict( + abbr='commonsense_qa', type=commonsenseqaDataset, - path='commonsense_qa', + path='./data/commonsenseqa', reader_cfg=commonsenseqa_reader_cfg, infer_cfg=commonsenseqa_infer_cfg, eval_cfg=commonsenseqa_eval_cfg) diff --git a/configs/datasets/commonsenseqa/commonsenseqa_ppl_5545e2.py b/configs/datasets/commonsenseqa/commonsenseqa_ppl_5545e2.py index 26324929..de7af961 100644 --- a/configs/datasets/commonsenseqa/commonsenseqa_ppl_5545e2.py +++ b/configs/datasets/commonsenseqa/commonsenseqa_ppl_5545e2.py @@ -40,11 +40,10 @@ commonsenseqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) commonsenseqa_datasets = [ dict( + abbr='commonsense_qa', type=commonsenseqaDataset, - path='commonsense_qa', + path='./data/commonsenseqa', reader_cfg=commonsenseqa_reader_cfg, infer_cfg=commonsenseqa_infer_cfg, eval_cfg=commonsenseqa_eval_cfg) ] - -del _ice_template diff --git a/configs/datasets/commonsenseqa/commonsenseqa_ppl_716f78.py b/configs/datasets/commonsenseqa/commonsenseqa_ppl_716f78.py index ce9cc088..83e22c73 100644 --- 
a/configs/datasets/commonsenseqa/commonsenseqa_ppl_716f78.py +++ b/configs/datasets/commonsenseqa/commonsenseqa_ppl_716f78.py @@ -4,6 +4,11 @@ from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator from opencompass.datasets import commonsenseqaDataset +commonsenseqa_reader_cfg = dict( + input_columns=['question', 'A', 'B', 'C', 'D', 'E'], + output_column='answerKey', + test_split='validation') + _ice_template = dict( type=PromptTemplate, template={ @@ -31,15 +36,10 @@ commonsenseqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) commonsenseqa_datasets = [ dict( + abbr='commonsense_qa', type=commonsenseqaDataset, - path='commonsense_qa', - reader_cfg=dict( - input_columns=['question', 'A', 'B', 'C', 'D', 'E'], - output_column='answerKey', - test_split='validation', - ), + path='./data/commonsenseqa', + reader_cfg=commonsenseqa_reader_cfg, infer_cfg=commonsenseqa_infer_cfg, eval_cfg=commonsenseqa_eval_cfg) ] - -del _ice_template diff --git a/configs/datasets/drop/drop_gen_599f07.py b/configs/datasets/drop/drop_gen_599f07.py index 01b8d763..69f28332 100644 --- a/configs/datasets/drop/drop_gen_599f07.py +++ b/configs/datasets/drop/drop_gen_599f07.py @@ -4,11 +4,18 @@ from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import EMEvaluator from opencompass.datasets import dropDataset +drop_reader_cfg = dict( + input_columns=['prompt', 'question'], + output_column='answers', + train_split='validation', + test_split='validation', +) + drop_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, - template= - '''Text: In the county, the population was spread out with 23.50% under the age of 18, 8.70% from 18 to 24, 29.70% from 25 to 44, 24.70% from 45 to 64, and 13.30% who were 65 years of age or older. + template='''\ +Text: In the county, the population was spread out with 23.50% under the age of 18, 8.70% from 18 to 24, 29.70% from 25 to 44, 24.70% from 45 to 64, and 13.30% who were 65 years of age or older. Question: How many more percent are under the age of 18 compared to the 18 to 24 group? Anawer: According to the text, 23.5% are under the age of 18, and 8.7% are from ages 18 to 24. 23.5%-8.7%=14.8%. So the answer is 14.8. 
@@ -30,13 +37,8 @@ drop_datasets = [ dict( abbr='drop', type=dropDataset, - path='drop', - reader_cfg=dict( - input_columns=['prompt'], - output_column='answers', - train_split='validation', - test_split='validation', - ), + path='./data/drop/drop_dataset_dev.json', + reader_cfg=drop_reader_cfg, infer_cfg=drop_infer_cfg, eval_cfg=drop_eval_cfg) ] diff --git a/configs/datasets/flores/flores_gen_806ede.py b/configs/datasets/flores/flores_gen_806ede.py index 4bbe1016..13951bff 100644 --- a/configs/datasets/flores/flores_gen_806ede.py +++ b/configs/datasets/flores/flores_gen_806ede.py @@ -118,6 +118,12 @@ for _flores_subtask in _flores_subtasks: _, _flores_source, _src_inst, _ = flores_lang_map[_src] _, _flores_target, _tgt_inst, _ = flores_lang_map[_tgt] + flores_reader_cfg = dict( + input_columns=f"sentence_{_flores_source}", + output_column=f"sentence_{_flores_target}", + train_split="dev", + test_split="devtest" + ) flores_infer_cfg = dict( ice_template=dict( type=PromptTemplate, @@ -146,16 +152,11 @@ for _flores_subtask in _flores_subtasks: flores_eval_cfg["dataset_postprocessor"] = dict(type="flores") flores_datasets.append( dict( - type=FloresFirst100Dataset, abbr=f"flores_100_{_src}-{_tgt}", + type=FloresFirst100Dataset, + path='./data/flores_first100', name=f"{_flores_source}-{_flores_target}", - reader_cfg=dict( - input_columns=f"sentence_{_flores_source}", - output_column=f"sentence_{_flores_target}", - train_split="dev", - test_split="devtest"), + reader_cfg=flores_reader_cfg.copy(), infer_cfg=flores_infer_cfg.copy(), eval_cfg=flores_eval_cfg.copy(), )) - -del _flores_lang_map, _flores_subtask, _src, _tgt, _, _flores_source, _src_inst, _flores_target, _tgt_inst diff --git a/configs/datasets/flores/flores_gen_aad4fd.py b/configs/datasets/flores/flores_gen_aad4fd.py index d340d3b2..c967544e 100644 --- a/configs/datasets/flores/flores_gen_aad4fd.py +++ b/configs/datasets/flores/flores_gen_aad4fd.py @@ -118,6 +118,12 @@ for _flores_subtask in _flores_subtasks: _, _flores_source, _src_inst, _ = flores_lang_map[_src] _, _flores_target, _tgt_inst, _ = flores_lang_map[_tgt] + flores_reader_cfg = dict( + input_columns=f"sentence_{_flores_source}", + output_column=f"sentence_{_flores_target}", + train_split="dev", + test_split="devtest" + ) flores_infer_cfg = dict( ice_template=dict( type=PromptTemplate, @@ -139,16 +145,11 @@ for _flores_subtask in _flores_subtasks: flores_eval_cfg["dataset_postprocessor"] = dict(type="flores-chinese") flores_datasets.append( dict( - type=FloresFirst100Dataset, abbr=f"flores_100_{_src}-{_tgt}", + type=FloresFirst100Dataset, + path='./data/flores_first100', name=f"{_flores_source}-{_flores_target}", - reader_cfg=dict( - input_columns=f"sentence_{_flores_source}", - output_column=f"sentence_{_flores_target}", - train_split="dev", - test_split="devtest"), + reader_cfg=flores_reader_cfg.copy(), infer_cfg=flores_infer_cfg.copy(), eval_cfg=flores_eval_cfg.copy(), )) - -del _flores_lang_map, _flores_subtask, _src, _tgt, _, _flores_source, _src_inst, _flores_target, _tgt_inst diff --git a/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py b/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py index 15217aa2..207fc674 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py +++ b/configs/datasets/gsm8k/gsm8k_gen_1d7fe4.py @@ -1,8 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import AccEvaluator 
-from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator +from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') @@ -32,9 +31,8 @@ gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator), gsm8k_datasets = [ dict( abbr='gsm8k', - type=HFDataset, - path='gsm8k', - name='main', + type=GSM8KDataset, + path='./data/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg) diff --git a/configs/datasets/gsm8k/gsm8k_gen_1dce88.py b/configs/datasets/gsm8k/gsm8k_gen_1dce88.py index 0e0860ed..293dd70f 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_1dce88.py +++ b/configs/datasets/gsm8k/gsm8k_gen_1dce88.py @@ -1,8 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator +from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') @@ -79,9 +78,8 @@ gsm8k_eval_cfg = dict( gsm8k_datasets = [ dict( abbr='gsm8k', - type=HFDataset, - path='gsm8k', - name='main', + type=GSM8KDataset, + path='./data/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg) diff --git a/configs/datasets/gsm8k/gsm8k_gen_57b0b1.py b/configs/datasets/gsm8k/gsm8k_gen_57b0b1.py index 1f6d0d5d..c052afff 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_57b0b1.py +++ b/configs/datasets/gsm8k/gsm8k_gen_57b0b1.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import AgentInferencer -from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kAgentEvaluator +from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kAgentEvaluator # This config is for code interpreter gsm8k_example = """ @@ -76,9 +76,8 @@ gsm8k_eval_cfg = dict( gsm8k_datasets = [ dict( abbr='gsm8k', - type=HFDataset, - path='gsm8k', - name='main', + type=GSM8KDataset, + path='./data/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg) diff --git a/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py b/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py index 9d7657f4..f038028b 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py +++ b/configs/datasets/gsm8k/gsm8k_gen_a3e34a.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import SCInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator +from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer' ) generation_kwargs = dict(do_sample=True, temperature=0.7, top_k=40) @@ -81,9 +81,8 @@ gsm8k_eval_cfg = dict( gsm8k_datasets = [ dict( abbr='gsm8k', - type=HFDataset, - 
path='gsm8k', - name='main', + type=GSM8KDataset, + path='./data/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg) diff --git a/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py b/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py index a5a9974b..ba9e07f5 100644 --- a/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py +++ b/configs/datasets/gsm8k/gsm8k_gen_e9e91e.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator +from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') @@ -41,9 +41,9 @@ gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator), gsm8k_datasets = [ dict( - type=HFDataset, - path='gsm8k', - name='main', + abbr='gsm8k', + type=GSM8KDataset, + path='./data/gsm8k', reader_cfg=gsm8k_reader_cfg, infer_cfg=gsm8k_infer_cfg, eval_cfg=gsm8k_eval_cfg) diff --git a/configs/datasets/hellaswag/hellaswag_gen_6faab5.py b/configs/datasets/hellaswag/hellaswag_gen_6faab5.py index 48f0fe91..c1c414dc 100644 --- a/configs/datasets/hellaswag/hellaswag_gen_6faab5.py +++ b/configs/datasets/hellaswag/hellaswag_gen_6faab5.py @@ -8,7 +8,7 @@ from opencompass.utils.text_postprocessors import first_option_postprocess hellaswag_reader_cfg = dict( input_columns=["ctx", "A", "B", "C", "D"], output_column="label", - test_split="validation") +) hellaswag_infer_cfg = dict( prompt_template=dict( @@ -35,8 +35,9 @@ hellaswag_eval_cfg = dict( hellaswag_datasets = [ dict( + abbr='hellaswag', type=hellaswagDataset_V2, - path="hellaswag", + path='./data/hellaswag/hellaswag.jsonl', reader_cfg=hellaswag_reader_cfg, infer_cfg=hellaswag_infer_cfg, eval_cfg=hellaswag_eval_cfg) diff --git a/configs/datasets/hellaswag/hellaswag_ppl_47bff9.py b/configs/datasets/hellaswag/hellaswag_ppl_47bff9.py index fa6294f8..0216cdec 100644 --- a/configs/datasets/hellaswag/hellaswag_ppl_47bff9.py +++ b/configs/datasets/hellaswag/hellaswag_ppl_47bff9.py @@ -27,8 +27,9 @@ hellaswag_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) hellaswag_datasets = [ dict( + abbr='hellaswag', type=hellaswagDataset, - path='hellaswag', + path='./data/hellaswag/hellaswag.jsonl', reader_cfg=hellaswag_reader_cfg, infer_cfg=hellaswag_infer_cfg, eval_cfg=hellaswag_eval_cfg) diff --git a/configs/datasets/hellaswag/hellaswag_ppl_9dbb12.py b/configs/datasets/hellaswag/hellaswag_ppl_9dbb12.py index e90a7556..e007e9a6 100644 --- a/configs/datasets/hellaswag/hellaswag_ppl_9dbb12.py +++ b/configs/datasets/hellaswag/hellaswag_ppl_9dbb12.py @@ -6,9 +6,8 @@ from opencompass.datasets import hellaswagDataset hellaswag_reader_cfg = dict( input_columns=['ctx', 'A', 'B', 'C', 'D'], - output_column='label', - train_split='validation', - test_split='validation') + output_column='label' +) hellaswag_infer_cfg = dict( prompt_template=dict( @@ -26,8 +25,9 @@ hellaswag_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) hellaswag_datasets = [ dict( + abbr='hellaswag', type=hellaswagDataset, - path='hellaswag', + path='./data/hellaswag/hellaswag.jsonl', reader_cfg=hellaswag_reader_cfg, infer_cfg=hellaswag_infer_cfg, eval_cfg=hellaswag_eval_cfg) diff --git a/configs/datasets/humaneval/humaneval_gen_6f294d.py 
b/configs/datasets/humaneval/humaneval_gen_6f294d.py index a0a991a3..3d0eeaa1 100644 --- a/configs/datasets/humaneval/humaneval_gen_6f294d.py +++ b/configs/datasets/humaneval/humaneval_gen_6f294d.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess +from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -32,8 +32,9 @@ humaneval_eval_cfg = dict( humaneval_datasets = [ dict( - type=HFDataset, - path='openai_humaneval', + abbr='openai_humaneval', + type=HumanevalDataset, + path='./data/humaneval/human-eval-v2-20210705.jsonl', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git a/configs/datasets/humaneval/humaneval_gen_8e312c.py b/configs/datasets/humaneval/humaneval_gen_8e312c.py index cd8421b6..2b9e20ff 100644 --- a/configs/datasets/humaneval/humaneval_gen_8e312c.py +++ b/configs/datasets/humaneval/humaneval_gen_8e312c.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess +from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -27,8 +27,9 @@ humaneval_eval_cfg = dict( humaneval_datasets = [ dict( - type=HFDataset, - path='openai_humaneval', + abbr='openai_humaneval', + type=HumanevalDataset, + path='./data/humaneval/human-eval-v2-20210705.jsonl', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git a/configs/datasets/humaneval/humaneval_gen_a82cae.py b/configs/datasets/humaneval/humaneval_gen_a82cae.py index 3bfe6089..cb8b2d42 100644 --- a/configs/datasets/humaneval/humaneval_gen_a82cae.py +++ b/configs/datasets/humaneval/humaneval_gen_a82cae.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess +from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -27,8 +27,9 @@ humaneval_eval_cfg = dict( humaneval_datasets = [ dict( - type=HFDataset, - path='openai_humaneval', + abbr='openai_humaneval', + type=HumanevalDataset, + path='./data/humaneval/human-eval-v2-20210705.jsonl', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git a/configs/datasets/humaneval/humaneval_gen_fd5822.py b/configs/datasets/humaneval/humaneval_gen_fd5822.py index 9b28d30f..6d0c9903 100644 --- a/configs/datasets/humaneval/humaneval_gen_fd5822.py +++ b/configs/datasets/humaneval/humaneval_gen_fd5822.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from 
opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess +from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -22,8 +22,9 @@ humaneval_eval_cfg = dict( humaneval_datasets = [ dict( - type=HFDataset, - path='openai_humaneval', + abbr='openai_humaneval', + type=HumanevalDataset, + path='./data/humaneval/human-eval-v2-20210705.jsonl', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git a/configs/datasets/humaneval/humaneval_gen_ff7054.py b/configs/datasets/humaneval/humaneval_gen_ff7054.py index 845a5eda..35b2d9d0 100644 --- a/configs/datasets/humaneval/humaneval_gen_ff7054.py +++ b/configs/datasets/humaneval/humaneval_gen_ff7054.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import HFDataset, HumanEvaluator, humaneval_postprocess +from opencompass.datasets import HumanevalDataset, HumanEvaluator, humaneval_postprocess humaneval_reader_cfg = dict( input_columns=['prompt'], output_column='task_id', train_split='test') @@ -32,8 +32,9 @@ humaneval_eval_cfg = dict( humaneval_datasets = [ dict( - type=HFDataset, - path='openai_humaneval', + abbr='openai_humaneval', + type=HumanevalDataset, + path='./data/humaneval/human-eval-v2-20210705.jsonl', reader_cfg=humaneval_reader_cfg, infer_cfg=humaneval_infer_cfg, eval_cfg=humaneval_eval_cfg) diff --git a/configs/datasets/lambada/lambada_gen_217e11.py b/configs/datasets/lambada/lambada_gen_217e11.py index d83c95cc..0c125f57 100644 --- a/configs/datasets/lambada/lambada_gen_217e11.py +++ b/configs/datasets/lambada/lambada_gen_217e11.py @@ -26,7 +26,7 @@ lambada_datasets = [ dict( abbr='lambada', type=lambadaDataset, - path='craffel/openai_lambada', + path='./data/lambada/test.jsonl', reader_cfg=lambada_reader_cfg, infer_cfg=lambada_infer_cfg, eval_cfg=lambada_eval_cfg) diff --git a/configs/datasets/lambada/lambada_gen_8b48a5.py b/configs/datasets/lambada/lambada_gen_8b48a5.py index d798045c..cd85b152 100644 --- a/configs/datasets/lambada/lambada_gen_8b48a5.py +++ b/configs/datasets/lambada/lambada_gen_8b48a5.py @@ -22,7 +22,7 @@ lambada_datasets = [ dict( abbr='lambada', type=lambadaDataset, - path='craffel/openai_lambada', + path='./data/lambada/test.jsonl', reader_cfg=lambada_reader_cfg, infer_cfg=lambada_infer_cfg, eval_cfg=lambada_eval_cfg) diff --git a/configs/datasets/obqa/obqa_gen_9069e4.py b/configs/datasets/obqa/obqa_gen_9069e4.py index 352d9ebd..48ca3457 100644 --- a/configs/datasets/obqa/obqa_gen_9069e4.py +++ b/configs/datasets/obqa/obqa_gen_9069e4.py @@ -32,15 +32,12 @@ obqa_datasets = [ dict( abbr="openbookqa", type=OBQADataset, - path="openbookqa", - split="test", + path='./data/openbookqa/Main/test.jsonl', ), dict( abbr="openbookqa_fact", type=OBQADataset, - path="openbookqa", - name="additional", - split="test", + path='./data/openbookqa/Additional/test_complete.jsonl', ), ] diff --git a/configs/datasets/obqa/obqa_ppl_1defe8.py b/configs/datasets/obqa/obqa_ppl_1defe8.py index bb07200f..9c5abd31 100644 --- a/configs/datasets/obqa/obqa_ppl_1defe8.py +++ b/configs/datasets/obqa/obqa_ppl_1defe8.py @@ -24,15 +24,12 @@ obqa_datasets = [ dict( abbr="openbookqa", type=OBQADataset, - 
path="openbookqa", - split="test", + path='./data/openbookqa/Main/test.jsonl', ), dict( abbr="openbookqa_fact", type=OBQADataset, - path="openbookqa", - name="additional", - split="test", + path='./data/openbookqa/Additional/test_complete.jsonl', ), ] for _i in range(2): diff --git a/configs/datasets/obqa/obqa_ppl_6aac9e.py b/configs/datasets/obqa/obqa_ppl_6aac9e.py index fc74d117..d8e21c1e 100644 --- a/configs/datasets/obqa/obqa_ppl_6aac9e.py +++ b/configs/datasets/obqa/obqa_ppl_6aac9e.py @@ -33,9 +33,7 @@ obqa_datasets = [ dict( abbr='openbookqa_fact', type=OBQADataset_V2, - path='openbookqa', - name='additional', - split='test', + path='./data/openbookqa/Additional/test_complete.jsonl', reader_cfg=obqa_reader_cfg, infer_cfg=obqa_infer_cfg, eval_cfg=obqa_eval_cfg, diff --git a/configs/datasets/obqa/obqa_ppl_c7c154.py b/configs/datasets/obqa/obqa_ppl_c7c154.py index 9a4c8546..58cabae9 100644 --- a/configs/datasets/obqa/obqa_ppl_c7c154.py +++ b/configs/datasets/obqa/obqa_ppl_c7c154.py @@ -37,16 +37,14 @@ _template = [ obqa_datasets = [ dict( + abbr="openbookqa", type=OBQADataset, - path='openbookqa', - split='test', + path='./data/openbookqa/Main/test.jsonl', ), dict( abbr='openbookqa_fact', type=OBQADataset, - path='openbookqa', - name='additional', - split='test', + path='./data/openbookqa/Additional/test_complete.jsonl', ), ] for _i in range(2): diff --git a/configs/datasets/piqa/piqa_gen_1194eb.py b/configs/datasets/piqa/piqa_gen_1194eb.py index 69488edd..1f6a5c25 100644 --- a/configs/datasets/piqa/piqa_gen_1194eb.py +++ b/configs/datasets/piqa/piqa_gen_1194eb.py @@ -34,7 +34,7 @@ piqa_datasets = [ dict( abbr="piqa", type=piqaDataset_V2, - path="piqa", + path='./data/piqa', reader_cfg=piqa_reader_cfg, infer_cfg=piqa_infer_cfg, eval_cfg=piqa_eval_cfg) diff --git a/configs/datasets/piqa/piqa_ppl_0cfff2.py b/configs/datasets/piqa/piqa_ppl_0cfff2.py index 22e9030a..d8d8cfcd 100644 --- a/configs/datasets/piqa/piqa_ppl_0cfff2.py +++ b/configs/datasets/piqa/piqa_ppl_0cfff2.py @@ -30,7 +30,7 @@ piqa_datasets = [ dict( abbr='piqa', type=piqaDataset_V3, - path='piqa', + path='./data/piqa', reader_cfg=piqa_reader_cfg, infer_cfg=piqa_infer_cfg, eval_cfg=piqa_eval_cfg) diff --git a/configs/datasets/piqa/piqa_ppl_1cf9f0.py b/configs/datasets/piqa/piqa_ppl_1cf9f0.py index 7c43bf6b..bb2a0f3b 100644 --- a/configs/datasets/piqa/piqa_ppl_1cf9f0.py +++ b/configs/datasets/piqa/piqa_ppl_1cf9f0.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset +from opencompass.datasets import piqaDataset piqa_reader_cfg = dict( input_columns=['goal', 'sol1', 'sol2'], @@ -23,8 +23,9 @@ piqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) piqa_datasets = [ dict( - type=HFDataset, - path='piqa', + abbr='piqa', + type=piqaDataset, + path='./data/piqa', reader_cfg=piqa_reader_cfg, infer_cfg=piqa_infer_cfg, eval_cfg=piqa_eval_cfg) diff --git a/configs/datasets/piqa/piqa_ppl_3431ea.py b/configs/datasets/piqa/piqa_ppl_3431ea.py index cb194b17..3a9ac7d9 100644 --- a/configs/datasets/piqa/piqa_ppl_3431ea.py +++ b/configs/datasets/piqa/piqa_ppl_3431ea.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from 
opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset +from opencompass.datasets import piqaDataset piqa_reader_cfg = dict( input_columns=['goal', 'sol1', 'sol2'], @@ -33,8 +33,9 @@ piqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) piqa_datasets = [ dict( - type=HFDataset, - path='piqa', + abbr='piqa', + type=piqaDataset, + path='./data/piqa', reader_cfg=piqa_reader_cfg, infer_cfg=piqa_infer_cfg, eval_cfg=piqa_eval_cfg) diff --git a/configs/datasets/race/race_gen_69ee4f.py b/configs/datasets/race/race_gen_69ee4f.py index 6ffd013f..4ce9665d 100644 --- a/configs/datasets/race/race_gen_69ee4f.py +++ b/configs/datasets/race/race_gen_69ee4f.py @@ -7,7 +7,10 @@ from opencompass.utils.text_postprocessors import first_option_postprocess race_reader_cfg = dict( input_columns=['article', 'question', 'A', 'B', 'C', 'D'], - output_column='answer') + output_column='answer', + train_split="validation", + test_split="test" +) race_infer_cfg = dict( prompt_template=dict( @@ -29,17 +32,17 @@ race_eval_cfg = dict( race_datasets = [ dict( - type=RaceDataset, abbr='race-middle', - path='race', + type=RaceDataset, + path='./data/race', name='middle', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, eval_cfg=race_eval_cfg), dict( - type=RaceDataset, abbr='race-high', - path='race', + type=RaceDataset, + path='./data/race', name='high', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, diff --git a/configs/datasets/race/race_gen_9302a5.py b/configs/datasets/race/race_gen_9302a5.py index b6e61391..999c9fb8 100644 --- a/configs/datasets/race/race_gen_9302a5.py +++ b/configs/datasets/race/race_gen_9302a5.py @@ -7,7 +7,10 @@ from opencompass.utils.text_postprocessors import first_capital_postprocess race_reader_cfg = dict( input_columns=['article', 'question', 'A', 'B', 'C', 'D'], - output_column='answer') + output_column='answer', + train_split="validation", + test_split="test" +) race_infer_cfg = dict( prompt_template=dict( @@ -23,17 +26,17 @@ race_eval_cfg = dict( race_datasets = [ dict( - type=RaceDataset, abbr='race-middle', - path='race', + type=RaceDataset, + path='./data/race', name='middle', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, eval_cfg=race_eval_cfg), dict( - type=RaceDataset, abbr='race-high', - path='race', + type=RaceDataset, + path='./data/race', name='high', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, diff --git a/configs/datasets/race/race_ppl_5831a0.py b/configs/datasets/race/race_ppl_5831a0.py index cc0cfe6f..35d052a4 100644 --- a/configs/datasets/race/race_ppl_5831a0.py +++ b/configs/datasets/race/race_ppl_5831a0.py @@ -6,7 +6,10 @@ from opencompass.datasets import RaceDataset race_reader_cfg = dict( input_columns=['article', 'question', 'A', 'B', 'C', 'D'], - output_column='answer') + output_column='answer', + train_split="validation", + test_split="test" +) race_infer_cfg = dict( prompt_template=dict( @@ -27,17 +30,17 @@ race_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) race_datasets = [ dict( - type=RaceDataset, abbr='race-middle', - path='race', + type=RaceDataset, + path='./data/race', name='middle', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, eval_cfg=race_eval_cfg), dict( - type=RaceDataset, abbr='race-high', - path='race', + type=RaceDataset, + path='./data/race', name='high', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, diff --git a/configs/datasets/race/race_ppl_a138cd.py b/configs/datasets/race/race_ppl_a138cd.py index 8b0f36d4..1d611aa8 100644 --- 
a/configs/datasets/race/race_ppl_a138cd.py +++ b/configs/datasets/race/race_ppl_a138cd.py @@ -6,7 +6,10 @@ from opencompass.datasets import RaceDataset race_reader_cfg = dict( input_columns=['article', 'question', 'A', 'B', 'C', 'D'], - output_column='answer') + output_column='answer', + train_split="validation", + test_split="test" +) race_infer_cfg = dict( prompt_template=dict( @@ -29,17 +32,17 @@ race_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) race_datasets = [ dict( - type=RaceDataset, abbr='race-middle', - path='race', + type=RaceDataset, + path='./data/race', name='middle', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, eval_cfg=race_eval_cfg), dict( - type=RaceDataset, abbr='race-high', - path='race', + type=RaceDataset, + path='./data/race', name='high', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, diff --git a/configs/datasets/race/race_ppl_abed12.py b/configs/datasets/race/race_ppl_abed12.py index e98e997d..5adcec1c 100644 --- a/configs/datasets/race/race_ppl_abed12.py +++ b/configs/datasets/race/race_ppl_abed12.py @@ -6,7 +6,10 @@ from opencompass.datasets import RaceDataset race_reader_cfg = dict( input_columns=['article', 'question', 'A', 'B', 'C', 'D'], - output_column='answer') + output_column='answer', + train_split="validation", + test_split="test" +) race_infer_cfg = dict( prompt_template=dict( @@ -28,17 +31,17 @@ race_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) race_datasets = [ dict( - type=RaceDataset, abbr='race-middle', - path='race', + type=RaceDataset, + path='./data/race', name='middle', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, eval_cfg=race_eval_cfg), dict( - type=RaceDataset, abbr='race-high', - path='race', + type=RaceDataset, + path='./data/race', name='high', reader_cfg=race_reader_cfg, infer_cfg=race_infer_cfg, diff --git a/configs/datasets/siqa/siqa_gen_e78df3.py b/configs/datasets/siqa/siqa_gen_e78df3.py index a470739b..e61d3435 100644 --- a/configs/datasets/siqa/siqa_gen_e78df3.py +++ b/configs/datasets/siqa/siqa_gen_e78df3.py @@ -34,7 +34,7 @@ siqa_datasets = [ dict( abbr="siqa", type=siqaDataset_V2, - path="social_i_qa", + path='./data/siqa', reader_cfg=siqa_reader_cfg, infer_cfg=siqa_infer_cfg, eval_cfg=siqa_eval_cfg) diff --git a/configs/datasets/siqa/siqa_ppl_42bc6e.py b/configs/datasets/siqa/siqa_ppl_42bc6e.py index a0cb3ee5..9210ca71 100644 --- a/configs/datasets/siqa/siqa_ppl_42bc6e.py +++ b/configs/datasets/siqa/siqa_ppl_42bc6e.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset +from opencompass.datasets import siqaDataset siqa_reader_cfg = dict( input_columns=['context', 'question', 'answerA', 'answerB', 'answerC'], @@ -25,8 +25,8 @@ siqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) siqa_datasets = [ dict( abbr="siqa", - type=HFDataset, - path='social_i_qa', + type=siqaDataset, + path='./data/siqa', reader_cfg=siqa_reader_cfg, infer_cfg=siqa_infer_cfg, eval_cfg=siqa_eval_cfg) diff --git a/configs/datasets/siqa/siqa_ppl_7845b0.py b/configs/datasets/siqa/siqa_ppl_7845b0.py index a3efeabe..4faa1fd3 100644 --- a/configs/datasets/siqa/siqa_ppl_7845b0.py +++ b/configs/datasets/siqa/siqa_ppl_7845b0.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from 
opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset +from opencompass.datasets import siqaDataset siqa_reader_cfg = dict( input_columns=['context', 'question', 'answerA', 'answerB', 'answerC'], @@ -25,9 +25,8 @@ siqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) siqa_datasets = [ dict( abbr="siqa", - type=HFDataset, - path='social_i_qa', - name='social_i_qa', + type=siqaDataset, + path='./data/siqa', reader_cfg=siqa_reader_cfg, infer_cfg=siqa_infer_cfg, eval_cfg=siqa_eval_cfg) diff --git a/configs/datasets/siqa/siqa_ppl_ced5f6.py b/configs/datasets/siqa/siqa_ppl_ced5f6.py index 24ea1b6e..1007f389 100644 --- a/configs/datasets/siqa/siqa_ppl_ced5f6.py +++ b/configs/datasets/siqa/siqa_ppl_ced5f6.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset +from opencompass.datasets import siqaDataset siqa_reader_cfg = dict( input_columns=['context', 'question', 'answerA', 'answerB', 'answerC'], @@ -37,8 +37,8 @@ siqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) siqa_datasets = [ dict( abbr="siqa", - type=HFDataset, - path='social_i_qa', + type=siqaDataset, + path='./data/siqa', reader_cfg=siqa_reader_cfg, infer_cfg=siqa_infer_cfg, eval_cfg=siqa_eval_cfg) diff --git a/configs/datasets/siqa/siqa_ppl_e8d8c5.py b/configs/datasets/siqa/siqa_ppl_e8d8c5.py index 140dcef3..2cd5c08e 100644 --- a/configs/datasets/siqa/siqa_ppl_e8d8c5.py +++ b/configs/datasets/siqa/siqa_ppl_e8d8c5.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import PPLInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset +from opencompass.datasets import siqaDataset siqa_reader_cfg = dict( input_columns=['context', 'question', 'answerA', 'answerB', 'answerC'], @@ -37,8 +37,8 @@ siqa_eval_cfg = dict(evaluator=dict(type=AccEvaluator)) siqa_datasets = [ dict( abbr="siqa", - type=HFDataset, - path='social_i_qa', + type=siqaDataset, + path='./data/siqa', reader_cfg=siqa_reader_cfg, infer_cfg=siqa_infer_cfg, eval_cfg=siqa_eval_cfg) diff --git a/configs/datasets/storycloze/storycloze_gen_7f656a.py b/configs/datasets/storycloze/storycloze_gen_7f656a.py index ae141378..16f3e771 100644 --- a/configs/datasets/storycloze/storycloze_gen_7f656a.py +++ b/configs/datasets/storycloze/storycloze_gen_7f656a.py @@ -37,8 +37,8 @@ storycloze_datasets = [ dict( abbr="story_cloze", type=storyclozeDataset_V2, - path="juletxara/xstory_cloze", - name="en", + path='./data/xstory_cloze', + lang='en', reader_cfg=storycloze_reader_cfg, infer_cfg=storycloze_infer_cfg, eval_cfg=storycloze_eval_cfg, diff --git a/configs/datasets/storycloze/storycloze_ppl_496661.py b/configs/datasets/storycloze/storycloze_ppl_496661.py index 05aea9ba..78848306 100644 --- a/configs/datasets/storycloze/storycloze_ppl_496661.py +++ b/configs/datasets/storycloze/storycloze_ppl_496661.py @@ -31,8 +31,8 @@ storycloze_datasets = [ dict( abbr='story_cloze', type=storyclozeDataset, - path='juletxara/xstory_cloze', - name='en', + path='./data/xstory_cloze', + lang='en', reader_cfg=storycloze_reader_cfg, infer_cfg=storycloze_infer_cfg, 
eval_cfg=storycloze_eval_cfg) diff --git a/configs/datasets/storycloze/storycloze_ppl_afd16f.py b/configs/datasets/storycloze/storycloze_ppl_afd16f.py index e33bfe38..15c6ce42 100644 --- a/configs/datasets/storycloze/storycloze_ppl_afd16f.py +++ b/configs/datasets/storycloze/storycloze_ppl_afd16f.py @@ -28,8 +28,8 @@ storycloze_datasets = [ dict( abbr='story_cloze', type=storyclozeDataset, - path='juletxara/xstory_cloze', - name='en', + path='./data/xstory_cloze', + lang='en', reader_cfg=storycloze_reader_cfg, infer_cfg=storycloze_infer_cfg, eval_cfg=storycloze_eval_cfg) diff --git a/configs/datasets/strategyqa/strategyqa_gen_1180a7.py b/configs/datasets/strategyqa/strategyqa_gen_1180a7.py index faecd76f..2eb96593 100644 --- a/configs/datasets/strategyqa/strategyqa_gen_1180a7.py +++ b/configs/datasets/strategyqa/strategyqa_gen_1180a7.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset, strategyqa_pred_postprocess, strategyqa_dataset_postprocess +from opencompass.datasets import StrategyQADataset, strategyqa_pred_postprocess, strategyqa_dataset_postprocess strategyqa_reader_cfg = dict( input_columns=['question'], @@ -86,8 +86,8 @@ strategyqa_eval_cfg = dict( strategyqa_datasets = [ dict( abbr='strategyqa', - type=HFDataset, - path='wics/strategy-qa', + type=StrategyQADataset, + path='./data/strategyqa/strategyQA_train.json', reader_cfg=strategyqa_reader_cfg, infer_cfg=strategyqa_infer_cfg, eval_cfg=strategyqa_eval_cfg) diff --git a/configs/datasets/strategyqa/strategyqa_gen_934441.py b/configs/datasets/strategyqa/strategyqa_gen_934441.py index 465d8b46..fa6270df 100644 --- a/configs/datasets/strategyqa/strategyqa_gen_934441.py +++ b/configs/datasets/strategyqa/strategyqa_gen_934441.py @@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import HFDataset, strategyqa_pred_postprocess, strategyqa_dataset_postprocess +from opencompass.datasets import StrategyQADataset, strategyqa_pred_postprocess, strategyqa_dataset_postprocess strategyqa_reader_cfg = dict( input_columns=['question'], @@ -50,8 +50,8 @@ strategyqa_eval_cfg = dict( strategyqa_datasets = [ dict( abbr='strategyqa', - type=HFDataset, - path='wics/strategy-qa', + type=StrategyQADataset, + path='./data/strategyqa/strategyQA_train.json', reader_cfg=strategyqa_reader_cfg, infer_cfg=strategyqa_infer_cfg, eval_cfg=strategyqa_eval_cfg) diff --git a/configs/datasets/tydiqa/tydiqa_gen_978d2a.py b/configs/datasets/tydiqa/tydiqa_gen_978d2a.py index 07ff7fa3..838a694c 100644 --- a/configs/datasets/tydiqa/tydiqa_gen_978d2a.py +++ b/configs/datasets/tydiqa/tydiqa_gen_978d2a.py @@ -6,9 +6,8 @@ from opencompass.datasets import TydiQADataset, TydiQAEvaluator # All configs are for TydiQA Goldp task tydiqa_reader_cfg = dict( input_columns=["passage_text", "question_text"], - output_column="answer", - test_split='validation', - train_split='validation',) + output_column="answer" +) langs = ['arabic', 'bengali', 'english', 'finnish', 'indonesian', 'japanese', 'korean', 'russian', 'swahili', 'telugu', 'thai'] @@ -33,19 +32,25 @@ for _lang in langs: 
prompt_template=dict( type=PromptTemplate, template=f"{_hint[0]}\n\n{_hint[1]}{{passage_text}}\n{_hint[2]} {{question_text}}\n{_hint[3]} {{answer}}" , - ice_token=''), + ice_token='' + ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer), max_out_len=50) + inferencer=dict(type=GenInferencer), max_out_len=50 + ) + + tydiqa_eval_cfg = dict( + evaluator=dict(type=TydiQAEvaluator), + ds_split='validation', + ds_column='answer', + ) - tydiqa_eval_cfg = dict(evaluator=dict(type=TydiQAEvaluator), - ds_split='validation', - ds_column='answer', - ) tydiqa_datasets.append( - dict(abbr=f'tyidqa-goldp_{_lang}', - type=TydiQADataset, - path='khalidalt/tydiqa-goldp', - name=_lang, - reader_cfg=tydiqa_reader_cfg, - infer_cfg=tydiqa_infer_cfg, - eval_cfg=tydiqa_eval_cfg)) \ No newline at end of file + dict(abbr=f'tyidqa-goldp_{_lang}', + type=TydiQADataset, + path='./data/tydiqa', + lang=_lang, + reader_cfg=tydiqa_reader_cfg, + infer_cfg=tydiqa_infer_cfg, + eval_cfg=tydiqa_eval_cfg + ) + ) diff --git a/configs/datasets/winogrande/winogrande_gen_a9ede5.py b/configs/datasets/winogrande/winogrande_gen_a9ede5.py index 0dabfb2e..4063b175 100644 --- a/configs/datasets/winogrande/winogrande_gen_a9ede5.py +++ b/configs/datasets/winogrande/winogrande_gen_a9ede5.py @@ -7,8 +7,8 @@ from opencompass.utils.text_postprocessors import first_option_postprocess winogrande_reader_cfg = dict( input_columns=["opt1", "opt2"], - output_column="label", - test_split="validation") + output_column="answer", +) winogrande_infer_cfg = dict( prompt_template=dict( @@ -35,8 +35,7 @@ winogrande_datasets = [ dict( abbr="winogrande", type=winograndeDataset_V2, - path="winogrande", - name="winogrande_xs", + path='./data/winogrande', reader_cfg=winogrande_reader_cfg, infer_cfg=winogrande_infer_cfg, eval_cfg=winogrande_eval_cfg, diff --git a/configs/datasets/winogrande/winogrande_ppl_55a66e.py b/configs/datasets/winogrande/winogrande_ppl_55a66e.py index fc65fa90..57f34268 100644 --- a/configs/datasets/winogrande/winogrande_ppl_55a66e.py +++ b/configs/datasets/winogrande/winogrande_ppl_55a66e.py @@ -7,8 +7,7 @@ from opencompass.datasets import winograndeDataset winogrande_reader_cfg = dict( input_columns=['opt1', 'opt2'], output_column='answer', - train_split='validation', - test_split='validation') +) winogrande_infer_cfg = dict( prompt_template=dict( @@ -28,8 +27,7 @@ winogrande_datasets = [ dict( abbr='winogrande', type=winograndeDataset, - path='winogrande', - name='winogrande_xs', + path='./data/winogrande', reader_cfg=winogrande_reader_cfg, infer_cfg=winogrande_infer_cfg, eval_cfg=winogrande_eval_cfg) diff --git a/configs/datasets/winogrande/winogrande_ppl_9307fd.py b/configs/datasets/winogrande/winogrande_ppl_9307fd.py index e58cb485..9301028e 100644 --- a/configs/datasets/winogrande/winogrande_ppl_9307fd.py +++ b/configs/datasets/winogrande/winogrande_ppl_9307fd.py @@ -7,8 +7,7 @@ from opencompass.datasets import winograndeDataset winogrande_reader_cfg = dict( input_columns=['opt1', 'opt2'], output_column='answer', - train_split='validation', - test_split='validation') +) winogrande_infer_cfg = dict( prompt_template=dict( @@ -26,8 +25,7 @@ winogrande_datasets = [ dict( abbr='winogrande', type=winograndeDataset, - path='winogrande', - name='winogrande_xs', + path='./data/winogrande', reader_cfg=winogrande_reader_cfg, infer_cfg=winogrande_infer_cfg, eval_cfg=winogrande_eval_cfg) diff --git a/configs/summarizers/medium.py b/configs/summarizers/medium.py index d813977e..de630861 100644 --- 
a/configs/summarizers/medium.py
+++ b/configs/summarizers/medium.py
@@ -87,17 +87,6 @@ summarizer = dict(
         'eprstmt-dev',
         'lambada',
         'tnews-dev',
-        '--------- 安全 Safety ---------',  # category
-        # '偏见', # subcategory
-        'crows_pairs',
-        # '有毒性(判别)', # subcategory
-        'civil_comments',
-        # '有毒性(判别)多语言', # subcategory
-        'jigsaw_multilingual',
-        # '有毒性(生成)', # subcategory
-        'real-toxicity-prompts',
-        # '真实性/有用性', # subcategory
-        'truthful_qa',
     ],
     summary_groups=sum(
         [v for k, v in locals().items() if k.endswith("_summary_groups")], []),
diff --git a/configs/summarizers/small.py b/configs/summarizers/small.py
index fc95caa8..09cdb692 100644
--- a/configs/summarizers/small.py
+++ b/configs/summarizers/small.py
@@ -56,8 +56,6 @@ summarizer = dict(
         'openbookqa_fact',
         'nq',
         'triviaqa',
-        '--- Security ---',
-        'crows_pairs',
     ],
     summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
 )
diff --git a/docs/en/get_started/installation.md b/docs/en/get_started/installation.md
index b578722e..c94fd3e4 100644
--- a/docs/en/get_started/installation.md
+++ b/docs/en/get_started/installation.md
@@ -66,10 +66,21 @@ Run the following commands to download and place the datasets in the `${OpenComp
 
 ```bash
 # Run in the OpenCompass directory
-wget https://github.com/open-compass/opencompass/releases/download/0.1.1/OpenCompassData.zip
-unzip OpenCompassData.zip
+wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
+unzip OpenCompassData-core-20231110.zip
 ```
 
+If you need to use the more comprehensive dataset (~500M) provided by OpenCompass, you can download it using the following command:
+
+```bash
+wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-complete-20231110.zip
+unzip OpenCompassData-complete-20231110.zip
+cd ./data
+unzip *.zip
+```
+
+The list of datasets included in both `.zip` files can be found [here](https://github.com/open-compass/opencompass/releases/tag/0.1.8.rc1).
+
 OpenCompass has supported most of the datasets commonly used for performance comparison, please refer to `configs/dataset` for the specific list of supported datasets.
 
 For next step, please read [Quick Start](./quick_start.md).
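The config changes above make these benchmarks read from the local `data/` folder instead of the HuggingFace hub, so the unzip step matters. Below is a minimal sanity-check sketch, assuming the core zip unpacks into the `./data/` layout the changed configs point at; the file names are copied from the configs and loaders in this diff and are illustrative, not exhaustive:

```python
import os

# A few of the local files the updated dataset configs expect; adjust the
# list to the benchmarks you actually plan to run.
expected = [
    './data/gsm8k/test.jsonl',
    './data/hellaswag/hellaswag.jsonl',
    './data/humaneval/human-eval-v2-20210705.jsonl',
    './data/commonsenseqa/dev_rand_split.jsonl',
]

for p in expected:
    print(('ok      ' if os.path.exists(p) else 'MISSING ') + p)
```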
diff --git a/docs/zh_cn/get_started/installation.md b/docs/zh_cn/get_started/installation.md
index 499ee7bd..360fdb81 100644
--- a/docs/zh_cn/get_started/installation.md
+++ b/docs/zh_cn/get_started/installation.md
@@ -66,10 +66,21 @@ OpenCompass 支持的数据集主要包括两个部分:
 在 OpenCompass 项目根目录下运行下面命令,将数据集准备至 `${OpenCompass}/data` 目录下:
 
 ```bash
-wget https://github.com/open-compass/opencompass/releases/download/0.1.1/OpenCompassData.zip
-unzip OpenCompassData.zip
+wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
+unzip OpenCompassData-core-20231110.zip
 ```
 
+如果需要使用 OpenCompass 提供的更加完整的数据集 (~500M),可以使用下述命令进行下载:
+
+```bash
+wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-complete-20231110.zip
+unzip OpenCompassData-complete-20231110.zip
+cd ./data
+unzip *.zip
+```
+
+两个 `.zip` 中所含数据集列表如[此处](https://github.com/open-compass/opencompass/releases/tag/0.1.8.rc1)所示。
+
 OpenCompass 已经支持了大多数常用于性能比较的数据集,具体支持的数据集列表请直接在 `configs/datasets` 下进行查找。
 
 接下来,你可以阅读[快速上手](./quick_start.md)了解 OpenCompass 的基本用法。
diff --git a/opencompass/datasets/commonsenseqa.py b/opencompass/datasets/commonsenseqa.py
index 17b836d0..a78601da 100644
--- a/opencompass/datasets/commonsenseqa.py
+++ b/opencompass/datasets/commonsenseqa.py
@@ -1,4 +1,7 @@
-from datasets import load_dataset
+import json
+import os
+
+from datasets import Dataset, DatasetDict
 
 from opencompass.registry import LOAD_DATASET
 
@@ -9,14 +12,33 @@ from .base import BaseDataset
 class commonsenseqaDataset(BaseDataset):
 
     @staticmethod
-    def load(**kwargs):
-        dataset = load_dataset(**kwargs)
+    def load(path):
+        dataset = {}
+        for split, stub in [
+            ['train', 'train_rand_split.jsonl'],
+            ['validation', 'dev_rand_split.jsonl'],
+        ]:
+            data_path = os.path.join(path, stub)
+            dataset_list = []
+            with open(data_path, 'r', encoding='utf-8') as f:
+                for line in f:
+                    line = json.loads(line)
+                    dataset_list.append({
+                        'question':
+                        line['question']['stem'],
+                        'A':
+                        line['question']['choices'][0]['text'],
+                        'B':
+                        line['question']['choices'][1]['text'],
+                        'C':
+                        line['question']['choices'][2]['text'],
+                        'D':
+                        line['question']['choices'][3]['text'],
+                        'E':
+                        line['question']['choices'][4]['text'],
+                        'answerKey':
+                        line['answerKey'],
+                    })
+            dataset[split] = Dataset.from_list(dataset_list)
 
-        def pre_process(example):
-            for i in range(5):
-                example[chr(ord('A') + i)] = example['choices']['text'][i]
-            return example
-
-        dataset = dataset.map(pre_process).remove_columns(
-            ['question_concept', 'id', 'choices'])
-        return dataset
+        return DatasetDict(dataset)
diff --git a/opencompass/datasets/drop.py b/opencompass/datasets/drop.py
index cd3da6d2..81ddbb19 100644
--- a/opencompass/datasets/drop.py
+++ b/opencompass/datasets/drop.py
@@ -1,4 +1,6 @@
-from datasets import DatasetDict, load_dataset
+import json
+
+from datasets import Dataset, DatasetDict
 
 from opencompass.registry import LOAD_DATASET
 
@@ -9,21 +11,37 @@ from .base import BaseDataset
 class dropDataset(BaseDataset):
 
     @staticmethod
-    def load(**kwargs):
-        dataset = load_dataset(**kwargs, split='validation')
+    def get_answers(validated_answers):
+        answers = []
+        for answer_item in validated_answers:
+            if answer_item['number']:
+                answers.append(answer_item['number'])
+            elif any(answer_item['date'][i] for i in ['day', 'month', 'year']):
+                d = [answer_item['date'][i] for i in ['day', 'month', 'year']]
+                answers.append(' '.join(d).strip())
+            else:
+                for span in answer_item['spans']:
+                    answers.append(span)
+        answers = list(set(answers))
+
return answers - def pre_process(example): - example['answers'] = example['answers_spans']['spans'] - example['prompt'] = example.pop('passage') - return example + @staticmethod + def load(path, only_number=True): + with open(path, 'r', encoding='utf-8') as f: + lines = json.load(f) + dataset_list = [] + for line in lines.values(): + for qa_pair in line['qa_pairs']: + validated_answers = qa_pair['validated_answers'] + if only_number and not any(i['number'] + for i in validated_answers): + continue + item = { + 'prompt': line['passage'], + 'question': qa_pair['question'], + 'answers': dropDataset.get_answers(validated_answers), + } + dataset_list.append(item) - def only_number(example): - for i in example['answers_spans']['types']: - if i == 'number': - return True - return False - - dataset = dataset.filter(only_number) - dataset = dataset.map(pre_process).remove_columns( - ['section_id', 'query_id']) - return DatasetDict({'validation': dataset}) + dataset_list = Dataset.from_list(dataset_list) + return DatasetDict({'validation': dataset_list}) diff --git a/opencompass/datasets/flores.py b/opencompass/datasets/flores.py index 3c3c03b8..b4d69f8d 100644 --- a/opencompass/datasets/flores.py +++ b/opencompass/datasets/flores.py @@ -1,6 +1,7 @@ +import os import re -from datasets import DatasetDict, load_dataset +from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS @@ -11,15 +12,30 @@ from .base import BaseDataset class FloresFirst100Dataset(BaseDataset): @staticmethod - def load(name): - return DatasetDict({ - 'dev': - load_dataset(path='facebook/flores', name=name, split='dev'), - 'devtest': - load_dataset(path='facebook/flores', - name=name, - split='devtest[:100]') - }) + def load_single(src_path, tgt_path, src_lang, tgt_lang): + + with open(src_path, 'r', encoding='utf-8') as f: + src_lines = f.readlines() + with open(tgt_path, 'r', encoding='utf-8') as f: + tgt_lines = f.readlines() + assert len(src_lines) == len(tgt_lines) + dataset_list = [{ + f'sentence_{src_lang}': src_lines[i].strip(), + f'sentence_{tgt_lang}': tgt_lines[i].strip(), + } for i in range(len(src_lines))] + return Dataset.from_list(dataset_list) + + @staticmethod + def load(path, name): + src_lang, tgt_lang = name.split('-') + dev_dataset = FloresFirst100Dataset.load_single( + os.path.join(path, 'dev', f'{src_lang}.dev'), + os.path.join(path, 'dev', f'{tgt_lang}.dev'), src_lang, tgt_lang) + devtest_dataset = FloresFirst100Dataset.load_single( + os.path.join(path, 'devtest', f'{src_lang}.devtest'), + os.path.join(path, 'devtest', f'{tgt_lang}.devtest'), src_lang, + tgt_lang) + return DatasetDict({'dev': dev_dataset, 'devtest': devtest_dataset}) @TEXT_POSTPROCESSORS.register_module('flores') diff --git a/opencompass/datasets/gsm8k.py b/opencompass/datasets/gsm8k.py index 3351775a..d440b2ec 100644 --- a/opencompass/datasets/gsm8k.py +++ b/opencompass/datasets/gsm8k.py @@ -1,5 +1,30 @@ +import json +import os + +from datasets import Dataset, DatasetDict + from opencompass.openicl import BaseEvaluator -from opencompass.registry import TEXT_POSTPROCESSORS +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class GSM8KDataset(BaseDataset): + + @staticmethod + def load(path): + datasets = {} + for split in ['train', 'test']: + split_path = os.path.join(path, split + '.jsonl') + dataset = [] + with open(split_path, 'r', encoding='utf-8') as f: + for line in f: + line = 
json.loads(line.strip()) + line['answer'] + dataset.append(line) + datasets[split] = Dataset.from_list(dataset) + return DatasetDict(datasets) @TEXT_POSTPROCESSORS.register_module('gsm8k_dataset') diff --git a/opencompass/datasets/hellaswag.py b/opencompass/datasets/hellaswag.py index 4541ca18..d615ea9e 100644 --- a/opencompass/datasets/hellaswag.py +++ b/opencompass/datasets/hellaswag.py @@ -1,6 +1,6 @@ import json -from datasets import Dataset, load_dataset +from datasets import Dataset from opencompass.registry import LOAD_DATASET @@ -11,15 +11,20 @@ from .base import BaseDataset class hellaswagDataset(BaseDataset): @staticmethod - def load(**kwargs): - dataset = load_dataset(**kwargs) - - def preprocess(example): - for i in range(4): - example[chr(ord('A') + i)] = example['endings'][i] - return example - - dataset = dataset.map(preprocess).remove_columns(['endings']) + def load(path): + dataset = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + data = json.loads(line) + dataset.append({ + 'ctx': data['query'].split(': ', 2)[-1], + 'A': data['choices'][0], + 'B': data['choices'][1], + 'C': data['choices'][2], + 'D': data['choices'][3], + 'label': data['gold'], + }) + dataset = Dataset.from_list(dataset) return dataset @@ -27,19 +32,20 @@ class hellaswagDataset(BaseDataset): class hellaswagDataset_V2(BaseDataset): @staticmethod - def load(**kwargs): - dataset = load_dataset(**kwargs) - - def preprocess(example): - for i in range(4): - example[chr(ord('A') + i)] = example['endings'][i] - if example['label']: - example['label'] = 'ABCD'[int(example['label'])] - else: - example['label'] = 'NULL' - return example - - dataset = dataset.map(preprocess).remove_columns(['endings']) + def load(path): + dataset = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + data = json.loads(line) + dataset.append({ + 'ctx': data['query'].split(': ', 1)[-1], + 'A': data['choices'][0], + 'B': data['choices'][1], + 'C': data['choices'][2], + 'D': data['choices'][3], + 'label': 'ABCD'[data['gold']], + }) + dataset = Dataset.from_list(dataset) return dataset diff --git a/opencompass/datasets/humaneval.py b/opencompass/datasets/humaneval.py index a58ce05b..78b8cc25 100644 --- a/opencompass/datasets/humaneval.py +++ b/opencompass/datasets/humaneval.py @@ -1,9 +1,27 @@ +import json import os.path as osp import re import tempfile from typing import List +from datasets import Dataset + from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class HumanevalDataset(BaseDataset): + + @staticmethod + def load(path): + dataset = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + dataset.append(json.loads(line.strip())) + return Dataset.from_list(dataset) class HumanEvaluator(BaseEvaluator): diff --git a/opencompass/datasets/lambada.py b/opencompass/datasets/lambada.py index 41c57141..cf8266f5 100644 --- a/opencompass/datasets/lambada.py +++ b/opencompass/datasets/lambada.py @@ -1,7 +1,8 @@ +import json import re import string -from datasets import DatasetDict, load_dataset +from datasets import Dataset, DatasetDict from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET @@ -14,16 +15,12 @@ from .base import BaseDataset class lambadaDataset(BaseDataset): @staticmethod - def load(**kwargs): - dataset = load_dataset(**kwargs, split='test') - - def preprocess(example): - prompt, target 
= example['text'].strip().rsplit(' ', 1) - example['prompt'] = prompt - example['label'] = target - return example - - dataset = dataset.map(preprocess) + def load(path): + dataset = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + dataset.append(json.loads(line)) + dataset = Dataset.from_list(dataset) return DatasetDict({'test': dataset}) diff --git a/opencompass/datasets/obqa.py b/opencompass/datasets/obqa.py index 4af7f44a..cd9cb4bc 100644 --- a/opencompass/datasets/obqa.py +++ b/opencompass/datasets/obqa.py @@ -1,4 +1,6 @@ -from datasets import load_dataset +import json + +from datasets import Dataset from opencompass.registry import LOAD_DATASET @@ -9,33 +11,46 @@ from .base import BaseDataset class OBQADataset(BaseDataset): @staticmethod - def load(**kwargs): - dataset = load_dataset(**kwargs) - - def pre_process(example): - for i in range(4): - example[chr(ord('A') + i)] = example['choices']['text'][i] - return example - - dataset = dataset.map(pre_process).remove_columns(['id', 'choices']) - return dataset + def load(path): + dataset_list = [] + with open(path, 'r') as f: + for line in f: + line = json.loads(line) + item = { + 'A': line['question']['choices'][0]['text'], + 'B': line['question']['choices'][1]['text'], + 'C': line['question']['choices'][2]['text'], + 'D': line['question']['choices'][3]['text'], + 'question_stem': line['question']['stem'], + 'answerKey': line['answerKey'], + } + if 'fact1' in line: + item['fact1'] = line['fact1'] + dataset_list.append(item) + return Dataset.from_list(dataset_list) @LOAD_DATASET.register_module() class OBQADataset_V2(BaseDataset): @staticmethod - def load(**kwargs): - dataset = load_dataset(**kwargs) - - def pre_process(example): - example['A'] = example['choices']['text'][0] - example['B'] = example['choices']['text'][1] - example['C'] = example['choices']['text'][2] - example['D'] = example['choices']['text'][3] - if not example['question_stem'].endswith('?'): - example['question_stem'] += ' what?' - return example - - dataset = dataset.map(pre_process).remove_columns(['id', 'choices']) - return dataset + def load(path): + dataset_list = [] + with open(path, 'r') as f: + for line in f: + line = json.loads(line) + question = line['question']['stem'] + if not question.endswith('?'): + question += ' what?' 
+ item = { + 'A': line['question']['choices'][0]['text'], + 'B': line['question']['choices'][1]['text'], + 'C': line['question']['choices'][2]['text'], + 'D': line['question']['choices'][3]['text'], + 'question_stem': question, + 'answerKey': line['answerKey'], + } + if 'fact1' in line: + item['fact1'] = line['fact1'] + dataset_list.append(item) + return Dataset.from_list(dataset_list) diff --git a/opencompass/datasets/piqa.py b/opencompass/datasets/piqa.py index 76b1eb1e..89ac5ec2 100644 --- a/opencompass/datasets/piqa.py +++ b/opencompass/datasets/piqa.py @@ -1,50 +1,108 @@ -from datasets import load_dataset +import json +import os + +from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET from .base import BaseDataset +@LOAD_DATASET.register_module() +class piqaDataset(BaseDataset): + + @staticmethod + def load_single(path, data_filename, label_filename): + data_path = os.path.join(path, data_filename) + label_path = os.path.join(path, label_filename) + dataset = [] + with open(data_path, 'r', encoding='utf-8') as f: + data_lines = f.readlines() + with open(label_path, 'r', encoding='utf-8') as f: + label_lines = f.readlines() + assert len(data_lines) == len(label_lines) + for data, label in zip(data_lines, label_lines): + i = json.loads(data.strip()) + i['label'] = int(label.strip()) + dataset.append(i) + + return Dataset.from_list(dataset) + + @staticmethod + def load(path): + train_dataset = piqaDataset.load_single(path, 'train.jsonl', + 'train-labels.lst') + val_dataset = piqaDataset.load_single(path, 'dev.jsonl', + 'dev-labels.lst') + return DatasetDict({'train': train_dataset, 'validation': val_dataset}) + + @LOAD_DATASET.register_module() class piqaDataset_V2(BaseDataset): @staticmethod - def load(**kwargs): - dataset = load_dataset(**kwargs) - - def preprocess(example): - assert isinstance(example['label'], int) - if example['label'] < 0: - example['answer'] = 'NULL' + def load_single(path, data_filename, label_filename): + data_path = os.path.join(path, data_filename) + label_path = os.path.join(path, label_filename) + dataset = [] + with open(data_path, 'r', encoding='utf-8') as f: + data_lines = f.readlines() + with open(label_path, 'r', encoding='utf-8') as f: + label_lines = f.readlines() + assert len(data_lines) == len(label_lines) + for data, label in zip(data_lines, label_lines): + i = json.loads(data.strip()) + label = int(label.strip()) + if label < 0: + i['answer'] = 'NULL' else: - example['answer'] = 'AB'[example['label']] - example.pop('label') - return example + i['answer'] = 'AB'[label] + dataset.append(i) - dataset = dataset.map(preprocess) - return dataset + return Dataset.from_list(dataset) + + @staticmethod + def load(path): + train_dataset = piqaDataset_V2.load_single(path, 'train.jsonl', + 'train-labels.lst') + val_dataset = piqaDataset_V2.load_single(path, 'dev.jsonl', + 'dev-labels.lst') + return DatasetDict({'train': train_dataset, 'validation': val_dataset}) @LOAD_DATASET.register_module() class piqaDataset_V3(BaseDataset): @staticmethod - def load(**kwargs): - dataset = load_dataset(**kwargs) - - def preprocess(example): - example['goal'] = example['goal'][0].upper() + example['goal'][1:] - if example['goal'].endswith('?') or example['goal'].endswith('.'): - example['sol1'] = example['sol1'][0].upper( - ) + example['sol1'][1:] - example['sol2'] = example['sol2'][0].upper( - ) + example['sol2'][1:] + def load_single(path, data_filename, label_filename): + data_path = os.path.join(path, data_filename) + label_path = 
os.path.join(path, label_filename) + dataset = [] + with open(data_path, 'r', encoding='utf-8') as f: + data_lines = f.readlines() + with open(label_path, 'r', encoding='utf-8') as f: + label_lines = f.readlines() + assert len(data_lines) == len(label_lines) + for data, label in zip(data_lines, label_lines): + i = json.loads(data.strip()) + i['label'] = int(label.strip()) + # some preprocessing + i['goal'] = i['goal'][0].upper() + i['goal'][1:] + if i['goal'].endswith('?') or i['goal'].endswith('.'): + i['sol1'] = i['sol1'][0].upper() + i['sol1'][1:] + i['sol2'] = i['sol2'][0].upper() + i['sol2'][1:] else: - example['sol1'] = example['sol1'][0].lower( - ) + example['sol1'][1:] - example['sol2'] = example['sol2'][0].lower( - ) + example['sol2'][1:] - return example + i['sol1'] = i['sol1'][0].lower() + i['sol1'][1:] + i['sol2'] = i['sol2'][0].lower() + i['sol2'][1:] - dataset = dataset.map(preprocess) - return dataset + dataset.append(i) + + return Dataset.from_list(dataset) + + @staticmethod + def load(path): + train_dataset = piqaDataset_V3.load_single(path, 'train.jsonl', + 'train-labels.lst') + val_dataset = piqaDataset_V3.load_single(path, 'dev.jsonl', + 'dev-labels.lst') + return DatasetDict({'train': train_dataset, 'validation': val_dataset}) diff --git a/opencompass/datasets/race.py b/opencompass/datasets/race.py index aa71c67f..f5be41cf 100644 --- a/opencompass/datasets/race.py +++ b/opencompass/datasets/race.py @@ -1,4 +1,7 @@ -from datasets import load_dataset +import json +import os + +from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET @@ -10,12 +13,21 @@ class RaceDataset(BaseDataset): @staticmethod def load(path: str, name: str): - dataset = load_dataset(path, name) - - def preprocess(x): - for ans, option in zip(['A', 'B', 'C', 'D'], x['options']): - x[ans] = option - del x['options'] - return x - - return dataset.map(preprocess) + dataset = {} + for split in ['validation', 'test']: + jsonl_path = os.path.join(path, split, f'{name}.jsonl') + dataset_list = [] + with open(jsonl_path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + dataset_list.append({ + 'article': line['article'], + 'question': line['question'], + 'A': line['options'][0], + 'B': line['options'][1], + 'C': line['options'][2], + 'D': line['options'][3], + 'answer': line['answer'], + }) + dataset[split] = Dataset.from_list(dataset_list) + return DatasetDict(dataset) diff --git a/opencompass/datasets/siqa.py b/opencompass/datasets/siqa.py index 37409631..68d143e5 100644 --- a/opencompass/datasets/siqa.py +++ b/opencompass/datasets/siqa.py @@ -1,4 +1,7 @@ -from datasets import load_dataset +import json +import os + +from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET @@ -6,24 +9,72 @@ from .base import BaseDataset @LOAD_DATASET.register_module() -class siqaDataset_V2(BaseDataset): +class siqaDataset(BaseDataset): + """Disconnect from HuggingFace version of HFDataset.""" @staticmethod - def load(**kwargs): - dataset = load_dataset(**kwargs) + def load_single(path, data_filename, label_filename): + data_path = os.path.join(path, data_filename) + label_path = os.path.join(path, label_filename) + dataset = [] + with open(data_path, 'r', encoding='utf-8') as f: + data_lines = f.readlines() + with open(label_path, 'r', encoding='utf-8') as f: + label_lines = f.readlines() + assert len(data_lines) == len(label_lines) + for data, label in zip(data_lines, label_lines): + i = json.loads(data.strip()) + i['label'] = 
int(label.strip()) + dataset.append(i) - def preprocess(example): - example['all_labels'] = { + return Dataset.from_list(dataset) + + @staticmethod + def load(path): + train_dataset = siqaDataset.load_single(path, 'train.jsonl', + 'train-labels.lst') + val_dataset = siqaDataset.load_single(path, 'dev.jsonl', + 'dev-labels.lst') + return DatasetDict({'train': train_dataset, 'validation': val_dataset}) + + +@LOAD_DATASET.register_module() +class siqaDataset_V2(BaseDataset): + """Disconnect from HuggingFace version of siqaDataset_V2.""" + + @staticmethod + def load_single(path, data_filename, label_filename): + data_path = os.path.join(path, data_filename) + label_path = os.path.join(path, label_filename) + dataset = [] + with open(data_path, 'r', encoding='utf-8') as f: + data_lines = f.readlines() + with open(label_path, 'r', encoding='utf-8') as f: + label_lines = f.readlines() + assert len(data_lines) == len(label_lines) + for data, label in zip(data_lines, label_lines): + i = json.loads(data.strip()) + label = int(label.strip()) + # some preprocessing + i['all_labels'] = { 'candidates': [ - f'A. {example["answerA"]}', - f'B. {example["answerB"]}', - f'C. {example["answerC"]}', + [f'A. {i["answerA"]}', 'A', i['answerA']], + [f'B. {i["answerB"]}', 'B', i['answerB']], + [f'C. {i["answerC"]}', 'C', i['answerC']], ], 'label': - int(example['label']) - 1 + label - 1 } - example['label'] = ' ABC'[int(example['label'])] - return example + i['label'] = ' ABC'[label] - dataset = dataset.map(preprocess) - return dataset + dataset.append(i) + + return Dataset.from_list(dataset) + + @staticmethod + def load(path): + train_dataset = siqaDataset_V2.load_single(path, 'train.jsonl', + 'train-labels.lst') + val_dataset = siqaDataset_V2.load_single(path, 'dev.jsonl', + 'dev-labels.lst') + return DatasetDict({'train': train_dataset, 'validation': val_dataset}) diff --git a/opencompass/datasets/storycloze.py b/opencompass/datasets/storycloze.py index 44577253..a0e2ec6b 100644 --- a/opencompass/datasets/storycloze.py +++ b/opencompass/datasets/storycloze.py @@ -1,4 +1,7 @@ -from datasets import DatasetDict, load_dataset +import json +import os + +from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET @@ -9,38 +12,39 @@ from .base import BaseDataset class storyclozeDataset(BaseDataset): @staticmethod - def load(**kwargs): - # special process - dataset = load_dataset(**kwargs, split='train+eval') - - def preprocess(example): - example['context'] = ' '.join([ - example['input_sentence_1'], example['input_sentence_2'], - example['input_sentence_3'], example['input_sentence_4'] - ]) - return example - - dataset = dataset.map(preprocess) - - return DatasetDict({'test': dataset}) + def load(path, lang): + dataset_list = [] + for split in ['train', 'eval']: + split_path = os.path.join(path, f'{lang}_{split}.jsonl') + with open(split_path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + line['context'] = ' '.join([ + line['input_sentence_1'], line['input_sentence_2'], + line['input_sentence_3'], line['input_sentence_4'] + ]) + dataset_list.append(line) + dataset_list = Dataset.from_list(dataset_list) + return DatasetDict({'test': dataset_list}) @LOAD_DATASET.register_module() class storyclozeDataset_V2(BaseDataset): @staticmethod - def load(**kwargs): - # special process - dataset = load_dataset(**kwargs, split='train+eval') - - def preprocess(example): - example['context'] = ' '.join([ - example['input_sentence_1'], example['input_sentence_2'], - 
example['input_sentence_3'], example['input_sentence_4'] - ]) - example['answer_right_ending'] = ' AB'[ - example['answer_right_ending']] - return example - - dataset = dataset.map(preprocess) - return dataset + def load(path, lang): + dataset_list = [] + for split in ['train', 'eval']: + split_path = os.path.join(path, f'{lang}_{split}.jsonl') + with open(split_path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + line['context'] = ' '.join([ + line['input_sentence_1'], line['input_sentence_2'], + line['input_sentence_3'], line['input_sentence_4'] + ]) + line['answer_right_ending'] = ' AB'[ + line['answer_right_ending']] + dataset_list.append(line) + dataset_list = Dataset.from_list(dataset_list) + return dataset_list diff --git a/opencompass/datasets/strategyqa.py b/opencompass/datasets/strategyqa.py index ae8a155f..5e0117f3 100644 --- a/opencompass/datasets/strategyqa.py +++ b/opencompass/datasets/strategyqa.py @@ -1,6 +1,11 @@ +import json import re -from opencompass.registry import TEXT_POSTPROCESSORS +from datasets import Dataset + +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS + +from .base import BaseDataset @TEXT_POSTPROCESSORS.register_module('strategyqa') @@ -16,3 +21,13 @@ def strategyqa_pred_postprocess(text: str) -> str: @TEXT_POSTPROCESSORS.register_module('strategyqa_dataset') def strategyqa_dataset_postprocess(text: str) -> str: return 'yes' if str(text) == 'True' else 'no' + + +@LOAD_DATASET.register_module() +class StrategyQADataset(BaseDataset): + + @staticmethod + def load(path): + with open(path, 'r', encoding='utf-8') as f: + dataset = json.load(f) + return Dataset.from_list(dataset) diff --git a/opencompass/datasets/tydiqa.py b/opencompass/datasets/tydiqa.py index 7b048594..eebbab29 100644 --- a/opencompass/datasets/tydiqa.py +++ b/opencompass/datasets/tydiqa.py @@ -1,7 +1,9 @@ +import json +import os import re from collections import Counter -from datasets import load_dataset +from datasets import Dataset from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.utils.text_postprocessors import general_postprocess @@ -12,15 +14,16 @@ from .base import BaseDataset class TydiQADataset(BaseDataset): @staticmethod - def load(**kwargs): - dataset = load_dataset(**kwargs) - - def pre_process(example): - example['answer'] = example['answers']['text'] - return example - - dataset = dataset.map(pre_process).remove_columns(['id', 'answers']) - return dataset + def load(path, lang): + path = os.path.join(path, 'dev', f'{lang}-dev.jsonl') + dataset_list = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + answer = list(set([i['text'] for i in line['answers']])) + line['answer'] = answer + dataset_list.append(line) + return Dataset.from_list(dataset_list) class TydiQAEvaluator(BaseEvaluator): diff --git a/opencompass/datasets/winogrande.py b/opencompass/datasets/winogrande.py index 90d19910..b238b7f8 100644 --- a/opencompass/datasets/winogrande.py +++ b/opencompass/datasets/winogrande.py @@ -1,4 +1,7 @@ -from datasets import load_dataset +import json +import os + +from datasets import Dataset from opencompass.registry import LOAD_DATASET @@ -7,38 +10,49 @@ from .base import BaseDataset @LOAD_DATASET.register_module() class winograndeDataset(BaseDataset): + """Disconnect from Huggingface, winograndeDataset.""" @staticmethod - def load(**kwargs): - - dataset = load_dataset(**kwargs) - - def preprocess(example): - prompt = example.pop('sentence') - example['opt1'] = 
prompt.replace('_', example.pop('option1')) - example['opt2'] = prompt.replace('_', example.pop('option2')) - return example - - return dataset.map(preprocess) + def load(path): + path = os.path.join(path, 'dev.jsonl') + dataset_list = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + prompt = line['sentence'] + dataset_list.append({ + 'opt1': + prompt.replace('_', line['option1']), + 'opt2': + prompt.replace('_', line['option2']), + 'answer': + line['answer'] + }) + dataset_list = Dataset.from_list(dataset_list) + return dataset_list @LOAD_DATASET.register_module() class winograndeDataset_V2(BaseDataset): + """Disconnect from Huggingface, winograndeDataset_V2.""" @staticmethod - def load(**kwargs): - - dataset = load_dataset(**kwargs) - - def preprocess(example): - prompt = example.pop('sentence') - example['opt1'] = prompt.replace('_', example.pop('option1')) - example['opt2'] = prompt.replace('_', example.pop('option2')) - answer = example.pop('answer') - if answer == '': - example['label'] = 'NULL' - else: - example['label'] = ' AB'[int(answer)] - return example - - return dataset.map(preprocess) + def load(path): + path = os.path.join(path, 'dev.jsonl') + dataset_list = [] + with open(path, 'r', encoding='utf-8') as f: + for line in f: + line = json.loads(line) + prompt = line['sentence'] + answer = line['answer'] + answer = ' AB'[int(answer)] if answer != '' else 'NULL' + dataset_list.append({ + 'opt1': + prompt.replace('_', line['option1']), + 'opt2': + prompt.replace('_', line['option2']), + 'answer': + answer + }) + dataset_list = Dataset.from_list(dataset_list) + return dataset_list
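
Taken together, the loader changes above all follow one pattern: drop the remote HuggingFace `load_dataset(...)` call and read the unzipped `OpenCompassData` files under `./data` into a `Dataset` / `DatasetDict` built with `Dataset.from_list`. Below is a minimal sketch of that shared pattern, assuming it would live alongside the other loaders in `opencompass/datasets/`; the class name `MyJsonlDataset` and the `train.jsonl` / `dev.jsonl` filenames are illustrative placeholders, not part of this diff.

```python
# Sketch only: a generic local-JSONL loader in the style of the diff above.
# MyJsonlDataset and the split filenames are hypothetical examples.
import json
import os

from datasets import Dataset, DatasetDict

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class MyJsonlDataset(BaseDataset):

    @staticmethod
    def load(path):
        # One JSONL file per split, mirroring e.g. GSM8KDataset above.
        dataset = {}
        for split, stub in [('train', 'train.jsonl'),
                            ('validation', 'dev.jsonl')]:
            data_path = os.path.join(path, stub)
            items = []
            with open(data_path, 'r', encoding='utf-8') as f:
                for line in f:
                    # Each line is a self-contained JSON record.
                    items.append(json.loads(line.strip()))
            dataset[split] = Dataset.from_list(items)
        return DatasetDict(dataset)
```

The matching config change is the one applied to the commonsenseqa configs earlier in this patch: give the dataset an `abbr`, keep the loader `type`, and point `path` at the local directory (for example `path='./data/commonsenseqa'`) produced by unzipping the release archive, instead of a HuggingFace dataset name.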