From ae3c1869dacc6afc56076ab516edec6b8f648696 Mon Sep 17 00:00:00 2001
From: Yuan Liu <30762564+YuanLiuuuuuu@users.noreply.github.com>
Date: Thu, 17 Aug 2023 11:11:26 +0800
Subject: [PATCH] [Feature]: Add other public datasets config (#214)

* [Feature]: Add flickr30k

* [Feature]: Add GQA

* [Feature]: Add OCR VQA

* [Feature]: Add OK VQA

* [Feature]: Add text vqa

* [Feature]: Add other vqa

---
 .../minigpt_4/minigpt_4_7b_flickr30k.py       | 53 ++++++++++++++++++
 .../multimodal/minigpt_4/minigpt_4_7b_gqa.py  | 52 ++++++++++++++++++
 .../minigpt_4/minigpt_4_7b_ocr-vqa.py         | 53 ++++++++++++++++++
 .../minigpt_4/minigpt_4_7b_ok-vqa.py          | 55 +++++++++++++++++++
 .../minigpt_4/minigpt_4_7b_textvqa.py         | 55 +++++++++++++++++++
 .../minigpt_4/minigpt_4_7b_vizwiz.py          | 52 ++++++++++++++++++
 6 files changed, 320 insertions(+)
 create mode 100644 configs/multimodal/minigpt_4/minigpt_4_7b_flickr30k.py
 create mode 100644 configs/multimodal/minigpt_4/minigpt_4_7b_gqa.py
 create mode 100644 configs/multimodal/minigpt_4/minigpt_4_7b_ocr-vqa.py
 create mode 100644 configs/multimodal/minigpt_4/minigpt_4_7b_ok-vqa.py
 create mode 100644 configs/multimodal/minigpt_4/minigpt_4_7b_textvqa.py
 create mode 100644 configs/multimodal/minigpt_4/minigpt_4_7b_vizwiz.py

diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_flickr30k.py b/configs/multimodal/minigpt_4/minigpt_4_7b_flickr30k.py
new file mode 100644
index 00000000..69bd2afc
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_flickr30k.py
@@ -0,0 +1,53 @@
+from opencompass.multimodal.models.minigpt_4 import (
+    MiniGPT4COCOCaotionPromptConstructor,
+    MiniGPT4COCOCaptionPostProcessor,
+)
+
+# dataloader settings
+val_pipeline = [
+    dict(type='mmpretrain.LoadImageFromFile'),
+    dict(type='mmpretrain.ToPIL', to_rgb=True),
+    dict(type='mmpretrain.torchvision/Resize',
+         size=(384, 384),
+         interpolation=3),
+    dict(type='mmpretrain.torchvision/ToTensor'),
+    dict(type='mmpretrain.torchvision/Normalize',
+         mean=(0.48145466, 0.4578275, 0.40821073),
+         std=(0.26862954, 0.26130258, 0.27577711)),
+    dict(type='mmpretrain.PackInputs', algorithm_keys=['image_id'])
+]
+
+dataset = dict(type='mmpretrain.Flickr30kCaption',
+               data_root='data/flickr30k',
+               ann_file='annotations/dataset_flickr30k.json',
+               data_prefix='images',
+               split='val',
+               pipeline=val_pipeline)
+
+minigpt_4_flickr30k_dataloader = dict(
+    batch_size=1,
+    num_workers=4,
+    dataset=dataset,
+    collate_fn=dict(type='pseudo_collate'),
+    sampler=dict(type='DefaultSampler', shuffle=False))
+
+# model settings
+minigpt_4_flickr30k_model = dict(
+    type='minigpt-4',
+    low_resource=False,
+    img_size=384,
+    llama_model='/path/to/vicuna-7b/',
+    prompt_constructor=dict(type=MiniGPT4COCOCaotionPromptConstructor,
+                            image_prompt='###Human: ',
+                            reply_prompt='###Assistant:'),
+    post_processor=dict(type=MiniGPT4COCOCaptionPostProcessor))
+
+# evaluation settings
+minigpt_4_flickr30k_evaluator = [
+    dict(
+        type='mmpretrain.COCOCaption',
+        ann_file='data/coco/annotations/coco_karpathy_val_gt.json',
+    )  # noqa
+]
+
+minigpt_4_flickr30k_load_from = '/path/to/prerained_minigpt4_7b.pth'  # noqa

diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_gqa.py b/configs/multimodal/minigpt_4/minigpt_4_7b_gqa.py
new file mode 100644
index 00000000..f0cec5a6
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_gqa.py
@@ -0,0 +1,52 @@
+from opencompass.multimodal.models.minigpt_4 import (
+    MiniGPT4VQAPromptConstructor,
+    MiniGPT4VQAPostProcessor,
+)
+
+# dataloader settings
+val_pipeline = [
+    dict(type='mmpretrain.LoadImageFromFile'),
+    dict(type='mmpretrain.ToPIL', to_rgb=True),
+    dict(type='mmpretrain.torchvision/Resize',
+         size=(224, 224),
+         interpolation=3),
+    dict(type='mmpretrain.torchvision/ToTensor'),
+    dict(type='mmpretrain.torchvision/Normalize',
+         mean=(0.48145466, 0.4578275, 0.40821073),
+         std=(0.26862954, 0.26130258, 0.27577711)),
+    dict(
+        type='mmpretrain.PackInputs',
+        algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+        meta_keys=['question_id', 'image_id'],
+    )
+]
+
+dataset = dict(type='mmpretrain.GQA',
+               data_root='data/gqa',
+               data_prefix='images',
+               ann_file='annotations/testdev_balanced_questions.json',
+               pipeline=val_pipeline)
+
+minigpt_4_gqa_dataloader = dict(batch_size=1,
+                                num_workers=4,
+                                dataset=dataset,
+                                collate_fn=dict(type='pseudo_collate'),
+                                sampler=dict(type='DefaultSampler',
+                                             shuffle=False))
+
+# model settings
+minigpt_4_gqa_model = dict(type='minigpt-4',
+                           low_resource=False,
+                           img_size=224,
+                           max_length=10,
+                           llama_model='/path/to/vicuna-7b/',
+                           prompt_constructor=dict(
+                               type=MiniGPT4VQAPromptConstructor,
+                               image_prompt='###Human: ',
+                               reply_prompt='###Assistant:'),
+                           post_processor=dict(type=MiniGPT4VQAPostProcessor))
+
+# evaluation settings
+minigpt_4_gqa_evaluator = [dict(type='mmpretrain.GQAAcc')]
+
+minigpt_4_gqa_load_from = '/path/to/prerained_minigpt4_7b.pth'  # noqa

diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_ocr-vqa.py b/configs/multimodal/minigpt_4/minigpt_4_7b_ocr-vqa.py
new file mode 100644
index 00000000..f95b2f85
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_ocr-vqa.py
@@ -0,0 +1,53 @@
+from opencompass.multimodal.models.minigpt_4 import (
+    MiniGPT4VQAPromptConstructor,
+    MiniGPT4VQAPostProcessor,
+)
+
+# dataloader settings
+val_pipeline = [
+    dict(type='mmpretrain.LoadImageFromFile'),
+    dict(type='mmpretrain.ToPIL', to_rgb=True),
+    dict(type='mmpretrain.torchvision/Resize',
+         size=(224, 224),
+         interpolation=3),
+    dict(type='mmpretrain.torchvision/ToTensor'),
+    dict(type='mmpretrain.torchvision/Normalize',
+         mean=(0.48145466, 0.4578275, 0.40821073),
+         std=(0.26862954, 0.26130258, 0.27577711)),
+    dict(
+        type='mmpretrain.PackInputs',
+        algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+        meta_keys=['question_id', 'image_id'],
+    )
+]
+
+dataset = dict(type='mmpretrain.OCRVQA',
+               data_root='data/ocrvqa',
+               ann_file='annotations/dataset.json',
+               split='test',
+               data_prefix='images',
+               pipeline=val_pipeline)
+
+minigpt_4_ocr_vqa_dataloader = dict(batch_size=1,
+                                    num_workers=4,
+                                    dataset=dataset,
+                                    collate_fn=dict(type='pseudo_collate'),
+                                    sampler=dict(type='DefaultSampler',
+                                                 shuffle=False))
+
+# model settings
+minigpt_4_ocr_vqa_model = dict(
+    type='minigpt-4',
+    low_resource=False,
+    img_size=224,
+    max_length=10,
+    llama_model='/path/to/vicuna-7b/',
+    prompt_constructor=dict(type=MiniGPT4VQAPromptConstructor,
+                            image_prompt='###Human: ',
+                            reply_prompt='###Assistant:'),
+    post_processor=dict(type=MiniGPT4VQAPostProcessor))
+
+# evaluation settings
+minigpt_4_ocr_vqa_evaluator = [dict(type='mmpretrain.VQAAcc')]
+
+minigpt_4_ocr_vqa_load_from = '/path/to/prerained_minigpt4_7b.pth'  # noqa

diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_ok-vqa.py b/configs/multimodal/minigpt_4/minigpt_4_7b_ok-vqa.py
new file mode 100644
index 00000000..249223f5
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_ok-vqa.py
@@ -0,0 +1,55 @@
+from opencompass.multimodal.models.minigpt_4 import (
+    MiniGPT4VQAPromptConstructor,
+    MiniGPT4VQAPostProcessor,
+)
+
+# dataloader settings
+val_pipeline = [
+    dict(type='mmpretrain.LoadImageFromFile'),
+    dict(type='mmpretrain.ToPIL', to_rgb=True),
+    dict(type='mmpretrain.torchvision/Resize',
+         size=(224, 224),
+         interpolation=3),
+    dict(type='mmpretrain.torchvision/ToTensor'),
+    dict(type='mmpretrain.torchvision/Normalize',
+         mean=(0.48145466, 0.4578275, 0.40821073),
+         std=(0.26862954, 0.26130258, 0.27577711)),
+    dict(
+        type='mmpretrain.PackInputs',
+        algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+        meta_keys=['question_id', 'image_id'],
+    )
+]
+
+dataset = dict(
+    type='mmpretrain.COCOVQA',
+    data_root='data/okvqa',
+    question_file='annotations/OpenEnded_mscoco_val2014_questions.json',
+    ann_file='annotations/mscoco_val2014_annotations.json',
+    pipeline=val_pipeline,
+    data_prefix='images/val2014',
+)
+
+minigpt_4_ok_vqa_dataloader = dict(batch_size=1,
+                                   num_workers=4,
+                                   dataset=dataset,
+                                   collate_fn=dict(type='pseudo_collate'),
+                                   sampler=dict(type='DefaultSampler',
+                                                shuffle=False))
+
+# model settings
+minigpt_4_ok_vqa_model = dict(
+    type='minigpt-4',
+    low_resource=False,
+    img_size=224,
+    max_length=10,
+    llama_model='/path/to/vicuna-7b/',
+    prompt_constructor=dict(type=MiniGPT4VQAPromptConstructor,
+                            image_prompt='###Human: ',
+                            reply_prompt='###Assistant:'),
+    post_processor=dict(type=MiniGPT4VQAPostProcessor))
+
+# evaluation settings
+minigpt_4_ok_vqa_evaluator = [dict(type='mmpretrain.VQAAcc')]
+
+minigpt_4_ok_vqa_load_from = '/path/to/prerained_minigpt4_7b.pth'  # noqa

diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_textvqa.py b/configs/multimodal/minigpt_4/minigpt_4_7b_textvqa.py
new file mode 100644
index 00000000..3ad2b7ef
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_textvqa.py
@@ -0,0 +1,55 @@
+from opencompass.multimodal.models.minigpt_4 import (
+    MiniGPT4VQAPromptConstructor,
+    MiniGPT4VQAPostProcessor,
+)
+
+
+# dataloader settings
+val_pipeline = [
+    dict(type='mmpretrain.LoadImageFromFile'),
+    dict(type='mmpretrain.ToPIL', to_rgb=True),
+    dict(type='mmpretrain.torchvision/Resize',
+         size=(224, 224),
+         interpolation=3),
+    dict(type='mmpretrain.torchvision/ToTensor'),
+    dict(type='mmpretrain.torchvision/Normalize',
+         mean=(0.48145466, 0.4578275, 0.40821073),
+         std=(0.26862954, 0.26130258, 0.27577711)),
+    dict(
+        type='mmpretrain.PackInputs',
+        algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+        meta_keys=['question_id', 'image_id'],
+    )
+]
+
+dataset = dict(
+    type='mmpretrain.TextVQA',
+    data_root='data/textvqa',
+    ann_file='annotations/TextVQA_0.5.1_val.json',
+    pipeline=val_pipeline,
+    data_prefix='images/train_images',
+)
+
+minigpt_4_textvqa_dataloader = dict(batch_size=1,
+                                    num_workers=4,
+                                    dataset=dataset,
+                                    collate_fn=dict(type='pseudo_collate'),
+                                    sampler=dict(type='DefaultSampler',
+                                                 shuffle=False))
+
+# model settings
+minigpt_4_textvqa_model = dict(
+    type='minigpt-4',
+    low_resource=False,
+    img_size=224,
+    max_length=10,
+    llama_model='/path/to/vicuna-7b/',
+    prompt_constructor=dict(type=MiniGPT4VQAPromptConstructor,
+                            image_prompt='###Human: ',
+                            reply_prompt='###Assistant:'),
+    post_processor=dict(type=MiniGPT4VQAPostProcessor))
+
+# evaluation settings
+minigpt_4_textvqa_evaluator = [dict(type='mmpretrain.VQAAcc')]
+
+minigpt_4_textvqa_load_from = '/path/to/prerained_minigpt4_7b.pth'  # noqa

diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_vizwiz.py b/configs/multimodal/minigpt_4/minigpt_4_7b_vizwiz.py
new file mode 100644
index 00000000..0a371aed
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_vizwiz.py
@@ -0,0 +1,52 @@
+from opencompass.multimodal.models.minigpt_4 import (
+    MiniGPT4VQAPromptConstructor,
+    MiniGPT4VQAPostProcessor,
+)
+
+# dataloader settings
+val_pipeline = [
+    dict(type='mmpretrain.LoadImageFromFile'),
+    dict(type='mmpretrain.ToPIL', to_rgb=True),
+    dict(type='mmpretrain.torchvision/Resize',
+         size=(224, 224),
+         interpolation=3),
+    dict(type='mmpretrain.torchvision/ToTensor'),
+    dict(type='mmpretrain.torchvision/Normalize',
+         mean=(0.48145466, 0.4578275, 0.40821073),
+         std=(0.26862954, 0.26130258, 0.27577711)),
+    dict(
+        type='mmpretrain.PackInputs',
+        algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+        meta_keys=['question_id', 'image_id'],
+    )
+]
+
+dataset = dict(type='mmpretrain.VizWiz',
+               data_root='data/vizwiz/',
+               data_prefix='Images/val',
+               ann_file='Annotations/val.json',
+               pipeline=val_pipeline)
+
+minigpt_4_vizwiz_dataloader = dict(batch_size=1,
+                                   num_workers=4,
+                                   dataset=dataset,
+                                   collate_fn=dict(type='pseudo_collate'),
+                                   sampler=dict(type='DefaultSampler',
+                                                shuffle=False))
+
+# model settings
+minigpt_4_vizwiz_model = dict(
+    type='minigpt-4',
+    low_resource=False,
+    img_size=224,
+    max_length=10,
+    llama_model='/path/to/vicuna-7b/',
+    prompt_constructor=dict(type=MiniGPT4VQAPromptConstructor,
+                            image_prompt='###Human: ',
+                            reply_prompt='###Assistant:'),
+    post_processor=dict(type=MiniGPT4VQAPostProcessor))
+
+# evaluation settings
+minigpt_4_vizwiz_evaluator = [dict(type='mmpretrain.VQAAcc')]
+
+minigpt_4_vizwiz_load_from = '/path/to/prerained_minigpt4_7b.pth'  # noqa
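Usage note (not part of the patch): each new config exposes four variables per dataset
(*_dataloader, *_model, *_evaluator, *_load_from). Below is a minimal sketch of how one
of them could be consumed by a multimodal task config, assuming the layout of the
existing configs/multimodal/tasks.py; the GPU/process counts and launcher are
illustrative assumptions, not values set by this PR.

    from mmengine.config import read_base

    with read_base():
        # pull in the GQA variables added by this patch
        from .minigpt_4.minigpt_4_7b_gqa import (minigpt_4_gqa_dataloader,
                                                 minigpt_4_gqa_evaluator,
                                                 minigpt_4_gqa_load_from,
                                                 minigpt_4_gqa_model)

    models = [minigpt_4_gqa_model]
    datasets = [minigpt_4_gqa_dataloader]
    evaluators = [minigpt_4_gqa_evaluator]
    load_froms = [minigpt_4_gqa_load_from]

    # assumptions: adjust to the available hardware and preferred launcher
    num_gpus = 8
    num_procs = 8
    launcher = 'pytorch'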