From ae3c1869dacc6afc56076ab516edec6b8f648696 Mon Sep 17 00:00:00 2001
From: Yuan Liu <30762564+YuanLiuuuuuu@users.noreply.github.com>
Date: Thu, 17 Aug 2023 11:11:26 +0800
Subject: [PATCH] [Feature]: Add other public datasets config (#214)
* [Feature]: Add Flickr30k
* [Feature]: Add GQA
* [Feature]: Add OCR-VQA
* [Feature]: Add OK-VQA
* [Feature]: Add TextVQA
* [Feature]: Add VizWiz
---
.../minigpt_4/minigpt_4_7b_flickr30k.py | 53 ++++++++++++++++++
.../multimodal/minigpt_4/minigpt_4_7b_gqa.py | 52 ++++++++++++++++++
.../minigpt_4/minigpt_4_7b_ocr-vqa.py | 53 ++++++++++++++++++
.../minigpt_4/minigpt_4_7b_ok-vqa.py | 55 +++++++++++++++++++
.../minigpt_4/minigpt_4_7b_textvqa.py | 55 +++++++++++++++++++
.../minigpt_4/minigpt_4_7b_vizwiz.py | 52 ++++++++++++++++++
6 files changed, 320 insertions(+)
create mode 100644 configs/multimodal/minigpt_4/minigpt_4_7b_flickr30k.py
create mode 100644 configs/multimodal/minigpt_4/minigpt_4_7b_gqa.py
create mode 100644 configs/multimodal/minigpt_4/minigpt_4_7b_ocr-vqa.py
create mode 100644 configs/multimodal/minigpt_4/minigpt_4_7b_ok-vqa.py
create mode 100644 configs/multimodal/minigpt_4/minigpt_4_7b_textvqa.py
create mode 100644 configs/multimodal/minigpt_4/minigpt_4_7b_vizwiz.py
diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_flickr30k.py b/configs/multimodal/minigpt_4/minigpt_4_7b_flickr30k.py
new file mode 100644
index 00000000..69bd2afc
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_flickr30k.py
@@ -0,0 +1,53 @@
+from opencompass.multimodal.models.minigpt_4 import (
+ MiniGPT4COCOCaotionPromptConstructor,
+ MiniGPT4COCOCaptionPostProcessor,
+)
+
+# dataloader settings
+val_pipeline = [
+ dict(type='mmpretrain.LoadImageFromFile'),
+ dict(type='mmpretrain.ToPIL', to_rgb=True),
+ dict(type='mmpretrain.torchvision/Resize',
+ size=(384, 384),
+ interpolation=3),
+ dict(type='mmpretrain.torchvision/ToTensor'),
+ dict(type='mmpretrain.torchvision/Normalize',
+ mean=(0.48145466, 0.4578275, 0.40821073),
+ std=(0.26862954, 0.26130258, 0.27577711)),
+ dict(type='mmpretrain.PackInputs', algorithm_keys=['image_id'])
+]
+
+dataset = dict(type='mmpretrain.Flickr30kCaption',
+ data_root='data/flickr30k',
+ ann_file='annotations/dataset_flickr30k.json',
+ data_prefix='images',
+ split='val',
+ pipeline=val_pipeline)
+
+minigpt_4_flickr30k_dataloader = dict(
+ batch_size=1,
+ num_workers=4,
+ dataset=dataset,
+ collate_fn=dict(type='pseudo_collate'),
+ sampler=dict(type='DefaultSampler', shuffle=False))
+
+# model settings
+minigpt_4_flickr30k_model = dict(
+ type='minigpt-4',
+ low_resource=False,
+ img_size=384,
+ llama_model='/path/to/vicuna-7b/',
+ prompt_constructor=dict(type=MiniGPT4COCOCaotionPromptConstructor,
+                            image_prompt='###Human: <Img><ImageHere></Img>',
+ reply_prompt='###Assistant:'),
+ post_processor=dict(type=MiniGPT4COCOCaptionPostProcessor))
+
+# evaluation settings
+minigpt_4_flickr30k_evaluator = [
+ dict(
+ type='mmpretrain.COCOCaption',
+ ann_file='data/coco/annotations/coco_karpathy_val_gt.json',
+ ) # noqa
+]
+
+minigpt_4_flickr30k_load_from = '/path/to/prerained_minigpt4_7b.pth' # noqa
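Each new config exports four top-level names following the existing MiniGPT-4 convention: a dataloader, a model, an evaluator list, and a checkpoint path. As a minimal sketch of how the Flickr30k pieces could be collected into an evaluation entry config, assuming the mmengine read_base() pattern used by the other multimodal task configs (the file name and the aggregate variable names below are illustrative and not part of this patch):

# hypothetical entry config, e.g. configs/multimodal/tasks_flickr30k.py
from mmengine.config import read_base

with read_base():
    from .minigpt_4.minigpt_4_7b_flickr30k import (
        minigpt_4_flickr30k_dataloader, minigpt_4_flickr30k_evaluator,
        minigpt_4_flickr30k_load_from, minigpt_4_flickr30k_model)

# Collect the exported pieces; the placeholder paths inside the imported
# config (the vicuna-7b weights and the MiniGPT-4 checkpoint) must be
# filled in before running.
models = [minigpt_4_flickr30k_model]
datasets = [minigpt_4_flickr30k_dataloader]
evaluators = [minigpt_4_flickr30k_evaluator]
load_froms = [minigpt_4_flickr30k_load_from]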
diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_gqa.py b/configs/multimodal/minigpt_4/minigpt_4_7b_gqa.py
new file mode 100644
index 00000000..f0cec5a6
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_gqa.py
@@ -0,0 +1,52 @@
+from opencompass.multimodal.models.minigpt_4 import (
+ MiniGPT4VQAPromptConstructor,
+ MiniGPT4VQAPostProcessor,
+)
+
+# dataloader settings
+val_pipeline = [
+ dict(type='mmpretrain.LoadImageFromFile'),
+ dict(type='mmpretrain.ToPIL', to_rgb=True),
+ dict(type='mmpretrain.torchvision/Resize',
+ size=(224, 224),
+ interpolation=3),
+ dict(type='mmpretrain.torchvision/ToTensor'),
+ dict(type='mmpretrain.torchvision/Normalize',
+ mean=(0.48145466, 0.4578275, 0.40821073),
+ std=(0.26862954, 0.26130258, 0.27577711)),
+ dict(
+ type='mmpretrain.PackInputs',
+ algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+ meta_keys=['question_id', 'image_id'],
+ )
+]
+
+dataset = dict(type='mmpretrain.GQA',
+ data_root='data/gqa',
+ data_prefix='images',
+ ann_file='annotations/testdev_balanced_questions.json',
+ pipeline=val_pipeline)
+
+minigpt_4_gqa_dataloader = dict(batch_size=1,
+ num_workers=4,
+ dataset=dataset,
+ collate_fn=dict(type='pseudo_collate'),
+ sampler=dict(type='DefaultSampler',
+ shuffle=False))
+
+# model settings
+minigpt_4_gqa_model = dict(type='minigpt-4',
+ low_resource=False,
+ img_size=224,
+ max_length=10,
+ llama_model='/path/to/vicuna-7b/',
+ prompt_constructor=dict(
+ type=MiniGPT4VQAPromptConstructor,
+                               image_prompt='###Human: <Img><ImageHere></Img>',
+ reply_prompt='###Assistant:'),
+ post_processor=dict(type=MiniGPT4VQAPostProcessor))
+
+# evaluation settings
+minigpt_4_gqa_evaluator = [dict(type='mmpretrain.GQAAcc')]
+
+minigpt_4_gqa_load_from = '/path/to/prerained_minigpt4_7b.pth' # noqa
diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_ocr-vqa.py b/configs/multimodal/minigpt_4/minigpt_4_7b_ocr-vqa.py
new file mode 100644
index 00000000..f95b2f85
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_ocr-vqa.py
@@ -0,0 +1,53 @@
+from opencompass.multimodal.models.minigpt_4 import (
+ MiniGPT4VQAPromptConstructor,
+ MiniGPT4VQAPostProcessor,
+)
+
+# dataloader settings
+val_pipeline = [
+ dict(type='mmpretrain.LoadImageFromFile'),
+ dict(type='mmpretrain.ToPIL', to_rgb=True),
+ dict(type='mmpretrain.torchvision/Resize',
+ size=(224, 224),
+ interpolation=3),
+ dict(type='mmpretrain.torchvision/ToTensor'),
+ dict(type='mmpretrain.torchvision/Normalize',
+ mean=(0.48145466, 0.4578275, 0.40821073),
+ std=(0.26862954, 0.26130258, 0.27577711)),
+ dict(
+ type='mmpretrain.PackInputs',
+ algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+ meta_keys=['question_id', 'image_id'],
+ )
+]
+
+dataset = dict(type='mmpretrain.OCRVQA',
+ data_root='data/ocrvqa',
+ ann_file='annotations/dataset.json',
+ split='test',
+ data_prefix='images',
+ pipeline=val_pipeline)
+
+minigpt_4_ocr_vqa_dataloader = dict(batch_size=1,
+ num_workers=4,
+ dataset=dataset,
+ collate_fn=dict(type='pseudo_collate'),
+ sampler=dict(type='DefaultSampler',
+ shuffle=False))
+
+# model settings
+minigpt_4_ocr_vqa_model = dict(
+ type='minigpt-4',
+ low_resource=False,
+ img_size=224,
+ max_length=10,
+ llama_model='/path/to/vicuna-7b/',
+ prompt_constructor=dict(type=MiniGPT4VQAPromptConstructor,
+                            image_prompt='###Human: <Img><ImageHere></Img>',
+ reply_prompt='###Assistant:'),
+ post_processor=dict(type=MiniGPT4VQAPostProcessor))
+
+# evaluation settings
+minigpt_4_ocr_vqa_evaluator = [dict(type='mmpretrain.VQAAcc')]
+
+minigpt_4_ocr_vqa_load_from = '/path/to/prerained_minigpt4_7b.pth' # noqa
diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_ok-vqa.py b/configs/multimodal/minigpt_4/minigpt_4_7b_ok-vqa.py
new file mode 100644
index 00000000..249223f5
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_ok-vqa.py
@@ -0,0 +1,55 @@
+from opencompass.multimodal.models.minigpt_4 import (
+ MiniGPT4VQAPromptConstructor,
+ MiniGPT4VQAPostProcessor,
+)
+
+# dataloader settings
+val_pipeline = [
+ dict(type='mmpretrain.LoadImageFromFile'),
+ dict(type='mmpretrain.ToPIL', to_rgb=True),
+ dict(type='mmpretrain.torchvision/Resize',
+ size=(224, 224),
+ interpolation=3),
+ dict(type='mmpretrain.torchvision/ToTensor'),
+ dict(type='mmpretrain.torchvision/Normalize',
+ mean=(0.48145466, 0.4578275, 0.40821073),
+ std=(0.26862954, 0.26130258, 0.27577711)),
+ dict(
+ type='mmpretrain.PackInputs',
+ algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+ meta_keys=['question_id', 'image_id'],
+ )
+]
+
+dataset = dict(
+ type='mmpretrain.COCOVQA',
+ data_root='data/okvqa',
+ question_file='annotations/OpenEnded_mscoco_val2014_questions.json',
+ ann_file='annotations/mscoco_val2014_annotations.json',
+ pipeline=val_pipeline,
+ data_prefix='images/val2014',
+)
+
+minigpt_4_ok_vqa_dataloader = dict(batch_size=1,
+ num_workers=4,
+ dataset=dataset,
+ collate_fn=dict(type='pseudo_collate'),
+ sampler=dict(type='DefaultSampler',
+ shuffle=False))
+
+# model settings
+minigpt_4_ok_vqa_model = dict(
+ type='minigpt-4',
+ low_resource=False,
+ img_size=224,
+ max_length=10,
+ llama_model='/path/to/vicuna-7b/',
+ prompt_constructor=dict(type=MiniGPT4VQAPromptConstructor,
+                            image_prompt='###Human: <Img><ImageHere></Img>',
+ reply_prompt='###Assistant:'),
+ post_processor=dict(type=MiniGPT4VQAPostProcessor))
+
+# evaluation settings
+minigpt_4_ok_vqa_evaluator = [dict(type='mmpretrain.VQAAcc')]
+
+minigpt_4_ok_vqa_load_from = '/path/to/prerained_minigpt4_7b.pth' # noqa
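The OK-VQA, OCR-VQA, TextVQA, and VizWiz configs all pack gt_answer and gt_answer_weight and score with mmpretrain.VQAAcc. For reference, the soft accuracy these weights are built around is the standard VQA rule that a predicted answer counts as fully correct once at least three annotators gave it; a self-contained sketch of the simplified formula (not the mmpretrain implementation itself):

from collections import Counter

def soft_vqa_accuracy(prediction, gt_answers):
    """Simplified VQA soft accuracy: min(#annotators giving the answer / 3, 1)."""
    counts = Counter(ans.strip().lower() for ans in gt_answers)
    return min(counts[prediction.strip().lower()] / 3.0, 1.0)

# 4 of 10 annotators said 'red', so predicting 'red' scores 1.0;
# 'dark red' was given twice, so it scores 2/3.
answers = ['red'] * 4 + ['dark red'] * 2 + ['maroon'] * 4
print(soft_vqa_accuracy('red', answers))       # 1.0
print(soft_vqa_accuracy('dark red', answers))  # 0.666...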
diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_textvqa.py b/configs/multimodal/minigpt_4/minigpt_4_7b_textvqa.py
new file mode 100644
index 00000000..3ad2b7ef
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_textvqa.py
@@ -0,0 +1,55 @@
+from opencompass.multimodal.models.minigpt_4 import (
+ MiniGPT4VQAPromptConstructor,
+ MiniGPT4VQAPostProcessor,
+)
+
+
+# dataloader settings
+val_pipeline = [
+ dict(type='mmpretrain.LoadImageFromFile'),
+ dict(type='mmpretrain.ToPIL', to_rgb=True),
+ dict(type='mmpretrain.torchvision/Resize',
+ size=(224, 224),
+ interpolation=3),
+ dict(type='mmpretrain.torchvision/ToTensor'),
+ dict(type='mmpretrain.torchvision/Normalize',
+ mean=(0.48145466, 0.4578275, 0.40821073),
+ std=(0.26862954, 0.26130258, 0.27577711)),
+ dict(
+ type='mmpretrain.PackInputs',
+ algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+ meta_keys=['question_id', 'image_id'],
+ )
+]
+
+dataset = dict(
+ type='mmpretrain.TextVQA',
+ data_root='data/textvqa',
+ ann_file='annotations/TextVQA_0.5.1_val.json',
+ pipeline=val_pipeline,
+ data_prefix='images/train_images',
+)
+
+minigpt_4_textvqa_dataloader = dict(batch_size=1,
+ num_workers=4,
+ dataset=dataset,
+ collate_fn=dict(type='pseudo_collate'),
+ sampler=dict(type='DefaultSampler',
+ shuffle=False))
+
+# model settings
+minigpt_4_textvqa_model = dict(
+ type='minigpt-4',
+ low_resource=False,
+ img_size=224,
+ max_length=10,
+ llama_model='/path/to/vicuna-7b/',
+ prompt_constructor=dict(type=MiniGPT4VQAPromptConstructor,
+                            image_prompt='###Human: <Img><ImageHere></Img>',
+ reply_prompt='###Assistant:'),
+ post_processor=dict(type=MiniGPT4VQAPostProcessor))
+
+# evaluation settings
+minigpt_4_textvqa_evaluator = [dict(type='mmpretrain.VQAAcc')]
+
+minigpt_4_textvqa_load_from = '/path/to/prerained_minigpt4_7b.pth' # noqa
diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_vizwiz.py b/configs/multimodal/minigpt_4/minigpt_4_7b_vizwiz.py
new file mode 100644
index 00000000..0a371aed
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_vizwiz.py
@@ -0,0 +1,52 @@
+from opencompass.multimodal.models.minigpt_4 import (
+ MiniGPT4VQAPromptConstructor,
+ MiniGPT4VQAPostProcessor,
+)
+
+# dataloader settings
+val_pipeline = [
+ dict(type='mmpretrain.LoadImageFromFile'),
+ dict(type='mmpretrain.ToPIL', to_rgb=True),
+ dict(type='mmpretrain.torchvision/Resize',
+ size=(224, 224),
+ interpolation=3),
+ dict(type='mmpretrain.torchvision/ToTensor'),
+ dict(type='mmpretrain.torchvision/Normalize',
+ mean=(0.48145466, 0.4578275, 0.40821073),
+ std=(0.26862954, 0.26130258, 0.27577711)),
+ dict(
+ type='mmpretrain.PackInputs',
+ algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+ meta_keys=['question_id', 'image_id'],
+ )
+]
+
+dataset = dict(type='mmpretrain.VizWiz',
+ data_root='data/vizwiz/',
+ data_prefix='Images/val',
+ ann_file='Annotations/val.json',
+ pipeline=val_pipeline)
+
+minigpt_4_vizwiz_dataloader = dict(batch_size=1,
+ num_workers=4,
+ dataset=dataset,
+ collate_fn=dict(type='pseudo_collate'),
+ sampler=dict(type='DefaultSampler',
+ shuffle=False))
+
+# model settings
+minigpt_4_vizwiz_model = dict(
+ type='minigpt-4',
+ low_resource=False,
+ img_size=224,
+ max_length=10,
+ llama_model='/path/to/vicuna-7b/',
+ prompt_constructor=dict(type=MiniGPT4VQAPromptConstructor,
+                            image_prompt='###Human: <Img><ImageHere></Img>',
+ reply_prompt='###Assistant:'),
+ post_processor=dict(type=MiniGPT4VQAPostProcessor))
+
+# evaluation settings
+minigpt_4_vizwiz_evaluator = [dict(type='mmpretrain.VQAAcc')]
+
+minigpt_4_vizwiz_load_from = '/path/to/prerained_minigpt4_7b.pth' # noqa
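All six val_pipelines share the same image preprocessing apart from the resize target (384 for Flickr30k, 224 elsewhere): a bicubic resize (interpolation=3 is PIL's BICUBIC code), conversion to a tensor, and normalization with the CLIP visual-encoder statistics. Below is a plain-torchvision sketch of what the mmpretrain.torchvision/* steps correspond to, useful for checking inputs outside the runner (the image path is a placeholder):

from PIL import Image
from torchvision import transforms

# Mirrors the 224x224 VQA pipelines; the Flickr30k config resizes to 384x384.
preprocess = transforms.Compose([
    transforms.Resize((224, 224),
                      interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.48145466, 0.4578275, 0.40821073),
                         std=(0.26862954, 0.26130258, 0.27577711)),
])

img = Image.open('example.jpg').convert('RGB')  # placeholder image
print(preprocess(img).shape)  # torch.Size([3, 224, 224])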