diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_coco_caption.py b/configs/multimodal/minigpt_4/minigpt_4_7b_coco_caption.py
new file mode 100644
index 00000000..b240a496
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_coco_caption.py
@@ -0,0 +1,52 @@
+from opencompass.multimodal.models.minigpt_4 import (
+    MiniGPT4COCOCaptionPromptConstructor,
+ MiniGPT4COCOCaptionPostProcessor,
+)
+
+# dataloader settings
+val_pipeline = [
+ dict(type='mmpretrain.LoadImageFromFile'),
+ dict(type='mmpretrain.ToPIL', to_rgb=True),
+ dict(type='mmpretrain.torchvision/Resize',
+ size=(384, 384),
+ interpolation=3),
+ dict(type='mmpretrain.torchvision/ToTensor'),
+ dict(type='mmpretrain.torchvision/Normalize',
+ mean=(0.48145466, 0.4578275, 0.40821073),
+ std=(0.26862954, 0.26130258, 0.27577711)),
+ dict(type='mmpretrain.PackInputs', algorithm_keys=['image_id'])
+]
+
+dataset = dict(type='mmpretrain.COCOCaption',
+ data_root='data/coco',
+ data_prefix=dict(img_path='images'),
+ ann_file='annotations/coco_karpathy_val.json',
+ pipeline=val_pipeline)
+
+minigpt_4_coco_caption_dataloader = dict(
+ batch_size=1,
+ num_workers=4,
+ dataset=dataset,
+ collate_fn=dict(type='pseudo_collate'),
+ sampler=dict(type='DefaultSampler', shuffle=False))
+
+# model settings
+minigpt_4_coco_caption_model = dict(
+ type='minigpt-4',
+ low_resource=False,
+ img_size=384,
+ llama_model='/path/to/vicuna-7b/',
+    prompt_constructor=dict(type=MiniGPT4COCOCaptionPromptConstructor,
+                            image_prompt='###Human: <Img><ImageHere></Img>',
+ reply_prompt='###Assistant:'),
+ post_processor=dict(type=MiniGPT4COCOCaptionPostProcessor))
+
+# evaluation settings
+minigpt_4_coco_caption_evaluator = [
+ dict(
+ type='mmpretrain.COCOCaption',
+ ann_file='data/coco/annotations/coco_karpathy_val_gt.json',
+ ) # noqa
+]
+
+minigpt_4_coco_caption_load_from = '/path/to/prerained_minigpt4_7b.pth' # noqa
diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_mmbench.py b/configs/multimodal/minigpt_4/minigpt_4_7b_mmbench.py
index 43ecb801..60a83047 100644
--- a/configs/multimodal/minigpt_4/minigpt_4_7b_mmbench.py
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_mmbench.py
@@ -1,5 +1,5 @@
from opencompass.multimodal.models.minigpt_4 import (
- MiniGPT4MMBenchPromptConstructor, MiniGPT4PostProcessor)
+ MiniGPT4MMBenchPromptConstructor, MiniGPT4MMBenchPostProcessor)
# dataloader settings
val_pipeline = [
@@ -29,13 +29,13 @@ minigpt_4_dataloader = dict(batch_size=1,
# model settings
minigpt_4_model = dict(
- type='minigpt-4-mmbench',
+ type='minigpt-4',
low_resource=False,
llama_model='/path/to/vicuna-7b/',
prompt_constructor=dict(type=MiniGPT4MMBenchPromptConstructor,
                         image_prompt='###Human: <Img><ImageHere></Img>',
reply_prompt='###Assistant:'),
- post_processor=dict(type=MiniGPT4PostProcessor))
+ post_processor=dict(type=MiniGPT4MMBenchPostProcessor))
# evaluation settings
minigpt_4_evaluator = [
diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_scienceqa.py b/configs/multimodal/minigpt_4/minigpt_4_7b_scienceqa.py
new file mode 100644
index 00000000..302ec64b
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_scienceqa.py
@@ -0,0 +1,52 @@
+from opencompass.multimodal.models import (MiniGPT4ScienceQAPromptConstructor,
+ MiniGPT4ScienceQAPostProcessor)
+
+# dataloader settings
+val_pipeline = [
+ dict(type='mmpretrain.LoadImageFromFile'),
+ dict(type='mmpretrain.ToPIL', to_rgb=True),
+ dict(type='mmpretrain.torchvision/Resize',
+ size=(224, 224),
+ interpolation=3),
+ dict(type='mmpretrain.torchvision/ToTensor'),
+ dict(type='mmpretrain.torchvision/Normalize',
+ mean=(0.48145466, 0.4578275, 0.40821073),
+ std=(0.26862954, 0.26130258, 0.27577711)),
+ dict(type='mmpretrain.PackInputs',
+ algorithm_keys=[
+ 'question', 'gt_answer', 'choices', 'hint', 'lecture', 'solution'
+ ])
+]
+
+dataset = dict(type='mmpretrain.ScienceQA',
+ data_root='./data/scienceqa',
+ split='val',
+ split_file='pid_splits.json',
+ ann_file='problems.json',
+ image_only=True,
+ data_prefix=dict(img_path='val'),
+ pipeline=val_pipeline)
+
+minigpt_4_scienceqa_dataloader = dict(batch_size=1,
+ num_workers=4,
+ dataset=dataset,
+ collate_fn=dict(type='pseudo_collate'),
+ sampler=dict(type='DefaultSampler',
+ shuffle=False))
+
+# model settings
+minigpt_4_scienceqa_model = dict(
+ type='minigpt-4',
+ low_resource=False,
+ img_size=224,
+ max_length=10,
+ llama_model='/path/to/vicuna-7b/',
+ prompt_constructor=dict(type=MiniGPT4ScienceQAPromptConstructor,
+                            image_prompt='###Human: <Img><ImageHere></Img>',
+ reply_prompt='###Assistant:'),
+ post_processor=dict(type=MiniGPT4ScienceQAPostProcessor))
+
+# evaluation settings
+minigpt_4_scienceqa_evaluator = [dict(type='mmpretrain.ScienceQAMetric')]
+
+minigpt_4_scienceqa_load_from = '/path/to/prerained_minigpt4_7b.pth' # noqa
diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_vqav2.py b/configs/multimodal/minigpt_4/minigpt_4_7b_vqav2.py
new file mode 100644
index 00000000..b9bd7fba
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_vqav2.py
@@ -0,0 +1,55 @@
+from opencompass.multimodal.models.minigpt_4 import (
+ MiniGPT4VQAPromptConstructor,
+ MiniGPT4VQAPostProcessor,
+)
+
+
+# dataloader settings
+val_pipeline = [
+ dict(type='mmpretrain.LoadImageFromFile'),
+ dict(type='mmpretrain.ToPIL', to_rgb=True),
+ dict(type='mmpretrain.torchvision/Resize',
+ size=(224, 224),
+ interpolation=3),
+ dict(type='mmpretrain.torchvision/ToTensor'),
+ dict(type='mmpretrain.torchvision/Normalize',
+ mean=(0.48145466, 0.4578275, 0.40821073),
+ std=(0.26862954, 0.26130258, 0.27577711)),
+ dict(
+ type='mmpretrain.PackInputs',
+ algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+ meta_keys=['question_id', 'image_id'],
+ )
+]
+
+dataset = dict(
+ type='mmpretrain.COCOVQA',
+ data_root='data/coco',
+ data_prefix='images/val2014',
+ question_file='annotations/v2_OpenEnded_mscoco_val2014_questions.json',
+ ann_file='annotations/v2_mscoco_val2014_annotations.json',
+ pipeline=val_pipeline)
+
+minigpt_4_vqav2_dataloader = dict(batch_size=1,
+ num_workers=4,
+ dataset=dataset,
+ collate_fn=dict(type='pseudo_collate'),
+ sampler=dict(type='DefaultSampler',
+ shuffle=False))
+
+# model settings
+minigpt_4_vqav2_model = dict(
+ type='minigpt-4',
+ low_resource=False,
+ img_size=224,
+ max_length=10,
+ llama_model='/path/to/vicuna-7b/',
+ prompt_constructor=dict(type=MiniGPT4VQAPromptConstructor,
+                            image_prompt='###Human: <Img><ImageHere></Img>',
+ reply_prompt='###Assistant:'),
+ post_processor=dict(type=MiniGPT4VQAPostProcessor))
+
+# evaluation settings
+minigpt_4_vqav2_evaluator = [dict(type='mmpretrain.VQAAcc')]
+
+minigpt_4_vqav2_load_from = '/path/to/prerained_minigpt4_7b.pth' # noqa
diff --git a/configs/multimodal/minigpt_4/minigpt_4_7b_vsr.py b/configs/multimodal/minigpt_4/minigpt_4_7b_vsr.py
new file mode 100644
index 00000000..ab5799fc
--- /dev/null
+++ b/configs/multimodal/minigpt_4/minigpt_4_7b_vsr.py
@@ -0,0 +1,52 @@
+from opencompass.multimodal.models.minigpt_4 import (
+ MiniGPT4VSRPromptConstructor,
+ MiniGPT4VSRPostProcessor,
+)
+
+# dataloader settings
+val_pipeline = [
+ dict(type='mmpretrain.LoadImageFromFile'),
+ dict(type='mmpretrain.ToPIL', to_rgb=True),
+ dict(type='mmpretrain.torchvision/Resize',
+ size=(224, 224),
+ interpolation=3),
+ dict(type='mmpretrain.torchvision/ToTensor'),
+ dict(type='mmpretrain.torchvision/Normalize',
+ mean=(0.48145466, 0.4578275, 0.40821073),
+ std=(0.26862954, 0.26130258, 0.27577711)),
+ dict(
+ type='mmpretrain.PackInputs',
+ algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'],
+ meta_keys=['question_id', 'image_id'],
+ )
+]
+
+dataset = dict(type='mmpretrain.VSR',
+ data_root='data/vsr/',
+ data_prefix='images/',
+ ann_file='annotations/test.json',
+ pipeline=val_pipeline)
+
+minigpt_4_vsr_dataloader = dict(batch_size=1,
+ num_workers=4,
+ dataset=dataset,
+ collate_fn=dict(type='pseudo_collate'),
+ sampler=dict(type='DefaultSampler',
+ shuffle=False))
+
+# model settings
+minigpt_4_vsr_model = dict(
+ type='minigpt-4',
+ low_resource=True,
+ img_size=224,
+ max_length=10,
+ llama_model='/path/to/vicuna-7b/',
+ prompt_constructor=dict(type=MiniGPT4VSRPromptConstructor,
+                            image_prompt='###Human: <Img><ImageHere></Img>',
+ reply_prompt='###Assistant:'),
+ post_processor=dict(type=MiniGPT4VSRPostProcessor))
+
+# evaluation settings
+minigpt_4_vsr_evaluator = [dict(type='mmpretrain.GQAAcc')]
+
+minigpt_4_vsr_load_from = '/path/to/prerained_minigpt4_7b.pth' # noqa
diff --git a/configs/multimodal/tasks.py b/configs/multimodal/tasks.py
index 94273b96..2d09757a 100644
--- a/configs/multimodal/tasks.py
+++ b/configs/multimodal/tasks.py
@@ -12,4 +12,4 @@ evaluators = [minigpt_4_evaluator]
load_froms = [minigpt_4_load_from]
num_gpus = 8
num_procs = 8
-launcher = 'pytorch'
+launcher = 'pytorch'
\ No newline at end of file
diff --git a/opencompass/multimodal/models/minigpt_4/__init__.py b/opencompass/multimodal/models/minigpt_4/__init__.py
index 6604c669..56e2cc69 100644
--- a/opencompass/multimodal/models/minigpt_4/__init__.py
+++ b/opencompass/multimodal/models/minigpt_4/__init__.py
@@ -1,8 +1,20 @@
-from .minigpt_4 import MiniGPT4MMBench
-from .post_processor import MiniGPT4PostProcessor
-from .prompt_constructor import MiniGPT4MMBenchPromptConstructor
+from .minigpt_4 import MiniGPT4Inferencer
+from .post_processor import (MiniGPT4COCOCaptionPostProcessor,
+ MiniGPT4MMBenchPostProcessor,
+ MiniGPT4ScienceQAPostProcessor,
+ MiniGPT4VQAPostProcessor,
+ MiniGPT4VSRPostProcessor)
+from .prompt_constructor import (MiniGPT4COCOCaptionPromptConstructor,
+ MiniGPT4MMBenchPromptConstructor,
+ MiniGPT4ScienceQAPromptConstructor,
+ MiniGPT4VQAPromptConstructor,
+ MiniGPT4VSRPromptConstructor)
__all__ = [
- 'MiniGPT4MMBench', 'MiniGPT4PostProcessor',
- 'MiniGPT4MMBenchPromptConstructor'
+ 'MiniGPT4Inferencer', 'MiniGPT4MMBenchPostProcessor',
+    'MiniGPT4MMBenchPromptConstructor', 'MiniGPT4COCOCaptionPromptConstructor',
+ 'MiniGPT4COCOCaptionPostProcessor', 'MiniGPT4ScienceQAPromptConstructor',
+ 'MiniGPT4ScienceQAPostProcessor', 'MiniGPT4VQAPromptConstructor',
+ 'MiniGPT4VQAPostProcessor', 'MiniGPT4VSRPostProcessor',
+ 'MiniGPT4VSRPromptConstructor'
]
diff --git a/opencompass/multimodal/models/minigpt_4/minigpt_4.py b/opencompass/multimodal/models/minigpt_4/minigpt_4.py
index ee4d4c8c..eee0e3dc 100644
--- a/opencompass/multimodal/models/minigpt_4/minigpt_4.py
+++ b/opencompass/multimodal/models/minigpt_4/minigpt_4.py
@@ -37,14 +37,17 @@ def load_package():
MiniGPT4 = load_package()
-@MM_MODELS.register_module('minigpt-4-mmbench')
-class MiniGPT4MMBench(MiniGPT4):
- """Inference code of MiniGPT-4 on MMBench.
+@MM_MODELS.register_module('minigpt-4')
+class MiniGPT4Inferencer(MiniGPT4):
+ """Inference code of MiniGPT-4.
Args:
llama_model (str): The path of vicuna path.
prompt_constructor (dict): The config of prompt constructor.
post_processor (dict): The config of post processor.
+ do_sample (bool): Whether use sampling. Defaults to False.
+ max_length (int): The max length of output. Defaults to 30.
+ img_size (int): The size of image. Defaults to 224.
low_resource (bool): Whether loaded in low precision.
Defaults to False.
"""
@@ -53,8 +56,13 @@ class MiniGPT4MMBench(MiniGPT4):
llama_model: str,
prompt_constructor: dict,
post_processor: dict,
+ do_sample: bool = False,
+ max_length: int = 30,
+ img_size: int = 224,
low_resource: bool = False) -> None:
- super().__init__(llama_model=llama_model, low_resource=low_resource)
+ super().__init__(llama_model=llama_model,
+ low_resource=low_resource,
+ img_size=img_size)
cur_device = get_device()
stop_words_ids = [
@@ -67,6 +75,8 @@ class MiniGPT4MMBench(MiniGPT4):
prompt_constructor, MM_MODELS)
self.post_processor = mmengine.registry.build_from_cfg(
post_processor, MM_MODELS)
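+        # generation options that are forwarded to llama_model.generate() below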
+ self.do_sample = do_sample
+ self.max_length = max_length
def encode_img(self, image):
device = image.device
@@ -125,9 +135,9 @@ class MiniGPT4MMBench(MiniGPT4):
# generate output
outputs = self.llama_model.generate(
inputs_embeds=prompt_embs,
- max_new_tokens=20,
+ max_length=self.max_length,
num_beams=5,
- do_sample=False,
+ do_sample=self.do_sample,
min_length=1,
top_p=0.9,
repetition_penalty=1.0,
diff --git a/opencompass/multimodal/models/minigpt_4/post_processor.py b/opencompass/multimodal/models/minigpt_4/post_processor.py
index 301a3422..85d1f83f 100644
--- a/opencompass/multimodal/models/minigpt_4/post_processor.py
+++ b/opencompass/multimodal/models/minigpt_4/post_processor.py
@@ -1,9 +1,10 @@
+import random
import re
import torch
-class MiniGPT4PostProcessor:
+class MiniGPT4MMBenchPostProcessor:
""""Post processor for MiniGPT-4 on MMBench."""
def __init__(self) -> None:
@@ -32,3 +33,89 @@ class MiniGPT4PostProcessor:
if len(res) > 0:
output_text = res[0][:-1]
return output_text
+
+
+class MiniGPT4COCOCaptionPostProcessor:
+ """"Post processor for MiniGPT-4 on COCO Caption."""
+
+ def __init__(self) -> None:
+ pass
+
+ def __call__(self, output_token: torch.tensor, tokenizer) -> str:
+
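+        # drop leading special tokens (id 0 = <unk>, id 1 = <s> in the
+        # LLaMA tokenizer) before decoding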
+ if output_token[0] == 0:
+ output_token = output_token[1:]
+ if output_token[0] == 1:
+ output_token = output_token[1:]
+ output_text = tokenizer.decode(output_token,
+ add_special_tokens=False) # noqa
+ output_text = output_text.split('###')[0]
+ output_text = output_text.split('Assistant:')[-1].strip()
+ output_text = output_text.split('. ')[0]
+        output_text = output_text.strip('</s>')
+ output_text = output_text.strip()
+ return output_text
+
+
+class MiniGPT4ScienceQAPostProcessor:
+ """"Post processor for MiniGPT-4 on ScienceQA."""
+
+ def __init__(self) -> None:
+ pass
+
+ def __call__(self, output_token: torch.tensor, tokenizer) -> str:
+
+ if output_token[0] == 0:
+ output_token = output_token[1:]
+ if output_token[0] == 1:
+ output_token = output_token[1:]
+ output_text = tokenizer.decode(output_token,
+ add_special_tokens=False) # noqa
+ output_text = output_text.split('###')[0]
+ output_text = output_text.split('Assistant:')[-1].strip()
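+        # pull the option letter out of an '(A)'-style answer; fall back to a
+        # random letter (A-D) if no such pattern is found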
+ pattern = re.compile(r'\(([A-Z])\)')
+ output_text = pattern.findall(output_text)
+ if len(output_text) == 0:
+ output_text = random.choice(['A', 'B', 'C', 'D'])
+ else:
+ output_text = output_text[0]
+ return output_text
+
+
+class MiniGPT4VQAPostProcessor:
+ """"Post processor for MiniGPT-4 on VQA."""
+
+ def __init__(self) -> None:
+ pass
+
+ def __call__(self, output_token: torch.tensor, tokenizer) -> str:
+
+ if output_token[0] == 0:
+ output_token = output_token[1:]
+ if output_token[0] == 1:
+ output_token = output_token[1:]
+ output_text = tokenizer.decode(output_token,
+ add_special_tokens=False) # noqa
+ output_text = output_text.split('###')[0]
+ output_text = output_text.split('Assistant:')[-1].strip()
+ return output_text
+
+
+class MiniGPT4VSRPostProcessor:
+ """"Post processor for MiniGPT-4 on VSR."""
+
+ def __init__(self) -> None:
+ pass
+
+ def __call__(self, output_token: torch.tensor, tokenizer) -> str:
+
+ if output_token[0] == 0:
+ output_token = output_token[1:]
+ if output_token[0] == 1:
+ output_token = output_token[1:]
+ output_text = tokenizer.decode(output_token, add_special_tokens=False)
+ pattern = r'yes|no|Yes|No'
+ output_text = re.findall(pattern, output_text)
+        if len(output_text) > 0:
+            output_text = output_text[0].lower()
+        else:
+            output_text = ''
+        return output_text
diff --git a/opencompass/multimodal/models/minigpt_4/prompt_constructor.py b/opencompass/multimodal/models/minigpt_4/prompt_constructor.py
index de07c1bf..aec42b95 100644
--- a/opencompass/multimodal/models/minigpt_4/prompt_constructor.py
+++ b/opencompass/multimodal/models/minigpt_4/prompt_constructor.py
@@ -53,3 +53,68 @@ class MiniGPT4MMBenchPromptConstructor:
else:
prompt = self.image_prompt + ' ' + question + ' ' + option + ' ' + self.reply_prompt # noqa
return prompt
+
+
+class MiniGPT4COCOCaptionPromptConstructor(MiniGPT4MMBenchPromptConstructor):
+ """Prompt constructor for MiniGPT-4 on COCO Caption."""
+
+ def _process(self, data_samples: List[DataSample]) -> str:
+ assert len(data_samples) == 1, 'Only support batch size 1.'
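+        # captioning uses a fixed prompt that asks the model to complete
+        # 'a photo of ...'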
+ prompt = self.image_prompt + ' ' + 'a photo of' + self.reply_prompt
+ return prompt
+
+
+class MiniGPT4ScienceQAPromptConstructor(MiniGPT4MMBenchPromptConstructor):
+ """Prompt constructor for MiniGPT-4 on ScienceQA."""
+
+ choice_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F'}
+
+ def _process(self, data_samples: List[DataSample]) -> str:
+ assert len(data_samples) == 1, 'Only support batch size 1.'
+ questions = [
+ 'Question: ' + data_sample.get('question') + '\n'
+ for data_sample in data_samples
+ ] # noqa
+ choices = [data_sample.get('choices') for data_sample in data_samples]
+ choices = [[
+ f'({self.choice_mapping[i]}) ' + item
+ for i, item in enumerate(choice)
+ ] for choice in choices]
+ choices = [
+ 'Choices: ' + ' '.join(choice) + '\n' for choice in choices
+ ] # noqa
+ contexts = [
+ 'Context: ' + data_sample.get('hint') + '\n'
+ for data_sample in data_samples
+ ] # noqa
+ question = questions[0]
+ choice = choices[0]
+ context = contexts[0]
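+        # assemble: image prompt, context, question and choices, then the
+        # reply prompt ending in 'The answer is'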
+ prompt = self.image_prompt + ' ' + context + ' ' + question + ' ' + choice + self.reply_prompt + ' ' + 'The answer is' # noqa
+ return prompt
+
+
+class MiniGPT4VQAPromptConstructor(MiniGPT4MMBenchPromptConstructor):
+ """Prompt constructor for MiniGPT-4 on VQA."""
+
+ def _process(self, data_samples: List[DataSample]) -> str:
+ assert len(data_samples) == 1, 'Only support batch size 1.'
+ questions = [
+ data_sample.get('question') for data_sample in data_samples
+ ]
+ question = questions[0]
+ prompt = self.image_prompt + ' ' + question + ' ' + 'Answer this question in a single word.' + ' ' + self.reply_prompt # noqa
+ return prompt
+
+
+class MiniGPT4VSRPromptConstructor(MiniGPT4MMBenchPromptConstructor):
+ """Prompt constructor for MiniGPT-4 on VSR."""
+
+ def _process(self, data_samples: List[DataSample]) -> str:
+ assert len(data_samples) == 1, 'Only support batch size 1.'
+ questions = [
+ data_sample.get('question') for data_sample in data_samples
+ ]
+ question = questions[0]
+ prompt = self.image_prompt + ' ' + question + ' ' + 'Is the above description correct? Answer yes or no.' + ' ' + self.reply_prompt # noqa
+ return prompt