diff --git a/configs/multimodal/llava/llava_7b_mmbench.py b/configs/multimodal/llava/llava_7b_mmbench.py index 2722e391..0ef8eba9 100644 --- a/configs/multimodal/llava/llava_7b_mmbench.py +++ b/configs/multimodal/llava/llava_7b_mmbench.py @@ -24,7 +24,7 @@ dataset = dict(type='opencompass.MMBenchDataset', data_file='data/mmbench/mmbench_test_20230712.tsv', pipeline=val_pipeline) -mmbench_dataloader = dict( +llava_mmbench_dataloader = dict( batch_size=1, num_workers=4, dataset=dataset, @@ -33,7 +33,7 @@ mmbench_dataloader = dict( ) # model settings -llava_model = dict( +llava_mmbench_model = dict( type='llava', model_path='/path/to/llava', prompt_constructor=dict(type=LLaVAMMBenchPromptConstructor), @@ -41,7 +41,7 @@ llava_model = dict( ) # noqa # evaluation settings -mmbench_evaluator = [ +llava_mmbench_evaluator = [ dict(type='opencompass.DumpResults', save_path='work_dirs/llava-7b-mmbench.xlsx') ] diff --git a/configs/multimodal/mplug_owl/mplug_owl_7b_mmbench.py b/configs/multimodal/mplug_owl/mplug_owl_7b_mmbench.py index cecc1e24..322c041f 100644 --- a/configs/multimodal/mplug_owl/mplug_owl_7b_mmbench.py +++ b/configs/multimodal/mplug_owl/mplug_owl_7b_mmbench.py @@ -35,8 +35,8 @@ mplug_owl_mmbench_dataloader = dict( # model settings mplug_owl_mmbench_model = dict( - type='mplug_owl_7b', - model_path='/mplug-owl-llama-7b-ft/', + type='mplug_owl-7b', + model_path='/mplug-owl-llama-7b-ft', prompt_constructor=dict(type=MplugOwlMMBenchPromptConstructor), post_processor=dict(type=MplugOwlMMBenchPostProcessor) ) # noqa @@ -46,5 +46,3 @@ mplug_owl_mmbench_evaluator = [ dict(type='opencompass.DumpResults', save_path='work_dirs/mplug_owl-7b-mmagibench-v0.1.0.xlsx') ] - -mplug_owl_mmbench_load_from = None \ No newline at end of file diff --git a/configs/multimodal/openflamingo/openflamingo_coco_caption.py b/configs/multimodal/openflamingo/openflamingo_coco_caption.py new file mode 100644 index 00000000..dad7b1b2 --- /dev/null +++ b/configs/multimodal/openflamingo/openflamingo_coco_caption.py @@ -0,0 +1,75 @@ +from opencompass.multimodal.models.openflamingo import OpenFlamingoCaptionPromptConstructor + +# dataloader settings +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmpretrain.ResizeEdge', + scale=224, + interpolation='bicubic', + backend='pillow'), + dict(type='CenterCrop', crop_size=(224, 224)), + dict(type='mmpretrain.PackInputs', algorithm_keys=['image_id']) +] + +dataset = dict(type='mmpretrain.COCOCaption', + data_root='data/coco', + data_prefix=dict(img_path='images'), + ann_file='annotations/coco_karpathy_val.json', + pipeline=val_pipeline) + +openflamingo_coco_caption_dataloader = dict( + batch_size=1, + num_workers=4, + dataset=dataset, + sampler=dict(type='DefaultSampler', shuffle=False), + collate_fn=dict(type='default_collate'), + persistent_workers=True, +) + +# model settings +openflamingo_coco_caption_model = dict( + type='openflamingo', + data_preprocessor=dict( + type='mmpretrain.MultiModalDataPreprocessor', + mean=[122.770938, 116.7460125, 104.09373615], + std=[68.5005327, 66.6321579, 70.32316305], + to_rgb=True, + ), + tokenizer=dict(type='mmpretrain.LlamaTokenizer', + name_or_path='decapoda-research/llama-7b-hf'), + vision_encoder=dict( + type='mmpretrain.VisionTransformer', + arch='l', + patch_size=14, + pre_norm=True, + norm_cfg=dict(type='LN', eps=1e-5), + layer_cfgs=dict(act_cfg=dict(type='mmpretrain.QuickGELU')), + final_norm=False, + out_type='raw', + pretrained= # noqa: E251 + '/path/to/vision/encoder', # noqa + ), + lang_encoder=dict( + 
base=dict(type='mmpretrain.AutoModelForCausalLM', + name_or_path= + 'decapoda-research/llama-7b-hf', + local_files_only=True), + adapter=dict(type='mmpretrain.FlamingoLMAdapter', + vis_hidden_size=1024, + cross_attn_every_n_layers=4, + use_media_placement_augmentation=False), + ), + task='caption', + generation_cfg=dict(num_beams=3, max_new_tokens=20, length_penalty=-2.0), + prompt_constructor=dict(type=OpenFlamingoCaptionPromptConstructor) +) + +# evaluation settings +openflamingo_coco_caption_evaluator = [ + dict( + type='mmpretrain.COCOCaption', + ann_file='data/coco/annotations/coco_karpathy_val_gt.json', + ) # noqa +] + +openflamingo_load_from = '/path/to/pretrained/weights' # noqa diff --git a/configs/multimodal/openflamingo/openflamingo_flickr30k.py b/configs/multimodal/openflamingo/openflamingo_flickr30k.py new file mode 100644 index 00000000..e388f6c2 --- /dev/null +++ b/configs/multimodal/openflamingo/openflamingo_flickr30k.py @@ -0,0 +1,76 @@ +from opencompass.multimodal.models.openflamingo import OpenFlamingoCaptionPromptConstructor + +# dataloader settings +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmpretrain.ResizeEdge', + scale=224, + interpolation='bicubic', + backend='pillow'), + dict(type='CenterCrop', crop_size=(224, 224)), + dict(type='mmpretrain.PackInputs', algorithm_keys=['image_id']) +] + +dataset = dict(type='mmpretrain.Flickr30kCaption', + data_root='data/flickr30k', + ann_file='annotations/dataset_flickr30k.json', + data_prefix='images', + split='val', + pipeline=val_pipeline) + +openflamingo_flickr30k_dataloader = dict( + batch_size=1, + num_workers=4, + dataset=dataset, + sampler=dict(type='DefaultSampler', shuffle=False), + collate_fn=dict(type='default_collate'), + persistent_workers=True, +) + +# model settings +openflamingo_flickr30k_model = dict( + type='openflamingo', + data_preprocessor=dict( + type='mmpretrain.MultiModalDataPreprocessor', + mean=[122.770938, 116.7460125, 104.09373615], + std=[68.5005327, 66.6321579, 70.32316305], + to_rgb=True, + ), + tokenizer=dict(type='mmpretrain.LlamaTokenizer', + name_or_path='decapoda-research/llama-7b-hf'), + vision_encoder=dict( + type='mmpretrain.VisionTransformer', + arch='l', + patch_size=14, + pre_norm=True, + norm_cfg=dict(type='LN', eps=1e-5), + layer_cfgs=dict(act_cfg=dict(type='mmpretrain.QuickGELU')), + final_norm=False, + out_type='raw', + pretrained= # noqa: E251 + '/path/to/vision/encoder', # noqa + ), + lang_encoder=dict( + base=dict(type='mmpretrain.AutoModelForCausalLM', + name_or_path= + 'decapoda-research/llama-7b-hf', + local_files_only=True), + adapter=dict(type='mmpretrain.FlamingoLMAdapter', + vis_hidden_size=1024, + cross_attn_every_n_layers=4, + use_media_placement_augmentation=False), + ), + task='caption', + generation_cfg=dict(num_beams=3, max_new_tokens=20, length_penalty=-2.0), + prompt_constructor=dict(type=OpenFlamingoCaptionPromptConstructor) +) + +# evaluation settings +openflamingo_flickr30k_evaluator = [ + dict( + type='mmpretrain.COCOCaption', + ann_file='data/flickr30k/annotations/flickr30k_val_gt.json', + ) # noqa +] + +openflamingo_load_from = '/path/to/pretrained/weights' # noqa diff --git a/configs/multimodal/openflamingo/openflamingo_gqa.py b/configs/multimodal/openflamingo/openflamingo_gqa.py new file mode 100644 index 00000000..c4c33303 --- /dev/null +++ b/configs/multimodal/openflamingo/openflamingo_gqa.py @@ -0,0 +1,75 @@ +from opencompass.multimodal.models.openflamingo import OpenFlamingoVQAPromptConstructor + +# dataloader settings 
+val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmpretrain.ResizeEdge', + scale=224, + interpolation='bicubic', + backend='pillow'), + dict(type='CenterCrop', crop_size=(224, 224)), + dict( + type='mmpretrain.PackInputs', + algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'], + meta_keys=['question_id', 'image_id'], + ) +] + +dataset = dict(type='mmpretrain.GQA', + data_root='data/gqa', + data_prefix='images', + ann_file='annotations/testdev_balanced_questions.json', + pipeline=val_pipeline) + +openflamingo_gqa_dataloader = dict( + batch_size=8, + num_workers=4, + dataset=dataset, + sampler=dict(type='DefaultSampler', shuffle=False), + collate_fn=dict(type='default_collate'), + persistent_workers=True, +) + +# model settings +openflamingo_gqa_model = dict( + type='openflamingo', + data_preprocessor=dict( + type='mmpretrain.MultiModalDataPreprocessor', + mean=[122.770938, 116.7460125, 104.09373615], + std=[68.5005327, 66.6321579, 70.32316305], + to_rgb=True, + ), + tokenizer=dict(type='mmpretrain.LlamaTokenizer', + name_or_path='decapoda-research/llama-7b-hf'), + vision_encoder=dict( + type='mmpretrain.VisionTransformer', + arch='l', + patch_size=14, + pre_norm=True, + norm_cfg=dict(type='LN', eps=1e-5), + layer_cfgs=dict(act_cfg=dict(type='mmpretrain.QuickGELU')), + final_norm=False, + out_type='raw', + pretrained= # noqa: E251 + '/path/to/vision/encoder', # noqa + ), + lang_encoder=dict( + base=dict(type='mmpretrain.AutoModelForCausalLM', + name_or_path= + 'decapoda-research/llama-7b-hf', + local_files_only=True), + adapter=dict(type='mmpretrain.FlamingoLMAdapter', + vis_hidden_size=1024, + cross_attn_every_n_layers=4, + use_media_placement_augmentation=False), + ), + task='vqa', + generation_cfg=dict(num_beams=3, max_new_tokens=20, length_penalty=-2.0), + prompt_constructor=dict(type=OpenFlamingoVQAPromptConstructor) +) + +# evaluation settings +openflamingo_gqa_evaluator = [dict(type='mmpretrain.GQAAcc')] + + +openflamingo_load_from = '/path/to/pretrained/weights' # noqa diff --git a/configs/multimodal/openflamingo/openflamingo_mmbench.py b/configs/multimodal/openflamingo/openflamingo_mmbench.py index 8327fb09..f01e5a78 100644 --- a/configs/multimodal/openflamingo/openflamingo_mmbench.py +++ b/configs/multimodal/openflamingo/openflamingo_mmbench.py @@ -1,3 +1,5 @@ +from opencompass.multimodal.models.openflamingo import OpenFlamingoMMBenchPromptConstructor + # dataloader settings val_pipeline = [ dict(type='mmpretrain.PILToNumpy'), @@ -17,7 +19,7 @@ dataset = dict(type='opencompass.MMBenchDataset', data_file='data/mmbench/mmbench_test_20230712.tsv', pipeline=val_pipeline) -openflamingo_dataloader = dict( +openflamingo_mmbench_dataloader = dict( batch_size=1, num_workers=4, dataset=dataset, @@ -27,7 +29,7 @@ openflamingo_dataloader = dict( ) # model settings -openflamingo_model = dict( +openflamingo_mmbench_model = dict( type='openflamingo', data_preprocessor=dict( type='mmpretrain.MultiModalDataPreprocessor', @@ -59,11 +61,13 @@ openflamingo_model = dict( cross_attn_every_n_layers=4, use_media_placement_augmentation=False), ), + task='vqa', generation_cfg=dict(num_beams=3, max_new_tokens=20, length_penalty=-2.0), + prompt_constructor=dict(type=OpenFlamingoMMBenchPromptConstructor) ) # evaluation settings -openflamingo_evaluator = [ +openflamingo_mmbench_evaluator = [ dict( type='opencompass.DumpResults', save_path= # noqa: E251 diff --git a/configs/multimodal/openflamingo/openflamingo_ocr_vqa.py b/configs/multimodal/openflamingo/openflamingo_ocr_vqa.py new file 
mode 100644 index 00000000..10298830 --- /dev/null +++ b/configs/multimodal/openflamingo/openflamingo_ocr_vqa.py @@ -0,0 +1,75 @@ +# dataloader settings +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmpretrain.ResizeEdge', + scale=224, + interpolation='bicubic', + backend='pillow'), + dict(type='CenterCrop', crop_size=(224, 224)), + dict( + type='mmpretrain.PackInputs', + algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'], + meta_keys=['question_id', 'image_id'], + ) +] + +dataset = dict(type='mmpretrain.OCRVQA', + data_root='data/ocrvqa', + ann_file='annotations/dataset.json', + split='test', + data_prefix='images', + pipeline=val_pipeline) + +openflamingo_ocrvqa_dataloader = dict( + batch_size=8, + num_workers=4, + dataset=dataset, + sampler=dict(type='DefaultSampler', shuffle=False), + collate_fn=dict(type='default_collate'), + persistent_workers=True, +) + +from opencompass.multimodal.models.openflamingo import OpenFlamingoVQAPromptConstructor + +# model settings +openflamingo_ocrvqa_model = dict( + type='openflamingo', + data_preprocessor=dict( + type='mmpretrain.MultiModalDataPreprocessor', + mean=[122.770938, 116.7460125, 104.09373615], + std=[68.5005327, 66.6321579, 70.32316305], + to_rgb=True, + ), + tokenizer=dict(type='mmpretrain.LlamaTokenizer', + name_or_path='decapoda-research/llama-7b-hf'), + vision_encoder=dict( + type='mmpretrain.VisionTransformer', + arch='l', + patch_size=14, + pre_norm=True, + norm_cfg=dict(type='LN', eps=1e-5), + layer_cfgs=dict(act_cfg=dict(type='mmpretrain.QuickGELU')), + final_norm=False, + out_type='raw', + pretrained= # noqa: E251 + '/path/to/vision/encoder', # noqa + ), + lang_encoder=dict( + base=dict(type='mmpretrain.AutoModelForCausalLM', + name_or_path= + 'decapoda-research/llama-7b-hf', + local_files_only=True), + adapter=dict(type='mmpretrain.FlamingoLMAdapter', + vis_hidden_size=1024, + cross_attn_every_n_layers=4, + use_media_placement_augmentation=False), + ), + task='vqa', + generation_cfg=dict(num_beams=3, max_new_tokens=20, length_penalty=-2.0), + prompt_constructor=dict(type=OpenFlamingoVQAPromptConstructor) +) + +# evaluation settings +openflamingo_ocrvqa_evaluator = [dict(type='mmpretrain.VQAAcc')] + +openflamingo_load_from = '/path/to/pretrained/weights' # noqa diff --git a/configs/multimodal/openflamingo/openflamingo_ok_vqa.py b/configs/multimodal/openflamingo/openflamingo_ok_vqa.py new file mode 100644 index 00000000..733d1457 --- /dev/null +++ b/configs/multimodal/openflamingo/openflamingo_ok_vqa.py @@ -0,0 +1,77 @@ +from opencompass.multimodal.models.openflamingo import OpenFlamingoVQAPromptConstructor + +# dataloader settings +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmpretrain.ResizeEdge', + scale=224, + interpolation='bicubic', + backend='pillow'), + dict(type='CenterCrop', crop_size=(224, 224)), + dict( + type='mmpretrain.PackInputs', + algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'], + meta_keys=['question_id', 'image_id'], + ) +] + +dataset = dict( + type='mmpretrain.COCOVQA', + data_root='data/okvqa', + question_file='annotations/OpenEnded_mscoco_val2014_questions.json', + ann_file='annotations/mscoco_val2014_annotations.json', + pipeline=val_pipeline, + data_prefix='images/val2014', +) + +openflamingo_okvqa_dataloader = dict( + batch_size=8, + num_workers=4, + dataset=dataset, + sampler=dict(type='DefaultSampler', shuffle=False), + collate_fn=dict(type='default_collate'), + persistent_workers=True, +) + +# model settings +openflamingo_okvqa_model = dict( 
+ type='openflamingo', + data_preprocessor=dict( + type='mmpretrain.MultiModalDataPreprocessor', + mean=[122.770938, 116.7460125, 104.09373615], + std=[68.5005327, 66.6321579, 70.32316305], + to_rgb=True, + ), + tokenizer=dict(type='mmpretrain.LlamaTokenizer', + name_or_path='decapoda-research/llama-7b-hf'), + vision_encoder=dict( + type='mmpretrain.VisionTransformer', + arch='l', + patch_size=14, + pre_norm=True, + norm_cfg=dict(type='LN', eps=1e-5), + layer_cfgs=dict(act_cfg=dict(type='mmpretrain.QuickGELU')), + final_norm=False, + out_type='raw', + pretrained= # noqa: E251 + '/path/to/vision/encoder', # noqa + ), + lang_encoder=dict( + base=dict(type='mmpretrain.AutoModelForCausalLM', + name_or_path= + 'decapoda-research/llama-7b-hf', + local_files_only=True), + adapter=dict(type='mmpretrain.FlamingoLMAdapter', + vis_hidden_size=1024, + cross_attn_every_n_layers=4, + use_media_placement_augmentation=False), + ), + task='vqa', + generation_cfg=dict(num_beams=3, max_new_tokens=20, length_penalty=-2.0), + prompt_constructor=dict(type=OpenFlamingoVQAPromptConstructor) +) + +# evaluation settings +openflamingo_okvqa_evaluator = [dict(type='mmpretrain.VQAAcc')] + +openflamingo_load_from = '/path/to/pretrained/weights' # noqa diff --git a/configs/multimodal/openflamingo/openflamingo_scienceqa.py b/configs/multimodal/openflamingo/openflamingo_scienceqa.py new file mode 100644 index 00000000..292b9146 --- /dev/null +++ b/configs/multimodal/openflamingo/openflamingo_scienceqa.py @@ -0,0 +1,76 @@ +from opencompass.multimodal.models.openflamingo import OpenFlamingoScienceQAPromptConstructor + +# dataloader settings +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmpretrain.ResizeEdge', + scale=224, + interpolation='bicubic', + backend='pillow'), + dict(type='CenterCrop', crop_size=(224, 224)), + dict(type='mmpretrain.PackInputs', + algorithm_keys=[ + 'question', 'gt_answer', 'choices', 'hint', 'lecture', 'solution' + ]) +] + +dataset = dict(type='mmpretrain.ScienceQA', + data_root='./data/scienceqa', + split='val', + split_file='pid_splits.json', + ann_file='problems.json', + image_only=True, + data_prefix=dict(img_path='val'), + pipeline=val_pipeline) + +openflamingo_scienceqa_dataloader = dict( + batch_size=1, + num_workers=4, + dataset=dataset, + sampler=dict(type='DefaultSampler', shuffle=False), + collate_fn=dict(type='default_collate'), + persistent_workers=True, +) + +# model settings +openflamingo_scienceqa_model = dict( + type='openflamingo', + data_preprocessor=dict( + type='mmpretrain.MultiModalDataPreprocessor', + mean=[122.770938, 116.7460125, 104.09373615], + std=[68.5005327, 66.6321579, 70.32316305], + to_rgb=True, + ), + tokenizer=dict(type='mmpretrain.LlamaTokenizer', + name_or_path='decapoda-research/llama-7b-hf'), + vision_encoder=dict( + type='mmpretrain.VisionTransformer', + arch='l', + patch_size=14, + pre_norm=True, + norm_cfg=dict(type='LN', eps=1e-5), + layer_cfgs=dict(act_cfg=dict(type='mmpretrain.QuickGELU')), + final_norm=False, + out_type='raw', + pretrained= # noqa: E251 + '/path/to/vision/encoder', # noqa + ), + lang_encoder=dict( + base=dict(type='mmpretrain.AutoModelForCausalLM', + name_or_path= + 'decapoda-research/llama-7b-hf', + local_files_only=True), + adapter=dict(type='mmpretrain.FlamingoLMAdapter', + vis_hidden_size=1024, + cross_attn_every_n_layers=4, + use_media_placement_augmentation=False), + ), + task='vqa', + generation_cfg=dict(num_beams=3, max_new_tokens=20, length_penalty=-2.0), + 
prompt_constructor=dict(type=OpenFlamingoScienceQAPromptConstructor) +) + +# evaluation settings +openflamingo_scienceqa_evaluator = [dict(type='mmpretrain.ScienceQAMetric')] + +openflamingo_load_from = '/path/to/pretrained/weights' # noqa diff --git a/configs/multimodal/openflamingo/openflamingo_textvqa.py b/configs/multimodal/openflamingo/openflamingo_textvqa.py new file mode 100644 index 00000000..67f0b343 --- /dev/null +++ b/configs/multimodal/openflamingo/openflamingo_textvqa.py @@ -0,0 +1,76 @@ +from opencompass.multimodal.models.openflamingo import OpenFlamingoVQAPromptConstructor + +# dataloader settings +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmpretrain.ResizeEdge', + scale=224, + interpolation='bicubic', + backend='pillow'), + dict(type='CenterCrop', crop_size=(224, 224)), + dict( + type='mmpretrain.PackInputs', + algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'], + meta_keys=['question_id', 'image_id'], + ) +] + +dataset = dict( + type='mmpretrain.TextVQA', + data_root='data/textvqa', + ann_file='annotations/TextVQA_0.5.1_val.json', + pipeline=val_pipeline, + data_prefix='images/train_images', +) + +openflamingo_textvqa_dataloader = dict( + batch_size=8, + num_workers=4, + dataset=dataset, + sampler=dict(type='DefaultSampler', shuffle=False), + collate_fn=dict(type='default_collate'), + persistent_workers=True, +) + +# model settings +openflamingo_textvqa_model = dict( + type='openflamingo', + data_preprocessor=dict( + type='mmpretrain.MultiModalDataPreprocessor', + mean=[122.770938, 116.7460125, 104.09373615], + std=[68.5005327, 66.6321579, 70.32316305], + to_rgb=True, + ), + tokenizer=dict(type='mmpretrain.LlamaTokenizer', + name_or_path='decapoda-research/llama-7b-hf'), + vision_encoder=dict( + type='mmpretrain.VisionTransformer', + arch='l', + patch_size=14, + pre_norm=True, + norm_cfg=dict(type='LN', eps=1e-5), + layer_cfgs=dict(act_cfg=dict(type='mmpretrain.QuickGELU')), + final_norm=False, + out_type='raw', + pretrained= # noqa: E251 + '/path/to/vision/encoder', # noqa + ), + lang_encoder=dict( + base=dict(type='mmpretrain.AutoModelForCausalLM', + name_or_path= + 'decapoda-research/llama-7b-hf', + local_files_only=True), + adapter=dict(type='mmpretrain.FlamingoLMAdapter', + vis_hidden_size=1024, + cross_attn_every_n_layers=4, + use_media_placement_augmentation=False), + ), + task='vqa', + generation_cfg=dict(num_beams=3, max_new_tokens=20, length_penalty=-2.0), + prompt_constructor=dict(type=OpenFlamingoVQAPromptConstructor) +) + +# evaluation settings +openflamingo_textvqa_evaluator = [dict(type='mmpretrain.VQAAcc')] + +openflamingo_load_from = '/path/to/pretrained/weights' # noqa diff --git a/configs/multimodal/openflamingo/openflamingo_vizwiz.py b/configs/multimodal/openflamingo/openflamingo_vizwiz.py new file mode 100644 index 00000000..e9b5262a --- /dev/null +++ b/configs/multimodal/openflamingo/openflamingo_vizwiz.py @@ -0,0 +1,74 @@ +from opencompass.multimodal.models.openflamingo import OpenFlamingoVQAPromptConstructor +# dataloader settings +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmpretrain.ResizeEdge', + scale=224, + interpolation='bicubic', + backend='pillow'), + dict(type='CenterCrop', crop_size=(224, 224)), + dict( + type='mmpretrain.PackInputs', + algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'], + meta_keys=['question_id', 'image_id'], + ) +] + +dataset = dict(type='mmpretrain.VizWiz', + data_root='data/vizwiz/', + data_prefix='Images/val', + ann_file='Annotations/val.json', + 
pipeline=val_pipeline) + +openflamingo_vizwiz_dataloader = dict( + batch_size=8, + num_workers=4, + dataset=dataset, + sampler=dict(type='DefaultSampler', shuffle=False), + collate_fn=dict(type='default_collate'), + persistent_workers=True, +) + +# model settings +openflamingo_vizwiz_model = dict( + type='openflamingo', + data_preprocessor=dict( + type='mmpretrain.MultiModalDataPreprocessor', + mean=[122.770938, 116.7460125, 104.09373615], + std=[68.5005327, 66.6321579, 70.32316305], + to_rgb=True, + ), + tokenizer=dict(type='mmpretrain.LlamaTokenizer', + name_or_path='decapoda-research/llama-7b-hf'), + vision_encoder=dict( + type='mmpretrain.VisionTransformer', + arch='l', + patch_size=14, + pre_norm=True, + norm_cfg=dict(type='LN', eps=1e-5), + layer_cfgs=dict(act_cfg=dict(type='mmpretrain.QuickGELU')), + final_norm=False, + out_type='raw', + pretrained= # noqa: E251 + '/path/to/vision/encoder', # noqa + ), + lang_encoder=dict( + base=dict(type='mmpretrain.AutoModelForCausalLM', + name_or_path= + 'decapoda-research/llama-7b-hf', + local_files_only=True), + adapter=dict(type='mmpretrain.FlamingoLMAdapter', + vis_hidden_size=1024, + cross_attn_every_n_layers=4, + use_media_placement_augmentation=False), + ), + task='vqa', + generation_cfg=dict(num_beams=3, max_new_tokens=20, length_penalty=-2.0), + prompt_constructor=dict(type=OpenFlamingoVQAPromptConstructor) +) + +# evaluation settings +openflamingo_vizwiz_evaluator = [dict(type='mmpretrain.VQAAcc')] + + +openflamingo_load_from = '/path/to/pretrained/weights' # noqa diff --git a/configs/multimodal/openflamingo/openflamingo_vqav2.py b/configs/multimodal/openflamingo/openflamingo_vqav2.py new file mode 100644 index 00000000..52d4dbf5 --- /dev/null +++ b/configs/multimodal/openflamingo/openflamingo_vqav2.py @@ -0,0 +1,75 @@ +from opencompass.multimodal.models.openflamingo import OpenFlamingoVQAPromptConstructor +# dataloader settings +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmpretrain.ResizeEdge', + scale=224, + interpolation='bicubic', + backend='pillow'), + dict(type='CenterCrop', crop_size=(224, 224)), + dict( + type='mmpretrain.PackInputs', + algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'], + meta_keys=['question_id', 'image_id'], + ) +] + +dataset = dict( + type='mmpretrain.COCOVQA', + data_root='data/coco', + data_prefix='images/val2014', + question_file='annotations/v2_OpenEnded_mscoco_val2014_questions.json', + ann_file='annotations/v2_mscoco_val2014_annotations.json', + pipeline=val_pipeline) + +openflamingo_vqav2_dataloader = dict( + batch_size=8, + num_workers=4, + dataset=dataset, + sampler=dict(type='DefaultSampler', shuffle=False), + collate_fn=dict(type='default_collate'), + persistent_workers=True, +) + +# model settings +openflamingo_vqav2_model = dict( + type='openflamingo', + data_preprocessor=dict( + type='mmpretrain.MultiModalDataPreprocessor', + mean=[122.770938, 116.7460125, 104.09373615], + std=[68.5005327, 66.6321579, 70.32316305], + to_rgb=True, + ), + tokenizer=dict(type='mmpretrain.LlamaTokenizer', + name_or_path='decapoda-research/llama-7b-hf'), + vision_encoder=dict( + type='mmpretrain.VisionTransformer', + arch='l', + patch_size=14, + pre_norm=True, + norm_cfg=dict(type='LN', eps=1e-5), + layer_cfgs=dict(act_cfg=dict(type='mmpretrain.QuickGELU')), + final_norm=False, + out_type='raw', + pretrained= # noqa: E251 + '/path/to/vision/encoder', # noqa + ), + lang_encoder=dict( + base=dict(type='mmpretrain.AutoModelForCausalLM', + name_or_path= + 'decapoda-research/llama-7b-hf', 
+ local_files_only=True), + adapter=dict(type='mmpretrain.FlamingoLMAdapter', + vis_hidden_size=1024, + cross_attn_every_n_layers=4, + use_media_placement_augmentation=False), + ), + task='vqa', + generation_cfg=dict(num_beams=3, max_new_tokens=20, length_penalty=-2.0), + prompt_constructor=dict(type=OpenFlamingoVQAPromptConstructor) +) + +# evaluation settings +openflamingo_vqav2_evaluator = [dict(type='mmpretrain.VQAAcc')] + +openflamingo_load_from = '/path/to/pretrained/weights' # noqa diff --git a/configs/multimodal/openflamingo/openflamingo_vsr.py b/configs/multimodal/openflamingo/openflamingo_vsr.py new file mode 100644 index 00000000..0130962d --- /dev/null +++ b/configs/multimodal/openflamingo/openflamingo_vsr.py @@ -0,0 +1,75 @@ +from opencompass.multimodal.models.openflamingo import OpenFlamingoVQAPromptConstructor, OpenFlamingoVSRPostProcessor +# dataloader settings +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='mmpretrain.ResizeEdge', + scale=224, + interpolation='bicubic', + backend='pillow'), + dict(type='CenterCrop', crop_size=(224, 224)), + dict( + type='mmpretrain.PackInputs', + algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'], + meta_keys=['question_id', 'image_id'], + ) +] + +dataset = dict(type='mmpretrain.VSR', + data_root='data/vsr/', + data_prefix='images/', + ann_file='annotations/test.json', + pipeline=val_pipeline) + +openflamingo_vsr_dataloader = dict( + batch_size=8, + num_workers=4, + dataset=dataset, + sampler=dict(type='DefaultSampler', shuffle=False), + collate_fn=dict(type='default_collate'), + persistent_workers=True, +) + +# model settings +openflamingo_vsr_model = dict( + type='openflamingo', + data_preprocessor=dict( + type='mmpretrain.MultiModalDataPreprocessor', + mean=[122.770938, 116.7460125, 104.09373615], + std=[68.5005327, 66.6321579, 70.32316305], + to_rgb=True, + ), + tokenizer=dict(type='mmpretrain.LlamaTokenizer', + name_or_path='decapoda-research/llama-7b-hf'), + vision_encoder=dict( + type='mmpretrain.VisionTransformer', + arch='l', + patch_size=14, + pre_norm=True, + norm_cfg=dict(type='LN', eps=1e-5), + layer_cfgs=dict(act_cfg=dict(type='mmpretrain.QuickGELU')), + final_norm=False, + out_type='raw', + pretrained= # noqa: E251 + '/path/to/vision/encoder', # noqa + ), + lang_encoder=dict( + base=dict(type='mmpretrain.AutoModelForCausalLM', + name_or_path= + 'decapoda-research/llama-7b-hf', + local_files_only=True), + adapter=dict(type='mmpretrain.FlamingoLMAdapter', + vis_hidden_size=1024, + cross_attn_every_n_layers=4, + use_media_placement_augmentation=False), + ), + task='vqa', + generation_cfg=dict(num_beams=3, max_new_tokens=20, length_penalty=-2.0), + prompt_constructor=dict(type=OpenFlamingoVQAPromptConstructor, shot_prompt=('The cat is behind the laptop. Short Answer:yes<|endofchunk|>' # noqa: E501 + 'The cow is ahead of the person. 
Short Answer:no<|endofchunk|>')), + post_processor=dict(type=OpenFlamingoVSRPostProcessor) +) + +# evaluation settings +openflamingo_vsr_evaluator = [dict(type='mmpretrain.GQAAcc')] + +openflamingo_load_from = '/path/to/pretrained/weights' # noqa diff --git a/configs/multimodal/otter/README.md b/configs/multimodal/otter/README.md index 986cb2c8..b747415a 100644 --- a/configs/multimodal/otter/README.md +++ b/configs/multimodal/otter/README.md @@ -3,12 +3,9 @@ ### Prepare the environment ```sh -cd opencompass/multimodal/models/otter -git clone https://github.com/Luodian/Otter.git +pip install otter_ai ``` -Then create a new conda environment and prepare the environement according to this [doc](https://github.com/Luodian/Otter) - ### Start evaluation #### Slurm diff --git a/configs/multimodal/qwen/qwenvl_chat_7b_coco_caption.py b/configs/multimodal/qwen/qwenvl_chat_7b_coco_caption.py new file mode 100644 index 00000000..96b22e84 --- /dev/null +++ b/configs/multimodal/qwen/qwenvl_chat_7b_coco_caption.py @@ -0,0 +1,44 @@ +from opencompass.multimodal.models.qwen import QwenVLChatPromptConstructor + +# dataloader settings +val_pipeline = [ + dict(type='mmpretrain.LoadImageFromFile'), + dict(type='mmpretrain.ToPIL', to_rgb=True), + dict(type='mmpretrain.torchvision/Resize', + size=(448, 448), + interpolation=3), + dict(type='mmpretrain.torchvision/ToTensor'), + dict(type='mmpretrain.torchvision/Normalize', + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711)), + dict(type='mmpretrain.PackInputs', + algorithm_keys=['image_id']) +] + +dataset = dict(type='mmpretrain.COCOCaption', + data_root='data/coco', + data_prefix=dict(img_path='images'), + ann_file='annotations/coco_karpathy_val.json', + pipeline=val_pipeline) + +qwen_coco_caption_dataloader = dict(batch_size=1, + num_workers=4, + dataset=dataset, + collate_fn=dict(type='pseudo_collate'), + sampler=dict(type='DefaultSampler', shuffle=False)) + +# model settings +qwen_coco_caption_model = dict( + type='qwen-vl-chat', + pretrained_path='Qwen/Qwen-VL-Chat', # or Huggingface repo id + prompt_constructor=dict(type=QwenVLChatPromptConstructor, prompt='Describe the image.'), + is_caption_task=True, +) + +# evaluation settings +qwen_coco_caption_evaluator = [ + dict( + type='mmpretrain.COCOCaption', + ann_file='data/coco/annotations/coco_karpathy_val_gt.json', + ) # noqa +] diff --git a/configs/multimodal/qwen/qwenvl_chat_7b_flickr30k.py b/configs/multimodal/qwen/qwenvl_chat_7b_flickr30k.py new file mode 100644 index 00000000..c286d064 --- /dev/null +++ b/configs/multimodal/qwen/qwenvl_chat_7b_flickr30k.py @@ -0,0 +1,44 @@ +from opencompass.multimodal.models.qwen import QwenVLChatPromptConstructor + +# dataloader settings +val_pipeline = [ + dict(type='mmpretrain.LoadImageFromFile'), + dict(type='mmpretrain.ToPIL', to_rgb=True), + dict(type='mmpretrain.torchvision/Resize', + size=(448, 448), + interpolation=3), + dict(type='mmpretrain.torchvision/ToTensor'), + dict(type='mmpretrain.torchvision/Normalize', + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711)), + dict(type='mmpretrain.PackInputs', algorithm_keys=['image_id']) +] + +dataset = dict(type='mmpretrain.Flickr30kCaption', + data_root='data/flickr30k', + ann_file='annotations/dataset_flickr30k.json', + data_prefix='images', + split='val', + pipeline=val_pipeline) + +qwen_flickr30k_dataloader = dict(batch_size=1, + num_workers=4, + dataset=dataset, + collate_fn=dict(type='pseudo_collate'), + sampler=dict(type='DefaultSampler', 
shuffle=False)) + +# model settings +qwen_flickr30k_model = dict( + type='qwen-vl-chat', + pretrained_path='Qwen/Qwen-VL-Chat', # or Huggingface repo id + prompt_constructor=dict(type=QwenVLChatPromptConstructor, prompt='Describe the image.'), + is_caption_task=True, +) + +# evaluation settings +qwen_flickr30k_evaluator = [ + dict( + type='mmpretrain.COCOCaption', + ann_file='data/flickr30k/annotations/flickr30k_val_gt.json', + ) # noqa +] diff --git a/configs/multimodal/qwen/qwenvl_chat_7b_gqa.py b/configs/multimodal/qwen/qwenvl_chat_7b_gqa.py new file mode 100644 index 00000000..8ad5e0f2 --- /dev/null +++ b/configs/multimodal/qwen/qwenvl_chat_7b_gqa.py @@ -0,0 +1,41 @@ +from opencompass.multimodal.models.qwen import QwenVLChatVQAPromptConstructor + +# dataloader settings +val_pipeline = [ + dict(type='mmpretrain.LoadImageFromFile'), + dict(type='mmpretrain.ToPIL', to_rgb=True), + dict(type='mmpretrain.torchvision/Resize', + size=(448, 448), + interpolation=3), + dict(type='mmpretrain.torchvision/ToTensor'), + dict(type='mmpretrain.torchvision/Normalize', + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711)), + dict( + type='mmpretrain.PackInputs', + algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'], + meta_keys=['question_id', 'image_id'], + ) +] + +dataset = dict(type='mmpretrain.GQA', + data_root='data/gqa', + data_prefix='images', + ann_file='annotations/testdev_balanced_questions.json', + pipeline=val_pipeline) + +qwen_gqa_dataloader = dict(batch_size=1, + num_workers=4, + dataset=dataset, + collate_fn=dict(type='pseudo_collate'), + sampler=dict(type='DefaultSampler', shuffle=False)) + +# model settings +qwen_gqa_model = dict( + type='qwen-vl-chat', + pretrained_path='Qwen/Qwen-VL-Chat', # or Huggingface repo id + prompt_constructor=dict(type=QwenVLChatVQAPromptConstructor) +) + +# evaluation settings +qwen_gqa_evaluator = [dict(type='mmpretrain.GQAAcc')] diff --git a/configs/multimodal/qwen/qwenvl_chat_7b_mmbench_cn.py b/configs/multimodal/qwen/qwenvl_chat_7b_mmbench_cn.py new file mode 100644 index 00000000..18e811f8 --- /dev/null +++ b/configs/multimodal/qwen/qwenvl_chat_7b_mmbench_cn.py @@ -0,0 +1,41 @@ +from opencompass.multimodal.models.qwen import QwenVLMMBenchPromptConstructor + +# dataloader settings +val_pipeline = [ + dict(type='mmpretrain.torchvision/Resize', + size=(448, 448), + interpolation=3), + dict(type='mmpretrain.torchvision/ToTensor'), + dict(type='mmpretrain.torchvision/Normalize', + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711)), + dict(type='mmpretrain.PackInputs', + algorithm_keys=[ + 'question', 'options', 'category', 'l2-category', 'context', + 'index', 'options_dict' + ]) +] + +dataset = dict(type='opencompass.MMBenchDataset', + data_file='/mnt/petrelfs/share_data/yuanyike/cnbench_v010_rolling.tsv', + pipeline=val_pipeline, + sys_prompt='请从以下选项中选择一个正确选项。') + +qwen_mmbench_dataloader = dict(batch_size=1, + num_workers=4, + dataset=dataset, + collate_fn=dict(type='pseudo_collate'), + sampler=dict(type='DefaultSampler', shuffle=False)) + +# model settings +qwen_model = dict( + type='qwen-vl-chat', + pretrained_path='Qwen/Qwen-VL-Chat', # or Huggingface repo id + prompt_constructor=dict(type=QwenVLMMBenchPromptConstructor) +) + +# evaluation settings +qwen_mmbench_evaluator = [ + dict(type='opencompass.DumpResults', + save_path='work_dirs/qwenvl-chat-7b-cnbench-v010.xlsx') +] diff --git a/configs/multimodal/qwen/qwenvl_chat_7b_ocr_vqa.py 
b/configs/multimodal/qwen/qwenvl_chat_7b_ocr_vqa.py new file mode 100644 index 00000000..3ae7c32e --- /dev/null +++ b/configs/multimodal/qwen/qwenvl_chat_7b_ocr_vqa.py @@ -0,0 +1,42 @@ +from opencompass.multimodal.models.qwen import QwenVLChatVQAPromptConstructor + +# dataloader settings +val_pipeline = [ + dict(type='mmpretrain.LoadImageFromFile'), + dict(type='mmpretrain.ToPIL', to_rgb=True), + dict(type='mmpretrain.torchvision/Resize', + size=(448, 448), + interpolation=3), + dict(type='mmpretrain.torchvision/ToTensor'), + dict(type='mmpretrain.torchvision/Normalize', + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711)), + dict( + type='mmpretrain.PackInputs', + algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'], + meta_keys=['question_id', 'image_id'], + ) +] + +dataset = dict(type='mmpretrain.OCRVQA', + data_root='data/ocrvqa', + ann_file='annotations/dataset.json', + split='test', + data_prefix='images', + pipeline=val_pipeline) + +qwen_ocrvqa_dataloader = dict(batch_size=1, + num_workers=4, + dataset=dataset, + collate_fn=dict(type='pseudo_collate'), + sampler=dict(type='DefaultSampler', shuffle=False)) + +# model settings +qwen_ocrvqa_model = dict( + type='qwen-vl-chat', + pretrained_path='Qwen/Qwen-VL-Chat', # or Huggingface repo id + prompt_constructor=dict(type=QwenVLChatVQAPromptConstructor) +) + +# evaluation settings +qwen_ocrvqa_evaluator = [dict(type='mmpretrain.VQAAcc')] diff --git a/configs/multimodal/qwen/qwenvl_chat_7b_ok_vqa.py b/configs/multimodal/qwen/qwenvl_chat_7b_ok_vqa.py new file mode 100644 index 00000000..a1261e89 --- /dev/null +++ b/configs/multimodal/qwen/qwenvl_chat_7b_ok_vqa.py @@ -0,0 +1,44 @@ +from opencompass.multimodal.models.qwen import QwenVLChatVQAPromptConstructor + +# dataloader settings +val_pipeline = [ + dict(type='mmpretrain.LoadImageFromFile'), + dict(type='mmpretrain.ToPIL', to_rgb=True), + dict(type='mmpretrain.torchvision/Resize', + size=(448, 448), + interpolation=3), + dict(type='mmpretrain.torchvision/ToTensor'), + dict(type='mmpretrain.torchvision/Normalize', + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711)), + dict( + type='mmpretrain.PackInputs', + algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'], + meta_keys=['question_id', 'image_id'], + ) +] + +dataset = dict( + type='mmpretrain.COCOVQA', + data_root='data/okvqa', + question_file='annotations/OpenEnded_mscoco_val2014_questions.json', + ann_file='annotations/mscoco_val2014_annotations.json', + pipeline=val_pipeline, + data_prefix='images/val2014', +) + +qwen_okvqa_dataloader = dict(batch_size=1, + num_workers=4, + dataset=dataset, + collate_fn=dict(type='pseudo_collate'), + sampler=dict(type='DefaultSampler', shuffle=False)) + +# model settings +qwen_okvqa_model = dict( + type='qwen-vl-chat', + pretrained_path='Qwen/Qwen-VL-Chat', # or Huggingface repo id + prompt_constructor=dict(type=QwenVLChatVQAPromptConstructor) +) + +# evaluation settings +qwen_okvqa_evaluator = [dict(type='mmpretrain.VQAAcc')] diff --git a/configs/multimodal/qwen/qwenvl_chat_7b_scienceqa.py b/configs/multimodal/qwen/qwenvl_chat_7b_scienceqa.py new file mode 100644 index 00000000..49ac6849 --- /dev/null +++ b/configs/multimodal/qwen/qwenvl_chat_7b_scienceqa.py @@ -0,0 +1,43 @@ +from opencompass.multimodal.models.qwen import QwenVLChatScienceQAPromptConstructor + +# dataloader settings +val_pipeline = [ + dict(type='mmpretrain.LoadImageFromFile'), + dict(type='mmpretrain.ToPIL', to_rgb=True), + 
dict(type='mmpretrain.torchvision/Resize', + size=(448, 448), + interpolation=3), + dict(type='mmpretrain.torchvision/ToTensor'), + dict(type='mmpretrain.torchvision/Normalize', + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711)), + dict(type='mmpretrain.PackInputs', + algorithm_keys=[ + 'question', 'gt_answer', 'choices', 'hint', 'lecture', 'solution' + ]) +] + +dataset = dict(type='mmpretrain.ScienceQA', + data_root='./data/scienceqa', + split='val', + split_file='pid_splits.json', + ann_file='problems.json', + image_only=True, + data_prefix=dict(img_path='val'), + pipeline=val_pipeline) + +qwen_scienceqa_dataloader = dict(batch_size=1, + num_workers=4, + dataset=dataset, + collate_fn=dict(type='pseudo_collate'), + sampler=dict(type='DefaultSampler', shuffle=False)) + +# model settings +qwen_scienceqa_model = dict( + type='qwen-vl-chat', + pretrained_path='Qwen/Qwen-VL-Chat', # or Huggingface repo id + prompt_constructor=dict(type=QwenVLChatScienceQAPromptConstructor) +) + +# evaluation settings +qwen_scienceqa_evaluator = [dict(type='mmpretrain.ScienceQAMetric')] diff --git a/configs/multimodal/qwen/qwenvl_chat_7b_textvqa.py b/configs/multimodal/qwen/qwenvl_chat_7b_textvqa.py new file mode 100644 index 00000000..fec8a1d4 --- /dev/null +++ b/configs/multimodal/qwen/qwenvl_chat_7b_textvqa.py @@ -0,0 +1,43 @@ +from opencompass.multimodal.models.qwen import QwenVLChatVQAPromptConstructor + +# dataloader settings +val_pipeline = [ + dict(type='mmpretrain.LoadImageFromFile'), + dict(type='mmpretrain.ToPIL', to_rgb=True), + dict(type='mmpretrain.torchvision/Resize', + size=(448, 448), + interpolation=3), + dict(type='mmpretrain.torchvision/ToTensor'), + dict(type='mmpretrain.torchvision/Normalize', + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711)), + dict( + type='mmpretrain.PackInputs', + algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'], + meta_keys=['question_id', 'image_id'], + ) +] + +dataset = dict( + type='mmpretrain.TextVQA', + data_root='data/textvqa', + ann_file='annotations/TextVQA_0.5.1_val.json', + pipeline=val_pipeline, + data_prefix='images/train_images', +) + +qwen_textvqa_dataloader = dict(batch_size=1, + num_workers=4, + dataset=dataset, + collate_fn=dict(type='pseudo_collate'), + sampler=dict(type='DefaultSampler', shuffle=False)) + +# model settings +qwen_textvqa_model = dict( + type='qwen-vl-chat', + pretrained_path='Qwen/Qwen-VL-Chat', # or Huggingface repo id + prompt_constructor=dict(type=QwenVLChatVQAPromptConstructor) +) + +# evaluation settings +qwen_textvqa_evaluator = [dict(type='mmpretrain.VQAAcc')] diff --git a/configs/multimodal/qwen/qwenvl_chat_7b_vizwiz.py b/configs/multimodal/qwen/qwenvl_chat_7b_vizwiz.py new file mode 100644 index 00000000..513a360e --- /dev/null +++ b/configs/multimodal/qwen/qwenvl_chat_7b_vizwiz.py @@ -0,0 +1,41 @@ +from opencompass.multimodal.models.qwen import QwenVLChatVQAPromptConstructor + +# dataloader settings +val_pipeline = [ + dict(type='mmpretrain.LoadImageFromFile'), + dict(type='mmpretrain.ToPIL', to_rgb=True), + dict(type='mmpretrain.torchvision/Resize', + size=(448, 448), + interpolation=3), + dict(type='mmpretrain.torchvision/ToTensor'), + dict(type='mmpretrain.torchvision/Normalize', + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711)), + dict( + type='mmpretrain.PackInputs', + algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'], + meta_keys=['question_id', 'image_id'], + ) +] + +dataset = 
dict(type='mmpretrain.VizWiz', + data_root='data/vizwiz/', + data_prefix='Images/val', + ann_file='Annotations/val.json', + pipeline=val_pipeline) + +qwen_vizwiz_dataloader = dict(batch_size=1, + num_workers=4, + dataset=dataset, + collate_fn=dict(type='pseudo_collate'), + sampler=dict(type='DefaultSampler', shuffle=False)) + +# model settings +qwen_vizwiz_model = dict( + type='qwen-vl-chat', + pretrained_path='Qwen/Qwen-VL-Chat', # or Huggingface repo id + prompt_constructor=dict(type=QwenVLChatVQAPromptConstructor) +) + +# evaluation settings +qwen_vizwiz_evaluator = [dict(type='mmpretrain.VQAAcc')] diff --git a/configs/multimodal/qwen/qwenvl_chat_7b_vqav2.py b/configs/multimodal/qwen/qwenvl_chat_7b_vqav2.py new file mode 100644 index 00000000..5c855652 --- /dev/null +++ b/configs/multimodal/qwen/qwenvl_chat_7b_vqav2.py @@ -0,0 +1,43 @@ +from opencompass.multimodal.models.qwen import QwenVLChatVQAPromptConstructor + +# dataloader settings +val_pipeline = [ + dict(type='mmpretrain.LoadImageFromFile'), + dict(type='mmpretrain.ToPIL', to_rgb=True), + dict(type='mmpretrain.torchvision/Resize', + size=(448, 448), + interpolation=3), + dict(type='mmpretrain.torchvision/ToTensor'), + dict(type='mmpretrain.torchvision/Normalize', + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711)), + dict( + type='mmpretrain.PackInputs', + algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'], + meta_keys=['question_id', 'image_id'], + ) +] + +dataset = dict( + type='mmpretrain.COCOVQA', + data_root='data/coco', + data_prefix='images/val2014', + question_file='annotations/v2_OpenEnded_mscoco_val2014_questions.json', + ann_file='annotations/v2_mscoco_val2014_annotations.json', + pipeline=val_pipeline) + +qwen_vqav2_dataloader = dict(batch_size=1, + num_workers=4, + dataset=dataset, + collate_fn=dict(type='pseudo_collate'), + sampler=dict(type='DefaultSampler', shuffle=False)) + +# model settings +qwen_vqav2_model = dict( + type='qwen-vl-chat', + pretrained_path='Qwen/Qwen-VL-Chat', # or Huggingface repo id + prompt_constructor=dict(type=QwenVLChatVQAPromptConstructor) +) + +# evaluation settings +qwen_vqav2_evaluator = [dict(type='mmpretrain.VQAAcc')] diff --git a/configs/multimodal/qwen/qwenvl_chat_7b_vsr.py b/configs/multimodal/qwen/qwenvl_chat_7b_vsr.py new file mode 100644 index 00000000..331a48f4 --- /dev/null +++ b/configs/multimodal/qwen/qwenvl_chat_7b_vsr.py @@ -0,0 +1,42 @@ +from opencompass.multimodal.models.qwen import QwenVLChatVQAPromptConstructor, QwenVLChatVSRPostProcessor + +# dataloader settings +val_pipeline = [ + dict(type='mmpretrain.LoadImageFromFile'), + dict(type='mmpretrain.ToPIL', to_rgb=True), + dict(type='mmpretrain.torchvision/Resize', + size=(448, 448), + interpolation=3), + dict(type='mmpretrain.torchvision/ToTensor'), + dict(type='mmpretrain.torchvision/Normalize', + mean=(0.48145466, 0.4578275, 0.40821073), + std=(0.26862954, 0.26130258, 0.27577711)), + dict( + type='mmpretrain.PackInputs', + algorithm_keys=['question', 'gt_answer', 'gt_answer_weight'], + meta_keys=['question_id', 'image_id'], + ) +] + +dataset = dict(type='mmpretrain.VSR', + data_root='data/vsr/', + data_prefix='images/', + ann_file='annotations/test.json', + pipeline=val_pipeline) + +qwen_vsr_dataloader = dict(batch_size=1, + num_workers=4, + dataset=dataset, + collate_fn=dict(type='pseudo_collate'), + sampler=dict(type='DefaultSampler', shuffle=False)) + +# model settings +qwen_vsr_model = dict( + type='qwen-vl-chat', + pretrained_path='Qwen/Qwen-VL-Chat', # or 
Huggingface repo id + prompt_constructor=dict(type=QwenVLChatVQAPromptConstructor), + post_processor=dict(type=QwenVLChatVSRPostProcessor) +) + +# evaluation settings +qwen_vsr_evaluator = [dict(type='mmpretrain.GQAAcc')] diff --git a/configs/multimodal/visualglm/visualglm_6b_coco_caption.py b/configs/multimodal/visualglm/visualglm_6b_coco_caption.py index e2fbceca..66e0801c 100644 --- a/configs/multimodal/visualglm/visualglm_6b_coco_caption.py +++ b/configs/multimodal/visualglm/visualglm_6b_coco_caption.py @@ -32,7 +32,7 @@ visualglm_coco_caption_model = dict( type='visualglm', pretrained_path='/path/to/visualglm', # or Huggingface repo id is_caption_task=True, - prompt_constructor=dict(type=VisualGLMBasePromptConstructor), + prompt_constructor=dict(type=VisualGLMBasePromptConstructor, system_prompt='A photo of'), post_processor=dict(type=VisualGLMBasePostProcessor) ) diff --git a/configs/multimodal/visualglm/visualglm_6b_flickr30k.py b/configs/multimodal/visualglm/visualglm_6b_flickr30k.py index b88e519f..58ab4649 100644 --- a/configs/multimodal/visualglm/visualglm_6b_flickr30k.py +++ b/configs/multimodal/visualglm/visualglm_6b_flickr30k.py @@ -33,7 +33,7 @@ visualglm_flickr30k_model = dict( type='visualglm', pretrained_path='/path/to/visualglm', # or Huggingface repo id is_caption_task=True, - prompt_constructor=dict(type=VisualGLMBasePromptConstructor), + prompt_constructor=dict(type=VisualGLMBasePromptConstructor, system_prompt='A photo of'), post_processor=dict(type=VisualGLMBasePostProcessor) ) diff --git a/configs/multimodal/visualglm/visualglm_6b_mmbench.py b/configs/multimodal/visualglm/visualglm_6b_mmbench.py index 0dbbbd27..8821fe4a 100644 --- a/configs/multimodal/visualglm/visualglm_6b_mmbench.py +++ b/configs/multimodal/visualglm/visualglm_6b_mmbench.py @@ -20,22 +20,23 @@ dataset = dict(type='opencompass.MMBenchDataset', data_file='data/mmbench/mmbench_test_20230712.tsv', pipeline=val_pipeline) -mmbench_dataloader = dict(batch_size=1, +visualglm_mmbench_dataloader = dict(batch_size=1, num_workers=4, dataset=dataset, collate_fn=dict(type='pseudo_collate'), sampler=dict(type='DefaultSampler', shuffle=False)) # model settings -visualglm_model = dict( +visualglm_mmbench_model = dict( type='visualglm', pretrained_path='/path/to/visualglm', # or Huggingface repo id prompt_constructor=dict(type=VisualGLMMMBenchPromptConstructor), - post_processor=dict(type=VisualGLMBasePostProcessor) + post_processor=dict(type=VisualGLMBasePostProcessor), + gen_kwargs=dict(max_new_tokens=50,num_beams=5,do_sample=False,repetition_penalty=1.0,length_penalty=-1.0) ) # evaluation settings -mmbench_evaluator = [ +visualglm_mmbench_evaluator = [ dict(type='opencompass.DumpResults', save_path='work_dirs/visualglm-6b-mmbench.xlsx') ] diff --git a/configs/multimodal/visualglm/visualglm_6b_scienceqa.py b/configs/multimodal/visualglm/visualglm_6b_scienceqa.py index 4d56f40d..8ec2393c 100644 --- a/configs/multimodal/visualglm/visualglm_6b_scienceqa.py +++ b/configs/multimodal/visualglm/visualglm_6b_scienceqa.py @@ -26,7 +26,7 @@ dataset = dict(type='mmpretrain.ScienceQA', data_prefix=dict(img_path='val'), pipeline=val_pipeline) -visualglm_vizwiz_dataloader = dict(batch_size=1, +visualglm_scienceqa_dataloader = dict(batch_size=1, num_workers=4, dataset=dataset, collate_fn=dict(type='pseudo_collate'), diff --git a/configs/multimodal/visualglm/visualglm_6b_textvqa.py b/configs/multimodal/visualglm/visualglm_6b_textvqa.py index 20774938..a99ee625 100644 --- a/configs/multimodal/visualglm/visualglm_6b_textvqa.py 
+++ b/configs/multimodal/visualglm/visualglm_6b_textvqa.py @@ -33,7 +33,7 @@ visualglm_textvqa_dataloader = dict(batch_size=1, sampler=dict(type='DefaultSampler', shuffle=False)) # model settings -visualglm_model = dict( +visualglm_textvqa_model = dict( type='visualglm', pretrained_path='/path/to/visualglm', # or Huggingface repo id prompt_constructor=dict(type=VisualGLMVQAPromptConstructor), diff --git a/configs/multimodal/visualglm/visualglm_6b_vizwiz.py b/configs/multimodal/visualglm/visualglm_6b_vizwiz.py index b49a8c6e..1accb4ac 100644 --- a/configs/multimodal/visualglm/visualglm_6b_vizwiz.py +++ b/configs/multimodal/visualglm/visualglm_6b_vizwiz.py @@ -31,7 +31,7 @@ visualglm_vizwiz_dataloader = dict(batch_size=1, sampler=dict(type='DefaultSampler', shuffle=False)) # model settings -visualglm_model = dict( +visualglm_vizwiz_model = dict( type='visualglm', pretrained_path='/path/to/visualglm', # or Huggingface repo id prompt_constructor=dict(type=VisualGLMVQAPromptConstructor), diff --git a/configs/multimodal/visualglm/visualglm_6b_vqav2.py b/configs/multimodal/visualglm/visualglm_6b_vqav2.py index 4bbb8426..ff3083dd 100644 --- a/configs/multimodal/visualglm/visualglm_6b_vqav2.py +++ b/configs/multimodal/visualglm/visualglm_6b_vqav2.py @@ -33,7 +33,7 @@ visualglm_vqav2_dataloader = dict(batch_size=1, sampler=dict(type='DefaultSampler', shuffle=False)) # model settings -visualglm_model = dict( +visualglm_vqav2_model = dict( type='visualglm', pretrained_path='/path/to/visualglm', # or Huggingface repo id prompt_constructor=dict(type=VisualGLMVQAPromptConstructor), diff --git a/configs/multimodal/visualglm/visualglm_6b_vsr.py b/configs/multimodal/visualglm/visualglm_6b_vsr.py index 69664835..f932975d 100644 --- a/configs/multimodal/visualglm/visualglm_6b_vsr.py +++ b/configs/multimodal/visualglm/visualglm_6b_vsr.py @@ -32,7 +32,7 @@ visualglm_vsr_dataloader = dict(batch_size=1, sampler=dict(type='DefaultSampler', shuffle=False)) # model settings -visualglm_model = dict( +visualglm_vsr_model = dict( type='visualglm', pretrained_path='/path/to/visualglm', # or Huggingface repo id prompt_constructor=dict(type=VisualGLMVQAPromptConstructor), diff --git a/opencompass/multimodal/models/__init__.py b/opencompass/multimodal/models/__init__.py index 6bb6603c..1157c33d 100644 --- a/opencompass/multimodal/models/__init__.py +++ b/opencompass/multimodal/models/__init__.py @@ -19,9 +19,6 @@ if osp.exists('opencompass/multimodal/models/mplug_owl/mPLUG-Owl'): from .mplug_owl import * # noqa: F401, F403 from .openflamingo import * # noqa: F401, F403 - -if osp.exists('opencompass/multimodal/models/otter/Otter'): - from .otter import * # noqa: F401, F403 - +from .otter import * # noqa: F401, F403 from .qwen import * # noqa: F401, F403 from .visualglm import * # noqa: F401, F403 diff --git a/opencompass/multimodal/models/openflamingo/__init__.py b/opencompass/multimodal/models/openflamingo/__init__.py index a6707eaf..e83bb40f 100644 --- a/opencompass/multimodal/models/openflamingo/__init__.py +++ b/opencompass/multimodal/models/openflamingo/__init__.py @@ -1,3 +1,12 @@ from .openflamingo import OpenFlamingoInferencer +from .post_processor import OpenFlamingoVSRPostProcessor +from .prompt_constructor import (OpenFlamingoCaptionPromptConstructor, + OpenFlamingoMMBenchPromptConstructor, + OpenFlamingoScienceQAPromptConstructor, + OpenFlamingoVQAPromptConstructor) -__all__ = ['OpenFlamingoInferencer'] +__all__ = [ + 'OpenFlamingoInferencer', 'OpenFlamingoMMBenchPromptConstructor', + 
'OpenFlamingoCaptionPromptConstructor', 'OpenFlamingoVQAPromptConstructor', + 'OpenFlamingoScienceQAPromptConstructor', 'OpenFlamingoVSRPostProcessor' +] diff --git a/opencompass/multimodal/models/openflamingo/openflamingo.py b/opencompass/multimodal/models/openflamingo/openflamingo.py index a46e7ff0..d22bd8f3 100644 --- a/opencompass/multimodal/models/openflamingo/openflamingo.py +++ b/opencompass/multimodal/models/openflamingo/openflamingo.py @@ -1,3 +1,4 @@ +import re from typing import List, Optional, Union import mmengine @@ -21,17 +22,18 @@ class OpenFlamingoInferencer(Flamingo): """ def __init__(self, - prompt_constructor: Optional[dict] = None, + prompt_constructor: dict, post_processor: Optional[dict] = None, mode: str = 'generation', **kwargs): super().__init__(**kwargs) - if prompt_constructor is not None: - self.prompt_constructor = mmengine.registry.build_from_cfg( - prompt_constructor, MM_MODELS) + self.prompt_constructor = mmengine.registry.build_from_cfg( + prompt_constructor, MM_MODELS) if post_processor is not None: self.post_processor = mmengine.registry.build_from_cfg( post_processor, MM_MODELS) + else: + self.post_processor = None self.mode = mode def preprocess_text(self, data_samples: List[DataSample], @@ -46,16 +48,7 @@ class OpenFlamingoInferencer(Flamingo): Returns: List[DataSample]: Return list of data samples. """ - prompts = [] - for sample in data_samples: - question = sample.get('question') - option = sample.get('options') - - prompt = '' + question + ' ' + option + ' ' + 'Answer:' - if data_samples[0].get('context') is not None: - prompt = sample.get('context') + ' ' + prompt - - prompts.append(prompt) + prompts = self.prompt_constructor(data_samples) self.tokenizer.padding_side = 'left' input_text = self.tokenizer( @@ -67,6 +60,42 @@ class OpenFlamingoInferencer(Flamingo): ).to(device) return input_text + def post_process( + self, outputs: torch.Tensor, + data_samples: Optional[List[DataSample]]) -> List[DataSample]: + """Perform post process for outputs for different task. + + Args: + outputs (torch.Tensor): The generated outputs. + data_samples (List[DataSample], optional): The annotation + data of every samples. + + Returns: + List[DataSample]: Return list of data samples. 
+ """ + outputs = self.tokenizer.batch_decode(outputs, + skip_special_tokens=True) + + if data_samples is None: + data_samples = [DataSample() for _ in range(len(outputs))] + + for output, data_sample in zip(outputs, data_samples): + # remove text pattern + if self.task == 'caption': + data_sample.pred_caption = re.split('Output', output, + 1)[0].replace('"', '') + if self.post_processor: + data_sample.pred_caption = self.post_processor( + data_sample.pred_caption) + elif self.task == 'vqa': + data_sample.pred_answer = re.split('Question|Answer', output, + 1)[0] + if self.post_processor: + data_sample.pred_answer = self.post_processor( + data_sample.pred_answer) + + return data_samples + def forward(self, batch: dict) -> Union[DataSample, List[DataSample]]: if self.mode == 'generation': diff --git a/opencompass/multimodal/models/openflamingo/post_processor.py b/opencompass/multimodal/models/openflamingo/post_processor.py new file mode 100644 index 00000000..096805f1 --- /dev/null +++ b/opencompass/multimodal/models/openflamingo/post_processor.py @@ -0,0 +1,13 @@ +class OpenFlamingoVSRPostProcessor: + """VSR post processor for Openflamingo.""" + + def __init__(self) -> None: + pass + + def __call__(self, raw_response: str) -> str: + if 'yes' in raw_response.lower(): + return 'yes' + elif 'no' in raw_response.lower(): + return 'no' + else: + return 'unknown' diff --git a/opencompass/multimodal/models/openflamingo/prompt_constructor.py b/opencompass/multimodal/models/openflamingo/prompt_constructor.py new file mode 100644 index 00000000..de64be37 --- /dev/null +++ b/opencompass/multimodal/models/openflamingo/prompt_constructor.py @@ -0,0 +1,130 @@ +from typing import Optional + +from mmpretrain.structures import DataSample + + +class OpenFlamingoMMBenchPromptConstructor: + """MMBench prompt constructor for OpenFlamingo.""" + + def __init__(self) -> None: + pass + + def __call__(self, data_samples: DataSample) -> tuple: + """Construct prompt. + + Args: + data_samples (DataSample): Input data_samples. + + Returns: + Raw text input (str). + """ + assert len(data_samples) == 1 + sample = data_samples[0] + prompts = [] + question = sample.get('question') + option = sample.get('options') + + prompt = '' + question + ' ' + option + ' ' + 'Answer:' + if sample.get('context') is not None: + prompt = sample.get('context') + ' ' + prompt + + prompts.append(prompt) + + return prompts + + +class OpenFlamingoCaptionPromptConstructor: + """Caption prompt constructor for OpenFlamingo.""" + + def __init__(self, shot_prompt: Optional[str] = None) -> None: + if shot_prompt: + self.shot_prompt = shot_prompt + else: + self.shot_prompt = ( + 'Output:A child holding a flowered umbrella and petting a yak.<|endofchunk|>' # noqa + 'Output:The child is holding a brush close to his mouth.<|endofchunk|>' # noqa + ) # noqa + + def __call__(self, data_samples: DataSample) -> tuple: + """Construct prompt. + + Args: + data_samples (DataSample): Input data_samples. + + Returns: + Raw text input (str). + """ + assert len(data_samples) == 1 + prompts = [] + prompt = 'Output:' + prompts.append(self.shot_prompt + prompt) + return prompts + + +class OpenFlamingoVQAPromptConstructor: + """VQA prompt constructor for OpenFlamingo.""" + + def __init__(self, shot_prompt: Optional[str] = None) -> None: + if shot_prompt: + self.shot_prompt = shot_prompt + else: + self.shot_prompt = ( + 'Question:Is the sky dark? Short Answer:yes<|endofchunk|>' # noqa: E501 + 'Question:What is on the white wall? 
+            )  # noqa
+
+    def __call__(self, data_samples: DataSample) -> tuple:
+        """Construct prompt.
+
+        Args:
+            data_samples (DataSample): Input data_samples.
+
+        Returns:
+            Raw text input (str).
+        """
+        prompts = []
+        for sample in data_samples:
+            question = sample.get('question')
+            prompt = 'Question:{} Short Answer:'.format(question)
+            prompts.append(self.shot_prompt + prompt)
+        return prompts
+
+
+class OpenFlamingoScienceQAPromptConstructor:
+    """ScienceQA prompt constructor for OpenFlamingo."""
+    choice_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F'}
+
+    def __init__(self, shot_prompt: Optional[str] = None) -> None:
+        if shot_prompt:
+            self.shot_prompt = shot_prompt
+        else:
+            self.shot_prompt = (
+                "Context:Question:Which of these states is farthest north? Choices:['(A) West Virginia' '(B) Louisiana' '(C) Arizona' '(D) Oklahoma'] Answer with a single character: A<|endofchunk|>"  # noqa
+                'Context:The diagrams below show two pure samples of gas in identical closed, rigid containers. Each colored ball represents one gas particle. Both samples have the same number of particles.'  # noqa
+                "Question:Compare the average kinetic energies of the particles in each sample. Which sample has the higher temperature? Choices:'[(A) neither' '(B) sample A' '(C) sample B'] Answer with a single character: C<|endofchunk|>"  # noqa
+            )  # noqa
+
+    def __call__(self, data_samples: DataSample) -> tuple:
+        """Construct prompt.
+
+        Args:
+            data_samples (DataSample): Input data_samples.
+
+        Returns:
+            Raw text input (str).
+        """
+        assert len(data_samples) == 1
+        sample = data_samples[0]
+        question = sample.get('question')
+        choices = sample.get('choices')
+        choices = [
+            f'({self.choice_mapping[i]}) ' + item
+            for i, item in enumerate(choices)
+        ]
+        hint = sample.get('hint')
+        prompts = []
+        prompt = 'Context:{} Question:{} Choices:{}'.format(
+            hint, question, choices)
+        prompt += ' Answer with a single character:'
+        prompts.append(self.shot_prompt + prompt)
+        return prompts
diff --git a/opencompass/multimodal/models/otter/__init__.py b/opencompass/multimodal/models/otter/__init__.py
index 892442f9..e60056df 100644
--- a/opencompass/multimodal/models/otter/__init__.py
+++ b/opencompass/multimodal/models/otter/__init__.py
@@ -9,3 +9,11 @@ if TYPE_CHECKING:
         raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
     pass
+
+from .otter import Otter
+from .post_processor import OTTERMMBenchPostProcessor
+from .prompt_constructor import OTTERMMBenchPromptConstructor
+
+__all__ = [
+    'Otter', 'OTTERMMBenchPromptConstructor', 'OTTERMMBenchPostProcessor'
+]
diff --git a/opencompass/multimodal/models/otter/otter.py b/opencompass/multimodal/models/otter/otter.py
index 091ce5d0..ced2ba09 100644
--- a/opencompass/multimodal/models/otter/otter.py
+++ b/opencompass/multimodal/models/otter/otter.py
@@ -1,11 +1,12 @@
+import importlib
+
 import mmengine
 import torch
 import torch.nn as nn
+from mmengine.device import get_device

 from opencompass.registry import MM_MODELS

-from .Otter.models.otter.modeling_otter import OtterForConditionalGeneration
-

 @MM_MODELS.register_module('otter-9b')
 class Otter(nn.Module):
@@ -19,14 +20,20 @@ class Otter(nn.Module):
         model_path (str): The path of OTTER model
             in Huggingface model hub format.
         load_bit (str): The bit of OTTER model, can be "fp32" or "bf16".
+        mode (str): The mode of inference. Defaults to 'generation'.
""" - def __init__(self, model_path, load_bit, prompt_constructor, - post_processor) -> None: + def __init__(self, + model_path, + load_bit, + prompt_constructor, + post_processor, + mode='generation') -> None: super().__init__() torch_dtype = torch.bfloat16 if load_bit == 'bf16' else torch.float32 - self.model = OtterForConditionalGeneration.from_pretrained( - model_path, torch_dtype=torch_dtype) + otter_ai = importlib.import_module('otter_ai') + self.model = otter_ai.OtterForConditionalGeneration.from_pretrained( + model_path, torch_dtype=torch_dtype, device_map=get_device()) self.tokenizer = self.model.text_tokenizer self.tokenizer.padding_side = 'left' self.model_dtype = next(self.model.parameters()).dtype @@ -35,6 +42,7 @@ class Otter(nn.Module): if post_processor is not None: self.post_processor = mmengine.registry.build_from_cfg( post_processor, MM_MODELS) + self.mode = mode def forward(self, batch): if self.mode == 'generation': diff --git a/opencompass/multimodal/models/otter/prompt_constructor.py b/opencompass/multimodal/models/otter/prompt_constructor.py index 9e7b3cda..7d16582e 100644 --- a/opencompass/multimodal/models/otter/prompt_constructor.py +++ b/opencompass/multimodal/models/otter/prompt_constructor.py @@ -53,9 +53,9 @@ class OTTERMMBenchPromptConstructor: context = data_sample.get('context') # e.g. User: What is the color of the sky? A: Blue B: Red C: Green D: Yellow GPT: # noqa if context is not None: - prompt = f'{self.image_token}{self.user_label} {context[i]} {question[i]} {options[i]} {self.model_label}:{self.reply_token}' # noqa + prompt = f'{self.image_token}{self.user_label} {context} {question} {options} {self.model_label}:{self.reply_token}' # noqa else: - prompt = f'{self.image_token}{self.user_label} {question[i]} {options[i]} {self.model_label}:{self.reply_token}' # noqa + prompt = f'{self.image_token}{self.user_label} {question} {options} {self.model_label}:{self.reply_token}' # noqa return prompt diff --git a/opencompass/multimodal/models/qwen/__init__.py b/opencompass/multimodal/models/qwen/__init__.py index 94f33b6b..1677731b 100644 --- a/opencompass/multimodal/models/qwen/__init__.py +++ b/opencompass/multimodal/models/qwen/__init__.py @@ -1,8 +1,13 @@ -from .post_processor import QwenVLBasePostProcessor -from .prompt_constructor import QwenVLMMBenchPromptConstructor +from .post_processor import QwenVLBasePostProcessor, QwenVLChatVSRPostProcessor +from .prompt_constructor import (QwenVLChatPromptConstructor, + QwenVLChatScienceQAPromptConstructor, + QwenVLChatVQAPromptConstructor, + QwenVLMMBenchPromptConstructor) from .qwen import QwenVLBase, QwenVLChat __all__ = [ 'QwenVLBase', 'QwenVLChat', 'QwenVLBasePostProcessor', - 'QwenVLMMBenchPromptConstructor' + 'QwenVLMMBenchPromptConstructor', 'QwenVLChatPromptConstructor', + 'QwenVLChatVQAPromptConstructor', 'QwenVLChatVSRPostProcessor', + 'QwenVLChatScienceQAPromptConstructor' ] diff --git a/opencompass/multimodal/models/qwen/post_processor.py b/opencompass/multimodal/models/qwen/post_processor.py index 4382622f..e6b5525f 100644 --- a/opencompass/multimodal/models/qwen/post_processor.py +++ b/opencompass/multimodal/models/qwen/post_processor.py @@ -14,3 +14,18 @@ class QwenVLBasePostProcessor: response = self.tokenizer.decode(pred)[input_len:] response = response.replace('<|endoftext|>', '').strip() return response + + +class QwenVLChatVSRPostProcessor: + """VSR post processor for Qwen-VL-Chat.""" + + def __init__(self) -> None: + pass + + def __call__(self, response: str) -> str: + if 'yes' in 
+            return 'yes'
+        elif 'no' in response.lower():
+            return 'no'
+        else:
+            return 'unknown'
diff --git a/opencompass/multimodal/models/qwen/prompt_constructor.py b/opencompass/multimodal/models/qwen/prompt_constructor.py
index 476e1958..aa06ff26 100644
--- a/opencompass/multimodal/models/qwen/prompt_constructor.py
+++ b/opencompass/multimodal/models/qwen/prompt_constructor.py
@@ -7,7 +7,7 @@ class QwenVLMMBenchPromptConstructor:
     def __init__(self) -> None:
         pass

-    def __call__(self, inputs: dict) -> str:
+    def __call__(self, inputs: dict) -> list:
         data_samples = inputs['data_samples']
         assert len(data_samples) == 1
         data_sample = data_samples[0]
@@ -27,3 +27,74 @@ class QwenVLMMBenchPromptConstructor:
             },
         ]
         return format_input
+
+
+class QwenVLChatPromptConstructor:
+    """Prompt constructor for Qwen-VL-Chat."""
+
+    def __init__(self, prompt='') -> None:
+        self.prompt = prompt
+
+    def __call__(self, inputs: dict) -> list:
+        assert len(inputs['data_samples']) == 1
+        format_input = [
+            {
+                'image': 'This_is_path_to_an_image.'
+            },  # Just placeholder for Image Tokens
+            {
+                'text': self.prompt
+            },
+        ]
+        return format_input
+
+
+class QwenVLChatVQAPromptConstructor:
+    """VQA prompt constructor for Qwen-VL-Chat."""
+
+    def __init__(self, prompt='') -> None:
+        self.prompt = prompt
+
+    def __call__(self, inputs: dict) -> list:
+        data_samples = inputs['data_samples']
+        assert len(data_samples) == 1
+        data_sample = data_samples[0]
+        question = data_sample.get('question')
+        format_input = [
+            {
+                'image': 'This_is_path_to_an_image.'
+            },  # Just placeholder for Image Tokens
+            {
+                'text': question + self.prompt
+            },
+        ]
+        return format_input
+
+
+class QwenVLChatScienceQAPromptConstructor:
+    """ScienceQA prompt constructor for Qwen-VL-Chat."""
+    choice_mapping = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F'}
+
+    def __init__(self, prompt='') -> None:
+        self.prompt = prompt
+
+    def __call__(self, inputs: dict) -> list:
+        data_samples = inputs['data_samples']
+        assert len(data_samples) == 1
+        data_sample = data_samples[0]
+        question = data_sample.get('question')
+        choices = data_sample.get('choices')
+        choices = [
+            f'({self.choice_mapping[i]}) ' + item
+            for i, item in enumerate(choices)
+        ]
+        choices = 'Choices: ' + ' '.join(choices) + '\n'
+        contexts = 'Context: ' + data_sample.get('hint')
+        format_input = [
+            {
+                'image': 'This_is_path_to_an_image.'
+            },  # Just placeholder for Image Tokens
+            {
+                'text': contexts + question + choices + self.prompt
+            },
+        ]
+        return format_input
diff --git a/opencompass/multimodal/models/qwen/qwen.py b/opencompass/multimodal/models/qwen/qwen.py
index 9682b5c9..5c75ed4a 100644
--- a/opencompass/multimodal/models/qwen/qwen.py
+++ b/opencompass/multimodal/models/qwen/qwen.py
@@ -55,6 +55,8 @@ class QwenVLBase(nn.Module):
         if post_processor is not None:
             self.post_processor = mmengine.registry.build_from_cfg(
                 post_processor, MM_MODELS)
+        else:
+            self.post_processor = None
         self.is_caption_task = is_caption_task
         self.model.transformer.forward = types.MethodType(
             forward_hack, self.model.transformer)
@@ -154,6 +156,9 @@ class QwenVLChat(QwenVLBase):
             verbose=False,
             errors='replace')

+        if self.post_processor:
+            response = self.post_processor(response)
+
         data_sample = batch['data_samples'][0]
         if self.is_caption_task:
             data_sample.pred_caption = response
diff --git a/opencompass/multimodal/models/visualglm/prompt_constructor.py b/opencompass/multimodal/models/visualglm/prompt_constructor.py
index ea644c85..a10b7d77 100644
--- a/opencompass/multimodal/models/visualglm/prompt_constructor.py
+++ b/opencompass/multimodal/models/visualglm/prompt_constructor.py
@@ -81,9 +81,7 @@ class VisualGLMBasePromptConstructor:
         data_samples = batch.pop('data_samples')

         # generate text prompt
-        img_prompt = '<img></img>'
-        prompt = img_prompt + self.prompt
-        image_position = prompt.rfind('<img>') + 5
+        prompt = ['<img></img>' + self.prompt for i in range(images.shape[0])]
+        image_position = 5
diff --git a/opencompass/multimodal/models/visualglm/visualglm.py b/opencompass/multimodal/models/visualglm/visualglm.py
index 1bb99853..7187b97e 100644
--- a/opencompass/multimodal/models/visualglm/visualglm.py
+++ b/opencompass/multimodal/models/visualglm/visualglm.py
@@ -43,7 +43,14 @@ class VisualGLM(nn.Module):
         if gen_kwargs:
             self.gen_kwargs = gen_kwargs
         else:
-            self.gen_kwargs = dict()
+            self.gen_kwargs = dict(
+                max_new_tokens=30,
+                num_beams=1,
+                do_sample=False,
+                repetition_penalty=1.0,
+                length_penalty=-1.0,
+            )
+
         self.is_caption_task = is_caption_task

     def encode_by_tokenizer(self, multi_prompts, image_position):
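The snippet below is not part of the diff; it is a minimal sketch of how the new OpenFlamingo VQA prompt constructor and VSR post processor introduced above can be exercised in isolation. It assumes `opencompass` (with these changes) and `mmpretrain` are installed and that both classes are importable from `opencompass.multimodal.models.openflamingo`, as the updated `__all__` suggests; the plain dict is only a stand-in for the mmpretrain `DataSample` objects the real pipeline passes in, since the constructor only calls `.get()` on each sample.

# Hypothetical smoke test, not shipped with this PR.
from opencompass.multimodal.models.openflamingo import (
    OpenFlamingoVQAPromptConstructor, OpenFlamingoVSRPostProcessor)

# Build the VQA prompt for a single query; the default shot prompt defined
# in prompt_constructor.py is prepended to 'Question:... Short Answer:'.
constructor = OpenFlamingoVQAPromptConstructor()
prompts = constructor([{'question': 'Is the cat under the table?'}])
print(prompts[0])  # -> '<default shot prompt>Question:Is the cat under the table? Short Answer:'

# Map a free-form generation to the VSR label space ('yes'/'no'/'unknown').
post_processor = OpenFlamingoVSRPostProcessor()
print(post_processor('Yes, it is.'))     # -> 'yes'
print(post_processor('It is unclear.'))  # -> 'unknown'

In an actual evaluation run these objects are not instantiated directly but built from the config via `mmengine.registry.build_from_cfg(..., MM_MODELS)`, exactly as `OpenFlamingoInferencer.__init__` does above.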