OpenCompass/opencompass/multimodal/datasets/mmbench.py

import base64
import io
from typing import List, Optional

import pandas as pd
from mmengine.dataset import Compose
from PIL import Image
from torch.utils.data import Dataset

from opencompass.registry import DATASETS


def decode_base64_to_image(base64_string: str) -> Image.Image:
    """Convert a base64-encoded string into a Pillow image."""
    image_data = base64.b64decode(base64_string)
    image = Image.open(io.BytesIO(image_data))
    return image
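
# Illustrative round trip (kept as a comment so nothing runs on import):
# MMBench stores each image as a base64 string in its TSV file, so decoding
# an encoded JPEG recovers a Pillow image. 'example.jpg' is a hypothetical
# path used only for this sketch.
#
#     with open('example.jpg', 'rb') as f:
#         b64_string = base64.b64encode(f.read()).decode('utf-8')
#     img = decode_base64_to_image(b64_string)
#     assert isinstance(img, Image.Image)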


@DATASETS.register_module()
class MMBenchDataset(Dataset):
    """Dataset to load the MMBench dataset.

    Args:
        data_file (str): The path of the dataset TSV file.
        pipeline (List[dict]): The data-augmentation pipeline.
        sys_prompt (str): The system prompt prepended to the options.
            Defaults to 'There are several options:'.
    """

    def __init__(self,
                 data_file: str,
                 pipeline: List[dict],
                 sys_prompt: str = 'There are several options:') -> None:
        self.df = pd.read_csv(data_file, sep='\t')
        self.pipeline = Compose(pipeline)
        self.sys_prompt = sys_prompt

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx: int) -> dict:
        # Mandatory Fields Begin
        index = self.df.iloc[idx]['index']
        image = self.df.iloc[idx]['image']
        image = decode_base64_to_image(image)
        question = self.df.iloc[idx]['question']

        # Collect whichever answer options are present for this sample.
        option_candidate = ['A', 'B', 'C', 'D', 'E']
        options = {
            cand: self.load_from_df(idx, cand)
            for cand in option_candidate
            if self.load_from_df(idx, cand) is not None
        }
        options_prompt = f'{self.sys_prompt}\n'
        for key, item in options.items():
            options_prompt += f'{key}. {item}\n'
        # Mandatory Fields End

        # Optional Fields Begin
        hint = self.load_from_df(idx, 'hint')
        category = self.load_from_df(idx, 'category')
        l2_category = self.load_from_df(idx, 'l2-category')
        # Optional Fields End

        data = {
            'img': image,
            'question': question,
            'options': options_prompt,
            'category': category,
            'l2-category': l2_category,
            'options_dict': options,
            'index': index,
            'context': hint,
        }
        data = self.pipeline(data)
        return data

    def load_from_df(self, idx: int, key: str) -> Optional[str]:
        """Return the value of ``key`` in row ``idx``, or None if missing."""
        if key in self.df.iloc[idx] and not pd.isna(self.df.iloc[idx][key]):
            return self.df.iloc[idx][key]
        else:
            return None
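

# A minimal usage sketch (illustrative only). The TSV path and the empty
# pipeline are placeholders; a real config would point at an MMBench TSV
# file and list mmengine transform dicts in ``pipeline``.
if __name__ == '__main__':
    dataset = MMBenchDataset(
        data_file='data/mmbench/mmbench_dev.tsv',  # hypothetical path
        pipeline=[],  # no-op pipeline; real configs list transforms here
    )
    sample = dataset[0]
    print(sample['question'])
    print(sample['options'])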