[Update] Support auto-download of FOFO/MT-Bench-101 (#1423)

* [Update] Support auto-download of FOFO/MT-Bench-101

* Update wildbench
Songyang Zhang 2024-08-16 11:57:41 +08:00 committed by GitHub
parent ce7f4853ce
commit 9b3613f10b
10 changed files with 230 additions and 14 deletions


@@ -0,0 +1,99 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
    input_columns=['question'],
    output_column='judge',
)

subjective_all_sets = [
    'fofo_test_prompts', 'fofo_test_prompts_cn',
]
base_prompt = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
fofo_datasets = []
for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt='{question}'
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=4096),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt = base_prompt
                        ),
                    ]),
            ),
        ),
        pred_role='BOT',
    )

    fofo_datasets.append(
        dict(
            abbr=f'{_name}',
            type=FofoDataset,
            path='./data/subjective/fofo',
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
            mode='singlescore',
            summarizer = dict(type=FofoSummarizer, judge_type='general')
        ))
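
For orientation, a minimal sketch of how a dataset config like the one above is typically wired into a top-level OpenCompass config. Only `fofo_datasets` comes from this commit; the import path, model list, and judge settings below are placeholders.

# Hypothetical top-level config consuming `fofo_datasets` (sketch only).
# The module path, `models`, and `judge_models` are placeholders to be
# filled in for a real run.
from mmengine.config import read_base

with read_base():
    # assumed module path for the config above; adjust to your checkout
    from .datasets.subjective.fofo.fofo_bilingual_judge import fofo_datasets

datasets = [*fofo_datasets]

models = [...]        # models under evaluation (placeholder)
judge_models = [...]  # judge model(s) consumed by LMEvaluator (placeholder)

Because each entry sets mode='singlescore', the judge scores every model independently rather than comparing models pairwise.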


@@ -12,7 +12,7 @@ subjective_reader_cfg = dict(
 )
 subjective_all_sets = [
-    'fofo_test_prompts', 'fofo_test_prompts_cn',
+    'fofo_test_prompts'
 ]
 base_prompt = """


@@ -11,7 +11,7 @@ subjective_reader_cfg = dict(
 )
-data_path ='./data/WildBench/wildbench.jsonl'
+data_path ='./data/subjective/WildBench/wildbench.jsonl'
 wildbench_datasets = []
 subjective_infer_cfg = dict(
@@ -54,11 +54,11 @@ wildbench_datasets.append(
         reader_cfg=subjective_reader_cfg,
         infer_cfg=subjective_infer_cfg,
         eval_cfg=subjective_eval_cfg,
-        given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/WildBench/gpt4'},
-                      {'abbr': 'llama-2-70b-chat-hf', 'path':'./data/WildBench/llama2-70b'},
-                      {'abbr': 'HaiKu', 'path':'./data/WildBench/claude'},
-                      {'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/WildBench/llama2-70b'},
-                      {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/WildBench/llama2-70b'}],
+        given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/subjective/WildBench/gpt4'},
+                      {'abbr': 'llama-2-70b-chat-hf', 'path':'./data/subjective/WildBench/llama2-70b'},
+                      {'abbr': 'HaiKu', 'path':'./data/subjective/WildBench/claude'},
+                      {'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/subjective/WildBench/llama2-70b'},
+                      {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
         mode='m2n',  # m models are matched against n models
         infer_order='random',
         base_models = [llama_2_70b, gpt4, claude]


@@ -0,0 +1,99 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
    input_columns=['question'],
    output_column='judge',
)

subjective_all_sets = [
    'fofo_test_prompts', 'fofo_test_prompts_cn',
]
base_prompt = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
fofo_datasets = []
for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt='{question}'
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=4096),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt = base_prompt
                        ),
                    ]),
            ),
        ),
        pred_role='BOT',
    )

    fofo_datasets.append(
        dict(
            abbr=f'{_name}',
            type=FofoDataset,
            path='./data/subjective/fofo',
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg,
            mode='singlescore',
            summarizer = dict(type=FofoSummarizer, judge_type='general')
        ))


@@ -12,7 +12,7 @@ subjective_reader_cfg = dict(
 )
 subjective_all_sets = [
-    'fofo_test_prompts', 'fofo_test_prompts_cn',
+    'fofo_test_prompts'
 ]
 base_prompt = """


@@ -11,7 +11,7 @@ subjective_reader_cfg = dict(
 )
-data_path ='./data/WildBench/wildbench.jsonl'
+data_path ='./data/subjective/WildBench/wildbench.jsonl'
 wildbench_datasets = []
 subjective_infer_cfg = dict(
@@ -54,11 +54,11 @@ wildbench_datasets.append(
         reader_cfg=subjective_reader_cfg,
         infer_cfg=subjective_infer_cfg,
         eval_cfg=subjective_eval_cfg,
-        given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/WildBench/gpt4'},
-                      {'abbr': 'llama-2-70b-chat-hf', 'path':'./data/WildBench/llama2-70b'},
-                      {'abbr': 'HaiKu', 'path':'./data/WildBench/claude'},
-                      {'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/WildBench/llama2-70b'},
-                      {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/WildBench/llama2-70b'}],
+        given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/subjective/WildBench/gpt4'},
+                      {'abbr': 'llama-2-70b-chat-hf', 'path':'./data/subjective/WildBench/llama2-70b'},
+                      {'abbr': 'HaiKu', 'path':'./data/subjective/WildBench/claude'},
+                      {'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/subjective/WildBench/llama2-70b'},
+                      {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
         mode='m2n',  # m models are matched against n models
         infer_order='random',
         base_models = [llama_2_70b, gpt4, claude]


@@ -5,6 +5,7 @@ import os.path as osp
 from datasets import Dataset
 
 from opencompass.registry import LOAD_DATASET
+from opencompass.utils import get_data_path
 
 from ..base import BaseDataset
@@ -13,6 +14,7 @@ from ..base import BaseDataset
 class FofoDataset(BaseDataset):
 
     def load(self, path: str, name: str, *args, **kwargs):
+        path = get_data_path(path, local_mode=True)
         filename = osp.join(path, f'{name}.json')
         raw_data = []
         with open(filename, 'r', encoding='utf-8') as f:


@@ -6,6 +6,7 @@ import re
 from datasets import Dataset, DatasetDict
 
 from opencompass.registry import LOAD_DATASET
+from opencompass.utils import get_data_path
 
 from ..base import BaseDataset
@@ -260,6 +261,7 @@ class MTBench101Dataset(BaseDataset):
         import copy
 
         filename = osp.join(path, f'{name}.jsonl')
+        filename = get_data_path(filename, local_mode=True)
         # filename = osp.join(path, 'mtbench101.jsonl')
         dataset = DatasetDict()
         raw_data = []


@@ -3,6 +3,7 @@ import json
 from datasets import Dataset, DatasetDict
 
 from opencompass.registry import LOAD_DATASET
+from opencompass.utils import get_data_path
 
 from ..base import BaseDataset
@@ -210,6 +211,7 @@ def parse_conversation(conversation):
 class WildBenchDataset(BaseDataset):
 
     def load(self, path: str, K=-1, eval_mode='pair', *args, **kwargs):
+        path = get_data_path(path, local_mode=True)
         dataset = DatasetDict()
         raw_data = []
         with open(path, 'r', encoding='utf-8') as file:


@@ -342,4 +342,16 @@ DATASETS_URL = {
         'url': 'http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench.zip',
         'md5': 'd1afc0787aeac7f1f24872742e161069'
     },
+    'subjective/fofo': {
+        'url': 'http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/fofo.zip',
+        'md5': '8a302712e425e27e4292a9369df5b9d3'
+    },
+    'subjective/mtbench101': {
+        'url': 'http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench101.zip',
+        'md5': '5d80257bc9929ebe5cfbf6d11184b04c',
+    },
+    'subjective/WildBench': {
+        'url': 'http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/wildbench.zip',
+        'md5': 'b06252857f1f8f44a17b1bfca4888ff4',
+    }
 }
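
As an aside, a rough illustration of how a relative dataset path plus one of these DATASETS_URL entries can be resolved to a local directory. The helper name and cache layout below are hypothetical; the actual behaviour is whatever opencompass.utils.get_data_path implements.

# Illustrative sketch only -- not code added by this commit.
import os

def resolve_data_path(path: str, datasets_url: dict, cache_root: str = './data') -> str:
    """Return a usable local path, falling back to the download registry."""
    if os.path.exists(path):
        return path  # a local copy such as './data/subjective/fofo' is used as-is
    for key, info in datasets_url.items():
        if key in path:
            # A real implementation would download info['url'], verify info['md5'],
            # and unpack the archive under cache_root before returning the path.
            print(f"would fetch {info['url']} into {cache_root}")
            return os.path.join(cache_root, key)
    raise FileNotFoundError(f'{path} not found locally and has no DATASETS_URL entry')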