From 9b3613f10bbe032c6b85078454058b0502785450 Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Fri, 16 Aug 2024 11:57:41 +0800 Subject: [PATCH] [Update] Support auto-download of FOFO/MT-Bench-101 (#1423) * [Update] Support auto-download of FOFO/MT-Bench-101 * Update wildbench --- .../subjective/fofo/fofo_bilingual_judge.py | 99 +++++++++++++++++++ .../datasets/subjective/fofo/fofo_judge.py | 2 +- .../wildbench/wildbench_pair_judge.py | 12 +-- .../subjective/fofo/fofo_bilingual_judge.py | 99 +++++++++++++++++++ .../datasets/subjective/fofo/fofo_judge.py | 2 +- .../wildbench/wildbench_pair_judge.py | 12 +-- opencompass/datasets/subjective/fofo.py | 2 + opencompass/datasets/subjective/mtbench101.py | 2 + opencompass/datasets/subjective/wildbench.py | 2 + opencompass/utils/datasets_info.py | 12 +++ 10 files changed, 230 insertions(+), 14 deletions(-) create mode 100644 configs/datasets/subjective/fofo/fofo_bilingual_judge.py create mode 100644 opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py diff --git a/configs/datasets/subjective/fofo/fofo_bilingual_judge.py b/configs/datasets/subjective/fofo/fofo_bilingual_judge.py new file mode 100644 index 00000000..ed221204 --- /dev/null +++ b/configs/datasets/subjective/fofo/fofo_bilingual_judge.py @@ -0,0 +1,99 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.datasets import FofoDataset +from opencompass.summarizers import FofoSummarizer +from mmengine.config import read_base + +subjective_reader_cfg = dict( + input_columns=['question'], + output_column='judge', + ) + +subjective_all_sets = [ + 'fofo_test_prompts', 'fofo_test_prompts_cn', +] + +base_prompt = """ +I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose. + +Here is the prompt: +{ + "instruction": "{question}", +} + +Here are the outputs of the models: +[ + { + "model": "model", + "answer": "{prediction}" + }, +] + +Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output: +```json +[ + { + 'model': , + 'format_correctness': , + 'reasons': + } +] +``` + +Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python. 
+""" + +fofo_datasets = [] + +for _name in subjective_all_sets: + subjective_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='{question}' + ), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=4096), + ) + + subjective_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = base_prompt + ), + ]), + ), + ), + pred_role='BOT', + ) + + fofo_datasets.append( + dict( + abbr=f'{_name}', + type=FofoDataset, + path='./data/subjective/fofo', + name=_name, + reader_cfg=subjective_reader_cfg, + infer_cfg=subjective_infer_cfg, + eval_cfg=subjective_eval_cfg, + mode='singlescore', + summarizer = dict(type=FofoSummarizer, judge_type='general') + )) diff --git a/configs/datasets/subjective/fofo/fofo_judge.py b/configs/datasets/subjective/fofo/fofo_judge.py index ed221204..89400892 100644 --- a/configs/datasets/subjective/fofo/fofo_judge.py +++ b/configs/datasets/subjective/fofo/fofo_judge.py @@ -12,7 +12,7 @@ subjective_reader_cfg = dict( ) subjective_all_sets = [ - 'fofo_test_prompts', 'fofo_test_prompts_cn', + 'fofo_test_prompts' ] base_prompt = """ diff --git a/configs/datasets/subjective/wildbench/wildbench_pair_judge.py b/configs/datasets/subjective/wildbench/wildbench_pair_judge.py index 5037ae45..a07b1a7d 100644 --- a/configs/datasets/subjective/wildbench/wildbench_pair_judge.py +++ b/configs/datasets/subjective/wildbench/wildbench_pair_judge.py @@ -11,7 +11,7 @@ subjective_reader_cfg = dict( ) -data_path ='./data/WildBench/wildbench.jsonl' +data_path ='./data/subjective/WildBench/wildbench.jsonl' wildbench_datasets = [] subjective_infer_cfg = dict( @@ -54,11 +54,11 @@ wildbench_datasets.append( reader_cfg=subjective_reader_cfg, infer_cfg=subjective_infer_cfg, eval_cfg=subjective_eval_cfg, - given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/WildBench/gpt4'}, - {'abbr': 'llama-2-70b-chat-hf', 'path':'./data/WildBench/llama2-70b'}, - {'abbr': 'HaiKu', 'path':'./data/WildBench/claude'}, - {'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/WildBench/llama2-70b'}, - {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/WildBench/llama2-70b'}], + given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/subjective/WildBench/gpt4'}, + {'abbr': 'llama-2-70b-chat-hf', 'path':'./data/subjective/WildBench/llama2-70b'}, + {'abbr': 'HaiKu', 'path':'./data/subjective/WildBench/claude'}, + {'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/subjective/WildBench/llama2-70b'}, + {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}], mode='m2n', # m个模型 与 n个模型进行对战 infer_order='random', base_models = [llama_2_70b, gpt4, claude] diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py new file mode 100644 index 00000000..ed221204 --- /dev/null +++ b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py @@ -0,0 +1,99 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.datasets 
import FofoDataset +from opencompass.summarizers import FofoSummarizer +from mmengine.config import read_base + +subjective_reader_cfg = dict( + input_columns=['question'], + output_column='judge', + ) + +subjective_all_sets = [ + 'fofo_test_prompts', 'fofo_test_prompts_cn', +] + +base_prompt = """ +I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose. + +Here is the prompt: +{ + "instruction": "{question}", +} + +Here are the outputs of the models: +[ + { + "model": "model", + "answer": "{prediction}" + }, +] + +Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output: +```json +[ + { + 'model': , + 'format_correctness': , + 'reasons': + } +] +``` + +Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python. +""" + +fofo_datasets = [] + +for _name in subjective_all_sets: + subjective_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt='{question}' + ), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=4096), + ) + + subjective_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = base_prompt + ), + ]), + ), + ), + pred_role='BOT', + ) + + fofo_datasets.append( + dict( + abbr=f'{_name}', + type=FofoDataset, + path='./data/subjective/fofo', + name=_name, + reader_cfg=subjective_reader_cfg, + infer_cfg=subjective_infer_cfg, + eval_cfg=subjective_eval_cfg, + mode='singlescore', + summarizer = dict(type=FofoSummarizer, judge_type='general') + )) diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_judge.py b/opencompass/configs/datasets/subjective/fofo/fofo_judge.py index ed221204..89400892 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_judge.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_judge.py @@ -12,7 +12,7 @@ subjective_reader_cfg = dict( ) subjective_all_sets = [ - 'fofo_test_prompts', 'fofo_test_prompts_cn', + 'fofo_test_prompts' ] base_prompt = """ diff --git a/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge.py b/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge.py index 5037ae45..a07b1a7d 100644 --- a/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge.py +++ b/opencompass/configs/datasets/subjective/wildbench/wildbench_pair_judge.py @@ -11,7 +11,7 @@ 
subjective_reader_cfg = dict( ) -data_path ='./data/WildBench/wildbench.jsonl' +data_path ='./data/subjective/WildBench/wildbench.jsonl' wildbench_datasets = [] subjective_infer_cfg = dict( @@ -54,11 +54,11 @@ wildbench_datasets.append( reader_cfg=subjective_reader_cfg, infer_cfg=subjective_infer_cfg, eval_cfg=subjective_eval_cfg, - given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/WildBench/gpt4'}, - {'abbr': 'llama-2-70b-chat-hf', 'path':'./data/WildBench/llama2-70b'}, - {'abbr': 'HaiKu', 'path':'./data/WildBench/claude'}, - {'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/WildBench/llama2-70b'}, - {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/WildBench/llama2-70b'}], + given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/subjective/WildBench/gpt4'}, + {'abbr': 'llama-2-70b-chat-hf', 'path':'./data/subjective/WildBench/llama2-70b'}, + {'abbr': 'HaiKu', 'path':'./data/subjective/WildBench/claude'}, + {'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/subjective/WildBench/llama2-70b'}, + {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}], mode='m2n', # m个模型 与 n个模型进行对战 infer_order='random', base_models = [llama_2_70b, gpt4, claude] diff --git a/opencompass/datasets/subjective/fofo.py b/opencompass/datasets/subjective/fofo.py index 1eb4e8b9..a1a5f466 100644 --- a/opencompass/datasets/subjective/fofo.py +++ b/opencompass/datasets/subjective/fofo.py @@ -5,6 +5,7 @@ import os.path as osp from datasets import Dataset from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -13,6 +14,7 @@ from ..base import BaseDataset class FofoDataset(BaseDataset): def load(self, path: str, name: str, *args, **kwargs): + path = get_data_path(path, local_mode=True) filename = osp.join(path, f'{name}.json') raw_data = [] with open(filename, 'r', encoding='utf-8') as f: diff --git a/opencompass/datasets/subjective/mtbench101.py b/opencompass/datasets/subjective/mtbench101.py index a6852245..fb4b549f 100644 --- a/opencompass/datasets/subjective/mtbench101.py +++ b/opencompass/datasets/subjective/mtbench101.py @@ -6,6 +6,7 @@ import re from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -260,6 +261,7 @@ class MTBench101Dataset(BaseDataset): import copy filename = osp.join(path, f'{name}.jsonl') + filename = get_data_path(filename, local_mode=True) # filename = osp.join(path, 'mtbench101.jsonl') dataset = DatasetDict() raw_data = [] diff --git a/opencompass/datasets/subjective/wildbench.py b/opencompass/datasets/subjective/wildbench.py index 65d8ec27..0485d129 100644 --- a/opencompass/datasets/subjective/wildbench.py +++ b/opencompass/datasets/subjective/wildbench.py @@ -3,6 +3,7 @@ import json from datasets import Dataset, DatasetDict from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from ..base import BaseDataset @@ -210,6 +211,7 @@ def parse_conversation(conversation): class WildBenchDataset(BaseDataset): def load(self, path: str, K=-1, eval_mode='pair', *args, **kwargs): + path = get_data_path(path, local_mode=True) dataset = DatasetDict() raw_data = [] with open(path, 'r', encoding='utf-8') as file: diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 6bc23705..767a2c4c 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -342,4 +342,16 @@ DATASETS_URL = { 'url': 
'http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench.zip', 'md5': 'd1afc0787aeac7f1f24872742e161069' }, + 'subjective/fofo': { + 'url': 'http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/fofo.zip', + 'md5': '8a302712e425e27e4292a9369df5b9d3' + }, + 'subjective/mtbench101': { + 'url': 'http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench101.zip', + 'md5': '5d80257bc9929ebe5cfbf6d11184b04c', + }, + 'subjective/WildBench': { + 'url': 'http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/wildbench.zip', + 'md5': 'b06252857f1f8f44a17b1bfca4888ff4', + } }
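Note on how the pieces of this patch fit together: the subjective configs now point at `./data/subjective/...`, the FoFo / MT-Bench-101 / WildBench loaders call `get_data_path(path, local_mode=True)`, and `DATASETS_URL` registers a url/md5 pair for each archive. The snippet below is only a minimal sketch of how such an entry could be consumed to fetch a missing dataset on first use; the `resolve_data_path` helper, the cache layout, and the key-matching rule are illustrative assumptions for reviewers, not the actual `opencompass.utils.get_data_path` implementation.

```python
# Illustrative sketch only: resolving a dataset path against a
# DATASETS_URL-style entry when the local copy is missing.
# Helper name, cache layout, and archive structure are assumptions.
import hashlib
import os
import urllib.request
import zipfile

# Mirrors the shape of the entries added to DATASETS_URL in this patch.
DATASETS_URL = {
    'subjective/fofo': {
        'url': 'http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/fofo.zip',
        'md5': '8a302712e425e27e4292a9369df5b9d3',
    },
}


def resolve_data_path(path: str, cache_root: str = './data') -> str:
    """Return a usable local path, downloading the dataset if it is absent."""
    if os.path.exists(path):
        return path  # local copy already present, nothing to do

    # Match the requested path against a registered dataset key.
    key = next((k for k in DATASETS_URL if k in path), None)
    if key is None:
        raise FileNotFoundError(f'{path} not found and no download entry registered')

    info = DATASETS_URL[key]
    archive = os.path.join(cache_root, os.path.basename(info['url']))
    os.makedirs(cache_root, exist_ok=True)
    urllib.request.urlretrieve(info['url'], archive)

    # Verify integrity against the md5 recorded alongside the URL.
    with open(archive, 'rb') as f:
        if hashlib.md5(f.read()).hexdigest() != info['md5']:
            raise ValueError(f'md5 mismatch for {archive}')

    # Assumption: the archive unpacks to <cache_root>/subjective/fofo/...,
    # so the originally requested relative path becomes valid afterwards.
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(cache_root)
    return path


if __name__ == '__main__':
    # The FoFo configs in this patch point at './data/subjective/fofo';
    # with the new DATASETS_URL entry the data can be fetched on first use.
    print(resolve_data_path('./data/subjective/fofo'))
```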