From 116a24632c3e583072fb62c618f87402765c60c2 Mon Sep 17 00:00:00 2001
From: Hoter Young
Date: Fri, 24 Jan 2025 10:32:17 +0800
Subject: [PATCH] [Feature] Add OpenHuEval-HuLifeQA (#4)

---
 examples/eval_subjective_hulifeqa.py | 97 +++++++++++++++++++
 .../configs/datasets/OpenHuEval/HuLifeQA.py | 69 +++++++++++++
 .../models/openai/gpt_4o_mini_20240718.py | 20 ++++
 opencompass/partitioners/__init__.py | 1 +
 .../summarizers/subjective/wildbench.py | 16 ++-
 opencompass/utils/run.py | 10 +-
 6 files changed, 207 insertions(+), 6 deletions(-)
 create mode 100644 examples/eval_subjective_hulifeqa.py
 create mode 100644 opencompass/configs/datasets/OpenHuEval/HuLifeQA.py
 create mode 100644 opencompass/configs/models/openai/gpt_4o_mini_20240718.py

diff --git a/examples/eval_subjective_hulifeqa.py b/examples/eval_subjective_hulifeqa.py
new file mode 100644
index 00000000..d03ae4a7
--- /dev/null
+++ b/examples/eval_subjective_hulifeqa.py
@@ -0,0 +1,97 @@
+from mmengine.config import read_base
+
+with read_base():
+    from opencompass.configs.datasets.OpenHuEval.HuLifeQA import (
+        hu_life_qa_datasets,
+        task_group_new,
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
+        models as lmdeploy_internlm2_5_7b_chat_model,
+    )
+    from opencompass.configs.models.openai.gpt_4o_mini_20240718 import (
+        models as gpt_4o_mini_20240718_model,
+    )
+
+from opencompass.models import OpenAI
+from opencompass.partitioners import (
+    NumWorkerPartitioner,
+    SubjectiveNumWorkerPartitioner,
+)
+from opencompass.runners import LocalRunner, SlurmSequentialRunner
+from opencompass.summarizers import WildBenchSingleSummarizer
+from opencompass.tasks import OpenICLInferTask
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+
+api_meta_template = dict(
+    round=[
+        dict(role="SYSTEM", api_role="SYSTEM"),
+        dict(role="HUMAN", api_role="HUMAN"),
+        dict(role="BOT", api_role="BOT", generate=True),
+    ]
+)
+
+models = [
+    *gpt_4o_mini_20240718_model,
+    *lmdeploy_internlm2_5_7b_chat_model,
+]
+
+judge_models = [
+    dict(
+        abbr="GPT-4o-2024-08-06",
+        type=OpenAI,
+        path="gpt-4o-2024-08-06",
+        key="ENV",  # read from $OPENAI_API_KEY
+        meta_template=api_meta_template,
+        query_per_second=16,
+        max_out_len=4096,
+        max_seq_len=4096,
+        batch_size=8,
+        temperature=0,
+    )
+]
+
+for ds in hu_life_qa_datasets:
+    ds.update(
+        dict(
+            mode="singlescore",
+            eval_mode="single",
+        )
+    )
+del ds
+datasets = [*hu_life_qa_datasets]
+del hu_life_qa_datasets
+
+infer = dict(
+    partitioner=dict(
+        type=NumWorkerPartitioner,
+        num_worker=8,
+    ),
+    runner=dict(
+        type=SlurmSequentialRunner,
+        max_num_workers=16,
+        task=dict(type=OpenICLInferTask),
+    ),
+)
+
+eval = dict(
+    partitioner=dict(
+        type=SubjectiveNumWorkerPartitioner,
+        num_worker=8,
+        models=models,
+        judge_models=judge_models,
+    ),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=16,
+        task=dict(type=SubjectiveEvalTask),
+    ),
+)
+
+summarizer = dict(
+    type=WildBenchSingleSummarizer,
+    customized_task_group_new=task_group_new,
+)
+
+work_dir = (
+    "./outputs/" + __file__.split("/")[-1].split(".")[0] + "/"
)  # do NOT modify this line, yapf: disable, pylint: disable
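Note (illustration, not part of the patch): the work_dir expression above derives the output directory from the config file name. A minimal sketch, with a literal path standing in for __file__:

file = "examples/eval_subjective_hulifeqa.py"  # stand-in for __file__
work_dir = "./outputs/" + file.split("/")[-1].split(".")[0] + "/"
print(work_dir)  # ./outputs/eval_subjective_hulifeqa/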
diff --git a/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py b/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py
new file mode 100644
index 00000000..6e207092
--- /dev/null
+++ b/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py
@@ -0,0 +1,69 @@
+from opencompass.datasets import WildBenchDataset
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.openicl.icl_inferencer import ChatInferencer
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+
+hu_life_qa_reader_cfg = dict(
+    input_columns=["dialogue", "prompt"],
+    output_column="judge",
+)
+
+data_path = "/mnt/hwfile/opendatalab/yanghaote/share/g13k_hu/g13k_hu_vpaper.jsonl"
+
+hu_life_qa_datasets = []
+hu_life_qa_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template="""{dialogue}""",
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(
+        type=ChatInferencer,
+        max_seq_len=4096,
+        max_out_len=512,
+        infer_mode="last",
+    ),
+)
+
+hu_life_qa_eval_cfg = dict(
+    evaluator=dict(
+        type=LMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template="""{prompt}""",
+        ),
+    ),
+    pred_role="BOT",
+)
+
+hu_life_qa_datasets.append(
+    dict(
+        abbr="hu_life_qa",
+        type=WildBenchDataset,
+        path=data_path,
+        reader_cfg=hu_life_qa_reader_cfg,
+        infer_cfg=hu_life_qa_infer_cfg,
+        eval_cfg=hu_life_qa_eval_cfg,
+    )
+)
+
+task_group_new = {
+    "business and finance": "business and finance",
+    "childbearing and education": "life, culture, and customs",
+    "culture and community": "life, culture, and customs",
+    "culture and customs": "life, culture, and customs",
+    "life, culture, and customs": "life, culture, and customs",
+    "education and profession": "education and profession",
+    "food and drink": "life, culture, and customs",
+    "health": "life, culture, and customs",
+    "holidays": "life, culture, and customs",
+    "home": "life, culture, and customs",
+    "person": "life, culture, and customs",
+    "politics": "politics, policy and law",
+    "politics, policy and law": "politics, policy and law",
+    "public education and courses": "education and profession",
+    "transport": "life, culture, and customs",
+    "science": "life, culture, and customs",
+    "travel": "life, culture, and customs",
+}
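Note (illustration, not part of the patch): task_group_new is consumed by get_capability_results in the wildbench.py change below; each judged answer's primary and secondary tags are remapped to a coarse group before averaging. A minimal standalone sketch with made-up scores and a trimmed copy of the mapping:

from collections import defaultdict

task_group_new = {  # trimmed copy of the mapping above
    "holidays": "life, culture, and customs",
    "travel": "life, culture, and customs",
    "politics": "politics, policy and law",
}
judged = [(8.0, "holidays"), (6.0, "politics"), (7.0, "travel")]  # made-up (score, tag) pairs

ratings, counts = defaultdict(float), defaultdict(float)
for ans, tag in judged:
    ratings[task_group_new[tag]] += ans
    counts[task_group_new[tag]] += 1

print({group: ratings[group] / counts[group] for group in ratings})
# {'life, culture, and customs': 7.5, 'politics, policy and law': 6.0}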
diff --git a/opencompass/configs/models/openai/gpt_4o_mini_20240718.py b/opencompass/configs/models/openai/gpt_4o_mini_20240718.py
new file mode 100644
index 00000000..36bfa610
--- /dev/null
+++ b/opencompass/configs/models/openai/gpt_4o_mini_20240718.py
@@ -0,0 +1,20 @@
+from opencompass.models import OpenAI
+
+api_meta_template = dict(round=[
+    dict(role='HUMAN', api_role='HUMAN'),
+    dict(role='BOT', api_role='BOT', generate=True),
+], )
+
+models = [
+    dict(
+        abbr='GPT-4o-mini-2024-07-18',
+        type=OpenAI,
+        key='ENV',
+        # Read from $OPENAI_API_KEY; you can also paste a literal key here.
+        path='gpt-4o-mini-2024-07-18',
+        meta_template=api_meta_template,
+        query_per_second=1,
+        max_out_len=2048,
+        max_seq_len=4096,
+        batch_size=8),
+]
diff --git a/opencompass/partitioners/__init__.py b/opencompass/partitioners/__init__.py
index ead3704d..8726c3db 100644
--- a/opencompass/partitioners/__init__.py
+++ b/opencompass/partitioners/__init__.py
@@ -1,3 +1,4 @@
 from .naive import *  # noqa: F401, F403
 from .num_worker import *  # noqa: F401, F403
 from .size import *  # noqa: F401, F403
+from .sub_num_worker import *  # noqa: F401, F403
diff --git a/opencompass/summarizers/subjective/wildbench.py b/opencompass/summarizers/subjective/wildbench.py
index 98e58cd8..5d920c47 100644
--- a/opencompass/summarizers/subjective/wildbench.py
+++ b/opencompass/summarizers/subjective/wildbench.py
@@ -65,6 +65,7 @@ def get_capability_results(
     fout,
     fout_flag,
     model_abbr,
+    customized_task_group=task_group_new,
 ):
     capability_ratings = defaultdict(float)
     capability_counts = defaultdict(float)
@@ -75,8 +76,8 @@ def get_capability_results(
         capability_counts['total'] += 1
         tags = [ref['primary_tag']] + ref['secondary_tag']
         for tag in tags:
-            capability_ratings[task_group_new[tag]] += ans
-            capability_counts[task_group_new[tag]] += 1
+            capability_ratings[customized_task_group[tag]] += ans
+            capability_counts[customized_task_group[tag]] += 1
 
     capability_avg_ratings = defaultdict(float)
 
@@ -102,7 +103,7 @@ class WildBenchSingleSummarizer(CompassArenaSummarizer):
     It's expected to be filled out at runtime.
     """
 
-    def __init__(self, config: ConfigDict) -> None:
+    def __init__(self, config: ConfigDict, customized_task_group_new=task_group_new) -> None:
         self.judge_type = 'single'
         self.tasks = []
         self.cfg = config
@@ -110,6 +111,7 @@ class WildBenchSingleSummarizer(CompassArenaSummarizer):
         self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
         self.judge_function = post_process_wildbench_single
+        self.task_group_new = customized_task_group_new
 
     def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
         """Summarize the subjectivity analysis based on evaluation results.
@@ -138,7 +140,13 @@ class WildBenchSingleSummarizer(CompassArenaSummarizer):
                     overall_judged_answers += judged_answers
                     overall_references += references
 
-                get_capability_results(overall_judged_answers, overall_references, fout, fout_flag, show_model_abbr)
+                get_capability_results(
+                    overall_judged_answers,
+                    overall_references,
+                    fout, fout_flag,
+                    show_model_abbr,
+                    self.task_group_new,
+                )
                 fout_flag += 1
             else:
                 print(subdir_path + ' is not exist! please check!')
diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py
index accd3468..b96c57fc 100644
--- a/opencompass/utils/run.py
+++ b/opencompass/utils/run.py
@@ -369,12 +369,18 @@ def fill_infer_cfg(cfg, args):
 
 
 def fill_eval_cfg(cfg, args):
+    # Keep any eval partitioner/task type the config has already set.
+    partitioner_type = cfg.get('eval', {}).get('partitioner', {}).get('type', None)
+    partitioner_type = partitioner_type if partitioner_type else get_config_type(NaivePartitioner)
+    task_type = cfg.get('eval', {}).get('runner', {}).get('task', {}).get('type', None)
+    task_type = task_type if task_type else get_config_type(OpenICLEvalTask)
+
     new_cfg = dict(
-        eval=dict(partitioner=dict(type=get_config_type(NaivePartitioner)),
+        eval=dict(partitioner=dict(type=partitioner_type),
                   runner=dict(
                       max_num_workers=args.max_num_workers,
                       debug=args.debug,
-                      task=dict(type=get_config_type(OpenICLEvalTask)),
+                      task=dict(type=task_type),
                       lark_bot_url=cfg['lark_bot_url'],
                   )))
     if args.slurm:
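Note (illustration, not part of the patch): the fill_eval_cfg change gives precedence to whatever the config already sets, which is what lets eval_subjective_hulifeqa.py keep its SubjectiveNumWorkerPartitioner and SubjectiveEvalTask when run through the CLI. A minimal sketch of the precedence rule, with plain strings standing in for the real config types:

cfg = {'eval': {'partitioner': {'type': 'SubjectiveNumWorkerPartitioner'}}}

partitioner_type = cfg.get('eval', {}).get('partitioner', {}).get('type', None)
partitioner_type = partitioner_type if partitioner_type else 'NaivePartitioner'
task_type = cfg.get('eval', {}).get('runner', {}).get('task', {}).get('type', None)
task_type = task_type if task_type else 'OpenICLEvalTask'

print(partitioner_type)  # 'SubjectiveNumWorkerPartitioner' -- kept from cfg
print(task_type)         # 'OpenICLEvalTask' -- default, since cfg sets none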