Mirror of https://github.com/open-compass/opencompass.git
[Feature] Add OpenHuEval-HuLifeQA (#4)
This commit is contained in:
parent 5f72e96d5b
commit 116a24632c
examples/eval_subjective_hulifeqa.py (new file, 97 lines)
@@ -0,0 +1,97 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.OpenHuEval.HuLifeQA import (
        hu_life_qa_datasets,
        task_group_new,
    )
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
        models as lmdeploy_internlm2_5_7b_chat_model,
    )
    from opencompass.configs.models.openai.gpt_4o_mini_20240718 import (
        models as gpt_4o_mini_20240718_model,
    )

from opencompass.models import OpenAI
from opencompass.partitioners import (
    NumWorkerPartitioner,
    SubjectiveNumWorkerPartitioner,
)
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import WildBenchSingleSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

api_meta_template = dict(
    round=[
        dict(role="SYSTEM", api_role="SYSTEM"),
        dict(role="HUMAN", api_role="HUMAN"),
        dict(role="BOT", api_role="BOT", generate=True),
    ]
)

models = [
    *gpt_4o_mini_20240718_model,
    *lmdeploy_internlm2_5_7b_chat_model,
]

judge_models = [
    dict(
        abbr="GPT-4o-2024-08-06",
        type=OpenAI,
        path="gpt-4o-2024-08-06",
        key="ENV",
        meta_template=api_meta_template,
        query_per_second=16,
        max_out_len=4096,
        max_seq_len=4096,
        batch_size=8,
        temperature=0,
    )
]

for ds in hu_life_qa_datasets:
    ds.update(
        dict(
            mode="singlescore",
            eval_mode="single"
        )
    )
del ds
datasets = [*hu_life_qa_datasets]
del hu_life_qa_datasets

infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=8,
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        max_num_workers=16,
        task=dict(type=OpenICLInferTask),
    ),
)

eval = dict(
    partitioner=dict(
        type=SubjectiveNumWorkerPartitioner,
        num_worker=8,
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=SubjectiveEvalTask)
    ),
)

summarizer = dict(
    type=WildBenchSingleSummarizer,
    customized_task_group_new=task_group_new,
)

work_dir = (
    "./outputs/" + __file__.split("/")[-1].split(".")[0] + "/"
)  # do NOT modify this line, yapf: disable, pylint: disable
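Note: a config like the one above is normally launched from the OpenCompass repository root with the standard entry point, for example python run.py examples/eval_subjective_hulifeqa.py (exact invocation depends on your environment and the Slurm setup assumed by SlurmSequentialRunner). The trailing work_dir expression only derives the output directory from the config file name; a minimal, self-contained sketch of that string manipulation, with the path literal standing in for __file__:

# Sketch only: cfg_file stands in for __file__ when this config is loaded.
cfg_file = "examples/eval_subjective_hulifeqa.py"
work_dir = "./outputs/" + cfg_file.split("/")[-1].split(".")[0] + "/"
print(work_dir)  # -> ./outputs/eval_subjective_hulifeqa/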

opencompass/configs/datasets/OpenHuEval/HuLifeQA.py (new file, 69 lines)
@@ -0,0 +1,69 @@
from opencompass.datasets import WildBenchDataset
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_inferencer import ChatInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever

hu_life_qa_reader_cfg = dict(
    input_columns=["dialogue", "prompt"],
    output_column="judge",
)

data_path = "/mnt/hwfile/opendatalab/yanghaote/share/g13k_hu/g13k_hu_vpaper.jsonl"

hu_life_qa_datasets = []
hu_life_qa_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template="""{dialogue}"""
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(
        type=ChatInferencer,
        max_seq_len=4096,
        max_out_len=512,
        infer_mode="last",
    ),
)

hu_life_qa_eval_cfg = dict(
    evaluator=dict(
        type=LMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template="""{prompt}"""
        ),
    ),
    pred_role="BOT",
)

hu_life_qa_datasets.append(
    dict(
        abbr="hu_life_qa",
        type=WildBenchDataset,
        path=data_path,
        reader_cfg=hu_life_qa_reader_cfg,
        infer_cfg=hu_life_qa_infer_cfg,
        eval_cfg=hu_life_qa_eval_cfg,
    )
)

task_group_new = {
    "business and finance": "business and finance",
    "childbearing and education": "life, culture, and customs",
    "culture and community": "life, culture, and customs",
    "culture and customs": "life, culture, and customs",
    "life, culture, and customs": "life, culture, and customs",
    "education and profession": "education and profession",
    "food and drink": "life, culture, and customs",
    "health": "life, culture, and customs",
    "holidays": "life, culture, and customs",
    "home": "life, culture, and customs",
    "person": "life, culture, and customs",
    "politics": "politics, policy and law",
    "politics, policy and law": "politics, policy and law",
    "public education and courses": "education and profession",
    "transport": "life, culture, and customs",
    "science": "life, culture, and customs",
    "travel": "life, culture, and customs",
}
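Note: task_group_new folds the dataset's fine-grained topic tags into four broad categories (business and finance; life, culture, and customs; education and profession; politics, policy and law). A small, self-contained sketch of how a tagged sample is bucketed during aggregation, reusing the task_group_new dict above; the tags and rating are hypothetical, and the loop mirrors get_capability_results shown further below:

from collections import defaultdict

sample_tags = ["holidays", "transport", "politics"]  # hypothetical tags
sample_rating = 7.0                                  # hypothetical judge rating

capability_ratings = defaultdict(float)
capability_counts = defaultdict(float)
for tag in sample_tags:
    capability_ratings[task_group_new[tag]] += sample_rating
    capability_counts[task_group_new[tag]] += 1
# "holidays" and "transport" both land in "life, culture, and customs", so that
# bucket accumulates two ratings; "politics" lands in "politics, policy and law".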

opencompass/configs/models/openai/gpt_4o_mini_20240718.py (new file, 20 lines)
@@ -0,0 +1,20 @@
from opencompass.models import OpenAI

api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

models = [
    dict(
        abbr='GPT-4o-mini-2024-07-18',
        type=OpenAI,
        path='gpt-4o-mini-2024-07-18',
        key='ENV',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
        meta_template=api_meta_template,
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=4096,
        batch_size=8),
]
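Note: with key='ENV' the key is read from the $OPENAI_API_KEY environment variable, as the inline comment says. A tiny pre-flight check (a sketch, not part of OpenCompass) that can be run before launching the evaluation:

import os

# Fail fast if the key is missing; with key='ENV' the OpenAI wrapper expects it
# in the environment rather than in the config file.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError("Set OPENAI_API_KEY before running the GPT-4o-mini config.")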

@@ -1,3 +1,4 @@
 from .naive import *  # noqa: F401, F403
 from .num_worker import *  # noqa: F401, F403
 from .size import *  # noqa: F401, F403
+from .sub_num_worker import *  # noqa: F401, F403

@@ -65,6 +65,7 @@ def get_capability_results(
     fout,
     fout_flag,
     model_abbr,
+    customized_task_group=task_group_new,
 ):
     capability_ratings = defaultdict(float)
     capability_counts = defaultdict(float)
@@ -75,8 +76,8 @@ def get_capability_results(
         capability_counts['total'] += 1
         tags = [ref['primary_tag']] + ref['secondary_tag']
         for tag in tags:
-            capability_ratings[task_group_new[tag]] += ans
-            capability_counts[task_group_new[tag]] += 1
+            capability_ratings[customized_task_group[tag]] += ans
+            capability_counts[customized_task_group[tag]] += 1

     capability_avg_ratings = defaultdict(float)

@@ -102,7 +103,7 @@ class WildBenchSingleSummarizer(CompassArenaSummarizer):
     It's expected to be filled out at runtime.
     """

-    def __init__(self, config: ConfigDict) -> None:
+    def __init__(self, config: ConfigDict, customized_task_group_new=task_group_new) -> None:
         self.judge_type = 'single'
         self.tasks = []
         self.cfg = config
@@ -110,6 +111,7 @@ class WildBenchSingleSummarizer(CompassArenaSummarizer):
         self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
         self.judge_function = post_process_wildbench_single
+        self.task_group_new = customized_task_group_new

     def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
         """Summarize the subjectivity analysis based on evaluation results.
@@ -138,7 +140,13 @@ class WildBenchSingleSummarizer(CompassArenaSummarizer):
                     overall_judged_answers += judged_answers
                     overall_references += references

-                get_capability_results(overall_judged_answers, overall_references, fout, fout_flag, show_model_abbr)
+                get_capability_results(
+                    overall_judged_answers,
+                    overall_references,
+                    fout, fout_flag,
+                    show_model_abbr,
+                    self.task_group_new,
+                )
                 fout_flag += 1
             else:
                 print(subdir_path + ' is not exist! please check!')
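Note: taken together, the summarizer changes above thread the tag-group mapping from the config into the aggregation: the new __init__ argument is stored as self.task_group_new, and summarize forwards it to get_capability_results instead of relying on the module-level default. A dependency-free sketch of that flow; all names here are stand-ins, not OpenCompass classes:

from collections import defaultdict

def aggregate(answers, references, task_group):
    # Bucket each rating under its broad category, as get_capability_results does.
    totals = defaultdict(float)
    for ans, ref in zip(answers, references):
        totals[task_group[ref["primary_tag"]]] += ans
    return dict(totals)

class TinySummarizer:
    def __init__(self, customized_task_group_new):
        self.task_group_new = customized_task_group_new  # stored, as in the patch

    def summarize(self, answers, references):
        # Forwarded explicitly, mirroring the new get_capability_results call.
        return aggregate(answers, references, self.task_group_new)

print(TinySummarizer({"holidays": "life, culture, and customs"})
      .summarize([8.0], [{"primary_tag": "holidays"}]))
# -> {'life, culture, and customs': 8.0}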

@@ -369,12 +369,18 @@ def fill_infer_cfg(cfg, args):


 def fill_eval_cfg(cfg, args):
+    # Avoid overwriting a subjective partitioner and task type that were already configured.
+    partitioner_type = cfg.get("eval", {}).get("partitioner", {}).get("type", None)
+    partitioner_type = partitioner_type if partitioner_type else get_config_type(NaivePartitioner)
+    task_type = cfg.get("eval", {}).get("runner", {}).get("task", {}).get("type", None)
+    task_type = task_type if task_type else get_config_type(OpenICLEvalTask)
+
     new_cfg = dict(
-        eval=dict(partitioner=dict(type=get_config_type(NaivePartitioner)),
+        eval=dict(partitioner=dict(type=partitioner_type),
                   runner=dict(
                       max_num_workers=args.max_num_workers,
                       debug=args.debug,
-                      task=dict(type=get_config_type(OpenICLEvalTask)),
+                      task=dict(type=task_type),
                       lark_bot_url=cfg['lark_bot_url'],
                   )))
     if args.slurm:
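Note: the fill_eval_cfg change keeps an eval partitioner and task type that the user already configured (for example SubjectiveNumWorkerPartitioner and SubjectiveEvalTask in the example config above) instead of unconditionally overwriting them with NaivePartitioner and OpenICLEvalTask. A self-contained sketch of the fallback pattern, with plain strings standing in for the real config types:

# cfg stands in for the loaded OpenCompass config; strings replace the actual
# partitioner classes purely for illustration.
cfg = {"eval": {"partitioner": {"type": "SubjectiveNumWorkerPartitioner"}}}
DEFAULT_PARTITIONER = "NaivePartitioner"

partitioner_type = cfg.get("eval", {}).get("partitioner", {}).get("type", None)
partitioner_type = partitioner_type if partitioner_type else DEFAULT_PARTITIONER
print(partitioner_type)  # -> SubjectiveNumWorkerPartitioner (the configured value wins)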