From 116a24632c3e583072fb62c618f87402765c60c2 Mon Sep 17 00:00:00 2001
From: Hoter Young
Date: Fri, 24 Jan 2025 10:32:17 +0800
Subject: [PATCH] [Feature] Add OpenHuEval-HuLifeQA (#4)

---
 examples/eval_subjective_hulifeqa.py | 97 +++++++++++++++++++
 .../configs/datasets/OpenHuEval/HuLifeQA.py | 69 +++++++++++++
 .../models/openai/gpt_4o_mini_20240718.py | 20 ++++
 opencompass/partitioners/__init__.py | 1 +
 .../summarizers/subjective/wildbench.py | 16 ++-
 opencompass/utils/run.py | 10 +-
 6 files changed, 207 insertions(+), 6 deletions(-)
 create mode 100644 examples/eval_subjective_hulifeqa.py
 create mode 100644 opencompass/configs/datasets/OpenHuEval/HuLifeQA.py
 create mode 100644 opencompass/configs/models/openai/gpt_4o_mini_20240718.py

diff --git a/examples/eval_subjective_hulifeqa.py b/examples/eval_subjective_hulifeqa.py
new file mode 100644
index 00000000..d03ae4a7
--- /dev/null
+++ b/examples/eval_subjective_hulifeqa.py
@@ -0,0 +1,97 @@
+from mmengine.config import read_base
+
+with read_base():
+    from opencompass.configs.datasets.OpenHuEval.HuLifeQA import (
+        hu_life_qa_datasets,
+        task_group_new,
+    )
+    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
+        models as lmdeploy_internlm2_5_7b_chat_model,
+    )
+    from opencompass.configs.models.openai.gpt_4o_mini_20240718 import (
+        models as gpt_4o_mini_20240718_model,
+    )
+
+from opencompass.models import OpenAI
+from opencompass.partitioners import (
+    NumWorkerPartitioner,
+    SubjectiveNumWorkerPartitioner,
+)
+from opencompass.runners import LocalRunner, SlurmSequentialRunner
+from opencompass.summarizers import WildBenchSingleSummarizer
+from opencompass.tasks import OpenICLInferTask
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+
+api_meta_template = dict(
+    round=[
+        dict(role="SYSTEM", api_role="SYSTEM"),
+        dict(role="HUMAN", api_role="HUMAN"),
+        dict(role="BOT", api_role="BOT", generate=True),
+    ]
+)
+
+models = [
+    *gpt_4o_mini_20240718_model,
+    *lmdeploy_internlm2_5_7b_chat_model,
+]
+
+judge_models = [
+    dict(
+        abbr="GPT-4o-2024-08-06",
+        type=OpenAI,
+        path="gpt-4o-2024-08-06",
+        key="ENV",  # read from $OPENAI_API_KEY
+        meta_template=api_meta_template,
+        query_per_second=16,
+        max_out_len=4096,
+        max_seq_len=4096,
+        batch_size=8,
+        temperature=0,
+    )
+]
+
+for ds in hu_life_qa_datasets:
+    ds.update(
+        dict(
+            mode="singlescore",
+            eval_mode="single",
+        )
+    )
+del ds
+datasets = [*hu_life_qa_datasets]
+del hu_life_qa_datasets
+
+infer = dict(
+    partitioner=dict(
+        type=NumWorkerPartitioner,
+        num_worker=8,
+    ),
+    runner=dict(
+        type=SlurmSequentialRunner,
+        max_num_workers=16,
+        task=dict(type=OpenICLInferTask),
+    ),
+)
+
+eval = dict(
+    partitioner=dict(
+        type=SubjectiveNumWorkerPartitioner,
+        num_worker=8,
+        models=models,
+        judge_models=judge_models,
+    ),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=16,
+        task=dict(type=SubjectiveEvalTask),
+    ),
+)
+
+summarizer = dict(
+    type=WildBenchSingleSummarizer,
+    customized_task_group_new=task_group_new,
+)
+
+work_dir = (
+    "./outputs/" + __file__.split("/")[-1].split(".")[0] + "/"
)  # do NOT modify this line, yapf: disable, pylint: disable
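Note (illustration, not part of the patch): the work_dir expression above derives the output directory from the config file name. A minimal sketch, with a literal path standing in for __file__:

file = "examples/eval_subjective_hulifeqa.py"  # stand-in for __file__
work_dir = "./outputs/" + file.split("/")[-1].split(".")[0] + "/"
print(work_dir)  # ./outputs/eval_subjective_hulifeqa/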
diff --git a/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py b/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py
new file mode 100644
index 00000000..6e207092
--- /dev/null
+++ b/opencompass/configs/datasets/OpenHuEval/HuLifeQA.py
@@ -0,0 +1,69 @@
+from opencompass.datasets import WildBenchDataset
+from opencompass.openicl.icl_evaluator import LMEvaluator
+from opencompass.openicl.icl_inferencer import ChatInferencer
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+
+hu_life_qa_reader_cfg = dict(
+    input_columns=["dialogue", "prompt"],
+    output_column="judge",
+)
+
+data_path = "/mnt/hwfile/opendatalab/yanghaote/share/g13k_hu/g13k_hu_vpaper.jsonl"
+
+hu_life_qa_datasets = []
+hu_life_qa_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template="""{dialogue}""",
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(
+        type=ChatInferencer,
+        max_seq_len=4096,
+        max_out_len=512,
+        infer_mode="last",
+    ),
+)
+
+hu_life_qa_eval_cfg = dict(
+    evaluator=dict(
+        type=LMEvaluator,
+        prompt_template=dict(
+            type=PromptTemplate,
+            template="""{prompt}""",
+        ),
+    ),
+    pred_role="BOT",
+)
+
+hu_life_qa_datasets.append(
+    dict(
+        abbr="hu_life_qa",
+        type=WildBenchDataset,
+        path=data_path,
+        reader_cfg=hu_life_qa_reader_cfg,
+        infer_cfg=hu_life_qa_infer_cfg,
+        eval_cfg=hu_life_qa_eval_cfg,
+    )
+)
+
+task_group_new = {
+    "business and finance": "business and finance",
+    "childbearing and education": "life, culture, and customs",
+    "culture and community": "life, culture, and customs",
+    "culture and customs": "life, culture, and customs",
+    "life, culture, and customs": "life, culture, and customs",
+    "education and profession": "education and profession",
+    "food and drink": "life, culture, and customs",
+    "health": "life, culture, and customs",
+    "holidays": "life, culture, and customs",
+    "home": "life, culture, and customs",
+    "person": "life, culture, and customs",
+    "politics": "politics, policy and law",
+    "politics, policy and law": "politics, policy and law",
+    "public education and courses": "education and profession",
+    "transport": "life, culture, and customs",
+    "science": "life, culture, and customs",
+    "travel": "life, culture, and customs",
+}
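Note (illustration, not part of the patch): task_group_new is consumed by get_capability_results in the wildbench.py change below; each judged answer's primary and secondary tags are remapped to a coarse group before averaging. A minimal standalone sketch with made-up scores and a trimmed copy of the mapping:

from collections import defaultdict

task_group_new = {  # trimmed copy of the mapping above
    "holidays": "life, culture, and customs",
    "travel": "life, culture, and customs",
    "politics": "politics, policy and law",
}
judged = [(8.0, "holidays"), (6.0, "politics"), (7.0, "travel")]  # made-up (score, tag) pairs

ratings, counts = defaultdict(float), defaultdict(float)
for ans, tag in judged:
    ratings[task_group_new[tag]] += ans
    counts[task_group_new[tag]] += 1

print({group: ratings[group] / counts[group] for group in ratings})
# {'life, culture, and customs': 7.5, 'politics, policy and law': 6.0}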
diff --git a/opencompass/configs/models/openai/gpt_4o_mini_20240718.py b/opencompass/configs/models/openai/gpt_4o_mini_20240718.py
new file mode 100644
index 00000000..36bfa610
--- /dev/null
+++ b/opencompass/configs/models/openai/gpt_4o_mini_20240718.py
@@ -0,0 +1,20 @@
+from opencompass.models import OpenAI
+
+api_meta_template = dict(round=[
+    dict(role='HUMAN', api_role='HUMAN'),
+    dict(role='BOT', api_role='BOT', generate=True),
+], )
+
+models = [
+    dict(
+        abbr='GPT-4o-mini-2024-07-18',
+        type=OpenAI,
+        key='ENV',
+        # Read from $OPENAI_API_KEY; you can also paste a literal key here.
+        path='gpt-4o-mini-2024-07-18',
+        meta_template=api_meta_template,
+        query_per_second=1,
+        max_out_len=2048,
+        max_seq_len=4096,
+        batch_size=8),
+]
diff --git a/opencompass/partitioners/__init__.py b/opencompass/partitioners/__init__.py
index ead3704d..8726c3db 100644
--- a/opencompass/partitioners/__init__.py
+++ b/opencompass/partitioners/__init__.py
@@ -1,3 +1,4 @@
 from .naive import *  # noqa: F401, F403
 from .num_worker import *  # noqa: F401, F403
 from .size import *  # noqa: F401, F403
+from .sub_num_worker import *  # noqa: F401, F403
diff --git a/opencompass/summarizers/subjective/wildbench.py b/opencompass/summarizers/subjective/wildbench.py
index 98e58cd8..5d920c47 100644
--- a/opencompass/summarizers/subjective/wildbench.py
+++ b/opencompass/summarizers/subjective/wildbench.py
@@ -65,6 +65,7 @@ def get_capability_results(
     fout,
     fout_flag,
     model_abbr,
+    customized_task_group=task_group_new,
 ):
     capability_ratings = defaultdict(float)
     capability_counts = defaultdict(float)
@@ -75,8 +76,8 @@ def get_capability_results(
         capability_counts['total'] += 1
         tags = [ref['primary_tag']] + ref['secondary_tag']
         for tag in tags:
-            capability_ratings[task_group_new[tag]] += ans
-            capability_counts[task_group_new[tag]] += 1
+            capability_ratings[customized_task_group[tag]] += ans
+            capability_counts[customized_task_group[tag]] += 1
 
     capability_avg_ratings = defaultdict(float)
 
@@ -102,7 +103,7 @@ class WildBenchSingleSummarizer(CompassArenaSummarizer):
     It's expected to be filled out at runtime.
     """
 
-    def __init__(self, config: ConfigDict) -> None:
+    def __init__(self, config: ConfigDict, customized_task_group_new=task_group_new) -> None:
         self.judge_type = 'single'
         self.tasks = []
         self.cfg = config
@@ -110,6 +111,7 @@ class WildBenchSingleSummarizer(CompassArenaSummarizer):
         self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
         self.judge_function = post_process_wildbench_single
+        self.task_group_new = customized_task_group_new
 
     def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
         """Summarize the subjectivity analysis based on evaluation results.
@@ -138,7 +140,13 @@ class WildBenchSingleSummarizer(CompassArenaSummarizer):
                     overall_judged_answers += judged_answers
                     overall_references += references
 
-                get_capability_results(overall_judged_answers, overall_references, fout, fout_flag, show_model_abbr)
+                get_capability_results(
+                    overall_judged_answers,
+                    overall_references,
+                    fout, fout_flag,
+                    show_model_abbr,
+                    self.task_group_new,
+                )
                 fout_flag += 1
             else:
                 print(subdir_path + ' is not exist! please check!')
diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py
index accd3468..b96c57fc 100644
--- a/opencompass/utils/run.py
+++ b/opencompass/utils/run.py
@@ -369,12 +369,18 @@ def fill_infer_cfg(cfg, args):
 
 
 def fill_eval_cfg(cfg, args):
+    # Keep any eval partitioner/task type the config has already set.
+    partitioner_type = cfg.get('eval', {}).get('partitioner', {}).get('type', None)
+    partitioner_type = partitioner_type if partitioner_type else get_config_type(NaivePartitioner)
+    task_type = cfg.get('eval', {}).get('runner', {}).get('task', {}).get('type', None)
+    task_type = task_type if task_type else get_config_type(OpenICLEvalTask)
+
     new_cfg = dict(
-        eval=dict(partitioner=dict(type=get_config_type(NaivePartitioner)),
+        eval=dict(partitioner=dict(type=partitioner_type),
                   runner=dict(
                       max_num_workers=args.max_num_workers,
                       debug=args.debug,
-                      task=dict(type=get_config_type(OpenICLEvalTask)),
+                      task=dict(type=task_type),
                       lark_bot_url=cfg['lark_bot_url'],
                   )))
     if args.slurm:
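Note (illustration, not part of the patch): the fill_eval_cfg change gives precedence to whatever the config already sets, which is what lets eval_subjective_hulifeqa.py keep its SubjectiveNumWorkerPartitioner and SubjectiveEvalTask when run through the CLI. A minimal sketch of the precedence rule, with plain strings standing in for the real config types:

cfg = {'eval': {'partitioner': {'type': 'SubjectiveNumWorkerPartitioner'}}}

partitioner_type = cfg.get('eval', {}).get('partitioner', {}).get('type', None)
partitioner_type = partitioner_type if partitioner_type else 'NaivePartitioner'
task_type = cfg.get('eval', {}).get('runner', {}).get('task', {}).get('type', None)
task_type = task_type if task_type else 'OpenICLEvalTask'

print(partitioner_type)  # 'SubjectiveNumWorkerPartitioner' -- kept from cfg
print(task_type)         # 'OpenICLEvalTask' -- default, since cfg sets none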