[Feature] Add OpenHuEval-HuLifeQA (#4)

Hoter Young 2025-01-24 10:32:17 +08:00 committed by GitHub
parent 5f72e96d5b
commit 116a24632c
6 changed files with 207 additions and 6 deletions

View File

@@ -0,0 +1,97 @@
from mmengine.config import read_base
with read_base():
from opencompass.configs.datasets.OpenHuEval.HuLifeQA import (
hu_life_qa_datasets,
task_group_new,
)
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
models as lmdeploy_internlm2_5_7b_chat_model,
)
from opencompass.configs.models.openai.gpt_4o_mini_20240718 import (
models as gpt_4o_mini_20240718_model,
)
from opencompass.models import OpenAI
from opencompass.partitioners import (
NumWorkerPartitioner,
SubjectiveNumWorkerPartitioner,
)
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import WildBenchSingleSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(
round=[
dict(role="SYSTEM", api_role="SYSTEM"),
dict(role="HUMAN", api_role="HUMAN"),
dict(role="BOT", api_role="BOT", generate=True),
]
)
models = [
*gpt_4o_mini_20240718_model,
*lmdeploy_internlm2_5_7b_chat_model,
]
judge_models = [
dict(
abbr="GPT-4o-2024-08-06",
type=OpenAI,
path="gpt-4o-2024-08-06",
key="ENV",
meta_template=api_meta_template,
query_per_second=16,
max_out_len=4096,
max_seq_len=4096,
batch_size=8,
temperature=0,
)
]
for ds in hu_life_qa_datasets:
ds.update(
dict(
mode="singlescore",
eval_mode="single"
)
)
del ds
datasets = [*hu_life_qa_datasets]
del hu_life_qa_datasets
infer = dict(
partitioner=dict(
type=NumWorkerPartitioner,
num_worker=8,
),
runner=dict(
type=SlurmSequentialRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask),
),
)
eval = dict(
partitioner=dict(
type=SubjectiveNumWorkerPartitioner,
num_worker=8,
models=models,
judge_models=judge_models,
),
runner=dict(
type=LocalRunner,
max_num_workers=16,
task=dict(type=SubjectiveEvalTask)
),
)
summarizer = dict(
type=WildBenchSingleSummarizer,
customized_task_group_new=task_group_new,
)
work_dir = (
"./outputs/" + __file__.split("/")[-1].split(".")[0] + "/"
) # do NOT modify this line, yapf: disable, pylint: disable
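For reference, the work_dir expression above only derives the output directory from the config file's own name; a minimal sketch of how it resolves (the config filename below is illustrative, not necessarily the one used in this commit):

# Minimal sketch: how the work_dir expression resolves; the filename is hypothetical.
config_file = "configs/eval_OpenHuEval_HuLifeQA.py"
work_dir = "./outputs/" + config_file.split("/")[-1].split(".")[0] + "/"
print(work_dir)  # -> ./outputs/eval_OpenHuEval_HuLifeQA/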

View File

@@ -0,0 +1,69 @@
from opencompass.datasets import WildBenchDataset
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_inferencer import ChatInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
hu_life_qa_reader_cfg = dict(
input_columns=["dialogue", "prompt"],
output_column="judge",
)
data_path = "/mnt/hwfile/opendatalab/yanghaote/share/g13k_hu/g13k_hu_vpaper.jsonl"
hu_life_qa_datasets = []
hu_life_qa_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}"""
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(
type=ChatInferencer,
max_seq_len=4096,
max_out_len=512,
infer_mode="last",
),
)
hu_life_qa_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template="""{prompt}"""
),
),
pred_role="BOT",
)
hu_life_qa_datasets.append(
dict(
abbr="hu_life_qa",
type=WildBenchDataset,
path=data_path,
reader_cfg=hu_life_qa_reader_cfg,
infer_cfg=hu_life_qa_infer_cfg,
eval_cfg=hu_life_qa_eval_cfg,
)
)
task_group_new = {
"business and finance": "business and finance",
"childbearing and education": "life, culture, and customs",
"culture and community": "life, culture, and customs",
"culture and customs": "life, culture, and customs",
"life, culture, and customs": "life, culture, and customs",
"education and profession": "education and profession",
"food and drink": "life, culture, and customs",
"health": "life, culture, and customs",
"holidays": "life, culture, and customs",
"home": "life, culture, and customs",
"person": "life, culture, and customs",
"politics": "politics, policy and law",
"politics, policy and law": "politics, policy and law",
"public education and courses": "education and profession",
"transport": "life, culture, and customs",
"science": "life, culture, and customs",
"travel": "life, culture, and customs",
}
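The task_group_new mapping above folds the fine-grained topic tags into four reporting categories (business and finance; life, culture, and customs; education and profession; politics, policy and law). A minimal sketch of the lookup the summarizer performs, using an illustrative reference record that mirrors the primary_tag/secondary_tag shape read in the summarizer change below:

# Minimal sketch: collapsing a sample's tags into reporting categories.
# The reference record here is illustrative; the lookup mirrors get_capability_results.
ref = {"primary_tag": "holidays", "secondary_tag": ["food and drink"]}
tags = [ref["primary_tag"]] + ref["secondary_tag"]
print([task_group_new[tag] for tag in tags])
# -> ['life, culture, and customs', 'life, culture, and customs']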

View File

@@ -0,0 +1,20 @@
from opencompass.models import OpenAI
api_meta_template = dict(round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
], )
models = [
dict(
abbr='GPT-4o-mini-2024-07-18',
type=OpenAI,
path='gpt-4o-mini-2024-07-18',
key='ENV',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=2048,
max_seq_len=4096,
batch_size=8),
]
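Because key='ENV', the actual key never lives in this file; a minimal sketch of supplying it through the environment before a run (the value shown is a placeholder):

# Minimal sketch: key='ENV' makes the OpenAI wrapper read $OPENAI_API_KEY,
# so set it before launching; the value here is a placeholder, not a real key.
import os
os.environ.setdefault("OPENAI_API_KEY", "sk-your-key-here")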

View File

@@ -1,3 +1,4 @@
from .naive import * # noqa: F401, F403
from .num_worker import * # noqa: F401, F403
from .size import * # noqa: F401, F403
from .sub_num_worker import * # noqa: F401, F403
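This re-export is what lets the eval config above pull the subjective partitioner from the package root; the import it enables:

# With sub_num_worker re-exported, the import used in the eval config above resolves.
from opencompass.partitioners import SubjectiveNumWorkerPartitioner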

View File

@@ -65,6 +65,7 @@ def get_capability_results(
fout,
fout_flag,
model_abbr,
customized_task_group=task_group_new,
):
capability_ratings = defaultdict(float)
capability_counts = defaultdict(float)
@@ -75,8 +76,8 @@
capability_counts['total'] += 1
tags = [ref['primary_tag']] + ref['secondary_tag']
for tag in tags:
capability_ratings[task_group_new[tag]] += ans
capability_counts[task_group_new[tag]] += 1
capability_ratings[customized_task_group[tag]] += ans
capability_counts[customized_task_group[tag]] += 1
capability_avg_ratings = defaultdict(float)
@@ -102,7 +103,7 @@ class WildBenchSingleSummarizer(CompassArenaSummarizer):
It's expected to be filled out at runtime.
"""
def __init__(self, config: ConfigDict) -> None:
def __init__(self, config: ConfigDict, customized_task_group_new=task_group_new) -> None:
self.judge_type = 'single'
self.tasks = []
self.cfg = config
@@ -110,6 +111,7 @@ class WildBenchSingleSummarizer(CompassArenaSummarizer):
self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
self.judge_function = post_process_wildbench_single
self.task_group_new = customized_task_group_new
def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
"""Summarize the subjectivity analysis based on evaluation results.
@@ -138,7 +140,13 @@ class WildBenchSingleSummarizer(CompassArenaSummarizer):
overall_judged_answers += judged_answers
overall_references += references
get_capability_results(overall_judged_answers, overall_references, fout, fout_flag, show_model_abbr)
get_capability_results(
overall_judged_answers,
overall_references,
fout, fout_flag,
show_model_abbr,
self.task_group_new,
)
fout_flag += 1
else:
print(subdir_path + ' does not exist! Please check!')
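With customized_task_group_new exposed on the constructor, a config can swap in its own grouping when the summarizer is built; a hypothetical sketch (the grouping dict and its category names are made up for illustration, not part of this commit):

# Hypothetical sketch: overriding the tag-to-category grouping from a config file.
my_task_groups = {
    "holidays": "culture",
    "politics": "governance",
}
summarizer = dict(
    type=WildBenchSingleSummarizer,
    customized_task_group_new=my_task_groups,
)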

View File

@@ -369,12 +369,18 @@ def fill_infer_cfg(cfg, args):
def fill_eval_cfg(cfg, args):
# Avoid overwriting a subjective partitioner and task type that are already configured.
partitioner_type = cfg.get("eval", {}).get("partitioner", {}).get("type", None)
partitioner_type = partitioner_type if partitioner_type else get_config_type(NaivePartitioner)
task_type = cfg.get("eval", {}).get("runner", {}).get("task", {}).get("type", None)
task_type = task_type if task_type else get_config_type(OpenICLEvalTask)
new_cfg = dict(
eval=dict(partitioner=dict(type=get_config_type(NaivePartitioner)),
eval=dict(partitioner=dict(type=partitioner_type),
runner=dict(
max_num_workers=args.max_num_workers,
debug=args.debug,
task=dict(type=get_config_type(OpenICLEvalTask)),
task=dict(type=task_type),
lark_bot_url=cfg['lark_bot_url'],
)))
if args.slurm:
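The fill_eval_cfg change turns the old overwrite into a fall-back; a minimal sketch of the lookup with an illustrative config (strings stand in for the resolved config types):

# Minimal sketch: a partitioner already configured under eval is kept,
# and the naive default is only used when nothing is configured.
cfg = dict(eval=dict(partitioner=dict(type="SubjectiveNumWorkerPartitioner")))
partitioner_type = cfg.get("eval", {}).get("partitioner", {}).get("type", None)
partitioner_type = partitioner_type if partitioner_type else "NaivePartitioner"
print(partitioner_type)  # -> SubjectiveNumWorkerPartitioner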