Mirror of https://github.com/open-compass/opencompass.git
[Feature] Add OpenHuEval-HuLifeQA (#4)
This commit is contained in:
parent 5f72e96d5b
commit 116a24632c
examples/eval_subjective_hulifeqa.py (new file, 97 lines)
@@ -0,0 +1,97 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.OpenHuEval.HuLifeQA import (
        hu_life_qa_datasets,
        task_group_new,
    )
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import (
        models as lmdeploy_internlm2_5_7b_chat_model,
    )
    from opencompass.configs.models.openai.gpt_4o_mini_20240718 import (
        models as gpt_4o_mini_20240718_model,
    )

from opencompass.models import OpenAI
from opencompass.partitioners import (
    NumWorkerPartitioner,
    SubjectiveNumWorkerPartitioner,
)
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.summarizers import WildBenchSingleSummarizer
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

api_meta_template = dict(
    round=[
        dict(role="SYSTEM", api_role="SYSTEM"),
        dict(role="HUMAN", api_role="HUMAN"),
        dict(role="BOT", api_role="BOT", generate=True),
    ]
)

models = [
    *gpt_4o_mini_20240718_model,
    *lmdeploy_internlm2_5_7b_chat_model,
]

judge_models = [
    dict(
        abbr="GPT-4o-2024-08-06",
        type=OpenAI,
        path="gpt-4o-2024-08-06",
        key="ENV",
        meta_template=api_meta_template,
        query_per_second=16,
        max_out_len=4096,
        max_seq_len=4096,
        batch_size=8,
        temperature=0,
    )
]

for ds in hu_life_qa_datasets:
    ds.update(
        dict(
            mode="singlescore",
            eval_mode="single"
        )
    )
del ds
datasets = [*hu_life_qa_datasets]
del hu_life_qa_datasets

infer = dict(
    partitioner=dict(
        type=NumWorkerPartitioner,
        num_worker=8,
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        max_num_workers=16,
        task=dict(type=OpenICLInferTask),
    ),
)

eval = dict(
    partitioner=dict(
        type=SubjectiveNumWorkerPartitioner,
        num_worker=8,
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=SubjectiveEvalTask)
    ),
)

summarizer = dict(
    type=WildBenchSingleSummarizer,
    customized_task_group_new=task_group_new,
)

work_dir = (
    "./outputs/" + __file__.split("/")[-1].split(".")[0] + "/"
)  # do NOT modify this line, yapf: disable, pylint: disable
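Note: a config like the one above is normally launched from the OpenCompass repository root with the standard entry point, for example python run.py examples/eval_subjective_hulifeqa.py (exact invocation depends on your environment and the Slurm setup assumed by SlurmSequentialRunner). The trailing work_dir expression only derives the output directory from the config file name; a minimal, self-contained sketch of that string manipulation, with the path literal standing in for __file__:

# Sketch only: cfg_file stands in for __file__ when this config is loaded.
cfg_file = "examples/eval_subjective_hulifeqa.py"
work_dir = "./outputs/" + cfg_file.split("/")[-1].split(".")[0] + "/"
print(work_dir)  # -> ./outputs/eval_subjective_hulifeqa/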

opencompass/configs/datasets/OpenHuEval/HuLifeQA.py (new file, 69 lines)
@@ -0,0 +1,69 @@
from opencompass.datasets import WildBenchDataset
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_inferencer import ChatInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever

hu_life_qa_reader_cfg = dict(
    input_columns=["dialogue", "prompt"],
    output_column="judge",
)

data_path = "/mnt/hwfile/opendatalab/yanghaote/share/g13k_hu/g13k_hu_vpaper.jsonl"

hu_life_qa_datasets = []
hu_life_qa_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template="""{dialogue}"""
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(
        type=ChatInferencer,
        max_seq_len=4096,
        max_out_len=512,
        infer_mode="last",
    ),
)

hu_life_qa_eval_cfg = dict(
    evaluator=dict(
        type=LMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template="""{prompt}"""
        ),
    ),
    pred_role="BOT",
)

hu_life_qa_datasets.append(
    dict(
        abbr="hu_life_qa",
        type=WildBenchDataset,
        path=data_path,
        reader_cfg=hu_life_qa_reader_cfg,
        infer_cfg=hu_life_qa_infer_cfg,
        eval_cfg=hu_life_qa_eval_cfg,
    )
)

task_group_new = {
    "business and finance": "business and finance",
    "childbearing and education": "life, culture, and customs",
    "culture and community": "life, culture, and customs",
    "culture and customs": "life, culture, and customs",
    "life, culture, and customs": "life, culture, and customs",
    "education and profession": "education and profession",
    "food and drink": "life, culture, and customs",
    "health": "life, culture, and customs",
    "holidays": "life, culture, and customs",
    "home": "life, culture, and customs",
    "person": "life, culture, and customs",
    "politics": "politics, policy and law",
    "politics, policy and law": "politics, policy and law",
    "public education and courses": "education and profession",
    "transport": "life, culture, and customs",
    "science": "life, culture, and customs",
    "travel": "life, culture, and customs",
}
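Note: task_group_new folds the dataset's fine-grained topic tags into four broad categories (business and finance; life, culture, and customs; education and profession; politics, policy and law). A small, self-contained sketch of how a tagged sample is bucketed during aggregation, reusing the task_group_new dict above; the tags and rating are hypothetical, and the loop mirrors get_capability_results shown further below:

from collections import defaultdict

sample_tags = ["holidays", "transport", "politics"]  # hypothetical tags
sample_rating = 7.0                                  # hypothetical judge rating

capability_ratings = defaultdict(float)
capability_counts = defaultdict(float)
for tag in sample_tags:
    capability_ratings[task_group_new[tag]] += sample_rating
    capability_counts[task_group_new[tag]] += 1
# "holidays" and "transport" both land in "life, culture, and customs", so that
# bucket accumulates two ratings; "politics" lands in "politics, policy and law".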

opencompass/configs/models/openai/gpt_4o_mini_20240718.py (new file, 20 lines)
@@ -0,0 +1,20 @@
from opencompass.models import OpenAI

api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

models = [
    dict(
        abbr='GPT-4o-mini-2024-07-18',
        type=OpenAI,
        path='gpt-4o-mini-2024-07-18',
        key='ENV',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
        meta_template=api_meta_template,
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=4096,
        batch_size=8),
]
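Note: with key='ENV' the key is read from the $OPENAI_API_KEY environment variable, as the inline comment says. A tiny pre-flight check (a sketch, not part of OpenCompass) that can be run before launching the evaluation:

import os

# Fail fast if the key is missing; with key='ENV' the OpenAI wrapper expects it
# in the environment rather than in the config file.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError("Set OPENAI_API_KEY before running the GPT-4o-mini config.")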

@@ -1,3 +1,4 @@
 from .naive import *  # noqa: F401, F403
 from .num_worker import *  # noqa: F401, F403
 from .size import *  # noqa: F401, F403
+from .sub_num_worker import *  # noqa: F401, F403

@@ -65,6 +65,7 @@ def get_capability_results(
     fout,
     fout_flag,
     model_abbr,
+    customized_task_group=task_group_new,
 ):
     capability_ratings = defaultdict(float)
     capability_counts = defaultdict(float)
@@ -75,8 +76,8 @@ def get_capability_results(
         capability_counts['total'] += 1
         tags = [ref['primary_tag']] + ref['secondary_tag']
         for tag in tags:
-            capability_ratings[task_group_new[tag]] += ans
-            capability_counts[task_group_new[tag]] += 1
+            capability_ratings[customized_task_group[tag]] += ans
+            capability_counts[customized_task_group[tag]] += 1

     capability_avg_ratings = defaultdict(float)

@@ -102,7 +103,7 @@ class WildBenchSingleSummarizer(CompassArenaSummarizer):
     It's expected to be filled out at runtime.
     """

-    def __init__(self, config: ConfigDict) -> None:
+    def __init__(self, config: ConfigDict, customized_task_group_new=task_group_new) -> None:
         self.judge_type = 'single'
         self.tasks = []
         self.cfg = config
@@ -110,6 +111,7 @@ class WildBenchSingleSummarizer(CompassArenaSummarizer):
         self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
         self.judge_function = post_process_wildbench_single
+        self.task_group_new = customized_task_group_new

     def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
         """Summarize the subjectivity analysis based on evaluation results.
@@ -138,7 +140,13 @@ class WildBenchSingleSummarizer(CompassArenaSummarizer):
                     overall_judged_answers += judged_answers
                     overall_references += references

-                get_capability_results(overall_judged_answers, overall_references, fout, fout_flag, show_model_abbr)
+                get_capability_results(
+                    overall_judged_answers,
+                    overall_references,
+                    fout, fout_flag,
+                    show_model_abbr,
+                    self.task_group_new,
+                )
                 fout_flag += 1
             else:
                 print(subdir_path + ' is not exist! please check!')
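Note: taken together, the summarizer changes above thread the tag-group mapping from the config into the aggregation: the new __init__ argument is stored as self.task_group_new, and summarize forwards it to get_capability_results instead of relying on the module-level default. A dependency-free sketch of that flow; all names here are stand-ins, not OpenCompass classes:

from collections import defaultdict

def aggregate(answers, references, task_group):
    # Bucket each rating under its broad category, as get_capability_results does.
    totals = defaultdict(float)
    for ans, ref in zip(answers, references):
        totals[task_group[ref["primary_tag"]]] += ans
    return dict(totals)

class TinySummarizer:
    def __init__(self, customized_task_group_new):
        self.task_group_new = customized_task_group_new  # stored, as in the patch

    def summarize(self, answers, references):
        # Forwarded explicitly, mirroring the new get_capability_results call.
        return aggregate(answers, references, self.task_group_new)

print(TinySummarizer({"holidays": "life, culture, and customs"})
      .summarize([8.0], [{"primary_tag": "holidays"}]))
# -> {'life, culture, and customs': 8.0}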

@@ -369,12 +369,18 @@ def fill_infer_cfg(cfg, args):


 def fill_eval_cfg(cfg, args):
+    # Avoid overwriting a subjective partitioner and task type that were already configured.
+    partitioner_type = cfg.get("eval", {}).get("partitioner", {}).get("type", None)
+    partitioner_type = partitioner_type if partitioner_type else get_config_type(NaivePartitioner)
+    task_type = cfg.get("eval", {}).get("runner", {}).get("task", {}).get("type", None)
+    task_type = task_type if task_type else get_config_type(OpenICLEvalTask)
+
     new_cfg = dict(
-        eval=dict(partitioner=dict(type=get_config_type(NaivePartitioner)),
+        eval=dict(partitioner=dict(type=partitioner_type),
                   runner=dict(
                       max_num_workers=args.max_num_workers,
                       debug=args.debug,
-                      task=dict(type=get_config_type(OpenICLEvalTask)),
+                      task=dict(type=task_type),
                       lark_bot_url=cfg['lark_bot_url'],
                   )))
     if args.slurm:
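Note: the fill_eval_cfg change keeps an eval partitioner and task type that the user already configured (for example SubjectiveNumWorkerPartitioner and SubjectiveEvalTask in the example config above) instead of unconditionally overwriting them with NaivePartitioner and OpenICLEvalTask. A self-contained sketch of the fallback pattern, with plain strings standing in for the real config types:

# cfg stands in for the loaded OpenCompass config; strings replace the actual
# partitioner classes purely for illustration.
cfg = {"eval": {"partitioner": {"type": "SubjectiveNumWorkerPartitioner"}}}
DEFAULT_PARTITIONER = "NaivePartitioner"

partitioner_type = cfg.get("eval", {}).get("partitioner", {}).get("type", None)
partitioner_type = partitioner_type if partitioner_type else DEFAULT_PARTITIONER
print(partitioner_type)  # -> SubjectiveNumWorkerPartitioner (the configured value wins)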