Mirror of https://github.com/open-compass/opencompass.git, synced 2025-05-30 16:03:24 +08:00
[Feature] Add multi-prompt generation demo (#568)
* [Feature] Add multi-prompt generation demo
* [Fix] change form in winogrande_gen_XXX.py
* [Fix] make multi-prompt demo more direct
* [Fix] fix bug
* [Fix] minor fix

---------

Co-authored-by: yingfhu <yingfhu@gmail.com>
This commit is contained in:
parent 91fba2c2e9
commit 5e75e29711
configs/datasets/winogrande/winogrande_gen_a027b6.py (new file, 49 lines)
@@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset_V2
from opencompass.utils.text_postprocessors import first_option_postprocess

winogrande_reader_cfg = dict(
    input_columns=["opt1", "opt2"],
    output_column="answer",
)

winogrande_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
)

_winogrande_prompt = dict(
    prompt_1="Which of the following is a good sentence:\nA. {opt1}\nB. {opt2}\nAnswer:",
    prompt_2="Which is a good sentence out of the following:\nA. {opt1}\nB. {opt2}\nAnswer:",
    prompt_3="Can you identify a good sentence from the following:\nA. {opt1}\nB. {opt2}\nAnswer:",
)

winogrande_datasets = []
for _choice in _winogrande_prompt:
    winogrande_datasets.append(
        dict(
            abbr='winogrande_' + _choice,
            type=winograndeDataset_V2,
            path="./data/winogrande",
            reader_cfg=winogrande_reader_cfg,
            infer_cfg=dict(
                prompt_template=dict(
                    type=PromptTemplate,
                    template=dict(round=[
                        dict(
                            role="HUMAN",
                            prompt=_winogrande_prompt[_choice]
                        ),
                    ]),
                ),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=GenInferencer),
            ),
            eval_cfg=winogrande_eval_cfg),
    )

del _choice
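Because iterating a dict yields its keys, the loop above builds one dataset config per prompt variant, each with its own abbreviation. A minimal sketch (not part of the commit, stand-in key names copied from _winogrande_prompt) of the abbreviations it produces:

# Hypothetical check of the abbrs generated by the loop above.
_prompt_keys = ['prompt_1', 'prompt_2', 'prompt_3']  # the keys of _winogrande_prompt
print(['winogrande_' + k for k in _prompt_keys])
# ['winogrande_prompt_1', 'winogrande_prompt_2', 'winogrande_prompt_3']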
configs/eval_multi_prompt_demo.py (new file, 48 lines)
@@ -0,0 +1,48 @@
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM


with read_base():
    from .datasets.winogrande.winogrande_gen_a027b6 import winogrande_datasets

datasets = [*winogrande_datasets]

_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
        dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
    ],
)

models=[
    dict(
        type=HuggingFaceCausalLM,
        abbr='internlm-chat-7b-hf',
        path="internlm/internlm-chat-7b",
        tokenizer_path='internlm/internlm-chat-7b',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
            trust_remote_code=True,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        meta_template=_meta_template,
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]

_winogrande_all = [d['abbr'] for d in winogrande_datasets]

summarizer = dict(
    summary_groups=[
        {'name': 'winogrande', 'subsets': _winogrande_all},
        {'name': 'winogrande_std', 'subsets': _winogrande_all, 'std': True},
    ]
)
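The summarizer defines two aggregate rows over the three prompt variants: 'winogrande' (their naive average) and 'winogrande_std' (their spread, via the new 'std' flag handled in the summarizer change below). Assuming a standard OpenCompass checkout, the demo would typically be launched through the repository's run.py entry point:

python run.py configs/eval_multi_prompt_demo.py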
Changes to the DefaultSummarizer:
@@ -1,6 +1,7 @@
 # flake8: noqa
 # yapf: disable
 import getpass
+import math
 import os.path as osp
 from datetime import datetime
 from typing import List, Optional
@@ -127,6 +128,12 @@ class DefaultSummarizer:
                     results[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][0]
                     eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
                 if len(results) == len(sg['subsets']):
-                    if 'weights' in sg:
-                        numerator = sum(results[k] * sg['weights'][k] for k in sg['weights'])
-                        denominator = sum(sg['weights'].values())
+                    if 'std' in sg and sg['std'] == True:
+                        avg = sum(results[k] for k in results) / len(results)
+                        variance = sum((results[k] - avg)**2 for k in results) / len(results)
+                        metric = 'standard_deviation'
+                        results[metric] = math.sqrt(variance)
+                    else:
+                        if 'weights' in sg:
+                            numerator = sum(results[k] * sg['weights'][k] for k in sg['weights'])
+                            denominator = sum(sg['weights'].values())
@@ -136,12 +143,13 @@ class DefaultSummarizer:
-                        denominator = len(results)
-                    metric = 'naive_average'
-                    results[metric] = numerator / denominator
+                            denominator = len(results)
+                        metric = 'naive_average'
+                        results[metric] = numerator / denominator
+
                 eval_modes = list(set(eval_modes))
                 eval_mode = eval_modes[0] if len(eval_modes) == 1 else 'mixed'

                 # add to global results
                 raw_results[model_abbr][sg['name']] = results
-                parsed_results[model_abbr][sg['name']] = [numerator / denominator]
+                parsed_results[model_abbr][sg['name']] = [results[metric]]
                 dataset_metrics[sg['name']] = [metric]
                 dataset_eval_mode[sg['name']] = eval_mode
             elif len(results) == 0:
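The new 'std' branch reports the population standard deviation of a group's subset scores instead of their average, which is what makes the 'winogrande_std' summary row meaningful across prompt variants. A self-contained sketch of the same computation (the per-prompt accuracies below are made-up illustration values, not results from the commit):

import math

# Made-up accuracies for the three winogrande prompt variants.
results = {'winogrande_prompt_1': 71.2,
           'winogrande_prompt_2': 69.8,
           'winogrande_prompt_3': 70.7}
avg = sum(results[k] for k in results) / len(results)
variance = sum((results[k] - avg)**2 for k in results) / len(results)
print('standard_deviation:', round(math.sqrt(variance), 3))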