Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Feature] Add multi-prompt generation demo (#568)

* [Feature] Add multi-prompt generation demo
* [Fix] change form in winogrande_gen_XXX.py
* [Fix] make the multi-prompt demo more direct
* [Fix] fix bug
* [Fix] minor fix

Co-authored-by: yingfhu <yingfhu@gmail.com>

parent 91fba2c2e9
commit 5e75e29711
configs/datasets/winogrande/winogrande_gen_a027b6.py (new file, 49 lines)
@@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset_V2
from opencompass.utils.text_postprocessors import first_option_postprocess

winogrande_reader_cfg = dict(
    input_columns=["opt1", "opt2"],
    output_column="answer",
)

winogrande_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
)

_winogrande_prompt = dict(
    prompt_1="Which of the following is a good sentence:\nA. {opt1}\nB. {opt2}\nAnswer:",
    prompt_2="Which is a good sentence out of the following:\nA. {opt1}\nB. {opt2}\nAnswer:",
    prompt_3="Can you identify a good sentence from the following:\nA. {opt1}\nB. {opt2}\nAnswer:",
)

winogrande_datasets = []
for _choice in _winogrande_prompt:
    winogrande_datasets.append(
        dict(
            abbr='winogrande_'+_choice,
            type=winograndeDataset_V2,
            path="./data/winogrande",
            reader_cfg=winogrande_reader_cfg,
            infer_cfg=dict(
                prompt_template=dict(
                    type=PromptTemplate,
                    template=dict(round=[
                        dict(
                            role="HUMAN",
                            prompt=_winogrande_prompt[_choice]
                        ),
                    ]),
                ),
                retriever=dict(type=ZeroRetriever),
                inferencer=dict(type=GenInferencer),
            ),
            eval_cfg=winogrande_eval_cfg),
    )

del _choice
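The loop above stamps out one dataset config per entry in _winogrande_prompt, so the three variants differ only in prompt text and abbreviation; reusing a single reader_cfg and eval_cfg keeps them comparable, so any score spread is attributable to the prompt alone. A minimal standalone sketch (not part of the commit) of the names this produces:

# Standalone sketch: iterating the prompt dict yields its keys in
# insertion order, so the generated abbreviations are predictable.
_winogrande_prompt = dict(prompt_1="...", prompt_2="...", prompt_3="...")
abbrs = ['winogrande_' + _choice for _choice in _winogrande_prompt]
print(abbrs)  # ['winogrande_prompt_1', 'winogrande_prompt_2', 'winogrande_prompt_3']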
configs/eval_multi_prompt_demo.py (new file, 48 lines)
@@ -0,0 +1,48 @@
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM


with read_base():
    from .datasets.winogrande.winogrande_gen_a027b6 import winogrande_datasets

datasets = [*winogrande_datasets]

_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
        dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
    ],
)

models=[
    dict(
        type=HuggingFaceCausalLM,
        abbr='internlm-chat-7b-hf',
        path="internlm/internlm-chat-7b",
        tokenizer_path='internlm/internlm-chat-7b',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
            trust_remote_code=True,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        meta_template=_meta_template,
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto',
        ),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]

_winogrande_all = [d['abbr'] for d in winogrande_datasets]

summarizer = dict(
    summary_groups=[
        {'name': 'winogrande', 'subsets': _winogrande_all},
        {'name': 'winogrande_std', 'subsets': _winogrande_all, 'std': True},
    ]
)
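Both summary groups cover the same three subsets: the first reports their plain average, while the second, with 'std': True, triggers the standard-deviation branch added to DefaultSummarizer below. As a rough illustration (not OpenCompass internals), the meta template frames each generated prompt roughly like this:

# Rough illustration only: how the meta template above frames one round
# of dialogue for the chat model.
prompt = "Which of the following is a good sentence:\nA. ...\nB. ...\nAnswer:"
rendered = '<|User|>:' + prompt + '<eoh>\n' + '<|Bot|>:'
# The model generates after '<|Bot|>:' and stops at '<eoa>'.

Assuming the repository's usual entry point, the demo would then be launched with something like: python run.py configs/eval_multi_prompt_demo.py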
@@ -1,6 +1,7 @@
 # flake8: noqa
 # yapf: disable
 import getpass
+import math
 import os.path as osp
 from datetime import datetime
 from typing import List, Optional
@@ -127,21 +128,28 @@ class DefaultSummarizer:
                     results[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][0]
                     eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
                 if len(results) == len(sg['subsets']):
-                    if 'weights' in sg:
-                        numerator = sum(results[k] * sg['weights'][k] for k in sg['weights'])
-                        denominator = sum(sg['weights'].values())
-                        metric = 'weighted_average'
+                    if 'std' in sg and sg['std'] == True:
+                        avg = sum(results[k] for k in results) / len(results)
+                        variance = sum((results[k] - avg)**2 for k in results) / len(results)
+                        metric = 'standard_deviation'
+                        results[metric] = math.sqrt(variance)
                     else:
-                        numerator = sum(results[k] for k in results)
-                        denominator = len(results)
-                        metric = 'naive_average'
-                    results[metric] = numerator / denominator
+                        if 'weights' in sg:
+                            numerator = sum(results[k] * sg['weights'][k] for k in sg['weights'])
+                            denominator = sum(sg['weights'].values())
+                            metric = 'weighted_average'
+                        else:
+                            numerator = sum(results[k] for k in results)
+                            denominator = len(results)
+                            metric = 'naive_average'
+                        results[metric] = numerator / denominator
+
                     eval_modes = list(set(eval_modes))
                     eval_mode = eval_modes[0] if len(eval_modes) == 1 else 'mixed'

                     # add to global results
                     raw_results[model_abbr][sg['name']] = results
-                    parsed_results[model_abbr][sg['name']] = [numerator / denominator]
+                    parsed_results[model_abbr][sg['name']] = [results[metric]]
                     dataset_metrics[sg['name']] = [metric]
                     dataset_eval_mode[sg['name']] = eval_mode
                 elif len(results) == 0:
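When a summary group carries 'std': True, the new branch reports a population standard deviation (dividing by the number of subsets, not n-1) of the per-subset scores instead of an average, which is why import math was added in the first hunk. A standalone sketch with made-up accuracies:

import math

# Made-up per-prompt accuracies, for illustration only.
results = {'winogrande_prompt_1': 72.0,
           'winogrande_prompt_2': 70.5,
           'winogrande_prompt_3': 74.1}

avg = sum(results.values()) / len(results)                          # 'winogrande' group
variance = sum((v - avg) ** 2 for v in results.values()) / len(results)
print(round(avg, 2), round(math.sqrt(variance), 2))                 # -> 72.2 1.48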