diff --git a/configs/datasets/winogrande/winogrande_gen_a027b6.py b/configs/datasets/winogrande/winogrande_gen_a027b6.py
new file mode 100644
index 00000000..12561416
--- /dev/null
+++ b/configs/datasets/winogrande/winogrande_gen_a027b6.py
@@ -0,0 +1,49 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import winograndeDataset_V2
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+winogrande_reader_cfg = dict(
+    input_columns=["opt1", "opt2"],
+    output_column="answer",
+)
+
+winogrande_eval_cfg = dict(
+    evaluator=dict(type=AccEvaluator),
+    pred_role="BOT",
+    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
+)
+
+_winogrande_prompt = dict(
+    prompt_1="Which of the following is a good sentence:\nA. {opt1}\nB. {opt2}\nAnswer:",
+    prompt_2="Which is a good sentence out of the following:\nA. {opt1}\nB. {opt2}\nAnswer:",
+    prompt_3="Can you identify a good sentence from the following:\nA. {opt1}\nB. {opt2}\nAnswer:",
+)
+
+winogrande_datasets = []
+for _choice in _winogrande_prompt:
+    winogrande_datasets.append(
+        dict(
+            abbr='winogrande_'+_choice,
+            type=winograndeDataset_V2,
+            path="./data/winogrande",
+            reader_cfg=winogrande_reader_cfg,
+            infer_cfg=dict(
+                prompt_template=dict(
+                    type=PromptTemplate,
+                    template=dict(round=[
+                        dict(
+                            role="HUMAN",
+                            prompt=_winogrande_prompt[_choice]
+                        ),
+                    ]),
+                ),
+                retriever=dict(type=ZeroRetriever),
+                inferencer=dict(type=GenInferencer),
+            ),
+            eval_cfg=winogrande_eval_cfg),
+    )
+
+del _choice
\ No newline at end of file
diff --git a/configs/eval_multi_prompt_demo.py b/configs/eval_multi_prompt_demo.py
new file mode 100644
index 00000000..748b23c1
--- /dev/null
+++ b/configs/eval_multi_prompt_demo.py
@@ -0,0 +1,48 @@
+from mmengine.config import read_base
+from opencompass.models import HuggingFaceCausalLM
+
+
+with read_base():
+    from .datasets.winogrande.winogrande_gen_a027b6 import winogrande_datasets
+
+datasets = [*winogrande_datasets]
+
+_meta_template = dict(
+    round=[
+        dict(role='HUMAN', begin='<|User|>:', end='\n'),
+        dict(role='BOT', begin='<|Bot|>:', end='\n', generate=True),
+    ],
+)
+
+models=[
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='internlm-chat-7b-hf',
+        path="internlm/internlm-chat-7b",
+        tokenizer_path='internlm/internlm-chat-7b',
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            use_fast=False,
+            trust_remote_code=True,
+        ),
+        max_out_len=100,
+        max_seq_len=2048,
+        batch_size=8,
+        meta_template=_meta_template,
+        model_kwargs=dict(
+            trust_remote_code=True,
+            device_map='auto',
+        ),
+        run_cfg=dict(num_gpus=1, num_procs=1),
+    )
+]
+
+_winogrande_all = [d['abbr'] for d in winogrande_datasets]
+
+summarizer = dict(
+    summary_groups=[
+        {'name': 'winogrande', 'subsets': _winogrande_all},
+        {'name': 'winogrande_std', 'subsets': _winogrande_all, 'std': True},
+    ]
+)
\ No newline at end of file
diff --git a/opencompass/summarizers/default.py b/opencompass/summarizers/default.py
index 77b6cdef..aa1acffe 100644
--- a/opencompass/summarizers/default.py
+++ b/opencompass/summarizers/default.py
@@ -1,6 +1,7 @@
 # flake8: noqa
 # yapf: disable
 import getpass
+import math
 import os.path as osp
 from datetime import datetime
 from typing import List, Optional
@@ -127,21 +128,28 @@ class DefaultSummarizer:
                         results[dataset_abbr] = parsed_results[model_abbr][dataset_abbr][0]
                         eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
                 if len(results) == len(sg['subsets']):
-                    if 'weights' in sg:
-                        numerator = sum(results[k] * sg['weights'][k] for k in sg['weights'])
-                        denominator = sum(sg['weights'].values())
-                        metric = 'weighted_average'
+                    if 'std' in sg and sg['std'] == True:
+                        avg = sum(results[k] for k in results) / len(results)
+                        variance = sum((results[k] - avg)**2 for k in results) / len(results)
+                        metric = 'standard_deviation'
+                        results[metric] = math.sqrt(variance)
                     else:
-                        numerator = sum(results[k] for k in results)
-                        denominator = len(results)
-                        metric = 'naive_average'
-                    results[metric] = numerator / denominator
+                        if 'weights' in sg:
+                            numerator = sum(results[k] * sg['weights'][k] for k in sg['weights'])
+                            denominator = sum(sg['weights'].values())
+                            metric = 'weighted_average'
+                        else:
+                            numerator = sum(results[k] for k in results)
+                            denominator = len(results)
+                            metric = 'naive_average'
+                        results[metric] = numerator / denominator
+
                     eval_modes = list(set(eval_modes))
                     eval_mode = eval_modes[0] if len(eval_modes) == 1 else 'mixed'
-                    # add to global results
                     raw_results[model_abbr][sg['name']] = results
-                    parsed_results[model_abbr][sg['name']] = [numerator / denominator]
+                    parsed_results[model_abbr][sg['name']] = [results[metric]]
+                    dataset_metrics[sg['name']] = [metric]
                     dataset_eval_mode[sg['name']] = eval_mode
                 elif len(results) == 0:
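
As a quick illustration of the new `'std': True` summary group, the following is a minimal standalone sketch of the two aggregations `DefaultSummarizer` performs for the groups defined in `eval_multi_prompt_demo.py`, using hypothetical per-prompt accuracies (the real values come from the evaluation run):

```python
import math

# Hypothetical accuracies for the three prompt variants defined in
# winogrande_gen_a027b6.py; actual numbers are produced by the eval run.
results = {
    'winogrande_prompt_1': 72.0,
    'winogrande_prompt_2': 70.5,
    'winogrande_prompt_3': 74.1,
}

# 'winogrande' group (no 'std' key): naive average over the subsets.
naive_average = sum(results.values()) / len(results)

# 'winogrande_std' group ('std': True): population standard deviation over
# the same subsets, mirroring the math.sqrt(variance) branch in the diff.
avg = sum(results.values()) / len(results)
variance = sum((v - avg) ** 2 for v in results.values()) / len(results)
standard_deviation = math.sqrt(variance)

print(f'naive_average = {naive_average:.2f}')             # 72.20
print(f'standard_deviation = {standard_deviation:.2f}')   # 1.48
```

The summary thus reports both the mean winogrande accuracy across the three prompt wordings and how much that accuracy varies with the prompt, which is the point of the multi-prompt demo.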