From 054e9fa7e53d6aa3ca687bb2367513766ff5eb02 Mon Sep 17 00:00:00 2001
From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com>
Date: Wed, 20 Mar 2024 23:20:41 +0800
Subject: [PATCH] [Feature] add one script for subjective (#993)

* add one script for subjective

* add one script for subjective

* add one script for subjective

* add one script for subjective

---------

Co-authored-by: thebestannie <1290646445@qq.com>
---
 .../subjective/eval_subjective_alignbench.py  | 21 +++++
 .../subjective/eval_subjective_alpacaeval.py  | 20 +++++
 .../eval_subjective_compassarena.py           | 28 ++++++
 configs/subjective/eval_subjective_mtbench.py | 25 ++++++
 configs/subjective/model_cfg.py               | 85 +++++++++++++++++++
 5 files changed, 179 insertions(+)
 create mode 100644 configs/subjective/eval_subjective_alignbench.py
 create mode 100644 configs/subjective/eval_subjective_alpacaeval.py
 create mode 100644 configs/subjective/eval_subjective_compassarena.py
 create mode 100644 configs/subjective/eval_subjective_mtbench.py
 create mode 100644 configs/subjective/model_cfg.py

diff --git a/configs/subjective/eval_subjective_alignbench.py b/configs/subjective/eval_subjective_alignbench.py
new file mode 100644
index 00000000..32a17d29
--- /dev/null
+++ b/configs/subjective/eval_subjective_alignbench.py
@@ -0,0 +1,21 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ..datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets
+    from .model_cfg import models, judge_model, given_pred, infer, gpt4, runner
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
+from opencompass.summarizers import AlignmentBenchSummarizer
+
+# -------------Inference Stage ----------------------------------------
+# For subjective evaluation, we often set do sample for models
+datasets = [*subjective_datasets]
+eval = dict(
+    partitioner=dict(
+        type=SubjectiveNaivePartitioner, mode='singlescore', models=models
+    ),
+    runner=runner,
+)
+
+summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
+work_dir = 'outputs/alignment_bench/'
diff --git a/configs/subjective/eval_subjective_alpacaeval.py b/configs/subjective/eval_subjective_alpacaeval.py
new file mode 100644
index 00000000..6d9a1b88
--- /dev/null
+++ b/configs/subjective/eval_subjective_alpacaeval.py
@@ -0,0 +1,20 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ..datasets.subjective.alpaca_eval.alpacav1_judgeby_gpt4 import subjective_datasets as alpacav1
+    from ..datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
+    from .model_cfg import models, judge_model, given_pred, infer, gpt4, runner
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
+from opencompass.summarizers import AlpacaSummarizer
+datasets = [*alpacav2]
+eval = dict(
+    partitioner=dict(
+        type=SubjectiveSizePartitioner, max_task_size=1000, mode='m2n', base_models=[gpt4], compare_models=models
+    ),
+runner=runner,
+given_pred=given_pred
+)
+work_dir = 'outputs/alpaca/'
+
+summarizer = dict(type=AlpacaSummarizer, judge_type='v2')
diff --git a/configs/subjective/eval_subjective_compassarena.py b/configs/subjective/eval_subjective_compassarena.py
new file mode 100644
index 00000000..82028ba3
--- /dev/null
+++ b/configs/subjective/eval_subjective_compassarena.py
@@ -0,0 +1,28 @@
+from os import getenv as gv
+from opencompass.models import HuggingFaceCausalLM
+from mmengine.config import read_base
+
+with read_base():
+    from ..datasets.subjective.compassarena.compassarena_compare import subjective_datasets
+    from .model_cfg import models, judge_model, given_pred, infer, gpt4, runner
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
+from opencompass.summarizers import CompassArenaSummarizer
+datasets = [*subjective_datasets]
+
+eval = dict(
+    partitioner=dict(
+        type=SubjectiveSizePartitioner,
+        strategy='split',
+        max_task_size=10000,
+        mode='m2n',
+        base_models=[gpt4],
+        compare_models=models,
+    ),
+runner=runner,
+given_pred=given_pred
+)
+
+work_dir = 'outputs/compass_arena/'
+
+summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add')
diff --git a/configs/subjective/eval_subjective_mtbench.py b/configs/subjective/eval_subjective_mtbench.py
new file mode 100644
index 00000000..fe562575
--- /dev/null
+++ b/configs/subjective/eval_subjective_mtbench.py
@@ -0,0 +1,25 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ..datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets
+    # from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets
+    from .model_cfg import models, judge_model, given_pred, infer, gpt4, runner
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
+from opencompass.summarizers import MTBenchSummarizer
+
+datasets = [*subjective_datasets]
+
+for model in models:
+    if 'generation_kwargs' in model:
+        if 'do_sample' in model['generation_kwargs']:
+            del model['generation_kwargs']['do_sample']
+
+eval = dict(
+    partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models),
+    runner=runner
+)
+
+summarizer = dict(type=MTBenchSummarizer, judge_type='single')
+
+work_dir = 'outputs/mtbench/'
diff --git a/configs/subjective/model_cfg.py b/configs/subjective/model_cfg.py
new file mode 100644
index 00000000..522d29fc
--- /dev/null
+++ b/configs/subjective/model_cfg.py
@@ -0,0 +1,85 @@
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
+from opencompass.models.openai_api import OpenAIAllesAPIN
+from opencompass.partitioners import NaivePartitioner, SizePartitioner
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.runners import SlurmSequentialRunner
+from opencompass.tasks import OpenICLInferTask
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ]
+)
+# -------------Inference Stage ----------------------------------------
+# For subjective evaluation, we often set do sample for models
+models = [
+    dict(
+        type=HuggingFaceChatGLM3,
+        abbr='chatglm3-6b-hf',
+        path='THUDM/chatglm3-6b',
+        tokenizer_path='THUDM/chatglm3-6b',
+        model_kwargs=dict(
+            device_map='auto',
+            trust_remote_code=True,
+        ),
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            trust_remote_code=True,
+        ),
+        generation_kwargs=dict(
+            do_sample=True,
+        ),
+        meta_template=api_meta_template,
+        max_out_len=2048,
+        max_seq_len=4096,
+        batch_size=1,
+        run_cfg=dict(num_gpus=1, num_procs=1),
+    )
+]
+
+
+judge_model = dict(
+    abbr='GPT4-Turbo',
+    type=OpenAIAllesAPIN, path='gpt-4-1106-preview',
+    key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
+    url='',
+    meta_template=api_meta_template,
+    query_per_second=1,
+    max_out_len=1024,
+    max_seq_len=4096,
+    batch_size=1,
+    retry=30,
+    temperature = 0
+)
+
+infer = dict(
+    partitioner=dict(type=SizePartitioner, strategy='split', max_task_size=10000),
+    runner=dict(
+        type=SlurmSequentialRunner,
+        partition='llmeval',
+        quotatype='auto',
+        max_num_workers=256,
+        task=dict(type=OpenICLInferTask),
+    ),
+)
+runner=dict(type=LocalRunner, max_num_workers=12, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model))
+
+gpt4 = dict(
+    abbr='gpt4-turbo',
+    type=OpenAIAllesAPIN,
+    path='gpt-4-1106-preview',
+    key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
+    meta_template=api_meta_template,
+    query_per_second=1,
+    max_out_len=2048,
+    max_seq_len=4096,
+    batch_size=4,
+    retry=20,
+    temperature=1,
+)
+given_pred = [{'abbr':'gpt4-turbo', 'path':'your path'}]
+