From 32b5948f4e9dbfc65317bd85775306ba4d6c011b Mon Sep 17 00:00:00 2001 From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> Date: Mon, 5 Feb 2024 15:55:58 +0800 Subject: [PATCH] [Fix] add do sample demo for subjective dataset (#873) * add do sample demo for subjective dataset * fix strings * format --------- Co-authored-by: Leymore --- configs/eval_subjective_alignbench.py | 95 +++++++------- configs/eval_subjective_alpacaeval.py | 106 ++++++++++------ configs/eval_subjective_compassarena.py | 125 +++++++++++-------- configs/eval_subjective_corev2.py | 125 +++++++++++-------- configs/eval_subjective_creationbench.py | 99 ++++++++------- configs/eval_subjective_judge_pandalm.py | 106 ++++++++-------- configs/eval_subjective_mtbench.py | 150 +++++++++++------------ 7 files changed, 446 insertions(+), 360 deletions(-) diff --git a/configs/eval_subjective_alignbench.py b/configs/eval_subjective_alignbench.py index 6a548f43..8f60016b 100644 --- a/configs/eval_subjective_alignbench.py +++ b/configs/eval_subjective_alignbench.py @@ -1,16 +1,7 @@ from mmengine.config import read_base -with read_base(): - from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat - from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat - from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b - from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b - from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b - from .models.judge_llm.auto_j.hf_autoj_eng_13b import models as hf_autoj - from .models.judge_llm.judgelm.hf_judgelm_33b_v1 import models as hf_judgelm - from .models.judge_llm.pandalm.hf_pandalm_7b_v1 import models as hf_pandalm - from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets -datasets = [*subjective_datasets] +with read_base(): + from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 from opencompass.models.openai_api import OpenAIAllesAPIN @@ -23,10 +14,42 @@ from opencompass.tasks import OpenICLInferTask from opencompass.tasks.subjective_eval import SubjectiveEvalTask from opencompass.summarizers import AlignmentBenchSummarizer +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ] +) -# -------------Inferen Stage ---------------------------------------- +# -------------Inference Stage ---------------------------------------- +# For subjective evaluation, we often set do sample for models +models = [ + dict( + type=HuggingFaceChatGLM3, + abbr='chatglm3-6b-hf', + path='THUDM/chatglm3-6b', + tokenizer_path='THUDM/chatglm3-6b', + model_kwargs=dict( + device_map='auto', + trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + generation_kwargs=dict( + do_sample=True, + ), + meta_template=api_meta_template, + max_out_len=2048, + max_seq_len=4096, + batch_size=1, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] -models = [*hf_chatglm3_6b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat] +datasets = [*subjective_datasets] infer = dict( partitioner=dict(type=NaivePartitioner), @@ -35,51 +58,39 @@ infer = dict( partition='llmeval', quotatype='auto', max_num_workers=256, - task=dict(type=OpenICLInferTask)), + task=dict(type=OpenICLInferTask), + ), ) # 
-------------Evalation Stage ---------------------------------------- - ## ------------- JudgeLLM Configuration -api_meta_template = dict( - round=[ - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), - ] -) - judge_model = dict( - abbr='GPT4-Turbo', - type=OpenAIAllesAPIN, path='gpt-4-1106-preview', - key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - url='xxxx', - meta_template=api_meta_template, - query_per_second=16, - max_out_len=2048, - max_seq_len=2048, - batch_size=8, - temperature = 0 + abbr='GPT4-Turbo', + type=OpenAIAllesAPIN, + path='gpt-4-1106-preview', + key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + url='xxxx', + meta_template=api_meta_template, + query_per_second=16, + max_out_len=2048, + max_seq_len=2048, + batch_size=8, + temperature=0, ) ## ------------- Evaluation Configuration eval = dict( partitioner=dict( - type=SubjectiveNaivePartitioner, - mode='singlescore', - models = models + type=SubjectiveNaivePartitioner, mode='singlescore', models=models ), runner=dict( type=LocalRunner, max_num_workers=2, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), + task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model), + ), ) -summarizer = dict( - type=AlignmentBenchSummarizer, judge_type = 'general' -) +summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general') work_dir = 'outputs/alignment_bench/' diff --git a/configs/eval_subjective_alpacaeval.py b/configs/eval_subjective_alpacaeval.py index 42ac5c83..098547b9 100644 --- a/configs/eval_subjective_alpacaeval.py +++ b/configs/eval_subjective_alpacaeval.py @@ -1,16 +1,9 @@ from mmengine.config import read_base + with read_base(): - from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat - from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat - from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b - from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b - from .models.hf_internlm.hf_internlm_chat_7b import models as hf_internlm_chat_7b - from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b from .datasets.subjective.alpaca_eval.alpacav1_judgeby_gpt4 import subjective_datasets as alpacav1 from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2 -datasets = [*alpacav2] - from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 from opencompass.models.openai_api import OpenAI, OpenAIAllesAPIN from opencompass.partitioners import NaivePartitioner, SizePartitioner @@ -22,18 +15,59 @@ from opencompass.tasks import OpenICLInferTask from opencompass.tasks.subjective_eval import SubjectiveEvalTask from opencompass.summarizers import AlpacaSummarizer -models = [*hf_qwen_7b_chat, *hf_chatglm3_6b] - api_meta_template = dict( round=[ dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True) - ], - reserved_roles=[ - dict(role='SYSTEM', api_role='SYSTEM'), + dict(role='BOT', api_role='BOT', generate=True), ], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], ) +# -------------Inference Stage ---------------------------------------- + +# For subjective evaluation, we often set do sample for models +models = [ + dict( + type=HuggingFaceChatGLM3, + abbr='chatglm3-6b-hf', + path='THUDM/chatglm3-6b', + tokenizer_path='THUDM/chatglm3-6b', + model_kwargs=dict( + 
device_map='auto', + trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + generation_kwargs=dict( + do_sample=True, + ), + meta_template=api_meta_template, + max_out_len=2048, + max_seq_len=4096, + batch_size=1, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] + +datasets = [*alpacav2] + +gpt4 = dict( + abbr='gpt4-turbo', + type=OpenAI, + path='gpt-4-1106-preview', + key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=4, + retry=20, + temperature=1, +) # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions + infer = dict( partitioner=dict(type=NaivePartitioner), runner=dict( @@ -41,42 +75,40 @@ infer = dict( partition='llmeval', quotatype='auto', max_num_workers=256, - task=dict(type=OpenICLInferTask)), + task=dict(type=OpenICLInferTask), + ), ) +# -------------Evalation Stage ---------------------------------------- + +## ------------- JudgeLLM Configuration judge_model = dict( - abbr='GPT4-Turbo', - type=OpenAI, path='gpt-4-1106-preview', - key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - meta_template=api_meta_template, - query_per_second=1, - max_out_len=1024, - max_seq_len=4096, - batch_size=2, - retry=20, - temperature = 0 + abbr='GPT4-Turbo', + type=OpenAI, + path='gpt-4-1106-preview', + key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=1024, + max_seq_len=4096, + batch_size=2, + retry=20, + temperature=0, ) +## ------------- Evaluation Configuration eval = dict( partitioner=dict( - type=SubjectiveSizePartitioner, - max_task_size=1000, - mode='m2n', - base_models = [*hf_chatglm3_6b], - compare_models = [*hf_qwen_7b_chat] + type=SubjectiveSizePartitioner, max_task_size=1000, mode='m2n', base_models=[gpt4], compare_models=models ), runner=dict( type=SlurmSequentialRunner, partition='llmeval', quotatype='auto', max_num_workers=256, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), + task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model), + ), ) work_dir = 'outputs/alpaca/' -summarizer = dict( - type=AlpacaSummarizer, judge_type='v2' -) \ No newline at end of file +summarizer = dict(type=AlpacaSummarizer, judge_type='v2') diff --git a/configs/eval_subjective_compassarena.py b/configs/eval_subjective_compassarena.py index 3ac0b86c..58336a5c 100644 --- a/configs/eval_subjective_compassarena.py +++ b/configs/eval_subjective_compassarena.py @@ -1,8 +1,8 @@ from os import getenv as gv from opencompass.models import HuggingFaceCausalLM from mmengine.config import read_base + with read_base(): - from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model from .datasets.subjective.compassarena.compassarena_compare import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI @@ -16,56 +16,85 @@ from opencompass.tasks import OpenICLInferTask from opencompass.tasks.subjective_eval import SubjectiveEvalTask from opencompass.summarizers import CompassArenaSummarizer +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ], + reserved_roles=[dict(role='SYSTEM', 
api_role='SYSTEM')], +) + +# -------------Inference Stage ---------------------------------------- + +# For subjective evaluation, we often set do sample for models +models = [ + dict( + type=HuggingFaceChatGLM3, + abbr='chatglm3-6b-hf', + path='THUDM/chatglm3-6b', + tokenizer_path='THUDM/chatglm3-6b', + model_kwargs=dict( + device_map='auto', + trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + generation_kwargs=dict( + do_sample=True, + ), + meta_template=api_meta_template, + max_out_len=2048, + max_seq_len=4096, + batch_size=1, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] + +datasets = [*subjective_datasets] + +gpt4 = dict( + abbr='gpt4-turbo', + type=OpenAI, + path='gpt-4-1106-preview', + key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=4, + retry=20, + temperature=1, +) # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions + infer = dict( - #partitioner=dict(type=NaivePartitioner), partitioner=dict(type=SizePartitioner, strategy='split', max_task_size=10000), runner=dict( type=SlurmSequentialRunner, partition='llm_dev2', quotatype='auto', max_num_workers=256, - task=dict(type=OpenICLInferTask)), + task=dict(type=OpenICLInferTask), + ), ) -api_meta_template = dict( - round=[ - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), - ] -) - -gpt4 = dict( - abbr='gpt4-turbo', - type=OpenAI, path='gpt-4-1106-preview', - key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - meta_template=api_meta_template, - query_per_second=1, - max_out_len=2048, - max_seq_len=4096, - batch_size=4, - retry=20, - temperature = 1 -) -models = [*chatglm3_6b_32k_model] -datasets = [*subjective_datasets] - - - -work_dir = 'outputs/compass_arena_debug/' - -# -------------Inferen Stage ---------------------------------------- +# -------------Evalation Stage ---------------------------------------- +## ------------- JudgeLLM Configuration judge_model = dict( - abbr='GPT4-Turbo', - type=OpenAI, path='gpt-4-1106-preview', - key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - meta_template=api_meta_template, - query_per_second=1, - max_out_len=1024, - max_seq_len=4096, - batch_size=2, - retry=20, - temperature = 0 + abbr='GPT4-Turbo', + type=OpenAI, + path='gpt-4-1106-preview', + key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=1024, + max_seq_len=4096, + batch_size=2, + retry=20, + temperature=0, ) ## ------------- Evaluation Configuration @@ -75,22 +104,18 @@ eval = dict( strategy='split', max_task_size=10000, mode='m2n', - base_models = [gpt4], - compare_models = [*chatglm3_6b_32k_model] + base_models=[gpt4], + compare_models=models, ), runner=dict( type=SlurmSequentialRunner, partition='llm_dev2', quotatype='auto', max_num_workers=32, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), + task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model), + ), ) +work_dir = 'outputs/compass_arena_debug/' -summarizer = dict( - type=CompassArenaSummarizer, - summary_type='half_add' -) \ No newline at end of file +summarizer = dict(type=CompassArenaSummarizer, 
summary_type='half_add') diff --git a/configs/eval_subjective_corev2.py b/configs/eval_subjective_corev2.py index b16e017b..2ca07b43 100644 --- a/configs/eval_subjective_corev2.py +++ b/configs/eval_subjective_corev2.py @@ -1,16 +1,9 @@ from mmengine.config import read_base + with read_base(): - from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat - from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat - from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b - from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b - from .models.hf_internlm.hf_internlm2_chat_7b import models as internlm2_7b - from .models.hf_internlm.hf_internlm2_chat_20b import models as internlm2_20b from .datasets.subjective.subjective_cmp.subjective_corev2 import subjective_datasets -datasets = [*subjective_datasets] - -from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI +from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI from opencompass.partitioners import NaivePartitioner, SizePartitioner from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner from opencompass.partitioners.sub_size import SubjectiveSizePartitioner @@ -19,18 +12,62 @@ from opencompass.runners import SlurmSequentialRunner from opencompass.tasks import OpenICLInferTask from opencompass.tasks.subjective_eval import SubjectiveEvalTask from opencompass.summarizers import Corev2Summarizer -models = [*internlm2_7b, *internlm2_20b] api_meta_template = dict( round=[ dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True) + dict(role='BOT', api_role='BOT', generate=True), ], reserved_roles=[ dict(role='SYSTEM', api_role='SYSTEM'), ], ) +# -------------Inference Stage ---------------------------------------- + +# For subjective evaluation, we often set do sample for models +models = [ + dict( + type=HuggingFaceChatGLM3, + abbr='chatglm3-6b-hf', + path='THUDM/chatglm3-6b', + tokenizer_path='THUDM/chatglm3-6b', + model_kwargs=dict( + device_map='auto', + trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + generation_kwargs=dict( + do_sample=True, + ), + meta_template=api_meta_template, + max_out_len=2048, + max_seq_len=4096, + batch_size=1, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] + +datasets = [*subjective_datasets] + +gpt4 = dict( + abbr='gpt4-turbo', + type=OpenAI, + path='gpt-4-1106-preview', + key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=4, + retry=20, + temperature=1, +) # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions + infer = dict( partitioner=dict(type=SizePartitioner, max_task_size=500), runner=dict( @@ -38,61 +75,41 @@ infer = dict( partition='llm_dev2', quotatype='auto', max_num_workers=256, - task=dict(type=OpenICLInferTask)), + task=dict(type=OpenICLInferTask), + ), ) +# -------------Evalation Stage ---------------------------------------- -_meta_template = dict( - round=[ - dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'), - dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True), - ], +## ------------- JudgeLLM Configuration +judge_model = dict( + abbr='GPT4-Turbo', + type=OpenAI, + path='gpt-4-1106-preview', + key='', # The key 
will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=1024, + max_seq_len=4096, + batch_size=2, + retry=20, + temperature=0, ) - -judge_model = dict( - type=HuggingFaceCausalLM, - abbr='qwen-7b-chat-hf', - path="Qwen/Qwen-7B-Chat", - tokenizer_path='Qwen/Qwen-7B-Chat', - model_kwargs=dict( - device_map='auto', - trust_remote_code=True - ), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False,), - pad_token_id=151643, - max_out_len=2048, - max_seq_len=2048, - batch_size=8, - meta_template=_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - ) - +## ------------- Evaluation Configuration eval = dict( partitioner=dict( - type=SubjectiveSizePartitioner, - mode='m2n', - max_task_size=500, - base_models = [*internlm2_7b], - compare_models = [*internlm2_20b] + type=SubjectiveSizePartitioner, mode='m2n', max_task_size=500, base_models=[gpt4], compare_models=models ), runner=dict( type=SlurmSequentialRunner, partition='llm_dev2', quotatype='auto', max_num_workers=256, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), + task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model), + ), ) -work_dir = 'outputs/corev2/' -summarizer = dict( - type=Corev2Summarizer, - match_method='smart', -) \ No newline at end of file +summarizer = dict(type=Corev2Summarizer, match_method='smart') + +work_dir = 'outputs/corev2/' diff --git a/configs/eval_subjective_creationbench.py b/configs/eval_subjective_creationbench.py index a871cfd8..52bf7d4b 100644 --- a/configs/eval_subjective_creationbench.py +++ b/configs/eval_subjective_creationbench.py @@ -1,16 +1,7 @@ from mmengine.config import read_base -with read_base(): - from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat - from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat - from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b - from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b - from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b - from .models.judge_llm.auto_j.hf_autoj_eng_13b import models as hf_autoj - from .models.judge_llm.judgelm.hf_judgelm_33b_v1 import models as hf_judgelm - from .models.judge_llm.pandalm.hf_pandalm_7b_v1 import models as hf_pandalm - from .datasets.subjective.creationbench.creationbench_judgeby_gpt4_withref import subjective_datasets -datasets = [*subjective_datasets] +with read_base(): + from .datasets.subjective.creationbench.creationbench_judgeby_gpt4_withref import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 from opencompass.models.openai_api import OpenAIAllesAPIN @@ -23,10 +14,42 @@ from opencompass.tasks import OpenICLInferTask from opencompass.tasks.subjective_eval import SubjectiveEvalTask from opencompass.summarizers import CreationBenchSummarizer +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ] +) -# -------------Inferen Stage ---------------------------------------- +# -------------Inference Stage ---------------------------------------- +# For subjective evaluation, we often set do sample for models +models = [ + dict( + type=HuggingFaceChatGLM3, + abbr='chatglm3-6b-hf', + path='THUDM/chatglm3-6b', + tokenizer_path='THUDM/chatglm3-6b', + model_kwargs=dict( + device_map='auto', + 
trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + generation_kwargs=dict( + do_sample=True, + ), + meta_template=api_meta_template, + max_out_len=2048, + max_seq_len=4096, + batch_size=1, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] -models = [*hf_chatglm3_6b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat] +datasets = [*subjective_datasets] infer = dict( partitioner=dict(type=NaivePartitioner), @@ -35,51 +58,33 @@ infer = dict( partition='llmeval', quotatype='auto', max_num_workers=256, - task=dict(type=OpenICLInferTask)), + task=dict(type=OpenICLInferTask), + ), ) # -------------Evalation Stage ---------------------------------------- - ## ------------- JudgeLLM Configuration -api_meta_template = dict( - round=[ - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), - ] -) - judge_model = dict( - abbr='GPT4-Turbo', - type=OpenAIAllesAPIN, path='gpt-4-1106-preview', - key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - url='xxxx', - meta_template=api_meta_template, - query_per_second=16, - max_out_len=2048, - max_seq_len=2048, - batch_size=8, - temperature = 0 + abbr='GPT4-Turbo', + type=OpenAIAllesAPIN, + path='gpt-4-1106-preview', + key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + url='xxxx', + meta_template=api_meta_template, + query_per_second=16, + max_out_len=2048, + max_seq_len=2048, + batch_size=8, + temperature=0, ) ## ------------- Evaluation Configuration eval = dict( - partitioner=dict( - type=SubjectiveNaivePartitioner, - mode='singlescore', - models = models - ), - runner=dict( - type=LocalRunner, - max_num_workers=2, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), + partitioner=dict(type=SubjectiveNaivePartitioner, mode='singlescore', models=models), + runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)), ) -summarizer = dict( - type=CreationBenchSummarizer, judge_type = 'general' -) +summarizer = dict(type=CreationBenchSummarizer, judge_type='general') work_dir = 'outputs/creationbench/' diff --git a/configs/eval_subjective_judge_pandalm.py b/configs/eval_subjective_judge_pandalm.py index ff7b69de..2fbe7e91 100644 --- a/configs/eval_subjective_judge_pandalm.py +++ b/configs/eval_subjective_judge_pandalm.py @@ -1,13 +1,7 @@ from mmengine.config import read_base -with read_base(): - from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat - from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat - from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b - from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b - from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b - from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets -datasets = [*subjective_datasets] +with read_base(): + from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3 from opencompass.partitioners import NaivePartitioner @@ -18,10 +12,42 @@ from opencompass.tasks import OpenICLInferTask from opencompass.tasks.subjective_eval import SubjectiveEvalTask from opencompass.summarizers import AlignmentBenchSummarizer 
+api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ] +) -# -------------Inferen Stage ---------------------------------------- +# -------------Inference Stage ---------------------------------------- +# For subjective evaluation, we often set do sample for models +models = [ + dict( + type=HuggingFaceChatGLM3, + abbr='chatglm3-6b-hf', + path='THUDM/chatglm3-6b', + tokenizer_path='THUDM/chatglm3-6b', + model_kwargs=dict( + device_map='auto', + trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + generation_kwargs=dict( + do_sample=True, + ), + meta_template=api_meta_template, + max_out_len=2048, + max_seq_len=4096, + batch_size=1, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] -models = [*hf_baichuan2_7b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat] +datasets = [*subjective_datasets] infer = dict( partitioner=dict(type=NaivePartitioner), @@ -30,55 +56,37 @@ infer = dict( partition='llmeval', quotatype='auto', max_num_workers=256, - task=dict(type=OpenICLInferTask)), + task=dict(type=OpenICLInferTask), + ), ) - # -------------Evalation Stage ---------------------------------------- - ## ------------- JudgeLLM Configuration -api_meta_template = dict( - round=[ - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), - ] -) - judge_model = dict( - type=HuggingFaceCausalLM, - abbr='pandalm-7b-v1-hf', - path="WeOpenML/PandaLM-7B-v1", - tokenizer_path='WeOpenML/PandaLM-7B-v1', - tokenizer_kwargs=dict(padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False,), - max_out_len=512, - max_seq_len=2048, - batch_size=8, - model_kwargs=dict(device_map='auto', trust_remote_code=True), - run_cfg=dict(num_gpus=1, num_procs=1), - ) + type=HuggingFaceCausalLM, + abbr='pandalm-7b-v1-hf', + path='WeOpenML/PandaLM-7B-v1', + tokenizer_path='WeOpenML/PandaLM-7B-v1', + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False, + ), + max_out_len=512, + max_seq_len=2048, + batch_size=8, + model_kwargs=dict(device_map='auto', trust_remote_code=True), + run_cfg=dict(num_gpus=1, num_procs=1), +) ## ------------- Evaluation Configuration eval = dict( - partitioner=dict( - type=SubjectiveNaivePartitioner, - mode='singlescore', - models = [*hf_baichuan2_7b] - ), - runner=dict( - type=LocalRunner, - max_num_workers=2, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), + partitioner=dict(type=SubjectiveNaivePartitioner, mode='singlescore', models=models), + runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)), ) -summarizer = dict( - type=AlignmentBenchSummarizer, -) +summarizer = dict(type=AlignmentBenchSummarizer) work_dir = 'outputs/pandalm' diff --git a/configs/eval_subjective_mtbench.py b/configs/eval_subjective_mtbench.py index 575d3974..6ccb5e74 100644 --- a/configs/eval_subjective_mtbench.py +++ b/configs/eval_subjective_mtbench.py @@ -1,17 +1,9 @@ from mmengine.config import read_base -with read_base(): - from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat - from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat - from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b - from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b - from 
.models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b - from .models.judge_llm.auto_j.hf_autoj_eng_13b import models as hf_autoj - from .models.judge_llm.judgelm.hf_judgelm_33b_v1 import models as hf_judgelm - from .models.judge_llm.pandalm.hf_pandalm_7b_v1 import models as hf_pandalm - from .datasets.subjective.multiround.mtbench_single_judge import subjective_datasets - #from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets -datasets = [*subjective_datasets] +with read_base(): + from .datasets.subjective.multiround.mtbench_single_judge import subjective_datasets + + # from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 from opencompass.models.openai_api import OpenAIAllesAPIN @@ -24,24 +16,6 @@ from opencompass.tasks import OpenICLInferTask from opencompass.tasks.subjective_eval import SubjectiveEvalTask from opencompass.summarizers import MTBenchSummarizer - -# -------------Inferen Stage ---------------------------------------- - -models = [*hf_chatglm3_6b, *hf_qwen_7b_chat] -infer = dict( - partitioner=dict(type=SizePartitioner, max_task_size=100), - runner=dict( - type=SlurmSequentialRunner, - partition='llmeval', - quotatype='auto', - max_num_workers=256, - task=dict(type=OpenICLInferTask)), -) - -# -------------Evalation Stage ---------------------------------------- - - -## ------------- JudgeLLM Configuration api_meta_template = dict( round=[ dict(role='HUMAN', api_role='HUMAN'), @@ -49,68 +23,82 @@ api_meta_template = dict( ] ) -judge_model = dict( - abbr='GPT4-Turbo', - type=OpenAIAllesAPIN, path='gpt-4-1106-preview', - key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - url='xxxx', +# -------------Inference Stage ---------------------------------------- +# For subjective evaluation, we often set do sample for models +models = [ + dict( + type=HuggingFaceChatGLM3, + abbr='chatglm3-6b-hf', + path='THUDM/chatglm3-6b', + tokenizer_path='THUDM/chatglm3-6b', + model_kwargs=dict( + device_map='auto', + trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + generation_kwargs=dict( + do_sample=True, + ), meta_template=api_meta_template, - query_per_second=16, max_out_len=2048, - max_seq_len=2048, - batch_size=8, - temperature = 0 + max_seq_len=4096, + batch_size=1, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] + +datasets = [*subjective_datasets] + +infer = dict( + partitioner=dict(type=SizePartitioner, max_task_size=100), + runner=dict( + type=SlurmSequentialRunner, + partition='llmeval', + quotatype='auto', + max_num_workers=256, + task=dict(type=OpenICLInferTask), + ), +) + +# -------------Evalation Stage ---------------------------------------- + +## ------------- JudgeLLM Configuration +judge_model = dict( + abbr='GPT4-Turbo', + type=OpenAIAllesAPIN, + path='gpt-4-0613', + key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + url='xxxx', + meta_template=api_meta_template, + query_per_second=16, + max_out_len=1024, + max_seq_len=2048, + batch_size=8, + temperature=0, ) ## ------------- Evaluation Configuration -''' -## pair evaluation -eval = dict( - partitioner=dict( - type=SubjectiveSizePartitioner, - max_task_size=100, - mode='m2n', - base_models = [*hf_chatglm3_6b, ], - compare_models = models - ), - runner=dict( - 
type=SlurmSequentialRunner, - partition='llmeval', - quotatype='auto', - max_num_workers=32, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), -) +# ## pair evaluation +# eval = dict( +# partitioner=dict( +# type=SubjectiveSizePartitioner, max_task_size=100, mode='m2n', base_models=[gpt4], compare_models=models +# ), +# runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)), +# ) -summarizer = dict( - type=MTBenchSummarizer, judge_type='pair' -) +# summarizer = dict(type=MTBenchSummarizer, judge_type='pair') -''' ## single evaluation eval = dict( - partitioner=dict( - type=SubjectiveSizePartitioner, - max_task_size=100, - mode='singlescore', - models = models - ), - runner=dict( - type=SlurmSequentialRunner, - partition='llmeval', - quotatype='auto', - max_num_workers=32, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), + partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=100, mode='singlescore', models=models), + runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)), ) -summarizer = dict( - type=MTBenchSummarizer, judge_type='single' -) +summarizer = dict(type=MTBenchSummarizer, judge_type='single') work_dir = 'outputs/mtbench/'
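
As a quick, hypothetical illustration of what the new generation_kwargs=dict(do_sample=True) entries in these configs ultimately control: OpenCompass's HuggingFace-based model wrappers forward such kwargs to the underlying transformers generate() call, which switches decoding from greedy search to stochastic sampling, so the model can produce varied open-ended answers for the judge model to score. The sketch below is not part of the patch; the checkpoint name, prompt, and sampling values are placeholders chosen only for illustration.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint for illustration only; the configs above point at THUDM/chatglm3-6b.
checkpoint = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer('Write a short poem about autumn.', return_tensors='pt')

# do_sample=True enables stochastic decoding, so repeated runs can give different
# answers; this is usually preferred over greedy decoding when generating
# open-ended responses that a judge LLM will later score, as in the subjective
# datasets configured above.
outputs = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.8,   # illustrative value, not taken from the patch
    top_p=0.9,         # illustrative value, not taken from the patch
    max_new_tokens=64,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))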
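For reference, once one of the configs modified here is adapted to your environment, it is typically launched through OpenCompass's standard entry point, for example:

python run.py configs/eval_subjective_alignbench.py

The Slurm partitions, quota types, and the 'xxxx'/empty API key fields in these configs are environment-specific placeholders and need to be filled in (or supplied via $OPENAI_API_KEY) before running.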