# OpenCompass config: subjective evaluation on the AlpacaEval v2 dataset
# config, with GPT-4 Turbo acting as the judge model.
#
# Names assigned at module level (datasets, models, infer, eval, work_dir,
# summarizer, ...) are the config entries; keep their names unchanged.
from mmengine.config import read_base

# Base configs are pulled in through mmengine's `read_base()` context.
with read_base():
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
    from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
    from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b
    from .models.hf_internlm.hf_internlm_chat_7b import models as hf_internlm_chat_7b
    from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b
    from .datasets.subjective.alpaca_eval.alpacav1_judgeby_gpt4 import subjective_datasets as alpacav1
    from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2

# Only the v2 dataset config is evaluated; `alpacav1` is imported above but
# not used in this file.
datasets = [*alpacav2]

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.models.openai_api import OpenAI, OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import AlpacaSummarizer

# Models whose answers are generated and then judged. The other imported
# model configs (qwen-14b, baichuan2, internlm) are available but not
# selected here.
models = [*hf_qwen_7b_chat, *hf_chatglm3_6b]

# Role mapping handed to the API-based judge model via `meta_template`.
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[
        dict(role='SYSTEM', api_role='SYSTEM'),
    ],
)

# Inference stage: generate answers from every entry in `models`.
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        # NOTE(review): 'llmeval' looks like a site-specific Slurm partition
        # name; adjust it for your cluster.
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask),
    ),
)

# Judge model used by the evaluation stage below.
judge_model = dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    # 'ENV' makes the OpenAI wrapper read the key from $OPENAI_API_KEY; you
    # can write down your key here instead. An empty string would be used
    # as the literal key rather than falling back to the environment.
    key='ENV',
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=1024,
    max_seq_len=4096,
    batch_size=2,
    retry=20,
    temperature=0,
)

# Evaluation stage: `judge_model` scores answers from `compare_models`
# against those from `base_models`.
# NOTE(review): `eval` intentionally shadows the builtin; it is the config
# key name, so do not rename it.
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        max_task_size=1000,
        # NOTE(review): 'm2n' presumably pairs each base model with each
        # compare model; confirm against SubjectiveSizePartitioner.
        mode='m2n',
        base_models=[*hf_chatglm3_6b],
        compare_models=[*hf_qwen_7b_chat],
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='auto',
        max_num_workers=256,
        task=dict(
            type=SubjectiveEvalTask,
            judge_cfg=judge_model,
        ),
    ),
)

# NOTE(review): presumably the directory where run outputs are written.
work_dir = 'outputs/alpaca/'

# Summarizer configured for the 'v2' judge type, matching the alpacav2
# dataset config selected in `datasets`.
summarizer = dict(
    type=AlpacaSummarizer,
    judge_type='v2',
)