from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.subjective.judgerbench.judgerbench import \
        judgerbench_datasets

from opencompass.models import (HuggingFace, HuggingFaceCausalLM,
                                HuggingFaceChatGLM3, OpenAI,
                                TurboMindModelwithChatTemplate)
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import LocalRunner, SlurmSequentialRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

# Meta template for API-based models (maps OpenCompass roles to API roles).
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

# ------------- Inference Stage ----------------------------------------
# For subjective evaluation we often enable sampling (do_sample) for the
# models being judged; here the judge itself is kept near-greedy
# (top_k=1, temperature~0) so its verdicts stay reproducible.
models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='Qwen2-7B',
        path='Qwen/Qwen2-7B-Instruct',
        engine_config=dict(session_len=16384, max_batch_size=16, tp=1),
        gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9,
                        max_new_tokens=2048),
        max_seq_len=16384,
        max_out_len=2048,
        batch_size=16,
        run_cfg=dict(num_gpus=1),
    )
]

datasets = judgerbench_datasets

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLInferTask)),
)

# ------------- Evaluation Stage ----------------------------------------
## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLEvalTask)),
)

work_dir = 'outputs/judgerbench/'
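
# ----------------------------------------------------------------------
# A minimal sketch of an API-backed judge, which is what the otherwise
# unused `api_meta_template` and `OpenAI` import above are for. The model
# name 'gpt-4o' and key='ENV' (read the key from the OPENAI_API_KEY
# environment variable) are illustrative placeholders; adjust them to your
# deployment, then swap this list in for `models`.
# models = [
#     dict(
#         abbr='GPT-4o',
#         type=OpenAI,
#         path='gpt-4o',
#         key='ENV',
#         meta_template=api_meta_template,
#         query_per_second=16,
#         max_out_len=2048,
#         max_seq_len=16384,
#         batch_size=8,
#         temperature=0,
#     )
# ]
#
# To launch this config with the standard OpenCompass entry point (the file
# path below is an assumption about where this config is saved):
#     python run.py configs/eval_judgerbench.py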