diff --git a/configs/eval_subjective.py b/configs/eval_subjective.py
index ca6ca887..523374d7 100644
--- a/configs/eval_subjective.py
+++ b/configs/eval_subjective.py
@@ -9,7 +9,6 @@ with read_base():
     from .datasets.subjective.fofo.fofo_judge import fofo_datasets
     from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import mtbench_datasets
     from .datasets.subjective.multiround.mtbench101_judge import mtbench101_datasets
-    from .models.chatglm.hf_chatglm3_6b import models
 from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
 from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
@@ -30,9 +29,6 @@ api_meta_template = dict(
 
 # -------------Inference Stage ----------------------------------------
 # For subjective evaluation, we often set do sample for models
-for model in models:
-    model['generation_kwargs'] = dict(do_sample=True)
-
 models = [
     dict(
         type=HuggingFaceChatGLM3,
@@ -49,51 +45,7 @@ models = [
             trust_remote_code=True,
         ),
         generation_kwargs=dict(
-            do_sample=True,
-        ),
-        meta_template=api_meta_template,
-        max_out_len=2048,
-        max_seq_len=4096,
-        batch_size=8,
-        run_cfg=dict(num_gpus=1, num_procs=1),
-    ),dict(
-        type=HuggingFaceChatGLM3,
-        abbr='chatglm3-6b-hf2',
-        path='THUDM/chatglm3-6b',
-        tokenizer_path='THUDM/chatglm3-6b',
-        model_kwargs=dict(
-            device_map='auto',
-            trust_remote_code=True,
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        generation_kwargs=dict(
-            do_sample=True,
-        ),
-        meta_template=api_meta_template,
-        max_out_len=2048,
-        max_seq_len=4096,
-        batch_size=8,
-        run_cfg=dict(num_gpus=1, num_procs=1),
-    ),dict(
-        type=HuggingFaceChatGLM3,
-        abbr='chatglm3-6b-hf3',
-        path='THUDM/chatglm3-6b',
-        tokenizer_path='THUDM/chatglm3-6b',
-        model_kwargs=dict(
-            device_map='auto',
-            trust_remote_code=True,
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        generation_kwargs=dict(
-            do_sample=True,
+            do_sample=True,  # For subjective evaluation, we suggest you set do_sample when running model inference!
         ),
         meta_template=api_meta_template,
         max_out_len=2048,
@@ -103,7 +55,7 @@ models = [
     )
 ]
 
-datasets = [*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets, *compassarena_datasets, *compassbench_datasets, *fofo_datasets, *mtbench_datasets, *mtbench101_datasets]
+datasets = [*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets, *compassarena_datasets, *compassbench_datasets, *fofo_datasets, *mtbench_datasets, *mtbench101_datasets]  # add the datasets you want to evaluate
 
 infer = dict(
     partitioner=dict(type=NaivePartitioner),
@@ -124,7 +76,6 @@ judge_models = [dict(
     batch_size=8,
     temperature=0,
 )]
-judge_models = [models[0]]
 
 ## ------------- Evaluation Configuration
 eval = dict(