from mmengine.config import read_base

with read_base():
    from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
    from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import alpacav2_datasets
    from .datasets.subjective.compassarena.compassarena_compare import compassarena_datasets
    from .datasets.subjective.arena_hard.arena_hard_compare import arenahard_datasets
    from .datasets.subjective.compassbench.compassbench_compare import compassbench_datasets
    from .datasets.subjective.fofo.fofo_judge import fofo_datasets
    from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import mtbench_datasets
    from .datasets.subjective.multiround.mtbench101_judge import mtbench101_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import SubjectiveSummarizer
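# Two-role chat meta template (a HUMAN turn followed by a generated BOT turn),
# shared by the models under evaluation and by the judge model below.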
api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often enable do_sample for the models
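# Model(s) to be evaluated; here a single ChatGLM3-6B model loaded from HuggingFace.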
models = [
    dict(
        type=HuggingFaceChatGLM3,
        abbr='chatglm3-6b-hf',
        path='THUDM/chatglm3-6b',
        tokenizer_path='THUDM/chatglm3-6b',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        generation_kwargs=dict(
            do_sample=True,  # For subjective evaluation, we suggest setting do_sample when running model inference!
        ),
        meta_template=api_meta_template,
        max_out_len=2048,
        max_seq_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
datasets = [*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets, *compassarena_datasets, *compassbench_datasets, *fofo_datasets, *mtbench_datasets, *mtbench101_datasets]  # add the datasets you want to evaluate
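# Inference stage: each model above generates responses for every dataset,
# running locally with up to 16 concurrent workers.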
infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLInferTask)),
)
# -------------Evaluation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
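# The judge model (GPT-4-Turbo via the OpenAI API) grades or compares the responses
# produced in the inference stage.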
judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=2048,
    max_seq_len=2048,
    batch_size=8,
    temperature=0,
)]
## ------------- Evaluation Configuration
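# Pairs each evaluated model's predictions with the judge model(s) and runs the
# subjective evaluation tasks locally.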
eval = dict(
    partitioner=dict(type=SubjectiveNaivePartitioner, models=models, judge_models=judge_models),
    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
)
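# Aggregate the judge results into a subjective evaluation summary.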
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
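# Predictions, judge outputs and summaries are written under this directory.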
work_dir = 'outputs/subjective/'