Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Update] update Subeval demo config (#1358)

* fix pip version
* fix pip version
* update demo config

parent 86b6d18731
commit 8fe75e9937
@@ -9,7 +9,6 @@ with read_base():
     from .datasets.subjective.fofo.fofo_judge import fofo_datasets
     from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import mtbench_datasets
     from .datasets.subjective.multiround.mtbench101_judge import mtbench101_datasets
-    from .models.chatglm.hf_chatglm3_6b import models
 from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
 from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
@@ -30,9 +29,6 @@ api_meta_template = dict(

 # -------------Inference Stage ----------------------------------------
 # For subjective evaluation, we often set do sample for models
-for model in models:
-    model['generation_kwargs'] = dict(do_sample=True)
-
 models = [
     dict(
         type=HuggingFaceChatGLM3,
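The hunk above drops the loop that patched do_sample onto the imported chatglm3 model configs; with the model now declared inline, sampling is set directly in its own generation_kwargs. A minimal sketch of the two equivalent patterns (the inline fields are abbreviated; the full entry appears in the next hunk):

# Pattern removed by this commit: patch sampling onto configs imported via
# `from .models.chatglm.hf_chatglm3_6b import models`.
for model in models:
    model['generation_kwargs'] = dict(do_sample=True)

# Pattern kept by this commit: declare sampling inline on the model dict.
models = [
    dict(
        type=HuggingFaceChatGLM3,          # imported at the top of the config
        # ... remaining fields as shown in the next hunk ...
        generation_kwargs=dict(do_sample=True),
    ),
]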
@@ -49,51 +45,7 @@ models = [
             trust_remote_code=True,
         ),
         generation_kwargs=dict(
-            do_sample=True,
-        ),
-        meta_template=api_meta_template,
-        max_out_len=2048,
-        max_seq_len=4096,
-        batch_size=8,
-        run_cfg=dict(num_gpus=1, num_procs=1),
-    ),dict(
-        type=HuggingFaceChatGLM3,
-        abbr='chatglm3-6b-hf2',
-        path='THUDM/chatglm3-6b',
-        tokenizer_path='THUDM/chatglm3-6b',
-        model_kwargs=dict(
-            device_map='auto',
-            trust_remote_code=True,
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        generation_kwargs=dict(
-            do_sample=True,
-        ),
-        meta_template=api_meta_template,
-        max_out_len=2048,
-        max_seq_len=4096,
-        batch_size=8,
-        run_cfg=dict(num_gpus=1, num_procs=1),
-    ),dict(
-        type=HuggingFaceChatGLM3,
-        abbr='chatglm3-6b-hf3',
-        path='THUDM/chatglm3-6b',
-        tokenizer_path='THUDM/chatglm3-6b',
-        model_kwargs=dict(
-            device_map='auto',
-            trust_remote_code=True,
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        generation_kwargs=dict(
-            do_sample=True,
+            do_sample=True, #For subjective evaluation, we suggest you do set do_sample when running model inference!
         ),
         meta_template=api_meta_template,
         max_out_len=2048,
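After this hunk only one HuggingFaceChatGLM3 entry is left in models. A sketch of the resulting entry, where abbr, path and tokenizer_path are not visible in the hunk and are assumed to match the removed duplicate entries:

models = [
    dict(
        type=HuggingFaceChatGLM3,
        abbr='chatglm3-6b-hf',               # assumed; not shown in this hunk
        path='THUDM/chatglm3-6b',            # assumed from the removed duplicates
        tokenizer_path='THUDM/chatglm3-6b',  # assumed from the removed duplicates
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        tokenizer_kwargs=dict(padding_side='left',
                              truncation_side='left',
                              trust_remote_code=True),
        # For subjective evaluation, keep do_sample=True at inference time.
        generation_kwargs=dict(do_sample=True),
        meta_template=api_meta_template,
        max_out_len=2048,
        max_seq_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]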
@@ -103,7 +55,7 @@ models = [
     )
 ]

-datasets = [*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets, *compassarena_datasets, *compassbench_datasets, *fofo_datasets, *mtbench_datasets, *mtbench101_datasets]
+datasets = [*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets, *compassarena_datasets, *compassbench_datasets, *fofo_datasets, *mtbench_datasets, *mtbench101_datasets] # add datasets you want

 infer = dict(
     partitioner=dict(type=NaivePartitioner),
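The hunk shows only the head of the infer block. For orientation, a common shape for the OpenCompass inference stage looks like the sketch below; the runner, worker count and task type are assumptions, not part of this commit:

from opencompass.runners import LocalRunner      # assumed import
from opencompass.tasks import OpenICLInferTask   # assumed import

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,                      # assumed value
        task=dict(type=OpenICLInferTask),
    ),
)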
@@ -124,7 +76,6 @@ judge_models = [dict(
     batch_size=8,
     temperature=0,
 )]
-judge_models = [models[0]]

 ## ------------- Evaluation Configuration
 eval = dict(
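The diff ends at eval = dict(. As a rough sketch (not shown in this commit), the subjective evaluation stage typically wires the inline models and the dedicated judge_models together through SubjectiveNaivePartitioner; the runner, task type and exact keyword names below are assumptions based on other OpenCompass subjective configs:

from opencompass.runners import LocalRunner                        # assumed import
from opencompass.tasks.subjective_eval import SubjectiveEvalTask   # assumed import

eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        models=models,              # the single inline model defined above
        judge_models=judge_models,  # the dedicated judge list kept by this commit
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=2,          # assumed value
        task=dict(type=SubjectiveEvalTask),
    ),
)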