Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Update] update Subeval demo config (#1358)

* fix pip version
* fix pip version
* update demo config

parent 86b6d18731
commit 8fe75e9937
@@ -9,7 +9,6 @@ with read_base():
     from .datasets.subjective.fofo.fofo_judge import fofo_datasets
     from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import mtbench_datasets
     from .datasets.subjective.multiround.mtbench101_judge import mtbench101_datasets
-    from .models.chatglm.hf_chatglm3_6b import models
 from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
 from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
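With the model import gone, the read_base() block now pulls in dataset definitions only, and `models` is defined inline later in this file (see the hunks below). If you would rather reuse a packaged model config, a minimal sketch that simply restores the removed import (in the real file you would add it to the existing read_base() block rather than open a new one):

from mmengine.config import read_base

# Sketch: reuse a stock model config instead of defining models inline.
# This restores exactly the import that this hunk removes.
with read_base():
    from .models.chatglm.hf_chatglm3_6b import models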
@@ -30,9 +29,6 @@ api_meta_template = dict(

 # -------------Inference Stage ----------------------------------------
-# For subjective evaluation, we often set do sample for models
-for model in models:
-    model['generation_kwargs'] = dict(do_sample=True)

 models = [
     dict(
         type=HuggingFaceChatGLM3,
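The three deleted lines patched do_sample onto every imported model in a loop; the demo now sets it per model inside the inline `models` list instead (next hunk). If you do import `models` from a shared config, the removed loop remains a useful pattern. A sketch:

# Sketch: enable sampling on every model in an imported list, exactly as
# the removed loop did. Mutates each model dict in place.
for model in models:
    model['generation_kwargs'] = dict(do_sample=True)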
@@ -49,51 +45,7 @@ models = [
             trust_remote_code=True,
         ),
         generation_kwargs=dict(
-            do_sample=True,
-        ),
-        meta_template=api_meta_template,
-        max_out_len=2048,
-        max_seq_len=4096,
-        batch_size=8,
-        run_cfg=dict(num_gpus=1, num_procs=1),
-    ),dict(
-        type=HuggingFaceChatGLM3,
-        abbr='chatglm3-6b-hf2',
-        path='THUDM/chatglm3-6b',
-        tokenizer_path='THUDM/chatglm3-6b',
-        model_kwargs=dict(
-            device_map='auto',
-            trust_remote_code=True,
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        generation_kwargs=dict(
-            do_sample=True,
-        ),
-        meta_template=api_meta_template,
-        max_out_len=2048,
-        max_seq_len=4096,
-        batch_size=8,
-        run_cfg=dict(num_gpus=1, num_procs=1),
-    ),dict(
-        type=HuggingFaceChatGLM3,
-        abbr='chatglm3-6b-hf3',
-        path='THUDM/chatglm3-6b',
-        tokenizer_path='THUDM/chatglm3-6b',
-        model_kwargs=dict(
-            device_map='auto',
-            trust_remote_code=True,
-        ),
-        tokenizer_kwargs=dict(
-            padding_side='left',
-            truncation_side='left',
-            trust_remote_code=True,
-        ),
-        generation_kwargs=dict(
-            do_sample=True,
+            do_sample=True,  # For subjective evaluation, we suggest you set do_sample when running model inference!
         ),
         meta_template=api_meta_template,
         max_out_len=2048,
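This hunk deletes the two duplicate chatglm3-6b entries ('chatglm3-6b-hf2' and 'chatglm3-6b-hf3') and keeps a single model. Read together with the context lines, the surviving entry looks roughly like the sketch below; the opening fields (abbr, path, model_kwargs, tokenizer_kwargs) are reconstructed from the deleted duplicates rather than shown in this hunk, so treat it as an approximation of the committed file:

models = [
    dict(
        type=HuggingFaceChatGLM3,
        abbr='chatglm3-6b-hf',  # reconstructed; the first entry's abbr is not shown in this diff
        path='THUDM/chatglm3-6b',
        tokenizer_path='THUDM/chatglm3-6b',
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        generation_kwargs=dict(
            do_sample=True,  # suggested for subjective evaluation
        ),
        meta_template=api_meta_template,
        max_out_len=2048,
        max_seq_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]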
@@ -103,7 +55,7 @@ models = [
     )
 ]

-datasets = [*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets, *compassarena_datasets, *compassbench_datasets, *fofo_datasets, *mtbench_datasets, *mtbench101_datasets]
+datasets = [*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets, *compassarena_datasets, *compassbench_datasets, *fofo_datasets, *mtbench_datasets, *mtbench101_datasets]  # add datasets you want

 infer = dict(
     partitioner=dict(type=NaivePartitioner),
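The new trailing comment invites trimming this list. For example, to run only AlignBench and MT-Bench, a sketch using the dataset variables imported via read_base() at the top of the file:

# Sketch: evaluate on a subset of the eight benchmarks instead of all of them.
datasets = [*alignbench_datasets, *mtbench_datasets]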
@@ -124,7 +76,6 @@ judge_models = [dict(
     batch_size=8,
     temperature=0,
 )]
-judge_models = [models[0]]

 ## ------------- Evaluation Configuration
 eval = dict(
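The deleted line reassigned judge_models to the first entry of `models`, overriding the API-based judge dict defined just above it; with it gone, that judge definition takes effect. If you have no API key, you can restore the old behaviour with a one-liner (a sketch based on the removed line; results will differ from an API judge):

# Sketch: judge with the first local inference model instead of the
# API-based judge, as the removed line did.
judge_models = [models[0]]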