From 32b5948f4e9dbfc65317bd85775306ba4d6c011b Mon Sep 17 00:00:00 2001 From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> Date: Mon, 5 Feb 2024 15:55:58 +0800 Subject: [PATCH] [Fix] add do sample demo for subjective dataset (#873) * add do sample demo for subjective dataset * fix strings * format --------- Co-authored-by: Leymore --- configs/eval_subjective_alignbench.py | 95 +++++++------- configs/eval_subjective_alpacaeval.py | 106 ++++++++++------ configs/eval_subjective_compassarena.py | 125 +++++++++++-------- configs/eval_subjective_corev2.py | 125 +++++++++++-------- configs/eval_subjective_creationbench.py | 99 ++++++++------- configs/eval_subjective_judge_pandalm.py | 106 ++++++++-------- configs/eval_subjective_mtbench.py | 150 +++++++++++------------ 7 files changed, 446 insertions(+), 360 deletions(-) diff --git a/configs/eval_subjective_alignbench.py b/configs/eval_subjective_alignbench.py index 6a548f43..8f60016b 100644 --- a/configs/eval_subjective_alignbench.py +++ b/configs/eval_subjective_alignbench.py @@ -1,16 +1,7 @@ from mmengine.config import read_base -with read_base(): - from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat - from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat - from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b - from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b - from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b - from .models.judge_llm.auto_j.hf_autoj_eng_13b import models as hf_autoj - from .models.judge_llm.judgelm.hf_judgelm_33b_v1 import models as hf_judgelm - from .models.judge_llm.pandalm.hf_pandalm_7b_v1 import models as hf_pandalm - from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets -datasets = [*subjective_datasets] +with read_base(): + from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 from opencompass.models.openai_api import OpenAIAllesAPIN @@ -23,10 +14,42 @@ from opencompass.tasks import OpenICLInferTask from opencompass.tasks.subjective_eval import SubjectiveEvalTask from opencompass.summarizers import AlignmentBenchSummarizer +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ] +) -# -------------Inferen Stage ---------------------------------------- +# -------------Inference Stage ---------------------------------------- +# For subjective evaluation, we often set do sample for models +models = [ + dict( + type=HuggingFaceChatGLM3, + abbr='chatglm3-6b-hf', + path='THUDM/chatglm3-6b', + tokenizer_path='THUDM/chatglm3-6b', + model_kwargs=dict( + device_map='auto', + trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + generation_kwargs=dict( + do_sample=True, + ), + meta_template=api_meta_template, + max_out_len=2048, + max_seq_len=4096, + batch_size=1, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] -models = [*hf_chatglm3_6b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat] +datasets = [*subjective_datasets] infer = dict( partitioner=dict(type=NaivePartitioner), @@ -35,51 +58,39 @@ infer = dict( partition='llmeval', quotatype='auto', max_num_workers=256, - task=dict(type=OpenICLInferTask)), + task=dict(type=OpenICLInferTask), + ), ) # 
-------------Evalation Stage ---------------------------------------- - ## ------------- JudgeLLM Configuration -api_meta_template = dict( - round=[ - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), - ] -) - judge_model = dict( - abbr='GPT4-Turbo', - type=OpenAIAllesAPIN, path='gpt-4-1106-preview', - key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - url='xxxx', - meta_template=api_meta_template, - query_per_second=16, - max_out_len=2048, - max_seq_len=2048, - batch_size=8, - temperature = 0 + abbr='GPT4-Turbo', + type=OpenAIAllesAPIN, + path='gpt-4-1106-preview', + key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + url='xxxx', + meta_template=api_meta_template, + query_per_second=16, + max_out_len=2048, + max_seq_len=2048, + batch_size=8, + temperature=0, ) ## ------------- Evaluation Configuration eval = dict( partitioner=dict( - type=SubjectiveNaivePartitioner, - mode='singlescore', - models = models + type=SubjectiveNaivePartitioner, mode='singlescore', models=models ), runner=dict( type=LocalRunner, max_num_workers=2, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), + task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model), + ), ) -summarizer = dict( - type=AlignmentBenchSummarizer, judge_type = 'general' -) +summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general') work_dir = 'outputs/alignment_bench/' diff --git a/configs/eval_subjective_alpacaeval.py b/configs/eval_subjective_alpacaeval.py index 42ac5c83..098547b9 100644 --- a/configs/eval_subjective_alpacaeval.py +++ b/configs/eval_subjective_alpacaeval.py @@ -1,16 +1,9 @@ from mmengine.config import read_base + with read_base(): - from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat - from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat - from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b - from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b - from .models.hf_internlm.hf_internlm_chat_7b import models as hf_internlm_chat_7b - from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b from .datasets.subjective.alpaca_eval.alpacav1_judgeby_gpt4 import subjective_datasets as alpacav1 from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2 -datasets = [*alpacav2] - from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 from opencompass.models.openai_api import OpenAI, OpenAIAllesAPIN from opencompass.partitioners import NaivePartitioner, SizePartitioner @@ -22,18 +15,59 @@ from opencompass.tasks import OpenICLInferTask from opencompass.tasks.subjective_eval import SubjectiveEvalTask from opencompass.summarizers import AlpacaSummarizer -models = [*hf_qwen_7b_chat, *hf_chatglm3_6b] - api_meta_template = dict( round=[ dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True) - ], - reserved_roles=[ - dict(role='SYSTEM', api_role='SYSTEM'), + dict(role='BOT', api_role='BOT', generate=True), ], + reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')], ) +# -------------Inference Stage ---------------------------------------- + +# For subjective evaluation, we often set do sample for models +models = [ + dict( + type=HuggingFaceChatGLM3, + abbr='chatglm3-6b-hf', + path='THUDM/chatglm3-6b', + tokenizer_path='THUDM/chatglm3-6b', + model_kwargs=dict( + 
device_map='auto', + trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + generation_kwargs=dict( + do_sample=True, + ), + meta_template=api_meta_template, + max_out_len=2048, + max_seq_len=4096, + batch_size=1, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] + +datasets = [*alpacav2] + +gpt4 = dict( + abbr='gpt4-turbo', + type=OpenAI, + path='gpt-4-1106-preview', + key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=4, + retry=20, + temperature=1, +) # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions + infer = dict( partitioner=dict(type=NaivePartitioner), runner=dict( @@ -41,42 +75,40 @@ infer = dict( partition='llmeval', quotatype='auto', max_num_workers=256, - task=dict(type=OpenICLInferTask)), + task=dict(type=OpenICLInferTask), + ), ) +# -------------Evalation Stage ---------------------------------------- + +## ------------- JudgeLLM Configuration judge_model = dict( - abbr='GPT4-Turbo', - type=OpenAI, path='gpt-4-1106-preview', - key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - meta_template=api_meta_template, - query_per_second=1, - max_out_len=1024, - max_seq_len=4096, - batch_size=2, - retry=20, - temperature = 0 + abbr='GPT4-Turbo', + type=OpenAI, + path='gpt-4-1106-preview', + key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=1024, + max_seq_len=4096, + batch_size=2, + retry=20, + temperature=0, ) +## ------------- Evaluation Configuration eval = dict( partitioner=dict( - type=SubjectiveSizePartitioner, - max_task_size=1000, - mode='m2n', - base_models = [*hf_chatglm3_6b], - compare_models = [*hf_qwen_7b_chat] + type=SubjectiveSizePartitioner, max_task_size=1000, mode='m2n', base_models=[gpt4], compare_models=models ), runner=dict( type=SlurmSequentialRunner, partition='llmeval', quotatype='auto', max_num_workers=256, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), + task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model), + ), ) work_dir = 'outputs/alpaca/' -summarizer = dict( - type=AlpacaSummarizer, judge_type='v2' -) \ No newline at end of file +summarizer = dict(type=AlpacaSummarizer, judge_type='v2') diff --git a/configs/eval_subjective_compassarena.py b/configs/eval_subjective_compassarena.py index 3ac0b86c..58336a5c 100644 --- a/configs/eval_subjective_compassarena.py +++ b/configs/eval_subjective_compassarena.py @@ -1,8 +1,8 @@ from os import getenv as gv from opencompass.models import HuggingFaceCausalLM from mmengine.config import read_base + with read_base(): - from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model from .datasets.subjective.compassarena.compassarena_compare import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI @@ -16,56 +16,85 @@ from opencompass.tasks import OpenICLInferTask from opencompass.tasks.subjective_eval import SubjectiveEvalTask from opencompass.summarizers import CompassArenaSummarizer +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ], + reserved_roles=[dict(role='SYSTEM', 
api_role='SYSTEM')], +) + +# -------------Inference Stage ---------------------------------------- + +# For subjective evaluation, we often set do sample for models +models = [ + dict( + type=HuggingFaceChatGLM3, + abbr='chatglm3-6b-hf', + path='THUDM/chatglm3-6b', + tokenizer_path='THUDM/chatglm3-6b', + model_kwargs=dict( + device_map='auto', + trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + generation_kwargs=dict( + do_sample=True, + ), + meta_template=api_meta_template, + max_out_len=2048, + max_seq_len=4096, + batch_size=1, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] + +datasets = [*subjective_datasets] + +gpt4 = dict( + abbr='gpt4-turbo', + type=OpenAI, + path='gpt-4-1106-preview', + key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=4, + retry=20, + temperature=1, +) # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions + infer = dict( - #partitioner=dict(type=NaivePartitioner), partitioner=dict(type=SizePartitioner, strategy='split', max_task_size=10000), runner=dict( type=SlurmSequentialRunner, partition='llm_dev2', quotatype='auto', max_num_workers=256, - task=dict(type=OpenICLInferTask)), + task=dict(type=OpenICLInferTask), + ), ) -api_meta_template = dict( - round=[ - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), - ] -) - -gpt4 = dict( - abbr='gpt4-turbo', - type=OpenAI, path='gpt-4-1106-preview', - key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - meta_template=api_meta_template, - query_per_second=1, - max_out_len=2048, - max_seq_len=4096, - batch_size=4, - retry=20, - temperature = 1 -) -models = [*chatglm3_6b_32k_model] -datasets = [*subjective_datasets] - - - -work_dir = 'outputs/compass_arena_debug/' - -# -------------Inferen Stage ---------------------------------------- +# -------------Evalation Stage ---------------------------------------- +## ------------- JudgeLLM Configuration judge_model = dict( - abbr='GPT4-Turbo', - type=OpenAI, path='gpt-4-1106-preview', - key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - meta_template=api_meta_template, - query_per_second=1, - max_out_len=1024, - max_seq_len=4096, - batch_size=2, - retry=20, - temperature = 0 + abbr='GPT4-Turbo', + type=OpenAI, + path='gpt-4-1106-preview', + key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=1024, + max_seq_len=4096, + batch_size=2, + retry=20, + temperature=0, ) ## ------------- Evaluation Configuration @@ -75,22 +104,18 @@ eval = dict( strategy='split', max_task_size=10000, mode='m2n', - base_models = [gpt4], - compare_models = [*chatglm3_6b_32k_model] + base_models=[gpt4], + compare_models=models, ), runner=dict( type=SlurmSequentialRunner, partition='llm_dev2', quotatype='auto', max_num_workers=32, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), + task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model), + ), ) +work_dir = 'outputs/compass_arena_debug/' -summarizer = dict( - type=CompassArenaSummarizer, - summary_type='half_add' -) \ No newline at end of file +summarizer = dict(type=CompassArenaSummarizer, 
summary_type='half_add') diff --git a/configs/eval_subjective_corev2.py b/configs/eval_subjective_corev2.py index b16e017b..2ca07b43 100644 --- a/configs/eval_subjective_corev2.py +++ b/configs/eval_subjective_corev2.py @@ -1,16 +1,9 @@ from mmengine.config import read_base + with read_base(): - from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat - from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat - from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b - from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b - from .models.hf_internlm.hf_internlm2_chat_7b import models as internlm2_7b - from .models.hf_internlm.hf_internlm2_chat_20b import models as internlm2_20b from .datasets.subjective.subjective_cmp.subjective_corev2 import subjective_datasets -datasets = [*subjective_datasets] - -from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI +from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI from opencompass.partitioners import NaivePartitioner, SizePartitioner from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner from opencompass.partitioners.sub_size import SubjectiveSizePartitioner @@ -19,18 +12,62 @@ from opencompass.runners import SlurmSequentialRunner from opencompass.tasks import OpenICLInferTask from opencompass.tasks.subjective_eval import SubjectiveEvalTask from opencompass.summarizers import Corev2Summarizer -models = [*internlm2_7b, *internlm2_20b] api_meta_template = dict( round=[ dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True) + dict(role='BOT', api_role='BOT', generate=True), ], reserved_roles=[ dict(role='SYSTEM', api_role='SYSTEM'), ], ) +# -------------Inference Stage ---------------------------------------- + +# For subjective evaluation, we often set do sample for models +models = [ + dict( + type=HuggingFaceChatGLM3, + abbr='chatglm3-6b-hf', + path='THUDM/chatglm3-6b', + tokenizer_path='THUDM/chatglm3-6b', + model_kwargs=dict( + device_map='auto', + trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + generation_kwargs=dict( + do_sample=True, + ), + meta_template=api_meta_template, + max_out_len=2048, + max_seq_len=4096, + batch_size=1, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] + +datasets = [*subjective_datasets] + +gpt4 = dict( + abbr='gpt4-turbo', + type=OpenAI, + path='gpt-4-1106-preview', + key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=4, + retry=20, + temperature=1, +) # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions + infer = dict( partitioner=dict(type=SizePartitioner, max_task_size=500), runner=dict( @@ -38,61 +75,41 @@ infer = dict( partition='llm_dev2', quotatype='auto', max_num_workers=256, - task=dict(type=OpenICLInferTask)), + task=dict(type=OpenICLInferTask), + ), ) +# -------------Evalation Stage ---------------------------------------- -_meta_template = dict( - round=[ - dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'), - dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True), - ], +## ------------- JudgeLLM Configuration +judge_model = dict( + abbr='GPT4-Turbo', + type=OpenAI, + path='gpt-4-1106-preview', + key='', # The key 
will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=1024, + max_seq_len=4096, + batch_size=2, + retry=20, + temperature=0, ) - -judge_model = dict( - type=HuggingFaceCausalLM, - abbr='qwen-7b-chat-hf', - path="Qwen/Qwen-7B-Chat", - tokenizer_path='Qwen/Qwen-7B-Chat', - model_kwargs=dict( - device_map='auto', - trust_remote_code=True - ), - tokenizer_kwargs=dict( - padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False,), - pad_token_id=151643, - max_out_len=2048, - max_seq_len=2048, - batch_size=8, - meta_template=_meta_template, - run_cfg=dict(num_gpus=1, num_procs=1), - ) - +## ------------- Evaluation Configuration eval = dict( partitioner=dict( - type=SubjectiveSizePartitioner, - mode='m2n', - max_task_size=500, - base_models = [*internlm2_7b], - compare_models = [*internlm2_20b] + type=SubjectiveSizePartitioner, mode='m2n', max_task_size=500, base_models=[gpt4], compare_models=models ), runner=dict( type=SlurmSequentialRunner, partition='llm_dev2', quotatype='auto', max_num_workers=256, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), + task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model), + ), ) -work_dir = 'outputs/corev2/' -summarizer = dict( - type=Corev2Summarizer, - match_method='smart', -) \ No newline at end of file +summarizer = dict(type=Corev2Summarizer, match_method='smart') + +work_dir = 'outputs/corev2/' diff --git a/configs/eval_subjective_creationbench.py b/configs/eval_subjective_creationbench.py index a871cfd8..52bf7d4b 100644 --- a/configs/eval_subjective_creationbench.py +++ b/configs/eval_subjective_creationbench.py @@ -1,16 +1,7 @@ from mmengine.config import read_base -with read_base(): - from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat - from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat - from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b - from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b - from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b - from .models.judge_llm.auto_j.hf_autoj_eng_13b import models as hf_autoj - from .models.judge_llm.judgelm.hf_judgelm_33b_v1 import models as hf_judgelm - from .models.judge_llm.pandalm.hf_pandalm_7b_v1 import models as hf_pandalm - from .datasets.subjective.creationbench.creationbench_judgeby_gpt4_withref import subjective_datasets -datasets = [*subjective_datasets] +with read_base(): + from .datasets.subjective.creationbench.creationbench_judgeby_gpt4_withref import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 from opencompass.models.openai_api import OpenAIAllesAPIN @@ -23,10 +14,42 @@ from opencompass.tasks import OpenICLInferTask from opencompass.tasks.subjective_eval import SubjectiveEvalTask from opencompass.summarizers import CreationBenchSummarizer +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ] +) -# -------------Inferen Stage ---------------------------------------- +# -------------Inference Stage ---------------------------------------- +# For subjective evaluation, we often set do sample for models +models = [ + dict( + type=HuggingFaceChatGLM3, + abbr='chatglm3-6b-hf', + path='THUDM/chatglm3-6b', + tokenizer_path='THUDM/chatglm3-6b', + model_kwargs=dict( + device_map='auto', + 
trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + generation_kwargs=dict( + do_sample=True, + ), + meta_template=api_meta_template, + max_out_len=2048, + max_seq_len=4096, + batch_size=1, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] -models = [*hf_chatglm3_6b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat] +datasets = [*subjective_datasets] infer = dict( partitioner=dict(type=NaivePartitioner), @@ -35,51 +58,33 @@ infer = dict( partition='llmeval', quotatype='auto', max_num_workers=256, - task=dict(type=OpenICLInferTask)), + task=dict(type=OpenICLInferTask), + ), ) # -------------Evalation Stage ---------------------------------------- - ## ------------- JudgeLLM Configuration -api_meta_template = dict( - round=[ - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), - ] -) - judge_model = dict( - abbr='GPT4-Turbo', - type=OpenAIAllesAPIN, path='gpt-4-1106-preview', - key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - url='xxxx', - meta_template=api_meta_template, - query_per_second=16, - max_out_len=2048, - max_seq_len=2048, - batch_size=8, - temperature = 0 + abbr='GPT4-Turbo', + type=OpenAIAllesAPIN, + path='gpt-4-1106-preview', + key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + url='xxxx', + meta_template=api_meta_template, + query_per_second=16, + max_out_len=2048, + max_seq_len=2048, + batch_size=8, + temperature=0, ) ## ------------- Evaluation Configuration eval = dict( - partitioner=dict( - type=SubjectiveNaivePartitioner, - mode='singlescore', - models = models - ), - runner=dict( - type=LocalRunner, - max_num_workers=2, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), + partitioner=dict(type=SubjectiveNaivePartitioner, mode='singlescore', models=models), + runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)), ) -summarizer = dict( - type=CreationBenchSummarizer, judge_type = 'general' -) +summarizer = dict(type=CreationBenchSummarizer, judge_type='general') work_dir = 'outputs/creationbench/' diff --git a/configs/eval_subjective_judge_pandalm.py b/configs/eval_subjective_judge_pandalm.py index ff7b69de..2fbe7e91 100644 --- a/configs/eval_subjective_judge_pandalm.py +++ b/configs/eval_subjective_judge_pandalm.py @@ -1,13 +1,7 @@ from mmengine.config import read_base -with read_base(): - from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat - from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat - from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b - from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b - from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b - from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets -datasets = [*subjective_datasets] +with read_base(): + from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3 from opencompass.partitioners import NaivePartitioner @@ -18,10 +12,42 @@ from opencompass.tasks import OpenICLInferTask from opencompass.tasks.subjective_eval import SubjectiveEvalTask from opencompass.summarizers import AlignmentBenchSummarizer 
+api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ] +) -# -------------Inferen Stage ---------------------------------------- +# -------------Inference Stage ---------------------------------------- +# For subjective evaluation, we often set do sample for models +models = [ + dict( + type=HuggingFaceChatGLM3, + abbr='chatglm3-6b-hf', + path='THUDM/chatglm3-6b', + tokenizer_path='THUDM/chatglm3-6b', + model_kwargs=dict( + device_map='auto', + trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + generation_kwargs=dict( + do_sample=True, + ), + meta_template=api_meta_template, + max_out_len=2048, + max_seq_len=4096, + batch_size=1, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] -models = [*hf_baichuan2_7b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat] +datasets = [*subjective_datasets] infer = dict( partitioner=dict(type=NaivePartitioner), @@ -30,55 +56,37 @@ infer = dict( partition='llmeval', quotatype='auto', max_num_workers=256, - task=dict(type=OpenICLInferTask)), + task=dict(type=OpenICLInferTask), + ), ) - # -------------Evalation Stage ---------------------------------------- - ## ------------- JudgeLLM Configuration -api_meta_template = dict( - round=[ - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), - ] -) - judge_model = dict( - type=HuggingFaceCausalLM, - abbr='pandalm-7b-v1-hf', - path="WeOpenML/PandaLM-7B-v1", - tokenizer_path='WeOpenML/PandaLM-7B-v1', - tokenizer_kwargs=dict(padding_side='left', - truncation_side='left', - trust_remote_code=True, - use_fast=False,), - max_out_len=512, - max_seq_len=2048, - batch_size=8, - model_kwargs=dict(device_map='auto', trust_remote_code=True), - run_cfg=dict(num_gpus=1, num_procs=1), - ) + type=HuggingFaceCausalLM, + abbr='pandalm-7b-v1-hf', + path='WeOpenML/PandaLM-7B-v1', + tokenizer_path='WeOpenML/PandaLM-7B-v1', + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False, + ), + max_out_len=512, + max_seq_len=2048, + batch_size=8, + model_kwargs=dict(device_map='auto', trust_remote_code=True), + run_cfg=dict(num_gpus=1, num_procs=1), +) ## ------------- Evaluation Configuration eval = dict( - partitioner=dict( - type=SubjectiveNaivePartitioner, - mode='singlescore', - models = [*hf_baichuan2_7b] - ), - runner=dict( - type=LocalRunner, - max_num_workers=2, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), + partitioner=dict(type=SubjectiveNaivePartitioner, mode='singlescore', models=models), + runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)), ) -summarizer = dict( - type=AlignmentBenchSummarizer, -) +summarizer = dict(type=AlignmentBenchSummarizer) work_dir = 'outputs/pandalm' diff --git a/configs/eval_subjective_mtbench.py b/configs/eval_subjective_mtbench.py index 575d3974..6ccb5e74 100644 --- a/configs/eval_subjective_mtbench.py +++ b/configs/eval_subjective_mtbench.py @@ -1,17 +1,9 @@ from mmengine.config import read_base -with read_base(): - from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat - from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat - from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b - from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b - from 
.models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b - from .models.judge_llm.auto_j.hf_autoj_eng_13b import models as hf_autoj - from .models.judge_llm.judgelm.hf_judgelm_33b_v1 import models as hf_judgelm - from .models.judge_llm.pandalm.hf_pandalm_7b_v1 import models as hf_pandalm - from .datasets.subjective.multiround.mtbench_single_judge import subjective_datasets - #from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets -datasets = [*subjective_datasets] +with read_base(): + from .datasets.subjective.multiround.mtbench_single_judge import subjective_datasets + + # from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3 from opencompass.models.openai_api import OpenAIAllesAPIN @@ -24,24 +16,6 @@ from opencompass.tasks import OpenICLInferTask from opencompass.tasks.subjective_eval import SubjectiveEvalTask from opencompass.summarizers import MTBenchSummarizer - -# -------------Inferen Stage ---------------------------------------- - -models = [*hf_chatglm3_6b, *hf_qwen_7b_chat] -infer = dict( - partitioner=dict(type=SizePartitioner, max_task_size=100), - runner=dict( - type=SlurmSequentialRunner, - partition='llmeval', - quotatype='auto', - max_num_workers=256, - task=dict(type=OpenICLInferTask)), -) - -# -------------Evalation Stage ---------------------------------------- - - -## ------------- JudgeLLM Configuration api_meta_template = dict( round=[ dict(role='HUMAN', api_role='HUMAN'), @@ -49,68 +23,82 @@ api_meta_template = dict( ] ) -judge_model = dict( - abbr='GPT4-Turbo', - type=OpenAIAllesAPIN, path='gpt-4-1106-preview', - key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - url='xxxx', +# -------------Inference Stage ---------------------------------------- +# For subjective evaluation, we often set do sample for models +models = [ + dict( + type=HuggingFaceChatGLM3, + abbr='chatglm3-6b-hf', + path='THUDM/chatglm3-6b', + tokenizer_path='THUDM/chatglm3-6b', + model_kwargs=dict( + device_map='auto', + trust_remote_code=True, + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + ), + generation_kwargs=dict( + do_sample=True, + ), meta_template=api_meta_template, - query_per_second=16, max_out_len=2048, - max_seq_len=2048, - batch_size=8, - temperature = 0 + max_seq_len=4096, + batch_size=1, + run_cfg=dict(num_gpus=1, num_procs=1), + ) +] + +datasets = [*subjective_datasets] + +infer = dict( + partitioner=dict(type=SizePartitioner, max_task_size=100), + runner=dict( + type=SlurmSequentialRunner, + partition='llmeval', + quotatype='auto', + max_num_workers=256, + task=dict(type=OpenICLInferTask), + ), +) + +# -------------Evalation Stage ---------------------------------------- + +## ------------- JudgeLLM Configuration +judge_model = dict( + abbr='GPT4-Turbo', + type=OpenAIAllesAPIN, + path='gpt-4-0613', + key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well + url='xxxx', + meta_template=api_meta_template, + query_per_second=16, + max_out_len=1024, + max_seq_len=2048, + batch_size=8, + temperature=0, ) ## ------------- Evaluation Configuration -''' -## pair evaluation -eval = dict( - partitioner=dict( - type=SubjectiveSizePartitioner, - max_task_size=100, - mode='m2n', - base_models = [*hf_chatglm3_6b, ], - compare_models = models - ), - runner=dict( - 
type=SlurmSequentialRunner, - partition='llmeval', - quotatype='auto', - max_num_workers=32, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), -) +# ## pair evaluation +# eval = dict( +# partitioner=dict( +# type=SubjectiveSizePartitioner, max_task_size=100, mode='m2n', base_models=[gpt4], compare_models=models +# ), +# runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)), +# ) -summarizer = dict( - type=MTBenchSummarizer, judge_type='pair' -) +# summarizer = dict(type=MTBenchSummarizer, judge_type='pair') -''' ## single evaluation eval = dict( - partitioner=dict( - type=SubjectiveSizePartitioner, - max_task_size=100, - mode='singlescore', - models = models - ), - runner=dict( - type=SlurmSequentialRunner, - partition='llmeval', - quotatype='auto', - max_num_workers=32, - task=dict( - type=SubjectiveEvalTask, - judge_cfg=judge_model - )), + partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=100, mode='singlescore', models=models), + runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)), ) -summarizer = dict( - type=MTBenchSummarizer, judge_type='single' -) +summarizer = dict(type=MTBenchSummarizer, judge_type='single') work_dir = 'outputs/mtbench/'
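
As a quick, hypothetical illustration of what the new generation_kwargs=dict(do_sample=True) entries in these configs ultimately control: OpenCompass's HuggingFace-based model wrappers forward such kwargs to the underlying transformers generate() call, which switches decoding from greedy search to stochastic sampling, so the model can produce varied open-ended answers for the judge model to score. The sketch below is not part of the patch; the checkpoint name, prompt, and sampling values are placeholders chosen only for illustration.

from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint for illustration only; the configs above point at THUDM/chatglm3-6b.
checkpoint = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer('Write a short poem about autumn.', return_tensors='pt')

# do_sample=True enables stochastic decoding, so repeated runs can give different
# answers; this is usually preferred over greedy decoding when generating
# open-ended responses that a judge LLM will later score, as in the subjective
# datasets configured above.
outputs = model.generate(
    **inputs,
    do_sample=True,
    temperature=0.8,   # illustrative value, not taken from the patch
    top_p=0.9,         # illustrative value, not taken from the patch
    max_new_tokens=64,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))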
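For reference, once one of the configs modified here is adapted to your environment, it is typically launched through OpenCompass's standard entry point, for example:

python run.py configs/eval_subjective_alignbench.py

The Slurm partitions, quota types, and the 'xxxx'/empty API key fields in these configs are environment-specific placeholders and need to be filled in (or supplied via $OPENAI_API_KEY) before running.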