[Sync] bump version (#1204)

Fengzhe Zhou 2024-05-28 23:09:59 +08:00 committed by GitHub
parent ba620c4afe
commit 2954913d9b
92 changed files with 3228 additions and 795 deletions

View File

@@ -35,6 +35,7 @@ repos:
exclude: |
(?x)^(
.*\.jsonl|
.*\.md.template|
configs/
)
- repo: https://github.com/pre-commit/pre-commit-hooks

View File

@@ -1,4 +0,0 @@
from mmengine.config import read_base
with read_base():
    from .charm_rea_gen_f8fca2 import charm_rea_datasets # noqa: F401, F403

View File

@@ -0,0 +1,50 @@
import os
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CharmDataset, charm_reason_postprocess, CharmReasonEvaluator
with read_base():
from .charm_reason_settings import charm_tasks, settings
settings = [s for s in settings if s[0] in ['ZH-CoT', 'EN-CoT']]
charm_reason_datasets = []
for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings:
for _task in charm_tasks:
_fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt')
with open(_fewshot_example_file, 'r') as f:
_hint = f.read()
charm_reason_reader_cfg = dict(input_columns=['input'], output_column='target')
charm_reason_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
charm_reason_eval_cfg = dict(
evaluator=dict(type=CharmReasonEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=charm_reason_postprocess),
dataset_postprocessor=dict(type=charm_reason_postprocess),
)
charm_reason_datasets.append(
dict(
type=CharmDataset,
path=dataset_path,
name=_task,
abbr='charm-reason-' + _task + '_' + _cot,
reader_cfg=charm_reason_reader_cfg,
infer_cfg=charm_reason_infer_cfg.copy(),
eval_cfg=charm_reason_eval_cfg.copy(),
)
)
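A note on the template formatting in the loop above: the CHARM templates write the few-shot hint with single braces ({_hint}) and the dataset field with double braces ({{input}}), so prompt_template.format(_hint=_hint) fills the hint now, collapses {{input}} to {input}, and leaves that placeholder for OpenCompass to substitute from the dataset's 'input' column at inference time. A minimal standalone sketch (the _hint text below is a hypothetical stand-in for the contents of a few-shot example file):

# Sketch only: brace escaping as used by the CHARM reasoning templates.
prompt_template = '请按照给定的例子回答问题。\n{_hint}\n\nQ{{input}}\nA'  # Other_template from charm_reason_settings
_cot_prefix = '让我们一步一步来思考。'  # ZH-CoT prefix from the settings tuples
_hint = '例1 ...\n例2 ...'  # hypothetical few-shot text read from f'{_task}_{_cot}.txt'

prompt = prompt_template.format(_hint=_hint) + _cot_prefix
assert '{input}' in prompt       # still a placeholder, filled later from the dataset
assert '{_hint}' not in prompt   # the few-shot hint is already inlined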

View File

@@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
    from .charm_reason_gen_f8fca2 import charm_reason_datasets # noqa: F401, F403

View File

@@ -0,0 +1,49 @@
import os
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CharmDataset, charm_reason_postprocess, CharmReasonEvaluator
with read_base():
from .charm_reason_settings import charm_tasks, settings
charm_reason_datasets = []
for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings:
for _task in charm_tasks:
_fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt')
with open(_fewshot_example_file, 'r') as f:
_hint = f.read()
charm_reason_reader_cfg = dict(input_columns=['input'], output_column='target')
charm_reason_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
charm_reason_eval_cfg = dict(
evaluator=dict(type=CharmReasonEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=charm_reason_postprocess),
dataset_postprocessor=dict(type=charm_reason_postprocess),
)
charm_reason_datasets.append(
dict(
type=CharmDataset,
path=dataset_path,
name=_task,
abbr='charm-reason-' + _task + '_' + _cot,
reader_cfg=charm_reason_reader_cfg,
infer_cfg=charm_reason_infer_cfg.copy(),
eval_cfg=charm_reason_eval_cfg.copy(),
)
)

View File

@@ -1,8 +1,4 @@
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CharmDataset, charm_rea_postprocess, CharmReaEvaluator
charm_tasks = [
'Chinese_Anachronisms_Judgment',
@@ -21,16 +17,16 @@ charm_tasks = [
'Global_Time_Understanding',
]
XLT_template = 'Follow the given examples and answer the question.\n{_hint}\n\n I want you to act as an commonsense reasoning expert for Chinese. \n Request: {{input}}\n'
Translate_EN_template = 'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
Other_template = '请按照给定的例子回答问题。\n{_hint}\n\nQ{{input}}\nA'
data_dir = 'data/CHARM'
dataset_path_ZH = f'{data_dir}/reasoning'
dataset_path_TransEn = f'{data_dir}/reasoning_Translate-EN'
fewshot_example_path_ZH = os.path.join(os.path.dirname(__file__), 'few-shot-examples')
fewshot_example_path_TransEn = os.path.join(os.path.dirname(__file__), 'few-shot-examples_Translate-EN')
XLT_template = 'Follow the given examples and answer the question.\n{_hint}\n\n I want you to act as an commonsense reasoning expert for Chinese. \n Request: {{input}}\n'
Translate_EN_template = 'Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: '
Other_template = '请按照给定的例子回答问题。\n{_hint}\n\nQ{{input}}\nA'
settings = [
('Direct', '', dataset_path_ZH, fewshot_example_path_ZH, Other_template),
('ZH-CoT', '让我们一步一步来思考。', dataset_path_ZH, fewshot_example_path_ZH, Other_template),
@@ -38,41 +34,3 @@ settings = [
('XLT', """You should retell the request in English.\nYou should do the answer step by step to choose the right answer.\nYou should step-by-step answer the request.\nYou should tell me the answer in this format 'So the answer is'.""", dataset_path_ZH, fewshot_example_path_ZH, XLT_template),
('Translate-EN', "Let's think step by step.", dataset_path_TransEn, fewshot_example_path_TransEn, Translate_EN_template),
]
charm_rea_datasets = []
for _cot, _cot_prefix, dataset_path, fewshot_example_path, prompt_template in settings:
for _task in charm_tasks:
_fewshot_example_file = os.path.join(fewshot_example_path, f'{_task}_{_cot}.txt')
with open(_fewshot_example_file, 'r') as f:
_hint = f.read()
charm_rea_reader_cfg = dict(input_columns=['input'], output_column='target')
charm_rea_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[dict(role='HUMAN', prompt=prompt_template.format(_hint=_hint) + _cot_prefix)]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
charm_rea_eval_cfg = dict(
evaluator=dict(type=CharmReaEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=charm_rea_postprocess),
dataset_postprocessor=dict(type=charm_rea_postprocess),
)
charm_rea_datasets.append(
dict(
type=CharmDataset,
path=dataset_path,
name=_task,
abbr='charm-rea-' + _task + '_' + _cot,
reader_cfg=charm_rea_reader_cfg,
infer_cfg=charm_rea_infer_cfg.copy(),
eval_cfg=charm_rea_eval_cfg.copy(),
)
)

View File

@@ -0,0 +1,186 @@
# GaokaoBench
## Base Models
| model | GaokaoBench |
|:------------------------:|--------------:|
| llama-7b-turbomind | 14.55 |
| llama-13b-turbomind | 16.20 |
| llama-30b-turbomind | 16.14 |
| llama-65b-turbomind | 13.31 |
| llama-2-7b-turbomind | 15.02 |
| llama-2-13b-turbomind | 14.86 |
| llama-2-70b-turbomind | 16.36 |
| llama-3-8b-turbomind | 20.88 |
| llama-3-70b-turbomind | 19.98 |
| internlm2-1.8b-turbomind | 23.78 |
| internlm2-7b-turbomind | 41.41 |
| internlm2-20b-turbomind | 58.99 |
| qwen-1.8b-turbomind | 22.11 |
| qwen-7b-turbomind | 35.32 |
| qwen-14b-turbomind | 54.07 |
| qwen-72b-turbomind | 77.56 |
| qwen1.5-0.5b-hf | 30.67 |
| qwen1.5-1.8b-hf | 35.66 |
| qwen1.5-4b-hf | 54.31 |
| qwen1.5-7b-hf | 65.99 |
| qwen1.5-14b-hf | 66.60 |
| qwen1.5-32b-hf | 79.01 |
| qwen1.5-72b-hf | 80.26 |
| qwen1.5-moe-a2-7b-hf | 52.79 |
| mistral-7b-v0.1-hf | 14.35 |
| mistral-7b-v0.2-hf | 11.10 |
| mixtral-8x7b-v0.1-hf | 8.40 |
| mixtral-8x22b-v0.1-hf | 16.23 |
| yi-6b-hf | 31.70 |
| yi-34b-hf | 30.51 |
| deepseek-7b-base-hf | 17.02 |
| deepseek-67b-base-hf | 10.14 |
### Details
| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
|:------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:|
| llama-7b-turbomind | 14.22 | 13.55 | 12.54 | 18.67 | 19.06 | 2.34 | 17.74 |
| llama-13b-turbomind | 18.81 | 15.89 | 21.25 | 22.67 | 15.62 | 1.56 | 25.81 |
| llama-30b-turbomind | 20.64 | 19.16 | 27.18 | 16.67 | 16.56 | 2.34 | 12.10 |
| llama-65b-turbomind | 21.10 | 15.89 | 11.50 | 20.00 | 5.94 | 1.56 | 21.77 |
| llama-2-7b-turbomind | 16.97 | 16.36 | 20.91 | 22.00 | 18.75 | 2.34 | 11.29 |
| llama-2-13b-turbomind | 14.68 | 11.68 | 26.13 | 16.00 | 17.81 | 2.34 | 20.97 |
| llama-2-70b-turbomind | 18.81 | 12.15 | 26.13 | 16.00 | 20.31 | 4.69 | 16.13 |
| llama-3-8b-turbomind | 4.13 | 7.94 | 37.63 | 24.67 | 26.25 | 5.47 | 21.77 |
| llama-3-70b-turbomind | 4.59 | 3.12 | 20.83 | 10.94 | 18.00 | 6.25 | 15.62 |
| internlm2-1.8b-turbomind | 20.64 | 22.90 | 39.72 | 30.00 | 25.94 | 10.94 | 31.45 |
| internlm2-7b-turbomind | 33.94 | 35.51 | 38.33 | 59.33 | 61.56 | 2.34 | 11.29 |
| internlm2-20b-turbomind | 59.17 | 51.40 | 65.16 | 74.00 | 82.19 | 28.91 | 54.03 |
| qwen-1.8b-turbomind | 29.36 | 30.84 | 19.51 | 26.00 | 22.19 | 5.47 | 27.42 |
| qwen-7b-turbomind | 22.48 | 28.04 | 45.64 | 43.33 | 62.19 | 3.91 | 33.87 |
| qwen-14b-turbomind | 54.13 | 56.25 | 82.93 | 72.00 | 85.00 | 4.69 | 65.62 |
| qwen-72b-turbomind | 73.12 | 64.49 | 91.67 | 90.62 | 58.75 | 44.53 | 79.03 |
| qwen1.5-0.5b-hf | 26.61 | 32.71 | 32.40 | 34.67 | 53.44 | 10.94 | 28.23 |
| qwen1.5-1.8b-hf | 36.24 | 33.18 | 56.45 | 36.00 | 49.38 | 6.25 | 33.06 |
| qwen1.5-4b-hf | 45.41 | 37.85 | 68.29 | 62.00 | 87.81 | 5.47 | 47.58 |
| qwen1.5-7b-hf | 56.42 | 53.74 | 85.02 | 69.33 | 86.88 | 28.12 | 70.16 |
| qwen1.5-14b-hf | 69.27 | 63.08 | 54.01 | 79.33 | 76.56 | 40.62 | 79.84 |
| qwen1.5-32b-hf | 71.10 | 61.68 | 92.68 | 93.33 | 95.94 | 45.31 | 83.06 |
| qwen1.5-72b-hf | 71.15 | 68.22 | 94.44 | 96.67 | 95.00 | 38.28 | 75.00 |
| qwen1.5-moe-a2-7b-hf | 35.32 | 29.44 | 68.64 | 44.67 | 75.00 | 17.97 | 59.68 |
| mistral-7b-v0.1-hf | 13.76 | 12.15 | 9.76 | 8.00 | 5.94 | 0.00 | 17.74 |
| mistral-7b-v0.2-hf | 6.88 | 5.61 | 10.45 | 12.00 | 4.06 | 0.78 | 14.52 |
| mixtral-8x7b-v0.1-hf | 3.67 | 1.87 | 0.35 | 0.00 | 0.00 | 0.78 | 0.81 |
| mixtral-8x22b-v0.1-hf | 16.51 | 15.89 | 1.39 | 3.33 | 9.69 | 0.00 | 13.71 |
| yi-6b-hf | 6.25 | 3.12 | 40.74 | 43.75 | 35.94 | 8.59 | 31.25 |
| yi-34b-hf | 12.50 | 4.17 | 31.11 | 5.00 | 20.62 | 2.34 | 0.89 |
| deepseek-7b-base-hf | 14.22 | 13.08 | 25.78 | 20.67 | 20.31 | 5.47 | 18.55 |
| deepseek-67b-base-hf | 3.67 | 4.21 | 8.36 | 7.33 | 4.69 | 1.56 | 4.84 |

| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
|:------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:|
| llama-7b-turbomind | 19.05 | 0.00 | 15.00 | 16.15 | 22.11 | 10.43 | 15.00 |
| llama-13b-turbomind | 22.86 | 0.00 | 8.50 | 8.46 | 24.21 | 9.36 | 20.00 |
| llama-30b-turbomind | 28.57 | 0.00 | 6.33 | 13.85 | 23.16 | 12.98 | 12.50 |
| llama-65b-turbomind | 21.90 | 0.00 | 8.00 | 13.85 | 16.84 | 12.34 | 10.00 |
| llama-2-7b-turbomind | 20.95 | 0.00 | 6.17 | 12.31 | 22.11 | 11.28 | 11.25 |
| llama-2-13b-turbomind | 16.19 | 0.00 | 9.83 | 13.08 | 22.11 | 7.66 | 10.00 |
| llama-2-70b-turbomind | 31.43 | 0.00 | 4.17 | 13.08 | 25.26 | 20.43 | 7.50 |
| llama-3-8b-turbomind | 1.90 | 1.15 | 42.00 | 7.69 | 29.47 | 17.66 | 17.50 |
| llama-3-70b-turbomind | 18.75 | 3.45 | 53.67 | 76.15 | 18.60 | 36.76 | 8.75 |
| internlm2-1.8b-turbomind | 33.33 | 3.45 | 15.67 | 13.85 | 32.63 | 10.43 | 25.00 |
| internlm2-7b-turbomind | 61.90 | 20.69 | 57.33 | 20.77 | 61.05 | 40.21 | 47.50 |
| internlm2-20b-turbomind | 72.38 | 37.93 | 62.33 | 19.23 | 74.74 | 38.51 | 48.75 |
| qwen-1.8b-turbomind | 47.62 | 9.20 | 13.50 | 12.31 | 25.26 | 16.38 | 21.25 |
| qwen-7b-turbomind | 42.86 | 12.64 | 35.83 | 26.15 | 51.58 | 17.87 | 30.00 |
| qwen-14b-turbomind | 89.58 | 3.45 | 5.00 | 23.85 | 93.02 | 21.10 | 40.62 |
| qwen-72b-turbomind | 71.43 | 81.25 | 88.17 | 96.25 | 95.79 | 79.57 | 90.00 |
| qwen1.5-0.5b-hf | 40.95 | 22.99 | 21.67 | 21.54 | 38.95 | 17.02 | 22.50 |
| qwen1.5-1.8b-hf | 85.71 | 29.89 | 22.17 | 30.00 | 34.74 | 20.43 | 27.50 |
| qwen1.5-4b-hf | 88.57 | 35.63 | 41.00 | 67.69 | 64.21 | 41.28 | 68.75 |
| qwen1.5-7b-hf | 93.33 | 14.94 | 59.33 | 70.00 | 61.05 | 67.87 | 61.25 |
| qwen1.5-14b-hf | 94.29 | 16.09 | 59.67 | 76.92 | 90.53 | 59.57 | 77.50 |
| qwen1.5-32b-hf | 94.29 | 43.68 | 82.83 | 38.46 | 97.89 | 75.96 | 67.50 |
| qwen1.5-72b-hf | 99.05 | 28.74 | 85.62 | 77.69 | 94.74 | 72.77 | 87.50 |
| qwen1.5-moe-a2-7b-hf | 65.71 | 36.78 | 51.67 | 75.38 | 72.63 | 61.28 | 33.75 |
| mistral-7b-v0.1-hf | 17.14 | 8.05 | 28.33 | 6.92 | 24.21 | 30.43 | 12.50 |
| mistral-7b-v0.2-hf | 7.62 | 9.20 | 23.17 | 6.15 | 25.26 | 19.15 | 7.50 |
| mixtral-8x7b-v0.1-hf | 0.00 | 4.60 | 33.83 | 10.77 | 37.89 | 25.96 | 3.75 |
| mixtral-8x22b-v0.1-hf | 7.62 | 4.17 | 51.33 | 14.62 | 53.68 | 21.91 | 10.00 |
| yi-6b-hf | 17.14 | 52.87 | 50.83 | 36.25 | 36.84 | 48.09 | 36.25 |
| yi-34b-hf | 0.00 | 59.77 | 76.67 | 86.92 | 67.44 | 61.06 | 81.25 |
| deepseek-7b-base-hf | 20.95 | 2.30 | 17.83 | 12.31 | 25.26 | 12.55 | 8.75 |
| deepseek-67b-base-hf | 1.90 | 9.20 | 27.33 | 30.00 | 40.00 | 13.19 | 3.75 |
## Chat Models
| model | GaokaoBench |
|:-----------------------------:|--------------:|
| qwen1.5-0.5b-chat-hf | 21.51 |
| qwen1.5-1.8b-chat-hf | 46.19 |
| qwen1.5-4b-chat-hf | 59.11 |
| qwen1.5-7b-chat-hf | 70.55 |
| qwen1.5-14b-chat-hf | 80.39 |
| qwen1.5-32b-chat-hf | 86.15 |
| qwen1.5-72b-chat-hf | 88.58 |
| qwen1.5-110b-chat-hf | 89.59 |
| internlm2-chat-1.8b-hf | 29.73 |
| internlm2-chat-1.8b-sft-hf | 28.79 |
| internlm2-chat-7b-hf | 54.54 |
| internlm2-chat-7b-sft-hf | 55.39 |
| internlm2-chat-20b-hf | 57.95 |
| internlm2-chat-20b-sft-hf | 57.62 |
| llama-3-8b-instruct-hf | 45.48 |
| llama-3-70b-instruct-hf | 65.91 |
| llama-3-8b-instruct-lmdeploy | 44.48 |
| llama-3-70b-instruct-lmdeploy | 67.06 |
| mistral-7b-instruct-v0.1-hf | 26.21 |
| mistral-7b-instruct-v0.2-hf | 32.17 |
| mixtral-8x7b-instruct-v0.1-hf | 42.46 |
### Details
| model | 2010-2022_Math_II_MCQs | 2010-2022_Math_I_MCQs | 2010-2022_History_MCQs | 2010-2022_Biology_MCQs | 2010-2022_Political_Science_MCQs | 2010-2022_Physics_MCQs | 2010-2022_Chemistry_MCQs |
|:-----------------------------:|-------------------------:|------------------------:|-------------------------:|-------------------------:|-----------------------------------:|-------------------------:|---------------------------:|
| qwen1.5-0.5b-chat-hf | 25.23 | 25.70 | 39.02 | 24.67 | 25.00 | 0.78 | 25.00 |
| qwen1.5-1.8b-chat-hf | 30.28 | 26.64 | 61.32 | 55.33 | 77.81 | 11.72 | 40.32 |
| qwen1.5-4b-chat-hf | 38.53 | 35.05 | 70.73 | 70.00 | 83.44 | 25.00 | 41.13 |
| qwen1.5-7b-chat-hf | 49.54 | 39.72 | 81.88 | 82.67 | 90.62 | 46.88 | 61.29 |
| qwen1.5-14b-chat-hf | 64.68 | 54.21 | 87.80 | 90.67 | 94.69 | 44.53 | 69.35 |
| qwen1.5-32b-chat-hf | 70.92 | 66.14 | 98.02 | 97.74 | 96.07 | 57.81 | 72.92 |
| qwen1.5-72b-chat-hf | 76.61 | 68.22 | 95.47 | 96.00 | 97.19 | 64.06 | 86.29 |
| qwen1.5-110b-chat-hf | 80.36 | 66.67 | 100.00 | 100.00 | 96.25 | 65.62 | 75.00 |
| internlm2-chat-1.8b-hf | 28.44 | 28.50 | 46.69 | 39.33 | 44.38 | 10.16 | 26.61 |
| internlm2-chat-1.8b-sft-hf | 23.85 | 20.09 | 55.75 | 40.67 | 53.12 | 14.84 | 30.65 |
| internlm2-chat-7b-hf | 45.87 | 42.52 | 77.70 | 75.33 | 76.56 | 16.41 | 38.71 |
| internlm2-chat-7b-sft-hf | 49.08 | 39.72 | 80.84 | 68.67 | 81.25 | 29.69 | 42.74 |
| internlm2-chat-20b-hf | 53.21 | 46.73 | 80.49 | 74.00 | 85.00 | 31.25 | 37.10 |
| internlm2-chat-20b-sft-hf | 51.83 | 47.20 | 86.06 | 78.00 | 88.12 | 35.16 | 45.16 |
| llama-3-8b-instruct-hf | 37.16 | 31.31 | 60.98 | 48.67 | 51.25 | 11.72 | 39.52 |
| llama-3-70b-instruct-hf | 58.26 | 52.34 | 63.76 | 75.33 | 75.31 | 36.72 | 53.23 |
| llama-3-8b-instruct-lmdeploy | 37.61 | 35.51 | 55.05 | 53.33 | 52.19 | 7.81 | 34.68 |
| llama-3-70b-instruct-lmdeploy | 75.00 | 55.56 | 61.11 | 73.68 | 70.00 | 40.62 | 43.75 |
| mistral-7b-instruct-v0.1-hf | 23.39 | 21.03 | 35.19 | 18.00 | 26.56 | 5.47 | 30.65 |
| mistral-7b-instruct-v0.2-hf | 31.19 | 19.63 | 38.33 | 40.00 | 35.94 | 20.31 | 34.68 |
| mixtral-8x7b-instruct-v0.1-hf | 41.28 | 37.85 | 52.26 | 47.33 | 50.00 | 25.78 | 43.55 |

| model | 2010-2013_English_MCQs | 2010-2022_Chinese_Modern_Lit | 2010-2022_English_Fill_in_Blanks | 2012-2022_English_Cloze_Test | 2010-2022_Geography_MCQs | 2010-2022_English_Reading_Comp | 2010-2022_Chinese_Lang_and_Usage_MCQs |
|:-----------------------------:|-------------------------:|-------------------------------:|-----------------------------------:|-------------------------------:|---------------------------:|---------------------------------:|----------------------------------------:|
| qwen1.5-0.5b-chat-hf | 32.38 | 10.34 | 0.00 | 2.31 | 27.37 | 15.11 | 18.75 |
| qwen1.5-1.8b-chat-hf | 69.52 | 42.53 | 56.33 | 2.31 | 61.05 | 32.98 | 35.00 |
| qwen1.5-4b-chat-hf | 70.48 | 58.62 | 82.33 | 16.15 | 68.42 | 68.51 | 47.50 |
| qwen1.5-7b-chat-hf | 83.81 | 71.26 | 85.17 | 57.69 | 81.05 | 78.94 | 66.25 |
| qwen1.5-14b-chat-hf | 93.33 | 78.16 | 97.17 | 71.54 | 91.58 | 94.26 | 81.25 |
| qwen1.5-32b-chat-hf | 100.00 | 81.61 | 95.83 | 90.00 | 97.89 | 92.43 | 92.86 |
| qwen1.5-72b-chat-hf | 98.10 | 83.91 | 98.00 | 90.77 | 94.74 | 96.38 | 96.25 |
| qwen1.5-110b-chat-hf | 100.00 | 91.95 | 98.50 | 97.69 | 95.35 | 98.44 | 100.00 |
| internlm2-chat-1.8b-hf | 38.10 | 6.90 | 0.67 | 1.54 | 56.84 | 23.19 | 30.00 |
| internlm2-chat-1.8b-sft-hf | 50.48 | 0.00 | 0.00 | 0.00 | 27.37 | 11.91 | 32.50 |
| internlm2-chat-7b-hf | 60.95 | 67.82 | 7.00 | 7.69 | 70.53 | 79.79 | 38.75 |
| internlm2-chat-7b-sft-hf | 60.00 | 71.26 | 6.50 | 0.77 | 68.42 | 77.02 | 42.50 |
| internlm2-chat-20b-hf | 60.95 | 43.68 | 34.83 | 4.62 | 71.58 | 62.55 | 43.75 |
| internlm2-chat-20b-sft-hf | 75.24 | 47.13 | 1.00 | 2.31 | 80.00 | 65.96 | 37.50 |
| llama-3-8b-instruct-hf | 50.48 | 36.78 | 30.83 | 21.54 | 57.89 | 81.70 | 28.75 |
| llama-3-70b-instruct-hf | 73.33 | 59.77 | 82.83 | 24.62 | 73.68 | 91.28 | 45.00 |
| llama-3-8b-instruct-lmdeploy | 52.38 | 42.53 | 21.33 | 18.46 | 58.95 | 81.28 | 26.25 |
| llama-3-70b-instruct-lmdeploy | 87.50 | 62.07 | 84.38 | 26.92 | 72.63 | 91.20 | 56.25 |
| mistral-7b-instruct-v0.1-hf | 38.10 | 18.39 | 30.50 | 6.15 | 31.58 | 38.72 | 18.75 |
| mistral-7b-instruct-v0.2-hf | 41.90 | 31.03 | 28.00 | 20.77 | 29.47 | 42.13 | 15.00 |
| mixtral-8x7b-instruct-v0.1-hf | 49.52 | 39.08 | 41.33 | 9.23 | 44.21 | 43.19 | 21.25 |

View File

@@ -1,114 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题请你一步一步推理并在最后用“所以答案为选项X”给出答案其中“X”为选项ABCD中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:',
'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from ABCD. Here is the question you need to answer:\n{question}\nLet's think step by step:",
'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:',
}
cloze_prompts = {
'cloze_cn': [
dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后将有21棵树。林务工人员今天种植了多少棵树'),
dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
dict(role='HUMAN', prompt='Q: 如果停车场有3辆车又有2辆车进来停车场里有多少辆车'),
dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力她的妹妹有42块。如果他们吃了35块他们总共剩下多少块'),
dict(role='BOT', prompt='A: 黎恩有32块巧克力Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖'),
dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
dict(role='BOT', prompt='A: 她有5个玩具。他从妈妈那里得到了2个所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个所以总共他有7 + 2 = 9个玩具。答案是 9\n'),
dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑所以现在有9 + 20 = 29台电脑。答案是 29\n'),
dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二他丢失了23个高尔夫球。星期三他又丢失了2个。星期三结束时他还剩下多少个高尔夫球'),
dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱'),
dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}'),
],
'cloze_en': [
dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}\n'),
]}
mathbench_sets = {
# Practice Part
'college': ['single_choice_cn', 'single_choice_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn', 'single_choice_en'],
'primary': ['cloze_cn', 'cloze_en'],
'arithmetic': ['cloze_en'],
# Theory part
'college_knowledge': ['single_choice_cn','single_choice_en'],
'high_knowledge': ['single_choice_cn','single_choice_en'],
'middle_knowledge': ['single_choice_cn','single_choice_en'],
'primary_knowledge': ['single_choice_cn','single_choice_en'],
}
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True
mathbench_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
mathbench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name],
),
dict(role='BOT', prompt='{answer}')] if 'choice' in _name else cloze_prompts[_name],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
mathbench_eval_cfg = dict(
evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
mathbench_datasets.append(
dict(
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'./data/mathbench_v1/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=dict(
input_columns=['question'],
output_column='answer'
),
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
))

View File

@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
with read_base():
from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
# Max for this dataset is 4
num_shot = 4
# Generate reasoning path or not, only for single choice
with_reasoning = False
# Use circular evaluation or not
with_circular_eval = True
# Use PPL mode in single choice test or not
use_ppl_single_choice = False
assert 0 <= num_shot <= 4
if num_shot == 0:
prompts = zero_shot_prompts
else:
prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
mathbench_datasets = []
for _split in mathbench_sets:
for _name in mathbench_sets[_split]:
if 'single_choice' in _name:
if with_reasoning:
template_round = prompts[_name + '_with_reasoning']
else:
template_round = prompts[_name]
else:
template_round = prompts[_name]
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
# assemble the final config
mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
template = {}
for answer in ['A', 'B', 'C', 'D']:
one_template_round = deepcopy(template_round)
one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer)
template[answer] = dict(round=one_template_round)
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=template),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
mathbench_datasets.append(
dict(
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'data/mathbench_v1/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)
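A note on the few-shot slicing above: prompts = {name: p[- 2 * num_shot - 2:] ...} relies on every few_shot_prompts entry being a flat list of alternating HUMAN/BOT turns that ends with the {question}/{answer} template pair, so the slice keeps the last num_shot worked examples plus that final pair. A small sketch with hypothetical placeholder turns:

# Sketch only: how the slice selects exemplars (placeholder prompts, not the real data).
few_shot = [
    dict(role='HUMAN', prompt='Q1'), dict(role='BOT', prompt='A1'),
    dict(role='HUMAN', prompt='Q2'), dict(role='BOT', prompt='A2'),
    dict(role='HUMAN', prompt='Q3'), dict(role='BOT', prompt='A3'),
    dict(role='HUMAN', prompt='Q4'), dict(role='BOT', prompt='A4'),
    dict(role='HUMAN', prompt='{question}'), dict(role='BOT', prompt='{answer}'),
]
num_shot = 2
selected = few_shot[- 2 * num_shot - 2:]
assert len(selected) == 2 * num_shot + 2   # Q3/A3, Q4/A4, then the {question}/{answer} pair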

View File

@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
with read_base():
from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
# Max for this dataset is 4
num_shot = 4
# Generate reasoning path or not, only for single choice
with_reasoning = False
# Use circular evaluation or not
with_circular_eval = False
# Use PPL mode in single choice test or not
use_ppl_single_choice = False
assert 0 <= num_shot <= 4
if num_shot == 0:
prompts = zero_shot_prompts
else:
prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
mathbench_datasets = []
for _split in mathbench_sets:
for _name in mathbench_sets[_split]:
if 'single_choice' in _name:
if with_reasoning:
template_round = prompts[_name + '_with_reasoning']
else:
template_round = prompts[_name]
else:
template_round = prompts[_name]
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
# assemble the final config
mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
template = {}
for answer in ['A', 'B', 'C', 'D']:
one_template_round = deepcopy(template_round)
one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer)
template[answer] = dict(round=one_template_round)
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=template),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
mathbench_datasets.append(
dict(
abbr='mathbench-wocircular-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'data/mathbench_v1/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)

View File

@@ -1,124 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题请你一步一步推理并在最后用“所以答案为选项X”给出答案其中“X”为选项ABCD中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:',
'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from ABCD. Here is the question you need to answer:\n{question}\nLet's think step by step:",
'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:',
}
cloze_prompts = {
'cloze_cn': [
dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后将有21棵树。林务工人员今天种植了多少棵树'),
dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
dict(role='HUMAN', prompt='Q: 如果停车场有3辆车又有2辆车进来停车场里有多少辆车'),
dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力她的妹妹有42块。如果他们吃了35块他们总共剩下多少块'),
dict(role='BOT', prompt='A: 黎恩有32块巧克力Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖'),
dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
dict(role='BOT', prompt='A: 她有5个玩具。他从妈妈那里得到了2个所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个所以总共他有7 + 2 = 9个玩具。答案是 9\n'),
dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑所以现在有9 + 20 = 29台电脑。答案是 29\n'),
dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二他丢失了23个高尔夫球。星期三他又丢失了2个。星期三结束时他还剩下多少个高尔夫球'),
dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱'),
dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}'),
],
'cloze_en': [
dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}\n'),
],
}
mathbench_sets = {
# Practice Part
'college': ['single_choice_cn', 'single_choice_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn', 'single_choice_en'],
'primary': ['cloze_cn', 'cloze_en'],
'calculate': ['cloze_en'],
# Theory part
'college_knowledge': ['single_choice_cn', 'single_choice_en'],
'high_knowledge': ['single_choice_cn', 'single_choice_en'],
'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
}
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = False
mathbench_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
mathbench_reader_cfg = dict(
input_columns=['question'],
output_column='answer',
)
if 'single_choice' in _name:
template_round = [
dict(role='HUMAN', prompt=(single_choice_prompts[_name + '_with_reasoning'] if with_reasoning else single_choice_prompts[_name])),
dict(role='BOT', prompt='{answer}')
]
else:
template_round = cloze_prompts[_name]
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
mathbench_eval_cfg = dict(
evaluator=evaluator,
pred_postprocessor=pred_postprocessor,
)
mathbench_datasets.append(
dict(
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'./data/mathbench_v1_ori/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)

View File

@@ -0,0 +1,81 @@
from mmengine.config import read_base
from copy import deepcopy
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
with read_base():
from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets
# Max for this dataset is 4
num_shot = 4
# Generate reasoning path or not, only for single choice
with_reasoning = False
# Use circular evaluation or not
with_circular_eval = False
# Use PPL mode in single choice test or not
use_ppl_single_choice = True
assert 0 <= num_shot <= 4
if num_shot == 0:
prompts = zero_shot_prompts
else:
prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()}
mathbench_datasets = []
for _split in mathbench_sets:
for _name in mathbench_sets[_split]:
if 'single_choice' in _name:
if with_reasoning:
template_round = prompts[_name + '_with_reasoning']
else:
template_round = prompts[_name]
else:
template_round = prompts[_name]
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
# assemble the final config
mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer')
if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning:
template = {}
for answer in ['A', 'B', 'C', 'D']:
one_template_round = deepcopy(template_round)
one_template_round[-1]['prompt'] = one_template_round[-1]['prompt'].format(answer=answer)
template[answer] = dict(round=one_template_round)
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=template),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor)
mathbench_datasets.append(
dict(
abbr='mathbench-wocircular-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'data/mathbench_v1/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)
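Because this file sets use_ppl_single_choice = True, the PPL branch above is the one that actually runs: for each option letter it deep-copies the round, formats only the final BOT turn with that letter, and stores the result under the letter so PPLInferencer can score four complete candidate conversations. A minimal sketch of that expansion (the two-turn round is hypothetical, not the real MathBench prompt):

# Sketch only: building per-option templates for PPL scoring.
from copy import deepcopy

template_round = [
    dict(role='HUMAN', prompt='Question: {question}'),
    dict(role='BOT', prompt='Response: {answer}'),
]
template = {}
for answer in ['A', 'B', 'C', 'D']:
    one_round = deepcopy(template_round)
    one_round[-1]['prompt'] = one_round[-1]['prompt'].format(answer=answer)   # fill only the BOT turn
    template[answer] = dict(round=one_round)

assert template['B']['round'][-1]['prompt'] == 'Response: B'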

View File

@@ -1,114 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
'single_choice_cn_with_reasoning': '以下是一道关于数学的单项选择题请你一步一步推理并在最后用“所以答案为选项X”给出答案其中“X”为选项ABCD中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n',
'single_choice_cn': '以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:\n',
'single_choice_en_with_reasoning': "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from ABCD. Here is the question you need to answer:\n{question}\nLet's think step by step:\n",
'single_choice_en': 'Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:\n',
}
cloze_prompts = {
'cloze_cn': [
'Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后将有21棵树。林务工人员今天种植了多少棵树\nA: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以他们必须种植了21 - 15 = 6棵树。答案是 6',
'Q: 如果停车场有3辆车又有2辆车进来停车场里有多少辆车\nA: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5',
'Q: 黎恩有32块巧克力她的妹妹有42块。如果他们吃了35块他们总共剩下多少块\nA: 黎恩有32块巧克力Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39',
'Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖\nA: 杰森有20个棒棒糖。因为他现在只剩下12个所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8',
'Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?\nA: 她有5个玩具。他从妈妈那里得到了2个所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个所以总共他有7 + 2 = 9个玩具。答案是 9',
'Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?\nA: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑所以现在有9 + 20 = 29台电脑。答案是 29',
'Q: 迈克尔有58个高尔夫球。星期二他丢失了23个高尔夫球。星期三他又丢失了2个。星期三结束时他还剩下多少个高尔夫球\nA: 迈克尔一开始有58个球。星期二他丢失了23个所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个所以现在他还剩下35 - 2 = 33个球。答案是 33',
'Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱\nA: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元所以现在她还剩下23 - 15 = 8美元。答案是 8',
'Q: {question}\nA: {answer}',
],
'cloze_en': [
'Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\nA: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.',
'Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\nA: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.',
"Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\nA: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.",
'Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\nA: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.',
'Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\nA: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.',
'Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\nA: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.',
'Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\nA: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.',
'Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\nA: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.',
'Q: {question}\nA: {answer}',
],
}
mathbench_sets = {
# Practice Part
'college': ['single_choice_cn', 'single_choice_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn', 'single_choice_en'],
'primary': ['cloze_cn', 'cloze_en'],
'calculate': ['cloze_en'],
# Theory part
'college_knowledge': ['single_choice_cn', 'single_choice_en'],
'high_knowledge': ['single_choice_cn', 'single_choice_en'],
'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
}
# Generate reasoning path or not, only for single choice
with_reasoning = False
# Use circular evaluation or not
with_circular_eval = False
mathbench_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
mathbench_reader_cfg = dict(
input_columns=['question'],
output_column='answer',
)
if 'single_choice' in _name:
if with_reasoning:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=single_choice_prompts[_name + '_with_reasoning']),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template={answer: f'{single_choice_prompts[_name]}{answer}' for answer in ['A', 'B', 'C', 'D']}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
else:
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template='\n'.join(cloze_prompts[_name])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
if 'single_choice' in _name:
pred_postprocessor = dict(type=first_option_postprocess, options='ABCD')
else:
pred_postprocessor = dict(type=mathbench_postprocess, name=_name)
if 'single_choice' in _name and with_circular_eval:
evaluator = dict(type=CircularEvaluator)
else:
evaluator = dict(type=AccEvaluator)
mathbench_eval_cfg = dict(
evaluator=evaluator,
pred_postprocessor=pred_postprocessor,
)
mathbench_datasets.append(
dict(
abbr='mathbench-' + _split + '-' + _name,
type=MathBenchDataset,
path=f'./data/mathbench_v1_ori/{_split}',
name=_name,
with_circular=with_circular_eval,
reader_cfg=mathbench_reader_cfg,
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
)
)

View File

@@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
    from .mathbench_2024_gen_19e486 import mathbench_datasets # noqa: F401, F403
    from .mathbench_2024_gen_1dc21d import mathbench_datasets # noqa: F401, F403

View File

@@ -0,0 +1,103 @@
zero_shot_prompts = {
'single_choice_cn_with_reasoning': [
dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题请你一步一步推理并在最后用“所以答案为选项X”给出答案其中“X”为选项ABCD中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:'),
],
'single_choice_cn': [
dict(role='HUMAN', prompt='问题: 以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n所以答案是:'),
],
'single_choice_en_with_reasoning': [
dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with "Therefore, the correct answer is option X", Where "X" is the correct option you think from ABCD. Here is the question you need to answer:\n{question}\nLet\'s think step by step:'),
],
'single_choice_en': [
dict(role='HUMAN', prompt='Question: Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nThe answer is:'),
],
}
few_shot_prompts = {
'single_choice_cn': [
dict(role='HUMAN', prompt='问题: 已知i是虚数单位z为复数$2+\\frac{1}{i}=z(3+i)$则在复平面内z对应的点位于____?\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 第四象限'),
dict(role='BOT', prompt='回答: D'),
dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
dict(role='BOT', prompt='回答: D'),
dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$其中a,b为实数 \nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. $a=-1,b=-1$'),
dict(role='BOT', prompt='回答: A'),
dict(role='HUMAN', prompt='问题: 圆锥的底面半径为2高为4.一个圆柱的下底面在圆锥的底面上,上底面的圆周在圆锥的侧面上,当圆柱侧面积为$4 \\pi$时该圆柱的体积为____\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'),
dict(role='BOT', prompt='回答: B'),
dict(role='HUMAN', prompt='问题: {question}'),
dict(role='BOT', prompt='回答: {answer}'),
],
'single_choice_cn_with_reasoning' : [
dict(role='HUMAN', prompt='问题: 已知i是虚数单位z为复数$2+\\frac{1}{i}=z(3+i)$则在复平面内z对应的点位于____\nA. 第一象限\nB. 第二象限\nC. 第三象限\nD. 第四象限'),
dict(role='BOT', prompt='回答: 因为首先,我们将等式两边同时乘以$(3 + i)$的共轭复数$(3 - i)$,以便消去分母中的虚数部分:$z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$,这样做的目的是利用复数乘法的性质,从而消去分母中的虚数部分。我们进行乘法运算:$z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$$z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$,由于$i^2 = -1$,我们可以进一步简化:$z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{{1}{2} - \\frac{1}{2}i$,在复平面上,这个点位于第四象限,因为它的实部是正的($\\frac{1}{2}$),而虚部是负的($-\\frac{1}{2}$\n答案选项D'),
dict(role='HUMAN', prompt='问题: 将函数$y=\\tan(\\omega x-1)(\\omega>0)$的图像向左平移2个单位长度后,与函数$y=\\tan(\\omega x+3)$的图象重合,则的最小值等于____\nA. $2-\\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
dict(role='BOT', prompt='回答: 将函数$y=\\tan(\\omega x-1)$(其中$\\omega>0$的图像向左平移2个单位长度后得到的新函数为$y=\\tan(\\omega(x+2)-1)$。为了使这个新函数的图像与$y=\\tan(\\omega x+3)$的图像重合,我们需要满足以下条件:\n$\\tan(\\omega(x+2)-1) = \\tan(\\omega x+3)$\n由于正切函数的周期性,我们知道$\\tan(\\theta) = \\tan(\\theta + k\\pi)$,其中$k$是任意整数。因此,我们可以写出:\n$\\omega(x+2)-1 = \\omega x+3 + k\\pi$\n解这个方程,我们得到:$\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$\n$2\\omega - 1 = 3 + k\\pi$\n$\\omega = 2 + \\frac{k\\pi}{2}$\n由于$\\omega>0$,我们需要找到满足条件的最小正整数$k$。当$k=0$时,我们得到$\\omega=2$这是满足条件的最小正值。因此最小值等于2。答案选项D'),
dict(role='HUMAN', prompt='问题: 设$(1+2i)a+b=2i$其中a,b为实数 \nA. $a=1,b=-1$\nB. $a=1,b=1$\nC. $a=-1,b=1$\nD. $a=-1,b=-1$'),
dict(role='BOT', prompt='回答: 我们可以将这个方程写成:$a + 2ai + b = 2i$,然后将实部和虚部分开:\n$a + b = 0$\n$2a = 2$\n由于$a + b = 0$,我们可以得到$a = -b$。将这个值代入$2a = 2$,我们得到$2(-b) = 2$,因此$b = -1$。因此,$a = 1$。答案选项A'),
dict(role='HUMAN', prompt='问题: 一直角三角形的两边长分别为3和4则第三边的长为____\nA. 5\nB. $\\sqrt 7$\nC. $\\sqrt 5$\nD. 5或者$\\sqrt 7$'),
dict(role='BOT', prompt='回答: 我们可以使用勾股定理来解决这个问题。勾股定理告诉我们,如果一个三角形的两条边长分别为$a$和$b$,那么第三条边的长度为$c = \\sqrt{a^2 + b^2}$。在这个问题中,$a = 3$$b = 4$,因此$c = \\sqrt{3^2 + 4^2} = \\sqrt{9 + 16} = \\sqrt{25} = 5$。答案选项A'),
dict(role='HUMAN', prompt='问题: {question}'),
dict(role='BOT', prompt='回答: {answer}'),
],
'single_choice_en': [
dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'),
dict(role='BOT', prompt='Response: D'),
dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
dict(role='BOT', prompt='Response: D'),
dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'),
dict(role='BOT', prompt='Response: A'),
dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'),
dict(role='BOT', prompt='Response: B'),
dict(role='HUMAN', prompt='Question: {question}'),
dict(role='BOT', prompt='Response: {answer}'),
],
'single_choice_en_with_reasoning': [
dict(role='HUMAN', prompt='Question: Given that $i$ is the imaginary unit and $z$ is a complex number, $2 + \\frac{1}{i} = z(3 + i)$, the point corresponding to $z$ in the complex plane is located in the ______\nA. First quadrant\nB. Second quadrant\nC. Third quadrant\nD. Fourth quadrant'),
dict(role='BOT', prompt='Response: First, we multiply both sides of the equation by the conjugate of $(3 + i)$: $z = \\frac{2 - i}{3 + i} \\cdot \\frac{3 - i}{3 - i}$. We perform the multiplication: $z = \\frac{(2 - i)(3 - i)}{(3 + i)(3 - i)}$, $z = \\frac{6 - 2i - 3i + i^2}{9 - 3i + 3i - i^2}$. Since $i^2 = -1$, we can simplify this further: $z = \\frac{6 - 5i - 1}{9 + 1}$ $z = \\frac{1}{2} - \\frac{1}{2}i$. In the complex plane, this point is located in the fourth quadrant, because its real part is positive ($\\frac{1}{2}$) and its imaginary part is negative ($-\\frac{1}{2}$)\nAnswer option: D'),
dict(role='HUMAN', prompt='Question: The graph of the function $y = \\tan(\\omega x - 1)$ is shifted 2 units to the left and coincides with the graph of the function $y = \\tan(\\omega x + 3)$. The minimum value of $\\omega$ is ______\nA. $2 - \\frac{\\pi}{2}$\nB. 1\nC. $\\pi - 2$\nD. 2'),
dict(role='BOT', prompt='Response: In order for the graph of this new function to coincide with the graph of $y = \\tan(\\omega x + 3)$, we need to satisfy the following condition: $\\tan(\\omega(x + 2) - 1) = \\tan(\\omega x + 3)$. Therefore, we can write: $\\omega(x + 2) - 1 = \\omega x + 3 + k\\pi$. Solving this equation, we get: $\\omega x + 2\\omega - 1 = \\omega x + 3 + k\\pi$. $2\\omega - 1 = 3 + k\\pi$. $\\omega = 2 + \\frac{k\\pi}{2}$. Since $\\omega > 0$, we need to find the smallest positive integer $k$ that satisfies the condition. When $k = 0$, we get $\\omega = 2$, which is the smallest positive value that satisfies the condition. Therefore, the minimum value is 2. Answer option: D'),
dict(role='HUMAN', prompt='Question: If $(1 + 2i)a + b = 2i$, where $a$ and $b$ are real numbers, then ______\nA. $a = 1, b = -1$\nB. $a = 1, b = 1$\nC. $a = -1, b = 1$\nD. $a = -1, b = -1$'),
dict(role='BOT', prompt='Response: We can write this equation as: $a + 2ai + b = 2i$, and then separate the real and imaginary parts: $a + b = 0$. $2a = 2$. Since $a + b = 0$, we can get $a = -b$. Substituting this value into $2a = 2$, we get $2(-b) = 2$, so $b = -1$. Therefore, $a = 1$. Answer option: A'),
dict(role='HUMAN', prompt='Question: The radius of a cone is 2 and its height is 4. A cylinder has its lower base on the base of the cone and its upper base on the lateral surface of the cone. When the lateral surface area of the cylinder is $4\\pi$, the volume of the cylinder is ______\nA. $\\pi$\nB. $2\\pi$\nC. $3\\pi$\nD. $4\\pi$'),
dict(role='BOT', prompt='Response: We can use the Pythagorean theorem to solve this problem. The Pythagorean theorem tells us that if the two sides of a triangle are $a$ and $b$, then the length of the third side is $c = \\sqrt{a^2 + b^2}$. In this problem, $a = 3$ and $b = 4$, so $c = \\sqrt{3^2 + 4^2} = \\sqrt{9 + 16} = \\sqrt{25} = 5$. Answer option: A'),
dict(role='HUMAN', prompt='Question: {question}'),
dict(role='BOT', prompt='Response: {answer}'),
],
'cloze_cn': [
dict(role='HUMAN', prompt='问题: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
dict(role='BOT', prompt='回答: 她有5个玩具。他从妈妈那里得到了2个所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个所以总共他有7 + 2 = 9个玩具。答案是 9'),
dict(role='HUMAN', prompt='问题: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
dict(role='BOT', prompt='回答: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑所以现在有9 + 20 = 29台电脑。答案是 29'),
dict(role='HUMAN', prompt='问题: 迈克尔有58个高尔夫球。星期二他丢失了23个高尔夫球。星期三他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
dict(role='BOT', prompt='回答: 迈克尔一开始有58个球。星期二他丢失了23个所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个所以现在他还剩下35 - 2 = 33个球。答案是 33'),
dict(role='HUMAN', prompt='问题: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
dict(role='BOT', prompt='回答: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元所以现在她还剩下23 - 15 = 8美元。答案是 8'),
dict(role='HUMAN', prompt='问题: {question}'),
dict(role='BOT', prompt='回答: {answer}'),
],
'cloze_en': [
dict(role='HUMAN', prompt='Question: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
dict(role='BOT', prompt='Response: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.'),
dict(role='HUMAN', prompt='Question: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
dict(role='BOT', prompt='Response: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.'),
dict(role='HUMAN', prompt='Question: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
dict(role='BOT', prompt='Response: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.'),
dict(role='HUMAN', prompt='Question: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
dict(role='BOT', prompt='Response: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.'),
dict(role='HUMAN', prompt='Question: {question}'),
dict(role='BOT', prompt='Response: {answer}'),
],
}
mathbench_sets = {
# Practice Part
'college': ['single_choice_cn', 'single_choice_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn', 'single_choice_en'],
'primary': ['cloze_cn', 'cloze_en'],
'arithmetic': ['cloze_en'],
# Theory Part
'college_knowledge': ['single_choice_cn', 'single_choice_en'],
'high_knowledge': ['single_choice_cn', 'single_choice_en'],
'middle_knowledge': ['single_choice_cn', 'single_choice_en'],
'primary_knowledge': ['single_choice_cn', 'single_choice_en'],
}
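# --------------------------------------------------------------------------
# Illustrative sketch only (an editorial assumption, not this file's actual
# assembly loop, which lies outside this excerpt): how mathbench_sets could be
# expanded into per-subset entries, optionally switching single-choice subsets
# to the *_with_reasoning prompt templates defined above. The abbr naming
# scheme and the `with_reasoning` toggle are assumptions for illustration.
with_reasoning = True  # assumed toggle between direct-answer and CoT prompts

mathbench_example_entries = []
for _split, _names in mathbench_sets.items():
    for _name in _names:
        _template_key = (
            _name + '_with_reasoning'
            if with_reasoning and _name.startswith('single_choice')
            else _name
        )
        mathbench_example_entries.append(
            dict(
                abbr='mathbench-' + _split + '-' + _name,  # e.g. 'mathbench-college-single_choice_cn'
                template_key=_template_key,                # key into the prompt-template dict above
            )
        )
# --------------------------------------------------------------------------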

View File

@ -0,0 +1,64 @@
# TheoremQA
## Base Models
| model | TheoremQA |
|:------------------------:|------------:|
| llama-7b-turbomind | 10.25 |
| llama-13b-turbomind | 11.25 |
| llama-30b-turbomind | 14.25 |
| llama-65b-turbomind | 15.62 |
| llama-2-7b-turbomind | 12.62 |
| llama-2-13b-turbomind | 11.88 |
| llama-2-70b-turbomind | 15.62 |
| llama-3-8b-turbomind | 20.25 |
| llama-3-70b-turbomind | 33.62 |
| internlm2-1.8b-turbomind | 10.50 |
| internlm2-7b-turbomind | 21.88 |
| internlm2-20b-turbomind | 26.00 |
| qwen-1.8b-turbomind | 9.38 |
| qwen-7b-turbomind | 15.00 |
| qwen-14b-turbomind | 21.62 |
| qwen-72b-turbomind | 27.12 |
| qwen1.5-0.5b-hf | 5.88 |
| qwen1.5-1.8b-hf | 12.00 |
| qwen1.5-4b-hf | 13.75 |
| qwen1.5-7b-hf | 4.25 |
| qwen1.5-14b-hf | 12.62 |
| qwen1.5-32b-hf | 26.62 |
| qwen1.5-72b-hf | 26.62 |
| qwen1.5-moe-a2-7b-hf | 7.50 |
| mistral-7b-v0.1-hf | 17.00 |
| mistral-7b-v0.2-hf | 16.25 |
| mixtral-8x7b-v0.1-hf | 24.12 |
| mixtral-8x22b-v0.1-hf | 36.75 |
| yi-6b-hf | 13.88 |
| yi-34b-hf | 24.75 |
| deepseek-7b-base-hf | 12.38 |
| deepseek-67b-base-hf | 21.25 |
## Chat Models
| model | TheoremQA |
|:-----------------------------:|------------:|
| qwen1.5-0.5b-chat-hf | 9.00 |
| qwen1.5-1.8b-chat-hf | 9.25 |
| qwen1.5-4b-chat-hf | 13.88 |
| qwen1.5-7b-chat-hf | 12.25 |
| qwen1.5-14b-chat-hf | 13.63 |
| qwen1.5-32b-chat-hf | 19.25 |
| qwen1.5-72b-chat-hf | 22.75 |
| qwen1.5-110b-chat-hf | 17.50 |
| internlm2-chat-1.8b-hf | 13.63 |
| internlm2-chat-1.8b-sft-hf | 12.88 |
| internlm2-chat-7b-hf | 18.50 |
| internlm2-chat-7b-sft-hf | 18.75 |
| internlm2-chat-20b-hf | 23.00 |
| internlm2-chat-20b-sft-hf | 25.12 |
| llama-3-8b-instruct-hf | 19.38 |
| llama-3-70b-instruct-hf | 36.25 |
| llama-3-8b-instruct-lmdeploy | 19.62 |
| llama-3-70b-instruct-lmdeploy | 34.50 |
| mistral-7b-instruct-v0.1-hf | 12.62 |
| mistral-7b-instruct-v0.2-hf | 11.38 |
| mixtral-8x7b-instruct-v0.1-hf | 26.00 |

View File

@ -0,0 +1,245 @@
# BBH
## Base Models
| model | bbh |
|:------------------------:|------:|
| llama-7b-turbomind | 33.34 |
| llama-13b-turbomind | 37.99 |
| llama-30b-turbomind | 49.86 |
| llama-65b-turbomind | 58.26 |
| llama-2-7b-turbomind | 38.27 |
| llama-2-13b-turbomind | 45.68 |
| llama-2-70b-turbomind | 64.78 |
| llama-3-8b-turbomind | 59.69 |
| llama-3-70b-turbomind | 79.16 |
| internlm2-1.8b-turbomind | 36.03 |
| internlm2-7b-turbomind | 63.56 |
| internlm2-20b-turbomind | 71.29 |
| qwen-1.8b-turbomind | 22.53 |
| qwen-7b-turbomind | 45.89 |
| qwen-14b-turbomind | 56.75 |
| qwen-72b-turbomind | 63.35 |
| qwen1.5-0.5b-hf | 20.54 |
| qwen1.5-1.8b-hf | 27.01 |
| qwen1.5-4b-hf | 34.81 |
| qwen1.5-7b-hf | 39.87 |
| qwen1.5-14b-hf | 50.38 |
| qwen1.5-32b-hf | 67.47 |
| qwen1.5-72b-hf | 58.81 |
| qwen1.5-moe-a2-7b-hf | 39.46 |
| mistral-7b-v0.1-hf | 56.71 |
| mistral-7b-v0.2-hf | 57.32 |
| mixtral-8x7b-v0.1-hf | 68.46 |
| mixtral-8x22b-v0.1-hf | 79.48 |
| yi-6b-hf | 44.82 |
| yi-34b-hf | 66.37 |
| deepseek-7b-base-hf | 42.88 |
| deepseek-67b-base-hf | 71.86 |
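The `bbh` column above aggregates the 27 subtasks broken out in the Details tables below. The exact summarizer used for these runs is not shown in this file; the sketch below assumes a plain unweighted mean over subtask accuracies, which is how such a summary column is conventionally produced. The function name and the three example subtasks are illustrative only.

```python
# Minimal sketch, assuming the BBH summary score is an unweighted mean over subtasks.
def aggregate_bbh(subtask_scores: dict) -> float:
    """Average per-subtask accuracies (0-100) into a single BBH score."""
    return sum(subtask_scores.values()) / len(subtask_scores)

# Example with three of the llama-3-70b-turbomind subtask scores from the Details table:
print(round(aggregate_bbh({
    'temporal_sequences': 100.00,
    'disambiguation_qa': 82.80,
    'date_understanding': 91.60,
}), 2))  # 91.47 over these three subtasks only, not the full 27-task average
```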
### Details
| model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects |
|:------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:|
| llama-7b-turbomind | 23.60 | 46.00 | 44.80 | 36.40 | 30.14 | 0.00 | 46.07 | 21.60 | 15.20 |
| llama-13b-turbomind | 16.80 | 50.00 | 56.80 | 36.40 | 43.15 | 0.00 | 60.67 | 29.20 | 15.20 |
| llama-30b-turbomind | 33.60 | 60.00 | 76.40 | 29.20 | 57.53 | 0.00 | 59.55 | 62.40 | 17.20 |
| llama-65b-turbomind | 84.00 | 76.00 | 84.40 | 50.00 | 65.75 | 0.00 | 62.92 | 69.60 | 31.60 |
| llama-2-7b-turbomind | 12.00 | 46.80 | 60.00 | 34.00 | 32.19 | 0.00 | 49.44 | 32.80 | 18.40 |
| llama-2-13b-turbomind | 24.00 | 40.80 | 73.20 | 36.00 | 45.89 | 0.00 | 55.06 | 37.60 | 22.40 |
| llama-2-70b-turbomind | 75.60 | 66.80 | 88.80 | 73.60 | 69.86 | 0.00 | 73.60 | 60.80 | 57.60 |
| llama-3-8b-turbomind | 65.60 | 42.00 | 78.80 | 56.80 | 69.86 | 0.00 | 56.18 | 66.00 | 30.80 |
| llama-3-70b-turbomind | 100.00 | 82.80 | 91.60 | 100.00 | 86.30 | 0.00 | 81.46 | 77.20 | 94.40 |
| internlm2-1.8b-turbomind | 31.20 | 44.00 | 60.00 | 36.00 | 35.62 | 0.00 | 44.94 | 27.20 | 12.80 |
| internlm2-7b-turbomind | 94.80 | 75.60 | 86.40 | 53.60 | 69.18 | 0.00 | 59.55 | 68.00 | 46.00 |
| internlm2-20b-turbomind | 98.40 | 83.60 | 84.00 | 72.00 | 71.92 | 0.00 | 81.46 | 78.40 | 74.40 |
| qwen-1.8b-turbomind | 26.40 | 39.60 | 33.20 | 28.40 | 28.08 | 0.00 | 44.94 | 21.60 | 12.40 |
| qwen-7b-turbomind | 38.80 | 42.80 | 64.40 | 30.80 | 45.89 | 0.00 | 55.62 | 44.00 | 14.40 |
| qwen-14b-turbomind | 57.60 | 59.20 | 67.20 | 46.40 | 67.12 | 0.00 | 51.12 | 63.60 | 30.40 |
| qwen-72b-turbomind | 72.00 | 66.80 | 77.60 | 81.20 | 84.93 | 0.00 | 78.09 | 67.20 | 63.60 |
| qwen1.5-0.5b-hf | 15.20 | 37.20 | 20.40 | 30.40 | 18.49 | 8.40 | 44.94 | 11.20 | 14.00 |
| qwen1.5-1.8b-hf | 27.60 | 40.80 | 36.00 | 24.40 | 32.19 | 0.00 | 50.56 | 20.80 | 11.20 |
| qwen1.5-4b-hf | 10.40 | 44.40 | 47.20 | 36.80 | 44.52 | 24.80 | 46.63 | 20.80 | 14.80 |
| qwen1.5-7b-hf | 37.20 | 42.40 | 52.00 | 52.40 | 56.85 | 6.80 | 48.31 | 23.60 | 18.40 |
| qwen1.5-14b-hf | 38.80 | 62.80 | 73.60 | 24.80 | 69.86 | 26.80 | 66.29 | 52.80 | 2.00 |
| qwen1.5-32b-hf | 93.60 | 77.20 | 68.40 | 70.00 | 82.88 | 36.80 | 47.75 | 70.40 | 71.20 |
| qwen1.5-72b-hf | 75.60 | 66.00 | 78.80 | 72.80 | 80.82 | 0.00 | 75.84 | 64.80 | 44.40 |
| qwen1.5-moe-a2-7b-hf | 23.20 | 59.60 | 43.20 | 27.60 | 46.58 | 25.20 | 48.88 | 16.80 | 13.20 |
| mistral-7b-v0.1-hf | 73.60 | 53.60 | 76.40 | 45.20 | 56.85 | 28.00 | 64.04 | 66.00 | 21.60 |
| mistral-7b-v0.2-hf | 76.80 | 42.00 | 73.20 | 47.20 | 60.27 | 26.00 | 66.85 | 60.80 | 26.40 |
| mixtral-8x7b-v0.1-hf | 89.60 | 70.80 | 84.80 | 81.20 | 70.55 | 25.60 | 66.29 | 71.20 | 58.80 |
| mixtral-8x22b-v0.1-hf | 98.80 | 77.60 | 92.00 | 98.80 | 83.56 | 35.60 | 80.34 | 79.20 | 82.00 |
| yi-6b-hf | 32.80 | 46.40 | 64.40 | 34.40 | 47.26 | 28.80 | 60.11 | 45.60 | 14.00 |
| yi-34b-hf | 86.00 | 76.00 | 84.80 | 54.80 | 67.81 | 24.80 | 73.60 | 66.00 | 65.60 |
| deepseek-7b-base-hf | 27.60 | 42.00 | 64.40 | 31.20 | 40.41 | 33.60 | 52.25 | 46.00 | 13.20 |
| deepseek-67b-base-hf | 95.60 | 75.60 | 86.40 | 86.40 | 76.71 | 39.20 | 76.40 | 77.20 | 82.00 |
| model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two |
|:------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:|
| llama-7b-turbomind | 18.40 | 42.80 | 58.00 | 23.20 | 13.20 | 40.00 | 16.40 | 30.40 | 0.00 |
| llama-13b-turbomind | 16.00 | 48.80 | 53.60 | 30.40 | 16.40 | 61.60 | 11.20 | 44.80 | 0.80 |
| llama-30b-turbomind | 22.40 | 66.40 | 73.20 | 43.60 | 31.60 | 84.40 | 43.60 | 57.60 | 2.80 |
| llama-65b-turbomind | 41.60 | 79.20 | 74.40 | 48.40 | 39.20 | 91.20 | 40.40 | 67.20 | 20.00 |
| llama-2-7b-turbomind | 17.20 | 54.80 | 51.60 | 32.80 | 23.60 | 74.40 | 19.60 | 45.60 | 1.20 |
| llama-2-13b-turbomind | 23.20 | 63.60 | 52.40 | 46.00 | 42.00 | 68.00 | 21.60 | 62.00 | 2.00 |
| llama-2-70b-turbomind | 72.40 | 86.40 | 84.40 | 55.20 | 43.20 | 95.60 | 50.80 | 76.80 | 20.80 |
| llama-3-8b-turbomind | 40.80 | 76.40 | 93.20 | 45.20 | 36.80 | 88.80 | 53.60 | 72.80 | 30.80 |
| llama-3-70b-turbomind | 99.20 | 94.00 | 98.00 | 58.40 | 42.80 | 93.60 | 63.60 | 88.40 | 79.20 |
| internlm2-1.8b-turbomind | 16.80 | 47.60 | 63.60 | 21.60 | 12.00 | 69.20 | 16.80 | 45.20 | 5.60 |
| internlm2-7b-turbomind | 51.20 | 78.80 | 90.40 | 52.00 | 41.20 | 95.60 | 58.80 | 74.40 | 44.40 |
| internlm2-20b-turbomind | 81.20 | 95.60 | 83.60 | 62.40 | 48.00 | 94.80 | 57.60 | 75.60 | 72.80 |
| qwen-1.8b-turbomind | 14.80 | 35.60 | 51.20 | 22.40 | 15.20 | 31.20 | 12.40 | 22.00 | 3.20 |
| qwen-7b-turbomind | 20.80 | 54.80 | 76.00 | 37.60 | 27.60 | 74.80 | 41.20 | 57.60 | 23.60 |
| qwen-14b-turbomind | 35.60 | 81.20 | 78.40 | 45.20 | 40.80 | 80.00 | 44.80 | 70.40 | 65.60 |
| qwen-72b-turbomind | 66.40 | 89.20 | 90.40 | 60.00 | 50.80 | 81.60 | 56.40 | 88.00 | 70.40 |
| qwen1.5-0.5b-hf | 20.00 | 34.80 | 46.80 | 18.80 | 15.60 | 24.40 | 15.20 | 16.00 | 1.20 |
| qwen1.5-1.8b-hf | 18.00 | 32.80 | 66.00 | 18.80 | 11.20 | 24.80 | 13.60 | 27.60 | 4.80 |
| qwen1.5-4b-hf | 18.40 | 56.40 | 56.80 | 30.00 | 20.80 | 40.80 | 46.80 | 44.80 | 41.20 |
| qwen1.5-7b-hf | 32.40 | 58.40 | 67.20 | 36.00 | 28.00 | 62.80 | 49.20 | 60.40 | 48.00 |
| qwen1.5-14b-hf | 7.20 | 78.40 | 75.20 | 41.20 | 27.60 | 74.40 | 46.00 | 81.60 | 8.00 |
| qwen1.5-32b-hf | 71.60 | 88.40 | 97.60 | 58.80 | 46.40 | 68.00 | 51.60 | 88.40 | 66.80 |
| qwen1.5-72b-hf | 61.20 | 88.40 | 96.00 | 60.40 | 49.20 | 86.40 | 34.80 | 86.80 | 53.60 |
| qwen1.5-moe-a2-7b-hf | 22.80 | 49.20 | 68.00 | 28.40 | 22.40 | 58.40 | 40.80 | 42.00 | 33.60 |
| mistral-7b-v0.1-hf | 30.40 | 79.60 | 70.80 | 54.40 | 42.80 | 77.60 | 47.20 | 70.00 | 30.40 |
| mistral-7b-v0.2-hf | 32.80 | 74.00 | 77.60 | 48.00 | 40.40 | 84.00 | 49.20 | 76.00 | 35.20 |
| mixtral-8x7b-v0.1-hf | 66.80 | 86.00 | 94.80 | 50.40 | 40.40 | 86.40 | 53.20 | 82.80 | 60.80 |
| mixtral-8x22b-v0.1-hf | 87.60 | 95.20 | 99.60 | 70.00 | 54.00 | 95.20 | 58.40 | 95.20 | 82.00 |
| yi-6b-hf | 17.20 | 49.20 | 72.40 | 34.40 | 28.00 | 76.80 | 32.40 | 56.80 | 9.20 |
| yi-34b-hf | 67.20 | 85.60 | 79.60 | 49.20 | 39.60 | 86.80 | 56.00 | 81.20 | 33.20 |
| deepseek-7b-base-hf | 17.60 | 51.20 | 72.40 | 28.80 | 20.00 | 78.40 | 28.80 | 46.80 | 1.60 |
| deepseek-67b-base-hf | 82.40 | 90.00 | 78.80 | 60.40 | 44.80 | 88.80 | 56.80 | 86.40 | 38.00 |
| model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies |
|:------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:|
| llama-7b-turbomind | 45.20 | 1.60 | 8.40 | 81.60 | 66.00 | 47.20 | 46.00 | 40.64 | 57.20 |
| llama-13b-turbomind | 59.20 | 0.80 | 14.40 | 76.40 | 69.20 | 46.40 | 47.20 | 53.48 | 66.80 |
| llama-30b-turbomind | 64.80 | 2.40 | 17.20 | 93.60 | 78.40 | 71.20 | 43.20 | 55.61 | 98.40 |
| llama-65b-turbomind | 72.40 | 6.80 | 21.60 | 98.80 | 81.60 | 70.00 | 40.80 | 55.61 | 99.60 |
| llama-2-7b-turbomind | 54.40 | 1.20 | 10.80 | 88.80 | 68.40 | 49.20 | 48.40 | 52.41 | 53.20 |
| llama-2-13b-turbomind | 74.40 | 2.80 | 18.80 | 97.60 | 74.40 | 52.80 | 46.40 | 54.55 | 96.00 |
| llama-2-70b-turbomind | 82.40 | 13.60 | 30.40 | 98.40 | 81.60 | 83.20 | 43.60 | 63.64 | 100.00 |
| llama-3-8b-turbomind | 90.00 | 9.20 | 38.80 | 95.20 | 87.60 | 84.80 | 51.20 | 50.27 | 100.00 |
| llama-3-70b-turbomind | 96.80 | 48.40 | 48.80 | 99.60 | 92.40 | 99.60 | 62.40 | 58.29 | 100.00 |
| internlm2-1.8b-turbomind | 64.40 | 0.40 | 3.20 | 66.40 | 54.00 | 50.00 | 49.20 | 48.13 | 46.80 |
| internlm2-7b-turbomind | 78.80 | 2.40 | 35.20 | 95.60 | 85.60 | 75.60 | 48.00 | 63.10 | 92.00 |
| internlm2-20b-turbomind | 88.80 | 15.60 | 36.00 | 96.80 | 88.80 | 76.00 | 50.40 | 56.68 | 100.00 |
| qwen-1.8b-turbomind | 50.00 | 0.00 | 0.80 | 62.80 | 29.20 | 2.40 | 6.00 | 12.83 | 1.60 |
| qwen-7b-turbomind | 62.80 | 1.60 | 18.00 | 81.60 | 75.20 | 68.80 | 50.00 | 63.64 | 66.80 |
| qwen-14b-turbomind | 75.60 | 1.20 | 26.80 | 88.80 | 80.40 | 74.40 | 50.00 | 53.48 | 96.80 |
| qwen-72b-turbomind | 56.00 | 14.40 | 35.20 | 87.60 | 91.60 | 81.60 | 5.60 | 31.55 | 62.40 |
| qwen1.5-0.5b-hf | 25.60 | 0.00 | 0.40 | 41.60 | 51.60 | 16.80 | 4.40 | 1.07 | 20.00 |
| qwen1.5-1.8b-hf | 55.60 | 0.00 | 1.60 | 63.60 | 55.20 | 47.60 | 4.40 | 28.88 | 11.20 |
| qwen1.5-4b-hf | 61.60 | 0.40 | 8.80 | 0.80 | 76.00 | 54.40 | 0.80 | 28.34 | 62.40 |
| qwen1.5-7b-hf | 63.60 | 2.40 | 20.80 | 72.40 | 69.60 | 26.80 | 0.00 | 40.64 | 0.00 |
| qwen1.5-14b-hf | 82.40 | 1.20 | 27.60 | 78.40 | 87.20 | 48.00 | 54.00 | 24.06 | 100.00 |
| qwen1.5-32b-hf | 86.80 | 5.60 | 36.80 | 90.00 | 86.40 | 66.40 | 35.60 | 62.57 | 95.60 |
| qwen1.5-72b-hf | 48.40 | 13.20 | 34.40 | 87.60 | 8.00 | 67.60 | 13.60 | 39.57 | 99.60 |
| qwen1.5-moe-a2-7b-hf | 56.80 | 2.00 | 8.80 | 79.60 | 73.60 | 66.80 | 4.00 | 53.48 | 50.40 |
| mistral-7b-v0.1-hf | 73.60 | 4.00 | 26.40 | 97.20 | 82.00 | 67.60 | 43.20 | 48.66 | 100.00 |
| mistral-7b-v0.2-hf | 72.80 | 4.00 | 30.40 | 97.20 | 81.20 | 66.80 | 46.00 | 52.41 | 100.00 |
| mixtral-8x7b-v0.1-hf | 85.60 | 18.80 | 33.60 | 98.00 | 90.80 | 85.20 | 49.60 | 55.61 | 90.80 |
| mixtral-8x22b-v0.1-hf | 92.80 | 51.60 | 40.00 | 98.40 | 91.60 | 95.60 | 54.80 | 56.15 | 100.00 |
| yi-6b-hf | 66.40 | 1.20 | 16.00 | 92.80 | 59.60 | 53.20 | 53.20 | 52.41 | 65.20 |
| yi-34b-hf | 81.20 | 18.80 | 36.40 | 97.60 | 85.60 | 84.00 | 51.20 | 59.89 | 99.60 |
| deepseek-7b-base-hf | 59.20 | 3.20 | 6.40 | 92.00 | 73.20 | 49.60 | 50.80 | 52.41 | 74.80 |
| deepseek-67b-base-hf | 85.20 | 30.00 | 33.20 | 99.60 | 84.80 | 82.40 | 46.80 | 56.68 | 99.60 |
## Chat Models
| model | bbh |
|:-----------------------------:|------:|
| qwen1.5-0.5b-chat-hf | 24.12 |
| qwen1.5-1.8b-chat-hf | 26.82 |
| qwen1.5-4b-chat-hf | 43.15 |
| qwen1.5-7b-chat-hf | 38.12 |
| qwen1.5-14b-chat-hf | 55.38 |
| qwen1.5-32b-chat-hf | 69.28 |
| qwen1.5-72b-chat-hf | 72.97 |
| qwen1.5-110b-chat-hf | 71.04 |
| internlm2-chat-1.8b-hf | 37.69 |
| internlm2-chat-1.8b-sft-hf | 37.12 |
| internlm2-chat-7b-hf | 57.83 |
| internlm2-chat-7b-sft-hf | 57.19 |
| internlm2-chat-20b-hf | 68.24 |
| internlm2-chat-20b-sft-hf | 69.38 |
| llama-3-8b-instruct-hf | 52.85 |
| llama-3-70b-instruct-hf | 82.42 |
| llama-3-8b-instruct-lmdeploy | 53.54 |
| llama-3-70b-instruct-lmdeploy | 82.58 |
| mistral-7b-instruct-v0.1-hf | 32.88 |
| mistral-7b-instruct-v0.2-hf | 48.84 |
| mixtral-8x7b-instruct-v0.1-hf | 59.64 |
### Details
| model | temporal_sequences | disambiguation_qa | date_understanding | tracking_shuffled_objects_three_objects | penguins_in_a_table | geometric_shapes | snarks | ruin_names | tracking_shuffled_objects_seven_objects |
|:-----------------------------:|---------------------:|--------------------:|---------------------:|------------------------------------------:|----------------------:|-------------------:|---------:|-------------:|------------------------------------------:|
| qwen1.5-0.5b-chat-hf | 25.60 | 42.00 | 20.00 | 31.20 | 15.07 | 14.40 | 46.07 | 24.80 | 13.20 |
| qwen1.5-1.8b-chat-hf | 28.80 | 36.00 | 30.40 | 35.20 | 19.18 | 7.60 | 46.63 | 24.00 | 9.60 |
| qwen1.5-4b-chat-hf | 8.00 | 56.00 | 64.80 | 28.40 | 48.63 | 19.60 | 60.67 | 34.00 | 14.40 |
| qwen1.5-7b-chat-hf | 39.60 | 37.60 | 62.40 | 36.80 | 60.96 | 30.80 | 54.49 | 38.00 | 20.00 |
| qwen1.5-14b-chat-hf | 61.60 | 63.60 | 70.00 | 54.00 | 74.66 | 33.60 | 67.42 | 61.20 | 35.60 |
| qwen1.5-32b-chat-hf | 94.40 | 77.60 | 78.00 | 66.00 | 93.84 | 46.00 | 82.58 | 73.60 | 61.60 |
| qwen1.5-72b-chat-hf | 70.40 | 72.40 | 84.40 | 67.20 | 89.73 | 52.00 | 79.21 | 86.40 | 68.80 |
| qwen1.5-110b-chat-hf | 74.80 | 71.20 | 82.80 | 74.80 | 89.04 | 48.00 | 90.45 | 87.60 | 73.60 |
| internlm2-chat-1.8b-hf | 35.60 | 52.40 | 48.80 | 29.60 | 39.73 | 24.40 | 51.69 | 27.20 | 13.20 |
| internlm2-chat-1.8b-sft-hf | 37.20 | 53.60 | 44.00 | 30.00 | 34.93 | 22.40 | 56.74 | 28.00 | 12.00 |
| internlm2-chat-7b-hf | 72.00 | 66.40 | 73.60 | 65.20 | 60.27 | 50.00 | 62.92 | 52.40 | 44.40 |
| internlm2-chat-7b-sft-hf | 67.20 | 66.80 | 58.00 | 63.20 | 48.63 | 45.60 | 64.04 | 59.60 | 42.80 |
| internlm2-chat-20b-hf | 80.40 | 76.00 | 77.60 | 88.80 | 78.08 | 36.40 | 71.91 | 71.60 | 77.20 |
| internlm2-chat-20b-sft-hf | 80.00 | 70.80 | 78.00 | 87.60 | 82.88 | 41.20 | 76.40 | 72.80 | 71.60 |
| llama-3-8b-instruct-hf | 70.40 | 42.80 | 28.40 | 81.20 | 13.01 | 49.20 | 44.94 | 73.20 | 42.40 |
| llama-3-70b-instruct-hf | 100.00 | 84.00 | 91.60 | 95.60 | 78.08 | 52.40 | 87.08 | 89.60 | 97.60 |
| llama-3-8b-instruct-lmdeploy | 73.20 | 45.60 | 34.00 | 79.60 | 31.51 | 48.40 | 47.75 | 76.80 | 47.60 |
| llama-3-70b-instruct-lmdeploy | 100.00 | 84.00 | 90.00 | 96.80 | 83.56 | 56.00 | 87.08 | 89.20 | 97.20 |
| mistral-7b-instruct-v0.1-hf | 32.00 | 22.40 | 52.40 | 35.20 | 30.82 | 23.20 | 38.76 | 46.00 | 18.40 |
| mistral-7b-instruct-v0.2-hf | 66.00 | 58.40 | 50.40 | 48.40 | 48.63 | 37.20 | 65.73 | 40.40 | 29.20 |
| mixtral-8x7b-instruct-v0.1-hf | 63.20 | 68.40 | 65.20 | 60.00 | 78.08 | 40.40 | 74.16 | 64.00 | 46.00 |
| model | tracking_shuffled_objects_five_objects | logical_deduction_three_objects | hyperbaton | logical_deduction_five_objects | logical_deduction_seven_objects | movie_recommendation | salient_translation_error_detection | reasoning_about_colored_objects | multistep_arithmetic_two |
|:-----------------------------:|-----------------------------------------:|----------------------------------:|-------------:|---------------------------------:|----------------------------------:|-----------------------:|--------------------------------------:|----------------------------------:|---------------------------:|
| qwen1.5-0.5b-chat-hf | 20.40 | 34.40 | 51.60 | 21.20 | 13.20 | 26.00 | 20.80 | 17.20 | 1.20 |
| qwen1.5-1.8b-chat-hf | 18.00 | 34.80 | 48.40 | 21.20 | 16.40 | 34.80 | 24.00 | 28.80 | 4.40 |
| qwen1.5-4b-chat-hf | 19.20 | 56.80 | 65.20 | 36.40 | 35.60 | 51.60 | 40.40 | 55.20 | 29.20 |
| qwen1.5-7b-chat-hf | 31.60 | 58.80 | 53.20 | 35.60 | 27.20 | 56.00 | 44.80 | 62.00 | 50.00 |
| qwen1.5-14b-chat-hf | 43.20 | 75.20 | 52.80 | 52.40 | 50.80 | 76.40 | 48.80 | 83.60 | 65.20 |
| qwen1.5-32b-chat-hf | 68.40 | 84.00 | 81.20 | 57.20 | 46.00 | 78.80 | 54.40 | 86.00 | 86.00 |
| qwen1.5-72b-chat-hf | 76.80 | 94.40 | 85.20 | 62.80 | 54.00 | 78.40 | 63.60 | 86.40 | 82.80 |
| qwen1.5-110b-chat-hf | 79.20 | 91.60 | 88.80 | 61.20 | 50.00 | 82.40 | 59.60 | 88.80 | 78.00 |
| internlm2-chat-1.8b-hf | 20.00 | 48.40 | 56.00 | 24.40 | 26.80 | 65.20 | 18.00 | 39.60 | 7.60 |
| internlm2-chat-1.8b-sft-hf | 18.40 | 48.00 | 51.20 | 20.40 | 25.20 | 63.20 | 22.00 | 38.80 | 6.00 |
| internlm2-chat-7b-hf | 48.40 | 75.20 | 84.80 | 42.00 | 36.80 | 79.60 | 53.20 | 65.60 | 26.40 |
| internlm2-chat-7b-sft-hf | 44.00 | 72.40 | 85.60 | 41.60 | 37.20 | 82.40 | 55.60 | 52.80 | 32.00 |
| internlm2-chat-20b-hf | 88.00 | 88.80 | 88.80 | 52.80 | 50.40 | 85.20 | 56.80 | 79.60 | 40.00 |
| internlm2-chat-20b-sft-hf | 83.20 | 90.00 | 90.40 | 55.60 | 48.80 | 84.40 | 57.60 | 79.20 | 38.40 |
| llama-3-8b-instruct-hf | 49.60 | 85.60 | 76.00 | 54.00 | 29.20 | 57.60 | 46.00 | 44.80 | 52.00 |
| llama-3-70b-instruct-hf | 99.20 | 96.80 | 95.20 | 77.20 | 65.20 | 80.00 | 69.60 | 94.80 | 84.00 |
| llama-3-8b-instruct-lmdeploy | 57.20 | 78.00 | 75.60 | 36.00 | 13.20 | 59.20 | 53.60 | 54.80 | 52.80 |
| llama-3-70b-instruct-lmdeploy | 98.80 | 96.40 | 96.80 | 75.20 | 68.80 | 79.60 | 67.60 | 94.00 | 84.80 |
| mistral-7b-instruct-v0.1-hf | 26.00 | 46.00 | 60.00 | 38.00 | 24.00 | 59.20 | 1.20 | 6.00 | 12.40 |
| mistral-7b-instruct-v0.2-hf | 39.60 | 63.60 | 64.00 | 44.00 | 33.20 | 56.00 | 42.40 | 68.40 | 14.00 |
| mixtral-8x7b-instruct-v0.1-hf | 46.40 | 71.60 | 88.80 | 48.00 | 36.80 | 60.00 | 50.00 | 81.20 | 59.20 |
| model | navigate | dyck_languages | word_sorting | sports_understanding | boolean_expressions | object_counting | formal_fallacies | causal_judgement | web_of_lies |
|:-----------------------------:|-----------:|-----------------:|---------------:|-----------------------:|----------------------:|------------------:|-------------------:|-------------------:|--------------:|
| qwen1.5-0.5b-chat-hf | 45.60 | 0.00 | 1.20 | 17.20 | 50.40 | 16.40 | 11.60 | 42.78 | 27.60 |
| qwen1.5-1.8b-chat-hf | 58.40 | 0.00 | 2.00 | 34.00 | 44.80 | 30.40 | 11.60 | 24.60 | 50.00 |
| qwen1.5-4b-chat-hf | 64.00 | 3.20 | 6.80 | 80.40 | 77.60 | 48.80 | 41.20 | 55.61 | 63.20 |
| qwen1.5-7b-chat-hf | 54.40 | 0.40 | 8.00 | 55.60 | 47.60 | 31.20 | 0.00 | 2.14 | 30.00 |
| qwen1.5-14b-chat-hf | 74.40 | 6.40 | 26.40 | 72.40 | 76.40 | 61.60 | 0.80 | 25.67 | 81.20 |
| qwen1.5-32b-chat-hf | 90.00 | 10.40 | 28.40 | 82.40 | 92.80 | 76.80 | 32.40 | 41.71 | 100.00 |
| qwen1.5-72b-chat-hf | 81.20 | 18.40 | 37.60 | 95.20 | 92.80 | 76.00 | 50.40 | 63.64 | 100.00 |
| qwen1.5-110b-chat-hf | 91.60 | 18.00 | 39.60 | 82.80 | 80.80 | 75.20 | 22.40 | 35.83 | 100.00 |
| internlm2-chat-1.8b-hf | 63.20 | 0.00 | 6.00 | 58.00 | 56.80 | 48.80 | 54.80 | 52.94 | 48.40 |
| internlm2-chat-1.8b-sft-hf | 63.20 | 0.00 | 5.60 | 58.00 | 56.80 | 50.00 | 52.40 | 56.68 | 47.60 |
| internlm2-chat-7b-hf | 73.60 | 3.60 | 18.00 | 55.20 | 83.60 | 62.80 | 50.00 | 58.29 | 97.20 |
| internlm2-chat-7b-sft-hf | 71.60 | 4.40 | 20.00 | 82.00 | 84.00 | 60.00 | 51.60 | 52.94 | 98.00 |
| internlm2-chat-20b-hf | 82.40 | 8.00 | 36.00 | 55.60 | 84.40 | 78.00 | 50.40 | 59.36 | 100.00 |
| internlm2-chat-20b-sft-hf | 81.60 | 10.40 | 36.40 | 89.20 | 82.40 | 80.40 | 48.40 | 55.61 | 100.00 |
| llama-3-8b-instruct-hf | 82.80 | 8.80 | 37.20 | 94.40 | 78.80 | 89.60 | 45.20 | 24.06 | 25.60 |
| llama-3-70b-instruct-hf | 95.20 | 18.80 | 49.20 | 98.00 | 94.00 | 90.00 | 73.20 | 68.98 | 100.00 |
| llama-3-8b-instruct-lmdeploy | 83.60 | 10.00 | 40.40 | 96.00 | 77.20 | 89.20 | 43.60 | 37.43 | 3.20 |
| llama-3-70b-instruct-lmdeploy | 95.60 | 22.40 | 48.80 | 96.80 | 91.60 | 87.20 | 72.00 | 69.52 | 100.00 |
| mistral-7b-instruct-v0.1-hf | 70.80 | 0.80 | 5.20 | 68.80 | 69.60 | 51.60 | 3.20 | 12.30 | 33.60 |
| mistral-7b-instruct-v0.2-hf | 62.40 | 4.00 | 15.60 | 81.20 | 70.40 | 50.40 | 32.00 | 34.76 | 98.40 |
| mixtral-8x7b-instruct-v0.1-hf | 76.40 | 12.80 | 23.20 | 55.20 | 85.60 | 83.60 | 40.00 | 43.32 | 88.80 |

View File

@ -0,0 +1,367 @@
# C-Eval
## Base Models
| model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other |
|:------------------------:|-------------:|------------------:|------------------:|----------------------------:|------------------------:|-------------------:|------------:|-----------------:|-----------------:|---------------------------:|-----------------------:|------------------:|
| llama-7b-turbomind | 26.61 | 27.75 | 27.20 | 26.31 | 25.90 | 26.52 | 27.44 | 27.68 | 27.16 | 29.49 | 24.18 | 29.36 |
| llama-13b-turbomind | 29.18 | 25.59 | 27.66 | 33.86 | 28.29 | 28.58 | 31.75 | 30.32 | 31.39 | 35.22 | 30.16 | 30.82 |
| llama-30b-turbomind | 35.09 | 31.68 | 34.56 | 39.89 | 33.02 | 33.76 | 37.70 | 31.97 | 34.80 | 42.72 | 41.19 | 34.93 |
| llama-65b-turbomind | 37.98 | 29.47 | 36.03 | 45.03 | 36.51 | 36.56 | 40.46 | 33.76 | 36.37 | 46.47 | 42.26 | 40.63 |
| llama-2-7b-turbomind | 30.13 | 26.26 | 29.29 | 33.02 | 31.02 | 28.15 | 32.70 | 25.85 | 28.75 | 39.75 | 37.04 | 29.13 |
| llama-2-13b-turbomind | 37.38 | 30.81 | 35.85 | 43.98 | 36.81 | 34.75 | 40.43 | 31.34 | 35.67 | 45.75 | 45.32 | 39.36 |
| llama-2-70b-turbomind | 49.53 | 33.48 | 44.73 | 60.19 | 50.93 | 47.17 | 50.26 | 32.53 | 44.83 | 59.44 | 54.45 | 47.58 |
| llama-3-8b-turbomind | 48.83 | 34.47 | 46.02 | 56.48 | 49.15 | 46.69 | 50.45 | 33.76 | 45.94 | 58.08 | 50.93 | 51.25 |
| llama-3-70b-turbomind | 66.56 | 54.09 | 64.08 | 76.43 | 64.38 | 64.25 | 67.30 | 52.35 | 62.67 | 77.89 | 69.76 | 63.65 |
| internlm2-1.8b-turbomind | 44.79 | 33.93 | 41.19 | 54.26 | 47.15 | 40.35 | 46.64 | 33.00 | 38.62 | 57.28 | 51.30 | 46.89 |
| internlm2-7b-turbomind | 63.54 | 45.32 | 58.10 | 76.40 | 66.94 | 58.32 | 64.23 | 40.09 | 54.37 | 76.88 | 70.11 | 64.77 |
| internlm2-20b-turbomind | 67.28 | 50.15 | 62.33 | 79.59 | 70.55 | 61.82 | 66.73 | 42.50 | 59.25 | 79.98 | 73.43 | 61.56 |
| qwen-1.8b-turbomind | 54.24 | 38.60 | 50.02 | 68.18 | 55.33 | 48.13 | 53.78 | 33.38 | 46.36 | 68.40 | 57.57 | 50.17 |
| qwen-7b-turbomind | 62.06 | 42.73 | 56.21 | 77.12 | 65.28 | 55.76 | 63.23 | 36.99 | 54.74 | 78.55 | 68.94 | 59.02 |
| qwen-14b-turbomind | 70.33 | 53.61 | 65.25 | 83.19 | 72.85 | 65.37 | 72.05 | 55.03 | 66.07 | 85.59 | 74.91 | 67.78 |
| qwen-72b-turbomind | 83.25 | 66.78 | 78.44 | 91.75 | 83.86 | 83.63 | 83.60 | 63.68 | 78.05 | 90.25 | 87.13 | 84.13 |
| qwen1.5-0.5b-hf | 48.36 | 35.55 | 44.72 | 62.00 | 48.51 | 42.41 | 50.43 | 37.00 | 46.28 | 62.64 | 48.11 | 49.18 |
| qwen1.5-1.8b-hf | 58.67 | 40.98 | 53.91 | 74.52 | 58.51 | 53.06 | 59.38 | 43.02 | 53.45 | 75.88 | 60.06 | 54.47 |
| qwen1.5-4b-hf | 66.55 | 48.50 | 61.45 | 81.12 | 67.90 | 61.22 | 66.46 | 43.12 | 56.76 | 82.89 | 67.61 | 68.03 |
| qwen1.5-7b-hf | 72.49 | 52.90 | 66.77 | 85.50 | 74.37 | 69.19 | 73.57 | 49.16 | 66.32 | 84.23 | 77.30 | 73.34 |
| qwen1.5-14b-hf | 76.93 | 60.50 | 72.08 | 88.81 | 77.95 | 73.94 | 77.86 | 54.81 | 71.55 | 86.79 | 82.86 | 76.23 |
| qwen1.5-32b-hf | 82.50 | 66.67 | 77.97 | 90.93 | 83.66 | 81.88 | 82.79 | 71.06 | 80.01 | 89.02 | 83.36 | 81.62 |
| qwen1.5-72b-hf | 83.03 | 65.09 | 77.90 | 91.47 | 83.85 | 83.86 | 83.72 | 64.09 | 77.26 | 91.87 | 87.64 | 84.14 |
| qwen1.5-moe-a2-7b-hf | 76.67 | 51.37 | 68.89 | 88.33 | 77.15 | 79.73 | 77.90 | 51.25 | 67.27 | 89.28 | 83.16 | 81.60 |
| mistral-7b-v0.1-hf | 43.76 | 33.85 | 42.23 | 49.97 | 41.10 | 43.54 | 47.54 | 33.97 | 44.74 | 54.80 | 51.52 | 42.06 |
| mistral-7b-v0.2-hf | 42.81 | 32.84 | 41.00 | 50.19 | 39.45 | 42.77 | 46.44 | 31.67 | 42.89 | 54.50 | 48.75 | 43.23 |
| mixtral-8x7b-v0.1-hf | 51.15 | 41.46 | 50.93 | 59.19 | 46.69 | 48.72 | 55.31 | 42.04 | 52.78 | 62.00 | 56.44 | 52.71 |
| mixtral-8x22b-v0.1-hf | 58.13 | 48.31 | 58.01 | 66.94 | 53.60 | 54.86 | 60.50 | 45.67 | 57.44 | 71.27 | 61.31 | 55.47 |
| yi-6b-hf | 70.78 | 43.72 | 60.54 | 83.29 | 75.39 | 73.40 | 73.13 | 46.87 | 63.14 | 85.52 | 78.70 | 74.45 |
| yi-34b-hf | 80.93 | 58.51 | 73.48 | 89.24 | 83.65 | 84.18 | 81.62 | 56.95 | 71.64 | 89.73 | 87.49 | 86.53 |
| deepseek-7b-base-hf | 43.68 | 28.90 | 37.03 | 53.55 | 50.14 | 40.34 | 45.07 | 31.94 | 38.81 | 56.68 | 47.10 | 43.85 |
| deepseek-67b-base-hf | 66.66 | 44.25 | 57.89 | 79.02 | 72.36 | 65.66 | 66.65 | 38.62 | 56.65 | 79.56 | 73.72 | 66.01 |
### Details on Test Split
| model | computer_network | operating_system | computer_architecture | college_programming | college_physics | college_chemistry | advanced_mathematics | probability_and_statistics | discrete_mathematics | electrical_engineer | metrology_engineer | high_school_mathematics |
|:------------------------:|-------------------:|-------------------:|------------------------:|----------------------:|------------------:|--------------------:|-----------------------:|-----------------------------:|-----------------------:|----------------------:|---------------------:|--------------------------:|
| llama-7b-turbomind | 29.82 | 25.70 | 26.94 | 30.99 | 32.95 | 23.66 | 26.01 | 22.89 | 27.45 | 30.09 | 26.48 | 33.13 |
| llama-13b-turbomind | 33.33 | 37.99 | 31.09 | 29.82 | 22.16 | 27.23 | 31.79 | 27.11 | 24.84 | 28.02 | 33.33 | 30.72 |
| llama-30b-turbomind | 40.94 | 48.60 | 40.41 | 34.21 | 32.95 | 35.71 | 36.42 | 32.53 | 27.45 | 31.56 | 36.07 | 30.12 |
| llama-65b-turbomind | 41.52 | 50.84 | 44.04 | 40.94 | 27.84 | 29.46 | 28.32 | 30.72 | 29.41 | 35.10 | 42.47 | 30.12 |
| llama-2-7b-turbomind | 33.92 | 37.99 | 34.72 | 30.99 | 26.70 | 21.88 | 31.79 | 25.30 | 24.18 | 31.56 | 39.73 | 30.12 |
| llama-2-13b-turbomind | 40.94 | 46.93 | 37.82 | 36.26 | 30.68 | 29.46 | 35.84 | 30.72 | 24.84 | 32.74 | 42.92 | 34.94 |
| llama-2-70b-turbomind | 55.56 | 58.66 | 53.89 | 47.95 | 34.09 | 33.48 | 32.95 | 27.11 | 34.64 | 37.76 | 57.99 | 29.52 |
| llama-3-8b-turbomind | 55.56 | 58.66 | 55.96 | 51.17 | 27.27 | 35.27 | 36.42 | 31.33 | 34.64 | 40.12 | 50.68 | 30.72 |
| llama-3-70b-turbomind | 69.59 | 75.98 | 69.95 | 71.64 | 49.43 | 58.04 | 52.02 | 53.01 | 58.82 | 45.72 | 68.95 | 40.96 |
| internlm2-1.8b-turbomind | 40.35 | 40.78 | 39.38 | 32.16 | 34.66 | 34.38 | 31.21 | 31.33 | 35.95 | 35.10 | 51.60 | 27.71 |
| internlm2-7b-turbomind | 56.14 | 57.54 | 62.69 | 49.42 | 43.75 | 48.21 | 34.68 | 32.53 | 33.33 | 41.00 | 60.27 | 40.36 |
| internlm2-20b-turbomind | 62.57 | 65.36 | 66.84 | 58.77 | 43.18 | 51.79 | 39.31 | 40.36 | 35.95 | 42.77 | 66.67 | 47.59 |
| qwen-1.8b-turbomind | 46.20 | 41.90 | 46.63 | 36.84 | 40.34 | 36.61 | 27.75 | 28.92 | 32.68 | 36.58 | 57.08 | 30.12 |
| qwen-7b-turbomind | 52.63 | 54.75 | 54.40 | 46.20 | 35.80 | 44.20 | 36.99 | 27.71 | 26.80 | 38.35 | 57.99 | 33.13 |
| qwen-14b-turbomind | 58.48 | 64.80 | 59.07 | 54.68 | 45.45 | 57.59 | 45.09 | 33.73 | 39.22 | 49.26 | 67.58 | 45.78 |
| qwen-72b-turbomind | 83.04 | 73.74 | 79.27 | 76.61 | 75.00 | 64.29 | 49.13 | 44.58 | 46.41 | 66.37 | 85.84 | 68.07 |
| qwen1.5-0.5b-hf | 37.43 | 40.22 | 41.45 | 35.09 | 40.91 | 34.82 | 30.06 | 27.11 | 26.80 | 29.79 | 54.34 | 31.93 |
| qwen1.5-1.8b-hf | 47.37 | 50.84 | 47.67 | 38.30 | 43.18 | 35.27 | 29.48 | 30.12 | 33.99 | 39.53 | 58.90 | 28.92 |
| qwen1.5-4b-hf | 62.57 | 56.98 | 56.99 | 46.78 | 48.30 | 45.98 | 40.46 | 34.34 | 31.37 | 46.61 | 62.10 | 43.37 |
| qwen1.5-7b-hf | 66.08 | 62.57 | 66.32 | 55.56 | 54.55 | 47.77 | 41.62 | 31.93 | 35.95 | 49.85 | 74.43 | 49.40 |
| qwen1.5-14b-hf | 71.35 | 66.48 | 68.39 | 64.91 | 57.95 | 65.62 | 41.62 | 40.36 | 47.71 | 56.64 | 79.45 | 56.63 |
| qwen1.5-32b-hf | 84.80 | 73.18 | 74.61 | 70.18 | 71.59 | 61.61 | 49.13 | 45.78 | 49.02 | 61.95 | 87.67 | 72.89 |
| qwen1.5-72b-hf | 85.38 | 73.74 | 78.24 | 78.36 | 72.73 | 63.39 | 43.35 | 40.96 | 49.02 | 65.78 | 85.84 | 66.27 |
| qwen1.5-moe-a2-7b-hf | 77.78 | 73.74 | 68.91 | 64.91 | 66.48 | 49.11 | 33.53 | 36.75 | 35.95 | 61.06 | 91.32 | 40.96 |
| mistral-7b-v0.1-hf | 55.56 | 55.31 | 56.99 | 48.25 | 39.77 | 39.29 | 33.53 | 25.90 | 31.37 | 35.99 | 45.21 | 27.11 |
| mistral-7b-v0.2-hf | 56.14 | 53.63 | 55.44 | 47.66 | 36.36 | 34.38 | 32.37 | 25.30 | 33.33 | 31.86 | 45.21 | 29.52 |
| mixtral-8x7b-v0.1-hf | 62.57 | 64.80 | 60.10 | 60.53 | 38.64 | 42.41 | 40.46 | 37.35 | 45.75 | 35.99 | 60.27 | 34.94 |
| mixtral-8x22b-v0.1-hf | 65.50 | 74.86 | 63.73 | 65.79 | 46.59 | 52.68 | 52.02 | 45.78 | 52.94 | 42.77 | 62.56 | 39.16 |
| yi-6b-hf | 68.42 | 63.13 | 69.43 | 57.89 | 42.05 | 48.66 | 31.79 | 33.13 | 28.76 | 49.85 | 74.89 | 37.35 |
| yi-34b-hf | 83.63 | 80.45 | 74.09 | 68.42 | 62.50 | 60.27 | 45.09 | 38.55 | 50.33 | 65.19 | 88.58 | 49.40 |
| deepseek-7b-base-hf | 44.44 | 44.13 | 44.56 | 36.26 | 30.68 | 29.02 | 32.37 | 24.70 | 26.14 | 35.99 | 48.86 | 28.31 |
| deepseek-67b-base-hf | 63.16 | 70.39 | 65.80 | 59.36 | 42.61 | 45.54 | 35.84 | 38.55 | 42.48 | 44.54 | 68.95 | 33.73 |
| model | high_school_physics | high_school_chemistry | high_school_biology | middle_school_mathematics | middle_school_biology | middle_school_physics | middle_school_chemistry | veterinary_medicine | college_economics | business_administration | marxism | mao_zedong_thought |
|:------------------------:|----------------------:|------------------------:|----------------------:|----------------------------:|------------------------:|------------------------:|--------------------------:|----------------------:|--------------------:|--------------------------:|----------:|---------------------:|
| llama-7b-turbomind | 29.14 | 26.74 | 24.57 | 29.94 | 22.92 | 23.60 | 20.00 | 30.95 | 29.98 | 24.58 | 25.70 | 25.11 |
| llama-13b-turbomind | 22.29 | 18.60 | 28.00 | 26.55 | 26.56 | 25.28 | 19.46 | 29.05 | 28.77 | 28.57 | 39.66 | 43.38 |
| llama-30b-turbomind | 25.14 | 33.14 | 36.00 | 31.07 | 39.06 | 28.09 | 33.51 | 38.10 | 35.21 | 35.88 | 48.04 | 33.33 |
| llama-65b-turbomind | 33.71 | 26.16 | 38.29 | 33.90 | 44.27 | 36.52 | 38.92 | 38.10 | 37.42 | 42.19 | 59.22 | 48.40 |
| llama-2-7b-turbomind | 26.86 | 23.26 | 26.86 | 28.81 | 28.12 | 29.78 | 22.70 | 30.48 | 31.79 | 30.56 | 33.52 | 36.07 |
| llama-2-13b-turbomind | 28.00 | 31.98 | 36.57 | 36.72 | 38.54 | 36.52 | 37.84 | 46.67 | 37.02 | 36.54 | 57.54 | 41.10 |
| llama-2-70b-turbomind | 40.00 | 36.05 | 48.00 | 36.72 | 66.67 | 55.06 | 55.68 | 52.86 | 51.91 | 48.50 | 68.16 | 60.73 |
| llama-3-8b-turbomind | 41.71 | 38.37 | 50.86 | 36.16 | 61.98 | 63.48 | 63.78 | 56.19 | 41.65 | 49.17 | 69.27 | 54.34 |
| llama-3-70b-turbomind | 63.43 | 56.98 | 69.14 | 59.32 | 84.90 | 75.28 | 78.92 | 79.52 | 68.81 | 59.80 | 86.59 | 79.91 |
| internlm2-1.8b-turbomind | 30.29 | 45.93 | 46.29 | 33.33 | 63.02 | 60.11 | 62.70 | 47.62 | 35.61 | 37.87 | 69.27 | 61.64 |
| internlm2-7b-turbomind | 64.57 | 65.12 | 76.00 | 54.80 | 91.15 | 85.96 | 90.27 | 74.29 | 57.34 | 50.50 | 86.59 | 83.56 |
| internlm2-20b-turbomind | 68.57 | 74.42 | 78.86 | 58.76 | 91.67 | 90.45 | 90.27 | 72.38 | 57.95 | 55.81 | 88.83 | 88.58 |
| qwen-1.8b-turbomind | 55.43 | 56.98 | 61.14 | 54.80 | 85.42 | 84.83 | 85.41 | 54.76 | 43.06 | 44.19 | 83.80 | 79.91 |
| qwen-7b-turbomind | 68.00 | 69.19 | 82.86 | 57.63 | 93.75 | 87.64 | 92.43 | 63.81 | 47.28 | 57.48 | 86.59 | 82.65 |
| qwen-14b-turbomind | 78.86 | 83.14 | 92.57 | 67.23 | 96.88 | 95.51 | 96.76 | 73.33 | 56.94 | 64.45 | 91.62 | 86.76 |
| qwen-72b-turbomind | 93.14 | 93.60 | 95.43 | 88.70 | 98.44 | 97.75 | 99.46 | 90.00 | 75.45 | 80.73 | 96.09 | 99.54 |
| qwen1.5-0.5b-hf | 48.57 | 44.19 | 60.00 | 40.68 | 73.44 | 69.66 | 78.92 | 49.05 | 34.41 | 40.20 | 79.89 | 74.43 |
| qwen1.5-1.8b-hf | 58.86 | 68.02 | 76.00 | 59.32 | 91.15 | 90.45 | 87.03 | 63.81 | 44.87 | 48.50 | 86.03 | 90.41 |
| qwen1.5-4b-hf | 66.86 | 77.33 | 82.86 | 68.93 | 95.31 | 92.70 | 97.30 | 71.90 | 51.31 | 61.13 | 91.62 | 94.52 |
| qwen1.5-7b-hf | 79.43 | 82.56 | 91.43 | 77.40 | 96.88 | 95.51 | 96.22 | 80.00 | 62.37 | 69.77 | 93.30 | 97.26 |
| qwen1.5-14b-hf | 86.29 | 87.79 | 93.14 | 83.05 | 97.92 | 95.51 | 97.84 | 82.86 | 63.78 | 77.08 | 95.53 | 96.35 |
| qwen1.5-32b-hf | 88.00 | 95.35 | 94.86 | 91.53 | 97.92 | 99.44 | 100.00 | 90.00 | 73.44 | 78.74 | 94.97 | 98.63 |
| qwen1.5-72b-hf | 91.43 | 93.60 | 95.43 | 88.70 | 97.92 | 98.31 | 99.46 | 90.00 | 74.25 | 80.40 | 94.41 | 98.63 |
| qwen1.5-moe-a2-7b-hf | 70.86 | 77.33 | 82.86 | 68.36 | 97.92 | 93.26 | 97.30 | 89.52 | 70.22 | 74.75 | 96.09 | 98.17 |
| mistral-7b-v0.1-hf | 33.14 | 40.70 | 40.57 | 40.11 | 47.92 | 49.44 | 50.81 | 47.62 | 44.87 | 37.87 | 58.10 | 48.40 |
| mistral-7b-v0.2-hf | 34.86 | 36.63 | 45.71 | 36.72 | 46.35 | 46.07 | 48.65 | 43.81 | 43.46 | 39.53 | 57.54 | 48.86 |
| mixtral-8x7b-v0.1-hf | 49.71 | 42.44 | 53.71 | 47.46 | 62.50 | 61.24 | 60.00 | 57.62 | 52.52 | 44.52 | 68.72 | 57.99 |
| mixtral-8x22b-v0.1-hf | 54.29 | 43.02 | 58.29 | 55.93 | 76.04 | 66.29 | 75.68 | 66.19 | 60.97 | 51.83 | 74.30 | 70.78 |
| yi-6b-hf | 58.86 | 69.19 | 78.29 | 43.50 | 92.19 | 89.33 | 90.27 | 83.81 | 59.56 | 70.10 | 93.85 | 97.72 |
| yi-34b-hf | 80.00 | 81.98 | 93.14 | 65.54 | 97.40 | 95.51 | 96.76 | 92.86 | 74.04 | 76.08 | 94.97 | 97.26 |
| deepseek-7b-base-hf | 29.14 | 30.81 | 33.14 | 24.29 | 53.12 | 45.51 | 48.65 | 50.48 | 38.23 | 44.19 | 62.01 | 65.30 |
| deepseek-67b-base-hf | 60.00 | 55.23 | 64.00 | 46.33 | 84.90 | 79.78 | 83.24 | 73.33 | 57.75 | 63.79 | 89.94 | 88.58 |
| model | education_science | teacher_qualification | high_school_politics | high_school_geography | middle_school_politics | middle_school_geography | modern_chinese_history | ideological_and_moral_cultivation | logic | law | chinese_language_and_literature | art_studies |
|:------------------------:|--------------------:|------------------------:|-----------------------:|------------------------:|-------------------------:|--------------------------:|-------------------------:|------------------------------------:|--------:|------:|----------------------------------:|--------------:|
| llama-7b-turbomind | 22.96 | 31.58 | 25.57 | 29.78 | 22.80 | 25.00 | 21.70 | 21.51 | 25.00 | 26.24 | 22.49 | 25.84 |
| llama-13b-turbomind | 29.26 | 30.83 | 33.52 | 36.52 | 34.72 | 33.33 | 24.06 | 40.12 | 26.47 | 33.48 | 30.14 | 29.87 |
| llama-30b-turbomind | 37.41 | 46.37 | 32.95 | 38.20 | 50.78 | 40.74 | 28.77 | 45.93 | 33.33 | 32.13 | 39.23 | 22.82 |
| llama-65b-turbomind | 39.63 | 51.13 | 31.82 | 39.89 | 58.03 | 42.59 | 34.91 | 55.23 | 39.71 | 30.32 | 37.80 | 32.89 |
| llama-2-7b-turbomind | 27.78 | 34.34 | 31.82 | 34.83 | 35.23 | 34.26 | 28.77 | 38.95 | 32.35 | 33.94 | 27.27 | 30.87 |
| llama-2-13b-turbomind | 41.48 | 47.37 | 37.50 | 37.64 | 50.78 | 52.78 | 43.40 | 48.84 | 32.35 | 38.46 | 36.36 | 30.20 |
| llama-2-70b-turbomind | 57.78 | 69.17 | 50.57 | 58.43 | 69.95 | 66.67 | 50.94 | 72.09 | 50.98 | 42.53 | 44.98 | 52.01 |
| llama-3-8b-turbomind | 56.30 | 65.41 | 47.16 | 56.18 | 64.25 | 61.11 | 55.66 | 67.44 | 41.67 | 40.27 | 45.45 | 50.34 |
| llama-3-70b-turbomind | 72.22 | 85.46 | 75.00 | 74.72 | 84.97 | 76.85 | 75.00 | 76.16 | 59.31 | 52.94 | 62.68 | 68.46 |
| internlm2-1.8b-turbomind | 47.41 | 61.40 | 55.11 | 47.75 | 61.66 | 64.81 | 61.79 | 63.95 | 32.35 | 32.58 | 48.33 | 36.58 |
| internlm2-7b-turbomind | 66.67 | 85.96 | 78.98 | 74.72 | 91.71 | 87.96 | 80.66 | 80.23 | 42.16 | 50.23 | 64.11 | 70.13 |
| internlm2-20b-turbomind | 69.26 | 89.22 | 83.52 | 80.34 | 90.67 | 91.67 | 83.02 | 85.47 | 49.02 | 54.30 | 72.25 | 73.15 |
| qwen-1.8b-turbomind | 51.11 | 70.68 | 71.02 | 62.36 | 88.60 | 87.04 | 69.81 | 73.26 | 29.90 | 46.15 | 50.24 | 47.32 |
| qwen-7b-turbomind | 57.41 | 83.71 | 88.64 | 79.78 | 93.26 | 94.44 | 75.47 | 79.07 | 42.16 | 47.96 | 59.33 | 65.10 |
| qwen-14b-turbomind | 72.96 | 89.97 | 93.75 | 83.71 | 96.37 | 95.37 | 86.32 | 87.21 | 50.00 | 60.63 | 66.99 | 72.48 |
| qwen-72b-turbomind | 85.56 | 96.24 | 95.45 | 93.26 | 97.93 | 97.22 | 92.45 | 91.86 | 67.65 | 76.92 | 75.12 | 83.89 |
| qwen1.5-0.5b-hf | 43.33 | 63.16 | 65.91 | 56.18 | 82.90 | 79.63 | 68.87 | 70.35 | 28.43 | 37.56 | 39.23 | 32.21 |
| qwen1.5-1.8b-hf | 57.41 | 76.44 | 81.25 | 75.84 | 92.75 | 91.67 | 79.72 | 81.98 | 34.31 | 47.96 | 47.85 | 43.62 |
| qwen1.5-4b-hf | 65.93 | 87.47 | 86.93 | 82.58 | 94.30 | 95.37 | 84.91 | 84.30 | 40.20 | 62.90 | 58.85 | 58.72 |
| qwen1.5-7b-hf | 69.26 | 91.98 | 90.91 | 89.89 | 95.85 | 94.44 | 89.15 | 87.21 | 48.04 | 67.87 | 63.16 | 68.12 |
| qwen1.5-14b-hf | 78.89 | 94.99 | 94.89 | 91.57 | 96.89 | 98.15 | 91.04 | 88.37 | 57.84 | 69.68 | 66.99 | 73.83 |
| qwen1.5-32b-hf | 83.70 | 95.99 | 93.75 | 94.38 | 98.45 | 97.22 | 90.57 | 91.28 | 70.10 | 76.92 | 76.56 | 80.87 |
| qwen1.5-72b-hf | 84.44 | 96.49 | 96.59 | 93.82 | 98.45 | 97.22 | 92.92 | 91.28 | 66.67 | 76.92 | 74.16 | 85.23 |
| qwen1.5-moe-a2-7b-hf | 80.74 | 95.49 | 89.20 | 89.33 | 94.82 | 94.44 | 92.45 | 91.28 | 52.45 | 75.57 | 67.94 | 79.87 |
| mistral-7b-v0.1-hf | 45.19 | 59.15 | 43.75 | 49.44 | 56.48 | 56.48 | 45.28 | 58.14 | 37.75 | 38.91 | 40.67 | 34.56 |
| mistral-7b-v0.2-hf | 45.93 | 58.65 | 38.07 | 48.31 | 63.21 | 58.33 | 41.98 | 54.07 | 35.78 | 40.27 | 38.28 | 32.21 |
| mixtral-8x7b-v0.1-hf | 57.04 | 67.92 | 53.41 | 55.06 | 69.95 | 64.81 | 47.64 | 70.93 | 42.16 | 38.01 | 46.41 | 36.58 |
| mixtral-8x22b-v0.1-hf | 60.37 | 72.68 | 64.77 | 65.17 | 77.20 | 71.30 | 57.08 | 75.00 | 49.51 | 43.44 | 52.63 | 49.33 |
| yi-6b-hf | 79.26 | 92.48 | 77.27 | 76.40 | 92.75 | 93.52 | 89.15 | 90.12 | 60.78 | 74.66 | 61.24 | 74.16 |
| yi-34b-hf | 84.81 | 96.24 | 88.07 | 88.20 | 96.37 | 96.30 | 91.98 | 91.28 | 75.00 | 78.73 | 80.38 | 82.89 |
| deepseek-7b-base-hf | 52.22 | 70.18 | 47.16 | 51.12 | 60.62 | 44.44 | 58.49 | 66.86 | 31.86 | 37.56 | 53.11 | 61.07 |
| deepseek-67b-base-hf | 76.67 | 89.22 | 77.27 | 78.65 | 89.64 | 78.70 | 85.85 | 84.30 | 50.00 | 64.25 | 69.38 | 84.23 |
| model | professional_tour_guide | legal_professional | high_school_chinese | high_school_history | middle_school_history | civil_servant | sports_science | plant_protection | basic_medicine | clinical_medicine | urban_and_rural_planner | accountant |
|:------------------------:|--------------------------:|---------------------:|----------------------:|----------------------:|------------------------:|----------------:|-----------------:|-------------------:|-----------------:|--------------------:|--------------------------:|-------------:|
| llama-7b-turbomind | 29.70 | 23.72 | 27.53 | 30.22 | 30.92 | 27.04 | 22.78 | 28.64 | 28.00 | 25.00 | 26.32 | 29.80 |
| llama-13b-turbomind | 25.94 | 20.93 | 25.84 | 29.67 | 24.64 | 29.60 | 26.67 | 29.15 | 33.71 | 25.50 | 28.47 | 28.44 |
| llama-30b-turbomind | 29.32 | 27.91 | 30.34 | 36.26 | 37.20 | 36.13 | 36.11 | 38.69 | 34.29 | 29.50 | 38.52 | 29.35 |
| llama-65b-turbomind | 28.95 | 30.70 | 30.90 | 44.51 | 35.75 | 36.60 | 45.56 | 39.20 | 37.71 | 30.00 | 39.47 | 37.02 |
| llama-2-7b-turbomind | 29.70 | 30.23 | 24.72 | 29.67 | 34.78 | 30.07 | 31.11 | 31.16 | 30.29 | 25.50 | 31.34 | 27.31 |
| llama-2-13b-turbomind | 30.83 | 32.56 | 24.16 | 42.31 | 45.41 | 32.87 | 36.67 | 45.23 | 38.29 | 33.50 | 35.17 | 34.31 |
| llama-2-70b-turbomind | 53.76 | 38.14 | 30.34 | 58.79 | 65.70 | 43.82 | 51.11 | 58.29 | 49.71 | 42.00 | 49.76 | 46.28 |
| llama-3-8b-turbomind | 52.63 | 42.33 | 27.53 | 51.65 | 65.70 | 44.52 | 54.44 | 51.26 | 46.86 | 43.00 | 46.41 | 45.15 |
| llama-3-70b-turbomind | 72.93 | 52.56 | 32.58 | 71.98 | 83.57 | 56.88 | 69.44 | 78.89 | 76.00 | 67.50 | 57.89 | 59.14 |
| internlm2-1.8b-turbomind | 51.50 | 38.14 | 25.84 | 56.04 | 71.50 | 47.32 | 35.00 | 43.72 | 42.29 | 39.00 | 41.15 | 36.57 |
| internlm2-7b-turbomind | 72.56 | 53.49 | 52.25 | 79.67 | 90.82 | 62.00 | 62.78 | 64.32 | 66.86 | 59.50 | 55.74 | 53.50 |
| internlm2-20b-turbomind | 74.06 | 54.42 | 56.18 | 81.87 | 92.27 | 61.77 | 68.33 | 69.85 | 68.00 | 63.50 | 60.77 | 58.92 |
| qwen-1.8b-turbomind | 54.14 | 43.72 | 39.89 | 69.23 | 85.02 | 49.88 | 45.56 | 48.74 | 48.57 | 51.50 | 46.89 | 45.82 |
| qwen-7b-turbomind | 71.05 | 48.37 | 53.93 | 81.87 | 93.72 | 59.67 | 54.44 | 62.31 | 58.29 | 57.50 | 50.24 | 56.66 |
| qwen-14b-turbomind | 79.70 | 53.02 | 63.48 | 87.36 | 94.20 | 71.33 | 63.33 | 71.36 | 73.14 | 68.00 | 59.09 | 67.95 |
| qwen-72b-turbomind | 90.23 | 77.21 | 79.21 | 91.76 | 96.14 | 77.86 | 86.11 | 85.43 | 91.43 | 90.50 | 76.08 | 86.68 |
| qwen1.5-0.5b-hf | 44.36 | 36.74 | 39.33 | 58.24 | 78.26 | 43.36 | 40.00 | 45.23 | 41.71 | 42.50 | 43.54 | 43.12 |
| qwen1.5-1.8b-hf | 59.40 | 47.91 | 37.08 | 72.53 | 91.30 | 53.61 | 53.33 | 51.26 | 49.71 | 58.00 | 51.20 | 56.21 |
| qwen1.5-4b-hf | 65.04 | 58.60 | 55.62 | 83.52 | 94.20 | 62.00 | 63.89 | 65.33 | 65.71 | 64.00 | 55.26 | 61.40 |
| qwen1.5-7b-hf | 78.57 | 66.51 | 66.85 | 87.91 | 94.69 | 68.07 | 65.00 | 64.82 | 77.14 | 77.50 | 60.77 | 74.49 |
| qwen1.5-14b-hf | 83.08 | 72.09 | 70.22 | 90.11 | 94.20 | 69.46 | 73.89 | 70.35 | 82.29 | 83.00 | 65.31 | 78.33 |
| qwen1.5-32b-hf | 87.59 | 78.14 | 79.78 | 92.86 | 95.65 | 78.32 | 80.56 | 79.90 | 90.29 | 89.00 | 77.27 | 86.68 |
| qwen1.5-72b-hf | 91.35 | 76.74 | 79.21 | 91.76 | 96.14 | 79.25 | 85.56 | 86.93 | 92.00 | 90.00 | 75.84 | 86.91 |
| qwen1.5-moe-a2-7b-hf | 88.35 | 75.81 | 51.12 | 79.12 | 94.69 | 67.37 | 80.56 | 73.37 | 87.43 | 84.00 | 78.23 | 82.39 |
| mistral-7b-v0.1-hf | 40.23 | 39.07 | 24.16 | 41.21 | 52.17 | 41.49 | 45.00 | 52.26 | 45.14 | 42.00 | 42.58 | 44.02 |
| mistral-7b-v0.2-hf | 36.84 | 34.88 | 23.03 | 43.96 | 52.66 | 40.79 | 50.00 | 50.75 | 45.14 | 40.50 | 42.58 | 40.86 |
| mixtral-8x7b-v0.1-hf | 47.74 | 40.00 | 28.09 | 57.14 | 58.94 | 44.29 | 58.33 | 53.77 | 48.57 | 46.00 | 51.20 | 46.50 |
| mixtral-8x22b-v0.1-hf | 59.02 | 41.86 | 29.78 | 60.99 | 71.01 | 50.82 | 57.78 | 67.34 | 62.29 | 52.00 | 53.35 | 55.98 |
| yi-6b-hf | 85.34 | 67.91 | 53.93 | 80.22 | 91.79 | 65.97 | 72.22 | 72.36 | 82.29 | 84.50 | 69.86 | 71.56 |
| yi-34b-hf | 94.36 | 76.74 | 65.73 | 87.91 | 95.17 | 79.25 | 85.56 | 90.95 | 90.86 | 92.00 | 76.79 | 82.39 |
| deepseek-7b-base-hf | 65.79 | 29.30 | 32.58 | 47.80 | 67.15 | 37.76 | 44.44 | 52.26 | 43.43 | 36.50 | 41.15 | 37.02 |
| deepseek-67b-base-hf | 83.83 | 58.60 | 45.51 | 79.67 | 90.34 | 62.47 | 70.56 | 70.85 | 81.14 | 71.50 | 61.72 | 60.05 |
| model | fire_engineer | environmental_impact_assessment_engineer | tax_accountant | physician |
|:------------------------:|----------------:|-------------------------------------------:|-----------------:|------------:|
| llama-7b-turbomind | 22.34 | 24.91 | 29.12 | 27.77 |
| llama-13b-turbomind | 24.11 | 30.25 | 27.77 | 30.70 |
| llama-30b-turbomind | 28.72 | 31.67 | 31.83 | 36.57 |
| llama-65b-turbomind | 28.37 | 39.15 | 33.63 | 35.44 |
| llama-2-7b-turbomind | 22.70 | 24.91 | 25.51 | 29.80 |
| llama-2-13b-turbomind | 25.53 | 35.94 | 29.35 | 35.44 |
| llama-2-70b-turbomind | 36.52 | 52.67 | 36.12 | 52.60 |
| llama-3-8b-turbomind | 35.46 | 49.82 | 41.31 | 55.30 |
| llama-3-70b-turbomind | 48.58 | 64.41 | 52.60 | 75.40 |
| internlm2-1.8b-turbomind | 32.27 | 42.35 | 39.05 | 45.15 |
| internlm2-7b-turbomind | 46.81 | 55.16 | 47.63 | 67.27 |
| internlm2-20b-turbomind | 45.04 | 62.63 | 51.47 | 69.75 |
| qwen-1.8b-turbomind | 41.84 | 47.69 | 45.60 | 57.34 |
| qwen-7b-turbomind | 41.84 | 54.80 | 48.08 | 69.53 |
| qwen-14b-turbomind | 45.74 | 64.77 | 56.43 | 77.88 |
| qwen-72b-turbomind | 80.50 | 74.73 | 81.04 | 89.62 |
| qwen1.5-0.5b-hf | 39.36 | 41.28 | 38.37 | 48.08 |
| qwen1.5-1.8b-hf | 45.74 | 49.47 | 51.69 | 63.43 |
| qwen1.5-4b-hf | 50.35 | 51.60 | 58.69 | 75.17 |
| qwen1.5-7b-hf | 58.51 | 65.84 | 67.04 | 81.94 |
| qwen1.5-14b-hf | 63.83 | 67.26 | 72.23 | 87.36 |
| qwen1.5-32b-hf | 74.47 | 73.31 | 80.14 | 90.74 |
| qwen1.5-72b-hf | 79.79 | 75.09 | 81.04 | 90.07 |
| qwen1.5-moe-a2-7b-hf | 74.82 | 77.58 | 79.68 | 91.65 |
| mistral-7b-v0.1-hf | 32.27 | 45.91 | 37.70 | 50.56 |
| mistral-7b-v0.2-hf | 32.62 | 44.13 | 36.79 | 46.28 |
| mixtral-8x7b-v0.1-hf | 35.11 | 53.02 | 46.73 | 52.37 |
| mixtral-8x22b-v0.1-hf | 38.65 | 56.23 | 49.21 | 59.82 |
| yi-6b-hf | 67.38 | 68.68 | 69.53 | 83.07 |
| yi-34b-hf | 77.66 | 83.27 | 77.43 | 89.84 |
| deepseek-7b-base-hf | 30.50 | 38.79 | 35.67 | 46.28 |
| deepseek-67b-base-hf | 46.81 | 65.12 | 54.40 | 77.65 |
### Details on Dev Split
## Chat Models
| model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other |
|:-----------------------------:|-------------:|------------------:|------------------:|----------------------------:|------------------------:|-------------------:|------------:|-----------------:|-----------------:|---------------------------:|-----------------------:|------------------:|
| qwen1.5-0.5b-chat-hf | 36.88 | 28.83 | 34.49 | 43.46 | 37.35 | 34.76 | 38.58 | 33.90 | 33.63 | 43.81 | 41.79 | 39.59 |
| qwen1.5-1.8b-chat-hf | 55.17 | 38.21 | 50.63 | 70.26 | 56.04 | 48.82 | 55.93 | 37.60 | 50.31 | 67.59 | 60.90 | 50.59 |
| qwen1.5-4b-chat-hf | 61.54 | 44.79 | 56.86 | 75.84 | 62.13 | 56.46 | 62.76 | 38.32 | 55.39 | 79.53 | 65.67 | 58.00 |
| qwen1.5-7b-chat-hf | 68.71 | 51.77 | 64.27 | 81.23 | 68.22 | 65.88 | 71.10 | 50.13 | 65.42 | 83.99 | 73.77 | 67.02 |
| qwen1.5-14b-chat-hf | 74.80 | 56.54 | 69.46 | 87.47 | 76.46 | 71.32 | 76.35 | 52.08 | 69.68 | 86.70 | 80.56 | 74.87 |
| qwen1.5-32b-chat-hf | 80.47 | 63.17 | 75.66 | 89.58 | 81.98 | 79.43 | 81.27 | 63.51 | 76.64 | 89.39 | 82.97 | 80.59 |
| qwen1.5-72b-chat-hf | 81.53 | 63.62 | 75.86 | 90.74 | 83.18 | 81.84 | 82.88 | 62.44 | 77.54 | 89.80 | 86.11 | 83.07 |
| qwen1.5-110b-chat-hf | 87.33 | 67.27 | 80.70 | 93.58 | 89.67 | 91.35 | 87.59 | 73.64 | 81.94 | 91.47 | 92.12 | 89.80 |
| internlm2-chat-1.8b-hf | 47.04 | 34.81 | 43.28 | 59.34 | 48.24 | 41.50 | 48.51 | 36.75 | 42.23 | 57.79 | 54.83 | 45.15 |
| internlm2-chat-1.8b-sft-hf | 47.19 | 35.34 | 43.49 | 59.56 | 48.30 | 41.58 | 48.75 | 35.83 | 42.04 | 59.80 | 54.84 | 44.83 |
| internlm2-chat-7b-hf | 58.75 | 39.61 | 52.38 | 71.46 | 61.57 | 55.96 | 61.04 | 36.56 | 51.81 | 74.01 | 69.13 | 57.92 |
| internlm2-chat-7b-sft-hf | 58.96 | 40.09 | 52.40 | 71.49 | 62.20 | 56.26 | 61.02 | 37.29 | 52.60 | 74.01 | 68.27 | 57.27 |
| internlm2-chat-20b-hf | 63.12 | 42.65 | 56.21 | 75.64 | 67.15 | 60.27 | 63.45 | 34.96 | 52.84 | 79.27 | 71.50 | 60.32 |
| internlm2-chat-20b-sft-hf | 63.16 | 42.70 | 56.19 | 75.74 | 67.20 | 60.37 | 63.54 | 34.96 | 52.57 | 80.33 | 71.42 | 60.34 |
| llama-3-8b-instruct-hf | 50.90 | 34.54 | 46.73 | 58.73 | 49.24 | 53.04 | 52.55 | 36.37 | 48.47 | 58.03 | 53.26 | 54.26 |
| llama-3-70b-instruct-hf | 67.38 | 54.02 | 65.16 | 76.83 | 62.29 | 67.92 | 67.92 | 54.50 | 66.85 | 76.80 | 65.98 | 63.72 |
| llama-3-8b-instruct-lmdeploy | 49.92 | 34.75 | 46.19 | 58.49 | 47.68 | 51.14 | 50.27 | 33.32 | 46.25 | 56.93 | 49.02 | 52.76 |
| llama-3-70b-instruct-lmdeploy | 66.41 | 52.76 | 64.72 | 75.31 | 61.36 | 66.44 | 68.21 | 52.28 | 65.86 | 75.06 | 68.37 | 66.09 |
| mistral-7b-instruct-v0.1-hf | 36.76 | 27.76 | 35.55 | 42.41 | 34.45 | 36.12 | 40.04 | 30.21 | 35.77 | 45.15 | 40.99 | 42.22 |
| mistral-7b-instruct-v0.2-hf | 40.38 | 30.26 | 38.82 | 47.66 | 37.08 | 39.91 | 43.00 | 25.97 | 38.60 | 47.44 | 48.15 | 41.82 |
| mixtral-8x7b-instruct-v0.1-hf | 49.61 | 37.78 | 47.86 | 58.56 | 46.40 | 47.85 | 51.68 | 37.41 | 49.14 | 59.79 | 52.97 | 47.65 |
### Details on Test Split
| model | computer_network | operating_system | computer_architecture | college_programming | college_physics | college_chemistry | advanced_mathematics | probability_and_statistics | discrete_mathematics | electrical_engineer | metrology_engineer | high_school_mathematics |
|:-----------------------------:|-------------------:|-------------------:|------------------------:|----------------------:|------------------:|--------------------:|-----------------------:|-----------------------------:|-----------------------:|----------------------:|---------------------:|--------------------------:|
| qwen1.5-0.5b-chat-hf | 35.67 | 36.87 | 33.68 | 33.92 | 35.23 | 28.12 | 27.17 | 26.51 | 24.84 | 28.91 | 40.18 | 25.90 |
| qwen1.5-1.8b-chat-hf | 46.78 | 47.49 | 50.78 | 39.18 | 41.48 | 31.25 | 32.95 | 27.71 | 28.10 | 34.81 | 55.71 | 27.11 |
| qwen1.5-4b-chat-hf | 54.39 | 54.75 | 54.92 | 44.74 | 46.02 | 43.30 | 39.31 | 31.33 | 28.10 | 45.13 | 58.90 | 43.98 |
| qwen1.5-7b-chat-hf | 60.82 | 60.34 | 63.21 | 55.85 | 48.86 | 45.09 | 46.24 | 36.14 | 39.22 | 47.49 | 70.32 | 45.78 |
| qwen1.5-14b-chat-hf | 69.59 | 62.57 | 64.77 | 64.91 | 55.68 | 57.14 | 49.13 | 32.53 | 43.14 | 55.16 | 76.71 | 46.99 |
| qwen1.5-32b-chat-hf | 81.87 | 74.30 | 73.58 | 71.35 | 63.07 | 60.71 | 50.87 | 46.99 | 47.06 | 59.29 | 83.11 | 60.84 |
| qwen1.5-72b-chat-hf | 77.78 | 75.42 | 76.17 | 73.39 | 63.64 | 62.50 | 45.09 | 45.78 | 48.37 | 59.00 | 81.74 | 60.84 |
| qwen1.5-110b-chat-hf | 83.63 | 86.03 | 81.87 | 77.49 | 76.70 | 67.86 | 49.13 | 47.59 | 55.56 | 79.94 | 95.89 | 62.05 |
| internlm2-chat-1.8b-hf | 42.11 | 43.58 | 44.56 | 35.38 | 32.95 | 34.82 | 32.95 | 28.92 | 32.68 | 34.22 | 53.42 | 31.93 |
| internlm2-chat-1.8b-sft-hf | 42.11 | 44.13 | 43.01 | 35.09 | 34.09 | 36.16 | 32.95 | 27.11 | 33.33 | 35.10 | 51.14 | 33.13 |
| internlm2-chat-7b-hf | 59.65 | 60.89 | 58.03 | 51.46 | 36.93 | 43.75 | 36.99 | 29.52 | 36.60 | 39.82 | 63.47 | 38.55 |
| internlm2-chat-7b-sft-hf | 59.06 | 61.45 | 56.48 | 52.63 | 39.77 | 41.52 | 36.99 | 27.71 | 39.22 | 40.12 | 62.10 | 40.36 |
| internlm2-chat-20b-hf | 61.99 | 70.39 | 63.73 | 54.97 | 33.52 | 47.77 | 43.93 | 40.96 | 44.44 | 44.25 | 61.64 | 34.34 |
| internlm2-chat-20b-sft-hf | 61.40 | 70.39 | 63.21 | 54.97 | 32.95 | 47.77 | 42.20 | 42.17 | 43.14 | 44.25 | 61.64 | 32.53 |
| llama-3-8b-instruct-hf | 57.31 | 58.10 | 57.51 | 51.17 | 28.41 | 35.27 | 39.31 | 32.53 | 35.29 | 38.05 | 55.25 | 27.11 |
| llama-3-70b-instruct-hf | 71.93 | 74.86 | 70.98 | 67.54 | 50.57 | 57.14 | 52.60 | 53.01 | 56.21 | 47.79 | 68.95 | 43.98 |
| llama-3-8b-instruct-lmdeploy | 55.56 | 57.54 | 55.44 | 48.25 | 30.11 | 33.04 | 35.84 | 31.33 | 33.33 | 38.94 | 53.88 | 31.93 |
| llama-3-70b-instruct-lmdeploy | 70.76 | 77.09 | 69.95 | 67.84 | 49.43 | 54.02 | 50.87 | 54.22 | 56.21 | 47.20 | 69.86 | 42.17 |
| mistral-7b-instruct-v0.1-hf | 49.12 | 47.49 | 43.52 | 39.18 | 32.39 | 28.57 | 29.48 | 24.10 | 28.10 | 37.46 | 44.29 | 23.49 |
| mistral-7b-instruct-v0.2-hf | 47.95 | 53.07 | 52.85 | 42.69 | 28.41 | 26.79 | 40.46 | 30.12 | 29.41 | 33.33 | 42.92 | 24.10 |
| mixtral-8x7b-instruct-v0.1-hf | 58.48 | 62.57 | 58.03 | 56.43 | 38.64 | 36.16 | 39.31 | 34.94 | 37.91 | 34.81 | 55.71 | 28.31 |
| model | high_school_physics | high_school_chemistry | high_school_biology | middle_school_mathematics | middle_school_biology | middle_school_physics | middle_school_chemistry | veterinary_medicine | college_economics | business_administration | marxism | mao_zedong_thought |
|:-----------------------------:|----------------------:|------------------------:|----------------------:|----------------------------:|------------------------:|------------------------:|--------------------------:|----------------------:|--------------------:|--------------------------:|----------:|---------------------:|
| qwen1.5-0.5b-chat-hf | 30.86 | 31.98 | 44.00 | 27.68 | 47.40 | 40.45 | 55.14 | 35.24 | 32.80 | 30.56 | 58.66 | 57.53 |
| qwen1.5-1.8b-chat-hf | 54.86 | 62.21 | 69.14 | 53.67 | 82.81 | 83.15 | 85.41 | 58.10 | 44.06 | 49.83 | 82.12 | 82.65 |
| qwen1.5-4b-chat-hf | 58.86 | 67.44 | 80.00 | 55.93 | 89.58 | 88.20 | 88.11 | 64.29 | 47.08 | 57.48 | 86.59 | 84.93 |
| qwen1.5-7b-chat-hf | 72.00 | 80.81 | 84.00 | 70.06 | 95.31 | 94.94 | 95.14 | 73.81 | 56.94 | 66.11 | 91.62 | 89.04 |
| qwen1.5-14b-chat-hf | 84.00 | 83.72 | 90.29 | 80.23 | 97.92 | 94.94 | 98.38 | 81.43 | 63.18 | 74.75 | 93.30 | 96.80 |
| qwen1.5-32b-chat-hf | 85.71 | 90.12 | 93.71 | 85.31 | 97.92 | 98.31 | 100.00 | 89.05 | 69.82 | 75.75 | 93.85 | 97.72 |
| qwen1.5-72b-chat-hf | 88.57 | 94.19 | 94.86 | 85.31 | 97.92 | 97.75 | 98.38 | 90.48 | 71.63 | 79.73 | 93.85 | 97.72 |
| qwen1.5-110b-chat-hf | 86.86 | 92.44 | 94.29 | 85.31 | 98.44 | 98.88 | 98.92 | 95.24 | 78.87 | 86.38 | 95.53 | 99.54 |
| internlm2-chat-1.8b-hf | 35.43 | 48.84 | 52.00 | 35.03 | 70.31 | 67.98 | 67.03 | 41.43 | 37.83 | 36.88 | 70.95 | 60.73 |
| internlm2-chat-1.8b-sft-hf | 37.71 | 48.26 | 53.14 | 34.46 | 71.35 | 67.98 | 67.57 | 41.90 | 38.63 | 37.54 | 72.63 | 60.27 |
| internlm2-chat-7b-hf | 46.29 | 48.26 | 60.57 | 46.89 | 78.65 | 71.91 | 71.35 | 68.10 | 50.30 | 50.83 | 77.09 | 76.26 |
| internlm2-chat-7b-sft-hf | 46.86 | 48.26 | 61.14 | 45.76 | 77.60 | 71.91 | 71.35 | 67.62 | 50.10 | 50.50 | 77.09 | 75.80 |
| internlm2-chat-20b-hf | 49.71 | 46.51 | 63.43 | 55.37 | 80.73 | 74.72 | 79.46 | 72.38 | 55.73 | 59.80 | 85.47 | 76.26 |
| internlm2-chat-20b-sft-hf | 53.71 | 47.09 | 64.00 | 55.37 | 80.73 | 73.60 | 78.92 | 73.81 | 55.53 | 60.13 | 85.47 | 75.80 |
| llama-3-8b-instruct-hf | 38.86 | 39.53 | 50.29 | 40.11 | 65.10 | 60.11 | 63.78 | 61.43 | 47.89 | 45.85 | 69.27 | 56.16 |
| llama-3-70b-instruct-hf | 63.43 | 55.23 | 69.71 | 68.36 | 85.42 | 80.90 | 78.38 | 86.19 | 69.01 | 65.12 | 83.24 | 82.65 |
| llama-3-8b-instruct-lmdeploy | 41.71 | 40.70 | 52.00 | 41.24 | 61.46 | 58.43 | 65.41 | 57.62 | 45.27 | 46.18 | 69.27 | 55.71 |
| llama-3-70b-instruct-lmdeploy | 61.71 | 53.49 | 70.86 | 64.97 | 88.02 | 83.71 | 77.30 | 84.76 | 68.21 | 60.80 | 80.45 | 79.91 |
| mistral-7b-instruct-v0.1-hf | 27.43 | 28.49 | 36.00 | 28.25 | 40.10 | 42.70 | 43.78 | 37.14 | 32.80 | 37.87 | 41.90 | 48.86 |
| mistral-7b-instruct-v0.2-hf | 33.14 | 29.65 | 44.00 | 31.07 | 47.92 | 44.94 | 49.19 | 44.29 | 37.02 | 40.86 | 53.63 | 48.40 |
| mixtral-8x7b-instruct-v0.1-hf | 46.29 | 40.70 | 54.86 | 42.37 | 58.85 | 60.67 | 57.84 | 54.29 | 50.10 | 46.51 | 69.27 | 52.51 |
| model | education_science | teacher_qualification | high_school_politics | high_school_geography | middle_school_politics | middle_school_geography | modern_chinese_history | ideological_and_moral_cultivation | logic | law | chinese_language_and_literature | art_studies |
|:-----------------------------:|--------------------:|------------------------:|-----------------------:|------------------------:|-------------------------:|--------------------------:|-------------------------:|------------------------------------:|--------:|------:|----------------------------------:|--------------:|
| qwen1.5-0.5b-chat-hf | 33.33 | 46.12 | 37.50 | 37.08 | 57.51 | 43.52 | 42.45 | 51.74 | 32.84 | 31.22 | 37.32 | 24.50 |
| qwen1.5-1.8b-chat-hf | 54.07 | 72.43 | 74.43 | 66.85 | 89.12 | 87.04 | 77.36 | 76.16 | 38.24 | 44.34 | 46.89 | 40.94 |
| qwen1.5-4b-chat-hf | 60.00 | 84.71 | 82.39 | 69.66 | 94.82 | 90.74 | 79.72 | 78.49 | 41.67 | 57.47 | 54.07 | 56.38 |
| qwen1.5-7b-chat-hf | 66.30 | 90.73 | 84.66 | 80.90 | 94.30 | 91.67 | 82.55 | 84.88 | 38.73 | 60.18 | 60.77 | 63.42 |
| qwen1.5-14b-chat-hf | 74.81 | 93.73 | 90.91 | 92.13 | 96.89 | 98.15 | 89.62 | 88.37 | 54.41 | 70.14 | 69.86 | 69.13 |
| qwen1.5-32b-chat-hf | 80.37 | 94.49 | 93.75 | 94.94 | 97.93 | 97.22 | 90.09 | 90.70 | 68.63 | 78.73 | 73.21 | 77.52 |
| qwen1.5-72b-chat-hf | 84.07 | 96.74 | 95.45 | 94.94 | 97.93 | 95.37 | 92.92 | 91.28 | 63.73 | 80.09 | 73.68 | 83.89 |
| qwen1.5-110b-chat-hf | 90.37 | 96.99 | 96.02 | 95.51 | 98.45 | 98.15 | 93.87 | 94.19 | 81.37 | 86.88 | 84.69 | 90.94 |
| internlm2-chat-1.8b-hf | 48.15 | 65.41 | 69.32 | 54.49 | 79.27 | 70.37 | 60.85 | 64.53 | 32.35 | 32.58 | 45.45 | 40.60 |
| internlm2-chat-1.8b-sft-hf | 48.15 | 64.91 | 69.89 | 53.93 | 79.27 | 70.37 | 61.32 | 63.95 | 33.82 | 29.86 | 45.45 | 39.93 |
| internlm2-chat-7b-hf | 66.67 | 85.21 | 73.30 | 66.85 | 91.19 | 76.85 | 70.28 | 75.58 | 42.16 | 50.68 | 60.77 | 70.47 |
| internlm2-chat-7b-sft-hf | 67.04 | 85.21 | 73.86 | 66.85 | 90.67 | 77.78 | 71.70 | 75.00 | 42.16 | 51.13 | 60.29 | 72.15 |
| internlm2-chat-20b-hf | 74.07 | 85.96 | 75.57 | 77.53 | 89.12 | 76.85 | 72.64 | 83.72 | 51.96 | 56.11 | 68.42 | 73.49 |
| internlm2-chat-20b-sft-hf | 73.70 | 85.46 | 76.70 | 78.09 | 89.64 | 76.85 | 72.17 | 84.88 | 50.00 | 56.56 | 66.99 | 75.17 |
| llama-3-8b-instruct-hf | 55.93 | 67.42 | 55.68 | 55.06 | 72.02 | 62.04 | 54.25 | 66.86 | 44.12 | 40.72 | 47.37 | 44.63 |
| llama-3-70b-instruct-hf | 71.11 | 84.21 | 74.43 | 73.03 | 84.97 | 80.56 | 69.81 | 78.49 | 57.35 | 50.68 | 57.89 | 64.43 |
| llama-3-8b-instruct-lmdeploy | 54.81 | 67.17 | 58.52 | 53.37 | 72.54 | 62.04 | 57.08 | 63.95 | 44.12 | 37.56 | 46.89 | 42.62 |
| llama-3-70b-instruct-lmdeploy | 70.37 | 82.96 | 72.16 | 71.91 | 83.94 | 82.41 | 69.34 | 77.91 | 55.39 | 50.68 | 56.46 | 64.09 |
| mistral-7b-instruct-v0.1-hf | 39.63 | 46.62 | 33.52 | 41.01 | 56.48 | 45.37 | 36.32 | 43.60 | 29.90 | 31.67 | 39.71 | 31.88 |
| mistral-7b-instruct-v0.2-hf | 46.30 | 54.39 | 39.20 | 43.26 | 61.66 | 51.85 | 35.38 | 55.23 | 28.92 | 35.29 | 37.80 | 29.19 |
| mixtral-8x7b-instruct-v0.1-hf | 58.52 | 66.17 | 56.82 | 57.30 | 66.32 | 62.04 | 48.11 | 66.28 | 41.67 | 37.10 | 46.41 | 35.91 |
| model | professional_tour_guide | legal_professional | high_school_chinese | high_school_history | middle_school_history | civil_servant | sports_science | plant_protection | basic_medicine | clinical_medicine | urban_and_rural_planner | accountant |
|:-----------------------------:|--------------------------:|---------------------:|----------------------:|----------------------:|------------------------:|----------------:|-----------------:|-------------------:|-----------------:|--------------------:|--------------------------:|-------------:|
| qwen1.5-0.5b-chat-hf | 36.47 | 39.07 | 27.53 | 41.76 | 45.89 | 39.63 | 35.56 | 31.66 | 37.71 | 34.00 | 32.78 | 37.25 |
| qwen1.5-1.8b-chat-hf | 56.02 | 45.58 | 39.33 | 67.03 | 84.54 | 49.42 | 48.89 | 51.76 | 47.43 | 50.50 | 45.69 | 52.14 |
| qwen1.5-4b-chat-hf | 61.28 | 52.56 | 42.70 | 73.08 | 85.99 | 55.48 | 59.44 | 55.28 | 60.57 | 57.00 | 50.00 | 58.01 |
| qwen1.5-7b-chat-hf | 73.31 | 56.28 | 58.99 | 82.97 | 88.41 | 64.57 | 66.67 | 63.82 | 77.14 | 75.50 | 57.42 | 69.07 |
| qwen1.5-14b-chat-hf | 80.83 | 65.12 | 70.79 | 89.56 | 93.24 | 67.60 | 72.78 | 68.34 | 80.57 | 80.00 | 61.72 | 75.62 |
| qwen1.5-32b-chat-hf | 87.59 | 72.56 | 76.40 | 90.66 | 95.65 | 74.36 | 80.00 | 80.40 | 86.86 | 84.00 | 74.88 | 85.33 |
| qwen1.5-72b-chat-hf | 90.98 | 76.28 | 75.84 | 90.66 | 95.65 | 75.52 | 84.44 | 82.91 | 91.43 | 89.00 | 73.92 | 85.10 |
| qwen1.5-110b-chat-hf | 95.11 | 88.37 | 82.58 | 91.76 | 96.62 | 87.65 | 91.67 | 90.95 | 93.71 | 95.00 | 87.08 | 91.87 |
| internlm2-chat-1.8b-hf | 54.14 | 40.00 | 27.53 | 62.09 | 70.53 | 44.99 | 41.67 | 51.76 | 45.71 | 39.00 | 40.67 | 39.28 |
| internlm2-chat-1.8b-sft-hf | 54.14 | 42.33 | 26.97 | 61.54 | 71.98 | 45.45 | 41.67 | 50.25 | 45.14 | 37.50 | 41.39 | 40.63 |
| internlm2-chat-7b-hf | 70.68 | 44.19 | 34.83 | 73.63 | 84.06 | 51.98 | 57.22 | 68.34 | 66.86 | 57.50 | 54.55 | 50.11 |
| internlm2-chat-7b-sft-hf | 71.80 | 44.65 | 37.64 | 73.63 | 84.06 | 51.98 | 57.78 | 67.84 | 65.71 | 60.50 | 54.55 | 50.11 |
| internlm2-chat-20b-hf | 75.56 | 54.42 | 42.13 | 74.73 | 85.51 | 57.34 | 65.56 | 67.84 | 73.71 | 64.00 | 57.89 | 55.98 |
| internlm2-chat-20b-sft-hf | 76.32 | 55.35 | 41.01 | 75.27 | 85.51 | 58.28 | 65.56 | 67.34 | 72.57 | 65.00 | 58.37 | 56.43 |
| llama-3-8b-instruct-hf | 53.01 | 44.65 | 33.15 | 46.70 | 66.18 | 45.22 | 58.89 | 61.81 | 62.86 | 57.50 | 48.33 | 49.89 |
| llama-3-70b-instruct-hf | 71.43 | 50.70 | 30.90 | 71.43 | 82.13 | 59.67 | 73.33 | 73.37 | 82.86 | 82.00 | 59.09 | 62.08 |
| llama-3-8b-instruct-lmdeploy | 51.13 | 45.12 | 29.78 | 43.96 | 62.32 | 47.09 | 56.11 | 54.77 | 56.00 | 56.00 | 49.04 | 47.40 |
| llama-3-70b-instruct-lmdeploy | 68.80 | 48.84 | 30.90 | 70.88 | 81.64 | 58.28 | 72.22 | 70.85 | 80.00 | 81.00 | 57.66 | 62.53 |
| mistral-7b-instruct-v0.1-hf | 30.45 | 35.81 | 24.72 | 40.11 | 34.78 | 30.77 | 43.89 | 38.69 | 36.57 | 32.50 | 44.74 | 34.09 |
| mistral-7b-instruct-v0.2-hf | 36.09 | 38.14 | 23.03 | 43.41 | 45.41 | 35.90 | 50.00 | 41.71 | 42.86 | 36.00 | 45.22 | 42.21 |
| mixtral-8x7b-instruct-v0.1-hf | 47.37 | 44.65 | 30.34 | 51.65 | 60.87 | 42.19 | 53.89 | 58.29 | 52.00 | 47.00 | 48.56 | 44.02 |
| model | fire_engineer | environmental_impact_assessment_engineer | tax_accountant | physician |
|:-----------------------------:|----------------:|-------------------------------------------:|-----------------:|------------:|
| qwen1.5-0.5b-chat-hf | 27.66 | 38.43 | 32.28 | 35.44 |
| qwen1.5-1.8b-chat-hf | 38.65 | 46.62 | 46.73 | 59.14 |
| qwen1.5-4b-chat-hf | 49.29 | 54.80 | 51.02 | 70.20 |
| qwen1.5-7b-chat-hf | 53.90 | 62.28 | 57.79 | 76.52 |
| qwen1.5-14b-chat-hf | 58.87 | 65.12 | 67.27 | 86.68 |
| qwen1.5-32b-chat-hf | 74.11 | 70.82 | 74.94 | 88.04 |
| qwen1.5-72b-chat-hf | 74.82 | 75.09 | 78.56 | 89.39 |
| qwen1.5-110b-chat-hf | 88.30 | 88.97 | 94.13 | 95.49 |
| internlm2-chat-1.8b-hf | 30.14 | 41.99 | 34.54 | 46.73 |
| internlm2-chat-1.8b-sft-hf | 30.14 | 43.06 | 34.31 | 47.86 |
| internlm2-chat-7b-hf | 42.20 | 52.31 | 47.63 | 66.82 |
| internlm2-chat-7b-sft-hf | 43.26 | 52.67 | 47.86 | 66.59 |
| internlm2-chat-20b-hf | 45.74 | 54.80 | 51.02 | 69.07 |
| internlm2-chat-20b-sft-hf | 45.74 | 55.16 | 51.02 | 68.62 |
| llama-3-8b-instruct-hf | 37.59 | 50.53 | 42.44 | 68.40 |
| llama-3-70b-instruct-hf | 50.71 | 64.06 | 55.53 | 84.42 |
| llama-3-8b-instruct-lmdeploy | 37.94 | 50.53 | 41.53 | 66.14 |
| llama-3-70b-instruct-lmdeploy | 48.94 | 63.70 | 53.95 | 81.72 |
| mistral-7b-instruct-v0.1-hf | 27.66 | 39.15 | 29.35 | 39.95 |
| mistral-7b-instruct-v0.2-hf | 32.27 | 37.01 | 32.96 | 42.89 |
| mixtral-8x7b-instruct-v0.1-hf | 36.88 | 48.75 | 41.76 | 53.05 |
### Details on Dev Split

View File

@ -0,0 +1,64 @@
# GPQA
## Base Models
| model | GPQA_diamond |
|:------------------------:|---------------:|
| llama-7b-turbomind | 24.24 |
| llama-13b-turbomind | 25.25 |
| llama-30b-turbomind | 22.73 |
| llama-65b-turbomind | 21.72 |
| llama-2-7b-turbomind | 25.25 |
| llama-2-13b-turbomind | 23.74 |
| llama-2-70b-turbomind | 28.28 |
| llama-3-8b-turbomind | 31.82 |
| llama-3-70b-turbomind | 40.91 |
| internlm2-1.8b-turbomind | 24.24 |
| internlm2-7b-turbomind | 28.28 |
| internlm2-20b-turbomind | 31.31 |
| qwen-1.8b-turbomind | 28.79 |
| qwen-7b-turbomind | 24.75 |
| qwen-14b-turbomind | 27.78 |
| qwen-72b-turbomind | 31.31 |
| qwen1.5-0.5b-hf | 23.74 |
| qwen1.5-1.8b-hf | 28.79 |
| qwen1.5-4b-hf | 23.23 |
| qwen1.5-7b-hf | 20.71 |
| qwen1.5-14b-hf | 32.32 |
| qwen1.5-32b-hf | 30.81 |
| qwen1.5-72b-hf | 31.82 |
| qwen1.5-moe-a2-7b-hf | 28.79 |
| mistral-7b-v0.1-hf | 24.75 |
| mistral-7b-v0.2-hf | 23.74 |
| mixtral-8x7b-v0.1-hf | 28.79 |
| mixtral-8x22b-v0.1-hf | 36.36 |
| yi-6b-hf | 28.28 |
| yi-34b-hf | 35.86 |
| deepseek-7b-base-hf | 20.71 |
| deepseek-67b-base-hf | 25.25 |
## Chat Models
| model | GPQA_diamond |
|:-----------------------------:|---------------:|
| qwen1.5-0.5b-chat-hf | 19.70 |
| qwen1.5-1.8b-chat-hf | 29.80 |
| qwen1.5-4b-chat-hf | 25.25 |
| qwen1.5-7b-chat-hf | 31.82 |
| qwen1.5-14b-chat-hf | 30.30 |
| qwen1.5-32b-chat-hf | 31.31 |
| qwen1.5-72b-chat-hf | 32.83 |
| qwen1.5-110b-chat-hf | 35.86 |
| internlm2-chat-1.8b-hf | 25.76 |
| internlm2-chat-1.8b-sft-hf | 26.26 |
| internlm2-chat-7b-hf | 28.28 |
| internlm2-chat-7b-sft-hf | 27.27 |
| internlm2-chat-20b-hf | 30.30 |
| internlm2-chat-20b-sft-hf | 29.29 |
| llama-3-8b-instruct-hf | 25.76 |
| llama-3-70b-instruct-hf | 37.88 |
| llama-3-8b-instruct-lmdeploy | 25.76 |
| llama-3-70b-instruct-lmdeploy | 37.88 |
| mistral-7b-instruct-v0.1-hf | 30.30 |
| mistral-7b-instruct-v0.2-hf | 25.25 |
| mixtral-8x7b-instruct-v0.1-hf | 30.30 |
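
For scale when reading these accuracies: GPQA-diamond contains 198 questions (an external fact about the public split, not stated in this report), so each question moves the score by roughly 0.5 points. A small illustrative sketch for converting a reported percentage back into a question count (`correct_count` is a hypothetical helper, not part of OpenCompass):

```python
# Assumes the 198-question GPQA-diamond split; not an OpenCompass API.
def correct_count(accuracy_pct: float, n_questions: int = 198) -> int:
    """Convert a reported accuracy percentage back to a question count."""
    return round(accuracy_pct / 100 * n_questions)

print(correct_count(40.91))  # llama-3-70b-turbomind -> 81 of 198
print(correct_count(24.24))  # llama-7b-turbomind  -> 48 of 198
```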

View File

@ -0,0 +1,64 @@
# GSM8K
## Base Models
| model | gsm8k |
|:------------------------:|--------:|
| llama-7b-turbomind | 10.31 |
| llama-13b-turbomind | 20.55 |
| llama-30b-turbomind | 42.08 |
| llama-65b-turbomind | 54.81 |
| llama-2-7b-turbomind | 16.76 |
| llama-2-13b-turbomind | 29.87 |
| llama-2-70b-turbomind | 63.53 |
| llama-3-8b-turbomind | 54.28 |
| llama-3-70b-turbomind | 69.98 |
| internlm2-1.8b-turbomind | 30.40 |
| internlm2-7b-turbomind | 69.98 |
| internlm2-20b-turbomind | 76.80 |
| qwen-1.8b-turbomind | 23.73 |
| qwen-7b-turbomind | 54.36 |
| qwen-14b-turbomind | 61.64 |
| qwen-72b-turbomind | 79.68 |
| qwen1.5-0.5b-hf | 13.27 |
| qwen1.5-1.8b-hf | 34.87 |
| qwen1.5-4b-hf | 47.61 |
| qwen1.5-7b-hf | 54.36 |
| qwen1.5-14b-hf | 63.53 |
| qwen1.5-32b-hf | 72.71 |
| qwen1.5-72b-hf | 79.53 |
| qwen1.5-moe-a2-7b-hf | 61.26 |
| mistral-7b-v0.1-hf | 47.61 |
| mistral-7b-v0.2-hf | 45.19 |
| mixtral-8x7b-v0.1-hf | 66.26 |
| mixtral-8x22b-v0.1-hf | 82.87 |
| yi-6b-hf | 39.58 |
| yi-34b-hf | 67.70 |
| deepseek-7b-base-hf | 20.17 |
| deepseek-67b-base-hf | 68.16 |
## Chat Models
| model | gsm8k |
|:-----------------------------:|--------:|
| qwen1.5-0.5b-chat-hf | 8.79 |
| qwen1.5-1.8b-chat-hf | 27.60 |
| qwen1.5-4b-chat-hf | 47.61 |
| qwen1.5-7b-chat-hf | 56.25 |
| qwen1.5-14b-chat-hf | 64.90 |
| qwen1.5-32b-chat-hf | 79.91 |
| qwen1.5-72b-chat-hf | 77.03 |
| qwen1.5-110b-chat-hf | 79.53 |
| internlm2-chat-1.8b-hf | 39.73 |
| internlm2-chat-1.8b-sft-hf | 36.85 |
| internlm2-chat-7b-hf | 69.90 |
| internlm2-chat-7b-sft-hf | 69.83 |
| internlm2-chat-20b-hf | 75.21 |
| internlm2-chat-20b-sft-hf | 76.95 |
| llama-3-8b-instruct-hf | 79.53 |
| llama-3-70b-instruct-hf | 89.76 |
| llama-3-8b-instruct-lmdeploy | 78.77 |
| llama-3-70b-instruct-lmdeploy | 89.31 |
| mistral-7b-instruct-v0.1-hf | 42.23 |
| mistral-7b-instruct-v0.2-hf | 45.56 |
| mixtral-8x7b-instruct-v0.1-hf | 65.13 |

View File

@ -0,0 +1,64 @@
# HellaSwag
## Base Models
| model | hellaswag |
|:------------------------:|------------:|
| llama-7b-turbomind | 26.99 |
| llama-13b-turbomind | 34.21 |
| llama-30b-turbomind | 35.65 |
| llama-65b-turbomind | 44.63 |
| llama-2-7b-turbomind | 29.29 |
| llama-2-13b-turbomind | 45.06 |
| llama-2-70b-turbomind | 55.91 |
| llama-3-8b-turbomind | 50.86 |
| llama-3-70b-turbomind | 80.60 |
| internlm2-1.8b-turbomind | 44.86 |
| internlm2-7b-turbomind | 89.52 |
| internlm2-20b-turbomind | 91.41 |
| qwen-1.8b-turbomind | 38.04 |
| qwen-7b-turbomind | 64.62 |
| qwen-14b-turbomind | 85.88 |
| qwen-72b-turbomind | 90.40 |
| qwen1.5-0.5b-hf | 29.19 |
| qwen1.5-1.8b-hf | 42.32 |
| qwen1.5-4b-hf | 55.89 |
| qwen1.5-7b-hf | 68.51 |
| qwen1.5-14b-hf | 83.86 |
| qwen1.5-32b-hf | 87.28 |
| qwen1.5-72b-hf | 90.41 |
| qwen1.5-moe-a2-7b-hf | 72.42 |
| mistral-7b-v0.1-hf | 42.04 |
| mistral-7b-v0.2-hf | 46.24 |
| mixtral-8x7b-v0.1-hf | 66.22 |
| mixtral-8x22b-v0.1-hf | 79.66 |
| yi-6b-hf | 66.83 |
| yi-34b-hf | 83.83 |
| deepseek-7b-base-hf | 30.42 |
| deepseek-67b-base-hf | 70.75 |
## Chat Models
| model | hellaswag |
|:-----------------------------:|------------:|
| qwen1.5-0.5b-chat-hf | 29.60 |
| qwen1.5-1.8b-chat-hf | 41.71 |
| qwen1.5-4b-chat-hf | 60.45 |
| qwen1.5-7b-chat-hf | 71.58 |
| qwen1.5-14b-chat-hf | 79.70 |
| qwen1.5-32b-chat-hf | 88.56 |
| qwen1.5-72b-chat-hf | 89.37 |
| qwen1.5-110b-chat-hf | 91.11 |
| internlm2-chat-1.8b-hf | 60.47 |
| internlm2-chat-1.8b-sft-hf | 61.58 |
| internlm2-chat-7b-hf | 84.80 |
| internlm2-chat-7b-sft-hf | 85.21 |
| internlm2-chat-20b-hf | 88.48 |
| internlm2-chat-20b-sft-hf | 88.95 |
| llama-3-8b-instruct-hf | 74.39 |
| llama-3-70b-instruct-hf | 89.07 |
| llama-3-8b-instruct-lmdeploy | 73.31 |
| llama-3-70b-instruct-lmdeploy | 87.28 |
| mistral-7b-instruct-v0.1-hf | 53.00 |
| mistral-7b-instruct-v0.2-hf | 65.72 |
| mixtral-8x7b-instruct-v0.1-hf | 76.16 |

View File

@ -0,0 +1,64 @@
# HumanEval
## Base Models
| model | pass@1 |
|:------------------------:|---------:|
| llama-7b-turbomind | 12.80 |
| llama-13b-turbomind | 15.24 |
| llama-30b-turbomind | 9.15 |
| llama-65b-turbomind | 7.32 |
| llama-2-7b-turbomind | 14.02 |
| llama-2-13b-turbomind | 15.24 |
| llama-2-70b-turbomind | 15.24 |
| llama-3-8b-turbomind | 28.05 |
| llama-3-70b-turbomind | 28.05 |
| internlm2-1.8b-turbomind | 30.49 |
| internlm2-7b-turbomind | 48.17 |
| internlm2-20b-turbomind | 51.83 |
| qwen-1.8b-turbomind | 16.46 |
| qwen-7b-turbomind | 23.78 |
| qwen-14b-turbomind | 23.78 |
| qwen-72b-turbomind | 66.46 |
| qwen1.5-0.5b-hf | 8.54 |
| qwen1.5-1.8b-hf | 23.17 |
| qwen1.5-4b-hf | 41.46 |
| qwen1.5-7b-hf | 53.05 |
| qwen1.5-14b-hf | 57.32 |
| qwen1.5-32b-hf | 70.12 |
| qwen1.5-72b-hf | 65.85 |
| qwen1.5-moe-a2-7b-hf | 45.73 |
| mistral-7b-v0.1-hf | 14.02 |
| mistral-7b-v0.2-hf | 9.15 |
| mixtral-8x7b-v0.1-hf | 24.39 |
| mixtral-8x22b-v0.1-hf | 16.46 |
| yi-6b-hf | 14.63 |
| yi-34b-hf | 17.07 |
| deepseek-7b-base-hf | 18.29 |
| deepseek-67b-base-hf | 23.17 |
## Chat Models
| model | pass@1 |
|:-----------------------------:|---------:|
| qwen1.5-0.5b-chat-hf | 9.15 |
| qwen1.5-1.8b-chat-hf | 15.85 |
| qwen1.5-4b-chat-hf | 30.49 |
| qwen1.5-7b-chat-hf | 40.85 |
| qwen1.5-14b-chat-hf | 50.00 |
| qwen1.5-32b-chat-hf | 57.93 |
| qwen1.5-72b-chat-hf | 60.37 |
| qwen1.5-110b-chat-hf | 65.24 |
| internlm2-chat-1.8b-hf | 33.54 |
| internlm2-chat-1.8b-sft-hf | 34.15 |
| internlm2-chat-7b-hf | 56.71 |
| internlm2-chat-7b-sft-hf | 61.59 |
| internlm2-chat-20b-hf | 67.68 |
| internlm2-chat-20b-sft-hf | 67.68 |
| llama-3-8b-instruct-hf | 55.49 |
| llama-3-70b-instruct-hf | 70.73 |
| llama-3-8b-instruct-lmdeploy | 57.93 |
| llama-3-70b-instruct-lmdeploy | 70.73 |
| mistral-7b-instruct-v0.1-hf | 32.32 |
| mistral-7b-instruct-v0.2-hf | 29.27 |
| mixtral-8x7b-instruct-v0.1-hf | 34.15 |

View File

@ -0,0 +1,64 @@
# MATH
## Base Models
| model | math |
|:------------------------:|-------:|
| llama-7b-turbomind | 2.94 |
| llama-13b-turbomind | 3.84 |
| llama-30b-turbomind | 6.54 |
| llama-65b-turbomind | 10.66 |
| llama-2-7b-turbomind | 3.58 |
| llama-2-13b-turbomind | 5.30 |
| llama-2-70b-turbomind | 13.26 |
| llama-3-8b-turbomind | 16.42 |
| llama-3-70b-turbomind | 39.64 |
| internlm2-1.8b-turbomind | 9.42 |
| internlm2-7b-turbomind | 25.16 |
| internlm2-20b-turbomind | 32.24 |
| qwen-1.8b-turbomind | 6.30 |
| qwen-7b-turbomind | 15.56 |
| qwen-14b-turbomind | 30.38 |
| qwen-72b-turbomind | 44.18 |
| qwen1.5-0.5b-hf | 4.16 |
| qwen1.5-1.8b-hf | 11.32 |
| qwen1.5-4b-hf | 17.50 |
| qwen1.5-7b-hf | 17.34 |
| qwen1.5-14b-hf | 36.18 |
| qwen1.5-32b-hf | 45.74 |
| qwen1.5-72b-hf | 41.56 |
| qwen1.5-moe-a2-7b-hf | 27.96 |
| mistral-7b-v0.1-hf | 13.44 |
| mistral-7b-v0.2-hf | 12.74 |
| mixtral-8x7b-v0.1-hf | 29.46 |
| mixtral-8x22b-v0.1-hf | 41.82 |
| yi-6b-hf | 6.60 |
| yi-34b-hf | 18.80 |
| deepseek-7b-base-hf | 4.66 |
| deepseek-67b-base-hf | 18.76 |
## Chat Models
| model | math |
|:-----------------------------:|-------:|
| qwen1.5-0.5b-chat-hf | 0.56 |
| qwen1.5-1.8b-chat-hf | 4.94 |
| qwen1.5-4b-chat-hf | 7.34 |
| qwen1.5-7b-chat-hf | 22.14 |
| qwen1.5-14b-chat-hf | 32.22 |
| qwen1.5-32b-chat-hf | 41.80 |
| qwen1.5-72b-chat-hf | 45.22 |
| qwen1.5-110b-chat-hf | 54.38 |
| internlm2-chat-1.8b-hf | 14.06 |
| internlm2-chat-1.8b-sft-hf | 13.10 |
| internlm2-chat-7b-hf | 28.08 |
| internlm2-chat-7b-sft-hf | 27.60 |
| internlm2-chat-20b-hf | 34.68 |
| internlm2-chat-20b-sft-hf | 32.54 |
| llama-3-8b-instruct-hf | 27.50 |
| llama-3-70b-instruct-hf | 47.52 |
| llama-3-8b-instruct-lmdeploy | 27.42 |
| llama-3-70b-instruct-lmdeploy | 46.90 |
| mistral-7b-instruct-v0.1-hf | 8.48 |
| mistral-7b-instruct-v0.2-hf | 10.82 |
| mixtral-8x7b-instruct-v0.1-hf | 27.02 |

View File

@ -0,0 +1,64 @@
# MBPP
## Base Models
| model | mbpp/pass@1 | mbpp/pass | mbpp/timeout | mbpp/failed | mbpp/wrong_answer |
|:------------------------:|--------------:|------------:|---------------:|--------------:|--------------------:|
| llama-7b-turbomind | 25.29 | 65 | 8 | 62 | 122 |
| llama-13b-turbomind | 29.96 | 77 | 4 | 74 | 102 |
| llama-30b-turbomind | 37.35 | 96 | 17 | 39 | 105 |
| llama-65b-turbomind | 45.53 | 117 | 10 | 35 | 95 |
| llama-2-7b-turbomind | 26.46 | 68 | 18 | 49 | 122 |
| llama-2-13b-turbomind | 36.58 | 94 | 17 | 45 | 101 |
| llama-2-70b-turbomind | 49.42 | 127 | 12 | 32 | 86 |
| llama-3-8b-turbomind | 54.86 | 141 | 11 | 22 | 83 |
| llama-3-70b-turbomind | 77.82 | 200 | 0 | 10 | 47 |
| internlm2-1.8b-turbomind | 30.74 | 79 | 10 | 61 | 107 |
| internlm2-7b-turbomind | 54.47 | 140 | 11 | 28 | 78 |
| internlm2-20b-turbomind | 59.92 | 154 | 6 | 31 | 66 |
| qwen-1.8b-turbomind | 2.72 | 7 | 16 | 222 | 12 |
| qwen-7b-turbomind | 46.69 | 120 | 10 | 37 | 90 |
| qwen-14b-turbomind | 55.64 | 143 | 0 | 31 | 83 |
| qwen-72b-turbomind | 65.76 | 169 | 0 | 26 | 62 |
| qwen1.5-0.5b-hf | 5.06 | 13 | 13 | 190 | 41 |
| qwen1.5-1.8b-hf | 15.95 | 41 | 19 | 124 | 73 |
| qwen1.5-4b-hf | 45.91 | 118 | 8 | 27 | 104 |
| qwen1.5-7b-hf | 52.14 | 134 | 11 | 24 | 88 |
| qwen1.5-14b-hf | 52.14 | 134 | 16 | 33 | 74 |
| qwen1.5-32b-hf | 59.14 | 152 | 7 | 25 | 73 |
| qwen1.5-72b-hf | 61.09 | 157 | 1 | 21 | 78 |
| qwen1.5-moe-a2-7b-hf | 47.08 | 121 | 0 | 52 | 84 |
| mistral-7b-v0.1-hf | 47.47 | 122 | 9 | 33 | 93 |
| mistral-7b-v0.2-hf | 49.81 | 128 | 9 | 27 | 93 |
| mixtral-8x7b-v0.1-hf | 62.65 | 161 | 10 | 13 | 73 |
| mixtral-8x22b-v0.1-hf | 73.15 | 188 | 1 | 10 | 58 |
| yi-6b-hf | 30.35 | 78 | 8 | 40 | 131 |
| yi-34b-hf | 48.64 | 125 | 0 | 43 | 89 |
| deepseek-7b-base-hf | 43.97 | 113 | 11 | 34 | 99 |
| deepseek-67b-base-hf | 64.98 | 167 | 0 | 24 | 66 |
## Chat Models
| model | mbpp/pass@1 | mbpp/pass | mbpp/timeout | mbpp/failed | mbpp/wrong_answer |
|:-----------------------------:|--------------:|------------:|---------------:|--------------:|--------------------:|
| qwen1.5-0.5b-chat-hf | 11.28 | 29 | 1 | 129 | 98 |
| qwen1.5-1.8b-chat-hf | 22.57 | 58 | 2 | 70 | 127 |
| qwen1.5-4b-chat-hf | 43.58 | 112 | 1 | 33 | 111 |
| qwen1.5-7b-chat-hf | 50.58 | 130 | 0 | 35 | 92 |
| qwen1.5-14b-chat-hf | 56.03 | 144 | 0 | 24 | 89 |
| qwen1.5-32b-chat-hf | 65.37 | 168 | 2 | 13 | 74 |
| qwen1.5-72b-chat-hf | 66.93 | 172 | 0 | 17 | 68 |
| qwen1.5-110b-chat-hf | 68.48 | 176 | 0 | 16 | 65 |
| internlm2-chat-1.8b-hf | 39.69 | 102 | 0 | 48 | 107 |
| internlm2-chat-1.8b-sft-hf | 36.19 | 93 | 1 | 58 | 105 |
| internlm2-chat-7b-hf | 57.59 | 148 | 0 | 21 | 88 |
| internlm2-chat-7b-sft-hf | 55.64 | 143 | 2 | 22 | 90 |
| internlm2-chat-20b-hf | 68.87 | 177 | 0 | 16 | 64 |
| internlm2-chat-20b-sft-hf | 69.65 | 179 | 0 | 16 | 62 |
| llama-3-8b-instruct-hf | 68.87 | 177 | 0 | 8 | 72 |
| llama-3-70b-instruct-hf | 79.77 | 205 | 0 | 2 | 50 |
| llama-3-8b-instruct-lmdeploy | 66.93 | 172 | 0 | 7 | 78 |
| llama-3-70b-instruct-lmdeploy | 77.82 | 200 | 1 | 2 | 54 |
| mistral-7b-instruct-v0.1-hf | 47.86 | 123 | 0 | 29 | 105 |
| mistral-7b-instruct-v0.2-hf | 45.91 | 118 | 0 | 31 | 108 |
| mixtral-8x7b-instruct-v0.1-hf | 61.48 | 158 | 1 | 13 | 85 |
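
The count columns above fully determine `mbpp/pass@1`: each row sums to 257 evaluated problems, and pass@1 is simply the passed share expressed as a percentage. A minimal sketch of that relation (the helper `pass_at_1` is illustrative, not an OpenCompass API):

```python
# Illustrative only: recomputes mbpp/pass@1 from the per-outcome counts.
def pass_at_1(passed: int, timeout: int, failed: int, wrong_answer: int) -> float:
    """Return pass@1 as a percentage of all evaluated problems."""
    total = passed + timeout + failed + wrong_answer
    return 100.0 * passed / total

# Example: the llama-7b-turbomind row above -> 65 / 257 = 25.29
print(round(pass_at_1(65, 8, 62, 122), 2))  # 25.29
```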

View File

@ -0,0 +1,363 @@
# MMLU
## Base Models
| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
|:------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
| llama-7b-turbomind | 35.66 | 31.22 | 37.70 | 38.90 | 37.01 |
| llama-13b-turbomind | 47.76 | 37.68 | 55.36 | 52.43 | 50.83 |
| llama-30b-turbomind | 58.55 | 46.95 | 67.35 | 65.13 | 60.78 |
| llama-65b-turbomind | 63.78 | 52.35 | 73.68 | 70.84 | 64.29 |
| llama-2-7b-turbomind | 46.78 | 37.81 | 52.11 | 51.69 | 50.04 |
| llama-2-13b-turbomind | 55.76 | 44.61 | 63.86 | 62.97 | 57.35 |
| llama-2-70b-turbomind | 69.87 | 58.30 | 79.86 | 75.84 | 71.58 |
| llama-3-8b-turbomind | 66.43 | 55.95 | 76.11 | 70.29 | 68.96 |
| llama-3-70b-turbomind | 79.35 | 70.66 | 87.54 | 83.43 | 80.42 |
| internlm2-1.8b-turbomind | 45.99 | 39.63 | 51.02 | 48.65 | 47.96 |
| internlm2-7b-turbomind | 65.84 | 56.48 | 74.43 | 69.68 | 67.75 |
| internlm2-20b-turbomind | 67.58 | 59.01 | 76.04 | 71.20 | 68.69 |
| qwen-1.8b-turbomind | 46.61 | 38.91 | 51.35 | 49.57 | 50.51 |
| qwen-7b-turbomind | 59.75 | 50.16 | 67.98 | 63.48 | 62.44 |
| qwen-14b-turbomind | 67.85 | 59.13 | 76.18 | 71.62 | 69.12 |
| qwen-72b-turbomind | 77.36 | 68.70 | 85.28 | 80.60 | 79.45 |
| qwen1.5-0.5b-hf | 39.98 | 33.96 | 45.08 | 41.59 | 42.48 |
| qwen1.5-1.8b-hf | 47.14 | 39.47 | 52.70 | 49.01 | 51.33 |
| qwen1.5-4b-hf | 57.03 | 47.80 | 64.86 | 60.10 | 60.20 |
| qwen1.5-7b-hf | 62.15 | 53.22 | 70.25 | 65.62 | 64.26 |
| qwen1.5-14b-hf | 69.10 | 61.46 | 77.57 | 71.25 | 70.29 |
| qwen1.5-32b-hf | 73.88 | 65.60 | 81.41 | 77.10 | 75.79 |
| qwen1.5-72b-hf | 77.02 | 69.00 | 84.55 | 80.60 | 78.21 |
| qwen1.5-moe-a2-7b-hf | 62.09 | 53.27 | 70.74 | 63.80 | 65.28 |
| mistral-7b-v0.1-hf | 64.04 | 53.21 | 73.65 | 68.04 | 67.00 |
| mistral-7b-v0.2-hf | 63.85 | 53.21 | 72.17 | 68.40 | 67.15 |
| mixtral-8x7b-v0.1-hf | 71.80 | 61.70 | 81.03 | 75.51 | 74.35 |
| mixtral-8x22b-v0.1-hf | 77.67 | 68.94 | 86.81 | 81.23 | 78.43 |
| yi-6b-hf | 64.08 | 52.61 | 74.10 | 68.58 | 67.11 |
| yi-34b-hf | 76.26 | 66.73 | 83.74 | 81.78 | 77.77 |
| deepseek-7b-base-hf | 49.22 | 40.17 | 56.73 | 53.46 | 51.26 |
| deepseek-67b-base-hf | 71.95 | 60.57 | 81.69 | 77.11 | 74.42 |
### Details
| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
|:------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
| llama-7b-turbomind | 37.50 | 30.00 | 30.00 | 33.00 | 23.53 | 23.45 | 34.87 | 37.78 | 25.00 | 27.68 | 34.34 | 31.00 |
| llama-13b-turbomind | 46.53 | 30.00 | 42.00 | 36.00 | 18.63 | 42.76 | 46.71 | 46.67 | 30.00 | 32.14 | 45.66 | 37.00 |
| llama-30b-turbomind | 59.03 | 45.00 | 47.00 | 35.00 | 26.47 | 53.10 | 61.18 | 51.85 | 37.00 | 41.07 | 57.36 | 38.00 |
| llama-65b-turbomind | 68.75 | 49.00 | 47.00 | 37.00 | 35.29 | 55.17 | 73.03 | 57.78 | 30.00 | 48.21 | 66.04 | 38.00 |
| llama-2-7b-turbomind | 46.53 | 34.00 | 33.00 | 34.00 | 22.55 | 47.59 | 40.13 | 47.41 | 29.00 | 38.39 | 46.42 | 32.00 |
| llama-2-13b-turbomind | 59.03 | 44.00 | 48.00 | 29.00 | 26.47 | 50.34 | 53.29 | 49.63 | 35.00 | 28.57 | 60.00 | 32.00 |
| llama-2-70b-turbomind | 84.72 | 51.00 | 60.00 | 39.00 | 37.25 | 65.52 | 81.58 | 63.70 | 32.00 | 52.68 | 72.08 | 46.00 |
| llama-3-8b-turbomind | 77.08 | 46.00 | 51.00 | 31.00 | 51.96 | 62.76 | 67.11 | 68.15 | 34.00 | 52.68 | 74.72 | 35.00 |
| llama-3-70b-turbomind | 93.75 | 62.00 | 72.00 | 52.00 | 50.98 | 74.48 | 92.11 | 79.26 | 48.00 | 63.39 | 86.42 | 49.00 |
| internlm2-1.8b-turbomind | 38.89 | 37.00 | 44.00 | 35.00 | 30.39 | 49.66 | 50.66 | 44.44 | 25.00 | 35.71 | 51.32 | 32.00 |
| internlm2-7b-turbomind | 77.08 | 48.00 | 64.00 | 33.00 | 47.06 | 63.45 | 73.68 | 57.78 | 37.00 | 45.54 | 69.81 | 35.00 |
| internlm2-20b-turbomind | 83.33 | 51.00 | 61.00 | 36.00 | 45.10 | 64.83 | 75.00 | 59.26 | 39.00 | 53.57 | 73.58 | 32.00 |
| qwen-1.8b-turbomind | 42.36 | 36.00 | 39.00 | 34.00 | 27.45 | 51.03 | 50.66 | 42.96 | 31.00 | 31.25 | 53.21 | 28.00 |
| qwen-7b-turbomind | 67.36 | 48.00 | 53.00 | 28.00 | 39.22 | 59.31 | 63.82 | 49.63 | 34.00 | 38.39 | 63.02 | 37.00 |
| qwen-14b-turbomind | 78.47 | 51.00 | 62.00 | 42.00 | 49.02 | 65.52 | 71.05 | 60.00 | 37.00 | 58.93 | 71.32 | 40.00 |
| qwen-72b-turbomind | 93.75 | 56.00 | 66.00 | 56.00 | 50.98 | 80.69 | 85.53 | 73.33 | 41.00 | 62.50 | 83.77 | 54.00 |
| qwen1.5-0.5b-hf | 38.89 | 25.00 | 38.00 | 32.00 | 25.49 | 45.52 | 44.74 | 33.33 | 30.00 | 39.29 | 38.11 | 39.00 |
| qwen1.5-1.8b-hf | 43.75 | 34.00 | 45.00 | 38.00 | 28.43 | 47.59 | 47.37 | 40.74 | 32.00 | 31.25 | 53.96 | 37.00 |
| qwen1.5-4b-hf | 50.00 | 46.00 | 41.00 | 45.00 | 31.37 | 53.10 | 61.18 | 51.85 | 35.00 | 44.64 | 60.38 | 37.00 |
| qwen1.5-7b-hf | 66.67 | 48.00 | 55.00 | 37.00 | 41.18 | 60.69 | 65.79 | 52.59 | 39.00 | 41.07 | 68.68 | 43.00 |
| qwen1.5-14b-hf | 75.69 | 49.00 | 58.00 | 49.00 | 49.02 | 71.72 | 73.03 | 65.93 | 39.00 | 52.68 | 73.96 | 49.00 |
| qwen1.5-32b-hf | 85.42 | 53.00 | 59.00 | 51.00 | 53.92 | 72.41 | 82.24 | 63.70 | 43.00 | 58.04 | 78.11 | 50.00 |
| qwen1.5-72b-hf | 90.97 | 54.00 | 65.00 | 57.00 | 52.94 | 80.00 | 87.50 | 73.33 | 43.00 | 64.29 | 81.89 | 50.00 |
| qwen1.5-moe-a2-7b-hf | 62.50 | 44.00 | 54.00 | 41.00 | 49.02 | 58.62 | 69.74 | 57.78 | 37.00 | 38.39 | 66.79 | 38.00 |
| mistral-7b-v0.1-hf | 72.92 | 50.00 | 51.00 | 40.00 | 39.22 | 57.93 | 65.79 | 62.96 | 29.00 | 49.11 | 69.43 | 36.00 |
| mistral-7b-v0.2-hf | 71.53 | 49.00 | 53.00 | 40.00 | 36.27 | 57.24 | 64.47 | 60.00 | 29.00 | 53.57 | 67.92 | 39.00 |
| mixtral-8x7b-v0.1-hf | 85.42 | 54.00 | 62.00 | 43.00 | 46.08 | 68.97 | 82.89 | 70.37 | 37.00 | 56.25 | 79.25 | 51.00 |
| mixtral-8x22b-v0.1-hf | 89.58 | 56.00 | 69.00 | 48.00 | 52.94 | 76.55 | 86.18 | 77.04 | 53.00 | 62.50 | 82.26 | 56.00 |
| yi-6b-hf | 66.67 | 43.00 | 51.00 | 39.00 | 35.29 | 64.83 | 65.79 | 60.00 | 29.00 | 41.96 | 66.79 | 46.00 |
| yi-34b-hf | 88.89 | 52.00 | 66.00 | 44.00 | 48.04 | 80.00 | 89.47 | 74.81 | 44.00 | 58.04 | 78.87 | 52.00 |
| deepseek-7b-base-hf | 52.08 | 29.00 | 44.00 | 40.00 | 31.37 | 44.83 | 51.97 | 40.74 | 27.00 | 32.14 | 53.58 | 31.00 |
| deepseek-67b-base-hf | 84.72 | 52.00 | 62.00 | 42.00 | 42.16 | 70.34 | 80.92 | 65.19 | 39.00 | 50.00 | 78.11 | 42.00 |
| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
|:------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
| llama-7b-turbomind | 33.01 | 39.22 | 45.73 | 26.24 | 33.33 | 51.24 | 24.25 | 45.00 | 31.09 | 30.05 | 37.00 | 35.13 |
| llama-13b-turbomind | 66.02 | 51.63 | 71.79 | 34.75 | 55.05 | 64.46 | 30.06 | 63.00 | 47.48 | 37.22 | 53.00 | 48.53 |
| llama-30b-turbomind | 76.70 | 62.42 | 84.19 | 44.68 | 71.72 | 75.21 | 40.56 | 66.00 | 57.98 | 46.48 | 66.00 | 63.73 |
| llama-65b-turbomind | 82.52 | 68.95 | 87.18 | 48.94 | 79.29 | 81.82 | 47.82 | 79.00 | 68.49 | 50.07 | 68.00 | 66.67 |
| llama-2-7b-turbomind | 53.40 | 48.69 | 68.38 | 36.52 | 49.49 | 65.29 | 24.02 | 60.00 | 44.12 | 36.31 | 55.00 | 43.79 |
| llama-2-13b-turbomind | 72.82 | 61.76 | 79.49 | 39.72 | 69.19 | 74.38 | 43.80 | 70.00 | 58.40 | 42.50 | 54.00 | 54.90 |
| llama-2-70b-turbomind | 83.50 | 77.12 | 91.03 | 56.03 | 86.87 | 87.60 | 44.69 | 77.00 | 77.31 | 52.93 | 74.00 | 75.65 |
| llama-3-8b-turbomind | 87.38 | 75.82 | 89.74 | 48.94 | 80.81 | 84.30 | 40.89 | 81.00 | 73.95 | 46.22 | 77.00 | 71.90 |
| llama-3-70b-turbomind | 91.26 | 87.25 | 94.87 | 64.18 | 93.94 | 89.26 | 62.91 | 83.00 | 87.82 | 61.80 | 90.00 | 85.78 |
| internlm2-1.8b-turbomind | 60.19 | 58.17 | 63.25 | 31.21 | 56.57 | 56.20 | 24.47 | 52.00 | 50.42 | 36.11 | 53.00 | 41.83 |
| internlm2-7b-turbomind | 79.61 | 75.49 | 87.61 | 48.23 | 82.83 | 77.69 | 49.39 | 74.00 | 72.27 | 47.65 | 73.00 | 65.03 |
| internlm2-20b-turbomind | 79.61 | 75.49 | 91.88 | 50.00 | 87.88 | 85.95 | 35.08 | 81.00 | 70.59 | 49.48 | 78.00 | 70.10 |
| qwen-1.8b-turbomind | 66.02 | 60.46 | 73.50 | 38.30 | 56.57 | 66.94 | 23.91 | 56.00 | 42.02 | 33.96 | 51.00 | 39.54 |
| qwen-7b-turbomind | 78.64 | 67.32 | 83.33 | 41.49 | 76.77 | 76.03 | 29.72 | 73.00 | 58.40 | 41.72 | 69.00 | 59.64 |
| qwen-14b-turbomind | 78.64 | 73.86 | 88.89 | 48.58 | 83.84 | 84.30 | 45.47 | 77.00 | 73.95 | 50.85 | 74.00 | 69.61 |
| qwen-72b-turbomind | 90.29 | 84.97 | 94.87 | 65.96 | 92.93 | 88.43 | 65.70 | 79.00 | 84.87 | 61.21 | 86.00 | 82.19 |
| qwen1.5-0.5b-hf | 52.43 | 46.41 | 60.68 | 31.21 | 46.46 | 56.20 | 25.70 | 46.00 | 37.39 | 32.79 | 46.00 | 37.75 |
| qwen1.5-1.8b-hf | 66.02 | 58.50 | 75.64 | 33.69 | 56.06 | 72.73 | 24.69 | 57.00 | 39.50 | 36.11 | 53.00 | 42.81 |
| qwen1.5-4b-hf | 74.76 | 62.75 | 84.19 | 46.81 | 76.77 | 71.07 | 25.03 | 67.00 | 55.04 | 41.33 | 64.00 | 56.05 |
| qwen1.5-7b-hf | 78.64 | 70.92 | 86.32 | 44.68 | 81.82 | 77.69 | 32.74 | 76.00 | 64.29 | 45.37 | 68.00 | 61.27 |
| qwen1.5-14b-hf | 80.58 | 75.49 | 85.90 | 51.06 | 86.36 | 80.99 | 45.03 | 80.00 | 76.47 | 48.57 | 78.00 | 69.61 |
| qwen1.5-32b-hf | 86.41 | 81.37 | 95.30 | 56.38 | 91.41 | 88.43 | 44.02 | 76.00 | 82.77 | 57.89 | 83.00 | 75.33 |
| qwen1.5-72b-hf | 87.38 | 85.29 | 94.87 | 64.89 | 92.42 | 90.08 | 62.12 | 83.00 | 84.03 | 60.76 | 86.00 | 81.05 |
| qwen1.5-moe-a2-7b-hf | 78.64 | 70.92 | 86.32 | 46.81 | 81.82 | 77.69 | 25.59 | 71.00 | 65.97 | 45.37 | 65.00 | 61.44 |
| mistral-7b-v0.1-hf | 82.52 | 75.49 | 87.61 | 48.94 | 76.77 | 77.69 | 32.51 | 77.00 | 66.39 | 44.98 | 74.00 | 67.97 |
| mistral-7b-v0.2-hf | 81.55 | 74.18 | 88.46 | 51.06 | 76.77 | 80.99 | 38.77 | 75.00 | 64.71 | 45.37 | 72.00 | 66.34 |
| mixtral-8x7b-v0.1-hf | 87.38 | 81.70 | 91.88 | 51.77 | 85.86 | 85.95 | 40.11 | 80.00 | 79.41 | 53.32 | 77.00 | 77.94 |
| mixtral-8x22b-v0.1-hf | 89.32 | 85.95 | 91.88 | 62.06 | 91.41 | 90.08 | 64.58 | 83.00 | 87.82 | 60.82 | 84.00 | 83.17 |
| yi-6b-hf | 80.58 | 71.57 | 91.03 | 48.23 | 83.33 | 76.86 | 41.34 | 75.00 | 74.79 | 49.35 | 80.00 | 65.69 |
| yi-34b-hf | 91.26 | 85.62 | 92.31 | 65.25 | 89.39 | 91.74 | 64.69 | 82.00 | 85.29 | 59.97 | 87.00 | 82.19 |
| deepseek-7b-base-hf | 61.17 | 53.59 | 72.22 | 34.04 | 59.09 | 65.29 | 26.37 | 61.00 | 44.96 | 35.53 | 56.00 | 49.18 |
| deepseek-67b-base-hf | 88.35 | 79.74 | 91.88 | 57.09 | 89.39 | 85.12 | 46.15 | 76.00 | 82.35 | 55.93 | 72.00 | 79.58 |
| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
|:------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
| llama-7b-turbomind | 41.67 | 49.12 | 40.84 | 34.94 | 29.56 | 40.00 | 34.10 | 35.11 | 26.46 | 27.81 | 34.00 | 41.82 |
| llama-13b-turbomind | 51.85 | 67.84 | 55.31 | 43.37 | 28.57 | 60.91 | 46.15 | 57.25 | 26.98 | 29.80 | 49.00 | 61.21 |
| llama-30b-turbomind | 71.30 | 79.53 | 66.24 | 49.40 | 40.39 | 70.00 | 56.67 | 64.89 | 37.30 | 35.10 | 60.00 | 70.91 |
| llama-65b-turbomind | 75.00 | 81.29 | 73.63 | 53.01 | 41.38 | 74.55 | 65.90 | 77.86 | 40.21 | 35.76 | 69.00 | 76.36 |
| llama-2-7b-turbomind | 53.70 | 69.01 | 60.13 | 41.57 | 36.95 | 54.55 | 45.90 | 55.73 | 27.25 | 31.13 | 40.00 | 59.39 |
| llama-2-13b-turbomind | 74.07 | 76.61 | 63.99 | 45.78 | 44.83 | 62.73 | 50.77 | 62.60 | 34.13 | 36.42 | 57.00 | 63.03 |
| llama-2-70b-turbomind | 83.33 | 85.96 | 78.46 | 53.61 | 52.22 | 69.09 | 74.87 | 87.02 | 43.39 | 43.71 | 78.00 | 84.24 |
| llama-3-8b-turbomind | 75.00 | 83.04 | 74.28 | 56.02 | 54.68 | 71.82 | 64.87 | 79.39 | 42.06 | 45.03 | 68.00 | 76.36 |
| llama-3-70b-turbomind | 86.11 | 91.23 | 86.50 | 57.83 | 71.92 | 74.55 | 82.56 | 88.55 | 62.70 | 56.95 | 86.00 | 86.67 |
| internlm2-1.8b-turbomind | 55.56 | 59.65 | 51.13 | 40.96 | 43.35 | 52.73 | 43.33 | 47.33 | 30.42 | 33.11 | 47.00 | 56.36 |
| internlm2-7b-turbomind | 79.63 | 82.46 | 73.63 | 51.20 | 55.17 | 70.00 | 66.92 | 70.99 | 46.03 | 42.38 | 70.00 | 78.79 |
| internlm2-20b-turbomind | 75.93 | 82.46 | 73.95 | 56.02 | 57.64 | 68.18 | 70.51 | 68.70 | 49.21 | 38.41 | 75.00 | 82.42 |
| qwen-1.8b-turbomind | 59.26 | 56.14 | 50.80 | 40.96 | 37.93 | 60.00 | 41.03 | 51.15 | 33.33 | 34.44 | 39.00 | 64.24 |
| qwen-7b-turbomind | 73.15 | 76.61 | 67.20 | 47.59 | 51.23 | 65.45 | 60.00 | 69.47 | 43.12 | 38.41 | 67.00 | 66.67 |
| qwen-14b-turbomind | 76.85 | 84.21 | 72.03 | 53.01 | 65.52 | 66.36 | 66.92 | 78.63 | 51.32 | 41.72 | 72.00 | 82.42 |
| qwen-72b-turbomind | 83.33 | 88.30 | 83.28 | 58.43 | 65.52 | 74.55 | 81.54 | 89.31 | 68.52 | 58.28 | 81.00 | 84.24 |
| qwen1.5-0.5b-hf | 40.74 | 40.94 | 41.48 | 40.96 | 28.57 | 50.91 | 36.92 | 41.98 | 28.84 | 22.52 | 37.00 | 52.73 |
| qwen1.5-1.8b-hf | 55.56 | 57.31 | 49.84 | 40.96 | 36.45 | 56.36 | 43.59 | 56.49 | 35.19 | 27.81 | 45.00 | 61.21 |
| qwen1.5-4b-hf | 70.37 | 70.76 | 61.74 | 44.58 | 45.32 | 65.45 | 54.62 | 64.89 | 47.88 | 32.45 | 62.00 | 70.30 |
| qwen1.5-7b-hf | 75.93 | 77.19 | 66.24 | 50.60 | 53.20 | 62.73 | 60.00 | 71.76 | 50.26 | 38.41 | 71.00 | 74.55 |
| qwen1.5-14b-hf | 74.07 | 83.63 | 70.74 | 46.39 | 58.62 | 64.55 | 73.59 | 76.34 | 59.26 | 49.01 | 75.00 | 83.64 |
| qwen1.5-32b-hf | 83.33 | 85.96 | 82.96 | 56.63 | 61.58 | 63.64 | 77.95 | 83.97 | 69.31 | 50.99 | 85.00 | 86.06 |
| qwen1.5-72b-hf | 84.26 | 88.89 | 82.32 | 57.23 | 66.01 | 72.73 | 82.05 | 87.02 | 69.31 | 56.95 | 84.00 | 84.24 |
| qwen1.5-moe-a2-7b-hf | 70.37 | 80.12 | 66.56 | 51.20 | 47.78 | 64.55 | 62.31 | 70.99 | 46.30 | 45.03 | 59.00 | 69.70 |
| mistral-7b-v0.1-hf | 77.78 | 83.04 | 69.45 | 54.82 | 53.20 | 67.27 | 66.15 | 78.63 | 38.10 | 31.79 | 68.00 | 78.79 |
| mistral-7b-v0.2-hf | 73.15 | 82.46 | 72.99 | 53.01 | 55.67 | 66.36 | 62.31 | 77.10 | 40.48 | 34.44 | 66.00 | 76.36 |
| mixtral-8x7b-v0.1-hf | 82.41 | 88.30 | 78.14 | 51.20 | 62.56 | 70.00 | 70.77 | 80.92 | 48.68 | 48.34 | 71.00 | 80.61 |
| mixtral-8x22b-v0.1-hf | 84.26 | 89.47 | 84.57 | 59.04 | 67.49 | 78.18 | 79.23 | 88.55 | 61.64 | 52.98 | 87.00 | 86.06 |
| yi-6b-hf | 78.70 | 81.87 | 69.77 | 46.39 | 52.71 | 73.64 | 65.13 | 74.81 | 46.30 | 38.41 | 66.00 | 71.52 |
| yi-34b-hf | 89.81 | 86.55 | 83.92 | 57.23 | 64.04 | 73.64 | 79.49 | 85.50 | 66.40 | 52.32 | 81.00 | 86.06 |
| deepseek-7b-base-hf | 55.56 | 73.10 | 56.59 | 46.99 | 34.98 | 62.73 | 48.21 | 58.78 | 28.57 | 29.14 | 50.00 | 61.82 |
| deepseek-67b-base-hf | 84.26 | 85.96 | 81.03 | 56.02 | 57.64 | 72.73 | 73.85 | 82.44 | 51.59 | 45.03 | 74.00 | 81.82 |
| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
|:------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
| llama-7b-turbomind | 42.00 | 40.46 | 32.87 | 42.78 | 26.19 | 46.11 | 35.19 | 33.47 | 32.90 | 42.33 | 43.88 | 43.75 |
| llama-13b-turbomind | 46.00 | 50.00 | 30.56 | 64.88 | 31.75 | 66.84 | 51.85 | 52.65 | 51.94 | 52.76 | 67.51 | 51.10 |
| llama-30b-turbomind | 55.00 | 66.76 | 49.07 | 77.91 | 36.51 | 82.90 | 68.21 | 66.12 | 69.35 | 67.48 | 80.59 | 55.88 |
| llama-65b-turbomind | 59.00 | 73.70 | 61.57 | 81.35 | 43.65 | 88.60 | 73.46 | 71.84 | 74.19 | 77.30 | 83.97 | 62.13 |
| llama-2-7b-turbomind | 53.00 | 51.16 | 27.78 | 63.60 | 27.78 | 67.36 | 48.77 | 47.76 | 50.97 | 51.53 | 64.56 | 52.57 |
| llama-2-13b-turbomind | 54.00 | 64.45 | 45.37 | 74.46 | 36.51 | 80.83 | 64.81 | 62.86 | 67.42 | 66.87 | 72.15 | 54.41 |
| llama-2-70b-turbomind | 72.00 | 77.17 | 63.43 | 86.08 | 48.41 | 94.30 | 83.64 | 78.37 | 81.61 | 80.98 | 87.76 | 74.63 |
| llama-3-8b-turbomind | 62.00 | 73.70 | 54.17 | 82.76 | 48.41 | 90.16 | 72.53 | 75.51 | 77.74 | 73.01 | 82.70 | 72.06 |
| llama-3-70b-turbomind | 83.00 | 85.55 | 72.22 | 92.21 | 66.67 | 97.41 | 91.05 | 84.90 | 90.32 | 87.73 | 94.09 | 87.13 |
| internlm2-1.8b-turbomind | 44.00 | 45.95 | 38.89 | 59.39 | 32.54 | 60.62 | 50.31 | 54.29 | 52.58 | 45.40 | 62.87 | 37.87 |
| internlm2-7b-turbomind | 69.00 | 66.76 | 57.87 | 80.72 | 50.00 | 90.16 | 73.15 | 75.10 | 79.68 | 68.71 | 81.01 | 70.22 |
| internlm2-20b-turbomind | 74.00 | 74.57 | 60.19 | 81.48 | 44.44 | 91.71 | 75.31 | 81.63 | 82.58 | 75.46 | 87.76 | 63.60 |
| qwen-1.8b-turbomind | 52.00 | 52.31 | 34.72 | 57.98 | 29.37 | 59.07 | 47.22 | 48.57 | 52.26 | 44.17 | 61.18 | 43.38 |
| qwen-7b-turbomind | 68.00 | 64.74 | 45.37 | 77.39 | 43.65 | 83.94 | 68.21 | 70.20 | 72.26 | 65.64 | 75.95 | 58.46 |
| qwen-14b-turbomind | 75.00 | 74.86 | 57.87 | 84.04 | 51.59 | 91.71 | 70.99 | 77.14 | 83.55 | 73.01 | 83.12 | 67.65 |
| qwen-72b-turbomind | 80.00 | 84.97 | 68.98 | 91.44 | 54.76 | 98.96 | 87.04 | 81.63 | 89.03 | 84.05 | 90.30 | 84.93 |
| qwen1.5-0.5b-hf | 47.00 | 46.82 | 23.15 | 48.02 | 29.37 | 48.70 | 40.12 | 38.37 | 40.65 | 35.58 | 53.16 | 31.62 |
| qwen1.5-1.8b-hf | 54.00 | 54.91 | 28.70 | 61.69 | 23.81 | 58.03 | 48.15 | 51.84 | 55.48 | 45.40 | 59.92 | 39.71 |
| qwen1.5-4b-hf | 65.00 | 66.76 | 44.44 | 73.95 | 35.71 | 78.24 | 60.19 | 65.31 | 66.45 | 65.64 | 71.31 | 50.00 |
| qwen1.5-7b-hf | 68.00 | 70.81 | 48.61 | 76.50 | 38.89 | 84.97 | 69.44 | 68.16 | 74.52 | 68.10 | 77.22 | 56.25 |
| qwen1.5-14b-hf | 77.00 | 73.70 | 62.96 | 83.40 | 53.17 | 90.67 | 71.60 | 80.82 | 84.52 | 76.69 | 83.54 | 71.69 |
| qwen1.5-32b-hf | 77.00 | 78.90 | 68.98 | 88.12 | 54.76 | 94.82 | 81.48 | 80.82 | 88.39 | 82.21 | 86.08 | 80.88 |
| qwen1.5-72b-hf | 80.00 | 84.39 | 68.98 | 91.44 | 55.56 | 98.96 | 86.73 | 81.63 | 88.71 | 85.89 | 89.87 | 82.72 |
| qwen1.5-moe-a2-7b-hf | 74.00 | 65.90 | 56.48 | 82.25 | 34.13 | 84.46 | 70.68 | 74.29 | 73.23 | 68.10 | 76.79 | 66.91 |
| mistral-7b-v0.1-hf | 57.00 | 71.10 | 57.41 | 81.61 | 40.48 | 86.53 | 73.46 | 72.65 | 76.77 | 79.14 | 77.22 | 68.75 |
| mistral-7b-v0.2-hf | 61.00 | 71.39 | 52.78 | 80.08 | 40.48 | 88.08 | 69.44 | 72.24 | 76.13 | 77.91 | 78.06 | 70.59 |
| mixtral-8x7b-v0.1-hf | 77.00 | 80.06 | 63.43 | 87.87 | 54.76 | 93.26 | 83.95 | 80.00 | 84.19 | 79.14 | 88.61 | 81.25 |
| mixtral-8x22b-v0.1-hf | 72.00 | 84.10 | 68.52 | 90.68 | 57.14 | 96.37 | 86.73 | 86.53 | 90.32 | 87.73 | 90.30 | 87.87 |
| yi-6b-hf | 67.00 | 69.36 | 52.78 | 80.46 | 44.44 | 89.64 | 70.99 | 74.69 | 77.10 | 78.53 | 78.90 | 65.81 |
| yi-34b-hf | 79.00 | 83.82 | 66.67 | 90.29 | 57.14 | 97.93 | 87.65 | 84.90 | 88.39 | 87.73 | 92.83 | 81.99 |
| deepseek-7b-base-hf | 49.00 | 52.31 | 41.20 | 66.28 | 30.95 | 63.73 | 55.86 | 51.84 | 52.90 | 58.90 | 62.45 | 45.22 |
| deepseek-67b-base-hf | 81.00 | 77.17 | 63.89 | 90.04 | 53.17 | 97.93 | 85.49 | 73.88 | 82.26 | 84.05 | 91.56 | 78.31 |
| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
|:------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
| llama-7b-turbomind | 24.81 | 32.95 | 38.73 | 45.77 | 27.19 | 48.07 | 38.12 | 43.00 |
| llama-13b-turbomind | 26.30 | 42.20 | 59.80 | 61.19 | 28.95 | 61.28 | 53.36 | 78.00 |
| llama-30b-turbomind | 27.41 | 54.91 | 76.96 | 79.10 | 35.96 | 76.15 | 67.71 | 83.00 |
| llama-65b-turbomind | 34.44 | 54.34 | 82.84 | 81.09 | 39.47 | 82.39 | 66.37 | 88.00 |
| llama-2-7b-turbomind | 29.63 | 43.35 | 60.29 | 62.69 | 27.19 | 62.75 | 56.05 | 64.00 |
| llama-2-13b-turbomind | 27.04 | 52.60 | 75.49 | 73.13 | 32.46 | 76.51 | 64.57 | 82.00 |
| llama-2-70b-turbomind | 34.07 | 64.16 | 90.69 | 90.55 | 44.74 | 87.52 | 80.27 | 92.00 |
| llama-3-8b-turbomind | 38.15 | 64.16 | 83.33 | 86.57 | 47.37 | 84.04 | 70.85 | 87.00 |
| llama-3-70b-turbomind | 48.89 | 79.77 | 95.10 | 94.03 | 72.81 | 94.13 | 82.51 | 94.00 |
| internlm2-1.8b-turbomind | 30.37 | 41.04 | 55.88 | 51.74 | 28.95 | 61.47 | 51.12 | 63.00 |
| internlm2-7b-turbomind | 39.63 | 68.21 | 76.96 | 84.58 | 44.74 | 84.59 | 72.65 | 86.00 |
| internlm2-20b-turbomind | 39.63 | 66.47 | 82.84 | 85.07 | 47.37 | 86.79 | 70.85 | 84.00 |
| qwen-1.8b-turbomind | 28.52 | 43.35 | 54.90 | 60.70 | 36.84 | 60.73 | 48.43 | 60.00 |
| qwen-7b-turbomind | 30.00 | 57.23 | 75.98 | 79.10 | 32.46 | 79.27 | 63.23 | 81.00 |
| qwen-14b-turbomind | 37.41 | 70.52 | 81.37 | 85.07 | 50.00 | 84.95 | 73.09 | 86.00 |
| qwen-72b-turbomind | 50.00 | 75.72 | 92.16 | 90.05 | 59.65 | 92.66 | 82.51 | 95.00 |
| qwen1.5-0.5b-hf | 29.63 | 33.53 | 45.10 | 59.70 | 28.95 | 44.77 | 37.22 | 69.00 |
| qwen1.5-1.8b-hf | 34.07 | 39.31 | 47.55 | 63.18 | 32.46 | 59.08 | 53.81 | 73.00 |
| qwen1.5-4b-hf | 35.93 | 55.49 | 71.08 | 73.13 | 37.72 | 72.11 | 63.68 | 79.00 |
| qwen1.5-7b-hf | 34.81 | 61.85 | 78.92 | 82.09 | 41.23 | 80.73 | 61.88 | 84.00 |
| qwen1.5-14b-hf | 45.93 | 68.21 | 80.88 | 83.08 | 55.26 | 86.06 | 73.09 | 88.00 |
| qwen1.5-32b-hf | 47.04 | 76.30 | 90.20 | 86.07 | 57.89 | 90.28 | 75.78 | 92.00 |
| qwen1.5-72b-hf | 47.78 | 75.14 | 92.65 | 88.56 | 59.65 | 92.48 | 79.82 | 94.00 |
| qwen1.5-moe-a2-7b-hf | 46.30 | 54.91 | 78.43 | 79.10 | 38.60 | 82.39 | 66.82 | 83.00 |
| mistral-7b-v0.1-hf | 33.70 | 65.32 | 78.92 | 83.08 | 50.00 | 82.39 | 69.51 | 86.00 |
| mistral-7b-v0.2-hf | 38.15 | 64.16 | 81.86 | 82.09 | 43.86 | 80.18 | 69.96 | 86.00 |
| mixtral-8x7b-v0.1-hf | 40.37 | 69.94 | 86.27 | 88.56 | 65.79 | 88.81 | 79.37 | 91.00 |
| mixtral-8x22b-v0.1-hf | 45.93 | 79.19 | 90.20 | 93.03 | 70.18 | 92.29 | 79.37 | 95.00 |
| yi-6b-hf | 32.59 | 61.27 | 79.90 | 82.59 | 35.96 | 82.94 | 67.26 | 86.00 |
| yi-34b-hf | 45.19 | 71.68 | 91.18 | 88.56 | 55.26 | 91.74 | 78.48 | 91.00 |
| deepseek-7b-base-hf | 28.89 | 41.62 | 60.29 | 70.15 | 26.32 | 69.72 | 55.61 | 76.00 |
| deepseek-67b-base-hf | 38.89 | 72.25 | 90.69 | 90.05 | 52.63 | 90.46 | 80.72 | 95.00 |
## Chat Models
| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
|:-----------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
| qwen1.5-0.5b-chat-hf | 35.32 | 30.90 | 37.59 | 37.29 | 37.73 |
| qwen1.5-1.8b-chat-hf | 45.62 | 39.20 | 49.21 | 47.67 | 49.63 |
| qwen1.5-4b-chat-hf | 55.90 | 48.07 | 62.67 | 59.70 | 57.31 |
| qwen1.5-7b-chat-hf | 61.79 | 52.68 | 69.41 | 66.41 | 63.45 |
| qwen1.5-14b-chat-hf | 67.96 | 59.79 | 75.46 | 71.23 | 69.72 |
| qwen1.5-32b-chat-hf | 75.36 | 67.04 | 82.11 | 80.44 | 76.23 |
| qwen1.5-72b-chat-hf | 77.24 | 69.59 | 83.95 | 81.58 | 77.87 |
| qwen1.5-110b-chat-hf | 77.95 | 71.56 | 83.77 | 81.44 | 78.41 |
| internlm2-chat-1.8b-hf | 47.58 | 40.88 | 53.33 | 49.92 | 49.74 |
| internlm2-chat-1.8b-sft-hf | 47.44 | 40.55 | 53.31 | 49.67 | 49.89 |
| internlm2-chat-7b-hf | 63.05 | 53.42 | 71.47 | 67.27 | 65.13 |
| internlm2-chat-7b-sft-hf | 63.33 | 53.95 | 71.74 | 67.62 | 65.00 |
| internlm2-chat-20b-hf | 67.37 | 57.39 | 75.75 | 71.63 | 69.95 |
| internlm2-chat-20b-sft-hf | 67.34 | 57.49 | 75.67 | 70.99 | 70.40 |
| llama-3-8b-instruct-hf | 68.37 | 58.01 | 77.82 | 71.22 | 71.94 |
| llama-3-70b-instruct-hf | 80.93 | 73.86 | 87.71 | 83.90 | 82.01 |
| llama-3-8b-instruct-lmdeploy | 67.35 | 56.66 | 75.96 | 70.90 | 71.49 |
| llama-3-70b-instruct-lmdeploy | 80.85 | 74.07 | 87.26 | 83.73 | 81.96 |
| mistral-7b-instruct-v0.1-hf | 54.36 | 43.74 | 62.96 | 58.87 | 57.46 |
| mistral-7b-instruct-v0.2-hf | 59.98 | 49.56 | 69.22 | 64.41 | 62.24 |
| mixtral-8x7b-instruct-v0.1-hf | 70.11 | 60.29 | 79.01 | 74.08 | 72.28 |
### Details
| model | college_biology | college_chemistry | college_computer_science | college_mathematics | college_physics | electrical_engineering | astronomy | anatomy | abstract_algebra | machine_learning | clinical_knowledge | global_facts |
|:-----------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
| qwen1.5-0.5b-chat-hf | 31.25 | 32.00 | 33.00 | 29.00 | 33.33 | 38.62 | 33.55 | 28.89 | 20.00 | 27.68 | 40.38 | 33.00 |
| qwen1.5-1.8b-chat-hf | 42.36 | 28.00 | 45.00 | 33.00 | 27.45 | 44.83 | 51.97 | 42.22 | 32.00 | 38.39 | 48.30 | 30.00 |
| qwen1.5-4b-chat-hf | 56.25 | 47.00 | 49.00 | 39.00 | 36.27 | 54.48 | 57.89 | 49.63 | 38.00 | 33.04 | 59.62 | 23.00 |
| qwen1.5-7b-chat-hf | 64.58 | 51.00 | 59.00 | 37.00 | 41.18 | 53.79 | 66.45 | 53.33 | 43.00 | 41.07 | 67.92 | 36.00 |
| qwen1.5-14b-chat-hf | 77.08 | 51.00 | 64.00 | 42.00 | 45.10 | 64.83 | 77.63 | 65.93 | 39.00 | 46.43 | 73.21 | 45.00 |
| qwen1.5-32b-chat-hf | 84.72 | 53.00 | 57.00 | 48.00 | 52.94 | 74.48 | 82.24 | 67.41 | 52.00 | 61.61 | 78.11 | 48.00 |
| qwen1.5-72b-chat-hf | 90.97 | 57.00 | 66.00 | 55.00 | 55.88 | 80.00 | 88.16 | 72.59 | 56.00 | 59.82 | 80.00 | 51.00 |
| qwen1.5-110b-chat-hf | 88.89 | 62.00 | 66.00 | 64.00 | 58.82 | 75.86 | 89.47 | 68.15 | 59.00 | 63.39 | 79.62 | 59.00 |
| internlm2-chat-1.8b-hf | 49.31 | 36.00 | 47.00 | 33.00 | 36.27 | 42.76 | 48.03 | 49.63 | 30.00 | 33.93 | 53.58 | 28.00 |
| internlm2-chat-1.8b-sft-hf | 51.39 | 37.00 | 50.00 | 33.00 | 33.33 | 42.76 | 46.05 | 49.63 | 31.00 | 32.14 | 53.21 | 29.00 |
| internlm2-chat-7b-hf | 68.75 | 47.00 | 62.00 | 32.00 | 38.24 | 57.24 | 69.74 | 58.52 | 29.00 | 53.57 | 70.19 | 41.00 |
| internlm2-chat-7b-sft-hf | 71.53 | 47.00 | 63.00 | 34.00 | 37.25 | 57.24 | 69.74 | 57.78 | 29.00 | 52.68 | 69.43 | 34.00 |
| internlm2-chat-20b-hf | 76.39 | 51.00 | 61.00 | 37.00 | 40.20 | 62.76 | 78.95 | 67.41 | 33.00 | 46.43 | 75.09 | 42.00 |
| internlm2-chat-20b-sft-hf | 77.08 | 49.00 | 60.00 | 39.00 | 39.22 | 64.14 | 79.61 | 68.15 | 35.00 | 46.43 | 75.09 | 42.00 |
| llama-3-8b-instruct-hf | 81.94 | 48.00 | 58.00 | 43.00 | 48.04 | 60.69 | 76.32 | 71.11 | 33.00 | 54.46 | 73.58 | 46.00 |
| llama-3-70b-instruct-hf | 93.06 | 56.00 | 70.00 | 60.00 | 60.78 | 77.24 | 93.42 | 79.26 | 53.00 | 71.43 | 86.42 | 66.00 |
| llama-3-8b-instruct-lmdeploy | 79.17 | 47.00 | 53.00 | 36.00 | 49.02 | 60.00 | 73.68 | 68.89 | 36.00 | 55.36 | 73.96 | 42.00 |
| llama-3-70b-instruct-lmdeploy | 93.75 | 57.00 | 66.00 | 61.00 | 65.69 | 77.93 | 92.11 | 78.52 | 55.00 | 70.54 | 86.42 | 64.00 |
| mistral-7b-instruct-v0.1-hf | 57.64 | 35.00 | 50.00 | 31.00 | 24.51 | 51.72 | 58.55 | 45.93 | 35.00 | 41.07 | 56.98 | 32.00 |
| mistral-7b-instruct-v0.2-hf | 70.14 | 42.00 | 49.00 | 35.00 | 43.14 | 54.48 | 65.79 | 56.30 | 29.00 | 42.86 | 65.28 | 37.00 |
| mixtral-8x7b-instruct-v0.1-hf | 81.25 | 57.00 | 57.00 | 40.00 | 50.00 | 60.69 | 80.92 | 65.93 | 45.00 | 50.89 | 76.60 | 41.00 |
| model | management | nutrition | marketing | professional_accounting | high_school_geography | international_law | moral_scenarios | computer_security | high_school_microeconomics | professional_law | medical_genetics | professional_psychology |
|:-----------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
| qwen1.5-0.5b-chat-hf | 41.75 | 38.89 | 49.15 | 26.60 | 48.48 | 50.41 | 24.69 | 42.00 | 32.35 | 31.75 | 31.00 | 32.35 |
| qwen1.5-1.8b-chat-hf | 62.14 | 55.56 | 76.92 | 34.40 | 58.08 | 61.16 | 21.90 | 56.00 | 42.44 | 35.14 | 50.00 | 44.93 |
| qwen1.5-4b-chat-hf | 73.79 | 58.50 | 82.05 | 47.16 | 74.24 | 71.90 | 32.29 | 69.00 | 58.40 | 40.74 | 58.00 | 53.76 |
| qwen1.5-7b-chat-hf | 79.61 | 69.28 | 85.47 | 41.49 | 78.79 | 76.86 | 35.75 | 74.00 | 65.13 | 44.78 | 68.00 | 57.68 |
| qwen1.5-14b-chat-hf | 82.52 | 70.26 | 87.18 | 51.77 | 85.86 | 82.64 | 53.74 | 81.00 | 76.05 | 47.98 | 76.00 | 67.48 |
| qwen1.5-32b-chat-hf | 84.47 | 77.78 | 94.44 | 60.99 | 90.91 | 87.60 | 72.96 | 79.00 | 83.61 | 58.28 | 83.00 | 77.94 |
| qwen1.5-72b-chat-hf | 89.32 | 85.95 | 93.59 | 61.35 | 90.91 | 86.78 | 75.98 | 83.00 | 84.87 | 60.30 | 83.00 | 81.05 |
| qwen1.5-110b-chat-hf | 86.41 | 80.72 | 92.74 | 69.15 | 93.94 | 84.30 | 77.88 | 83.00 | 88.66 | 61.73 | 84.00 | 82.19 |
| internlm2-chat-1.8b-hf | 72.82 | 50.65 | 69.23 | 35.46 | 56.06 | 56.20 | 27.82 | 60.00 | 49.16 | 33.83 | 54.00 | 43.79 |
| internlm2-chat-1.8b-sft-hf | 71.84 | 52.61 | 68.80 | 34.75 | 55.56 | 53.72 | 27.04 | 58.00 | 48.74 | 34.09 | 54.00 | 44.61 |
| internlm2-chat-7b-hf | 78.64 | 66.67 | 85.90 | 46.81 | 79.29 | 70.25 | 35.31 | 79.00 | 68.07 | 46.41 | 68.00 | 64.87 |
| internlm2-chat-7b-sft-hf | 79.61 | 67.97 | 86.75 | 47.52 | 80.30 | 70.25 | 35.98 | 80.00 | 69.33 | 45.83 | 70.00 | 65.36 |
| internlm2-chat-20b-hf | 80.58 | 75.16 | 90.17 | 52.13 | 83.84 | 80.99 | 39.33 | 80.00 | 70.59 | 49.67 | 75.00 | 70.26 |
| internlm2-chat-20b-sft-hf | 80.58 | 76.14 | 91.03 | 53.19 | 84.34 | 80.99 | 36.31 | 77.00 | 71.85 | 49.61 | 77.00 | 70.59 |
| llama-3-8b-instruct-hf | 82.52 | 79.41 | 91.45 | 52.48 | 80.30 | 79.34 | 46.26 | 75.00 | 76.89 | 49.61 | 85.00 | 72.22 |
| llama-3-70b-instruct-hf | 89.32 | 87.58 | 93.16 | 66.67 | 92.42 | 90.08 | 76.20 | 83.00 | 89.50 | 64.67 | 92.00 | 87.09 |
| llama-3-8b-instruct-lmdeploy | 87.38 | 79.41 | 90.17 | 52.48 | 79.80 | 78.51 | 44.25 | 75.00 | 74.37 | 48.76 | 84.00 | 69.61 |
| llama-3-70b-instruct-lmdeploy | 90.29 | 88.56 | 93.59 | 65.96 | 92.93 | 89.26 | 75.75 | 83.00 | 89.92 | 63.95 | 92.00 | 86.60 |
| mistral-7b-instruct-v0.1-hf | 69.90 | 59.80 | 85.47 | 38.65 | 69.70 | 65.29 | 37.54 | 69.00 | 51.26 | 37.81 | 65.00 | 52.45 |
| mistral-7b-instruct-v0.2-hf | 74.76 | 66.99 | 88.89 | 43.97 | 75.25 | 76.86 | 42.01 | 73.00 | 62.61 | 42.24 | 67.00 | 62.25 |
| mixtral-8x7b-instruct-v0.1-hf | 85.44 | 80.39 | 92.74 | 55.32 | 85.35 | 82.64 | 48.38 | 78.00 | 75.21 | 53.52 | 75.00 | 74.02 |
| model | jurisprudence | world_religions | philosophy | virology | high_school_chemistry | public_relations | high_school_macroeconomics | human_sexuality | elementary_mathematics | high_school_physics | high_school_computer_science | high_school_european_history |
|:-----------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
| qwen1.5-0.5b-chat-hf | 42.59 | 24.56 | 39.87 | 39.76 | 29.06 | 38.18 | 35.64 | 38.93 | 27.78 | 29.80 | 34.00 | 48.48 |
| qwen1.5-1.8b-chat-hf | 50.93 | 56.73 | 44.37 | 42.77 | 35.96 | 51.82 | 38.46 | 49.62 | 35.45 | 27.15 | 47.00 | 63.03 |
| qwen1.5-4b-chat-hf | 71.30 | 65.50 | 58.20 | 50.00 | 44.33 | 57.27 | 54.10 | 61.83 | 43.65 | 41.06 | 60.00 | 72.12 |
| qwen1.5-7b-chat-hf | 76.85 | 76.61 | 68.49 | 48.80 | 51.72 | 64.55 | 59.23 | 68.70 | 48.94 | 37.09 | 69.00 | 79.39 |
| qwen1.5-14b-chat-hf | 75.93 | 80.70 | 69.13 | 51.20 | 55.67 | 64.55 | 67.69 | 74.05 | 57.14 | 47.02 | 74.00 | 82.42 |
| qwen1.5-32b-chat-hf | 83.33 | 89.47 | 82.64 | 60.84 | 62.56 | 70.00 | 76.67 | 83.21 | 67.46 | 59.60 | 85.00 | 84.85 |
| qwen1.5-72b-chat-hf | 86.11 | 89.47 | 80.71 | 59.04 | 68.47 | 72.73 | 80.00 | 87.79 | 67.72 | 52.32 | 79.00 | 85.45 |
| qwen1.5-110b-chat-hf | 83.33 | 87.13 | 81.03 | 54.22 | 69.95 | 73.64 | 78.21 | 87.02 | 75.93 | 57.62 | 84.00 | 88.48 |
| internlm2-chat-1.8b-hf | 52.78 | 60.82 | 49.20 | 42.77 | 42.36 | 50.00 | 47.18 | 53.44 | 32.54 | 31.79 | 39.00 | 60.00 |
| internlm2-chat-1.8b-sft-hf | 53.70 | 61.40 | 50.16 | 42.17 | 40.89 | 50.00 | 47.69 | 51.15 | 32.54 | 29.14 | 40.00 | 59.39 |
| internlm2-chat-7b-hf | 73.15 | 81.87 | 67.85 | 47.59 | 49.75 | 62.73 | 61.79 | 66.41 | 44.97 | 33.77 | 71.00 | 81.82 |
| internlm2-chat-7b-sft-hf | 73.15 | 81.87 | 66.88 | 48.19 | 48.77 | 63.64 | 62.31 | 65.65 | 45.77 | 33.77 | 72.00 | 81.82 |
| internlm2-chat-20b-hf | 80.56 | 81.87 | 72.99 | 55.42 | 54.19 | 70.00 | 67.95 | 71.76 | 48.15 | 39.74 | 75.00 | 80.00 |
| internlm2-chat-20b-sft-hf | 81.48 | 79.53 | 72.99 | 54.82 | 54.19 | 69.09 | 67.95 | 71.76 | 48.94 | 41.06 | 75.00 | 80.00 |
| llama-3-8b-instruct-hf | 76.85 | 79.53 | 72.35 | 53.61 | 54.19 | 70.91 | 66.41 | 80.92 | 49.47 | 46.36 | 71.00 | 75.15 |
| llama-3-70b-instruct-hf | 87.04 | 88.30 | 82.64 | 56.02 | 67.49 | 74.55 | 86.41 | 88.55 | 74.34 | 65.56 | 91.00 | 86.06 |
| llama-3-8b-instruct-lmdeploy | 77.78 | 79.53 | 70.74 | 52.41 | 53.20 | 68.18 | 65.38 | 79.39 | 50.79 | 37.75 | 72.00 | 76.97 |
| llama-3-70b-instruct-lmdeploy | 87.96 | 90.64 | 83.28 | 54.82 | 69.46 | 73.64 | 86.92 | 87.02 | 74.87 | 66.23 | 92.00 | 85.45 |
| mistral-7b-instruct-v0.1-hf | 64.81 | 70.18 | 63.67 | 41.57 | 38.92 | 68.18 | 49.49 | 61.83 | 33.33 | 32.45 | 55.00 | 66.67 |
| mistral-7b-instruct-v0.2-hf | 70.37 | 80.12 | 64.95 | 50.60 | 50.74 | 68.18 | 54.36 | 71.76 | 40.74 | 35.10 | 60.00 | 73.33 |
| mixtral-8x7b-instruct-v0.1-hf | 79.63 | 87.72 | 73.63 | 54.82 | 61.58 | 67.27 | 69.49 | 83.21 | 52.91 | 47.02 | 74.00 | 80.61 |
| model | business_ethics | moral_disputes | high_school_statistics | miscellaneous | formal_logic | high_school_government_and_politics | prehistory | security_studies | high_school_biology | logical_fallacies | high_school_world_history | professional_medicine |
|:-----------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
| qwen1.5-0.5b-chat-hf | 45.00 | 41.04 | 30.09 | 39.21 | 24.60 | 35.23 | 33.95 | 25.31 | 36.13 | 31.29 | 49.37 | 38.24 |
| qwen1.5-1.8b-chat-hf | 54.00 | 50.29 | 34.26 | 58.49 | 24.60 | 55.96 | 47.53 | 39.18 | 47.74 | 44.17 | 64.98 | 40.81 |
| qwen1.5-4b-chat-hf | 61.00 | 64.16 | 46.30 | 71.01 | 39.68 | 72.02 | 54.01 | 65.31 | 63.55 | 63.80 | 71.31 | 51.10 |
| qwen1.5-7b-chat-hf | 69.00 | 67.05 | 50.93 | 76.25 | 53.17 | 82.38 | 62.96 | 71.02 | 73.23 | 68.10 | 76.79 | 60.29 |
| qwen1.5-14b-chat-hf | 74.00 | 75.14 | 58.33 | 82.89 | 51.59 | 88.60 | 69.44 | 77.96 | 84.19 | 73.62 | 82.70 | 71.32 |
| qwen1.5-32b-chat-hf | 80.00 | 80.64 | 70.83 | 89.40 | 60.32 | 94.82 | 81.79 | 79.59 | 90.00 | 86.50 | 88.61 | 80.15 |
| qwen1.5-72b-chat-hf | 80.00 | 82.95 | 68.98 | 91.83 | 57.14 | 98.45 | 86.73 | 78.78 | 89.03 | 87.12 | 91.14 | 83.82 |
| qwen1.5-110b-chat-hf | 79.00 | 78.03 | 67.13 | 92.98 | 62.70 | 97.93 | 87.04 | 74.29 | 88.71 | 82.82 | 91.14 | 84.93 |
| internlm2-chat-1.8b-hf | 48.00 | 49.13 | 44.91 | 57.60 | 26.98 | 61.14 | 50.62 | 51.02 | 52.58 | 57.67 | 67.51 | 37.50 |
| internlm2-chat-1.8b-sft-hf | 50.00 | 49.13 | 44.91 | 57.73 | 28.57 | 61.66 | 49.69 | 51.02 | 49.68 | 57.67 | 66.67 | 38.60 |
| internlm2-chat-7b-hf | 65.00 | 65.61 | 49.54 | 80.84 | 43.65 | 88.08 | 70.99 | 68.98 | 78.39 | 75.46 | 82.28 | 61.76 |
| internlm2-chat-7b-sft-hf | 64.00 | 66.18 | 52.31 | 81.35 | 46.03 | 88.08 | 71.60 | 67.76 | 78.39 | 77.30 | 82.28 | 63.60 |
| internlm2-chat-20b-hf | 74.00 | 73.70 | 59.72 | 81.86 | 46.83 | 89.12 | 74.69 | 75.92 | 80.65 | 79.14 | 82.70 | 70.59 |
| internlm2-chat-20b-sft-hf | 76.00 | 73.12 | 60.19 | 81.99 | 43.65 | 88.60 | 74.38 | 73.88 | 80.32 | 80.37 | 82.70 | 70.59 |
| llama-3-8b-instruct-hf | 72.00 | 73.12 | 55.09 | 84.55 | 50.00 | 90.67 | 77.16 | 77.55 | 81.61 | 77.91 | 84.81 | 75.00 |
| llama-3-70b-instruct-hf | 85.00 | 85.26 | 75.00 | 92.72 | 69.05 | 97.41 | 90.43 | 82.04 | 91.61 | 87.12 | 94.09 | 89.71 |
| llama-3-8b-instruct-lmdeploy | 72.00 | 72.83 | 52.78 | 82.12 | 51.59 | 89.64 | 76.85 | 76.73 | 80.97 | 76.69 | 84.39 | 74.63 |
| llama-3-70b-instruct-lmdeploy | 85.00 | 84.39 | 73.61 | 92.72 | 67.46 | 97.93 | 89.81 | 81.63 | 90.65 | 87.12 | 93.25 | 89.34 |
| mistral-7b-instruct-v0.1-hf | 55.00 | 57.51 | 39.81 | 74.07 | 39.68 | 75.65 | 57.72 | 62.04 | 59.35 | 69.33 | 67.93 | 55.88 |
| mistral-7b-instruct-v0.2-hf | 61.00 | 66.76 | 46.76 | 78.67 | 36.51 | 84.97 | 68.83 | 70.20 | 68.39 | 69.33 | 73.00 | 58.09 |
| mixtral-8x7b-instruct-v0.1-hf | 66.00 | 76.59 | 57.87 | 86.59 | 50.00 | 93.78 | 83.02 | 79.18 | 82.58 | 75.46 | 86.50 | 77.94 |
| model | high_school_mathematics | college_medicine | high_school_us_history | sociology | econometrics | high_school_psychology | human_aging | us_foreign_policy |
|:-----------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
| qwen1.5-0.5b-chat-hf | 24.44 | 35.26 | 42.16 | 47.26 | 29.82 | 40.55 | 32.29 | 47.00 |
| qwen1.5-1.8b-chat-hf | 32.22 | 43.35 | 54.90 | 48.26 | 28.95 | 61.83 | 48.43 | 71.00 |
| qwen1.5-4b-chat-hf | 36.30 | 51.45 | 71.08 | 76.62 | 34.21 | 72.29 | 58.30 | 72.00 |
| qwen1.5-7b-chat-hf | 31.11 | 61.27 | 76.47 | 79.10 | 42.11 | 81.28 | 61.43 | 83.00 |
| qwen1.5-14b-chat-hf | 41.48 | 68.79 | 80.88 | 82.59 | 48.25 | 84.40 | 72.20 | 88.00 |
| qwen1.5-32b-chat-hf | 48.52 | 75.72 | 88.73 | 86.07 | 57.02 | 90.46 | 78.03 | 95.00 |
| qwen1.5-72b-chat-hf | 51.48 | 73.99 | 90.69 | 87.06 | 59.65 | 92.11 | 79.37 | 94.00 |
| qwen1.5-110b-chat-hf | 52.22 | 76.30 | 93.14 | 87.56 | 62.28 | 91.56 | 80.27 | 88.00 |
| internlm2-chat-1.8b-hf | 31.48 | 46.82 | 56.37 | 65.17 | 28.07 | 65.87 | 50.22 | 69.00 |
| internlm2-chat-1.8b-sft-hf | 30.74 | 47.40 | 54.41 | 64.18 | 29.82 | 66.24 | 48.43 | 69.00 |
| internlm2-chat-7b-hf | 33.70 | 67.05 | 79.90 | 81.09 | 48.25 | 84.04 | 67.26 | 84.00 |
| internlm2-chat-7b-sft-hf | 35.19 | 67.05 | 79.90 | 80.60 | 48.25 | 84.59 | 65.47 | 85.00 |
| internlm2-chat-20b-hf | 36.30 | 66.47 | 88.73 | 85.07 | 51.75 | 85.69 | 70.85 | 87.00 |
| internlm2-chat-20b-sft-hf | 35.93 | 65.90 | 87.75 | 85.57 | 52.63 | 84.77 | 70.85 | 87.00 |
| llama-3-8b-instruct-hf | 36.67 | 68.79 | 83.82 | 86.57 | 61.40 | 84.95 | 70.85 | 85.00 |
| llama-3-70b-instruct-hf | 57.41 | 78.61 | 89.71 | 91.54 | 74.56 | 94.50 | 82.96 | 94.00 |
| llama-3-8b-instruct-lmdeploy | 38.52 | 68.79 | 82.84 | 85.57 | 54.39 | 85.50 | 69.96 | 83.00 |
| llama-3-70b-instruct-lmdeploy | 54.81 | 79.77 | 90.20 | 92.04 | 71.05 | 94.50 | 82.96 | 93.00 |
| mistral-7b-instruct-v0.1-hf | 28.89 | 50.29 | 67.16 | 76.12 | 39.47 | 72.29 | 62.33 | 77.00 |
| mistral-7b-instruct-v0.2-hf | 30.74 | 53.18 | 73.04 | 77.11 | 42.11 | 79.82 | 63.68 | 82.00 |
| mixtral-8x7b-instruct-v0.1-hf | 35.56 | 73.41 | 85.29 | 87.06 | 60.53 | 86.97 | 74.44 | 86.00 |

View File

@ -0,0 +1,64 @@
# NQ
## Base Models
| model | nq |
|:------------------------:|------:|
| llama-7b-turbomind | 15.10 |
| llama-13b-turbomind | 16.43 |
| llama-30b-turbomind | 22.11 |
| llama-65b-turbomind | 26.09 |
| llama-2-7b-turbomind | 14.35 |
| llama-2-13b-turbomind | 21.69 |
| llama-2-70b-turbomind | 23.27 |
| llama-3-8b-turbomind | 18.78 |
| llama-3-70b-turbomind | 31.88 |
| internlm2-1.8b-turbomind | 20.66 |
| internlm2-7b-turbomind | 41.05 |
| internlm2-20b-turbomind | 43.55 |
| qwen-1.8b-turbomind | 5.68 |
| qwen-7b-turbomind | 17.87 |
| qwen-14b-turbomind | 13.77 |
| qwen-72b-turbomind | 18.20 |
| qwen1.5-0.5b-hf | 6.01 |
| qwen1.5-1.8b-hf | 10.28 |
| qwen1.5-4b-hf | 15.73 |
| qwen1.5-7b-hf | 18.61 |
| qwen1.5-14b-hf | 16.07 |
| qwen1.5-32b-hf | 21.75 |
| qwen1.5-72b-hf | 20.53 |
| qwen1.5-moe-a2-7b-hf | 16.62 |
| mistral-7b-v0.1-hf | 20.66 |
| mistral-7b-v0.2-hf | 20.78 |
| mixtral-8x7b-v0.1-hf | 24.85 |
| mixtral-8x22b-v0.1-hf | 34.43 |
| yi-6b-hf | 10.08 |
| yi-34b-hf | 13.96 |
| deepseek-7b-base-hf | 8.45 |
| deepseek-67b-base-hf | 17.59 |
## Chat Models
| model | nq |
|:-----------------------------:|------:|
| qwen1.5-0.5b-chat-hf | 7.42 |
| qwen1.5-1.8b-chat-hf | 10.22 |
| qwen1.5-4b-chat-hf | 19.31 |
| qwen1.5-7b-chat-hf | 16.87 |
| qwen1.5-14b-chat-hf | 20.53 |
| qwen1.5-32b-chat-hf | 25.26 |
| qwen1.5-72b-chat-hf | 35.21 |
| qwen1.5-110b-chat-hf | 36.98 |
| internlm2-chat-1.8b-hf | 19.09 |
| internlm2-chat-1.8b-sft-hf | 18.14 |
| internlm2-chat-7b-hf | 28.73 |
| internlm2-chat-7b-sft-hf | 30.78 |
| internlm2-chat-20b-hf | 28.75 |
| internlm2-chat-20b-sft-hf | 34.10 |
| llama-3-8b-instruct-hf | 30.17 |
| llama-3-70b-instruct-hf | 40.25 |
| llama-3-8b-instruct-lmdeploy | 28.28 |
| llama-3-70b-instruct-lmdeploy | 39.14 |
| mistral-7b-instruct-v0.1-hf | 22.47 |
| mistral-7b-instruct-v0.2-hf | 25.18 |
| mixtral-8x7b-instruct-v0.1-hf | 32.08 |

View File

@ -0,0 +1,64 @@
# RACE
## Base Models
| model | race-high | race-middle |
|:------------------------:|------------:|--------------:|
| llama-7b-turbomind | 31.30 | 29.53 |
| llama-13b-turbomind | 35.56 | 40.25 |
| llama-30b-turbomind | 57.35 | 55.78 |
| llama-65b-turbomind | 70.21 | 75.35 |
| llama-2-7b-turbomind | 39.74 | 46.73 |
| llama-2-13b-turbomind | 57.06 | 60.52 |
| llama-2-70b-turbomind | 79.02 | 82.17 |
| llama-3-8b-turbomind | 67.75 | 73.61 |
| llama-3-70b-turbomind | 85.79 | 90.25 |
| internlm2-1.8b-turbomind | 64.72 | 70.40 |
| internlm2-7b-turbomind | 72.56 | 74.16 |
| internlm2-20b-turbomind | 72.90 | 74.03 |
| qwen-1.8b-turbomind | 63.09 | 69.29 |
| qwen-7b-turbomind | 80.30 | 85.38 |
| qwen-14b-turbomind | 88.11 | 92.06 |
| qwen-72b-turbomind | 90.62 | 93.59 |
| qwen1.5-0.5b-hf | 54.66 | 60.38 |
| qwen1.5-1.8b-hf | 67.27 | 73.33 |
| qwen1.5-4b-hf | 78.50 | 83.29 |
| qwen1.5-7b-hf | 82.73 | 86.70 |
| qwen1.5-14b-hf | 87.99 | 91.85 |
| qwen1.5-32b-hf | 90.57 | 93.25 |
| qwen1.5-72b-hf | 90.45 | 93.87 |
| qwen1.5-moe-a2-7b-hf | 79.56 | 83.57 |
| mistral-7b-v0.1-hf | 73.58 | 76.25 |
| mistral-7b-v0.2-hf | 73.67 | 77.09 |
| mixtral-8x7b-v0.1-hf | 80.13 | 84.61 |
| mixtral-8x22b-v0.1-hf | 86.56 | 89.62 |
| yi-6b-hf | 82.93 | 85.72 |
| yi-34b-hf | 90.94 | 92.76 |
| deepseek-7b-base-hf | 50.91 | 56.82 |
| deepseek-67b-base-hf | 83.53 | 88.23 |
## Chat Models
| model | race-high | race-middle |
|:-----------------------------:|------------:|--------------:|
| qwen1.5-0.5b-chat-hf | 49.03 | 52.79 |
| qwen1.5-1.8b-chat-hf | 66.24 | 72.91 |
| qwen1.5-4b-chat-hf | 73.53 | 80.29 |
| qwen1.5-7b-chat-hf | 83.28 | 88.09 |
| qwen1.5-14b-chat-hf | 87.51 | 91.36 |
| qwen1.5-32b-chat-hf | 91.22 | 93.52 |
| qwen1.5-72b-chat-hf | 91.11 | 93.38 |
| qwen1.5-110b-chat-hf | 92.31 | 93.66 |
| internlm2-chat-1.8b-hf | 73.87 | 81.13 |
| internlm2-chat-1.8b-sft-hf | 73.81 | 81.69 |
| internlm2-chat-7b-hf | 84.51 | 88.72 |
| internlm2-chat-7b-sft-hf | 84.88 | 89.90 |
| internlm2-chat-20b-hf | 88.02 | 91.43 |
| internlm2-chat-20b-sft-hf | 88.11 | 91.57 |
| llama-3-8b-instruct-hf | 81.22 | 86.63 |
| llama-3-70b-instruct-hf | 89.57 | 93.45 |
| llama-3-8b-instruct-lmdeploy | 81.02 | 86.14 |
| llama-3-70b-instruct-lmdeploy | 89.34 | 93.25 |
| mistral-7b-instruct-v0.1-hf | 69.75 | 74.72 |
| mistral-7b-instruct-v0.2-hf | 73.30 | 77.58 |
| mixtral-8x7b-instruct-v0.1-hf | 81.88 | 87.26 |

View File

@ -0,0 +1,64 @@
# TriviaQA
## Base Models
| model | triviaqa |
|:------------------------:|-----------:|
| llama-7b-turbomind | 40.39 |
| llama-13b-turbomind | 66.41 |
| llama-30b-turbomind | 75.90 |
| llama-65b-turbomind | 82.26 |
| llama-2-7b-turbomind | 43.21 |
| llama-2-13b-turbomind | 71.32 |
| llama-2-70b-turbomind | 67.45 |
| llama-3-8b-turbomind | 71.24 |
| llama-3-70b-turbomind | 88.16 |
| internlm2-1.8b-turbomind | 38.42 |
| internlm2-7b-turbomind | 69.15 |
| internlm2-20b-turbomind | 74.03 |
| qwen-1.8b-turbomind | 22.76 |
| qwen-7b-turbomind | 53.61 |
| qwen-14b-turbomind | 49.72 |
| qwen-72b-turbomind | 79.13 |
| qwen1.5-0.5b-hf | 21.24 |
| qwen1.5-1.8b-hf | 34.32 |
| qwen1.5-4b-hf | 44.59 |
| qwen1.5-7b-hf | 56.60 |
| qwen1.5-14b-hf | 59.96 |
| qwen1.5-32b-hf | 56.20 |
| qwen1.5-72b-hf | 77.81 |
| qwen1.5-moe-a2-7b-hf | 65.49 |
| mistral-7b-v0.1-hf | 72.93 |
| mistral-7b-v0.2-hf | 70.91 |
| mixtral-8x7b-v0.1-hf | 85.05 |
| mixtral-8x22b-v0.1-hf | 89.47 |
| yi-6b-hf | 23.76 |
| yi-34b-hf | 14.73 |
| deepseek-7b-base-hf | 59.48 |
| deepseek-67b-base-hf | 72.15 |
## Chat Models
| model | triviaqa |
|:-----------------------------:|-----------:|
| qwen1.5-0.5b-chat-hf | 19.84 |
| qwen1.5-1.8b-chat-hf | 35.81 |
| qwen1.5-4b-chat-hf | 48.93 |
| qwen1.5-7b-chat-hf | 53.65 |
| qwen1.5-14b-chat-hf | 62.58 |
| qwen1.5-32b-chat-hf | 74.72 |
| qwen1.5-72b-chat-hf | 83.25 |
| qwen1.5-110b-chat-hf | 86.20 |
| internlm2-chat-1.8b-hf | 46.69 |
| internlm2-chat-1.8b-sft-hf | 46.50 |
| internlm2-chat-7b-hf | 69.54 |
| internlm2-chat-7b-sft-hf | 70.75 |
| internlm2-chat-20b-hf | 75.53 |
| internlm2-chat-20b-sft-hf | 75.90 |
| llama-3-8b-instruct-hf | 78.99 |
| llama-3-70b-instruct-hf | 89.79 |
| llama-3-8b-instruct-lmdeploy | 76.77 |
| llama-3-70b-instruct-lmdeploy | 89.62 |
| mistral-7b-instruct-v0.1-hf | 62.94 |
| mistral-7b-instruct-v0.2-hf | 67.72 |
| mixtral-8x7b-instruct-v0.1-hf | 79.57 |

View File

@ -0,0 +1,64 @@
# WinoGrande
## Base Models
| model | winogrande |
|:------------------------:|-------------:|
| llama-7b-turbomind | 71.19 |
| llama-13b-turbomind | 76.16 |
| llama-30b-turbomind | 80.66 |
| llama-65b-turbomind | 82.16 |
| llama-2-7b-turbomind | 74.03 |
| llama-2-13b-turbomind | 76.48 |
| llama-2-70b-turbomind | 83.98 |
| llama-3-8b-turbomind | 77.82 |
| llama-3-70b-turbomind | 83.43 |
| internlm2-1.8b-turbomind | 66.77 |
| internlm2-7b-turbomind | 83.50 |
| internlm2-20b-turbomind | 84.69 |
| qwen-1.8b-turbomind | 61.25 |
| qwen-7b-turbomind | 72.06 |
| qwen-14b-turbomind | 72.45 |
| qwen-72b-turbomind | 82.56 |
| qwen1.5-0.5b-hf | 57.38 |
| qwen1.5-1.8b-hf | 60.46 |
| qwen1.5-4b-hf | 65.90 |
| qwen1.5-7b-hf | 70.01 |
| qwen1.5-14b-hf | 72.93 |
| qwen1.5-32b-hf | 78.69 |
| qwen1.5-72b-hf | 80.74 |
| qwen1.5-moe-a2-7b-hf | 71.43 |
| mistral-7b-v0.1-hf | 78.30 |
| mistral-7b-v0.2-hf | 77.51 |
| mixtral-8x7b-v0.1-hf | 81.53 |
| mixtral-8x22b-v0.1-hf | 86.50 |
| yi-6b-hf | 74.35 |
| yi-34b-hf | 79.01 |
| deepseek-7b-base-hf | 74.11 |
| deepseek-67b-base-hf | 79.32 |
## Chat Models
| model | winogrande |
|:-----------------------------:|-------------:|
| qwen1.5-0.5b-chat-hf | 50.51 |
| qwen1.5-1.8b-chat-hf | 51.07 |
| qwen1.5-4b-chat-hf | 57.54 |
| qwen1.5-7b-chat-hf | 65.27 |
| qwen1.5-14b-chat-hf | 70.09 |
| qwen1.5-32b-chat-hf | 77.90 |
| qwen1.5-72b-chat-hf | 80.82 |
| qwen1.5-110b-chat-hf | 82.32 |
| internlm2-chat-1.8b-hf | 57.62 |
| internlm2-chat-1.8b-sft-hf | 57.93 |
| internlm2-chat-7b-hf | 73.56 |
| internlm2-chat-7b-sft-hf | 73.80 |
| internlm2-chat-20b-hf | 81.06 |
| internlm2-chat-20b-sft-hf | 81.37 |
| llama-3-8b-instruct-hf | 66.22 |
| llama-3-70b-instruct-hf | 81.29 |
| llama-3-8b-instruct-lmdeploy | 66.93 |
| llama-3-70b-instruct-lmdeploy | 81.22 |
| mistral-7b-instruct-v0.1-hf | 58.56 |
| mistral-7b-instruct-v0.2-hf | 59.43 |
| mixtral-8x7b-instruct-v0.1-hf | 65.75 |

View File

@ -1,7 +1,7 @@
from mmengine.config import read_base
with read_base():
from .datasets.CHARM.charm_rea_gen_f8fca2 import charm_rea_datasets as datasets
from .datasets.CHARM.charm_reason_gen_f8fca2 import charm_reason_datasets as datasets
from .models.hf_internlm.lmdeploy_internlm2_chat_7b import models as lmdeploy_7b_chat_model
# from models.openai.gpt_3_5_turbo_1106 import models as gpt_3_5_turbo_1106_model
@ -34,19 +34,19 @@ work_dir = './outputs/CHARM/chat/'
# dataset version metric mode internlm2-chat-7b-turbomind
# ------------------------------------------------------------- --------- ------------- ------ -----------------------------
# charm-rea-Direct - naive_average gen 49.51
# charm-rea-ZH-CoT - naive_average gen 61.33
# charm-rea-EN-CoT - naive_average gen 54.55
# charm-rea-XLT - naive_average gen 58.46
# charm-rea-Translate-EN - naive_average gen 56.15
# charm-reason-Direct - naive_average gen 49.51
# charm-reason-ZH-CoT - naive_average gen 61.33
# charm-reason-EN-CoT - naive_average gen 54.55
# charm-reason-XLT - naive_average gen 58.46
# charm-reason-Translate-EN - naive_average gen 56.15
# - - - -
# charm-rea-Chinese_Direct - naive_average gen 47.14
# charm-rea-Chinese_ZH-CoT - naive_average gen 58.40
# charm-rea-Chinese_EN-CoT - naive_average gen 48.31
# charm-rea-Chinese_XLT - naive_average gen 53.57
# charm-rea-Chinese_Translate-EN - naive_average gen 48.21
# charm-rea-Global_Direct - naive_average gen 51.88
# charm-rea-Global_ZH-CoT - naive_average gen 64.26
# charm-rea-Global_EN-CoT - naive_average gen 60.79
# charm-rea-Global_XLT - naive_average gen 63.36
# charm-rea-Global_Translate-EN - naive_average gen 64.10
# charm-reason-Chinese_Direct - naive_average gen 47.14
# charm-reason-Chinese_ZH-CoT - naive_average gen 58.40
# charm-reason-Chinese_EN-CoT - naive_average gen 48.31
# charm-reason-Chinese_XLT - naive_average gen 53.57
# charm-reason-Chinese_Translate-EN - naive_average gen 48.21
# charm-reason-Global_Direct - naive_average gen 51.88
# charm-reason-Global_ZH-CoT - naive_average gen 64.26
# charm-reason-Global_EN-CoT - naive_average gen 60.79
# charm-reason-Global_XLT - naive_average gen 63.36
# charm-reason-Global_Translate-EN - naive_average gen 64.10

View File

@ -4,7 +4,6 @@ with read_base():
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner

View File

@ -5,7 +5,7 @@ with read_base():
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.models.openai_api import OpenAI, OpenAIAllesAPIN
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner

View File

@ -4,7 +4,7 @@ with read_base():
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.models.openai_api import OpenAI, OpenAIAllesAPIN
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner

View File

@ -6,7 +6,6 @@ with read_base():
from .datasets.subjective.compassarena.compassarena_compare import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner

View File

@ -4,7 +4,6 @@ with read_base():
from .datasets.subjective.creationbench.creationbench_judgeby_gpt4_withref import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner

View File

@ -3,7 +3,7 @@ from mmengine.config import read_base
with read_base():
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner

View File

@ -4,7 +4,6 @@ with read_base():
from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner

View File

@ -0,0 +1,124 @@
# InternLM Model Details
## InternLM
InternLM is pre-trained on a large corpus of 1.6T tokens through a multi-phase progressive process, and then fine-tuned to align with human preferences. We also developed a training system called Uniscale-LLM for efficient large language model training. Evaluation on a number of benchmarks shows that InternLM achieves state-of-the-art performance in multiple aspects, including knowledge understanding, reading comprehension, mathematics, and coding. With such well-rounded capabilities, InternLM achieves outstanding performance on comprehensive exams, including MMLU, AGIEval, C-Eval and GAOKAO-Bench, without resorting to external tools. On these benchmarks, InternLM not only significantly outperforms open-source models, but also obtains superior performance compared to ChatGPT. InternLM also demonstrates an excellent capability for understanding the Chinese language and Chinese culture, which makes it a suitable foundation model for Chinese-oriented language applications.
## InternLM2
The evolution of Large Language Models (LLMs) like ChatGPT and GPT-4 has sparked discussions on the advent of Artificial General Intelligence (AGI). However, replicating such advancements in open-source models has been challenging. This paper introduces InternLM2, an open-source LLM that outperforms its predecessors in comprehensive evaluations across 6 dimensions and 30 benchmarks, long-context modeling, and open-ended subjective evaluations through innovative pre-training and optimization techniques. The pre-training process of InternLM2 is meticulously detailed, highlighting the preparation of diverse data types including text, code, and long-context data. InternLM2 efficiently captures long-term dependencies, initially trained on 4k tokens before advancing to 32k tokens in pre-training and fine-tuning stages, exhibiting remarkable performance on the 200k "Needle-in-a-Haystack" test. InternLM2 is further aligned using Supervised Fine-Tuning (SFT) and a novel Conditional Online Reinforcement Learning from Human Feedback (COOL RLHF) strategy that addresses conflicting human preferences and reward hacking. By releasing InternLM2 models in different training stages and model sizes, we provide the community with insights into the model's evolution.
# Evaluation Command
## Base Models
```bash
python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug
python3 run.py --models hf_internlm2_7b --datasets cmmlu_ppl_041cbf --debug
python3 run.py --models hf_internlm2_7b --datasets ceval_internal_ppl_93e5ce --debug
python3 run.py --models hf_internlm2_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug
python3 run.py --models hf_internlm2_7b --datasets triviaqa_wiki_1shot_gen_20a989 --debug
python3 run.py --models hf_internlm2_7b --datasets nq_open_1shot_gen_20a989 --debug
python3 run.py --models hf_internlm2_7b --datasets race_ppl_abed12 --debug
python3 run.py --models hf_internlm2_7b --datasets winogrande_5shot_ll_252f01 --debug
python3 run.py --models hf_internlm2_7b --datasets hellaswag_10shot_ppl_59c85e --debug
python3 run.py --models hf_internlm2_7b --datasets bbh_gen_98fba6 --debug
python3 run.py --models hf_internlm2_7b --datasets gsm8k_gen_17d0dc --debug
python3 run.py --models hf_internlm2_7b --datasets math_4shot_base_gen_db136b --debug
python3 run.py --models hf_internlm2_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
python3 run.py --models hf_internlm2_7b --datasets humaneval_gen_d2537e --debug
python3 run.py --models hf_internlm2_7b --datasets sanitized_mbpp_gen_742f0c --debug
python3 run.py --models hf_internlm2_7b --datasets lcbench_gen_5ff288 --debug
python3 run.py --models hf_internlm2_7b --datasets gpqa_ppl_6bf57a --debug
```
## Chat Models
```bash
python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug
python3 run.py --models hf_internlm2_chat_7b --datasets cmmlu_gen_c13365 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets ceval_internal_gen_2daf24 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets GaokaoBench_no_subjective_gen_4c31db --debug
python3 run.py --models hf_internlm2_chat_7b --datasets triviaqa_wiki_1shot_gen_eaf81e --debug
python3 run.py --models hf_internlm2_chat_7b --datasets nq_open_1shot_gen_01cf41 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets race_gen_69ee4f --debug
python3 run.py --models hf_internlm2_chat_7b --datasets winogrande_5shot_gen_b36770 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets hellaswag_10shot_gen_e42710 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets bbh_gen_5b92b0 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets gsm8k_gen_1d7fe4 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets math_0shot_gen_393424 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug
python3 run.py --models hf_internlm2_chat_7b --datasets sanitized_mbpp_mdblock_gen_a447ff --debug
python3 run.py --models hf_internlm2_chat_7b --datasets lcbench_gen_5ff288 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets gpqa_gen_4baadb --debug
python3 run.py --models hf_internlm2_chat_7b --datasets IFEval_gen_3321a3 --debug
```
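The commands above evaluate one dataset per invocation. Alternatively, several datasets can be grouped into a single OpenCompass config script and evaluated in one run. The sketch below follows the usual config layout, but the file name, module paths, and exported variable names are assumptions that should be verified against your checkout.
```python
# Hypothetical grouped config, e.g. configs/eval_internlm2_7b.py (the name is illustrative).
from mmengine.config import read_base

with read_base():
    # dataset/model module paths follow the usual OpenCompass layout (assumed here)
    from .datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
    from .datasets.gsm8k.gsm8k_gen_17d0dc import gsm8k_datasets
    from .models.hf_internlm.hf_internlm2_7b import models

# evaluate both benchmarks for the imported model in a single job
datasets = [*mmlu_datasets, *gsm8k_datasets]
```
Such a script would then be launched with `python3 run.py configs/eval_internlm2_7b.py --debug`.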
# Benchmarks
We provide reference results for the classical models; you can reproduce these results by following the aforementioned commands.
## Base Models
| dataset | internlm2-1.8b-turbomind | internlm2-7b-turbomind | internlm2-20b-turbomind |
|:------------:|---------------------------:|-------------------------:|--------------------------:|
| mmlu | 45.99 | 65.84 | 67.58 |
| cmmlu | 45.27 | 66.17 | 68.28 |
| ceval-test | 44.79 | 63.54 | 67.28 |
| GaokaoBench | 23.78 | 41.41 | 58.99 |
| triviaqa | 38.42 | 69.15 | 74.03 |
| nq | 20.66 | 41.05 | 43.55 |
| race-high | 64.72 | 72.56 | 72.90 |
| winogrande | 66.77 | 83.50 | 84.69 |
| hellaswag | 44.86 | 89.52 | 91.41 |
| bbh | 36.03 | 63.56 | 71.29 |
| gsm8k | 30.40 | 69.98 | 76.80 |
| math | 9.42 | 25.16 | 32.24 |
| TheoremQA | 10.50 | 21.88 | 26.00 |
| humaneval | 30.49 | 48.17 | 51.83 |
| mbpp | 30.74 | 54.47 | 59.92 |
| lcbench | 4.34 | 12.16 | 18.46 |
| GPQA_diamond | 24.24 | 28.28 | 31.31 |
## Chat Models
| dataset | internlm2-chat-1.8b-hf | internlm2-chat-1.8b-sft-hf | internlm2-chat-7b-hf | internlm2-chat-7b-sft-hf | internlm2-chat-20b-hf | internlm2-chat-20b-sft-hf |
|:------------:|-------------------------:|-----------------------------:|-----------------------:|---------------------------:|------------------------:|----------------------------:|
| mmlu | 47.58 | 47.44 | 63.05 | 63.33 | 67.37 | 67.34 |
| cmmlu | 46.11 | 46.27 | 62.10 | 62.38 | 66.26 | 66.39 |
| ceval-test | 47.04 | 47.19 | 58.75 | 58.96 | 63.12 | 63.16 |
| GaokaoBench | 29.73 | 28.79 | 54.54 | 55.39 | 57.95 | 57.62 |
| triviaqa | 46.69 | 46.50 | 69.54 | 70.75 | 75.53 | 75.90 |
| nq | 19.09 | 18.14 | 28.73 | 30.78 | 28.75 | 34.10 |
| race-high | 73.87 | 73.81 | 84.51 | 84.88 | 88.02 | 88.11 |
| winogrande | 57.62 | 57.93 | 73.56 | 73.80 | 81.06 | 81.37 |
| hellaswag | 60.47 | 61.58 | 84.80 | 85.21 | 88.48 | 88.95 |
| bbh | 37.69 | 37.12 | 57.83 | 57.19 | 68.24 | 69.38 |
| gsm8k | 39.73 | 36.85 | 69.90 | 69.83 | 75.21 | 76.95 |
| math | 14.06 | 13.10 | 28.08 | 27.60 | 34.68 | 32.54 |
| TheoremQA | 13.63 | 12.88 | 18.50 | 18.75 | 23.00 | 25.12 |
| humaneval | 33.54 | 34.15 | 56.71 | 61.59 | 67.68 | 67.68 |
| mbpp | 39.69 | 36.19 | 57.59 | 55.64 | 68.87 | 69.65 |
| lcbench | 4.52 | 3.56 | 14.60 | 14.34 | 19.64 | 20.55 |
| GPQA_diamond | 25.76 | 26.26 | 28.28 | 27.27 | 30.30 | 29.29 |
| IFEval | 18.30 | 18.67 | 34.75 | 39.19 | 36.41 | 44.55 |
# Citation
```BibTeX
@misc{2023internlm,
title={InternLM: A Multilingual Language Model with Progressively Enhanced Capabilities},
author={InternLM Team},
howpublished = {\url{https://github.com/InternLM/InternLM-techreport}},
year={2023}
}
@misc{cai2024internlm2,
title={InternLM2 Technical Report},
author={Zheng Cai and Maosong Cao and Haojiong Chen and Kai Chen and Keyu Chen and Xin Chen and Xun Chen and Zehui Chen and Zhi Chen and Pei Chu and Xiaoyi Dong and Haodong Duan and Qi Fan and Zhaoye Fei and Yang Gao and Jiaye Ge and Chenya Gu and Yuzhe Gu and Tao Gui and Aijia Guo and Qipeng Guo and Conghui He and Yingfan Hu and Ting Huang and Tao Jiang and Penglong Jiao and Zhenjiang Jin and Zhikai Lei and Jiaxing Li and Jingwen Li and Linyang Li and Shuaibin Li and Wei Li and Yining Li and Hongwei Liu and Jiangning Liu and Jiawei Hong and Kaiwen Liu and Kuikun Liu and Xiaoran Liu and Chengqi Lv and Haijun Lv and Kai Lv and Li Ma and Runyuan Ma and Zerun Ma and Wenchang Ning and Linke Ouyang and Jiantao Qiu and Yuan Qu and Fukai Shang and Yunfan Shao and Demin Song and Zifan Song and Zhihao Sui and Peng Sun and Yu Sun and Huanze Tang and Bin Wang and Guoteng Wang and Jiaqi Wang and Jiayu Wang and Rui Wang and Yudong Wang and Ziyi Wang and Xingjian Wei and Qizhen Weng and Fan Wu and Yingtong Xiong and Chao Xu and Ruiliang Xu and Hang Yan and Yirong Yan and Xiaogui Yang and Haochen Ye and Huaiyuan Ying and Jia Yu and Jing Yu and Yuhang Zang and Chuyu Zhang and Li Zhang and Pan Zhang and Peng Zhang and Ruijie Zhang and Shuo Zhang and Songyang Zhang and Wenjian Zhang and Wenwei Zhang and Xingcheng Zhang and Xinyue Zhang and Hui Zhao and Qian Zhao and Xiaomeng Zhao and Fengzhe Zhou and Zaida Zhou and Jingming Zhuo and Yicheng Zou and Xipeng Qiu and Yu Qiao and Dahua Lin},
year={2024},
eprint={2403.17297},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```

View File

@ -0,0 +1,23 @@
from opencompass.models import TurboMindModelwithChatTemplate
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='internlm2-chat-1.8b-turbomind',
path='internlm/internlm2-chat-1_8b',
engine_config=dict(
max_batch_size=16,
tp=1,
),
gen_config=dict(
top_k=1,
temperature=1e-6,
top_p=0.9,
),
max_seq_len=2048,
max_out_len=1024,
batch_size=32768,
run_cfg=dict(num_gpus=1),
stop_words=['</s>', '<|im_end|>'],
)
]

View File

@ -3,7 +3,9 @@ from opencompass.models import TurboMindModel
settings = [
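    # each entry: (model abbreviation, HuggingFace path, GPU count; assumed to be the tensor-parallel size)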
('internlm2-1.8b-turbomind', 'internlm/internlm2-1_8b', 1),
('internlm2-7b-turbomind', 'internlm/internlm2-7b', 1),
('internlm2-base-7b-turbomind', 'internlm/internlm2-base-7b', 1),
('internlm2-20b-turbomind', 'internlm/internlm2-20b', 2),
('internlm2-base-20b-turbomind', 'internlm/internlm2-base-20b', 2),
]
models = []

View File

@ -7,6 +7,6 @@ models = [
path='meta-llama/Llama-2-13b-hf',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
run_cfg=dict(num_gpus=2),
)
]

View File

@ -7,6 +7,6 @@ models = [
path='meta-llama/Llama-2-13b-chat-hf',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
run_cfg=dict(num_gpus=2),
)
]

View File

@ -7,6 +7,6 @@ models = [
path='meta-llama/Llama-2-70b-hf',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
run_cfg=dict(num_gpus=8),
)
]

View File

@ -7,6 +7,6 @@ models = [
path='meta-llama/Meta-Llama-3-70B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
run_cfg=dict(num_gpus=8),
)
]

View File

@ -7,6 +7,6 @@ models = [
path='huggyllama/llama-13b',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
run_cfg=dict(num_gpus=2),
)
]

View File

@ -7,6 +7,6 @@ models = [
path='huggyllama/llama-30b',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=2),
run_cfg=dict(num_gpus=4),
)
]

View File

@ -7,6 +7,6 @@ models = [
path='huggyllama/llama-65b',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
run_cfg=dict(num_gpus=8),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='mistral-7b-instruct-v0.3-hf',
path='mistralai/Mistral-7B-Instruct-v0.3',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,13 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='mistral-7b-v0.3-hf',
path='mistralai/Mistral-7B-v0.3',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,142 @@
# Qwen Model Details
## Qwen
Large language models (LLMs) have revolutionized the field of artificial intelligence, enabling natural language processing tasks that were previously thought to be exclusive to humans. In this work, we introduce Qwen, the first installment of our large language model series. Qwen is a comprehensive language model series that encompasses distinct models with varying parameter counts. It includes Qwen, the base pretrained language models, and Qwen-Chat, the chat models finetuned with human alignment techniques. The base language models consistently demonstrate superior performance across a multitude of downstream tasks, and the chat models, particularly those trained using Reinforcement Learning from Human Feedback (RLHF), are highly competitive. The chat models possess advanced tool-use and planning capabilities for creating agent applications, showcasing impressive performance even when compared to bigger models on complex tasks like utilizing a code interpreter. Furthermore, we have developed coding-specialized models, Code-Qwen and Code-Qwen-Chat, as well as mathematics-focused models, Math-Qwen-Chat, which are built upon base language models. These models demonstrate significantly improved performance in comparison with open-source models, and slightly fall behind the proprietary models.
## Qwen1.5
Qwen1.5 is the beta version of Qwen2, a transformer-based decoder-only language model pretrained on a large amount of data. In comparison with the previous released Qwen, the improvements include:
- 8 model sizes, including 0.5B, 1.8B, 4B, 7B, 14B, 32B and 72B dense models, and an MoE model of 14B with 2.7B activated;
- Significant performance improvement in human preference for chat models;
- Multilingual support of both base and chat models;
- Stable support of 32K context length for models of all sizes;
- No need for trust_remote_code (see the loading sketch below).
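To illustrate the last point, a Qwen1.5 checkpoint loads with the stock transformers classes and no trust_remote_code flag. This is only a minimal sketch: the checkpoint name is an example, and it assumes transformers (plus accelerate for device_map) is installed.
```python
# Sketch: load a Qwen1.5 checkpoint without trust_remote_code.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen1.5-7B"  # example checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

inputs = tokenizer("The capital of France is", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```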
# Evaluation Command
## Base Models
```bash
python3 run.py --models hf_qwen1_5_7b --datasets mmlu_ppl_ac766d --debug
python3 run.py --models hf_qwen1_5_7b --datasets cmmlu_ppl_041cbf --debug
python3 run.py --models hf_qwen1_5_7b --datasets ceval_internal_ppl_93e5ce --debug
python3 run.py --models hf_qwen1_5_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug
python3 run.py --models hf_qwen1_5_7b --datasets triviaqa_wiki_1shot_gen_20a989 --debug
python3 run.py --models hf_qwen1_5_7b --datasets nq_open_1shot_gen_20a989 --debug
python3 run.py --models hf_qwen1_5_7b --datasets race_ppl_abed12 --debug
python3 run.py --models hf_qwen1_5_7b --datasets winogrande_5shot_ll_252f01 --debug
python3 run.py --models hf_qwen1_5_7b --datasets hellaswag_10shot_ppl_59c85e --debug
python3 run.py --models hf_qwen1_5_7b --datasets bbh_gen_98fba6 --debug
python3 run.py --models hf_qwen1_5_7b --datasets gsm8k_gen_17d0dc --debug
python3 run.py --models hf_qwen1_5_7b --datasets math_4shot_base_gen_db136b --debug
python3 run.py --models hf_qwen1_5_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
python3 run.py --models hf_qwen1_5_7b --datasets humaneval_gen_d2537e --debug
python3 run.py --models hf_qwen1_5_7b --datasets sanitized_mbpp_gen_742f0c --debug
python3 run.py --models hf_qwen1_5_7b --datasets lcbench_gen_5ff288 --debug
python3 run.py --models hf_qwen1_5_7b --datasets gpqa_ppl_6bf57a --debug
```
## Chat Models
```bash
python3 run.py --models hf_qwen1_5_7b_chat --datasets mmlu_gen_4d595a --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets cmmlu_gen_c13365 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets ceval_internal_gen_2daf24 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets GaokaoBench_no_subjective_gen_4c31db --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets triviaqa_wiki_1shot_gen_eaf81e --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets nq_open_1shot_gen_01cf41 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets race_gen_69ee4f --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets winogrande_5shot_gen_b36770 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets hellaswag_10shot_gen_e42710 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets bbh_gen_5b92b0 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets gsm8k_gen_1d7fe4 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets math_0shot_gen_393424 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets TheoremQA_5shot_gen_6f0af8 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets humaneval_gen_8e312c --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets sanitized_mbpp_mdblock_gen_a447ff --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets lcbench_gen_5ff288 --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets gpqa_gen_4baadb --debug
python3 run.py --models hf_qwen1_5_7b_chat --datasets IFEval_gen_3321a3 --debug
```
# Benchmarks
We provide reference results for the classical models; you can reproduce these results by following the aforementioned commands.
## Base Models
| dataset | qwen-1.8b-turbomind | qwen-7b-turbomind | qwen-14b-turbomind | qwen-72b-turbomind |
|:------------:|----------------------:|--------------------:|---------------------:|---------------------:|
| mmlu | 46.61 | 59.75 | 67.85 | 77.36 |
| cmmlu | 51.98 | 62.10 | 70.05 | 83.32 |
| ceval-test | 54.24 | 62.06 | 70.33 | 83.25 |
| GaokaoBench | 22.11 | 35.32 | 54.07 | 77.56 |
| triviaqa | 22.76 | 53.61 | 49.72 | 79.13 |
| nq | 5.68 | 17.87 | 13.77 | 18.20 |
| race-high | 63.09 | 80.30 | 88.11 | 90.62 |
| winogrande | 61.25 | 72.06 | 72.45 | 82.56 |
| hellaswag | 38.04 | 64.62 | 85.88 | 90.40 |
| bbh | 22.53 | 45.89 | 56.75 | 63.35 |
| gsm8k | 23.73 | 54.36 | 61.64 | 79.68 |
| math | 6.30 | 15.56 | 30.38 | 44.18 |
| TheoremQA | 9.38 | 15.00 | 21.62 | 27.12 |
| humaneval | 16.46 | 23.78 | 23.78 | 66.46 |
| mbpp | 2.72 | 46.69 | 55.64 | 65.76 |
| lcbench | 1.82 | 4.95 | 8.86 | 16.86 |
| GPQA_diamond | 28.79 | 24.75 | 27.78 | 31.31 |
| dataset | qwen1.5-0.5b-hf | qwen1.5-1.8b-hf | qwen1.5-4b-hf | qwen1.5-7b-hf | qwen1.5-14b-hf | qwen1.5-32b-hf | qwen1.5-72b-hf |
|:------------:|------------------:|------------------:|----------------:|----------------:|-----------------:|-----------------:|-----------------:|
| mmlu | 39.98 | 47.14 | 57.03 | 62.15 | 69.10 | 73.88 | 77.02 |
| cmmlu | 46.05 | 57.45 | 66.38 | 71.86 | 76.95 | 81.58 | 83.00 |
| ceval-test | 48.36 | 58.67 | 66.55 | 72.49 | 76.93 | 82.50 | 83.03 |
| GaokaoBench | 30.67 | 35.66 | 54.31 | 65.99 | 66.60 | 79.01 | 80.26 |
| triviaqa | 21.24 | 34.32 | 44.59 | 56.60 | 59.96 | 56.20 | 77.81 |
| nq | 6.01 | 10.28 | 15.73 | 18.61 | 16.07 | 21.75 | 20.53 |
| race-high | 54.66 | 67.27 | 78.50 | 82.73 | 87.99 | 90.57 | 90.45 |
| winogrande | 57.38 | 60.46 | 65.90 | 70.01 | 72.93 | 78.69 | 80.74 |
| hellaswag | 29.19 | 42.32 | 55.89 | 68.51 | 83.86 | 87.28 | 90.41 |
| bbh | 20.54 | 27.01 | 34.81 | 39.87 | 50.38 | 67.47 | 58.81 |
| gsm8k | 13.27 | 34.87 | 47.61 | 54.36 | 63.53 | 72.71 | 79.53 |
| math | 4.16 | 11.32 | 17.50 | 17.34 | 36.18 | 45.74 | 41.56 |
| TheoremQA | 5.88 | 12.00 | 13.75 | 4.25 | 12.62 | 26.62 | 26.62 |
| humaneval | 8.54 | 23.17 | 41.46 | 53.05 | 57.32 | 70.12 | 65.85 |
| mbpp | 5.06 | 15.95 | 45.91 | 52.14 | 52.14 | 59.14 | 61.09 |
| lcbench | 0.87 | 2.00 | 5.65 | 6.69 | 12.69 | 14.34 | 15.29 |
| GPQA_diamond | 23.74 | 28.79 | 23.23 | 20.71 | 32.32 | 30.81 | 31.82 |
## Chat Models
| dataset | qwen1.5-0.5b-chat-hf | qwen1.5-1.8b-chat-hf | qwen1.5-4b-chat-hf | qwen1.5-7b-chat-hf | qwen1.5-14b-chat-hf | qwen1.5-32b-chat-hf | qwen1.5-72b-chat-hf | qwen1.5-110b-chat-hf |
|:------------:|-----------------------:|-----------------------:|---------------------:|---------------------:|----------------------:|----------------------:|----------------------:|-----------------------:|
| mmlu | 35.32 | 45.62 | 55.90 | 61.79 | 67.96 | 75.36 | 77.24 | 77.95 |
| cmmlu | 31.55 | 48.93 | 58.53 | 68.78 | 75.07 | 80.39 | 82.48 | 86.46 |
| ceval-test | 36.88 | 55.17 | 61.54 | 68.71 | 74.80 | 80.47 | 81.53 | 87.33 |
| GaokaoBench | 21.51 | 46.19 | 59.11 | 70.55 | 80.39 | 86.15 | 88.58 | 89.59 |
| triviaqa | 19.84 | 35.81 | 48.93 | 53.65 | 62.58 | 74.72 | 83.25 | 86.20 |
| nq | 7.42 | 10.22 | 19.31 | 16.87 | 20.53 | 25.26 | 35.21 | 36.98 |
| race-high | 49.03 | 66.24 | 73.53 | 83.28 | 87.51 | 91.22 | 91.11 | 92.31 |
| winogrande | 50.51 | 51.07 | 57.54 | 65.27 | 70.09 | 77.90 | 80.82 | 82.32 |
| hellaswag | 29.60 | 41.71 | 60.45 | 71.58 | 79.70 | 88.56 | 89.37 | 91.11 |
| bbh | 24.12 | 26.82 | 43.15 | 38.12 | 55.38 | 69.28 | 72.97 | 71.04 |
| gsm8k | 8.79 | 27.60 | 47.61 | 56.25 | 64.90 | 79.91 | 77.03 | 79.53 |
| math | 0.56 | 4.94 | 7.34 | 22.14 | 32.22 | 41.80 | 45.22 | 54.38 |
| TheoremQA | 9.00 | 9.25 | 13.88 | 12.25 | 13.63 | 19.25 | 22.75 | 17.50 |
| humaneval | 9.15 | 15.85 | 30.49 | 40.85 | 50.00 | 57.93 | 60.37 | 65.24 |
| mbpp | 11.28 | 22.57 | 43.58 | 50.58 | 56.03 | 65.37 | 66.93 | 68.48 |
| lcbench | 0.00 | 1.65 | 5.56 | 8.78 | 14.42 | 10.78 | 18.77 | 34.58 |
| GPQA_diamond | 19.70 | 29.80 | 25.25 | 31.82 | 30.30 | 31.31 | 32.83 | 35.86 |
| IFEval | 13.12 | 16.08 | 25.51 | 38.82 | 42.51 | 49.54 | 51.02 | 55.08 |
# Citation
```BibTeX
@article{qwen,
title={Qwen Technical Report},
author={Jinze Bai and Shuai Bai and Yunfei Chu and Zeyu Cui and Kai Dang and Xiaodong Deng and Yang Fan and Wenbin Ge and Yu Han and Fei Huang and Binyuan Hui and Luo Ji and Mei Li and Junyang Lin and Runji Lin and Dayiheng Liu and Gao Liu and Chengqiang Lu and Keming Lu and Jianxin Ma and Rui Men and Xingzhang Ren and Xuancheng Ren and Chuanqi Tan and Sinan Tan and Jianhong Tu and Peng Wang and Shijie Wang and Wei Wang and Shengguang Wu and Benfeng Xu and Jin Xu and An Yang and Hao Yang and Jian Yang and Shusheng Yang and Yang Yao and Bowen Yu and Hongyi Yuan and Zheng Yuan and Jianwei Zhang and Xingxuan Zhang and Yichang Zhang and Zhenru Zhang and Chang Zhou and Jingren Zhou and Xiaohuan Zhou and Tianhang Zhu},
journal={arXiv preprint arXiv:2309.16609},
year={2023}
}
```

View File

@ -1,25 +1,12 @@
from opencompass.models import HuggingFaceCausalLM
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceCausalLM,
abbr='qwen1.5-moe-a2-7b-hf',
type=HuggingFaceBaseModel,
abbr='qwen1.5-moe-a2.7b-hf',
path='Qwen/Qwen1.5-MoE-A2.7B',
tokenizer_path='Qwen/Qwen1.5-MoE-A2.7B',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
pad_token_id=151645,
max_out_len=100,
max_seq_len=2048,
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
run_cfg=dict(num_gpus=1),
)
]

View File

@ -1,33 +1,12 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
],
)
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFaceCausalLM,
abbr='qwen1.5-moe-a2-7b-chat-hf',
type=HuggingFacewithChatTemplate,
abbr='qwen1.5-moe-a2.7b-chat-hf',
path='Qwen/Qwen1.5-MoE-A2.7B-Chat',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4, num_procs=1),
end_str='<|im_end|>',
batch_padding=True,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='qwen2-0.5b-hf',
path='Qwen/Qwen2-0.5B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='qwen2-1.5b-hf',
path='Qwen/Qwen2-1.5B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='qwen2-72b-hf',
path='Qwen/Qwen2-72B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=8),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='qwen2-7b-hf',
path='Qwen/Qwen2-7B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='qwen2-moe-57b-a14b-hf',
path='Qwen/Qwen2-MoE-57B-A14B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
)
]

View File

@ -7,6 +7,6 @@ models = [
path='Qwen/Qwen-14B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
run_cfg=dict(num_gpus=2),
)
]

View File

@ -1,12 +1,31 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
],
)
models = [
dict(
type=HuggingFacewithChatTemplate,
type=HuggingFaceCausalLM,
abbr='qwen-14b-chat-hf',
path='Qwen/Qwen-14B-Chat',
model_kwargs=dict(device_map='auto', trust_remote_code=True),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
pad_token_id=151643,
max_out_len=1024,
max_seq_len=8192,
batch_size=8,
run_cfg=dict(num_gpus=1),
batch_padding=True,
meta_template=_meta_template,
run_cfg=dict(num_gpus=2),
end_str='<|im_end|>',
)
]

View File

@ -1,12 +1,31 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
],
)
models = [
dict(
type=HuggingFacewithChatTemplate,
type=HuggingFaceCausalLM,
abbr='qwen-1.8b-chat-hf',
path='Qwen/Qwen-1_8B-Chat',
model_kwargs=dict(device_map='auto', trust_remote_code=True),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
pad_token_id=151643,
max_out_len=1024,
max_seq_len=8192,
batch_size=8,
batch_padding=True,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1),
end_str='<|im_end|>',
)
]

View File

@ -7,6 +7,6 @@ models = [
path='Qwen/Qwen-72B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
run_cfg=dict(num_gpus=8),
)
]

View File

@ -1,12 +1,31 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
],
)
models = [
dict(
type=HuggingFacewithChatTemplate,
type=HuggingFaceCausalLM,
abbr='qwen-72b-chat-hf',
path='Qwen/Qwen-72B-Chat',
model_kwargs=dict(device_map='auto', trust_remote_code=True),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
pad_token_id=151643,
max_out_len=1024,
max_seq_len=8192,
batch_size=8,
run_cfg=dict(num_gpus=4),
batch_padding=True,
meta_template=_meta_template,
run_cfg=dict(num_gpus=8),
end_str='<|im_end|>',
)
]

View File

@ -1,12 +1,31 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
],
)
models = [
dict(
type=HuggingFacewithChatTemplate,
type=HuggingFaceCausalLM,
abbr='qwen-7b-chat-hf',
path='Qwen/Qwen-7B-Chat',
model_kwargs=dict(device_map='auto', trust_remote_code=True),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
pad_token_id=151643,
max_out_len=1024,
max_seq_len=8192,
batch_size=8,
batch_padding=True,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1),
end_str='<|im_end|>',
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='yi-1.5-34b-hf',
path='01-ai/Yi-1.5-34B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=2),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='yi-1.5-6b-hf',
path='01-ai/Yi-1.5-6B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='yi-1.5-9b-hf',
path='01-ai/Yi-1.5-9B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@ -1,5 +1,5 @@
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
@ -44,7 +44,7 @@ models = [
judge_model = dict(
abbr='GPT4-Turbo',
type=OpenAIAllesAPIN, path='gpt-4-1106-preview',
type=OpenAI, path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
url='',
meta_template=api_meta_template,
@ -70,7 +70,7 @@ runner=dict(type=LocalRunner, max_num_workers=12, task=dict(type=SubjectiveEvalT
gpt4 = dict(
abbr='gpt4-turbo',
type=OpenAIAllesAPIN,
type=OpenAI,
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,

View File

@ -1,98 +0,0 @@
from mmengine.config import read_base
with read_base():
from .groups.charm_rea import charm_rea_summary_groups
summarizer = dict(
dataset_abbrs=[
'charm-rea-Direct',
'charm-rea-ZH-CoT',
'charm-rea-EN-CoT',
'charm-rea-XLT',
'charm-rea-Translate-EN',
'',
'charm-rea-Chinese_Direct',
'charm-rea-Chinese_ZH-CoT',
'charm-rea-Chinese_EN-CoT',
'charm-rea-Chinese_XLT',
'charm-rea-Chinese_Translate-EN',
'charm-rea-Global_Direct',
'charm-rea-Global_ZH-CoT',
'charm-rea-Global_EN-CoT',
'charm-rea-Global_XLT',
'charm-rea-Global_Translate-EN',
'',
'charm-rea-Chinese_Anachronisms_Judgment_Direct',
'charm-rea-Chinese_Movie_and_Music_Recommendation_Direct',
'charm-rea-Chinese_Natural_Language_Inference_Direct',
'charm-rea-Chinese_Reading_Comprehension_Direct',
'charm-rea-Chinese_Sequence_Understanding_Direct',
'charm-rea-Chinese_Sport_Understanding_Direct',
'charm-rea-Chinese_Time_Understanding_Direct',
'charm-rea-Global_Anachronisms_Judgment_Direct',
'charm-rea-Global_Movie_and_Music_Recommendation_Direct',
'charm-rea-Global_Natural_Language_Inference_Direct',
'charm-rea-Global_Reading_Comprehension_Direct',
'charm-rea-Global_Sequence_Understanding_Direct',
'charm-rea-Global_Sport_Understanding_Direct',
'charm-rea-Global_Time_Understanding_Direct',
'charm-rea-Chinese_Anachronisms_Judgment_ZH-CoT',
'charm-rea-Chinese_Movie_and_Music_Recommendation_ZH-CoT',
'charm-rea-Chinese_Natural_Language_Inference_ZH-CoT',
'charm-rea-Chinese_Reading_Comprehension_ZH-CoT',
'charm-rea-Chinese_Sequence_Understanding_ZH-CoT',
'charm-rea-Chinese_Sport_Understanding_ZH-CoT',
'charm-rea-Chinese_Time_Understanding_ZH-CoT',
'charm-rea-Global_Anachronisms_Judgment_ZH-CoT',
'charm-rea-Global_Movie_and_Music_Recommendation_ZH-CoT',
'charm-rea-Global_Natural_Language_Inference_ZH-CoT',
'charm-rea-Global_Reading_Comprehension_ZH-CoT',
'charm-rea-Global_Sequence_Understanding_ZH-CoT',
'charm-rea-Global_Sport_Understanding_ZH-CoT',
'charm-rea-Global_Time_Understanding_ZH-CoT',
'charm-rea-Chinese_Anachronisms_Judgment_EN-CoT',
'charm-rea-Chinese_Movie_and_Music_Recommendation_EN-CoT',
'charm-rea-Chinese_Natural_Language_Inference_EN-CoT',
'charm-rea-Chinese_Reading_Comprehension_EN-CoT',
'charm-rea-Chinese_Sequence_Understanding_EN-CoT',
'charm-rea-Chinese_Sport_Understanding_EN-CoT',
'charm-rea-Chinese_Time_Understanding_EN-CoT',
'charm-rea-Global_Anachronisms_Judgment_EN-CoT',
'charm-rea-Global_Movie_and_Music_Recommendation_EN-CoT',
'charm-rea-Global_Natural_Language_Inference_EN-CoT',
'charm-rea-Global_Reading_Comprehension_EN-CoT',
'charm-rea-Global_Sequence_Understanding_EN-CoT',
'charm-rea-Global_Sport_Understanding_EN-CoT',
'charm-rea-Global_Time_Understanding_EN-CoT',
'charm-rea-Chinese_Anachronisms_Judgment_XLT',
'charm-rea-Chinese_Movie_and_Music_Recommendation_XLT',
'charm-rea-Chinese_Natural_Language_Inference_XLT',
'charm-rea-Chinese_Reading_Comprehension_XLT',
'charm-rea-Chinese_Sequence_Understanding_XLT',
'charm-rea-Chinese_Sport_Understanding_XLT',
'charm-rea-Chinese_Time_Understanding_XLT',
'charm-rea-Global_Anachronisms_Judgment_XLT',
'charm-rea-Global_Movie_and_Music_Recommendation_XLT',
'charm-rea-Global_Natural_Language_Inference_XLT',
'charm-rea-Global_Reading_Comprehension_XLT',
'charm-rea-Global_Sequence_Understanding_XLT',
'charm-rea-Global_Sport_Understanding_XLT',
'charm-rea-Global_Time_Understanding_XLT',
'charm-rea-Chinese_Anachronisms_Judgment_Translate-EN',
'charm-rea-Chinese_Movie_and_Music_Recommendation_Translate-EN',
'charm-rea-Chinese_Natural_Language_Inference_Translate-EN',
'charm-rea-Chinese_Reading_Comprehension_Translate-EN',
'charm-rea-Chinese_Sequence_Understanding_Translate-EN',
'charm-rea-Chinese_Sport_Understanding_Translate-EN',
'charm-rea-Chinese_Time_Understanding_Translate-EN',
'charm-rea-Global_Anachronisms_Judgment_Translate-EN',
'charm-rea-Global_Movie_and_Music_Recommendation_Translate-EN',
'charm-rea-Global_Natural_Language_Inference_Translate-EN',
'charm-rea-Global_Reading_Comprehension_Translate-EN',
'charm-rea-Global_Sequence_Understanding_Translate-EN',
'charm-rea-Global_Sport_Understanding_Translate-EN',
'charm-rea-Global_Time_Understanding_Translate-EN',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
)

View File

@ -0,0 +1,98 @@
from mmengine.config import read_base
with read_base():
from .groups.charm_reason import charm_reason_summary_groups
summarizer = dict(
dataset_abbrs=[
'charm-reason-Direct',
'charm-reason-ZH-CoT',
'charm-reason-EN-CoT',
'charm-reason-XLT',
'charm-reason-Translate-EN',
'',
'charm-reason-Chinese_Direct',
'charm-reason-Chinese_ZH-CoT',
'charm-reason-Chinese_EN-CoT',
'charm-reason-Chinese_XLT',
'charm-reason-Chinese_Translate-EN',
'charm-reason-Global_Direct',
'charm-reason-Global_ZH-CoT',
'charm-reason-Global_EN-CoT',
'charm-reason-Global_XLT',
'charm-reason-Global_Translate-EN',
'',
'charm-reason-Chinese_Anachronisms_Judgment_Direct',
'charm-reason-Chinese_Movie_and_Music_Recommendation_Direct',
'charm-reason-Chinese_Natural_Language_Inference_Direct',
'charm-reason-Chinese_Reading_Comprehension_Direct',
'charm-reason-Chinese_Sequence_Understanding_Direct',
'charm-reason-Chinese_Sport_Understanding_Direct',
'charm-reason-Chinese_Time_Understanding_Direct',
'charm-reason-Global_Anachronisms_Judgment_Direct',
'charm-reason-Global_Movie_and_Music_Recommendation_Direct',
'charm-reason-Global_Natural_Language_Inference_Direct',
'charm-reason-Global_Reading_Comprehension_Direct',
'charm-reason-Global_Sequence_Understanding_Direct',
'charm-reason-Global_Sport_Understanding_Direct',
'charm-reason-Global_Time_Understanding_Direct',
'charm-reason-Chinese_Anachronisms_Judgment_ZH-CoT',
'charm-reason-Chinese_Movie_and_Music_Recommendation_ZH-CoT',
'charm-reason-Chinese_Natural_Language_Inference_ZH-CoT',
'charm-reason-Chinese_Reading_Comprehension_ZH-CoT',
'charm-reason-Chinese_Sequence_Understanding_ZH-CoT',
'charm-reason-Chinese_Sport_Understanding_ZH-CoT',
'charm-reason-Chinese_Time_Understanding_ZH-CoT',
'charm-reason-Global_Anachronisms_Judgment_ZH-CoT',
'charm-reason-Global_Movie_and_Music_Recommendation_ZH-CoT',
'charm-reason-Global_Natural_Language_Inference_ZH-CoT',
'charm-reason-Global_Reading_Comprehension_ZH-CoT',
'charm-reason-Global_Sequence_Understanding_ZH-CoT',
'charm-reason-Global_Sport_Understanding_ZH-CoT',
'charm-reason-Global_Time_Understanding_ZH-CoT',
'charm-reason-Chinese_Anachronisms_Judgment_EN-CoT',
'charm-reason-Chinese_Movie_and_Music_Recommendation_EN-CoT',
'charm-reason-Chinese_Natural_Language_Inference_EN-CoT',
'charm-reason-Chinese_Reading_Comprehension_EN-CoT',
'charm-reason-Chinese_Sequence_Understanding_EN-CoT',
'charm-reason-Chinese_Sport_Understanding_EN-CoT',
'charm-reason-Chinese_Time_Understanding_EN-CoT',
'charm-reason-Global_Anachronisms_Judgment_EN-CoT',
'charm-reason-Global_Movie_and_Music_Recommendation_EN-CoT',
'charm-reason-Global_Natural_Language_Inference_EN-CoT',
'charm-reason-Global_Reading_Comprehension_EN-CoT',
'charm-reason-Global_Sequence_Understanding_EN-CoT',
'charm-reason-Global_Sport_Understanding_EN-CoT',
'charm-reason-Global_Time_Understanding_EN-CoT',
'charm-reason-Chinese_Anachronisms_Judgment_XLT',
'charm-reason-Chinese_Movie_and_Music_Recommendation_XLT',
'charm-reason-Chinese_Natural_Language_Inference_XLT',
'charm-reason-Chinese_Reading_Comprehension_XLT',
'charm-reason-Chinese_Sequence_Understanding_XLT',
'charm-reason-Chinese_Sport_Understanding_XLT',
'charm-reason-Chinese_Time_Understanding_XLT',
'charm-reason-Global_Anachronisms_Judgment_XLT',
'charm-reason-Global_Movie_and_Music_Recommendation_XLT',
'charm-reason-Global_Natural_Language_Inference_XLT',
'charm-reason-Global_Reading_Comprehension_XLT',
'charm-reason-Global_Sequence_Understanding_XLT',
'charm-reason-Global_Sport_Understanding_XLT',
'charm-reason-Global_Time_Understanding_XLT',
'charm-reason-Chinese_Anachronisms_Judgment_Translate-EN',
'charm-reason-Chinese_Movie_and_Music_Recommendation_Translate-EN',
'charm-reason-Chinese_Natural_Language_Inference_Translate-EN',
'charm-reason-Chinese_Reading_Comprehension_Translate-EN',
'charm-reason-Chinese_Sequence_Understanding_Translate-EN',
'charm-reason-Chinese_Sport_Understanding_Translate-EN',
'charm-reason-Chinese_Time_Understanding_Translate-EN',
'charm-reason-Global_Anachronisms_Judgment_Translate-EN',
'charm-reason-Global_Movie_and_Music_Recommendation_Translate-EN',
'charm-reason-Global_Natural_Language_Inference_Translate-EN',
'charm-reason-Global_Reading_Comprehension_Translate-EN',
'charm-reason-Global_Sequence_Understanding_Translate-EN',
'charm-reason-Global_Sport_Understanding_Translate-EN',
'charm-reason-Global_Time_Understanding_Translate-EN',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], [])
)

View File

@ -122,10 +122,10 @@ IFEval_dataset_abbrs = [
['IFEval', 'Inst-level-loose-accuracy'],
]
summarizer = dict(
type=MultiFacetedSummarizer,
dataset_abbrs_list=[
{'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs},
{'name': 'mmlu', 'dataset_abbrs': mmlu_dataset_abbrs},
{'name': 'cmmlu', 'dataset_abbrs': cmmlu_dataset_abbrs},
{'name': 'ceval', 'dataset_abbrs': ceval_dataset_abbrs},
@ -143,7 +143,6 @@ summarizer = dict(
{'name': 'humaneval', 'dataset_abbrs': [['openai_humaneval', 'humaneval_pass@1']]},
{'name': 'GPQA', 'dataset_abbrs': [['GPQA_diamond', 'accuracy']]},
{'name': 'IFEval', 'dataset_abbrs': IFEval_dataset_abbrs},
{'name': 'overall', 'dataset_abbrs': overall_dataset_abbrs},
],
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)

View File

@ -1,30 +0,0 @@
charm_tasks = [
'Anachronisms_Judgment',
'Movie_and_Music_Recommendation',
'Natural_Language_Inference',
'Reading_Comprehension',
'Sequence_Understanding',
'Sport_Understanding',
'Time_Understanding',
]
regions = [
'Chinese',
'Global',
]
prompts = [
'Direct',
'ZH-CoT',
'EN-CoT',
'XLT',
'Translate-EN',
]
charm_rea_summary_groups = []
for prompt in prompts:
for region in regions:
subsets = ['charm-rea-' + region + '_' + task + '_' + prompt for task in charm_tasks]
charm_rea_summary_groups.append({'name': 'charm-rea-' + region + '_' + prompt, 'subsets': subsets})
for prompt in prompts:
subsets = ['charm-rea-' + region + '_' + prompt for region in regions]
charm_rea_summary_groups.append({'name': 'charm-rea-' + prompt, 'subsets': subsets})

View File

@ -0,0 +1,35 @@
charm_tasks = [
'Anachronisms_Judgment',
'Movie_and_Music_Recommendation',
'Natural_Language_Inference',
'Reading_Comprehension',
'Sequence_Understanding',
'Sport_Understanding',
'Time_Understanding',
]
regions = [
'Chinese',
'Global',
]
prompts = [
'Direct',
'ZH-CoT',
'EN-CoT',
'XLT',
'Translate-EN',
]
charm_reason_summary_groups = []
for prompt in prompts:
for region in regions:
subsets = ['charm-reason-' + region + '_' + task + '_' + prompt for task in charm_tasks]
charm_reason_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets})
for prompt in prompts:
subsets = ['charm-reason-' + region + '_' + prompt for region in regions]
charm_reason_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets})
charm_reason_summary_groups.append(
{'name': 'charm-reason-CoT', 'subsets': ['charm-reason-ZH-CoT', 'charm-reason-EN-CoT']}
)
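As a quick sanity check, the two loops above should produce entries like the following (a sketch derived from the lists in this file, not output copied from an actual run):

# One of the per-region groups generated by the first loop; the name follows
# the 'charm-reason-<region>_<prompt>' scheme and its subsets enumerate the tasks.
expected = {
    'name': 'charm-reason-Chinese_ZH-CoT',
    'subsets': [
        'charm-reason-Chinese_' + task + '_ZH-CoT'
        for task in charm_tasks
    ],
}
assert expected in charm_reason_summary_groups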

View File

@ -0,0 +1,26 @@
mathbench_2024_wocircular_summary_groups = [
{'name': 'college', 'subsets': ['college-single_choice_cn', 'college-single_choice_en']},
{'name': 'high', 'subsets': ['high-single_choice_cn', 'high-single_choice_en']},
{'name': 'middle', 'subsets': ['middle-single_choice_cn', 'middle-single_choice_en']},
{'name': 'primary', 'subsets': ['primary-cloze_cn', 'primary-cloze_en']},
{'name': 'cn', 'subsets': ['college-single_choice_cn', 'high-single_choice_cn', 'middle-single_choice_cn', 'primary-cloze_cn']},
{'name': 'en', 'subsets': ['college-single_choice_en', 'high-single_choice_en', 'middle-single_choice_en', 'primary-cloze_en']},
{'name': 'a', 'subsets': ['college', 'high', 'middle', 'primary', 'arithmetic-cloze_en']},
{'name': 'college_knowledge', 'subsets': ['college_knowledge-single_choice_cn', 'college_knowledge-single_choice_en']},
{'name': 'high_knowledge', 'subsets': ['high_knowledge-single_choice_cn', 'high_knowledge-single_choice_en']},
{'name': 'middle_knowledge', 'subsets': ['middle_knowledge-single_choice_cn', 'middle_knowledge-single_choice_en']},
{'name': 'primary_knowledge', 'subsets': ['primary_knowledge-single_choice_cn', 'primary_knowledge-single_choice_en']},
{'name': 'knowledge-cn', 'subsets': ['college_knowledge-single_choice_cn', 'high_knowledge-single_choice_cn', 'middle_knowledge-single_choice_cn', 'primary_knowledge-single_choice_cn']},
{'name': 'knowledge-en', 'subsets': ['college_knowledge-single_choice_en', 'high_knowledge-single_choice_en', 'middle_knowledge-single_choice_en', 'primary_knowledge-single_choice_en']},
{'name': 't', 'subsets': ['college_knowledge', 'high_knowledge', 'middle_knowledge', 'primary_knowledge']},
{'name': 'overall', 'subsets': ['a', 't']},
]
for g in mathbench_2024_wocircular_summary_groups:
g['name'] = 'mathbench-wocircular-' + g['name']
g['subsets'] = ['mathbench-wocircular-' + s for s in g['subsets']]
mathbench_2024_summary_groups = mathbench_2024_wocircular_summary_groups
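For reference, the renaming loop above mutates the groups in place; a minimal sketch of the first entry after the loop (inferred from this file, not from a run):

college_group = mathbench_2024_summary_groups[0]
assert college_group['name'] == 'mathbench-wocircular-college'
assert college_group['subsets'] == [
    'mathbench-wocircular-college-single_choice_cn',
    'mathbench-wocircular-college-single_choice_en',
]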

View File

@ -1 +1 @@
__version__ = '0.2.4'
__version__ = '0.2.5'

View File

@ -11,8 +11,8 @@ from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
from .base import BaseDataset
@TEXT_POSTPROCESSORS.register_module('charm-rea')
def charm_rea_postprocess(text: str) -> str:
@TEXT_POSTPROCESSORS.register_module('charm-reason')
def charm_reason_postprocess(text: str) -> str:
ans = text
ans_line = ans.split('answer is ')
if len(ans_line) != 1:
@ -27,14 +27,11 @@ def charm_rea_postprocess(text: str) -> str:
@ICL_EVALUATORS.register_module()
class CharmReaEvaluator(BaseEvaluator):
class CharmReasonEvaluator(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
return {'error': 'preds and refrs have different length'}
details = []
cnt = 0
for pred, ref in zip(predictions, references):
@ -43,9 +40,7 @@ class CharmReaEvaluator(BaseEvaluator):
cnt += 1
detail['correct'] = True
details.append(detail)
score = cnt / len(predictions) * 100
return {'score': score, 'details': details}
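For context, a minimal usage sketch of the renamed evaluator (class and method names are from this diff; the per-sample matching rule inside the loop is elided above, so plain string equality is assumed here):

from opencompass.datasets import CharmReasonEvaluator

evaluator = CharmReasonEvaluator()
# Mismatched lengths short-circuit with an error dict instead of raising.
print(evaluator.score(predictions=['A'], references=['A', 'B']))
# Otherwise the result carries an accuracy-style 'score' in [0, 100]
# plus per-sample 'details'.
print(evaluator.score(predictions=['A', 'B'], references=['A', 'C']))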

View File

@ -208,7 +208,8 @@ class MBPPEvaluator(BaseEvaluator):
assert self.metric in ['MBPP', 'MBPPPlus']
def score(self, predictions, references):
assert len(predictions) == len(references)
if len(predictions) != len(references):
return {'error': 'preds and refrs have different length'}
if self.metric == 'MBPP':
result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
@ -286,6 +287,8 @@ class MBPPEvaluator(BaseEvaluator):
r'BEGIN\s*(.*)\s*DONE',
r'```python\s*(.*)\s*```',
r'```\s*(.*)\s*```',
r'```python\s*(.*)\s*$',
r'```\s*(.*)\s*$',
r'(.*)\s*```.*',
r"\[BEGIN\]\s*'(.*)",
r'\[BEGIN\](.*)',
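The two added patterns cover completions whose closing fence was truncated by the output-length limit; a standalone illustration of the difference (the surrounding extraction loop and its regex flags are not shown in this hunk, so re.DOTALL is assumed):

import re

truncated = ('Here is my solution:\n'
             '```python\n'
             'def similar_elements(a, b):\n'
             '    return tuple(set(a) & set(b))\n')
# The fenced pattern above needs a closing ``` and therefore finds nothing here,
assert re.search(r'```python\s*(.*)\s*```', truncated, re.DOTALL) is None
# while the newly added pattern anchors on the end of the string instead.
code = re.search(r'```python\s*(.*)\s*$', truncated, re.DOTALL).group(1)
print(code)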

View File

@ -18,7 +18,7 @@ from .huggingface_above_v4_33 import HuggingFacewithChatTemplate # noqa: F401
from .hunyuan_api import Hunyuan # noqa: F401
from .intern_model import InternLM # noqa: F401
from .krgpt_api import KrGPT # noqa: F401
from .lightllm_api import LightllmAPI # noqa: F401
from .lightllm_api import LightllmAPI, LightllmChatAPI # noqa: F401
from .llama2 import Llama2, Llama2Chat # noqa: F401
from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401
from .lmdeploy_tis import LmdeployTisModel # noqa: F401

View File

@ -84,7 +84,6 @@ def _convert_chat_messages(inputs, merge_role=True):
messages = merged_messages
outputs.append(messages)
print(messages)
return outputs

View File

@ -1,16 +1,20 @@
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Union
import numpy as np
import requests
from opencompass.registry import MODELS
from opencompass.utils.logging import get_logger
from opencompass.utils.prompt import PromptList
from .base import BaseModel
from .base_api import TokenBucket
from .base_api import BaseAPIModel, TokenBucket
PromptType = Union[PromptList, str]
@MODELS.register_module()
@ -189,3 +193,158 @@ class LightllmAPI(BaseModel):
chinese_count = sum(len(part) for part in chinese_parts)
return english_count + chinese_count
class LightllmChatAPI(BaseAPIModel):
"""Model wrapper around YiAPI.
Documentation:
Args:
path (str): The name of YiAPI model.
e.g. `moonshot-v1-32k`
key (str): Authorization key.
query_per_second (int): The maximum queries allowed per second
between two consecutive calls of the API. Defaults to 1.
max_seq_len (int): Unused here.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
retry (int): Number of retires if the API call fails. Defaults to 2.
"""
def __init__(
self,
path: str,
url: str,
query_per_second: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
retry: int = 2,
):
super().__init__(path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
meta_template=meta_template,
retry=retry)
self.url = url
self.model = path
def generate(
self,
inputs: List[PromptType],
max_out_len: int = 512,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[PromptType]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs)))
self.flush()
return results
def _generate(
self,
input: PromptType,
max_out_len: int = 512,
) -> str:
"""Generate results given an input.
Args:
inputs (PromptType): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
msg_buffer, last_role = [], None
for item in input:
item['role'] = 'assistant' if item['role'] == 'BOT' else 'user'
if item['role'] != last_role and last_role is not None:
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
msg_buffer = []
msg_buffer.append(item['prompt'])
last_role = item['role']
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
data = {'messages': messages}
max_num_retries = 0
while max_num_retries < self.retry:
self.acquire()
try:
raw_response = requests.request('POST',
url=self.url,
json=data)
except Exception as err:
print('Request Error:{}'.format(err))
time.sleep(2)
continue
try:
response = raw_response.json()
except Exception as err:
print('Response Error:{}'.format(err))
response = None
self.release()
if response is None:
print('Connection error, reconnect.')
# if a connection error occurs, frequent requests will cause
# continued network instability, so wait here
# to slow down the request rate
self.wait()
continue
if raw_response.status_code == 200:
# msg = json.load(response.text)
# response
msg = response['choices'][0]['message']['content']
self.logger.debug(f'Generated: {msg}')
return msg
if raw_response.status_code == 401:
print('Request rejected: invalid api_key')
continue
elif raw_response.status_code == 400:
print(messages, response)
print('Request failed, status code:', raw_response)
msg = 'The request was rejected because of high-risk content'
return msg
elif raw_response.status_code == 429:
print(messages, response)
print('Request failed, status code:', raw_response)
time.sleep(5)
continue
else:
print(messages, response)
print('Request failed, status code:', raw_response)
time.sleep(1)
max_num_retries += 1
raise RuntimeError(raw_response)
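A hedged usage sketch for the new chat wrapper; the endpoint URL and model name below are placeholders for a locally deployed Lightllm service, not values taken from this commit:

from opencompass.models import LightllmChatAPI

api = LightllmChatAPI(
    path='my-chat-model',                             # illustrative served-model name
    url='http://127.0.0.1:8080/v1/chat/completions',  # placeholder endpoint
    query_per_second=2,
    max_seq_len=2048,
    retry=2,
)
# generate() fans the prompts out over a thread pool and returns one string each.
print(api.generate(['Briefly introduce OpenCompass.'], max_out_len=64))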

View File

@ -50,6 +50,9 @@ class OpenAI(BaseAPIModel):
temperature (float, optional): What sampling temperature to use.
If not None, will override the temperature in the `generate()`
call. Defaults to None.
tokenizer_path (str, optional): The path to the tokenizer. Use path if
'tokenizer_path' is None, otherwise use the 'tokenizer_path'.
Defaults to None.
"""
is_api: bool = True
@ -67,7 +70,8 @@ class OpenAI(BaseAPIModel):
mode: str = 'none',
logprobs: Optional[bool] = False,
top_logprobs: Optional[int] = None,
temperature: Optional[float] = None):
temperature: Optional[float] = None,
tokenizer_path: Optional[str] = None):
super().__init__(path=path,
max_seq_len=max_seq_len,
@ -82,6 +86,7 @@ class OpenAI(BaseAPIModel):
self.mode = mode
self.logprobs = logprobs
self.top_logprobs = top_logprobs
self.tokenizer_path = tokenizer_path
if isinstance(key, str):
if key == 'ENV':
@ -287,7 +292,8 @@ class OpenAI(BaseAPIModel):
Returns:
int: Length of the input tokens
"""
enc = self.tiktoken.encoding_for_model(self.path)
enc = self.tiktoken.encoding_for_model(self.tokenizer_path
or self.path)
return len(enc.encode(prompt))
def bin_trim(self, prompt: str, num_token: int) -> str:
@ -333,140 +339,3 @@ class OpenAI(BaseAPIModel):
elif self.mode == 'rear':
prompt = sep.join(words[:l])
return prompt
class OpenAIAllesAPIN(OpenAI):
"""Model wrapper around OpenAI-AllesAPIN.
Args:
path (str): The name of OpenAI's model.
url (str): URL to AllesAPIN.
key (str): AllesAPIN key.
query_per_second (int): The maximum queries allowed per second
between two consecutive calls of the API. Defaults to 1.
max_seq_len (int): Unused here.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
retry (int): Number of retries if the API call fails. Defaults to 2.
"""
is_api: bool = True
def __init__(self,
path: str,
url: str,
key: str,
temperature: float = 1.0,
query_per_second: int = 1,
rpm_verbose: bool = False,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
retry: int = 2):
super().__init__(path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
rpm_verbose=rpm_verbose,
meta_template=meta_template,
retry=retry)
self.url = url
self.temperature = temperature
self.headers = {
'alles-apin-token': key,
'content-type': 'application/json',
}
def _generate(self, input: PromptType, max_out_len: int,
temperature: float) -> str:
"""Generate results given an input.
Args:
inputs (PromptType): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use,
between 0 and 2. Higher values like 0.8 will make the output
more random, while lower values like 0.2 will make it more
focused and deterministic.
Returns:
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
for item in input:
msg = {'content': item['prompt']}
if item['role'] == 'HUMAN':
msg['role'] = 'user'
elif item['role'] == 'BOT':
msg['role'] = 'assistant'
elif item['role'] == 'SYSTEM':
msg['role'] = 'system'
messages.append(msg)
# the model may need to respond to both user and system roles
# when an agent is involved.
assert msg['role'] in ['user', 'system']
data = {
'model': self.path,
'messages': messages,
'temperature': temperature
}
for _ in range(self.retry):
self.wait()
try:
raw_response = requests.post(self.url,
headers=self.headers,
data=json.dumps(data))
except requests.ConnectionError:
self.logger.error('Request error, got',
str(raw_response.content))
time.sleep(1)
continue
try:
response = raw_response.json()
except requests.JSONDecodeError:
self.logger.error('JsonDecode error, got',
str(raw_response.content))
time.sleep(1)
continue
if raw_response.status_code == 200 and response[
'msgCode'] == '10000':
data = response['data']
choices = data['choices']
if choices is None:
self.logger.error(data)
else:
return choices[0]['message']['content'].strip()
try:
match = re.match(r'Error code: \d+ - (.*)', response['data'])
err = eval(match.group(1))['error']
if err['code'] == 'content_filter' and err['status'] == 400:
return err['message']
except Exception:
pass
self.logger.error(response['msg'])
self.logger.error(response)
time.sleep(1)
raise RuntimeError('API call failed.')
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized string. Only English and Chinese
characters are counted for now. Users are encouraged to override this
method if more accurate length is needed.
Args:
prompt (str): Input string.
Returns:
int: Length of the input tokens
"""
enc = self.tiktoken.encoding_for_model(self.path)
return len(enc.encode(prompt))
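A hedged sketch of the new tokenizer_path option added earlier in this file: when the deployment name is not one tiktoken recognizes, token counting can fall back to a known tokenizer (the path value below is a placeholder, not from this commit):

from opencompass.models import OpenAI

model = OpenAI(
    path='my-gpt4-proxy',      # illustrative deployment name unknown to tiktoken
    key='ENV',                 # resolve the key from the environment, as above
    tokenizer_path='gpt-4',    # used for get_token_len-based length estimates
    max_seq_len=4096,
)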

View File

@ -145,6 +145,8 @@ class PPLInferencerOutputHandler:
def save_prompt_and_ppl(self, label, input, prompt, ppl, idx):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
if 'origin_prompt' not in self.results_dict[str(idx)]:
self.results_dict[str(idx)]['origin_prompt'] = input
if 'label: ' + str(label) not in self.results_dict[str(idx)].keys():
self.results_dict[str(idx)]['label: ' + str(label)] = {}
self.results_dict[str(idx)]['label: ' +

View File

@ -161,6 +161,9 @@ class DLCRunner(BaseRunner):
shell_cmd += 'umask 0000; '
shell_cmd += '{task_cmd}'
# set priority to 1 as default
task_priority = self.aliyun_cfg.get('priority', 1)
tmpl = (
'dlc submit pytorchjob'
f" --command '{shell_cmd}'"
@ -168,6 +171,7 @@ class DLCRunner(BaseRunner):
f" --config {self.aliyun_cfg['dlc_config_path']}"
f" --workspace_id {self.aliyun_cfg['workspace_id']}"
f" --resource_id {self.aliyun_cfg['resource_id']}"
f' --priority {task_priority}'
' --workers 1'
f' --worker_cpu {max(num_gpus * 8, 12)}'
f' --worker_gpu {num_gpus}'
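For reference, a hedged fragment showing where the new priority knob lives in a DLC runner config; the ids and paths are placeholders, and only the 'priority' key is new in this commit:

aliyun_cfg = dict(
    dlc_config_path='/path/to/dlc_config',   # placeholder
    workspace_id='ws-xxxxxxxx',               # placeholder
    resource_id='rs-xxxxxxxx',                # placeholder
    priority=1,  # forwarded to 'dlc submit ... --priority'; 1 is also the fallback above
)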

View File

@ -226,7 +226,7 @@ class DefaultSummarizer:
return raw_results, parsed_results, dataset_metrics, dataset_eval_mode
def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=None):
def _format_table(self, parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=None, skip_all_slash=False):
dataset_abbrs = [dataset_abbr_from_cfg(dataset) for dataset in self.dataset_cfgs]
prompt_version = {dataset_abbr_from_cfg(d): get_prompt_hash(d)[:6] for d in self.dataset_cfgs}
@ -257,14 +257,16 @@ class DefaultSummarizer:
table.append(header)
for dataset_abbr, metric in summarizer_dataset_abbrs:
if dataset_abbr not in dataset_metrics:
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
if not skip_all_slash:
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
continue
if metric is None:
metric = dataset_metrics[dataset_abbr][0]
elif metric in dataset_metrics[dataset_abbr]:
pass
else:
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
if not skip_all_slash:
table.append([dataset_abbr, '-', '-', '-'] + ['-'] * len(self.model_abbrs))
continue
row = [dataset_abbr, prompt_version.get(dataset_abbr, '-'), metric, dataset_eval_mode.get(dataset_abbr, '-')]

View File

@ -1,11 +1,9 @@
# flake8: noqa
# yapf: disable
import functools
import getpass
import math
import json
import os
from datetime import datetime
from typing import Any, Dict, List, Optional
from typing import Dict, List, Optional
import tabulate
from mmengine import ConfigDict
@ -33,7 +31,9 @@ class MultiFacetedSummarizer(DefaultSummarizer):
profile_dataset_abbrs = dataset_abbrs_item['dataset_abbrs']
# format table
table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=profile_dataset_abbrs)
table = self._format_table(parsed_results, dataset_metrics, dataset_eval_mode, required_dataset_abbrs=profile_dataset_abbrs, skip_all_slash=True)
if len(table) == 1:
continue
# output to screen
print(tabulate.tabulate(table, headers='firstrow', floatfmt='.2f'))

View File

@ -214,6 +214,8 @@ class OpenICLEvalTask(BaseTask):
preds['references'] = (test_set[self.output_column]
if self.output_column else None)
preds['test_set'] = test_set
if 'origin_prompt' not in preds:
preds['origin_prompt'] = [None for _ in range(len(pred_strs))]
preds = {
k: preds[k]
for k in signature(icl_evaluator.score).parameters

View File

@ -258,7 +258,7 @@ def change_accelerator(models, accelerator):
path=model['path'],
model_kwargs=dict(tensor_parallel_size=model['run_cfg']['num_gpus']),
max_out_len=model['max_out_len'],
batch_size=32768,
batch_size=16,
run_cfg=model['run_cfg'],
stop_words=model.get('stop_words', []),
)
@ -272,7 +272,7 @@ def change_accelerator(models, accelerator):
gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9),
max_seq_len=model.get('max_seq_len', 2048),
max_out_len=model['max_out_len'],
batch_size=32768,
batch_size=16,
run_cfg=model['run_cfg'],
stop_words=model.get('stop_words', []),
)

View File

@ -63,15 +63,15 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
# yapf: disable
# flake8: noqa: W605
patterns = [
f'答案是?\s?([{options}])',
f'答案是?\s?：([{options}])',
f'答案是?\s?:([{options}])',
f'答案应该?是\s?([{options}])',
f'答案应该?选\s?([{options}])',
f'答案为\s?([{options}])',
f'答案选\s?([{options}])',
f'选择?\s?([{options}])',
f'故选?\s?([{options}])'
f'答案是?\s*([{options}])',
f'答案是?\s*：\s*([{options}])',
f'答案是?\s*:\s*([{options}])',
f'答案应该?是\s*([{options}])',
f'答案应该?选\s*([{options}])',
f'答案为\s*([{options}])',
f'答案选\s*([{options}])',
f'选择?\s*([{options}])',
f'故选?\s*([{options}])',
f'只有选?项?\s?([{options}])\s?是?对',
f'只有选?项?\s?([{options}])\s?是?错',
f'只有选?项?\s?([{options}])\s?不?正确',
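A standalone check of why the quantifier change matters (this only exercises the regex, not the full postprocess pipeline):

import re

options = 'ABCD'
text = '综上所述,答案为  B'  # two spaces before the option letter
# The old '\s?' form tolerates at most one whitespace character and misses this,
assert re.search(f'答案为\s?([{options}])', text) is None
# while the new '\s*' form accepts any run of whitespace.
assert re.search(f'答案为\s*([{options}])', text).group(1) == 'B'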

View File

@ -22,6 +22,7 @@ OpenCC
opencv-python-headless
pandas<2.0.0
prettytable
protobuf
pyext
pypinyin
python-Levenshtein