style: pass all formatting hooks (yapf & quote fixer)

2025-05-30 16:03:24 +08:00 · 2025-04-28 08:03:47 +00:00 · 2025-04-28 08:03:47 +00:00 · e2f80574ec
commit e2f80574ec
parent 8c74e6a39e
11 changed files with 967 additions and 5 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -20,7 +20,7 @@ exclude: |
    )
 repos:
  - repo: https://github.com/PyCQA/flake8
-    rev: 5.0.4
+    rev: 7.2.0
    hooks:
      - id: flake8
        exclude: |
@ -29,7 +29,7 @@ repos:
                examples/
            )
  - repo: https://github.com/PyCQA/isort
-    rev: 5.11.5
+    rev: 6.0.1
    hooks:
      - id: isort
        exclude: |
@ -47,7 +47,7 @@ repos:
                examples/
            )
  - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.1
+    rev: v2.4.1
    hooks:
      - id: codespell
        exclude: |
@ -58,7 +58,7 @@ repos:
                examples/
            )
  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.3.0
+    rev: v5.0.0
    hooks:
      - id: trailing-whitespace
        exclude: |
@ -81,7 +81,7 @@ repos:
      - id: mixed-line-ending
        args: ["--fix=lf"]
  - repo: https://github.com/executablebooks/mdformat
-    rev: 0.7.9
+    rev: 0.7.22
    hooks:
      - id: mdformat
        args: ["--number", "--table-width", "200"]
--- a/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_gen.py
+++ b/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_gen.py
@ -0,0 +1,61 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
+from opencompass.utils.text_postprocessors import first_option_postprocess
+from opencompass.datasets import SciEvalDataset  # custom dataset class provided by opencompass.datasets
+
+# Evaluate only the biology, multiple-choice part of the test split (filtering presumably happens in SciEvalDataset — TODO confirm)
+_hint = ('Given a question and four options, please select the right answer. '
+         "Your answer should be 'A', 'B', 'C' or 'D'.")
+
+scieval_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='test',  # NOTE(review): few-shot examples (fix_id_list below) presumably come from this split, i.e. the evaluated one — confirm intended
+)
+
+scieval_infer_cfg = dict(
+    ice_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role='HUMAN',
+                prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+            ),
+            dict(role='BOT', prompt='{target}\n')
+        ]),
+    ),
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            begin='</E>',
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+                ),
+            ],
+        ),
+        ice_token='</E>',
+    ),
+    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+    inferencer=dict(type=GenInferencer),
+)
+
+scieval_eval_cfg = dict(
+    evaluator=dict(type=AccwithDetailsEvaluator),
+    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
+)
+
+scieval_datasets = [
+    dict(
+        abbr='scieval_biology',
+        type=SciEvalDataset,
+        path='OpenDFM/SciEval',
+        name='default',
+        reader_cfg=scieval_reader_cfg,
+        infer_cfg=scieval_infer_cfg,
+        eval_cfg=scieval_eval_cfg,
+    )
+]
--- a/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_llmjudge_gen.py
+++ b/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_llmjudge_gen.py
@ -0,0 +1,125 @@
+# SciEval_lifescience_llmjudge_gen.py
+
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.utils.text_postprocessors import match_answer_pattern
+from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.datasets import generic_llmjudge_postprocess
+from opencompass.datasets import SciEvalDataset
+
+with read_base():
+    from .SciEval_lifescience_sets import SciEval_lifescience_subsets
+
+QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. 
+
+{input}
+
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+""".strip()
+
+GRADER_TEMPLATE = """
+Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
+
+Here are some evaluation criteria:
+1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+
+Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+A: CORRECT 
+B: INCORRECT
+Just return the letters "A" or "B", with no text around it.
+
+Here is your task. Simply reply with either "A" (CORRECT) or "B" (INCORRECT). Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+<Original Question Begin>: {input}
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+<Original Question End>
+
+<Gold Target Begin>:
+{target}
+<Gold Target End>
+
+<Predicted Answer Begin>:
+{prediction}
+<Predicted Answer End>
+
+Judging the correctness of candidates' answers:
+""".strip()
+
+scieval_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='test',
+)
+
+scieval_datasets = []
+for name in SciEval_lifescience_subsets:  # NOTE(review): `name` only feeds `abbr` below; path/name given to SciEvalDataset are constant — confirm
+    scieval_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                round=[
+                    dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+                ]
+            )
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    scieval_eval_cfg = dict(
+        evaluator=dict(
+            type=GenericLLMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt=(
+                                'You are a helpful assistant who evaluates the correctness '
+                                "and quality of models' outputs."
+                            ),
+                        )
+                    ],
+                    round=[
+                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                    ],
+                ),
+            ),
+            dataset_cfg=dict(
+                type=SciEvalDataset,
+                path='OpenDFM/SciEval',
+                name='default',
+                reader_cfg=scieval_reader_cfg,
+            ),
+            judge_cfg=dict(),  # left empty here; presumably supplied by the run configuration — TODO confirm
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        ),
+        pred_role='BOT',
+    )
+
+    scieval_datasets.append(
+        dict(
+            abbr=f'scieval_lifescience_{name}_llmjudge',
+            type=SciEvalDataset,
+            path='OpenDFM/SciEval',
+            name='default',
+            reader_cfg=scieval_reader_cfg,
+            infer_cfg=scieval_infer_cfg,
+            eval_cfg=scieval_eval_cfg,
+            mode='singlescore',
+        )
+    )
--- a/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_sets.py
+++ b/opencompass/configs/datasets/SciEval_lifscience/SciEval_lifescience_sets.py
@ -0,0 +1,3 @@
+SciEval_lifescience_subsets = [
+    'biology',        # college-level biology
+]
--- a/opencompass/configs/datasets/mmlu_lifescience/README.md
+++ b/opencompass/configs/datasets/mmlu_lifescience/README.md
@ -0,0 +1,368 @@
+# MMLU
+
+```bash
+python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug
+python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug
+```
+
+## Base Models
+
+|          model           |   mmlu |   mmlu-stem |   mmlu-social-science |   mmlu-humanities |   mmlu-other |
+|:------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
+|    llama-7b-turbomind    |  35.66 |       31.22 |                 37.70 |             38.90 |        37.01 |
+|   llama-13b-turbomind    |  47.76 |       37.68 |                 55.36 |             52.43 |        50.83 |
+|   llama-30b-turbomind    |  58.55 |       46.95 |                 67.35 |             65.13 |        60.78 |
+|   llama-65b-turbomind    |  63.78 |       52.35 |                 73.68 |             70.84 |        64.29 |
+|   llama-2-7b-turbomind   |  46.78 |       37.81 |                 52.11 |             51.69 |        50.04 |
+|  llama-2-13b-turbomind   |  55.76 |       44.61 |                 63.86 |             62.97 |        57.35 |
+|  llama-2-70b-turbomind   |  69.87 |       58.30 |                 79.86 |             75.84 |        71.58 |
+|   llama-3-8b-turbomind   |  66.43 |       55.95 |                 76.11 |             70.29 |        68.96 |
+|  llama-3-70b-turbomind   |  79.35 |       70.66 |                 87.54 |             83.43 |        80.42 |
+| internlm2-1.8b-turbomind |  45.99 |       39.63 |                 51.02 |             48.65 |        47.96 |
+|  internlm2-7b-turbomind  |  65.84 |       56.48 |                 74.43 |             69.68 |        67.75 |
+| internlm2-20b-turbomind  |  67.58 |       59.01 |                 76.04 |             71.20 |        68.69 |
+|   qwen-1.8b-turbomind    |  46.61 |       38.91 |                 51.35 |             49.57 |        50.51 |
+|    qwen-7b-turbomind     |  59.75 |       50.16 |                 67.98 |             63.48 |        62.44 |
+|    qwen-14b-turbomind    |  67.85 |       59.13 |                 76.18 |             71.62 |        69.12 |
+|    qwen-72b-turbomind    |  77.36 |       68.70 |                 85.28 |             80.60 |        79.45 |
+|     qwen1.5-0.5b-hf      |  39.98 |       33.96 |                 45.08 |             41.59 |        42.48 |
+|     qwen1.5-1.8b-hf      |  47.14 |       39.47 |                 52.70 |             49.01 |        51.33 |
+|      qwen1.5-4b-hf       |  57.03 |       47.80 |                 64.86 |             60.10 |        60.20 |
+|      qwen1.5-7b-hf       |  62.15 |       53.22 |                 70.25 |             65.62 |        64.26 |
+|      qwen1.5-14b-hf      |  69.10 |       61.46 |                 77.57 |             71.25 |        70.29 |
+|      qwen1.5-32b-hf      |  73.88 |       65.60 |                 81.41 |             77.10 |        75.79 |
+|      qwen1.5-72b-hf      |  77.02 |       69.00 |                 84.55 |             80.60 |        78.21 |
+|   qwen1.5-moe-a2-7b-hf   |  62.09 |       53.27 |                 70.74 |             63.80 |        65.28 |
+|    mistral-7b-v0.1-hf    |  64.04 |       53.21 |                 73.65 |             68.04 |        67.00 |
+|    mistral-7b-v0.2-hf    |  63.85 |       53.21 |                 72.17 |             68.40 |        67.15 |
+|   mixtral-8x7b-v0.1-hf   |  71.80 |       61.70 |                 81.03 |             75.51 |        74.35 |
+|  mixtral-8x22b-v0.1-hf   |  77.67 |       68.94 |                 86.81 |             81.23 |        78.43 |
+|         yi-6b-hf         |  64.08 |       52.61 |                 74.10 |             68.58 |        67.11 |
+|        yi-34b-hf         |  76.26 |       66.73 |                 83.74 |             81.78 |        77.77 |
+|   deepseek-7b-base-hf    |  49.22 |       40.17 |                 56.73 |             53.46 |        51.26 |
+|   deepseek-67b-base-hf   |  71.95 |       60.57 |                 81.69 |             77.11 |        74.42 |
+
+### Details
+
+|          model           |   college_biology |   college_chemistry |   college_computer_science |   college_mathematics |   college_physics |   electrical_engineering |   astronomy |   anatomy |   abstract_algebra |   machine_learning |   clinical_knowledge |   global_facts |
+|:------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
+|    llama-7b-turbomind    |             37.50 |               30.00 |                      30.00 |                 33.00 |             23.53 |                    23.45 |       34.87 |     37.78 |              25.00 |              27.68 |                34.34 |          31.00 |
+|   llama-13b-turbomind    |             46.53 |               30.00 |                      42.00 |                 36.00 |             18.63 |                    42.76 |       46.71 |     46.67 |              30.00 |              32.14 |                45.66 |          37.00 |
+|   llama-30b-turbomind    |             59.03 |               45.00 |                      47.00 |                 35.00 |             26.47 |                    53.10 |       61.18 |     51.85 |              37.00 |              41.07 |                57.36 |          38.00 |
+|   llama-65b-turbomind    |             68.75 |               49.00 |                      47.00 |                 37.00 |             35.29 |                    55.17 |       73.03 |     57.78 |              30.00 |              48.21 |                66.04 |          38.00 |
+|   llama-2-7b-turbomind   |             46.53 |               34.00 |                      33.00 |                 34.00 |             22.55 |                    47.59 |       40.13 |     47.41 |              29.00 |              38.39 |                46.42 |          32.00 |
+|  llama-2-13b-turbomind   |             59.03 |               44.00 |                      48.00 |                 29.00 |             26.47 |                    50.34 |       53.29 |     49.63 |              35.00 |              28.57 |                60.00 |          32.00 |
+|  llama-2-70b-turbomind   |             84.72 |               51.00 |                      60.00 |                 39.00 |             37.25 |                    65.52 |       81.58 |     63.70 |              32.00 |              52.68 |                72.08 |          46.00 |
+|   llama-3-8b-turbomind   |             77.08 |               46.00 |                      51.00 |                 31.00 |             51.96 |                    62.76 |       67.11 |     68.15 |              34.00 |              52.68 |                74.72 |          35.00 |
+|  llama-3-70b-turbomind   |             93.75 |               62.00 |                      72.00 |                 52.00 |             50.98 |                    74.48 |       92.11 |     79.26 |              48.00 |              63.39 |                86.42 |          49.00 |
+| internlm2-1.8b-turbomind |             38.89 |               37.00 |                      44.00 |                 35.00 |             30.39 |                    49.66 |       50.66 |     44.44 |              25.00 |              35.71 |                51.32 |          32.00 |
+|  internlm2-7b-turbomind  |             77.08 |               48.00 |                      64.00 |                 33.00 |             47.06 |                    63.45 |       73.68 |     57.78 |              37.00 |              45.54 |                69.81 |          35.00 |
+| internlm2-20b-turbomind  |             83.33 |               51.00 |                      61.00 |                 36.00 |             45.10 |                    64.83 |       75.00 |     59.26 |              39.00 |              53.57 |                73.58 |          32.00 |
+|   qwen-1.8b-turbomind    |             42.36 |               36.00 |                      39.00 |                 34.00 |             27.45 |                    51.03 |       50.66 |     42.96 |              31.00 |              31.25 |                53.21 |          28.00 |
+|    qwen-7b-turbomind     |             67.36 |               48.00 |                      53.00 |                 28.00 |             39.22 |                    59.31 |       63.82 |     49.63 |              34.00 |              38.39 |                63.02 |          37.00 |
+|    qwen-14b-turbomind    |             78.47 |               51.00 |                      62.00 |                 42.00 |             49.02 |                    65.52 |       71.05 |     60.00 |              37.00 |              58.93 |                71.32 |          40.00 |
+|    qwen-72b-turbomind    |             93.75 |               56.00 |                      66.00 |                 56.00 |             50.98 |                    80.69 |       85.53 |     73.33 |              41.00 |              62.50 |                83.77 |          54.00 |
+|     qwen1.5-0.5b-hf      |             38.89 |               25.00 |                      38.00 |                 32.00 |             25.49 |                    45.52 |       44.74 |     33.33 |              30.00 |              39.29 |                38.11 |          39.00 |
+|     qwen1.5-1.8b-hf      |             43.75 |               34.00 |                      45.00 |                 38.00 |             28.43 |                    47.59 |       47.37 |     40.74 |              32.00 |              31.25 |                53.96 |          37.00 |
+|      qwen1.5-4b-hf       |             50.00 |               46.00 |                      41.00 |                 45.00 |             31.37 |                    53.10 |       61.18 |     51.85 |              35.00 |              44.64 |                60.38 |          37.00 |
+|      qwen1.5-7b-hf       |             66.67 |               48.00 |                      55.00 |                 37.00 |             41.18 |                    60.69 |       65.79 |     52.59 |              39.00 |              41.07 |                68.68 |          43.00 |
+|      qwen1.5-14b-hf      |             75.69 |               49.00 |                      58.00 |                 49.00 |             49.02 |                    71.72 |       73.03 |     65.93 |              39.00 |              52.68 |                73.96 |          49.00 |
+|      qwen1.5-32b-hf      |             85.42 |               53.00 |                      59.00 |                 51.00 |             53.92 |                    72.41 |       82.24 |     63.70 |              43.00 |              58.04 |                78.11 |          50.00 |
+|      qwen1.5-72b-hf      |             90.97 |               54.00 |                      65.00 |                 57.00 |             52.94 |                    80.00 |       87.50 |     73.33 |              43.00 |              64.29 |                81.89 |          50.00 |
+|   qwen1.5-moe-a2-7b-hf   |             62.50 |               44.00 |                      54.00 |                 41.00 |             49.02 |                    58.62 |       69.74 |     57.78 |              37.00 |              38.39 |                66.79 |          38.00 |
+|    mistral-7b-v0.1-hf    |             72.92 |               50.00 |                      51.00 |                 40.00 |             39.22 |                    57.93 |       65.79 |     62.96 |              29.00 |              49.11 |                69.43 |          36.00 |
+|    mistral-7b-v0.2-hf    |             71.53 |               49.00 |                      53.00 |                 40.00 |             36.27 |                    57.24 |       64.47 |     60.00 |              29.00 |              53.57 |                67.92 |          39.00 |
+|   mixtral-8x7b-v0.1-hf   |             85.42 |               54.00 |                      62.00 |                 43.00 |             46.08 |                    68.97 |       82.89 |     70.37 |              37.00 |              56.25 |                79.25 |          51.00 |
+|  mixtral-8x22b-v0.1-hf   |             89.58 |               56.00 |                      69.00 |                 48.00 |             52.94 |                    76.55 |       86.18 |     77.04 |              53.00 |              62.50 |                82.26 |          56.00 |
+|         yi-6b-hf         |             66.67 |               43.00 |                      51.00 |                 39.00 |             35.29 |                    64.83 |       65.79 |     60.00 |              29.00 |              41.96 |                66.79 |          46.00 |
+|        yi-34b-hf         |             88.89 |               52.00 |                      66.00 |                 44.00 |             48.04 |                    80.00 |       89.47 |     74.81 |              44.00 |              58.04 |                78.87 |          52.00 |
+|   deepseek-7b-base-hf    |             52.08 |               29.00 |                      44.00 |                 40.00 |             31.37 |                    44.83 |       51.97 |     40.74 |              27.00 |              32.14 |                53.58 |          31.00 |
+|   deepseek-67b-base-hf   |             84.72 |               52.00 |                      62.00 |                 42.00 |             42.16 |                    70.34 |       80.92 |     65.19 |              39.00 |              50.00 |                78.11 |          42.00 |
+
+|          model           |   management |   nutrition |   marketing |   professional_accounting |   high_school_geography |   international_law |   moral_scenarios |   computer_security |   high_school_microeconomics |   professional_law |   medical_genetics |   professional_psychology |
+|:------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
+|    llama-7b-turbomind    |        33.01 |       39.22 |       45.73 |                     26.24 |                   33.33 |               51.24 |             24.25 |               45.00 |                        31.09 |              30.05 |              37.00 |                     35.13 |
+|   llama-13b-turbomind    |        66.02 |       51.63 |       71.79 |                     34.75 |                   55.05 |               64.46 |             30.06 |               63.00 |                        47.48 |              37.22 |              53.00 |                     48.53 |
+|   llama-30b-turbomind    |        76.70 |       62.42 |       84.19 |                     44.68 |                   71.72 |               75.21 |             40.56 |               66.00 |                        57.98 |              46.48 |              66.00 |                     63.73 |
+|   llama-65b-turbomind    |        82.52 |       68.95 |       87.18 |                     48.94 |                   79.29 |               81.82 |             47.82 |               79.00 |                        68.49 |              50.07 |              68.00 |                     66.67 |
+|   llama-2-7b-turbomind   |        53.40 |       48.69 |       68.38 |                     36.52 |                   49.49 |               65.29 |             24.02 |               60.00 |                        44.12 |              36.31 |              55.00 |                     43.79 |
+|  llama-2-13b-turbomind   |        72.82 |       61.76 |       79.49 |                     39.72 |                   69.19 |               74.38 |             43.80 |               70.00 |                        58.40 |              42.50 |              54.00 |                     54.90 |
+|  llama-2-70b-turbomind   |        83.50 |       77.12 |       91.03 |                     56.03 |                   86.87 |               87.60 |             44.69 |               77.00 |                        77.31 |              52.93 |              74.00 |                     75.65 |
+|   llama-3-8b-turbomind   |        87.38 |       75.82 |       89.74 |                     48.94 |                   80.81 |               84.30 |             40.89 |               81.00 |                        73.95 |              46.22 |              77.00 |                     71.90 |
+|  llama-3-70b-turbomind   |        91.26 |       87.25 |       94.87 |                     64.18 |                   93.94 |               89.26 |             62.91 |               83.00 |                        87.82 |              61.80 |              90.00 |                     85.78 |
+| internlm2-1.8b-turbomind |        60.19 |       58.17 |       63.25 |                     31.21 |                   56.57 |               56.20 |             24.47 |               52.00 |                        50.42 |              36.11 |              53.00 |                     41.83 |
+|  internlm2-7b-turbomind  |        79.61 |       75.49 |       87.61 |                     48.23 |                   82.83 |               77.69 |             49.39 |               74.00 |                        72.27 |              47.65 |              73.00 |                     65.03 |
+| internlm2-20b-turbomind  |        79.61 |       75.49 |       91.88 |                     50.00 |                   87.88 |               85.95 |             35.08 |               81.00 |                        70.59 |              49.48 |              78.00 |                     70.10 |
+|   qwen-1.8b-turbomind    |        66.02 |       60.46 |       73.50 |                     38.30 |                   56.57 |               66.94 |             23.91 |               56.00 |                        42.02 |              33.96 |              51.00 |                     39.54 |
+|    qwen-7b-turbomind     |        78.64 |       67.32 |       83.33 |                     41.49 |                   76.77 |               76.03 |             29.72 |               73.00 |                        58.40 |              41.72 |              69.00 |                     59.64 |
+|    qwen-14b-turbomind    |        78.64 |       73.86 |       88.89 |                     48.58 |                   83.84 |               84.30 |             45.47 |               77.00 |                        73.95 |              50.85 |              74.00 |                     69.61 |
+|    qwen-72b-turbomind    |        90.29 |       84.97 |       94.87 |                     65.96 |                   92.93 |               88.43 |             65.70 |               79.00 |                        84.87 |              61.21 |              86.00 |                     82.19 |
+|     qwen1.5-0.5b-hf      |        52.43 |       46.41 |       60.68 |                     31.21 |                   46.46 |               56.20 |             25.70 |               46.00 |                        37.39 |              32.79 |              46.00 |                     37.75 |
+|     qwen1.5-1.8b-hf      |        66.02 |       58.50 |       75.64 |                     33.69 |                   56.06 |               72.73 |             24.69 |               57.00 |                        39.50 |              36.11 |              53.00 |                     42.81 |
+|      qwen1.5-4b-hf       |        74.76 |       62.75 |       84.19 |                     46.81 |                   76.77 |               71.07 |             25.03 |               67.00 |                        55.04 |              41.33 |              64.00 |                     56.05 |
+|      qwen1.5-7b-hf       |        78.64 |       70.92 |       86.32 |                     44.68 |                   81.82 |               77.69 |             32.74 |               76.00 |                        64.29 |              45.37 |              68.00 |                     61.27 |
+|      qwen1.5-14b-hf      |        80.58 |       75.49 |       85.90 |                     51.06 |                   86.36 |               80.99 |             45.03 |               80.00 |                        76.47 |              48.57 |              78.00 |                     69.61 |
+|      qwen1.5-32b-hf      |        86.41 |       81.37 |       95.30 |                     56.38 |                   91.41 |               88.43 |             44.02 |               76.00 |                        82.77 |              57.89 |              83.00 |                     75.33 |
+|      qwen1.5-72b-hf      |        87.38 |       85.29 |       94.87 |                     64.89 |                   92.42 |               90.08 |             62.12 |               83.00 |                        84.03 |              60.76 |              86.00 |                     81.05 |
+|   qwen1.5-moe-a2-7b-hf   |        78.64 |       70.92 |       86.32 |                     46.81 |                   81.82 |               77.69 |             25.59 |               71.00 |                        65.97 |              45.37 |              65.00 |                     61.44 |
+|    mistral-7b-v0.1-hf    |        82.52 |       75.49 |       87.61 |                     48.94 |                   76.77 |               77.69 |             32.51 |               77.00 |                        66.39 |              44.98 |              74.00 |                     67.97 |
+|    mistral-7b-v0.2-hf    |        81.55 |       74.18 |       88.46 |                     51.06 |                   76.77 |               80.99 |             38.77 |               75.00 |                        64.71 |              45.37 |              72.00 |                     66.34 |
+|   mixtral-8x7b-v0.1-hf   |        87.38 |       81.70 |       91.88 |                     51.77 |                   85.86 |               85.95 |             40.11 |               80.00 |                        79.41 |              53.32 |              77.00 |                     77.94 |
+|  mixtral-8x22b-v0.1-hf   |        89.32 |       85.95 |       91.88 |                     62.06 |                   91.41 |               90.08 |             64.58 |               83.00 |                        87.82 |              60.82 |              84.00 |                     83.17 |
+|         yi-6b-hf         |        80.58 |       71.57 |       91.03 |                     48.23 |                   83.33 |               76.86 |             41.34 |               75.00 |                        74.79 |              49.35 |              80.00 |                     65.69 |
+|        yi-34b-hf         |        91.26 |       85.62 |       92.31 |                     65.25 |                   89.39 |               91.74 |             64.69 |               82.00 |                        85.29 |              59.97 |              87.00 |                     82.19 |
+|   deepseek-7b-base-hf    |        61.17 |       53.59 |       72.22 |                     34.04 |                   59.09 |               65.29 |             26.37 |               61.00 |                        44.96 |              35.53 |              56.00 |                     49.18 |
+|   deepseek-67b-base-hf   |        88.35 |       79.74 |       91.88 |                     57.09 |                   89.39 |               85.12 |             46.15 |               76.00 |                        82.35 |              55.93 |              72.00 |                     79.58 |
+
+|          model           |   jurisprudence |   world_religions |   philosophy |   virology |   high_school_chemistry |   public_relations |   high_school_macroeconomics |   human_sexuality |   elementary_mathematics |   high_school_physics |   high_school_computer_science |   high_school_european_history |
+|:------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
+|    llama-7b-turbomind    |           41.67 |             49.12 |        40.84 |      34.94 |                   29.56 |              40.00 |                        34.10 |             35.11 |                    26.46 |                 27.81 |                          34.00 |                          41.82 |
+|   llama-13b-turbomind    |           51.85 |             67.84 |        55.31 |      43.37 |                   28.57 |              60.91 |                        46.15 |             57.25 |                    26.98 |                 29.80 |                          49.00 |                          61.21 |
+|   llama-30b-turbomind    |           71.30 |             79.53 |        66.24 |      49.40 |                   40.39 |              70.00 |                        56.67 |             64.89 |                    37.30 |                 35.10 |                          60.00 |                          70.91 |
+|   llama-65b-turbomind    |           75.00 |             81.29 |        73.63 |      53.01 |                   41.38 |              74.55 |                        65.90 |             77.86 |                    40.21 |                 35.76 |                          69.00 |                          76.36 |
+|   llama-2-7b-turbomind   |           53.70 |             69.01 |        60.13 |      41.57 |                   36.95 |              54.55 |                        45.90 |             55.73 |                    27.25 |                 31.13 |                          40.00 |                          59.39 |
+|  llama-2-13b-turbomind   |           74.07 |             76.61 |        63.99 |      45.78 |                   44.83 |              62.73 |                        50.77 |             62.60 |                    34.13 |                 36.42 |                          57.00 |                          63.03 |
+|  llama-2-70b-turbomind   |           83.33 |             85.96 |        78.46 |      53.61 |                   52.22 |              69.09 |                        74.87 |             87.02 |                    43.39 |                 43.71 |                          78.00 |                          84.24 |
+|   llama-3-8b-turbomind   |           75.00 |             83.04 |        74.28 |      56.02 |                   54.68 |              71.82 |                        64.87 |             79.39 |                    42.06 |                 45.03 |                          68.00 |                          76.36 |
+|  llama-3-70b-turbomind   |           86.11 |             91.23 |        86.50 |      57.83 |                   71.92 |              74.55 |                        82.56 |             88.55 |                    62.70 |                 56.95 |                          86.00 |                          86.67 |
+| internlm2-1.8b-turbomind |           55.56 |             59.65 |        51.13 |      40.96 |                   43.35 |              52.73 |                        43.33 |             47.33 |                    30.42 |                 33.11 |                          47.00 |                          56.36 |
+|  internlm2-7b-turbomind  |           79.63 |             82.46 |        73.63 |      51.20 |                   55.17 |              70.00 |                        66.92 |             70.99 |                    46.03 |                 42.38 |                          70.00 |                          78.79 |
+| internlm2-20b-turbomind  |           75.93 |             82.46 |        73.95 |      56.02 |                   57.64 |              68.18 |                        70.51 |             68.70 |                    49.21 |                 38.41 |                          75.00 |                          82.42 |
+|   qwen-1.8b-turbomind    |           59.26 |             56.14 |        50.80 |      40.96 |                   37.93 |              60.00 |                        41.03 |             51.15 |                    33.33 |                 34.44 |                          39.00 |                          64.24 |
+|    qwen-7b-turbomind     |           73.15 |             76.61 |        67.20 |      47.59 |                   51.23 |              65.45 |                        60.00 |             69.47 |                    43.12 |                 38.41 |                          67.00 |                          66.67 |
+|    qwen-14b-turbomind    |           76.85 |             84.21 |        72.03 |      53.01 |                   65.52 |              66.36 |                        66.92 |             78.63 |                    51.32 |                 41.72 |                          72.00 |                          82.42 |
+|    qwen-72b-turbomind    |           83.33 |             88.30 |        83.28 |      58.43 |                   65.52 |              74.55 |                        81.54 |             89.31 |                    68.52 |                 58.28 |                          81.00 |                          84.24 |
+|     qwen1.5-0.5b-hf      |           40.74 |             40.94 |        41.48 |      40.96 |                   28.57 |              50.91 |                        36.92 |             41.98 |                    28.84 |                 22.52 |                          37.00 |                          52.73 |
+|     qwen1.5-1.8b-hf      |           55.56 |             57.31 |        49.84 |      40.96 |                   36.45 |              56.36 |                        43.59 |             56.49 |                    35.19 |                 27.81 |                          45.00 |                          61.21 |
+|      qwen1.5-4b-hf       |           70.37 |             70.76 |        61.74 |      44.58 |                   45.32 |              65.45 |                        54.62 |             64.89 |                    47.88 |                 32.45 |                          62.00 |                          70.30 |
+|      qwen1.5-7b-hf       |           75.93 |             77.19 |        66.24 |      50.60 |                   53.20 |              62.73 |                        60.00 |             71.76 |                    50.26 |                 38.41 |                          71.00 |                          74.55 |
+|      qwen1.5-14b-hf      |           74.07 |             83.63 |        70.74 |      46.39 |                   58.62 |              64.55 |                        73.59 |             76.34 |                    59.26 |                 49.01 |                          75.00 |                          83.64 |
+|      qwen1.5-32b-hf      |           83.33 |             85.96 |        82.96 |      56.63 |                   61.58 |              63.64 |                        77.95 |             83.97 |                    69.31 |                 50.99 |                          85.00 |                          86.06 |
+|      qwen1.5-72b-hf      |           84.26 |             88.89 |        82.32 |      57.23 |                   66.01 |              72.73 |                        82.05 |             87.02 |                    69.31 |                 56.95 |                          84.00 |                          84.24 |
+|   qwen1.5-moe-a2-7b-hf   |           70.37 |             80.12 |        66.56 |      51.20 |                   47.78 |              64.55 |                        62.31 |             70.99 |                    46.30 |                 45.03 |                          59.00 |                          69.70 |
+|    mistral-7b-v0.1-hf    |           77.78 |             83.04 |        69.45 |      54.82 |                   53.20 |              67.27 |                        66.15 |             78.63 |                    38.10 |                 31.79 |                          68.00 |                          78.79 |
+|    mistral-7b-v0.2-hf    |           73.15 |             82.46 |        72.99 |      53.01 |                   55.67 |              66.36 |                        62.31 |             77.10 |                    40.48 |                 34.44 |                          66.00 |                          76.36 |
+|   mixtral-8x7b-v0.1-hf   |           82.41 |             88.30 |        78.14 |      51.20 |                   62.56 |              70.00 |                        70.77 |             80.92 |                    48.68 |                 48.34 |                          71.00 |                          80.61 |
+|  mixtral-8x22b-v0.1-hf   |           84.26 |             89.47 |        84.57 |      59.04 |                   67.49 |              78.18 |                        79.23 |             88.55 |                    61.64 |                 52.98 |                          87.00 |                          86.06 |
+|         yi-6b-hf         |           78.70 |             81.87 |        69.77 |      46.39 |                   52.71 |              73.64 |                        65.13 |             74.81 |                    46.30 |                 38.41 |                          66.00 |                          71.52 |
+|        yi-34b-hf         |           89.81 |             86.55 |        83.92 |      57.23 |                   64.04 |              73.64 |                        79.49 |             85.50 |                    66.40 |                 52.32 |                          81.00 |                          86.06 |
+|   deepseek-7b-base-hf    |           55.56 |             73.10 |        56.59 |      46.99 |                   34.98 |              62.73 |                        48.21 |             58.78 |                    28.57 |                 29.14 |                          50.00 |                          61.82 |
+|   deepseek-67b-base-hf   |           84.26 |             85.96 |        81.03 |      56.02 |                   57.64 |              72.73 |                        73.85 |             82.44 |                    51.59 |                 45.03 |                          74.00 |                          81.82 |
+
+|          model           |   business_ethics |   moral_disputes |   high_school_statistics |   miscellaneous |   formal_logic |   high_school_government_and_politics |   prehistory |   security_studies |   high_school_biology |   logical_fallacies |   high_school_world_history |   professional_medicine |
+|:------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
+|    llama-7b-turbomind    |             42.00 |            40.46 |                    32.87 |           42.78 |          26.19 |                                 46.11 |        35.19 |              33.47 |                 32.90 |               42.33 |                       43.88 |                   43.75 |
+|   llama-13b-turbomind    |             46.00 |            50.00 |                    30.56 |           64.88 |          31.75 |                                 66.84 |        51.85 |              52.65 |                 51.94 |               52.76 |                       67.51 |                   51.10 |
+|   llama-30b-turbomind    |             55.00 |            66.76 |                    49.07 |           77.91 |          36.51 |                                 82.90 |        68.21 |              66.12 |                 69.35 |               67.48 |                       80.59 |                   55.88 |
+|   llama-65b-turbomind    |             59.00 |            73.70 |                    61.57 |           81.35 |          43.65 |                                 88.60 |        73.46 |              71.84 |                 74.19 |               77.30 |                       83.97 |                   62.13 |
+|   llama-2-7b-turbomind   |             53.00 |            51.16 |                    27.78 |           63.60 |          27.78 |                                 67.36 |        48.77 |              47.76 |                 50.97 |               51.53 |                       64.56 |                   52.57 |
+|  llama-2-13b-turbomind   |             54.00 |            64.45 |                    45.37 |           74.46 |          36.51 |                                 80.83 |        64.81 |              62.86 |                 67.42 |               66.87 |                       72.15 |                   54.41 |
+|  llama-2-70b-turbomind   |             72.00 |            77.17 |                    63.43 |           86.08 |          48.41 |                                 94.30 |        83.64 |              78.37 |                 81.61 |               80.98 |                       87.76 |                   74.63 |
+|   llama-3-8b-turbomind   |             62.00 |            73.70 |                    54.17 |           82.76 |          48.41 |                                 90.16 |        72.53 |              75.51 |                 77.74 |               73.01 |                       82.70 |                   72.06 |
+|  llama-3-70b-turbomind   |             83.00 |            85.55 |                    72.22 |           92.21 |          66.67 |                                 97.41 |        91.05 |              84.90 |                 90.32 |               87.73 |                       94.09 |                   87.13 |
+| internlm2-1.8b-turbomind |             44.00 |            45.95 |                    38.89 |           59.39 |          32.54 |                                 60.62 |        50.31 |              54.29 |                 52.58 |               45.40 |                       62.87 |                   37.87 |
+|  internlm2-7b-turbomind  |             69.00 |            66.76 |                    57.87 |           80.72 |          50.00 |                                 90.16 |        73.15 |              75.10 |                 79.68 |               68.71 |                       81.01 |                   70.22 |
+| internlm2-20b-turbomind  |             74.00 |            74.57 |                    60.19 |           81.48 |          44.44 |                                 91.71 |        75.31 |              81.63 |                 82.58 |               75.46 |                       87.76 |                   63.60 |
+|   qwen-1.8b-turbomind    |             52.00 |            52.31 |                    34.72 |           57.98 |          29.37 |                                 59.07 |        47.22 |              48.57 |                 52.26 |               44.17 |                       61.18 |                   43.38 |
+|    qwen-7b-turbomind     |             68.00 |            64.74 |                    45.37 |           77.39 |          43.65 |                                 83.94 |        68.21 |              70.20 |                 72.26 |               65.64 |                       75.95 |                   58.46 |
+|    qwen-14b-turbomind    |             75.00 |            74.86 |                    57.87 |           84.04 |          51.59 |                                 91.71 |        70.99 |              77.14 |                 83.55 |               73.01 |                       83.12 |                   67.65 |
+|    qwen-72b-turbomind    |             80.00 |            84.97 |                    68.98 |           91.44 |          54.76 |                                 98.96 |        87.04 |              81.63 |                 89.03 |               84.05 |                       90.30 |                   84.93 |
+|     qwen1.5-0.5b-hf      |             47.00 |            46.82 |                    23.15 |           48.02 |          29.37 |                                 48.70 |        40.12 |              38.37 |                 40.65 |               35.58 |                       53.16 |                   31.62 |
+|     qwen1.5-1.8b-hf      |             54.00 |            54.91 |                    28.70 |           61.69 |          23.81 |                                 58.03 |        48.15 |              51.84 |                 55.48 |               45.40 |                       59.92 |                   39.71 |
+|      qwen1.5-4b-hf       |             65.00 |            66.76 |                    44.44 |           73.95 |          35.71 |                                 78.24 |        60.19 |              65.31 |                 66.45 |               65.64 |                       71.31 |                   50.00 |
+|      qwen1.5-7b-hf       |             68.00 |            70.81 |                    48.61 |           76.50 |          38.89 |                                 84.97 |        69.44 |              68.16 |                 74.52 |               68.10 |                       77.22 |                   56.25 |
+|      qwen1.5-14b-hf      |             77.00 |            73.70 |                    62.96 |           83.40 |          53.17 |                                 90.67 |        71.60 |              80.82 |                 84.52 |               76.69 |                       83.54 |                   71.69 |
+|      qwen1.5-32b-hf      |             77.00 |            78.90 |                    68.98 |           88.12 |          54.76 |                                 94.82 |        81.48 |              80.82 |                 88.39 |               82.21 |                       86.08 |                   80.88 |
+|      qwen1.5-72b-hf      |             80.00 |            84.39 |                    68.98 |           91.44 |          55.56 |                                 98.96 |        86.73 |              81.63 |                 88.71 |               85.89 |                       89.87 |                   82.72 |
+|   qwen1.5-moe-a2-7b-hf   |             74.00 |            65.90 |                    56.48 |           82.25 |          34.13 |                                 84.46 |        70.68 |              74.29 |                 73.23 |               68.10 |                       76.79 |                   66.91 |
+|    mistral-7b-v0.1-hf    |             57.00 |            71.10 |                    57.41 |           81.61 |          40.48 |                                 86.53 |        73.46 |              72.65 |                 76.77 |               79.14 |                       77.22 |                   68.75 |
+|    mistral-7b-v0.2-hf    |             61.00 |            71.39 |                    52.78 |           80.08 |          40.48 |                                 88.08 |        69.44 |              72.24 |                 76.13 |               77.91 |                       78.06 |                   70.59 |
+|   mixtral-8x7b-v0.1-hf   |             77.00 |            80.06 |                    63.43 |           87.87 |          54.76 |                                 93.26 |        83.95 |              80.00 |                 84.19 |               79.14 |                       88.61 |                   81.25 |
+|  mixtral-8x22b-v0.1-hf   |             72.00 |            84.10 |                    68.52 |           90.68 |          57.14 |                                 96.37 |        86.73 |              86.53 |                 90.32 |               87.73 |                       90.30 |                   87.87 |
+|         yi-6b-hf         |             67.00 |            69.36 |                    52.78 |           80.46 |          44.44 |                                 89.64 |        70.99 |              74.69 |                 77.10 |               78.53 |                       78.90 |                   65.81 |
+|        yi-34b-hf         |             79.00 |            83.82 |                    66.67 |           90.29 |          57.14 |                                 97.93 |        87.65 |              84.90 |                 88.39 |               87.73 |                       92.83 |                   81.99 |
+|   deepseek-7b-base-hf    |             49.00 |            52.31 |                    41.20 |           66.28 |          30.95 |                                 63.73 |        55.86 |              51.84 |                 52.90 |               58.90 |                       62.45 |                   45.22 |
+|   deepseek-67b-base-hf   |             81.00 |            77.17 |                    63.89 |           90.04 |          53.17 |                                 97.93 |        85.49 |              73.88 |                 82.26 |               84.05 |                       91.56 |                   78.31 |
+
+|          model           |   high_school_mathematics |   college_medicine |   high_school_us_history |   sociology |   econometrics |   high_school_psychology |   human_aging |   us_foreign_policy |
+|:------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
+|    llama-7b-turbomind    |                     24.81 |              32.95 |                    38.73 |       45.77 |          27.19 |                    48.07 |         38.12 |               43.00 |
+|   llama-13b-turbomind    |                     26.30 |              42.20 |                    59.80 |       61.19 |          28.95 |                    61.28 |         53.36 |               78.00 |
+|   llama-30b-turbomind    |                     27.41 |              54.91 |                    76.96 |       79.10 |          35.96 |                    76.15 |         67.71 |               83.00 |
+|   llama-65b-turbomind    |                     34.44 |              54.34 |                    82.84 |       81.09 |          39.47 |                    82.39 |         66.37 |               88.00 |
+|   llama-2-7b-turbomind   |                     29.63 |              43.35 |                    60.29 |       62.69 |          27.19 |                    62.75 |         56.05 |               64.00 |
+|  llama-2-13b-turbomind   |                     27.04 |              52.60 |                    75.49 |       73.13 |          32.46 |                    76.51 |         64.57 |               82.00 |
+|  llama-2-70b-turbomind   |                     34.07 |              64.16 |                    90.69 |       90.55 |          44.74 |                    87.52 |         80.27 |               92.00 |
+|   llama-3-8b-turbomind   |                     38.15 |              64.16 |                    83.33 |       86.57 |          47.37 |                    84.04 |         70.85 |               87.00 |
+|  llama-3-70b-turbomind   |                     48.89 |              79.77 |                    95.10 |       94.03 |          72.81 |                    94.13 |         82.51 |               94.00 |
+| internlm2-1.8b-turbomind |                     30.37 |              41.04 |                    55.88 |       51.74 |          28.95 |                    61.47 |         51.12 |               63.00 |
+|  internlm2-7b-turbomind  |                     39.63 |              68.21 |                    76.96 |       84.58 |          44.74 |                    84.59 |         72.65 |               86.00 |
+| internlm2-20b-turbomind  |                     39.63 |              66.47 |                    82.84 |       85.07 |          47.37 |                    86.79 |         70.85 |               84.00 |
+|   qwen-1.8b-turbomind    |                     28.52 |              43.35 |                    54.90 |       60.70 |          36.84 |                    60.73 |         48.43 |               60.00 |
+|    qwen-7b-turbomind     |                     30.00 |              57.23 |                    75.98 |       79.10 |          32.46 |                    79.27 |         63.23 |               81.00 |
+|    qwen-14b-turbomind    |                     37.41 |              70.52 |                    81.37 |       85.07 |          50.00 |                    84.95 |         73.09 |               86.00 |
+|    qwen-72b-turbomind    |                     50.00 |              75.72 |                    92.16 |       90.05 |          59.65 |                    92.66 |         82.51 |               95.00 |
+|     qwen1.5-0.5b-hf      |                     29.63 |              33.53 |                    45.10 |       59.70 |          28.95 |                    44.77 |         37.22 |               69.00 |
+|     qwen1.5-1.8b-hf      |                     34.07 |              39.31 |                    47.55 |       63.18 |          32.46 |                    59.08 |         53.81 |               73.00 |
+|      qwen1.5-4b-hf       |                     35.93 |              55.49 |                    71.08 |       73.13 |          37.72 |                    72.11 |         63.68 |               79.00 |
+|      qwen1.5-7b-hf       |                     34.81 |              61.85 |                    78.92 |       82.09 |          41.23 |                    80.73 |         61.88 |               84.00 |
+|      qwen1.5-14b-hf      |                     45.93 |              68.21 |                    80.88 |       83.08 |          55.26 |                    86.06 |         73.09 |               88.00 |
+|      qwen1.5-32b-hf      |                     47.04 |              76.30 |                    90.20 |       86.07 |          57.89 |                    90.28 |         75.78 |               92.00 |
+|      qwen1.5-72b-hf      |                     47.78 |              75.14 |                    92.65 |       88.56 |          59.65 |                    92.48 |         79.82 |               94.00 |
+|   qwen1.5-moe-a2-7b-hf   |                     46.30 |              54.91 |                    78.43 |       79.10 |          38.60 |                    82.39 |         66.82 |               83.00 |
+|    mistral-7b-v0.1-hf    |                     33.70 |              65.32 |                    78.92 |       83.08 |          50.00 |                    82.39 |         69.51 |               86.00 |
+|    mistral-7b-v0.2-hf    |                     38.15 |              64.16 |                    81.86 |       82.09 |          43.86 |                    80.18 |         69.96 |               86.00 |
+|   mixtral-8x7b-v0.1-hf   |                     40.37 |              69.94 |                    86.27 |       88.56 |          65.79 |                    88.81 |         79.37 |               91.00 |
+|  mixtral-8x22b-v0.1-hf   |                     45.93 |              79.19 |                    90.20 |       93.03 |          70.18 |                    92.29 |         79.37 |               95.00 |
+|         yi-6b-hf         |                     32.59 |              61.27 |                    79.90 |       82.59 |          35.96 |                    82.94 |         67.26 |               86.00 |
+|        yi-34b-hf         |                     45.19 |              71.68 |                    91.18 |       88.56 |          55.26 |                    91.74 |         78.48 |               91.00 |
+|   deepseek-7b-base-hf    |                     28.89 |              41.62 |                    60.29 |       70.15 |          26.32 |                    69.72 |         55.61 |               76.00 |
+|   deepseek-67b-base-hf   |                     38.89 |              72.25 |                    90.69 |       90.05 |          52.63 |                    90.46 |         80.72 |               95.00 |
+
+## Chat Models
+
+|             model             |   mmlu |   mmlu-stem |   mmlu-social-science |   mmlu-humanities |   mmlu-other |
+|:-----------------------------:|-------:|------------:|----------------------:|------------------:|-------------:|
+|     qwen1.5-0.5b-chat-hf      |  35.32 |       30.90 |                 37.59 |             37.29 |        37.73 |
+|     qwen1.5-1.8b-chat-hf      |  45.62 |       39.20 |                 49.21 |             47.67 |        49.63 |
+|      qwen1.5-4b-chat-hf       |  55.90 |       48.07 |                 62.67 |             59.70 |        57.31 |
+|      qwen1.5-7b-chat-hf       |  61.79 |       52.68 |                 69.41 |             66.41 |        63.45 |
+|      qwen1.5-14b-chat-hf      |  67.96 |       59.79 |                 75.46 |             71.23 |        69.72 |
+|      qwen1.5-32b-chat-hf      |  75.36 |       67.04 |                 82.11 |             80.44 |        76.23 |
+|      qwen1.5-72b-chat-hf      |  77.24 |       69.59 |                 83.95 |             81.58 |        77.87 |
+|     qwen1.5-110b-chat-hf      |  77.95 |       71.56 |                 83.77 |             81.44 |        78.41 |
+|    internlm2-chat-1.8b-hf     |  47.58 |       40.88 |                 53.33 |             49.92 |        49.74 |
+|  internlm2-chat-1.8b-sft-hf   |  47.44 |       40.55 |                 53.31 |             49.67 |        49.89 |
+|     internlm2-chat-7b-hf      |  63.05 |       53.42 |                 71.47 |             67.27 |        65.13 |
+|   internlm2-chat-7b-sft-hf    |  63.33 |       53.95 |                 71.74 |             67.62 |        65.00 |
+|     internlm2-chat-20b-hf     |  67.37 |       57.39 |                 75.75 |             71.63 |        69.95 |
+|   internlm2-chat-20b-sft-hf   |  67.34 |       57.49 |                 75.67 |             70.99 |        70.40 |
+|    llama-3-8b-instruct-hf     |  68.37 |       58.01 |                 77.82 |             71.22 |        71.94 |
+|    llama-3-70b-instruct-hf    |  80.93 |       73.86 |                 87.71 |             83.90 |        82.01 |
+| llama-3-8b-instruct-lmdeploy  |  67.35 |       56.66 |                 75.96 |             70.90 |        71.49 |
+| llama-3-70b-instruct-lmdeploy |  80.85 |       74.07 |                 87.26 |             83.73 |        81.96 |
+|  mistral-7b-instruct-v0.1-hf  |  54.36 |       43.74 |                 62.96 |             58.87 |        57.46 |
+|  mistral-7b-instruct-v0.2-hf  |  59.98 |       49.56 |                 69.22 |             64.41 |        62.24 |
+| mixtral-8x7b-instruct-v0.1-hf |  70.11 |       60.29 |                 79.01 |             74.08 |        72.28 |
+
+### Details
+
+|             model             |   college_biology |   college_chemistry |   college_computer_science |   college_mathematics |   college_physics |   electrical_engineering |   astronomy |   anatomy |   abstract_algebra |   machine_learning |   clinical_knowledge |   global_facts |
+|:-----------------------------:|------------------:|--------------------:|---------------------------:|----------------------:|------------------:|-------------------------:|------------:|----------:|-------------------:|-------------------:|---------------------:|---------------:|
+|     qwen1.5-0.5b-chat-hf      |             31.25 |               32.00 |                      33.00 |                 29.00 |             33.33 |                    38.62 |       33.55 |     28.89 |              20.00 |              27.68 |                40.38 |          33.00 |
+|     qwen1.5-1.8b-chat-hf      |             42.36 |               28.00 |                      45.00 |                 33.00 |             27.45 |                    44.83 |       51.97 |     42.22 |              32.00 |              38.39 |                48.30 |          30.00 |
+|      qwen1.5-4b-chat-hf       |             56.25 |               47.00 |                      49.00 |                 39.00 |             36.27 |                    54.48 |       57.89 |     49.63 |              38.00 |              33.04 |                59.62 |          23.00 |
+|      qwen1.5-7b-chat-hf       |             64.58 |               51.00 |                      59.00 |                 37.00 |             41.18 |                    53.79 |       66.45 |     53.33 |              43.00 |              41.07 |                67.92 |          36.00 |
+|      qwen1.5-14b-chat-hf      |             77.08 |               51.00 |                      64.00 |                 42.00 |             45.10 |                    64.83 |       77.63 |     65.93 |              39.00 |              46.43 |                73.21 |          45.00 |
+|      qwen1.5-32b-chat-hf      |             84.72 |               53.00 |                      57.00 |                 48.00 |             52.94 |                    74.48 |       82.24 |     67.41 |              52.00 |              61.61 |                78.11 |          48.00 |
+|      qwen1.5-72b-chat-hf      |             90.97 |               57.00 |                      66.00 |                 55.00 |             55.88 |                    80.00 |       88.16 |     72.59 |              56.00 |              59.82 |                80.00 |          51.00 |
+|     qwen1.5-110b-chat-hf      |             88.89 |               62.00 |                      66.00 |                 64.00 |             58.82 |                    75.86 |       89.47 |     68.15 |              59.00 |              63.39 |                79.62 |          59.00 |
+|    internlm2-chat-1.8b-hf     |             49.31 |               36.00 |                      47.00 |                 33.00 |             36.27 |                    42.76 |       48.03 |     49.63 |              30.00 |              33.93 |                53.58 |          28.00 |
+|  internlm2-chat-1.8b-sft-hf   |             51.39 |               37.00 |                      50.00 |                 33.00 |             33.33 |                    42.76 |       46.05 |     49.63 |              31.00 |              32.14 |                53.21 |          29.00 |
+|     internlm2-chat-7b-hf      |             68.75 |               47.00 |                      62.00 |                 32.00 |             38.24 |                    57.24 |       69.74 |     58.52 |              29.00 |              53.57 |                70.19 |          41.00 |
+|   internlm2-chat-7b-sft-hf    |             71.53 |               47.00 |                      63.00 |                 34.00 |             37.25 |                    57.24 |       69.74 |     57.78 |              29.00 |              52.68 |                69.43 |          34.00 |
+|     internlm2-chat-20b-hf     |             76.39 |               51.00 |                      61.00 |                 37.00 |             40.20 |                    62.76 |       78.95 |     67.41 |              33.00 |              46.43 |                75.09 |          42.00 |
+|   internlm2-chat-20b-sft-hf   |             77.08 |               49.00 |                      60.00 |                 39.00 |             39.22 |                    64.14 |       79.61 |     68.15 |              35.00 |              46.43 |                75.09 |          42.00 |
+|    llama-3-8b-instruct-hf     |             81.94 |               48.00 |                      58.00 |                 43.00 |             48.04 |                    60.69 |       76.32 |     71.11 |              33.00 |              54.46 |                73.58 |          46.00 |
+|    llama-3-70b-instruct-hf    |             93.06 |               56.00 |                      70.00 |                 60.00 |             60.78 |                    77.24 |       93.42 |     79.26 |              53.00 |              71.43 |                86.42 |          66.00 |
+| llama-3-8b-instruct-lmdeploy  |             79.17 |               47.00 |                      53.00 |                 36.00 |             49.02 |                    60.00 |       73.68 |     68.89 |              36.00 |              55.36 |                73.96 |          42.00 |
+| llama-3-70b-instruct-lmdeploy |             93.75 |               57.00 |                      66.00 |                 61.00 |             65.69 |                    77.93 |       92.11 |     78.52 |              55.00 |              70.54 |                86.42 |          64.00 |
+|  mistral-7b-instruct-v0.1-hf  |             57.64 |               35.00 |                      50.00 |                 31.00 |             24.51 |                    51.72 |       58.55 |     45.93 |              35.00 |              41.07 |                56.98 |          32.00 |
+|  mistral-7b-instruct-v0.2-hf  |             70.14 |               42.00 |                      49.00 |                 35.00 |             43.14 |                    54.48 |       65.79 |     56.30 |              29.00 |              42.86 |                65.28 |          37.00 |
+| mixtral-8x7b-instruct-v0.1-hf |             81.25 |               57.00 |                      57.00 |                 40.00 |             50.00 |                    60.69 |       80.92 |     65.93 |              45.00 |              50.89 |                76.60 |          41.00 |
+
+|             model             |   management |   nutrition |   marketing |   professional_accounting |   high_school_geography |   international_law |   moral_scenarios |   computer_security |   high_school_microeconomics |   professional_law |   medical_genetics |   professional_psychology |
+|:-----------------------------:|-------------:|------------:|------------:|--------------------------:|------------------------:|--------------------:|------------------:|--------------------:|-----------------------------:|-------------------:|-------------------:|--------------------------:|
+|     qwen1.5-0.5b-chat-hf      |        41.75 |       38.89 |       49.15 |                     26.60 |                   48.48 |               50.41 |             24.69 |               42.00 |                        32.35 |              31.75 |              31.00 |                     32.35 |
+|     qwen1.5-1.8b-chat-hf      |        62.14 |       55.56 |       76.92 |                     34.40 |                   58.08 |               61.16 |             21.90 |               56.00 |                        42.44 |              35.14 |              50.00 |                     44.93 |
+|      qwen1.5-4b-chat-hf       |        73.79 |       58.50 |       82.05 |                     47.16 |                   74.24 |               71.90 |             32.29 |               69.00 |                        58.40 |              40.74 |              58.00 |                     53.76 |
+|      qwen1.5-7b-chat-hf       |        79.61 |       69.28 |       85.47 |                     41.49 |                   78.79 |               76.86 |             35.75 |               74.00 |                        65.13 |              44.78 |              68.00 |                     57.68 |
+|      qwen1.5-14b-chat-hf      |        82.52 |       70.26 |       87.18 |                     51.77 |                   85.86 |               82.64 |             53.74 |               81.00 |                        76.05 |              47.98 |              76.00 |                     67.48 |
+|      qwen1.5-32b-chat-hf      |        84.47 |       77.78 |       94.44 |                     60.99 |                   90.91 |               87.60 |             72.96 |               79.00 |                        83.61 |              58.28 |              83.00 |                     77.94 |
+|      qwen1.5-72b-chat-hf      |        89.32 |       85.95 |       93.59 |                     61.35 |                   90.91 |               86.78 |             75.98 |               83.00 |                        84.87 |              60.30 |              83.00 |                     81.05 |
+|     qwen1.5-110b-chat-hf      |        86.41 |       80.72 |       92.74 |                     69.15 |                   93.94 |               84.30 |             77.88 |               83.00 |                        88.66 |              61.73 |              84.00 |                     82.19 |
+|    internlm2-chat-1.8b-hf     |        72.82 |       50.65 |       69.23 |                     35.46 |                   56.06 |               56.20 |             27.82 |               60.00 |                        49.16 |              33.83 |              54.00 |                     43.79 |
+|  internlm2-chat-1.8b-sft-hf   |        71.84 |       52.61 |       68.80 |                     34.75 |                   55.56 |               53.72 |             27.04 |               58.00 |                        48.74 |              34.09 |              54.00 |                     44.61 |
+|     internlm2-chat-7b-hf      |        78.64 |       66.67 |       85.90 |                     46.81 |                   79.29 |               70.25 |             35.31 |               79.00 |                        68.07 |              46.41 |              68.00 |                     64.87 |
+|   internlm2-chat-7b-sft-hf    |        79.61 |       67.97 |       86.75 |                     47.52 |                   80.30 |               70.25 |             35.98 |               80.00 |                        69.33 |              45.83 |              70.00 |                     65.36 |
+|     internlm2-chat-20b-hf     |        80.58 |       75.16 |       90.17 |                     52.13 |                   83.84 |               80.99 |             39.33 |               80.00 |                        70.59 |              49.67 |              75.00 |                     70.26 |
+|   internlm2-chat-20b-sft-hf   |        80.58 |       76.14 |       91.03 |                     53.19 |                   84.34 |               80.99 |             36.31 |               77.00 |                        71.85 |              49.61 |              77.00 |                     70.59 |
+|    llama-3-8b-instruct-hf     |        82.52 |       79.41 |       91.45 |                     52.48 |                   80.30 |               79.34 |             46.26 |               75.00 |                        76.89 |              49.61 |              85.00 |                     72.22 |
+|    llama-3-70b-instruct-hf    |        89.32 |       87.58 |       93.16 |                     66.67 |                   92.42 |               90.08 |             76.20 |               83.00 |                        89.50 |              64.67 |              92.00 |                     87.09 |
+| llama-3-8b-instruct-lmdeploy  |        87.38 |       79.41 |       90.17 |                     52.48 |                   79.80 |               78.51 |             44.25 |               75.00 |                        74.37 |              48.76 |              84.00 |                     69.61 |
+| llama-3-70b-instruct-lmdeploy |        90.29 |       88.56 |       93.59 |                     65.96 |                   92.93 |               89.26 |             75.75 |               83.00 |                        89.92 |              63.95 |              92.00 |                     86.60 |
+|  mistral-7b-instruct-v0.1-hf  |        69.90 |       59.80 |       85.47 |                     38.65 |                   69.70 |               65.29 |             37.54 |               69.00 |                        51.26 |              37.81 |              65.00 |                     52.45 |
+|  mistral-7b-instruct-v0.2-hf  |        74.76 |       66.99 |       88.89 |                     43.97 |                   75.25 |               76.86 |             42.01 |               73.00 |                        62.61 |              42.24 |              67.00 |                     62.25 |
+| mixtral-8x7b-instruct-v0.1-hf |        85.44 |       80.39 |       92.74 |                     55.32 |                   85.35 |               82.64 |             48.38 |               78.00 |                        75.21 |              53.52 |              75.00 |                     74.02 |
+
+|             model             |   jurisprudence |   world_religions |   philosophy |   virology |   high_school_chemistry |   public_relations |   high_school_macroeconomics |   human_sexuality |   elementary_mathematics |   high_school_physics |   high_school_computer_science |   high_school_european_history |
+|:-----------------------------:|----------------:|------------------:|-------------:|-----------:|------------------------:|-------------------:|-----------------------------:|------------------:|-------------------------:|----------------------:|-------------------------------:|-------------------------------:|
+|     qwen1.5-0.5b-chat-hf      |           42.59 |             24.56 |        39.87 |      39.76 |                   29.06 |              38.18 |                        35.64 |             38.93 |                    27.78 |                 29.80 |                          34.00 |                          48.48 |
+|     qwen1.5-1.8b-chat-hf      |           50.93 |             56.73 |        44.37 |      42.77 |                   35.96 |              51.82 |                        38.46 |             49.62 |                    35.45 |                 27.15 |                          47.00 |                          63.03 |
+|      qwen1.5-4b-chat-hf       |           71.30 |             65.50 |        58.20 |      50.00 |                   44.33 |              57.27 |                        54.10 |             61.83 |                    43.65 |                 41.06 |                          60.00 |                          72.12 |
+|      qwen1.5-7b-chat-hf       |           76.85 |             76.61 |        68.49 |      48.80 |                   51.72 |              64.55 |                        59.23 |             68.70 |                    48.94 |                 37.09 |                          69.00 |                          79.39 |
+|      qwen1.5-14b-chat-hf      |           75.93 |             80.70 |        69.13 |      51.20 |                   55.67 |              64.55 |                        67.69 |             74.05 |                    57.14 |                 47.02 |                          74.00 |                          82.42 |
+|      qwen1.5-32b-chat-hf      |           83.33 |             89.47 |        82.64 |      60.84 |                   62.56 |              70.00 |                        76.67 |             83.21 |                    67.46 |                 59.60 |                          85.00 |                          84.85 |
+|      qwen1.5-72b-chat-hf      |           86.11 |             89.47 |        80.71 |      59.04 |                   68.47 |              72.73 |                        80.00 |             87.79 |                    67.72 |                 52.32 |                          79.00 |                          85.45 |
+|     qwen1.5-110b-chat-hf      |           83.33 |             87.13 |        81.03 |      54.22 |                   69.95 |              73.64 |                        78.21 |             87.02 |                    75.93 |                 57.62 |                          84.00 |                          88.48 |
+|    internlm2-chat-1.8b-hf     |           52.78 |             60.82 |        49.20 |      42.77 |                   42.36 |              50.00 |                        47.18 |             53.44 |                    32.54 |                 31.79 |                          39.00 |                          60.00 |
+|  internlm2-chat-1.8b-sft-hf   |           53.70 |             61.40 |        50.16 |      42.17 |                   40.89 |              50.00 |                        47.69 |             51.15 |                    32.54 |                 29.14 |                          40.00 |                          59.39 |
+|     internlm2-chat-7b-hf      |           73.15 |             81.87 |        67.85 |      47.59 |                   49.75 |              62.73 |                        61.79 |             66.41 |                    44.97 |                 33.77 |                          71.00 |                          81.82 |
+|   internlm2-chat-7b-sft-hf    |           73.15 |             81.87 |        66.88 |      48.19 |                   48.77 |              63.64 |                        62.31 |             65.65 |                    45.77 |                 33.77 |                          72.00 |                          81.82 |
+|     internlm2-chat-20b-hf     |           80.56 |             81.87 |        72.99 |      55.42 |                   54.19 |              70.00 |                        67.95 |             71.76 |                    48.15 |                 39.74 |                          75.00 |                          80.00 |
+|   internlm2-chat-20b-sft-hf   |           81.48 |             79.53 |        72.99 |      54.82 |                   54.19 |              69.09 |                        67.95 |             71.76 |                    48.94 |                 41.06 |                          75.00 |                          80.00 |
+|    llama-3-8b-instruct-hf     |           76.85 |             79.53 |        72.35 |      53.61 |                   54.19 |              70.91 |                        66.41 |             80.92 |                    49.47 |                 46.36 |                          71.00 |                          75.15 |
+|    llama-3-70b-instruct-hf    |           87.04 |             88.30 |        82.64 |      56.02 |                   67.49 |              74.55 |                        86.41 |             88.55 |                    74.34 |                 65.56 |                          91.00 |                          86.06 |
+| llama-3-8b-instruct-lmdeploy  |           77.78 |             79.53 |        70.74 |      52.41 |                   53.20 |              68.18 |                        65.38 |             79.39 |                    50.79 |                 37.75 |                          72.00 |                          76.97 |
+| llama-3-70b-instruct-lmdeploy |           87.96 |             90.64 |        83.28 |      54.82 |                   69.46 |              73.64 |                        86.92 |             87.02 |                    74.87 |                 66.23 |                          92.00 |                          85.45 |
+|  mistral-7b-instruct-v0.1-hf  |           64.81 |             70.18 |        63.67 |      41.57 |                   38.92 |              68.18 |                        49.49 |             61.83 |                    33.33 |                 32.45 |                          55.00 |                          66.67 |
+|  mistral-7b-instruct-v0.2-hf  |           70.37 |             80.12 |        64.95 |      50.60 |                   50.74 |              68.18 |                        54.36 |             71.76 |                    40.74 |                 35.10 |                          60.00 |                          73.33 |
+| mixtral-8x7b-instruct-v0.1-hf |           79.63 |             87.72 |        73.63 |      54.82 |                   61.58 |              67.27 |                        69.49 |             83.21 |                    52.91 |                 47.02 |                          74.00 |                          80.61 |
+
+|             model             |   business_ethics |   moral_disputes |   high_school_statistics |   miscellaneous |   formal_logic |   high_school_government_and_politics |   prehistory |   security_studies |   high_school_biology |   logical_fallacies |   high_school_world_history |   professional_medicine |
+|:-----------------------------:|------------------:|-----------------:|-------------------------:|----------------:|---------------:|--------------------------------------:|-------------:|-------------------:|----------------------:|--------------------:|----------------------------:|------------------------:|
+|     qwen1.5-0.5b-chat-hf      |             45.00 |            41.04 |                    30.09 |           39.21 |          24.60 |                                 35.23 |        33.95 |              25.31 |                 36.13 |               31.29 |                       49.37 |                   38.24 |
+|     qwen1.5-1.8b-chat-hf      |             54.00 |            50.29 |                    34.26 |           58.49 |          24.60 |                                 55.96 |        47.53 |              39.18 |                 47.74 |               44.17 |                       64.98 |                   40.81 |
+|      qwen1.5-4b-chat-hf       |             61.00 |            64.16 |                    46.30 |           71.01 |          39.68 |                                 72.02 |        54.01 |              65.31 |                 63.55 |               63.80 |                       71.31 |                   51.10 |
+|      qwen1.5-7b-chat-hf       |             69.00 |            67.05 |                    50.93 |           76.25 |          53.17 |                                 82.38 |        62.96 |              71.02 |                 73.23 |               68.10 |                       76.79 |                   60.29 |
+|      qwen1.5-14b-chat-hf      |             74.00 |            75.14 |                    58.33 |           82.89 |          51.59 |                                 88.60 |        69.44 |              77.96 |                 84.19 |               73.62 |                       82.70 |                   71.32 |
+|      qwen1.5-32b-chat-hf      |             80.00 |            80.64 |                    70.83 |           89.40 |          60.32 |                                 94.82 |        81.79 |              79.59 |                 90.00 |               86.50 |                       88.61 |                   80.15 |
+|      qwen1.5-72b-chat-hf      |             80.00 |            82.95 |                    68.98 |           91.83 |          57.14 |                                 98.45 |        86.73 |              78.78 |                 89.03 |               87.12 |                       91.14 |                   83.82 |
+|     qwen1.5-110b-chat-hf      |             79.00 |            78.03 |                    67.13 |           92.98 |          62.70 |                                 97.93 |        87.04 |              74.29 |                 88.71 |               82.82 |                       91.14 |                   84.93 |
+|    internlm2-chat-1.8b-hf     |             48.00 |            49.13 |                    44.91 |           57.60 |          26.98 |                                 61.14 |        50.62 |              51.02 |                 52.58 |               57.67 |                       67.51 |                   37.50 |
+|  internlm2-chat-1.8b-sft-hf   |             50.00 |            49.13 |                    44.91 |           57.73 |          28.57 |                                 61.66 |        49.69 |              51.02 |                 49.68 |               57.67 |                       66.67 |                   38.60 |
+|     internlm2-chat-7b-hf      |             65.00 |            65.61 |                    49.54 |           80.84 |          43.65 |                                 88.08 |        70.99 |              68.98 |                 78.39 |               75.46 |                       82.28 |                   61.76 |
+|   internlm2-chat-7b-sft-hf    |             64.00 |            66.18 |                    52.31 |           81.35 |          46.03 |                                 88.08 |        71.60 |              67.76 |                 78.39 |               77.30 |                       82.28 |                   63.60 |
+|     internlm2-chat-20b-hf     |             74.00 |            73.70 |                    59.72 |           81.86 |          46.83 |                                 89.12 |        74.69 |              75.92 |                 80.65 |               79.14 |                       82.70 |                   70.59 |
+|   internlm2-chat-20b-sft-hf   |             76.00 |            73.12 |                    60.19 |           81.99 |          43.65 |                                 88.60 |        74.38 |              73.88 |                 80.32 |               80.37 |                       82.70 |                   70.59 |
+|    llama-3-8b-instruct-hf     |             72.00 |            73.12 |                    55.09 |           84.55 |          50.00 |                                 90.67 |        77.16 |              77.55 |                 81.61 |               77.91 |                       84.81 |                   75.00 |
+|    llama-3-70b-instruct-hf    |             85.00 |            85.26 |                    75.00 |           92.72 |          69.05 |                                 97.41 |        90.43 |              82.04 |                 91.61 |               87.12 |                       94.09 |                   89.71 |
+| llama-3-8b-instruct-lmdeploy  |             72.00 |            72.83 |                    52.78 |           82.12 |          51.59 |                                 89.64 |        76.85 |              76.73 |                 80.97 |               76.69 |                       84.39 |                   74.63 |
+| llama-3-70b-instruct-lmdeploy |             85.00 |            84.39 |                    73.61 |           92.72 |          67.46 |                                 97.93 |        89.81 |              81.63 |                 90.65 |               87.12 |                       93.25 |                   89.34 |
+|  mistral-7b-instruct-v0.1-hf  |             55.00 |            57.51 |                    39.81 |           74.07 |          39.68 |                                 75.65 |        57.72 |              62.04 |                 59.35 |               69.33 |                       67.93 |                   55.88 |
+|  mistral-7b-instruct-v0.2-hf  |             61.00 |            66.76 |                    46.76 |           78.67 |          36.51 |                                 84.97 |        68.83 |              70.20 |                 68.39 |               69.33 |                       73.00 |                   58.09 |
+| mixtral-8x7b-instruct-v0.1-hf |             66.00 |            76.59 |                    57.87 |           86.59 |          50.00 |                                 93.78 |        83.02 |              79.18 |                 82.58 |               75.46 |                       86.50 |                   77.94 |
+
+|             model             |   high_school_mathematics |   college_medicine |   high_school_us_history |   sociology |   econometrics |   high_school_psychology |   human_aging |   us_foreign_policy |
+|:-----------------------------:|--------------------------:|-------------------:|-------------------------:|------------:|---------------:|-------------------------:|--------------:|--------------------:|
+|     qwen1.5-0.5b-chat-hf      |                     24.44 |              35.26 |                    42.16 |       47.26 |          29.82 |                    40.55 |         32.29 |               47.00 |
+|     qwen1.5-1.8b-chat-hf      |                     32.22 |              43.35 |                    54.90 |       48.26 |          28.95 |                    61.83 |         48.43 |               71.00 |
+|      qwen1.5-4b-chat-hf       |                     36.30 |              51.45 |                    71.08 |       76.62 |          34.21 |                    72.29 |         58.30 |               72.00 |
+|      qwen1.5-7b-chat-hf       |                     31.11 |              61.27 |                    76.47 |       79.10 |          42.11 |                    81.28 |         61.43 |               83.00 |
+|      qwen1.5-14b-chat-hf      |                     41.48 |              68.79 |                    80.88 |       82.59 |          48.25 |                    84.40 |         72.20 |               88.00 |
+|      qwen1.5-32b-chat-hf      |                     48.52 |              75.72 |                    88.73 |       86.07 |          57.02 |                    90.46 |         78.03 |               95.00 |
+|      qwen1.5-72b-chat-hf      |                     51.48 |              73.99 |                    90.69 |       87.06 |          59.65 |                    92.11 |         79.37 |               94.00 |
+|     qwen1.5-110b-chat-hf      |                     52.22 |              76.30 |                    93.14 |       87.56 |          62.28 |                    91.56 |         80.27 |               88.00 |
+|    internlm2-chat-1.8b-hf     |                     31.48 |              46.82 |                    56.37 |       65.17 |          28.07 |                    65.87 |         50.22 |               69.00 |
+|  internlm2-chat-1.8b-sft-hf   |                     30.74 |              47.40 |                    54.41 |       64.18 |          29.82 |                    66.24 |         48.43 |               69.00 |
+|     internlm2-chat-7b-hf      |                     33.70 |              67.05 |                    79.90 |       81.09 |          48.25 |                    84.04 |         67.26 |               84.00 |
+|   internlm2-chat-7b-sft-hf    |                     35.19 |              67.05 |                    79.90 |       80.60 |          48.25 |                    84.59 |         65.47 |               85.00 |
+|     internlm2-chat-20b-hf     |                     36.30 |              66.47 |                    88.73 |       85.07 |          51.75 |                    85.69 |         70.85 |               87.00 |
+|   internlm2-chat-20b-sft-hf   |                     35.93 |              65.90 |                    87.75 |       85.57 |          52.63 |                    84.77 |         70.85 |               87.00 |
+|    llama-3-8b-instruct-hf     |                     36.67 |              68.79 |                    83.82 |       86.57 |          61.40 |                    84.95 |         70.85 |               85.00 |
+|    llama-3-70b-instruct-hf    |                     57.41 |              78.61 |                    89.71 |       91.54 |          74.56 |                    94.50 |         82.96 |               94.00 |
+| llama-3-8b-instruct-lmdeploy  |                     38.52 |              68.79 |                    82.84 |       85.57 |          54.39 |                    85.50 |         69.96 |               83.00 |
+| llama-3-70b-instruct-lmdeploy |                     54.81 |              79.77 |                    90.20 |       92.04 |          71.05 |                    94.50 |         82.96 |               93.00 |
+|  mistral-7b-instruct-v0.1-hf  |                     28.89 |              50.29 |                    67.16 |       76.12 |          39.47 |                    72.29 |         62.33 |               77.00 |
+|  mistral-7b-instruct-v0.2-hf  |                     30.74 |              53.18 |                    73.04 |       77.11 |          42.11 |                    79.82 |         63.68 |               82.00 |
+| mixtral-8x7b-instruct-v0.1-hf |                     35.56 |              73.41 |                    85.29 |       87.06 |          60.53 |                    86.97 |         74.44 |               86.00 |
--- a/opencompass/configs/datasets/mmlu_lifescience/mmlu_lifescience_gen.py
+++ b/opencompass/configs/datasets/mmlu_lifescience/mmlu_lifescience_gen.py
@ -0,0 +1,69 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import FixKRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
+from opencompass.datasets import MMLUDataset
+from opencompass.utils.text_postprocessors import first_option_postprocess
+
+# Only the life-science-related MMLU subsets are used.
+mmlu_life_science_subsets = [
+    'anatomy',                # Anatomy
+    'clinical_knowledge',     # Clinical knowledge
+    'professional_medicine',  # Professional medicine
+    'medical_genetics',       # Genetics
+    'college_medicine',       # College medicine
+    'college_biology',        # College biology
+]
+
+# Columns made available to the prompt templates and the column that holds
+# the gold answer. `train_split='dev'` is presumably the pool the fixed
+# in-context examples below are taken from -- TODO confirm.
+mmlu_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='dev')
+
+# Filled with one dataset config per life-science subset.
+mmlu_datasets = []
+
+for _name in mmlu_life_science_subsets:
+    # Subject-specific instruction; underscores in the subset name are
+    # shown as spaces.
+    _hint = f'There is a single choice question about {_name.replace("_", " ")}. Answer the question by replying A, B, C or D.'
+    mmlu_infer_cfg = dict(
+        # Template for one in-context example: the question plus its answer.
+        ice_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role='HUMAN',
+                    prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '),
+                dict(role='BOT', prompt='{target}\n')
+            ]),
+        ),
+        # Template for the question under test; '</E>' marks where the
+        # in-context examples are inserted.
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                begin='</E>',
+                round=[
+                    dict(
+                        role='HUMAN',
+                        prompt=f'{_hint}\nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer: '
+                    ),
+                ],
+            ),
+            ice_token='</E>',
+        ),
+        # Five fixed example ids, i.e. a 5-shot prompt.
+        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    # Accuracy evaluation; the postprocessor is given the option letters
+    # 'ABCD' (presumably to extract the chosen letter -- TODO confirm).
+    mmlu_eval_cfg = dict(
+        evaluator=dict(type=AccwithDetailsEvaluator),
+        pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
+
+    mmlu_datasets.append(
+        dict(
+            abbr=f'lukaemon_mmlu_{_name}',
+            type=MMLUDataset,
+            path='opencompass/mmlu',
+            name=_name,
+            reader_cfg=mmlu_reader_cfg,
+            infer_cfg=mmlu_infer_cfg,
+            eval_cfg=mmlu_eval_cfg,
+        ))
+
+# Remove the loop temporaries from the module namespace.
+del _name, _hint
--- a/opencompass/configs/datasets/mmlu_lifescience/mmlu_lifescience_llmjudge_gen.py
+++ b/opencompass/configs/datasets/mmlu_lifescience/mmlu_lifescience_llmjudge_gen.py
@ -0,0 +1,111 @@
+from mmengine.config import read_base
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import MMLUDataset
+from opencompass.utils.text_postprocessors import match_answer_pattern
+from opencompass.evaluator import GenericLLMEvaluator
+from opencompass.datasets import generic_llmjudge_postprocess
+
+with read_base():
+    from .mmlu_lifescience_sets import mmlu_life_science_subsets
+# None of the MMLU datasets on Hugging Face are parsed correctly, so we use our own dataset reader.
+# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar
+
+# Prompt for the evaluated model; {input} and {A}-{D} are format
+# placeholders named after the reader's input columns.
+QUERY_TEMPLATE = """
+Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. 
+
+{input}
+
+A) {A}
+B) {B}
+C) {C}
+D) {D}
+""".strip()
+
+
+# Prompt for the judge model; {input}, {A}-{D}, {target} and {prediction}
+# are format placeholders.
+# NOTE(review): the text first asks for the bare letter "A" or "B" and then
+# says to reply with CORRECT / INCORRECT -- confirm which form
+# generic_llmjudge_postprocess expects.
+GRADER_TEMPLATE = """
+    Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+    
+    Here are some evaluation criteria:
+    1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
+    2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
+    3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
+    4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
+
+    Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
+    A: CORRECT 
+    B: INCORRECT
+    Just return the letters "A" or "B", with no text around it.
+
+    Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
+
+    <Original Question Begin>: {input}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
+    <Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
+    <Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
+    Judging the correctness of candidates' answers:
+""".strip()
+
+# Columns made available to the prompt templates and the column that holds
+# the gold answer.
+mmlu_reader_cfg = dict(
+    input_columns=['input', 'A', 'B', 'C', 'D'],
+    output_column='target',
+    train_split='dev',
+)
+
+# Filled with one LLM-judged dataset config per life-science subset.
+mmlu_datasets = []
+for name in mmlu_life_science_subsets:
+    # Zero-shot inference: a single HUMAN turn built from QUERY_TEMPLATE.
+    mmlu_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(
+                round=[
+                    dict(role='HUMAN', prompt=QUERY_TEMPLATE),
+                ],
+            ),
+        ),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer),
+    )
+
+    # Grading is delegated to a judge model prompted with GRADER_TEMPLATE.
+    mmlu_eval_cfg = dict(
+        evaluator=dict(
+            type=GenericLLMEvaluator,
+            prompt_template=dict(
+                type=PromptTemplate,
+                template=dict(
+                    begin=[
+                        dict(
+                            role='SYSTEM',
+                            fallback_role='HUMAN',
+                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
+                        )
+                    ],
+                    round=[
+                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
+                    ],
+                ),
+            ),
+            # Same dataset as the one being evaluated, handed to the
+            # evaluator.
+            dataset_cfg=dict(
+                type=MMLUDataset,
+                path='opencompass/mmlu',
+                name=name,
+                reader_cfg=mmlu_reader_cfg,
+            ),
+            # Left empty here; presumably the judge model is supplied by
+            # the run configuration -- TODO confirm.
+            judge_cfg=dict(),
+            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
+        ),
+        pred_role='BOT',
+    )
+    mmlu_datasets.append(
+        dict(
+            abbr=f'lukaemon_mmlu_{name}',
+            type=MMLUDataset,
+            path='opencompass/mmlu',
+            name=name,
+            reader_cfg=mmlu_reader_cfg,
+            infer_cfg=mmlu_infer_cfg,
+            eval_cfg=mmlu_eval_cfg,
+            mode='singlescore',
+        )
+    )
--- a/opencompass/configs/datasets/mmlu_lifescience/mmlu_lifescience_sets.py
+++ b/opencompass/configs/datasets/mmlu_lifescience/mmlu_lifescience_sets.py
@ -0,0 +1,8 @@
+mmlu_life_science_subsets = [
+    'anatomy',                # Anatomy
+    'clinical_knowledge',     # Clinical knowledge
+    'professional_medicine',  # Professional medicine
+    'medical_genetics',       # Genetics
+    'college_medicine',       # College medicine
+    'college_biology',        # College biology
+]
--- a/opencompass/datasets/SciEval_lifescience.py
+++ b/opencompass/datasets/SciEval_lifescience.py
@ -0,0 +1,62 @@
+import re
+from typing import List
+
+from datasets import Dataset, DatasetDict, load_dataset
+
+from opencompass.datasets.base import BaseDataset
+from opencompass.registry import LOAD_DATASET
+
+# Pattern for multiple-choice questions, compiled once at import time.
+# It splits a question into its stem and the four options; re.S lets '.'
+# span line breaks.
+_PATTERN_MC = re.compile(
+    r'^(?P<stem>.*?)'  # question stem
+    r'(?:A\.)\s*(?P<A>.*?)\s*'  # option A
+    r'B\.\s*(?P<B>.*?)\s*'  # option B
+    r'C\.\s*(?P<C>.*?)\s*'  # option C
+    r'D\.\s*(?P<D>.*?)'  # option D
+    r'Answer:',  # answer delimiter
+    re.S,
+)
+
+
+@LOAD_DATASET.register_module()
+class SciEvalDataset(BaseDataset):
+    """Biology multiple-choice subset of SciEval."""
+
+    @staticmethod
+    def load(path: str, name: str, **kwargs) -> DatasetDict:
+        """Build a ``DatasetDict`` with a single ``test`` split.
+
+        Args:
+            path: Dataset path forwarded to ``load_dataset``.
+            name: Dataset configuration name forwarded to ``load_dataset``.
+            **kwargs: Accepted for interface compatibility; unused.
+
+        Returns:
+            A ``DatasetDict`` whose ``test`` split has the columns
+            ``input``, ``A``, ``B``, ``C``, ``D`` and ``target``. Rows that
+            are not biology multiple-choice items, have no answer, or do
+            not match the question pattern are dropped.
+        """
+        dataset = DatasetDict()
+
+        for split in ('test', ):
+            raw_iter = load_dataset(
+                path,
+                name=name,
+                split=split,
+                streaming=True,
+            )
+
+            examples: List[dict] = []
+            for ex in raw_iter:
+                # Keep only biology multiple-choice items.
+                if (ex.get('category') != 'biology'
+                        or ex.get('type') != 'multiple-choice'):
+                    continue
+
+                # The gold answer may be stored under either key; its
+                # first element is used as the target.
+                ans_list = ex.get('answer') or ex.get('answers') or []
+                if not ans_list:
+                    continue
+                target = ans_list[0]
+
+                # `or ''` also covers a question explicitly set to None.
+                match = _PATTERN_MC.search(ex.get('question') or '')
+                if not match:
+                    continue
+
+                examples.append({
+                    'input': match.group('stem').strip(),
+                    'A': match.group('A').strip(),
+                    'B': match.group('B').strip(),
+                    'C': match.group('C').strip(),
+                    'D': match.group('D').strip(),
+                    'target': target,
+                })
+
+            dataset[split] = Dataset.from_list(examples)
+
+        return dataset
--- a/opencompass/datasets/init.py
+++ b/opencompass/datasets/init.py
@ -99,6 +99,7 @@ from .MedXpertQA import *  # noqa: F401, F403
 from .mgsm import *  # noqa: F401, F403
 from .mmlu import *  # noqa: F401, F403
 from .mmlu_cf import *  # noqa: F401, F403
+from .mmlu_lifescience import *  # noqa: F401, F403
 from .mmlu_pro import *  # noqa: F401, F403
 from .MMLUArabic import *  # noqa: F401, F403
 from .mmmlu import *  # noqa: F401, F403
@ -127,6 +128,7 @@ from .ruler import *  # noqa: F401, F403
 from .safety import *  # noqa: F401, F403
 from .scibench import ScibenchDataset, scibench_postprocess  # noqa: F401, F403
 from .scicode import *  # noqa: F401, F403
+from .SciEval_lifescience import SciEvalDataset  # noqa: F401
 from .simpleqa import *  # noqa: F401, F403
 from .siqa import *  # noqa: F401, F403
 from .smolinstruct import *  # noqa: F401, F403
--- a/opencompass/datasets/mmlu_lifescience.py
+++ b/opencompass/datasets/mmlu_lifescience.py
@ -0,0 +1,153 @@
+import csv
+import json
+import os.path as osp
+from os import environ
+
+from datasets import Dataset, DatasetDict
+
+from opencompass.registry import LOAD_DATASET
+from opencompass.utils import get_data_path
+
+from .base import BaseDataset
+
+
+# `force=True`: the package __init__ imports `.mmlu` right before this
+# module, and that module presumably already registers a class named
+# `MMLUDataset` (TODO confirm). A second non-forced registration of the same
+# name would be rejected by the registry; forcing is harmless otherwise.
+@LOAD_DATASET.register_module(force=True)
+class MMLUDataset(BaseDataset):
+    """Loader for one MMLU subject with ``dev`` and ``test`` splits."""
+
+    @staticmethod
+    def load(path: str, name: str, **kwargs):
+        """Load the ``dev`` and ``test`` splits of subject ``name``.
+
+        Args:
+            path: Dataset location, resolved through ``get_data_path``.
+            name: MMLU subject name, e.g. ``'anatomy'``.
+            **kwargs: Accepted for interface compatibility; unused.
+
+        Returns:
+            A ``DatasetDict`` whose rows have the columns ``input``, ``A``,
+            ``B``, ``C``, ``D`` and ``target``.
+        """
+        path = get_data_path(path)
+        dataset = DatasetDict()
+        if environ.get('DATASET_SOURCE') == 'ModelScope':
+            from modelscope import MsDataset
+            for split in ['dev', 'test']:
+                # Load the data from ModelScope.
+                ms_dataset = MsDataset.load(path,
+                                            subset_name=name,
+                                            split=split)
+                dataset_list = []
+                for line in ms_dataset:
+                    dataset_list.append({
+                        'input': line['question'],
+                        'A': line['choices'][0],
+                        'B': line['choices'][1],
+                        'C': line['choices'][2],
+                        'D': line['choices'][3],
+                        # The answer is an index that is mapped to a letter.
+                        'target': 'ABCD'[line['answer']],
+                    })
+                dataset[split] = Dataset.from_list(dataset_list)
+        else:
+            for split in ['dev', 'test']:
+                raw_data = []
+                filename = osp.join(path, split, f'{name}_{split}.csv')
+                with open(filename, encoding='utf-8') as f:
+                    reader = csv.reader(f)
+                    for row in reader:
+                        # Each row: question, four options, answer letter.
+                        assert len(row) == 6
+                        raw_data.append({
+                            'input': row[0],
+                            'A': row[1],
+                            'B': row[2],
+                            'C': row[3],
+                            'D': row[4],
+                            'target': row[5],
+                        })
+                dataset[split] = Dataset.from_list(raw_data)
+        return dataset
+
+
+class MMLUDatasetClean(BaseDataset):
+    """MMLU loader that adds an ``is_clean`` label to every test item."""
+
+    # load the contamination annotations of MMLU from
+    # https://github.com/liyucheng09/Contamination_Detector
+    @staticmethod
+    def load_contamination_annotations(path, split='test'):
+        """Return the contamination annotations, downloading them if needed.
+
+        Args:
+            path: Dataset root; the annotation cache lives below it unless
+                ``DATASET_SOURCE`` is ``'ModelScope'``.
+            split: Must be ``'test'``; any other value fails the assertion.
+
+        Returns:
+            The parsed annotation JSON.
+        """
+        import requests
+
+        assert split == 'test', 'We only use test set for MMLU'
+        if environ.get('DATASET_SOURCE') == 'ModelScope':
+            from modelscope.utils.config_ds import MS_DATASETS_CACHE
+            annotation_cache_path = osp.join(
+                MS_DATASETS_CACHE,
+                f'MMLU_{split}_contamination_annotations.json')
+            link_of_annotations = 'https://modelscope.cn/datasets/opencompass/Contamination_Detector/resolve/master/mmlu_annotations.json'  # noqa
+        else:
+            annotation_cache_path = osp.join(
+                path, split, f'MMLU_{split}_contamination_annotations.json')
+            link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc2/mmlu_annotations.json'  # noqa
+
+        # Reuse the cached copy when one exists.
+        if osp.exists(annotation_cache_path):
+            with open(annotation_cache_path, 'r') as f:
+                annotations = json.load(f)
+            return annotations
+
+        response = requests.get(link_of_annotations)
+        # Fail on an HTTP error instead of parsing and caching an error page.
+        response.raise_for_status()
+        annotations = json.loads(response.text)
+        with open(annotation_cache_path, 'w') as f:
+            json.dump(annotations, f)
+        return annotations
+
+    @staticmethod
+    def _clean_label(annotations, name, row_index):
+        """Return the label stored under ``'<name> <row_index>'``."""
+        row_id = f'{name} {row_index}'
+        if row_id in annotations:
+            return annotations[row_id][0]
+        return 'not labeled'
+
+    @staticmethod
+    def load(path: str, name: str):
+        """Load ``dev`` and ``test``; test rows also carry ``is_clean``.
+
+        Args:
+            path: Dataset location, resolved through ``get_data_path``.
+            name: MMLU subject name.
+
+        Returns:
+            A ``DatasetDict`` with the columns ``input``, ``A``, ``B``,
+            ``C``, ``D`` and ``target``, plus ``is_clean`` on ``test``.
+        """
+        path = get_data_path(path)
+        dataset = DatasetDict()
+        if environ.get('DATASET_SOURCE') == 'ModelScope':
+            from modelscope import MsDataset
+
+            for split in ['dev', 'test']:
+                # Load the data from ModelScope.
+                ms_dataset = MsDataset.load(path,
+                                            subset_name=name,
+                                            split=split)
+                if split == 'test':
+                    annotations = \
+                        MMLUDatasetClean.load_contamination_annotations(
+                            path, split)
+                dataset_list = []
+                for row_index, line in enumerate(ms_dataset):
+                    item = {
+                        'input': line['question'],
+                        'A': line['choices'][0],
+                        'B': line['choices'][1],
+                        'C': line['choices'][2],
+                        'D': line['choices'][3],
+                        'target': 'ABCD'[line['answer']],
+                    }
+                    if split == 'test':
+                        item['is_clean'] = MMLUDatasetClean._clean_label(
+                            annotations, name, row_index)
+                    dataset_list.append(item)
+                dataset[split] = Dataset.from_list(dataset_list)
+        else:
+            for split in ['dev', 'test']:
+                raw_data = []
+                filename = osp.join(path, split, f'{name}_{split}.csv')
+                if split == 'test':
+                    annotations = \
+                        MMLUDatasetClean.load_contamination_annotations(
+                            path, split)
+                with open(filename, encoding='utf-8') as f:
+                    reader = csv.reader(f)
+                    for row_index, row in enumerate(reader):
+                        # Each row: question, four options, answer letter.
+                        assert len(row) == 6
+                        item = {
+                            'input': row[0],
+                            'A': row[1],
+                            'B': row[2],
+                            'C': row[3],
+                            'D': row[4],
+                            'target': row[5],
+                        }
+                        if split == 'test':
+                            item['is_clean'] = MMLUDatasetClean._clean_label(
+                                annotations, name, row_index)
+                        raw_data.append(item)
+                dataset[split] = Dataset.from_list(raw_data)
+        return dataset