Mirror of https://github.com/open-compass/opencompass.git, synced 2025-05-30 16:03:24 +08:00

[Feature] Add qwen & qwen-chat support (#286)

* add and apply update suffix tool
* add tool doc
* add qwen configs
* add cmmlu
* rename bbh
* update datasets
* delete
* update hf_qwen_7b.py

This commit is contained in:
parent fd389e2d78
commit 7ca6ba625e

configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_a69961.py (new file, 35 lines; path inferred from the qwen collection import below)
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import ReCoRDDataset_V2, ReCoRD_postprocess

ReCoRD_reader_cfg = dict(
    input_columns=['question', 'text'], output_column='answers')

ReCoRD_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN", prompt="Passage:\n{text}\nResult:\n{question}\nQuestion:\nWhat entity does ____ refer to in the Result?\nAnswer:"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

ReCoRD_eval_cfg = dict(
    evaluator=dict(type=EMEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=ReCoRD_postprocess))

ReCoRD_datasets = [
    dict(
        type=ReCoRDDataset_V2,
        abbr='ReCoRD',
        path='./data/SuperGLUE/ReCoRD/val.jsonl',
        reader_cfg=ReCoRD_reader_cfg,
        infer_cfg=ReCoRD_infer_cfg,
        eval_cfg=ReCoRD_eval_cfg)
]
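
For orientation, this is roughly the string the HUMAN turn above produces for one record (a sketch, not part of the commit; the field names question/text come from ReCoRD_reader_cfg, the values are invented):

# Sketch: fill the ReCoRD prompt template with one invented record.
prompt = ('Passage:\n{text}\nResult:\n{question}\nQuestion:\n'
          'What entity does ____ refer to in the Result?\nAnswer:')
record = dict(text='Tropical Storm Debby weakened after landfall...',
              question='____ is expected to bring heavy rain to Florida.')
print(prompt.format(**record))
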
configs/datasets/bbh/bbh_gen_e3d13a.py (new file, 105 lines)
@@ -0,0 +1,105 @@
from os.path import exists
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess

bbh_reader_cfg = dict(input_columns=["input"], output_column="target")

_path_prefix = "./data/BBH"

bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    _hint = None
    if exists(f"{_path_prefix}/lib_prompt/{_name}.txt"):
        _hint = open(f"{_path_prefix}/lib_prompt/{_name}.txt", 'r').read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role="HUMAN",
                    prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_role="BOT",
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path=f"{_path_prefix}/data",
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:
    _hint = None
    if exists(f"{_path_prefix}/lib_prompt/{_name}.txt"):
        _hint = open(f"{_path_prefix}/lib_prompt/{_name}.txt", 'r').read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role="HUMAN",
                    prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role="BOT")

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path=f"{_path_prefix}/data",
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

del _name, _hint, _path_prefix
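
A detail worth flagging in both loops above: when ./data/BBH/lib_prompt/<task>.txt is missing, _hint stays None and the f-string bakes the literal text "None" into every prompt for that task. A small pre-flight check along these lines (hypothetical helper, not part of the commit) would surface that:

# Sketch: list BBH tasks whose few-shot prompt file is absent.
from os.path import exists

path_prefix = './data/BBH'
for name in bbh_multiple_choice_sets + bbh_free_form_sets:  # defined above
    if not exists(f'{path_prefix}/lib_prompt/{name}.txt'):
        print(f'missing few-shot prompts for {name}')
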
configs/datasets/collections/leaderboard/qwen.py (new file, 51 lines)
@@ -0,0 +1,51 @@
from mmengine.config import read_base

with read_base():
    from ...ceval.ceval_ppl_578f8d import ceval_datasets
    from ...agieval.agieval_mixed_2f14ad import agieval_datasets
    from ...mmlu.mmlu_ppl_ac766d import mmlu_datasets
    from ...cmmlu.cmmlu_ppl_8b9c76 import cmmlu_datasets
    from ...GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
    from ...ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
    from ...ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets

    from ...SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
    from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
    from ...CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
    from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
    from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
    from ...flores.flores_gen_806ede import flores_datasets

    from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
    from ...commonsenseqa.commonsenseqa_ppl_5545e2 import commonsenseqa_datasets
    from ...triviaqa.triviaqa_gen_0356ec import triviaqa_datasets
    from ...nq.nq_gen_0356ec import nq_datasets

    from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
    from ...race.race_ppl_5831a0 import race_datasets
    from ...obqa.obqa_gen_9069e4 import obqa_datasets
    from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
    from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
    from ...Xsum.Xsum_gen_31397e import Xsum_datasets
    from ...FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
    from ...lambada.lambada_gen_217e11 import lambada_datasets

    from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
    from ...CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
    from ...SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
    from ...SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
    from ...SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
    from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
    from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_a69961 import ReCoRD_datasets
    from ...hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
    from ...piqa.piqa_gen_1194eb import piqa_datasets
    from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
    from ...math.math_gen_265cce import math_datasets
    from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from ...drop.drop_gen_599f07 import drop_datasets
    from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
    from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
    from ...bbh.bbh_gen_e3d13a import bbh_datasets


datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
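
The closing line is the idiom these collection configs rely on: read_base() injects every imported *_datasets list into the module namespace, and summing over locals() flattens them into a single datasets list. A minimal standalone illustration with invented entries:

# Sketch: how the locals()-summing idiom flattens per-benchmark lists.
ceval_datasets = [dict(abbr='ceval-demo')]
mmlu_datasets = [dict(abbr='mmlu-demo')]
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
print([d['abbr'] for d in datasets])  # ['ceval-demo', 'mmlu-demo']
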
configs/datasets/collections/leaderboard/qwen_chat.py (new file, 51 lines)
@@ -0,0 +1,51 @@
from mmengine.config import read_base

with read_base():
    from ...ceval.ceval_gen_5f30c7 import ceval_datasets
    from ...agieval.agieval_mixed_2f14ad import agieval_datasets
    from ...mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from ...cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
    from ...GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
    from ...ARC_c.ARC_c_ppl_2ef631 import ARC_c_datasets
    from ...ARC_e.ARC_e_ppl_2ef631 import ARC_e_datasets

    from ...SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
    from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
    from ...CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
    from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
    from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
    from ...flores.flores_gen_806ede import flores_datasets

    from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
    from ...commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
    from ...triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
    from ...nq.nq_gen_c788f6 import nq_datasets

    from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
    from ...race.race_gen_69ee4f import race_datasets
    from ...obqa.obqa_ppl_6aac9e import obqa_datasets
    from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
    from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
    from ...Xsum.Xsum_gen_31397e import Xsum_datasets
    from ...FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
    from ...lambada.lambada_gen_217e11 import lambada_datasets

    from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
    from ...CLUE_ocnli.CLUE_ocnli_ppl_fdc6de import ocnli_datasets
    from ...SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
    from ...SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
    from ...SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
    from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
    from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
    from ...hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets
    from ...piqa.piqa_ppl_0cfff2 import piqa_datasets
    from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
    from ...math.math_gen_265cce import math_datasets
    from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from ...drop.drop_gen_599f07 import drop_datasets
    from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
    from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
    from ...bbh.bbh_gen_6bd693 import bbh_datasets


datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

configs/eval_qwen_7b.py (new file, 58 lines)
@@ -0,0 +1,58 @@
from mmengine.config import read_base

with read_base():
    from .models.hf_qwen_7b import models
    from .datasets.collections.leaderboard.qwen import datasets
    from .summarizers.leaderboard import summarizer

'''
dataset                                 version    metric            mode    qwen-7b-hf
--------------------------------------  ---------  ----------------  ------  ------------
--------- 考试 Exam ---------             -          -                 -       -
ceval                                   -          naive_average     ppl     58.65
agieval                                 -          naive_average     mixed   40.49
mmlu                                    -          naive_average     ppl     57.78
cmmlu                                   -          naive_average     ppl     58.57
GaokaoBench                             -          weighted_average  mixed   51.76
ARC-c                                   72cf91     accuracy          gen     83.73
ARC-e                                   72cf91     accuracy          gen     90.65
--------- 语言 Language ---------         -          -                 -       -
WiC                                     ce62e6     accuracy          ppl     51.10
chid-dev                                25f3d3     accuracy          ppl     86.63
afqmc-dev                               cc328c     accuracy          ppl     69.00
WSC                                     678cb5     accuracy          ppl     63.46
tydiqa-goldp                            -          naive_average     gen     19.98
flores_100                              -          naive_average     gen     3.20
--------- 知识 Knowledge ---------        -          -                 -       -
BoolQ                                   463fee     accuracy          ppl     83.00
commonsense_qa                          0d8e25     accuracy          ppl     67.49
triviaqa                                b6904f     score             gen     40.45
nq                                      b6904f     score             gen     14.16
--------- 理解 Understanding ---------    -          -                 -       -
C3                                      e6778d     accuracy          gen     75.29
race-middle                             73bdec     accuracy          ppl     90.53
race-high                               73bdec     accuracy          ppl     87.71
openbookqa_fact                         fa871c     accuracy          gen     92.20
csl_dev                                 3c4211     accuracy          ppl     56.25
lcsts                                   0b3969     rouge1            gen     12.38
Xsum                                    207e69     rouge1            gen     36.00
eprstmt-dev                             101429     accuracy          gen     89.38
lambada                                 de1af2     accuracy          gen     67.88
--------- 推理 Reasoning ---------        -          -                 -       -
cmnli                                   15e783     accuracy          ppl     54.85
ocnli                                   1471e7     accuracy          gen     42.34
AX_b                                    793c72     accuracy          gen     58.61
AX_g                                    c4c886     accuracy          gen     69.10
RTE                                     c4c886     accuracy          gen     57.76
COPA                                    59f42c     accuracy          gen     88.00
ReCoRD                                  3e0689     score             gen     27.78
hellaswag                               06a1e2     accuracy          gen     92.47
piqa                                    24369d     accuracy          gen     78.02
siqa                                    ea30d1     accuracy          ppl     75.03
math                                    2c0b9e     accuracy          gen     11.06
gsm8k                                   4c7f6e     accuracy          gen     50.87
drop                                    53a0a7     score             gen     44.95
openai_humaneval                        dd0dff     humaneval_pass@1  gen     23.78
mbpp                                    60ca11     score             gen     31.20
bbh                                     -          naive_average     gen     40.03
'''

configs/eval_qwen_7b_chat.py (new file, 58 lines)
@@ -0,0 +1,58 @@
from mmengine.config import read_base

with read_base():
    from .models.hf_qwen_7b_chat import models
    from .datasets.collections.leaderboard.qwen_chat import datasets
    from .summarizers.leaderboard import summarizer

'''
dataset                                 version    metric            mode    qwen-7b-chat-hf
--------------------------------------  ---------  ----------------  ------  -----------------
--------- 考试 Exam ---------             -          -                 -       -
ceval                                   -          naive_average     gen     56.07
agieval                                 -          naive_average     mixed   39.51
mmlu                                    -          naive_average     gen     53.49
cmmlu                                   -          naive_average     gen     55.29
GaokaoBench                             -          weighted_average  gen     48.01
ARC-c                                   ca1e8e     accuracy          ppl     74.92
ARC-e                                   ca1e8e     accuracy          ppl     85.71
--------- 语言 Language ---------         -          -                 -       -
WiC                                     efbd01     accuracy          gen     51.41
chid-dev                                25f3d3     accuracy          ppl     77.72
afqmc-dev                               4a1636     accuracy          gen     69.00
WSC                                     678cb5     accuracy          ppl     67.31
tydiqa-goldp                            -          naive_average     gen     15.32
flores_100                              -          naive_average     gen     10.00
--------- 知识 Knowledge ---------        -          -                 -       -
BoolQ                                   463fee     accuracy          ppl     83.18
commonsense_qa                          ddaabf     accuracy          gen     76.41
triviaqa                                b6904f     score             gen     43.25
nq                                      23dc1a     score             gen     16.26
--------- 理解 Understanding ---------    -          -                 -       -
C3                                      e6778d     accuracy          gen     81.53
race-middle                             e0908b     accuracy          gen     83.01
race-high                               e0908b     accuracy          gen     77.79
openbookqa_fact                         49689a     accuracy          ppl     86.40
csl_dev                                 3c4211     accuracy          ppl     64.38
lcsts                                   0b3969     rouge1            gen     12.75
Xsum                                    207e69     rouge1            gen     20.21
eprstmt-dev                             ed0c5d     accuracy          ppl     85.00
lambada                                 de1af2     accuracy          gen     59.19
--------- 推理 Reasoning ---------        -          -                 -       -
cmnli                                   15e783     accuracy          ppl     48.08
ocnli                                   15e783     accuracy          ppl     51.40
AX_b                                    689df1     accuracy          ppl     65.67
AX_g                                    808a19     accuracy          ppl     76.12
RTE                                     808a19     accuracy          ppl     68.95
COPA                                    59f42c     accuracy          gen     92.00
ReCoRD                                  6f7cfc     score             gen     0.16
hellaswag                               8d79e0     accuracy          ppl     69.28
piqa                                    34eee7     accuracy          ppl     72.20
siqa                                    ea30d1     accuracy          ppl     72.88
math                                    2c0b9e     accuracy          gen     7.84
gsm8k                                   4c7f6e     accuracy          gen     45.41
drop                                    53a0a7     score             gen     39.62
openai_humaneval                        dd0dff     humaneval_pass@1  gen     10.98
mbpp                                    60ca11     score             gen     20.60
bbh                                     -          naive_average     gen     42.61
'''

configs/models/hf_qwen_7b.py (new file, 32 lines)
@@ -0,0 +1,32 @@
from opencompass.models import HuggingFaceCausalLM

# Please note that we have specified the revision here. Recently (on 20230827),
# during our evaluations, we found that the newer revision models have a drop
# of more than 5 points on datasets like GaokaoBench / mbpp.
# We are not yet sure whether this drop is due to incorrect logic in OpenCompass
# calling qwen or some other reasons. We would like to highlight this.

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen-7b-hf',
        path="Qwen/Qwen-7B",
        tokenizer_path='Qwen/Qwen-7B',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
            revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09'
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
            revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09'
        ),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
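
Given the revision warning in the comment above, it can be worth confirming the pinned snapshot still resolves before launching a long run. A possible pre-flight check (my sketch; assumes the huggingface_hub package, which the HuggingFace backend already pulls in):

# Sketch: verify the pinned Qwen-7B revision is fetchable, metadata only.
from huggingface_hub import snapshot_download

snapshot_download('Qwen/Qwen-7B',
                  revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09',
                  allow_patterns=['config.json'])
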
configs/models/hf_qwen_7b_chat.py (new file, 29 lines)
@@ -0,0 +1,29 @@
from opencompass.models import HuggingFaceCausalLM


_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'),
        dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True),
    ],
)

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen-7b-chat-hf',
        path="Qwen/Qwen-7B-Chat",
        tokenizer_path='Qwen/Qwen-7B-Chat',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        meta_template=_meta_template,
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
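
To make _meta_template concrete: each round wraps the user turn in ChatML-style markers and starts generation right after the assistant header, which is the dialogue format Qwen-7B-Chat was tuned on. An illustrative rendering of one round (my sketch of the intended string, not OpenCompass internals):

# Sketch: one round rendered under the meta template above.
human = '\n<|im_start|>user\n' + 'What is the capital of France?' + '<|im_end|>'
bot = '\n<|im_start|>assistant\n'  # generate=True: the model continues here
print(human + bot)
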
@@ -3,10 +3,13 @@ from mmengine.config import read_base
 with read_base():
     from .groups.agieval import agieval_summary_groups
     from .groups.mmlu import mmlu_summary_groups
+    from .groups.cmmlu import cmmlu_summary_groups
     from .groups.ceval import ceval_summary_groups
     from .groups.bbh import bbh_summary_groups
     from .groups.GaokaoBench import GaokaoBench_summary_groups
     from .groups.flores import flores_summary_groups
+    from .groups.tydiqa import tydiqa_summary_groups
+    from .groups.xiezhi import xiezhi_summary_groups

 summarizer = dict(
     summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),

configs/summarizers/groups/cmmlu.py (new file, 104 lines)
@@ -0,0 +1,104 @@
subcategories = {
    "agronomy": ['other'],
    "anatomy": ['biology'],
    "ancient_chinese": ['linguistics', 'china specific'],
    "arts": ['arts'],
    "astronomy": ['physics'],
    "business_ethics": ['business'],
    "chinese_civil_service_exam": ['politics', 'china specific'],
    "chinese_driving_rule": ['other', 'china specific'],
    "chinese_food_culture": ['culture', 'china specific'],
    "chinese_foreign_policy": ['politics', 'china specific'],
    "chinese_history": ['history', 'china specific'],
    "chinese_literature": ['literature', 'china specific'],
    "chinese_teacher_qualification": ['education', 'china specific'],
    "college_actuarial_science": ['math'],
    "college_education": ['education'],
    "college_engineering_hydrology": ['engineering'],
    "college_law": ['law'],
    "college_mathematics": ['math'],
    "college_medical_statistics": ['statistics'],
    "clinical_knowledge": ['other'],
    "college_medicine": ['other'],
    "computer_science": ['computer science'],
    "computer_security": ['other'],
    "conceptual_physics": ['physics'],
    "construction_project_management": ['other', 'china specific'],
    "economics": ['economics'],
    "education": ['education'],
    "elementary_chinese": ['linguistics', 'china specific'],
    "elementary_commonsense": ['other', 'china specific'],
    "elementary_information_and_technology": ['other'],
    "electrical_engineering": ['engineering'],
    "elementary_mathematics": ['math'],
    "ethnology": ['culture', 'china specific'],
    "food_science": ['other'],
    "genetics": ['biology'],
    "global_facts": ['global'],
    "high_school_biology": ['biology'],
    "high_school_chemistry": ['chemistry'],
    "high_school_geography": ['geography'],
    "high_school_mathematics": ['math'],
    "high_school_physics": ['physics'],
    "high_school_politics": ['politics', 'china specific'],
    "human_sexuality": ['other'],
    "international_law": ['law'],
    "journalism": ['sociology'],
    "jurisprudence": ['law'],
    "legal_and_moral_basis": ['other'],
    "logical": ['philosophy'],
    "machine_learning": ['computer science'],
    "management": ['business'],
    "marketing": ['business'],
    "marxist_theory": ['philosophy'],
    "modern_chinese": ['linguistics', 'china specific'],
    "nutrition": ['other'],
    "philosophy": ['philosophy'],
    "professional_accounting": ['business'],
    "professional_law": ['law'],
    "professional_medicine": ['other'],
    "professional_psychology": ['psychology'],
    "public_relations": ['politics'],
    "security_study": ['politics'],
    "sociology": ['culture'],
    "sports_science": ['other'],
    "traditional_chinese_medicine": ['other', 'china specific'],
    "virology": ['biology'],
    "world_history": ['history'],
    "world_religions": ['global'],
}

categories = {
    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"],
    "Humanities": ["history", "philosophy", "law", "arts", "literature", "global"],
    "Social Science": ["linguistics", "business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"],
    "Other": ["other"],
    "China specific": ["china specific"],
}

category2subject = {}
for k, v in categories.items():
    for subject, subcat in subcategories.items():
        for c in subcat:
            if c in v:
                category2subject.setdefault(k, []).append(subject)

cmmlu_summary_groups = []

_cmmlu_humanities = ['cmmlu-' + s for s in category2subject['Humanities']]
cmmlu_summary_groups.append({'name': 'cmmlu-humanities', 'subsets': _cmmlu_humanities})

_cmmlu_stem = ['cmmlu-' + s for s in category2subject['STEM']]
cmmlu_summary_groups.append({'name': 'cmmlu-stem', 'subsets': _cmmlu_stem})

_cmmlu_social_science = ['cmmlu-' + s for s in category2subject['Social Science']]
cmmlu_summary_groups.append({'name': 'cmmlu-social-science', 'subsets': _cmmlu_social_science})

_cmmlu_other = ['cmmlu-' + s for s in category2subject['Other']]
cmmlu_summary_groups.append({'name': 'cmmlu-other', 'subsets': _cmmlu_other})

_cmmlu_china_specific = ['cmmlu-' + s for s in category2subject['China specific']]
cmmlu_summary_groups.append({'name': 'cmmlu-china-specific', 'subsets': _cmmlu_china_specific})

_cmmlu_all = ['cmmlu-' + s for s in subcategories.keys()]
cmmlu_summary_groups.append({'name': 'cmmlu', 'subsets': _cmmlu_all})
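
The nested loop above inverts the subject-to-tag map into category2subject. One property the per-category groups implicitly depend on is that every CMMLU subject lands in at least one category; a quick check (hypothetical, runnable at the end of this config):

# Sketch: assert the category groups jointly cover all CMMLU subjects.
covered = {s for subjects in category2subject.values() for s in subjects}
assert covered == set(subcategories), set(subcategories) - covered
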
configs/summarizers/groups/tydiqa.py (new file, 5 lines)
@@ -0,0 +1,5 @@
tydiqa_summary_groups = []

_tydiqa = ['arabic', 'bengali', 'english', 'finnish', 'indonesian', 'japanese', 'korean', 'russian', 'swahili', 'telugu', 'thai']
_tydiqa = ['tyidqa-goldp_' + s for s in _tydiqa]
tydiqa_summary_groups.append({'name': 'tydiqa-goldp', 'subsets': _tydiqa})

configs/summarizers/groups/xiezhi.py (new file, 4 lines)
@@ -0,0 +1,4 @@
xiezhi_summary_groups = []

_xiezhi = ["xiezhi-spec_eng", "xiezhi-spec_chn", "xiezhi-inter_eng", "xiezhi-inter_chn"]
xiezhi_summary_groups.append({'name': 'xiezhi', 'subsets': _xiezhi})

configs/summarizers/leaderboard.py (new file, 89 lines)
@@ -0,0 +1,89 @@
from mmengine.config import read_base

with read_base():
    from .groups.agieval import agieval_summary_groups
    from .groups.mmlu import mmlu_summary_groups
    from .groups.cmmlu import cmmlu_summary_groups
    from .groups.ceval import ceval_summary_groups
    from .groups.bbh import bbh_summary_groups
    from .groups.GaokaoBench import GaokaoBench_summary_groups
    from .groups.flores import flores_summary_groups
    from .groups.tydiqa import tydiqa_summary_groups
    from .groups.xiezhi import xiezhi_summary_groups


summarizer = dict(
    dataset_abbrs=[
        '--------- 考试 Exam ---------',  # category
        # 'Mixed',  # subcategory
        'ceval',
        'agieval',
        'mmlu',
        'cmmlu',
        'GaokaoBench',
        'ARC-c',
        'ARC-e',
        '--------- 语言 Language ---------',  # category
        # '字词释义' (word definitions),  # subcategory
        'WiC',
        # '成语习语' (idioms),  # subcategory
        'chid-dev',
        # '语义相似度' (semantic similarity),  # subcategory
        'afqmc-dev',
        # '指代消解' (coreference resolution),  # subcategory
        'WSC',
        # '多语种问答' (multilingual QA),  # subcategory
        'tydiqa-goldp',
        # '翻译' (translation),  # subcategory
        'flores_100',
        '--------- 知识 Knowledge ---------',  # category
        # '知识问答' (knowledge QA),  # subcategory
        'BoolQ',
        'commonsense_qa',
        'triviaqa',
        'nq',
        '--------- 理解 Understanding ---------',  # category
        # '阅读理解' (reading comprehension),  # subcategory
        'C3',
        'race-middle',
        'race-high',
        'openbookqa_fact',
        # '内容总结' (summarization),  # subcategory
        'csl_dev',
        'lcsts',
        'Xsum',
        # '内容分析' (content analysis),  # subcategory
        'eprstmt-dev',
        'lambada',
        '--------- 推理 Reasoning ---------',  # category
        # '文本蕴含' (textual entailment),  # subcategory
        'cmnli',
        'ocnli',
        'AX_b',
        'AX_g',
        'RTE',
        # '常识推理' (commonsense reasoning),  # subcategory
        'COPA',
        'ReCoRD',
        'hellaswag',
        'piqa',
        'siqa',
        # '数学推理' (mathematical reasoning),  # subcategory
        'math',
        'gsm8k',
        # '定理应用' (theorem application),  # subcategory
        # '阅读理解' (reading comprehension),  # subcategory
        'drop',
        # '代码' (code),  # subcategory
        'openai_humaneval',
        'mbpp',
        # '综合推理' (comprehensive reasoning),  # subcategory
        'bbh',
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], []),
    prompt_db=dict(
        database_path='configs/datasets/log.json',
        config_dir='configs/datasets',
        blacklist='.promptignore'),
)

@@ -3,11 +3,14 @@ from mmengine.config import read_base
 with read_base():
     from .groups.agieval import agieval_summary_groups
     from .groups.mmlu import mmlu_summary_groups
+    from .groups.cmmlu import cmmlu_summary_groups
     from .groups.ceval import ceval_summary_groups
     from .groups.bbh import bbh_summary_groups
     from .groups.GaokaoBench import GaokaoBench_summary_groups
     from .groups.flores import flores_summary_groups
     from .groups.jigsaw_multilingual import jigsaw_multilingual_summary_groups
+    from .groups.tydiqa import tydiqa_summary_groups
+    from .groups.xiezhi import xiezhi_summary_groups

 summarizer = dict(
     dataset_abbrs=[

@@ -3,10 +3,13 @@ from mmengine.config import read_base
 with read_base():
     from .groups.agieval import agieval_summary_groups
     from .groups.mmlu import mmlu_summary_groups
+    from .groups.cmmlu import cmmlu_summary_groups
     from .groups.ceval import ceval_summary_groups
     from .groups.bbh import bbh_summary_groups
     from .groups.GaokaoBench import GaokaoBench_summary_groups
     from .groups.flores import flores_summary_groups
+    from .groups.tydiqa import tydiqa_summary_groups
+    from .groups.xiezhi import xiezhi_summary_groups

 summarizer = dict(
     dataset_abbrs = [

@@ -43,6 +43,33 @@ class ReCoRDDataset(BaseDataset):
        return dataset


class ReCoRDDataset_V2(BaseDataset):

    @staticmethod
    def load(path: str):
        with open(path, 'r', errors='ignore') as in_f:
            rows = []
            for i, line in enumerate(in_f):
                sample = json.loads(line.strip())
                text = sample['passage']['text'].replace(
                    '@highlight', '').replace('\n\n', '\n')
                for qas_dict in sample['qas']:
                    query = qas_dict['query'].replace('@placeholder', '____')
                    answers = [
                        answer_dict['text']
                        for answer_dict in qas_dict['answers']
                    ]
                    rows.append({
                        'text': text,
                        'question': query,
                        'answers': answers
                    })

            dataset = Dataset.from_list(rows)
            return dataset


@TEXT_POSTPROCESSORS.register_module('ReCoRD')
def ReCoRD_postprocess(text: str) -> str:
    text = text.strip().split('\n')[0].replace('Answer: ', '').strip()
    return text
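
For reference, load() above only touches a few fields of each val.jsonl line; the shape it assumes looks like this (field names taken from the code, values invented):

# Sketch: one ReCoRD record as consumed by ReCoRDDataset_V2.load().
sample = {
    'passage': {'text': 'Storm Debby hit Florida.\n\n@highlight\nDebby weakened'},
    'qas': [{
        'query': '@placeholder made landfall on Monday.',
        'answers': [{'text': 'Debby'}],
    }],
}
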
@@ -154,7 +154,7 @@ class SizePartitioner(BasePartitioner):
                 fnmatch(dataset_abbr, pattern)
                 for pattern in ('bbh*', 'gsm8k*', 'math*', 'strategyqa*',
                                 'agieval-jec*', 'agieval-gaokao-mathcloze',
-                                'agieval-math')):
+                                'agieval-math', '*professional_law')):
             factor *= 10

         return factor
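
The one-line change widens the set of globs whose datasets get a 10x size factor, so the partitioner splits them into smaller chunks; presumably this targets the slow professional_law subsets, such as the CMMLU one added in this commit. In isolation the check behaves like this (my sketch):

# Sketch: '*professional_law' now matches the CMMLU law subset's abbr.
from fnmatch import fnmatch

patterns = ('bbh*', 'gsm8k*', 'math*', 'strategyqa*', 'agieval-jec*',
            'agieval-gaokao-mathcloze', 'agieval-math', '*professional_law')
print(any(fnmatch('cmmlu-professional_law', p) for p in patterns))  # True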