mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Feature] Add qwen & qwen-chat support (#286)
* add and apply update suffix tool * add tool doc * add qwen configs * add cmmlu * rename bbh * update datasets * delete * update hf_qwen_7b.py
This commit is contained in:
parent
fd389e2d78
commit
7ca6ba625e
@ -0,0 +1,35 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import ReCoRDDataset_V2, ReCoRD_postprocess

# Reader: each sample provides the passage ('text') and the cloze-style query
# ('question'); the gold answer spans live in 'answers'.
ReCoRD_reader_cfg = dict(
    input_columns=['question', 'text'], output_column='answers')

# Zero-shot generation: ask the model which entity the '____' placeholder
# refers to, then extract the answer with ReCoRD_postprocess.
ReCoRD_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN", prompt="Passage:\n{text}\nResult:\n{question}\nQuestion:\nWhat entity does ____ refer to in the Result?\nAnswer:"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

# Exact-match scoring on the model ('BOT') turn after post-processing.
ReCoRD_eval_cfg = dict(
    evaluator=dict(type=EMEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=ReCoRD_postprocess))

ReCoRD_datasets = [
    dict(
        type=ReCoRDDataset_V2,
        abbr='ReCoRD',
        path='./data/SuperGLUE/ReCoRD/val.jsonl',
        reader_cfg=ReCoRD_reader_cfg,
        infer_cfg=ReCoRD_infer_cfg,
        eval_cfg=ReCoRD_eval_cfg)
]
|
105
configs/datasets/bbh/bbh_gen_e3d13a.py
Normal file
105
configs/datasets/bbh/bbh_gen_e3d13a.py
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
from os.path import exists

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess

bbh_reader_cfg = dict(input_columns=["input"], output_column="target")

_path_prefix = "./data/BBH"

# BBH tasks whose answers are multiple-choice labels (scored with accuracy
# after bbh_mcq_postprocess).
bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
# BBH tasks with free-form answers (scored with BBHEvaluator).
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    _hint = None
    # Few-shot CoT exemplars shipped alongside the BBH data. When the prompt
    # library is absent _hint stays None, so the literal string "None" lands
    # in the prompt — kept as-is to preserve existing behaviour.
    if exists(f"{_path_prefix}/lib_prompt/{_name}.txt"):
        with open(f"{_path_prefix}/lib_prompt/{_name}.txt", 'r') as _f:
            _hint = _f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role="HUMAN",
                    prompt=
                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_role="BOT",
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path=f"{_path_prefix}/data",
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:
    _hint = None
    if exists(f"{_path_prefix}/lib_prompt/{_name}.txt"):
        with open(f"{_path_prefix}/lib_prompt/{_name}.txt", 'r') as _f:
            _hint = _f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role="HUMAN",
                    prompt=
                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role="BOT")

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path=f"{_path_prefix}/data",
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

# Keep the module namespace clean: OpenCompass collects every top-level name,
# so loop temporaries must not leak.
del _name, _hint, _path_prefix
|
51
configs/datasets/collections/leaderboard/qwen.py
Normal file
51
configs/datasets/collections/leaderboard/qwen.py
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
from mmengine.config import read_base

# Leaderboard dataset collection for the qwen base model. Imports are grouped
# by leaderboard category: Exam / Language / Knowledge / Understanding /
# Reasoning.
with read_base():
    from ...ceval.ceval_ppl_578f8d import ceval_datasets
    from ...agieval.agieval_mixed_2f14ad import agieval_datasets
    from ...mmlu.mmlu_ppl_ac766d import mmlu_datasets
    from ...cmmlu.cmmlu_ppl_8b9c76 import cmmlu_datasets
    from ...GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
    from ...ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
    from ...ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets

    from ...SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
    from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
    from ...CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
    from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
    from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
    from ...flores.flores_gen_806ede import flores_datasets

    from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
    from ...commonsenseqa.commonsenseqa_ppl_5545e2 import commonsenseqa_datasets
    from ...triviaqa.triviaqa_gen_0356ec import triviaqa_datasets
    from ...nq.nq_gen_0356ec import nq_datasets

    from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
    from ...race.race_ppl_5831a0 import race_datasets
    from ...obqa.obqa_gen_9069e4 import obqa_datasets
    from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
    from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
    from ...Xsum.Xsum_gen_31397e import Xsum_datasets
    from ...FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
    from ...lambada.lambada_gen_217e11 import lambada_datasets

    from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
    from ...CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
    from ...SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
    from ...SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
    from ...SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
    from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
    from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_a69961 import ReCoRD_datasets
    from ...hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
    from ...piqa.piqa_gen_1194eb import piqa_datasets
    from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
    from ...math.math_gen_265cce import math_datasets
    from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from ...drop.drop_gen_599f07 import drop_datasets
    from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
    from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
    from ...bbh.bbh_gen_e3d13a import bbh_datasets


# Concatenate every imported *_datasets list into the single flat list that
# OpenCompass expects under the name 'datasets'.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
51
configs/datasets/collections/leaderboard/qwen_chat.py
Normal file
51
configs/datasets/collections/leaderboard/qwen_chat.py
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
from mmengine.config import read_base

# Leaderboard dataset collection for the qwen chat model (gen/ppl variants
# chosen for chat-style evaluation). Imports are grouped by leaderboard
# category: Exam / Language / Knowledge / Understanding / Reasoning.
with read_base():
    from ...ceval.ceval_gen_5f30c7 import ceval_datasets
    from ...agieval.agieval_mixed_2f14ad import agieval_datasets
    from ...mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from ...cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
    from ...GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
    from ...ARC_c.ARC_c_ppl_2ef631 import ARC_c_datasets
    from ...ARC_e.ARC_e_ppl_2ef631 import ARC_e_datasets

    from ...SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
    from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
    from ...CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
    from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
    from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
    from ...flores.flores_gen_806ede import flores_datasets

    from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
    from ...commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
    from ...triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
    from ...nq.nq_gen_c788f6 import nq_datasets

    from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
    from ...race.race_gen_69ee4f import race_datasets
    from ...obqa.obqa_ppl_6aac9e import obqa_datasets
    from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
    from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
    from ...Xsum.Xsum_gen_31397e import Xsum_datasets
    from ...FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
    from ...lambada.lambada_gen_217e11 import lambada_datasets

    from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
    from ...CLUE_ocnli.CLUE_ocnli_ppl_fdc6de import ocnli_datasets
    from ...SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
    from ...SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
    from ...SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
    from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
    from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
    from ...hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets
    from ...piqa.piqa_ppl_0cfff2 import piqa_datasets
    from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
    from ...math.math_gen_265cce import math_datasets
    from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from ...drop.drop_gen_599f07 import drop_datasets
    from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
    from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
    from ...bbh.bbh_gen_6bd693 import bbh_datasets


# Concatenate every imported *_datasets list into the single flat list that
# OpenCompass expects under the name 'datasets'.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
58
configs/eval_qwen_7b.py
Normal file
58
configs/eval_qwen_7b.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
from mmengine.config import read_base

# Top-level evaluation entry for qwen-7b (base): model + leaderboard dataset
# collection + leaderboard summarizer.
with read_base():
    from .models.hf_qwen_7b import models
    from .datasets.collections.leaderboard.qwen import datasets
    from .summarizers.leaderboard import summarizer

# Reference results recorded for this config (kept for regression comparison).
'''
dataset                                 version    metric            mode      qwen-7b-hf
--------------------------------------  ---------  ----------------  ------  ------------
--------- 考试 Exam ---------           -          -                 -       -
ceval                                   -          naive_average     ppl     58.65
agieval                                 -          naive_average     mixed   40.49
mmlu                                    -          naive_average     ppl     57.78
cmmlu                                   -          naive_average     ppl     58.57
GaokaoBench                             -          weighted_average  mixed   51.76
ARC-c                                   72cf91     accuracy          gen     83.73
ARC-e                                   72cf91     accuracy          gen     90.65
--------- 语言 Language ---------       -          -                 -       -
WiC                                     ce62e6     accuracy          ppl     51.10
chid-dev                                25f3d3     accuracy          ppl     86.63
afqmc-dev                               cc328c     accuracy          ppl     69.00
WSC                                     678cb5     accuracy          ppl     63.46
tydiqa-goldp                            -          naive_average     gen     19.98
flores_100                              -          naive_average     gen     3.20
--------- 知识 Knowledge ---------      -          -                 -       -
BoolQ                                   463fee     accuracy          ppl     83.00
commonsense_qa                          0d8e25     accuracy          ppl     67.49
triviaqa                                b6904f     score             gen     40.45
nq                                      b6904f     score             gen     14.16
--------- 理解 Understanding ---------  -          -                 -       -
C3                                      e6778d     accuracy          gen     75.29
race-middle                             73bdec     accuracy          ppl     90.53
race-high                               73bdec     accuracy          ppl     87.71
openbookqa_fact                         fa871c     accuracy          gen     92.20
csl_dev                                 3c4211     accuracy          ppl     56.25
lcsts                                   0b3969     rouge1            gen     12.38
Xsum                                    207e69     rouge1            gen     36.00
eprstmt-dev                             101429     accuracy          gen     89.38
lambada                                 de1af2     accuracy          gen     67.88
--------- 推理 Reasoning ---------      -          -                 -       -
cmnli                                   15e783     accuracy          ppl     54.85
ocnli                                   1471e7     accuracy          gen     42.34
AX_b                                    793c72     accuracy          gen     58.61
AX_g                                    c4c886     accuracy          gen     69.10
RTE                                     c4c886     accuracy          gen     57.76
COPA                                    59f42c     accuracy          gen     88.00
ReCoRD                                  3e0689     score             gen     27.78
hellaswag                               06a1e2     accuracy          gen     92.47
piqa                                    24369d     accuracy          gen     78.02
siqa                                    ea30d1     accuracy          ppl     75.03
math                                    2c0b9e     accuracy          gen     11.06
gsm8k                                   4c7f6e     accuracy          gen     50.87
drop                                    53a0a7     score             gen     44.95
openai_humaneval                        dd0dff     humaneval_pass@1  gen     23.78
mbpp                                    60ca11     score             gen     31.20
bbh                                     -          naive_average     gen     40.03
'''
|
58
configs/eval_qwen_7b_chat.py
Normal file
58
configs/eval_qwen_7b_chat.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
from mmengine.config import read_base

# Top-level evaluation entry for qwen-7b-chat: model + chat-oriented
# leaderboard dataset collection + leaderboard summarizer.
with read_base():
    from .models.hf_qwen_7b_chat import models
    from .datasets.collections.leaderboard.qwen_chat import datasets
    from .summarizers.leaderboard import summarizer

# Reference results recorded for this config (kept for regression comparison).
'''
dataset                                 version    metric            mode      qwen-7b-chat-hf
--------------------------------------  ---------  ----------------  ------  -----------------
--------- 考试 Exam ---------           -          -                 -       -
ceval                                   -          naive_average     gen     56.07
agieval                                 -          naive_average     mixed   39.51
mmlu                                    -          naive_average     gen     53.49
cmmlu                                   -          naive_average     gen     55.29
GaokaoBench                             -          weighted_average  gen     48.01
ARC-c                                   ca1e8e     accuracy          ppl     74.92
ARC-e                                   ca1e8e     accuracy          ppl     85.71
--------- 语言 Language ---------       -          -                 -       -
WiC                                     efbd01     accuracy          gen     51.41
chid-dev                                25f3d3     accuracy          ppl     77.72
afqmc-dev                               4a1636     accuracy          gen     69.00
WSC                                     678cb5     accuracy          ppl     67.31
tydiqa-goldp                            -          naive_average     gen     15.32
flores_100                              -          naive_average     gen     10.00
--------- 知识 Knowledge ---------      -          -                 -       -
BoolQ                                   463fee     accuracy          ppl     83.18
commonsense_qa                          ddaabf     accuracy          gen     76.41
triviaqa                                b6904f     score             gen     43.25
nq                                      23dc1a     score             gen     16.26
--------- 理解 Understanding ---------  -          -                 -       -
C3                                      e6778d     accuracy          gen     81.53
race-middle                             e0908b     accuracy          gen     83.01
race-high                               e0908b     accuracy          gen     77.79
openbookqa_fact                         49689a     accuracy          ppl     86.40
csl_dev                                 3c4211     accuracy          ppl     64.38
lcsts                                   0b3969     rouge1            gen     12.75
Xsum                                    207e69     rouge1            gen     20.21
eprstmt-dev                             ed0c5d     accuracy          ppl     85.00
lambada                                 de1af2     accuracy          gen     59.19
--------- 推理 Reasoning ---------      -          -                 -       -
cmnli                                   15e783     accuracy          ppl     48.08
ocnli                                   15e783     accuracy          ppl     51.40
AX_b                                    689df1     accuracy          ppl     65.67
AX_g                                    808a19     accuracy          ppl     76.12
RTE                                     808a19     accuracy          ppl     68.95
COPA                                    59f42c     accuracy          gen     92.00
ReCoRD                                  6f7cfc     score             gen     0.16
hellaswag                               8d79e0     accuracy          ppl     69.28
piqa                                    34eee7     accuracy          ppl     72.20
siqa                                    ea30d1     accuracy          ppl     72.88
math                                    2c0b9e     accuracy          gen     7.84
gsm8k                                   4c7f6e     accuracy          gen     45.41
drop                                    53a0a7     score             gen     39.62
openai_humaneval                        dd0dff     humaneval_pass@1  gen     10.98
mbpp                                    60ca11     score             gen     20.60
bbh                                     -          naive_average     gen     42.61
'''
|
32
configs/models/hf_qwen_7b.py
Normal file
32
configs/models/hf_qwen_7b.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
from opencompass.models import HuggingFaceCausalLM

# Please note that we have specified the revision here. Recently (on 20230827),
# during our evaluations, we found that the newer revision models have a drop
# of more than 5 points on datasets like GaokaoBench / mbpp.
# We are not yet sure whether this drop is due to incorrect logic in OpenCompass
# calling qwen or some other reasons. We would like to highlight this.

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen-7b-hf',
        path="Qwen/Qwen-7B",
        tokenizer_path='Qwen/Qwen-7B',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
            # Pin the tokenizer to the evaluated snapshot (see note above).
            revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09'
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
            # Same pinned snapshot for the model weights.
            revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09'
        ),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
|
29
configs/models/hf_qwen_7b_chat.py
Normal file
29
configs/models/hf_qwen_7b_chat.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
from opencompass.models import HuggingFaceCausalLM

# ChatML-style dialogue wrapper used by Qwen chat models: each turn is framed
# with <|im_start|>{role} ... <|im_end|> markers; generation starts at the
# assistant turn.
_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'),
        dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True),
    ],
)

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen-7b-chat-hf',
        path="Qwen/Qwen-7B-Chat",
        tokenizer_path='Qwen/Qwen-7B-Chat',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        meta_template=_meta_template,
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
|
@ -3,10 +3,13 @@ from mmengine.config import read_base
|
|||||||
with read_base():
|
with read_base():
|
||||||
from .groups.agieval import agieval_summary_groups
|
from .groups.agieval import agieval_summary_groups
|
||||||
from .groups.mmlu import mmlu_summary_groups
|
from .groups.mmlu import mmlu_summary_groups
|
||||||
|
from .groups.cmmlu import cmmlu_summary_groups
|
||||||
from .groups.ceval import ceval_summary_groups
|
from .groups.ceval import ceval_summary_groups
|
||||||
from .groups.bbh import bbh_summary_groups
|
from .groups.bbh import bbh_summary_groups
|
||||||
from .groups.GaokaoBench import GaokaoBench_summary_groups
|
from .groups.GaokaoBench import GaokaoBench_summary_groups
|
||||||
from .groups.flores import flores_summary_groups
|
from .groups.flores import flores_summary_groups
|
||||||
|
from .groups.tydiqa import tydiqa_summary_groups
|
||||||
|
from .groups.xiezhi import xiezhi_summary_groups
|
||||||
|
|
||||||
summarizer = dict(
|
summarizer = dict(
|
||||||
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
|
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
|
||||||
|
104
configs/summarizers/groups/cmmlu.py
Normal file
104
configs/summarizers/groups/cmmlu.py
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
# Maps each CMMLU subject to its subcategory tag(s); subjects tagged
# 'china specific' additionally feed the China-specific summary group.
subcategories = {
    "agronomy": ['other'],
    "anatomy": ['biology'],
    "ancient_chinese": ['linguistics', 'china specific'],
    "arts": ['arts'],
    "astronomy": ['physics'],
    "business_ethics": ['business'],
    "chinese_civil_service_exam": ['politics', 'china specific'],
    "chinese_driving_rule": ['other', 'china specific'],
    "chinese_food_culture": ['culture', 'china specific'],
    "chinese_foreign_policy": ['politics', 'china specific'],
    "chinese_history": ['history', 'china specific'],
    "chinese_literature": ['literature', 'china specific'],
    "chinese_teacher_qualification": ['education', 'china specific'],
    "college_actuarial_science": ['math'],
    "college_education": ['education'],
    "college_engineering_hydrology": ['engineering'],
    "college_law": ['law'],
    "college_mathematics": ['math'],
    "college_medical_statistics": ['statistics'],
    "clinical_knowledge": ['other'],
    "college_medicine": ['other'],
    "computer_science": ['computer science'],
    "computer_security": ['other'],
    "conceptual_physics": ['physics'],
    "construction_project_management": ['other', 'china specific'],
    "economics": ['economics'],
    "education": ['education'],
    "elementary_chinese": ['linguistics', 'china specific'],
    "elementary_commonsense": ['other', 'china specific'],
    "elementary_information_and_technology": ['other'],
    "electrical_engineering": ['engineering'],
    "elementary_mathematics": ['math'],
    "ethnology": ['culture', 'china specific'],
    "food_science": ['other'],
    "genetics": ['biology'],
    "global_facts": ['global'],
    "high_school_biology": ['biology'],
    "high_school_chemistry": ['chemistry'],
    "high_school_geography": ['geography'],
    "high_school_mathematics": ['math'],
    "high_school_physics": ['physics'],
    "high_school_politics": ['politics', 'china specific'],
    "human_sexuality": ['other'],
    "international_law": ['law'],
    "journalism": ['sociology'],
    "jurisprudence": ['law'],
    "legal_and_moral_basis": ['other'],
    "logical": ['philosophy'],
    "machine_learning": ['computer science'],
    "management": ['business'],
    "marketing": ['business'],
    "marxist_theory": ['philosophy'],
    "modern_chinese": ['linguistics', 'china specific'],
    "nutrition": ['other'],
    "philosophy": ['philosophy'],
    "professional_accounting": ['business'],
    "professional_law": ['law'],
    "professional_medicine": ['other'],
    "professional_psychology": ['psychology'],
    "public_relations": ['politics'],
    "security_study": ['politics'],
    "sociology": ['culture'],
    "sports_science": ['other'],
    "traditional_chinese_medicine": ['other', 'china specific'],
    "virology": ['biology'],
    "world_history": ['history'],
    "world_religions": ['global'],
}

# Top-level categories, each listing the subcategory tags it aggregates.
categories = {
    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"],
    "Humanities": ["history", "philosophy", "law", "arts", "literature", "global"],
    "Social Science": ['linguistics', "business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"],
    "Other": ["other"],
    "China specific": ["china specific"],
}

# Invert the two maps: category name -> list of subjects (a subject appears
# under every category that contains one of its tags).
category2subject = {}
for k, v in categories.items():
    for subject, subcat in subcategories.items():
        for c in subcat:
            if c in v:
                category2subject.setdefault(k, []).append(subject)

cmmlu_summary_groups = []

_cmmlu_humanities = ['cmmlu-' + s for s in category2subject['Humanities']]
cmmlu_summary_groups.append({'name': 'cmmlu-humanities', 'subsets': _cmmlu_humanities})

_cmmlu_stem = ['cmmlu-' + s for s in category2subject['STEM']]
cmmlu_summary_groups.append({'name': 'cmmlu-stem', 'subsets': _cmmlu_stem})

_cmmlu_social_science = ['cmmlu-' + s for s in category2subject['Social Science']]
cmmlu_summary_groups.append({'name': 'cmmlu-social-science', 'subsets': _cmmlu_social_science})

_cmmlu_other = ['cmmlu-' + s for s in category2subject['Other']]
cmmlu_summary_groups.append({'name': 'cmmlu-other', 'subsets': _cmmlu_other})

_cmmlu_china_specific = ['cmmlu-' + s for s in category2subject['China specific']]
cmmlu_summary_groups.append({'name': 'cmmlu-china-specific', 'subsets': _cmmlu_china_specific})

# The overall 'cmmlu' group averages every subject.
_cmmlu_all = ['cmmlu-' + s for s in subcategories.keys()]
cmmlu_summary_groups.append({'name': 'cmmlu', 'subsets': _cmmlu_all})
5
configs/summarizers/groups/tydiqa.py
Normal file
5
configs/summarizers/groups/tydiqa.py
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
# Summary group averaging the per-language TyDiQA-GoldP scores.
tydiqa_summary_groups = []

_tydiqa = ['arabic', 'bengali', 'english', 'finnish', 'indonesian', 'japanese', 'korean', 'russian', 'swahili', 'telugu', 'thai']
# NOTE(review): 'tyidqa' looks like a typo for 'tydiqa', but this prefix must
# match the abbrs emitted by the tydiqa dataset configs — confirm both sides
# before renaming, otherwise the summary group silently matches nothing.
_tydiqa = ['tyidqa-goldp_' + s for s in _tydiqa]
tydiqa_summary_groups.append({'name': 'tydiqa-goldp', 'subsets': _tydiqa})
|
4
configs/summarizers/groups/xiezhi.py
Normal file
4
configs/summarizers/groups/xiezhi.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
# Summary group averaging the four Xiezhi splits
# (specialized/interdisciplinary x English/Chinese).
xiezhi_summary_groups = []

_xiezhi = ["xiezhi-spec_eng", "xiezhi-spec_chn", "xiezhi-inter_eng", "xiezhi-inter_chn"]
xiezhi_summary_groups.append({'name': 'xiezhi', 'subsets': _xiezhi})
|
89
configs/summarizers/leaderboard.py
Normal file
89
configs/summarizers/leaderboard.py
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
from mmengine.config import read_base

with read_base():
    from .groups.agieval import agieval_summary_groups
    from .groups.mmlu import mmlu_summary_groups
    from .groups.cmmlu import cmmlu_summary_groups
    from .groups.ceval import ceval_summary_groups
    from .groups.bbh import bbh_summary_groups
    from .groups.GaokaoBench import GaokaoBench_summary_groups
    from .groups.flores import flores_summary_groups
    from .groups.tydiqa import tydiqa_summary_groups
    from .groups.xiezhi import xiezhi_summary_groups


# Leaderboard report layout: dataset_abbrs fixes the row order; the
# '---------' entries are visual category separators, not datasets.
summarizer = dict(
    dataset_abbrs=[
        '--------- 考试 Exam ---------',  # category
        # 'Mixed',  # subcategory
        "ceval",
        'agieval',
        'mmlu',
        'cmmlu',
        "GaokaoBench",
        'ARC-c',
        'ARC-e',
        '--------- 语言 Language ---------',  # category
        # '字词释义',  # subcategory
        'WiC',
        # '成语习语',  # subcategory
        'chid-dev',
        # '语义相似度',  # subcategory
        'afqmc-dev',
        # '指代消解',  # subcategory
        'WSC',
        # '多语种问答',  # subcategory
        'tydiqa-goldp',
        # '翻译',  # subcategory
        'flores_100',
        '--------- 知识 Knowledge ---------',  # category
        # '知识问答',  # subcategory
        'BoolQ',
        'commonsense_qa',
        'triviaqa',
        'nq',
        '--------- 理解 Understanding ---------',  # category
        # '阅读理解',  # subcategory
        'C3',
        'race-middle',
        'race-high',
        'openbookqa_fact',
        # '内容总结',  # subcategory
        'csl_dev',
        'lcsts',
        'Xsum',
        # '内容分析',  # subcategory
        'eprstmt-dev',
        'lambada',
        '--------- 推理 Reasoning ---------',  # category
        # '文本蕴含',  # subcategory
        'cmnli',
        'ocnli',
        'AX_b',
        'AX_g',
        'RTE',
        # '常识推理',  # subcategory
        'COPA',
        'ReCoRD',
        'hellaswag',
        'piqa',
        'siqa',
        # '数学推理',  # subcategory
        'math',
        'gsm8k',
        # '定理应用',  # subcategory
        # '阅读理解',  # subcategory
        'drop',
        # '代码',  # subcategory
        'openai_humaneval',
        'mbpp',
        # '综合推理',  # subcategory
        "bbh",
    ],
    # Collect every imported *_summary_groups list into one flat list.
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], []),
    prompt_db=dict(
        database_path='configs/datasets/log.json',
        config_dir='configs/datasets',
        blacklist='.promptignore'),
)
|
@ -3,11 +3,14 @@ from mmengine.config import read_base
|
|||||||
with read_base():
|
with read_base():
|
||||||
from .groups.agieval import agieval_summary_groups
|
from .groups.agieval import agieval_summary_groups
|
||||||
from .groups.mmlu import mmlu_summary_groups
|
from .groups.mmlu import mmlu_summary_groups
|
||||||
|
from .groups.cmmlu import cmmlu_summary_groups
|
||||||
from .groups.ceval import ceval_summary_groups
|
from .groups.ceval import ceval_summary_groups
|
||||||
from .groups.bbh import bbh_summary_groups
|
from .groups.bbh import bbh_summary_groups
|
||||||
from .groups.GaokaoBench import GaokaoBench_summary_groups
|
from .groups.GaokaoBench import GaokaoBench_summary_groups
|
||||||
from .groups.flores import flores_summary_groups
|
from .groups.flores import flores_summary_groups
|
||||||
from .groups.jigsaw_multilingual import jigsaw_multilingual_summary_groups
|
from .groups.jigsaw_multilingual import jigsaw_multilingual_summary_groups
|
||||||
|
from .groups.tydiqa import tydiqa_summary_groups
|
||||||
|
from .groups.xiezhi import xiezhi_summary_groups
|
||||||
|
|
||||||
summarizer = dict(
|
summarizer = dict(
|
||||||
dataset_abbrs=[
|
dataset_abbrs=[
|
||||||
|
@ -3,10 +3,13 @@ from mmengine.config import read_base
|
|||||||
with read_base():
|
with read_base():
|
||||||
from .groups.agieval import agieval_summary_groups
|
from .groups.agieval import agieval_summary_groups
|
||||||
from .groups.mmlu import mmlu_summary_groups
|
from .groups.mmlu import mmlu_summary_groups
|
||||||
|
from .groups.cmmlu import cmmlu_summary_groups
|
||||||
from .groups.ceval import ceval_summary_groups
|
from .groups.ceval import ceval_summary_groups
|
||||||
from .groups.bbh import bbh_summary_groups
|
from .groups.bbh import bbh_summary_groups
|
||||||
from .groups.GaokaoBench import GaokaoBench_summary_groups
|
from .groups.GaokaoBench import GaokaoBench_summary_groups
|
||||||
from .groups.flores import flores_summary_groups
|
from .groups.flores import flores_summary_groups
|
||||||
|
from .groups.tydiqa import tydiqa_summary_groups
|
||||||
|
from .groups.xiezhi import xiezhi_summary_groups
|
||||||
|
|
||||||
summarizer = dict(
|
summarizer = dict(
|
||||||
dataset_abbrs = [
|
dataset_abbrs = [
|
||||||
|
@ -43,6 +43,33 @@ class ReCoRDDataset(BaseDataset):
|
|||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
|
|
||||||
|
class ReCoRDDataset_V2(BaseDataset):
    """ReCoRD loaded directly from the SuperGLUE jsonl dump.

    Each input line holds one passage plus its cloze-style queries; every
    query becomes its own row carrying the cleaned passage ('text'), the
    query with '@placeholder' rewritten to '____' ('question'), and the
    list of gold answer spans ('answers').
    """

    @staticmethod
    def load(path: str):
        rows = []
        # errors='ignore': the raw dump may contain undecodable bytes.
        with open(path, 'r', errors='ignore') as in_f:
            for line in in_f:
                sample = json.loads(line.strip())
                # '@highlight' markers separate bullet points in the source
                # passage; drop them and collapse the double newlines.
                text = sample['passage']['text'].replace('@highlight',
                                                         '').replace(
                                                             '\n\n', '\n')
                for qas_dict in sample['qas']:
                    query = qas_dict['query'].replace('@placeholder', '____')
                    answers = [
                        answer_dict['text']
                        for answer_dict in qas_dict['answers']
                    ]
                    rows.append({
                        'text': text,
                        'question': query,
                        'answers': answers
                    })

        dataset = Dataset.from_list(rows)
        return dataset
||||||
|
|
||||||
|
|
||||||
@TEXT_POSTPROCESSORS.register_module('ReCoRD')
|
@TEXT_POSTPROCESSORS.register_module('ReCoRD')
|
||||||
def ReCoRD_postprocess(text: str) -> str:
|
def ReCoRD_postprocess(text: str) -> str:
|
||||||
text = text.strip().split('\n')[0].replace('Answer: ', '').strip()
|
text = text.strip().split('\n')[0].replace('Answer: ', '').strip()
|
||||||
|
@ -154,7 +154,7 @@ class SizePartitioner(BasePartitioner):
|
|||||||
fnmatch(dataset_abbr, pattern)
|
fnmatch(dataset_abbr, pattern)
|
||||||
for pattern in ('bbh*', 'gsm8k*', 'math*', 'strategyqa*',
|
for pattern in ('bbh*', 'gsm8k*', 'math*', 'strategyqa*',
|
||||||
'agieval-jec*', 'agieval-gaokao-mathcloze',
|
'agieval-jec*', 'agieval-gaokao-mathcloze',
|
||||||
'agieval-math')):
|
'agieval-math', '*professional_law')):
|
||||||
factor *= 10
|
factor *= 10
|
||||||
|
|
||||||
return factor
|
return factor
|
||||||
|
Loading…
Reference in New Issue
Block a user