diff --git a/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_a69961.py b/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_a69961.py
new file mode 100644
index 00000000..ecc103b1
--- /dev/null
+++ b/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_a69961.py
@@ -0,0 +1,35 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import EMEvaluator
+from opencompass.datasets import ReCoRDDataset_V2, ReCoRD_postprocess
+
+ReCoRD_reader_cfg = dict(
+    input_columns=['question', 'text'], output_column='answers')
+
+ReCoRD_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role="HUMAN", prompt="Passage:\n{text}\nResult:\n{question}\nQuestion:\nWhat entity does ____ refer to in the Result?\nAnswer:"
+            ),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer))
+
+ReCoRD_eval_cfg = dict(
+    evaluator=dict(type=EMEvaluator),
+    pred_role='BOT',
+    pred_postprocessor=dict(type=ReCoRD_postprocess))
+
+ReCoRD_datasets = [
+    dict(
+        type=ReCoRDDataset_V2,
+        abbr='ReCoRD',
+        path='./data/SuperGLUE/ReCoRD/val.jsonl',
+        reader_cfg=ReCoRD_reader_cfg,
+        infer_cfg=ReCoRD_infer_cfg,
+        eval_cfg=ReCoRD_eval_cfg)
+]
diff --git a/configs/datasets/bbh/bbh_gen_e3d13a.py b/configs/datasets/bbh/bbh_gen_e3d13a.py
new file mode 100644
index 00000000..3441348e
--- /dev/null
+++ b/configs/datasets/bbh/bbh_gen_e3d13a.py
@@ -0,0 +1,112 @@
+from pathlib import Path
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess
+
+bbh_reader_cfg = dict(input_columns=["input"], output_column="target")
+
+_path_prefix = "./data/BBH"
+
+bbh_multiple_choice_sets = [
+    'temporal_sequences',
+    'disambiguation_qa',
+    'date_understanding',
+    'tracking_shuffled_objects_three_objects',
+    'penguins_in_a_table',
+    'geometric_shapes',
+    'snarks',
+    'ruin_names',
+    'tracking_shuffled_objects_seven_objects',
+    'tracking_shuffled_objects_five_objects',
+    'logical_deduction_three_objects',
+    'hyperbaton',
+    'logical_deduction_five_objects',
+    'logical_deduction_seven_objects',
+    'movie_recommendation',
+    'salient_translation_error_detection',
+    'reasoning_about_colored_objects',
+]
+bbh_free_form_sets = [
+    'multistep_arithmetic_two',
+    'navigate',
+    'dyck_languages',
+    'word_sorting',
+    'sports_understanding',
+    'boolean_expressions',
+    'object_counting',
+    'formal_fallacies',
+    'causal_judgement',
+    'web_of_lies',
+]
+
+bbh_datasets = []
+for _name in bbh_multiple_choice_sets:
+    # Default to an empty hint so the prompt below never renders the literal
+    # string "None" when a few-shot prompt file is missing.
+    _hint = ''
+    _hint_path = Path(f"{_path_prefix}/lib_prompt/{_name}.txt")
+    if _hint_path.exists():
+        _hint = _hint_path.read_text()
+    bbh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role="HUMAN",
+                    prompt=
+                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: "
+                )
+            ])),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=512))
+    bbh_eval_cfg = dict(
+        evaluator=dict(type=AccEvaluator),
+        pred_role="BOT",
+        pred_postprocessor=dict(type=bbh_mcq_postprocess),
+        dataset_postprocessor=dict(type=bbh_mcq_postprocess))
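+    # Note: `.copy()` below makes shallow copies, so each dataset entry owns
+    # its top-level config dict while the nested dicts stay shared; this is
+    # safe because the shared dicts are never mutated after creation.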
+
+    bbh_datasets.append(
+        dict(
+            type=BBHDataset,
+            path=f"{_path_prefix}/data",
+            name=_name,
+            abbr='bbh-' + _name,
+            reader_cfg=bbh_reader_cfg,
+            infer_cfg=bbh_infer_cfg.copy(),
+            eval_cfg=bbh_eval_cfg.copy()))
+
+for _name in bbh_free_form_sets:
+    _hint = ''  # empty default, as in the loop above
+    _hint_path = Path(f"{_path_prefix}/lib_prompt/{_name}.txt")
+    if _hint_path.exists():
+        _hint = _hint_path.read_text()
+    bbh_infer_cfg = dict(
+        prompt_template=dict(
+            type=PromptTemplate,
+            template=dict(round=[
+                dict(
+                    role="HUMAN",
+                    prompt=
+                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: "
+                )
+            ])),
+        retriever=dict(type=ZeroRetriever),
+        inferencer=dict(type=GenInferencer, max_out_len=512))
+    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role="BOT")
+
+    bbh_datasets.append(
+        dict(
+            type=BBHDataset,
+            path=f"{_path_prefix}/data",
+            name=_name,
+            abbr='bbh-' + _name,
+            reader_cfg=bbh_reader_cfg,
+            infer_cfg=bbh_infer_cfg.copy(),
+            eval_cfg=bbh_eval_cfg.copy()))
+
+del _name, _hint, _path_prefix, _hint_path
diff --git a/configs/datasets/collections/leaderboard/qwen.py b/configs/datasets/collections/leaderboard/qwen.py
new file mode 100644
index 00000000..fb441bb0
--- /dev/null
+++ b/configs/datasets/collections/leaderboard/qwen.py
@@ -0,0 +1,51 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ...ceval.ceval_ppl_578f8d import ceval_datasets
+    from ...agieval.agieval_mixed_2f14ad import agieval_datasets
+    from ...mmlu.mmlu_ppl_ac766d import mmlu_datasets
+    from ...cmmlu.cmmlu_ppl_8b9c76 import cmmlu_datasets
+    from ...GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
+    from ...ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
+    from ...ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
+
+    from ...SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
+    from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
+    from ...CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
+    from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
+    from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
+    from ...flores.flores_gen_806ede import flores_datasets
+
+    from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
+    from ...commonsenseqa.commonsenseqa_ppl_5545e2 import commonsenseqa_datasets
+    from ...triviaqa.triviaqa_gen_0356ec import triviaqa_datasets
+    from ...nq.nq_gen_0356ec import nq_datasets
+
+    from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
+    from ...race.race_ppl_5831a0 import race_datasets
+    from ...obqa.obqa_gen_9069e4 import obqa_datasets
+    from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
+    from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
+    from ...Xsum.Xsum_gen_31397e import Xsum_datasets
+    from ...FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
+    from ...lambada.lambada_gen_217e11 import lambada_datasets
+
+    from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
+    from ...CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
+    from ...SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
+    from ...SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
+    from ...SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
+    from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
+    from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_a69961 import ReCoRD_datasets
+    from ...hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
+    from ...piqa.piqa_gen_1194eb import piqa_datasets
+    from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
+    from ...math.math_gen_265cce import math_datasets
+    from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+    from ...drop.drop_gen_599f07 import drop_datasets
+    from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
+    from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ...bbh.bbh_gen_e3d13a import bbh_datasets
+
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
diff --git a/configs/datasets/collections/leaderboard/qwen_chat.py b/configs/datasets/collections/leaderboard/qwen_chat.py
new file mode 100644
index 00000000..adf5106b
--- /dev/null
+++ b/configs/datasets/collections/leaderboard/qwen_chat.py
@@ -0,0 +1,51 @@
+from mmengine.config import read_base
+
+with read_base():
+    from ...ceval.ceval_gen_5f30c7 import ceval_datasets
+    from ...agieval.agieval_mixed_2f14ad import agieval_datasets
+    from ...mmlu.mmlu_gen_a484b3 import mmlu_datasets
+    from ...cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
+    from ...GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
+    from ...ARC_c.ARC_c_ppl_2ef631 import ARC_c_datasets
+    from ...ARC_e.ARC_e_ppl_2ef631 import ARC_e_datasets
+
+    from ...SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
+    from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
+    from ...CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
+    from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
+    from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
+    from ...flores.flores_gen_806ede import flores_datasets
+
+    from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
+    from ...commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
+    from ...triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
+    from ...nq.nq_gen_c788f6 import nq_datasets
+
+    from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
+    from ...race.race_gen_69ee4f import race_datasets
+    from ...obqa.obqa_ppl_6aac9e import obqa_datasets
+    from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
+    from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
+    from ...Xsum.Xsum_gen_31397e import Xsum_datasets
+    from ...FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
+    from ...lambada.lambada_gen_217e11 import lambada_datasets
+
+    from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
+    from ...CLUE_ocnli.CLUE_ocnli_ppl_fdc6de import ocnli_datasets
+    from ...SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
+    from ...SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
+    from ...SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
+    from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
+    from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
+    from ...hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets
+    from ...piqa.piqa_ppl_0cfff2 import piqa_datasets
+    from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
+    from ...math.math_gen_265cce import math_datasets
+    from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
+    from ...drop.drop_gen_599f07 import drop_datasets
+    from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
+    from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ...bbh.bbh_gen_6bd693 import bbh_datasets
+
+
+datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
diff --git a/configs/eval_qwen_7b.py b/configs/eval_qwen_7b.py
new file mode 100644
index 00000000..ba465fa6
--- /dev/null
+++ b/configs/eval_qwen_7b.py
@@ -0,0 +1,58 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .models.hf_qwen_7b import models
+    from .datasets.collections.leaderboard.qwen import datasets
+    from .summarizers.leaderboard import summarizer
+
+'''
+dataset                                version    metric            mode    qwen-7b-hf
+-------------------------------------- ---------  ----------------  ------  ------------
+--------- 考试 Exam ---------          -          -                 -       -
+ceval                                  -          naive_average     ppl     58.65
+agieval                                -          naive_average     mixed   40.49
+mmlu                                   -          naive_average     ppl     57.78
+cmmlu                                  -          naive_average     ppl     58.57
+GaokaoBench                            -          weighted_average  mixed   51.76
+ARC-c                                  72cf91     accuracy          gen     83.73
+ARC-e                                  72cf91     accuracy          gen     90.65
+--------- 语言 Language ---------      -          -                 -       -
+WiC                                    ce62e6     accuracy          ppl     51.10
+chid-dev                               25f3d3     accuracy          ppl     86.63
+afqmc-dev                              cc328c     accuracy          ppl     69.00
+WSC                                    678cb5     accuracy          ppl     63.46
+tydiqa-goldp                           -          naive_average     gen     19.98
+flores_100                             -          naive_average     gen     3.20
+--------- 知识 Knowledge ---------     -          -                 -       -
+BoolQ                                  463fee     accuracy          ppl     83.00
+commonsense_qa                         0d8e25     accuracy          ppl     67.49
+triviaqa                               b6904f     score             gen     40.45
+nq                                     b6904f     score             gen     14.16
+--------- 理解 Understanding --------- -          -                 -       -
+C3                                     e6778d     accuracy          gen     75.29
+race-middle                            73bdec     accuracy          ppl     90.53
+race-high                              73bdec     accuracy          ppl     87.71
+openbookqa_fact                        fa871c     accuracy          gen     92.20
+csl_dev                                3c4211     accuracy          ppl     56.25
+lcsts                                  0b3969     rouge1            gen     12.38
+Xsum                                   207e69     rouge1            gen     36.00
+eprstmt-dev                            101429     accuracy          gen     89.38
+lambada                                de1af2     accuracy          gen     67.88
+--------- 推理 Reasoning ---------     -          -                 -       -
+cmnli                                  15e783     accuracy          ppl     54.85
+ocnli                                  1471e7     accuracy          gen     42.34
+AX_b                                   793c72     accuracy          gen     58.61
+AX_g                                   c4c886     accuracy          gen     69.10
+RTE                                    c4c886     accuracy          gen     57.76
+COPA                                   59f42c     accuracy          gen     88.00
+ReCoRD                                 3e0689     score             gen     27.78
+hellaswag                              06a1e2     accuracy          gen     92.47
+piqa                                   24369d     accuracy          gen     78.02
+siqa                                   ea30d1     accuracy          ppl     75.03
+math                                   2c0b9e     accuracy          gen     11.06
+gsm8k                                  4c7f6e     accuracy          gen     50.87
+drop                                   53a0a7     score             gen     44.95
+openai_humaneval                       dd0dff     humaneval_pass@1  gen     23.78
+mbpp                                   60ca11     score             gen     31.20
+bbh                                    -          naive_average     gen     40.03
+'''
diff --git a/configs/eval_qwen_7b_chat.py b/configs/eval_qwen_7b_chat.py
new file mode 100644
index 00000000..28b86113
--- /dev/null
+++ b/configs/eval_qwen_7b_chat.py
@@ -0,0 +1,58 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .models.hf_qwen_7b_chat import models
+    from .datasets.collections.leaderboard.qwen_chat import datasets
+    from .summarizers.leaderboard import summarizer
+
+'''
+dataset                                version    metric            mode    qwen-7b-chat-hf
+-------------------------------------- ---------  ----------------  ------  -----------------
+--------- 考试 Exam ---------          -          -                 -       -
+ceval                                  -          naive_average     gen     56.07
+agieval                                -          naive_average     mixed   39.51
+mmlu                                   -          naive_average     gen     53.49
+cmmlu                                  -          naive_average     gen     55.29
+GaokaoBench                            -          weighted_average  gen     48.01
+ARC-c                                  ca1e8e     accuracy          ppl     74.92
+ARC-e                                  ca1e8e     accuracy          ppl     85.71
+--------- 语言 Language ---------      -          -                 -       -
+WiC                                    efbd01     accuracy          gen     51.41
+chid-dev                               25f3d3     accuracy          ppl     77.72
+afqmc-dev                              4a1636     accuracy          gen     69.00
+WSC                                    678cb5     accuracy          ppl     67.31
+tydiqa-goldp                           -          naive_average     gen     15.32
+flores_100                             -          naive_average     gen     10.00
+--------- 知识 Knowledge ---------     -          -                 -       -
+BoolQ                                  463fee     accuracy          ppl     83.18
+commonsense_qa                         ddaabf     accuracy          gen     76.41
+triviaqa                               b6904f     score             gen     43.25
+nq                                     23dc1a     score             gen     16.26
+--------- 理解 Understanding --------- -          -                 -       -
+C3                                     e6778d     accuracy          gen     81.53
+race-middle                            e0908b     accuracy          gen     83.01
+race-high                              e0908b     accuracy          gen     77.79
+openbookqa_fact                        49689a     accuracy          ppl     86.40
+csl_dev                                3c4211     accuracy          ppl     64.38
+lcsts                                  0b3969     rouge1            gen     12.75
+Xsum                                   207e69     rouge1            gen     20.21
+eprstmt-dev                            ed0c5d     accuracy          ppl     85.00
+lambada                                de1af2     accuracy          gen     59.19
+--------- 推理 Reasoning ---------     -          -                 -       -
+cmnli                                  15e783     accuracy          ppl     48.08
+ocnli                                  15e783     accuracy          ppl     51.40
+AX_b                                   689df1     accuracy          ppl     65.67
+AX_g                                   808a19     accuracy          ppl     76.12
+RTE                                    808a19     accuracy          ppl     68.95
+COPA                                   59f42c     accuracy          gen     92.00
+ReCoRD                                 6f7cfc     score             gen     0.16
+hellaswag                              8d79e0     accuracy          ppl     69.28
+piqa                                   34eee7     accuracy          ppl     72.20
+siqa                                   ea30d1     accuracy          ppl     72.88
+math                                   2c0b9e     accuracy          gen     7.84
+gsm8k                                  4c7f6e     accuracy          gen     45.41
+drop                                   53a0a7     score             gen     39.62
+openai_humaneval                       dd0dff     humaneval_pass@1  gen     10.98
+mbpp                                   60ca11     score             gen     20.60
+bbh                                    -          naive_average     gen     42.61
+'''
diff --git a/configs/models/hf_qwen_7b.py b/configs/models/hf_qwen_7b.py
new file mode 100644
index 00000000..84e248da
--- /dev/null
+++ b/configs/models/hf_qwen_7b.py
@@ -0,0 +1,32 @@
+from opencompass.models import HuggingFaceCausalLM
+
+# Note that we pin the model revision here. As of 2023-08-27, our evaluations
+# show that newer revisions score more than 5 points lower on datasets such as
+# GaokaoBench and mbpp. We are not yet sure whether the drop stems from
+# incorrect logic in how OpenCompass calls Qwen or from some other cause, so
+# we flag it here for visibility.
+
+models = [
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='qwen-7b-hf',
+        path="Qwen/Qwen-7B",
+        tokenizer_path='Qwen/Qwen-7B',
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            trust_remote_code=True,
+            use_fast=False,
+            revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09'
+        ),
+        max_out_len=100,
+        max_seq_len=2048,
+        batch_size=8,
+        model_kwargs=dict(
+            device_map='auto',
+            trust_remote_code=True,
+            revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09'
+        ),
+        run_cfg=dict(num_gpus=1, num_procs=1),
+    )
+]
diff --git a/configs/models/hf_qwen_7b_chat.py b/configs/models/hf_qwen_7b_chat.py
new file mode 100644
index 00000000..4b75ee2d
--- /dev/null
+++ b/configs/models/hf_qwen_7b_chat.py
@@ -0,0 +1,30 @@
+from opencompass.models import HuggingFaceCausalLM
+
+
+_meta_template = dict(
+    round=[
+        dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'),
+        dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True),
+    ],
+)
+
+models = [
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='qwen-7b-chat-hf',
+        path="Qwen/Qwen-7B-Chat",
+        tokenizer_path='Qwen/Qwen-7B-Chat',
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            trust_remote_code=True,
+            use_fast=False,
+        ),
+        max_out_len=100,
+        max_seq_len=2048,
+        batch_size=8,
+        meta_template=_meta_template,
+        model_kwargs=dict(device_map='auto', trust_remote_code=True),
+        run_cfg=dict(num_gpus=1, num_procs=1),
+    )
+]
diff --git a/configs/summarizers/example.py b/configs/summarizers/example.py
index 9b935a40..3ceb0f6d 100644
--- a/configs/summarizers/example.py
+++ b/configs/summarizers/example.py
@@ -3,10 +3,13 @@ from mmengine.config import read_base
 with read_base():
     from .groups.agieval import agieval_summary_groups
     from .groups.mmlu import mmlu_summary_groups
+    from .groups.cmmlu import cmmlu_summary_groups
     from .groups.ceval import ceval_summary_groups
     from .groups.bbh import bbh_summary_groups
     from .groups.GaokaoBench import GaokaoBench_summary_groups
     from .groups.flores import flores_summary_groups
+    from .groups.tydiqa import tydiqa_summary_groups
+    from .groups.xiezhi import xiezhi_summary_groups
 
 summarizer = dict(
     summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
diff --git a/configs/summarizers/groups/cmmlu.py b/configs/summarizers/groups/cmmlu.py
new file mode 100644
index 00000000..95fca9f2
--- /dev/null
+++ b/configs/summarizers/groups/cmmlu.py
@@ -0,0 +1,107 @@
+subcategories = {
+    "agronomy": ['other'],
+    "anatomy": ['biology'],
+    "ancient_chinese": ['linguistics', 'china specific'],
+    "arts": ['arts'],
+    "astronomy": ['physics'],
+    "business_ethics": ['business'],
+    "chinese_civil_service_exam": ['politics', 'china specific'],
+    "chinese_driving_rule": ['other', 'china specific'],
+    "chinese_food_culture": ['culture', 'china specific'],
+    "chinese_foreign_policy": ['politics', 'china specific'],
+    "chinese_history": ['history', 'china specific'],
+    "chinese_literature": ['literature', 'china specific'],
+    "chinese_teacher_qualification": ['education', 'china specific'],
+    "college_actuarial_science": ['math'],
+    "college_education": ['education'],
+    "college_engineering_hydrology": ['engineering'],
+    "college_law": ['law'],
+    "college_mathematics": ['math'],
+    "college_medical_statistics": ['statistics'],
+    "clinical_knowledge": ['other'],
+    "college_medicine": ['other'],
+    "computer_science": ['computer science'],
+    "computer_security": ['other'],
+    "conceptual_physics": ['physics'],
+    "construction_project_management": ['other', 'china specific'],
+    "economics": ['economics'],
+    "education": ['education'],
+    "elementary_chinese": ['linguistics', 'china specific'],
+    "elementary_commonsense": ['other', 'china specific'],
+    "elementary_information_and_technology": ['other'],
+    "electrical_engineering": ['engineering'],
+    "elementary_mathematics": ['math'],
+    "ethnology": ['culture', 'china specific'],
+    "food_science": ['other'],
+    "genetics": ['biology'],
+    "global_facts": ['global'],
+    "high_school_biology": ['biology'],
+    "high_school_chemistry": ['chemistry'],
+    "high_school_geography": ['geography'],
+    "high_school_mathematics": ['math'],
+    "high_school_physics": ['physics'],
+    "high_school_politics": ['politics', 'china specific'],
+    "human_sexuality": ['other'],
+    "international_law": ['law'],
+    "journalism": ['sociology'],
+    "jurisprudence": ['law'],
+    "legal_and_moral_basis": ['other'],
+    "logical": ['philosophy'],
+    "machine_learning": ['computer science'],
+    "management": ['business'],
+    "marketing": ['business'],
+    "marxist_theory": ['philosophy'],
+    "modern_chinese": ['linguistics', 'china specific'],
+    "nutrition": ['other'],
+    "philosophy": ['philosophy'],
+    "professional_accounting": ['business'],
+    "professional_law": ['law'],
+    "professional_medicine": ['other'],
+    "professional_psychology": ['psychology'],
+    "public_relations": ['politics'],
+    "security_study": ['politics'],
+    "sociology": ['culture'],
+    "sports_science": ['other'],
+    "traditional_chinese_medicine": ['other', 'china specific'],
+    "virology": ['biology'],
+    "world_history": ['history'],
+    "world_religions": ['global'],
+}
+
+categories = {
+    "STEM": ['physics', 'chemistry', 'biology', 'computer science', 'math', 'engineering', 'statistics'],
+    "Humanities": ['history', 'philosophy', 'law', 'arts', 'literature', 'global'],
+    "Social Science": ['linguistics', 'business', 'politics', 'culture', 'economics', 'geography', 'psychology', 'education', 'sociology'],
+    "Other": ['other'],
+    "China specific": ['china specific'],
+}
+
+category2subject = {}
+for k, v in categories.items():
+    for subject, subcat in subcategories.items():
+        for c in subcat:
+            if c in v:
+                category2subject.setdefault(k, []).append(subject)
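+# For illustration: after this loop, category2subject['China specific'] holds
+# every subject tagged 'china specific' above ('ancient_chinese',
+# 'chinese_civil_service_exam', 'chinese_driving_rule', ...).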
+
+cmmlu_summary_groups = []
+
+_cmmlu_humanities = ['cmmlu-' + s for s in category2subject['Humanities']]
+cmmlu_summary_groups.append({'name': 'cmmlu-humanities', 'subsets': _cmmlu_humanities})
+
+_cmmlu_stem = ['cmmlu-' + s for s in category2subject['STEM']]
+cmmlu_summary_groups.append({'name': 'cmmlu-stem', 'subsets': _cmmlu_stem})
+
+_cmmlu_social_science = ['cmmlu-' + s for s in category2subject['Social Science']]
+cmmlu_summary_groups.append({'name': 'cmmlu-social-science', 'subsets': _cmmlu_social_science})
+
+_cmmlu_other = ['cmmlu-' + s for s in category2subject['Other']]
+cmmlu_summary_groups.append({'name': 'cmmlu-other', 'subsets': _cmmlu_other})
+
+_cmmlu_china_specific = ['cmmlu-' + s for s in category2subject['China specific']]
+cmmlu_summary_groups.append({'name': 'cmmlu-china-specific', 'subsets': _cmmlu_china_specific})
+
+_cmmlu_all = ['cmmlu-' + s for s in subcategories.keys()]
+cmmlu_summary_groups.append({'name': 'cmmlu', 'subsets': _cmmlu_all})
diff --git a/configs/summarizers/groups/tydiqa.py b/configs/summarizers/groups/tydiqa.py
new file mode 100644
index 00000000..e5191ad8
--- /dev/null
+++ b/configs/summarizers/groups/tydiqa.py
@@ -0,0 +1,6 @@
+tydiqa_summary_groups = []
+
+_tydiqa = ['arabic', 'bengali', 'english', 'finnish', 'indonesian', 'japanese', 'korean', 'russian', 'swahili', 'telugu', 'thai']
+# 'tyidqa' (sic) below matches the abbr spelling used by the tydiqa dataset configs.
+_tydiqa = ['tyidqa-goldp_' + s for s in _tydiqa]
+tydiqa_summary_groups.append({'name': 'tydiqa-goldp', 'subsets': _tydiqa})
diff --git a/configs/summarizers/groups/xiezhi.py b/configs/summarizers/groups/xiezhi.py
new file mode 100644
index 00000000..8cb714a5
--- /dev/null
+++ b/configs/summarizers/groups/xiezhi.py
@@ -0,0 +1,4 @@
+xiezhi_summary_groups = []
+
+_xiezhi = ["xiezhi-spec_eng", "xiezhi-spec_chn", "xiezhi-inter_eng", "xiezhi-inter_chn"]
+xiezhi_summary_groups.append({'name': 'xiezhi', 'subsets': _xiezhi})
diff --git a/configs/summarizers/leaderboard.py b/configs/summarizers/leaderboard.py
new file mode 100644
index 00000000..92221c18
--- /dev/null
+++ b/configs/summarizers/leaderboard.py
@@ -0,0 +1,89 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .groups.agieval import agieval_summary_groups
+    from .groups.mmlu import mmlu_summary_groups
+    from .groups.cmmlu import cmmlu_summary_groups
+    from .groups.ceval import ceval_summary_groups
+    from .groups.bbh import bbh_summary_groups
+    from .groups.GaokaoBench import GaokaoBench_summary_groups
+    from .groups.flores import flores_summary_groups
+    from .groups.tydiqa import tydiqa_summary_groups
+    from .groups.xiezhi import xiezhi_summary_groups
+
+
+summarizer = dict(
+    dataset_abbrs=[
+        '--------- 考试 Exam ---------',  # category
+        # 'Mixed',  # subcategory
+        'ceval',
+        'agieval',
+        'mmlu',
+        'cmmlu',
+        'GaokaoBench',
+        'ARC-c',
+        'ARC-e',
+        '--------- 语言 Language ---------',  # category
+        # '字词释义',  # subcategory
+        'WiC',
+        # '成语习语',  # subcategory
+        'chid-dev',
+        # '语义相似度',  # subcategory
+        'afqmc-dev',
+        # '指代消解',  # subcategory
+        'WSC',
+        # '多语种问答',  # subcategory
+        'tydiqa-goldp',
+        # '翻译',  # subcategory
+        'flores_100',
+        '--------- 知识 Knowledge ---------',  # category
+        # '知识问答',  # subcategory
+        'BoolQ',
+        'commonsense_qa',
+        'triviaqa',
+        'nq',
+        '--------- 理解 Understanding ---------',  # category
+        # '阅读理解',  # subcategory
+        'C3',
+        'race-middle',
+        'race-high',
+        'openbookqa_fact',
+        # '内容总结',  # subcategory
+        'csl_dev',
+        'lcsts',
+        'Xsum',
+        # '内容分析',  # subcategory
+        'eprstmt-dev',
+        'lambada',
+        '--------- 推理 Reasoning ---------',  # category
+        # '文本蕴含',  # subcategory
+        'cmnli',
+        'ocnli',
+        'AX_b',
+        'AX_g',
+        'RTE',
+        # '常识推理',  # subcategory
+        'COPA',
+        'ReCoRD',
+        'hellaswag',
+        'piqa',
+        'siqa',
+        # '数学推理',  # subcategory
+        'math',
+        'gsm8k',
+        # '定理应用',  # subcategory
+        # '阅读理解',  # subcategory
+        'drop',
+        # '代码',  # subcategory
+        'openai_humaneval',
+        'mbpp',
+        # '综合推理',  # subcategory
+        'bbh',
+    ],
+    summary_groups=sum(
+        [v for k, v in locals().items() if k.endswith("_summary_groups")], []),
+    prompt_db=dict(
+        database_path='configs/datasets/log.json',
+        config_dir='configs/datasets',
+        blacklist='.promptignore'),
+)
diff --git a/configs/summarizers/medium.py b/configs/summarizers/medium.py
index 8b652ccf..093f1146 100644
--- a/configs/summarizers/medium.py
+++ b/configs/summarizers/medium.py
@@ -3,11 +3,14 @@ from mmengine.config import read_base
 with read_base():
     from .groups.agieval import agieval_summary_groups
     from .groups.mmlu import mmlu_summary_groups
+    from .groups.cmmlu import cmmlu_summary_groups
     from .groups.ceval import ceval_summary_groups
     from .groups.bbh import bbh_summary_groups
     from .groups.GaokaoBench import GaokaoBench_summary_groups
     from .groups.flores import flores_summary_groups
     from .groups.jigsaw_multilingual import jigsaw_multilingual_summary_groups
+    from .groups.tydiqa import tydiqa_summary_groups
+    from .groups.xiezhi import xiezhi_summary_groups
 
 summarizer = dict(
     dataset_abbrs=[
diff --git a/configs/summarizers/small.py b/configs/summarizers/small.py
index 01d97a00..f5b719dd 100644
--- a/configs/summarizers/small.py
+++ b/configs/summarizers/small.py
@@ -3,10 +3,13 @@ from mmengine.config import read_base
 with read_base():
     from .groups.agieval import agieval_summary_groups
     from .groups.mmlu import mmlu_summary_groups
+    from .groups.cmmlu import cmmlu_summary_groups
     from .groups.ceval import ceval_summary_groups
     from .groups.bbh import bbh_summary_groups
     from .groups.GaokaoBench import GaokaoBench_summary_groups
     from .groups.flores import flores_summary_groups
+    from .groups.tydiqa import tydiqa_summary_groups
+    from .groups.xiezhi import xiezhi_summary_groups
 
 summarizer = dict(
     dataset_abbrs = [
diff --git a/opencompass/datasets/record.py b/opencompass/datasets/record.py
index d6c976e8..7f90c535 100644
--- a/opencompass/datasets/record.py
+++ b/opencompass/datasets/record.py
@@ -43,6 +43,35 @@ class ReCoRDDataset(BaseDataset):
         return dataset
 
 
+class ReCoRDDataset_V2(BaseDataset):
+
+    @staticmethod
+    def load(path: str):
+        # Each line of the jsonl file holds one passage with several
+        # cloze-style queries; flatten them into one row per query.
+        with open(path, 'r', errors='ignore') as in_f:
+            rows = []
+            for i, line in enumerate(in_f):
+                sample = json.loads(line.strip())
+                text = sample['passage']['text'].replace('@highlight',
+                                                         '').replace(
+                                                             '\n\n', '\n')
+                for qas_dict in sample['qas']:
+                    query = qas_dict['query'].replace('@placeholder', '____')
+                    answers = [
+                        answer_dict['text']
+                        for answer_dict in qas_dict['answers']
+                    ]
+                    rows.append({
+                        'text': text,
+                        'question': query,
+                        'answers': answers
+                    })
+
+        dataset = Dataset.from_list(rows)
+        return dataset
+
+
 @TEXT_POSTPROCESSORS.register_module('ReCoRD')
 def ReCoRD_postprocess(text: str) -> str:
     text = text.strip().split('\n')[0].replace('Answer: ', '').strip()
diff --git a/opencompass/partitioners/size.py b/opencompass/partitioners/size.py
index d86e72d2..9b7f8c6a 100644
--- a/opencompass/partitioners/size.py
+++ b/opencompass/partitioners/size.py
@@ -154,7 +154,7 @@ class SizePartitioner(BasePartitioner):
                 fnmatch(dataset_abbr, pattern)
                 for pattern in ('bbh*', 'gsm8k*', 'math*', 'strategyqa*',
                                 'agieval-jec*', 'agieval-gaokao-mathcloze',
-                                'agieval-math')):
+                                'agieval-math', '*professional_law')):
             factor *= 10
         return factor
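A minimal usage sketch for the new configs (assuming a standard OpenCompass checkout; run.py is the repository's existing entry point and mmengine is already a dependency, neither is part of this diff). They are normally launched via `python run.py configs/eval_qwen_7b.py`; for a quick sanity check they can also be loaded directly:

    from mmengine.config import Config

    # Resolves the read_base() imports and merges models/datasets/summarizer.
    cfg = Config.fromfile('configs/eval_qwen_7b.py')
    print(len(cfg.datasets), cfg.models[0]['abbr'])  # e.g. <num dataset entries> qwen-7b-hf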