[Feature] Add qwen & qwen-chat support (#286)

* add and apply update suffix tool

* add tool doc

* add qwen configs

* add cmmlu

* rename bbh

* update datasets

* delete

* update hf_qwen_7b.py
Author: Leymore
Date: 2023-08-31 11:29:05 +08:00 (committed via GitHub)
Commit: 7ca6ba625e (parent: fd389e2d78)
17 changed files with 658 additions and 1 deletion
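For reference, the two new eval configs are typically launched with OpenCompass's run.py entry point, e.g. python run.py configs/eval_qwen_7b.py (and likewise configs/eval_qwen_7b_chat.py), assuming the benchmark data has been prepared under ./data as referenced by the dataset configs below.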

@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import ReCoRDDataset_V2, ReCoRD_postprocess
ReCoRD_reader_cfg = dict(
input_columns=['question', 'text'], output_column='answers')
ReCoRD_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN", prompt="Passage:\n{text}\nResult:\n{question}\nQuestion:\nWhat entity does ____ refer to in the Result?\nAnswer:"
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
ReCoRD_eval_cfg = dict(
evaluator=dict(type=EMEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=ReCoRD_postprocess))
ReCoRD_datasets = [
dict(
type=ReCoRDDataset_V2,
abbr='ReCoRD',
path='./data/SuperGLUE/ReCoRD/val.jsonl',
reader_cfg=ReCoRD_reader_cfg,
infer_cfg=ReCoRD_infer_cfg,
eval_cfg=ReCoRD_eval_cfg)
]

@@ -0,0 +1,105 @@
from os.path import exists
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess
bbh_reader_cfg = dict(input_columns=["input"], output_column="target")
_path_prefix = "./data/BBH"
bbh_multiple_choice_sets = [
'temporal_sequences',
'disambiguation_qa',
'date_understanding',
'tracking_shuffled_objects_three_objects',
'penguins_in_a_table',
'geometric_shapes',
'snarks',
'ruin_names',
'tracking_shuffled_objects_seven_objects',
'tracking_shuffled_objects_five_objects',
'logical_deduction_three_objects',
'hyperbaton',
'logical_deduction_five_objects',
'logical_deduction_seven_objects',
'movie_recommendation',
'salient_translation_error_detection',
'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
'multistep_arithmetic_two',
'navigate',
'dyck_languages',
'word_sorting',
'sports_understanding',
'boolean_expressions',
'object_counting',
'formal_fallacies',
'causal_judgement',
'web_of_lies',
]
bbh_datasets = []
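# Each BBH task may ship a few-shot chain-of-thought prompt under
# ./data/BBH/lib_prompt/<task>.txt; when present it is read into _hint and
# prepended to every query below (if the file is missing, _hint stays None).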
for _name in bbh_multiple_choice_sets:
_hint = None
if exists(f"{_path_prefix}/lib_prompt/{_name}.txt"):
_hint = open(f"{_path_prefix}/lib_prompt/{_name}.txt", 'r').read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN",
prompt=
f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: "
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role="BOT",
pred_postprocessor=dict(type=bbh_mcq_postprocess),
dataset_postprocessor=dict(type=bbh_mcq_postprocess))
bbh_datasets.append(
dict(
type=BBHDataset,
path=f"{_path_prefix}/data",
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
for _name in bbh_free_form_sets:
_hint = None
if exists(f"{_path_prefix}/lib_prompt/{_name}.txt"):
_hint = open(f"{_path_prefix}/lib_prompt/{_name}.txt", 'r').read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN",
prompt=
f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: "
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role="BOT")
bbh_datasets.append(
dict(
type=BBHDataset,
path=f"{_path_prefix}/data",
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
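# Drop the temporary helper variables so they are not exported as part of this
# config module when it is imported via read_base().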
del _name, _hint, _path_prefix

@@ -0,0 +1,51 @@
from mmengine.config import read_base
with read_base():
from ...ceval.ceval_ppl_578f8d import ceval_datasets
from ...agieval.agieval_mixed_2f14ad import agieval_datasets
from ...mmlu.mmlu_ppl_ac766d import mmlu_datasets
from ...cmmlu.cmmlu_ppl_8b9c76 import cmmlu_datasets
from ...GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
from ...ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
from ...ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
from ...SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
from ...CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
from ...flores.flores_gen_806ede import flores_datasets
from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
from ...commonsenseqa.commonsenseqa_ppl_5545e2 import commonsenseqa_datasets
from ...triviaqa.triviaqa_gen_0356ec import triviaqa_datasets
from ...nq.nq_gen_0356ec import nq_datasets
from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
from ...race.race_ppl_5831a0 import race_datasets
from ...obqa.obqa_gen_9069e4 import obqa_datasets
from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
from ...Xsum.Xsum_gen_31397e import Xsum_datasets
from ...FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
from ...lambada.lambada_gen_217e11 import lambada_datasets
from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
from ...CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
from ...SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
from ...SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
from ...SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_a69961 import ReCoRD_datasets
from ...hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
from ...piqa.piqa_gen_1194eb import piqa_datasets
from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
from ...math.math_gen_265cce import math_datasets
from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ...drop.drop_gen_599f07 import drop_datasets
from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
from ...bbh.bbh_gen_e3d13a import bbh_datasets
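# Flatten every imported *_datasets list above into a single datasets list.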
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

@@ -0,0 +1,51 @@
from mmengine.config import read_base
with read_base():
from ...ceval.ceval_gen_5f30c7 import ceval_datasets
from ...agieval.agieval_mixed_2f14ad import agieval_datasets
from ...mmlu.mmlu_gen_a484b3 import mmlu_datasets
from ...cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
from ...GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
from ...ARC_c.ARC_c_ppl_2ef631 import ARC_c_datasets
from ...ARC_e.ARC_e_ppl_2ef631 import ARC_e_datasets
from ...SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
from ...CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
from ...flores.flores_gen_806ede import flores_datasets
from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
from ...commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
from ...triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
from ...nq.nq_gen_c788f6 import nq_datasets
from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
from ...race.race_gen_69ee4f import race_datasets
from ...obqa.obqa_ppl_6aac9e import obqa_datasets
from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
from ...Xsum.Xsum_gen_31397e import Xsum_datasets
from ...FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
from ...lambada.lambada_gen_217e11 import lambada_datasets
from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
from ...CLUE_ocnli.CLUE_ocnli_ppl_fdc6de import ocnli_datasets
from ...SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
from ...SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
from ...SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
from ...hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets
from ...piqa.piqa_ppl_0cfff2 import piqa_datasets
from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
from ...math.math_gen_265cce import math_datasets
from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from ...drop.drop_gen_599f07 import drop_datasets
from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
from ...bbh.bbh_gen_6bd693 import bbh_datasets
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

configs/eval_qwen_7b.py
@@ -0,0 +1,58 @@
from mmengine.config import read_base
with read_base():
from .models.hf_qwen_7b import models
from .datasets.collections.leaderboard.qwen import datasets
from .summarizers.leaderboard import summarizer
'''
dataset version metric mode qwen-7b-hf
-------------------------------------- --------- ---------------- ------ ------------
--------- 考试 Exam --------- - - - -
ceval - naive_average ppl 58.65
agieval - naive_average mixed 40.49
mmlu - naive_average ppl 57.78
cmmlu - naive_average ppl 58.57
GaokaoBench - weighted_average mixed 51.76
ARC-c 72cf91 accuracy gen 83.73
ARC-e 72cf91 accuracy gen 90.65
--------- 语言 Language --------- - - - -
WiC ce62e6 accuracy ppl 51.10
chid-dev 25f3d3 accuracy ppl 86.63
afqmc-dev cc328c accuracy ppl 69.00
WSC 678cb5 accuracy ppl 63.46
tydiqa-goldp - naive_average gen 19.98
flores_100 - naive_average gen 3.20
--------- 知识 Knowledge --------- - - - -
BoolQ 463fee accuracy ppl 83.00
commonsense_qa 0d8e25 accuracy ppl 67.49
triviaqa b6904f score gen 40.45
nq b6904f score gen 14.16
--------- 理解 Understanding --------- - - - -
C3 e6778d accuracy gen 75.29
race-middle 73bdec accuracy ppl 90.53
race-high 73bdec accuracy ppl 87.71
openbookqa_fact fa871c accuracy gen 92.20
csl_dev 3c4211 accuracy ppl 56.25
lcsts 0b3969 rouge1 gen 12.38
Xsum 207e69 rouge1 gen 36.00
eprstmt-dev 101429 accuracy gen 89.38
lambada de1af2 accuracy gen 67.88
--------- 推理 Reasoning --------- - - - -
cmnli 15e783 accuracy ppl 54.85
ocnli 1471e7 accuracy gen 42.34
AX_b 793c72 accuracy gen 58.61
AX_g c4c886 accuracy gen 69.10
RTE c4c886 accuracy gen 57.76
COPA 59f42c accuracy gen 88.00
ReCoRD 3e0689 score gen 27.78
hellaswag 06a1e2 accuracy gen 92.47
piqa 24369d accuracy gen 78.02
siqa ea30d1 accuracy ppl 75.03
math 2c0b9e accuracy gen 11.06
gsm8k 4c7f6e accuracy gen 50.87
drop 53a0a7 score gen 44.95
openai_humaneval dd0dff humaneval_pass@1 gen 23.78
mbpp 60ca11 score gen 31.20
bbh - naive_average gen 40.03
'''

@@ -0,0 +1,58 @@
from mmengine.config import read_base
with read_base():
from .models.hf_qwen_7b_chat import models
from .datasets.collections.leaderboard.qwen_chat import datasets
from .summarizers.leaderboard import summarizer
'''
dataset version metric mode qwen-7b-chat-hf
-------------------------------------- --------- ---------------- ------ -----------------
--------- 考试 Exam --------- - - - -
ceval - naive_average gen 56.07
agieval - naive_average mixed 39.51
mmlu - naive_average gen 53.49
cmmlu - naive_average gen 55.29
GaokaoBench - weighted_average gen 48.01
ARC-c ca1e8e accuracy ppl 74.92
ARC-e ca1e8e accuracy ppl 85.71
--------- 语言 Language --------- - - - -
WiC efbd01 accuracy gen 51.41
chid-dev 25f3d3 accuracy ppl 77.72
afqmc-dev 4a1636 accuracy gen 69.00
WSC 678cb5 accuracy ppl 67.31
tydiqa-goldp - naive_average gen 15.32
flores_100 - naive_average gen 10.00
--------- 知识 Knowledge --------- - - - -
BoolQ 463fee accuracy ppl 83.18
commonsense_qa ddaabf accuracy gen 76.41
triviaqa b6904f score gen 43.25
nq 23dc1a score gen 16.26
--------- 理解 Understanding --------- - - - -
C3 e6778d accuracy gen 81.53
race-middle e0908b accuracy gen 83.01
race-high e0908b accuracy gen 77.79
openbookqa_fact 49689a accuracy ppl 86.40
csl_dev 3c4211 accuracy ppl 64.38
lcsts 0b3969 rouge1 gen 12.75
Xsum 207e69 rouge1 gen 20.21
eprstmt-dev ed0c5d accuracy ppl 85.00
lambada de1af2 accuracy gen 59.19
--------- 推理 Reasoning --------- - - - -
cmnli 15e783 accuracy ppl 48.08
ocnli 15e783 accuracy ppl 51.40
AX_b 689df1 accuracy ppl 65.67
AX_g 808a19 accuracy ppl 76.12
RTE 808a19 accuracy ppl 68.95
COPA 59f42c accuracy gen 92.00
ReCoRD 6f7cfc score gen 0.16
hellaswag 8d79e0 accuracy ppl 69.28
piqa 34eee7 accuracy ppl 72.20
siqa ea30d1 accuracy ppl 72.88
math 2c0b9e accuracy gen 7.84
gsm8k 4c7f6e accuracy gen 45.41
drop 53a0a7 score gen 39.62
openai_humaneval dd0dff humaneval_pass@1 gen 10.98
mbpp 60ca11 score gen 20.60
bbh - naive_average gen 42.61
'''

@@ -0,0 +1,32 @@
from opencompass.models import HuggingFaceCausalLM
# Please note that we pin the model revision here. During our evaluations around
# 2023-08-27 we found that newer revisions drop by more than 5 points on datasets
# such as GaokaoBench and mbpp. We are not yet sure whether this is caused by how
# OpenCompass calls Qwen or by something else, so we highlight it here.
models = [
dict(
type=HuggingFaceCausalLM,
abbr='qwen-7b-hf',
path="Qwen/Qwen-7B",
tokenizer_path='Qwen/Qwen-7B',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09'
),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09'
),
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

@@ -0,0 +1,29 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'),
dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True),
],
)
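# With this template a single-turn request is serialized roughly as
#   \n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n
# i.e. the ChatML-style dialogue format used by Qwen-7B-Chat; generation starts
# from the assistant turn (generate=True).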
models = [
dict(
type=HuggingFaceCausalLM,
abbr='qwen-7b-chat-hf',
path="Qwen/Qwen-7B-Chat",
tokenizer_path='Qwen/Qwen-7B-Chat',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=_meta_template,
model_kwargs=dict(device_map='auto', trust_remote_code=True),
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

@@ -3,10 +3,13 @@ from mmengine.config import read_base
with read_base():
from .groups.agieval import agieval_summary_groups
from .groups.mmlu import mmlu_summary_groups
from .groups.cmmlu import cmmlu_summary_groups
from .groups.ceval import ceval_summary_groups
from .groups.bbh import bbh_summary_groups
from .groups.GaokaoBench import GaokaoBench_summary_groups
from .groups.flores import flores_summary_groups
from .groups.tydiqa import tydiqa_summary_groups
from .groups.xiezhi import xiezhi_summary_groups
summarizer = dict(
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),

@@ -0,0 +1,104 @@
subcategories = {
"agronomy": ['other'],
"anatomy": ['biology'],
"ancient_chinese": ['linguistics','china specific'],
"arts": ['arts'],
"astronomy": ['physics'],
"business_ethics": ['business'],
"chinese_civil_service_exam": ['politics','china specific'],
"chinese_driving_rule": ['other','china specific'],
"chinese_food_culture": ['culture','china specific'],
"chinese_foreign_policy": ['politics','china specific'],
"chinese_history":['history','china specific'],
"chinese_literature": ['literature','china specific'],
"chinese_teacher_qualification": ['education','china specific'],
"college_actuarial_science":['math'],
"college_education":['education'],
"college_engineering_hydrology": ['engineering'],
"college_law": ['law'],
"college_mathematics": ['math'],
"college_medical_statistics":['statistics'],
"clinical_knowledge": ['other'],
"college_medicine": ['other'],
"computer_science": ['computer science'],
"computer_security": ['other'],
"conceptual_physics": ['physics'],
"construction_project_management": ['other','china specific'],
"economics": ['economics'],
"education": ['education'],
"elementary_chinese":['linguistics','china specific'],
"elementary_commonsense":['other','china specific'],
"elementary_information_and_technology": ['other'],
"electrical_engineering": ['engineering'],
"elementary_mathematics": ['math'],
"ethnology": ['culture','china specific'],
"food_science": ['other'],
"genetics": ['biology'],
"global_facts": ['global'],
"high_school_biology": ['biology'],
"high_school_chemistry": ['chemistry'],
"high_school_geography": ['geography'],
"high_school_mathematics": ['math'],
"high_school_physics": ['physics'],
"high_school_politics": ['politics','china specific'],
"human_sexuality": ['other'],
"international_law": ['law'],
"journalism": ['sociology'],
"jurisprudence": ['law'],
"legal_and_moral_basis": ['other'],
"logical": ['philosophy'],
"machine_learning": ['computer science'],
"management": ['business'],
"marketing": ['business'],
"marxist_theory": ['philosophy'],
"modern_chinese": ['linguistics','china specific'],
"nutrition": ['other'],
"philosophy": ['philosophy'],
"professional_accounting": ['business'],
"professional_law": ['law'],
"professional_medicine": ['other'],
"professional_psychology": ['psychology'],
"public_relations": ['politics'],
"security_study": ['politics'],
"sociology": ['culture'],
"sports_science": ['other'],
"traditional_chinese_medicine": ['other','china specific'],
"virology": ['biology'],
"world_history":['history'],
"world_religions": ['global'],
}
categories = {
"STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"],
"Humanities": ["history", "philosophy", "law", "arts", "literature", "global"],
"Social Science": ['linguistics',"business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"],
"Other":["other"],
"China specific": ["china specific"],
}
category2subject = {}
for k, v in categories.items():
for subject, subcat in subcategories.items():
for c in subcat:
if c in v:
category2subject.setdefault(k, []).append(subject)
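# category2subject maps each top-level category to every subject whose
# subcategory tags fall under it, e.g.
# category2subject['STEM'] -> ['anatomy', 'astronomy', 'college_actuarial_science', ...]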
cmmlu_summary_groups = []
_cmmlu_humanities = ['cmmlu-' + s for s in category2subject['Humanities']]
cmmlu_summary_groups.append({'name': 'cmmlu-humanities', 'subsets': _cmmlu_humanities})
_cmmlu_stem = ['cmmlu-' + s for s in category2subject['STEM']]
cmmlu_summary_groups.append({'name': 'cmmlu-stem', 'subsets': _cmmlu_stem})
_cmmlu_social_science = ['cmmlu-' + s for s in category2subject['Social Science']]
cmmlu_summary_groups.append({'name': 'cmmlu-social-science', 'subsets': _cmmlu_social_science})
_cmmlu_other = ['cmmlu-' + s for s in category2subject['Other']]
cmmlu_summary_groups.append({'name': 'cmmlu-other', 'subsets': _cmmlu_other})
_cmmlu_china_specific = ['cmmlu-' + s for s in category2subject['China specific']]
cmmlu_summary_groups.append({'name': 'cmmlu-china-specific', 'subsets': _cmmlu_china_specific})
_cmmlu_all = ['cmmlu-' + s for s in subcategories.keys()]
cmmlu_summary_groups.append({'name': 'cmmlu', 'subsets': _cmmlu_all})

@@ -0,0 +1,5 @@
tydiqa_summary_groups = []
_tydiqa = ['arabic', 'bengali', 'english', 'finnish', 'indonesian', 'japanese', 'korean', 'russian', 'swahili', 'telugu', 'thai']
_tydiqa = ['tyidqa-goldp_' + s for s in _tydiqa]
tydiqa_summary_groups.append({'name': 'tydiqa-goldp', 'subsets': _tydiqa})

@@ -0,0 +1,4 @@
xiezhi_summary_groups = []
_xiezhi = ["xiezhi-spec_eng", "xiezhi-spec_chn", "xiezhi-inter_eng", "xiezhi-inter_chn"]
xiezhi_summary_groups.append({'name': 'xiezhi', 'subsets': _xiezhi})

@@ -0,0 +1,89 @@
from mmengine.config import read_base
with read_base():
from .groups.agieval import agieval_summary_groups
from .groups.mmlu import mmlu_summary_groups
from .groups.cmmlu import cmmlu_summary_groups
from .groups.ceval import ceval_summary_groups
from .groups.bbh import bbh_summary_groups
from .groups.GaokaoBench import GaokaoBench_summary_groups
from .groups.flores import flores_summary_groups
from .groups.tydiqa import tydiqa_summary_groups
from .groups.xiezhi import xiezhi_summary_groups
summarizer = dict(
dataset_abbrs=[
'--------- 考试 Exam ---------', # category
# 'Mixed', # subcategory
"ceval",
'agieval',
'mmlu',
'cmmlu',
"GaokaoBench",
'ARC-c',
'ARC-e',
'--------- 语言 Language ---------', # category
# '字词释义', # subcategory
'WiC',
# '成语习语', # subcategory
'chid-dev',
# '语义相似度', # subcategory
'afqmc-dev',
# '指代消解', # subcategory
'WSC',
# '多语种问答', # subcategory
'tydiqa-goldp',
# '翻译', # subcategory
'flores_100',
'--------- 知识 Knowledge ---------', # category
# '知识问答', # subcategory
'BoolQ',
'commonsense_qa',
'triviaqa',
'nq',
'--------- 理解 Understanding ---------', # category
# '阅读理解', # subcategory
'C3',
'race-middle',
'race-high',
'openbookqa_fact',
# '内容总结', # subcategory
'csl_dev',
'lcsts',
'Xsum',
# '内容分析', # subcategory
'eprstmt-dev',
'lambada',
'--------- 推理 Reasoning ---------', # category
# '文本蕴含', # subcategory
'cmnli',
'ocnli',
'AX_b',
'AX_g',
'RTE',
# '常识推理', # subcategory
'COPA',
'ReCoRD',
'hellaswag',
'piqa',
'siqa',
# '数学推理', # subcategory
'math',
'gsm8k',
# '定理应用', # subcategory
# '阅读理解', # subcategory
'drop',
# '代码', # subcategory
'openai_humaneval',
'mbpp',
# '综合推理', # subcategory
"bbh",
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], []),
prompt_db=dict(
database_path='configs/datasets/log.json',
config_dir='configs/datasets',
blacklist='.promptignore'),
)
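# The '--------- ... ---------' entries in dataset_abbrs are not dataset
# abbreviations; they simply render as separator rows (all '-') in the summary
# table, producing the category sections shown in the eval_qwen_7b* result tables.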

@@ -3,11 +3,14 @@ from mmengine.config import read_base
with read_base():
from .groups.agieval import agieval_summary_groups
from .groups.mmlu import mmlu_summary_groups
from .groups.cmmlu import cmmlu_summary_groups
from .groups.ceval import ceval_summary_groups
from .groups.bbh import bbh_summary_groups
from .groups.GaokaoBench import GaokaoBench_summary_groups
from .groups.flores import flores_summary_groups
from .groups.jigsaw_multilingual import jigsaw_multilingual_summary_groups
from .groups.tydiqa import tydiqa_summary_groups
from .groups.xiezhi import xiezhi_summary_groups
summarizer = dict(
dataset_abbrs=[

@@ -3,10 +3,13 @@ from mmengine.config import read_base
with read_base():
from .groups.agieval import agieval_summary_groups
from .groups.mmlu import mmlu_summary_groups
from .groups.cmmlu import cmmlu_summary_groups
from .groups.ceval import ceval_summary_groups
from .groups.bbh import bbh_summary_groups
from .groups.GaokaoBench import GaokaoBench_summary_groups
from .groups.flores import flores_summary_groups
from .groups.tydiqa import tydiqa_summary_groups
from .groups.xiezhi import xiezhi_summary_groups
summarizer = dict(
dataset_abbrs = [

@@ -43,6 +43,33 @@ class ReCoRDDataset(BaseDataset):
return dataset
class ReCoRDDataset_V2(BaseDataset):
@staticmethod
def load(path: str):
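"""Load the SuperGLUE ReCoRD jsonl file, emitting one row per query with the
passage text cleaned up and '@placeholder' rewritten as '____'."""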
with open(path, 'r', errors='ignore') as in_f:
rows = []
for i, line in enumerate(in_f):
sample = json.loads(line.strip())
text = sample['passage']['text'].replace('@highlight',
'').replace(
'\n\n', '\n')
for qas_dict in sample['qas']:
query = qas_dict['query'].replace('@placeholder', '____')
answers = [
answer_dict['text']
for answer_dict in qas_dict['answers']
]
rows.append({
'text': text,
'question': query,
'answers': answers
})
dataset = Dataset.from_list(rows)
return dataset
@TEXT_POSTPROCESSORS.register_module('ReCoRD')
def ReCoRD_postprocess(text: str) -> str:
text = text.strip().split('\n')[0].replace('Answer: ', '').strip()
return text

@@ -154,7 +154,7 @@ class SizePartitioner(BasePartitioner):
fnmatch(dataset_abbr, pattern)
for pattern in ('bbh*', 'gsm8k*', 'math*', 'strategyqa*',
'agieval-jec*', 'agieval-gaokao-mathcloze',
'agieval-math')):
'agieval-math', '*professional_law')):
factor *= 10
return factor
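# Note: these patterns cover generation-heavy or very long subsets (chain-of-thought
# BBH, math/gsm8k, AGIEval math, and now the professional_law splits of mmlu/cmmlu);
# the 10x factor inflates their estimated size so the size-based partitioner splits
# them across more tasks.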