Mirror of https://github.com/open-compass/opencompass.git, synced 2025-05-30 16:03:24 +08:00

[Feature] Add qwen & qwen-chat support (#286)

* add and apply update suffix tool
* add tool doc
* add qwen configs
* add cmmlu
* rename bbh
* update datasets
* delete
* update hf_qwen_7b.py

This commit is contained in:
parent fd389e2d78
commit 7ca6ba625e

configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen_a69961.py (new file, 35 lines; path inferred from the qwen collection import below)
@@ -0,0 +1,35 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import ReCoRDDataset_V2, ReCoRD_postprocess

ReCoRD_reader_cfg = dict(
    input_columns=['question', 'text'], output_column='answers')

ReCoRD_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN", prompt="Passage:\n{text}\nResult:\n{question}\nQuestion:\nWhat entity does ____ refer to in the Result?\nAnswer:"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

ReCoRD_eval_cfg = dict(
    evaluator=dict(type=EMEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=ReCoRD_postprocess))

ReCoRD_datasets = [
    dict(
        type=ReCoRDDataset_V2,
        abbr='ReCoRD',
        path='./data/SuperGLUE/ReCoRD/val.jsonl',
        reader_cfg=ReCoRD_reader_cfg,
        infer_cfg=ReCoRD_infer_cfg,
        eval_cfg=ReCoRD_eval_cfg)
]
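
For orientation, this is roughly the string the HUMAN turn above produces for one record (a sketch, not part of the commit; the field names question/text come from ReCoRD_reader_cfg, the values are invented):

# Sketch: fill the ReCoRD prompt template with one invented record.
prompt = ('Passage:\n{text}\nResult:\n{question}\nQuestion:\n'
          'What entity does ____ refer to in the Result?\nAnswer:')
record = dict(text='Tropical Storm Debby weakened after landfall...',
              question='____ is expected to bring heavy rain to Florida.')
print(prompt.format(**record))
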
configs/datasets/bbh/bbh_gen_e3d13a.py (new file, 105 lines)
@@ -0,0 +1,105 @@
from os.path import exists
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess

bbh_reader_cfg = dict(input_columns=["input"], output_column="target")

_path_prefix = "./data/BBH"

bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    _hint = None
    if exists(f"{_path_prefix}/lib_prompt/{_name}.txt"):
        _hint = open(f"{_path_prefix}/lib_prompt/{_name}.txt", 'r').read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role="HUMAN",
                    prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_role="BOT",
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path=f"{_path_prefix}/data",
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:
    _hint = None
    if exists(f"{_path_prefix}/lib_prompt/{_name}.txt"):
        _hint = open(f"{_path_prefix}/lib_prompt/{_name}.txt", 'r').read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role="HUMAN",
                    prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role="BOT")

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path=f"{_path_prefix}/data",
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

del _name, _hint, _path_prefix
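
A detail worth flagging in both loops above: when ./data/BBH/lib_prompt/<task>.txt is missing, _hint stays None and the f-string bakes the literal text "None" into every prompt for that task. A small pre-flight check along these lines (hypothetical helper, not part of the commit) would surface that:

# Sketch: list BBH tasks whose few-shot prompt file is absent.
from os.path import exists

path_prefix = './data/BBH'
for name in bbh_multiple_choice_sets + bbh_free_form_sets:  # defined above
    if not exists(f'{path_prefix}/lib_prompt/{name}.txt'):
        print(f'missing few-shot prompts for {name}')
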
configs/datasets/collections/leaderboard/qwen.py (new file, 51 lines)
@@ -0,0 +1,51 @@
from mmengine.config import read_base

with read_base():
    from ...ceval.ceval_ppl_578f8d import ceval_datasets
    from ...agieval.agieval_mixed_2f14ad import agieval_datasets
    from ...mmlu.mmlu_ppl_ac766d import mmlu_datasets
    from ...cmmlu.cmmlu_ppl_8b9c76 import cmmlu_datasets
    from ...GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
    from ...ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
    from ...ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets

    from ...SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
    from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
    from ...CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
    from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
    from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
    from ...flores.flores_gen_806ede import flores_datasets

    from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
    from ...commonsenseqa.commonsenseqa_ppl_5545e2 import commonsenseqa_datasets
    from ...triviaqa.triviaqa_gen_0356ec import triviaqa_datasets
    from ...nq.nq_gen_0356ec import nq_datasets

    from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
    from ...race.race_ppl_5831a0 import race_datasets
    from ...obqa.obqa_gen_9069e4 import obqa_datasets
    from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
    from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
    from ...Xsum.Xsum_gen_31397e import Xsum_datasets
    from ...FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
    from ...lambada.lambada_gen_217e11 import lambada_datasets

    from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
    from ...CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
    from ...SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
    from ...SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
    from ...SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
    from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
    from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_a69961 import ReCoRD_datasets
    from ...hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
    from ...piqa.piqa_gen_1194eb import piqa_datasets
    from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
    from ...math.math_gen_265cce import math_datasets
    from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from ...drop.drop_gen_599f07 import drop_datasets
    from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
    from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
    from ...bbh.bbh_gen_e3d13a import bbh_datasets


datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
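
The closing line is the idiom these collection configs rely on: read_base() injects every imported *_datasets list into the module namespace, and summing over locals() flattens them into a single datasets list. A minimal standalone illustration with invented entries:

# Sketch: how the locals()-summing idiom flattens per-benchmark lists.
ceval_datasets = [dict(abbr='ceval-demo')]
mmlu_datasets = [dict(abbr='mmlu-demo')]
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
print([d['abbr'] for d in datasets])  # ['ceval-demo', 'mmlu-demo']
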
configs/datasets/collections/leaderboard/qwen_chat.py (new file, 51 lines)
@@ -0,0 +1,51 @@
from mmengine.config import read_base

with read_base():
    from ...ceval.ceval_gen_5f30c7 import ceval_datasets
    from ...agieval.agieval_mixed_2f14ad import agieval_datasets
    from ...mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from ...cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
    from ...GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
    from ...ARC_c.ARC_c_ppl_2ef631 import ARC_c_datasets
    from ...ARC_e.ARC_e_ppl_2ef631 import ARC_e_datasets

    from ...SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
    from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
    from ...CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
    from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
    from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
    from ...flores.flores_gen_806ede import flores_datasets

    from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
    from ...commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
    from ...triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
    from ...nq.nq_gen_c788f6 import nq_datasets

    from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
    from ...race.race_gen_69ee4f import race_datasets
    from ...obqa.obqa_ppl_6aac9e import obqa_datasets
    from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
    from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
    from ...Xsum.Xsum_gen_31397e import Xsum_datasets
    from ...FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
    from ...lambada.lambada_gen_217e11 import lambada_datasets

    from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
    from ...CLUE_ocnli.CLUE_ocnli_ppl_fdc6de import ocnli_datasets
    from ...SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
    from ...SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
    from ...SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
    from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
    from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
    from ...hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets
    from ...piqa.piqa_ppl_0cfff2 import piqa_datasets
    from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
    from ...math.math_gen_265cce import math_datasets
    from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from ...drop.drop_gen_599f07 import drop_datasets
    from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
    from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
    from ...bbh.bbh_gen_6bd693 import bbh_datasets


datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

configs/eval_qwen_7b.py (new file, 58 lines)
@@ -0,0 +1,58 @@
from mmengine.config import read_base

with read_base():
    from .models.hf_qwen_7b import models
    from .datasets.collections.leaderboard.qwen import datasets
    from .summarizers.leaderboard import summarizer

'''
dataset                                 version    metric            mode    qwen-7b-hf
--------------------------------------  ---------  ----------------  ------  ------------
--------- 考试 Exam ---------             -          -                 -       -
ceval                                   -          naive_average     ppl     58.65
agieval                                 -          naive_average     mixed   40.49
mmlu                                    -          naive_average     ppl     57.78
cmmlu                                   -          naive_average     ppl     58.57
GaokaoBench                             -          weighted_average  mixed   51.76
ARC-c                                   72cf91     accuracy          gen     83.73
ARC-e                                   72cf91     accuracy          gen     90.65
--------- 语言 Language ---------         -          -                 -       -
WiC                                     ce62e6     accuracy          ppl     51.10
chid-dev                                25f3d3     accuracy          ppl     86.63
afqmc-dev                               cc328c     accuracy          ppl     69.00
WSC                                     678cb5     accuracy          ppl     63.46
tydiqa-goldp                            -          naive_average     gen     19.98
flores_100                              -          naive_average     gen     3.20
--------- 知识 Knowledge ---------        -          -                 -       -
BoolQ                                   463fee     accuracy          ppl     83.00
commonsense_qa                          0d8e25     accuracy          ppl     67.49
triviaqa                                b6904f     score             gen     40.45
nq                                      b6904f     score             gen     14.16
--------- 理解 Understanding ---------    -          -                 -       -
C3                                      e6778d     accuracy          gen     75.29
race-middle                             73bdec     accuracy          ppl     90.53
race-high                               73bdec     accuracy          ppl     87.71
openbookqa_fact                         fa871c     accuracy          gen     92.20
csl_dev                                 3c4211     accuracy          ppl     56.25
lcsts                                   0b3969     rouge1            gen     12.38
Xsum                                    207e69     rouge1            gen     36.00
eprstmt-dev                             101429     accuracy          gen     89.38
lambada                                 de1af2     accuracy          gen     67.88
--------- 推理 Reasoning ---------        -          -                 -       -
cmnli                                   15e783     accuracy          ppl     54.85
ocnli                                   1471e7     accuracy          gen     42.34
AX_b                                    793c72     accuracy          gen     58.61
AX_g                                    c4c886     accuracy          gen     69.10
RTE                                     c4c886     accuracy          gen     57.76
COPA                                    59f42c     accuracy          gen     88.00
ReCoRD                                  3e0689     score             gen     27.78
hellaswag                               06a1e2     accuracy          gen     92.47
piqa                                    24369d     accuracy          gen     78.02
siqa                                    ea30d1     accuracy          ppl     75.03
math                                    2c0b9e     accuracy          gen     11.06
gsm8k                                   4c7f6e     accuracy          gen     50.87
drop                                    53a0a7     score             gen     44.95
openai_humaneval                        dd0dff     humaneval_pass@1  gen     23.78
mbpp                                    60ca11     score             gen     31.20
bbh                                     -          naive_average     gen     40.03
'''

configs/eval_qwen_7b_chat.py (new file, 58 lines)
@@ -0,0 +1,58 @@
from mmengine.config import read_base

with read_base():
    from .models.hf_qwen_7b_chat import models
    from .datasets.collections.leaderboard.qwen_chat import datasets
    from .summarizers.leaderboard import summarizer

'''
dataset                                 version    metric            mode    qwen-7b-chat-hf
--------------------------------------  ---------  ----------------  ------  -----------------
--------- 考试 Exam ---------             -          -                 -       -
ceval                                   -          naive_average     gen     56.07
agieval                                 -          naive_average     mixed   39.51
mmlu                                    -          naive_average     gen     53.49
cmmlu                                   -          naive_average     gen     55.29
GaokaoBench                             -          weighted_average  gen     48.01
ARC-c                                   ca1e8e     accuracy          ppl     74.92
ARC-e                                   ca1e8e     accuracy          ppl     85.71
--------- 语言 Language ---------         -          -                 -       -
WiC                                     efbd01     accuracy          gen     51.41
chid-dev                                25f3d3     accuracy          ppl     77.72
afqmc-dev                               4a1636     accuracy          gen     69.00
WSC                                     678cb5     accuracy          ppl     67.31
tydiqa-goldp                            -          naive_average     gen     15.32
flores_100                              -          naive_average     gen     10.00
--------- 知识 Knowledge ---------        -          -                 -       -
BoolQ                                   463fee     accuracy          ppl     83.18
commonsense_qa                          ddaabf     accuracy          gen     76.41
triviaqa                                b6904f     score             gen     43.25
nq                                      23dc1a     score             gen     16.26
--------- 理解 Understanding ---------    -          -                 -       -
C3                                      e6778d     accuracy          gen     81.53
race-middle                             e0908b     accuracy          gen     83.01
race-high                               e0908b     accuracy          gen     77.79
openbookqa_fact                         49689a     accuracy          ppl     86.40
csl_dev                                 3c4211     accuracy          ppl     64.38
lcsts                                   0b3969     rouge1            gen     12.75
Xsum                                    207e69     rouge1            gen     20.21
eprstmt-dev                             ed0c5d     accuracy          ppl     85.00
lambada                                 de1af2     accuracy          gen     59.19
--------- 推理 Reasoning ---------        -          -                 -       -
cmnli                                   15e783     accuracy          ppl     48.08
ocnli                                   15e783     accuracy          ppl     51.40
AX_b                                    689df1     accuracy          ppl     65.67
AX_g                                    808a19     accuracy          ppl     76.12
RTE                                     808a19     accuracy          ppl     68.95
COPA                                    59f42c     accuracy          gen     92.00
ReCoRD                                  6f7cfc     score             gen     0.16
hellaswag                               8d79e0     accuracy          ppl     69.28
piqa                                    34eee7     accuracy          ppl     72.20
siqa                                    ea30d1     accuracy          ppl     72.88
math                                    2c0b9e     accuracy          gen     7.84
gsm8k                                   4c7f6e     accuracy          gen     45.41
drop                                    53a0a7     score             gen     39.62
openai_humaneval                        dd0dff     humaneval_pass@1  gen     10.98
mbpp                                    60ca11     score             gen     20.60
bbh                                     -          naive_average     gen     42.61
'''

configs/models/hf_qwen_7b.py (new file, 32 lines)
@@ -0,0 +1,32 @@
from opencompass.models import HuggingFaceCausalLM

# Please note that we have specified the revision here. Recently (on 20230827),
# during our evaluations, we found that the newer revision models have a drop
# of more than 5 points on datasets like GaokaoBench / mbpp.
# We are not yet sure whether this drop is due to incorrect logic in OpenCompass
# calling qwen or some other reasons. We would like to highlight this.

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen-7b-hf',
        path="Qwen/Qwen-7B",
        tokenizer_path='Qwen/Qwen-7B',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
            revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09'
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
            revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09'
        ),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
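
Given the revision warning in the comment above, it can be worth confirming the pinned snapshot still resolves before launching a long run. A possible pre-flight check (my sketch; assumes the huggingface_hub package, which the HuggingFace backend already pulls in):

# Sketch: verify the pinned Qwen-7B revision is fetchable, metadata only.
from huggingface_hub import snapshot_download

snapshot_download('Qwen/Qwen-7B',
                  revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09',
                  allow_patterns=['config.json'])
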
configs/models/hf_qwen_7b_chat.py (new file, 29 lines)
@@ -0,0 +1,29 @@
from opencompass.models import HuggingFaceCausalLM


_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'),
        dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True),
    ],
)

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen-7b-chat-hf',
        path="Qwen/Qwen-7B-Chat",
        tokenizer_path='Qwen/Qwen-7B-Chat',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        meta_template=_meta_template,
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
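
To make _meta_template concrete: each round wraps the user turn in ChatML-style markers and starts generation right after the assistant header, which is the dialogue format Qwen-7B-Chat was tuned on. An illustrative rendering of one round (my sketch of the intended string, not OpenCompass internals):

# Sketch: one round rendered under the meta template above.
human = '\n<|im_start|>user\n' + 'What is the capital of France?' + '<|im_end|>'
bot = '\n<|im_start|>assistant\n'  # generate=True: the model continues here
print(human + bot)
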
@@ -3,10 +3,13 @@ from mmengine.config import read_base
 with read_base():
     from .groups.agieval import agieval_summary_groups
     from .groups.mmlu import mmlu_summary_groups
+    from .groups.cmmlu import cmmlu_summary_groups
     from .groups.ceval import ceval_summary_groups
     from .groups.bbh import bbh_summary_groups
     from .groups.GaokaoBench import GaokaoBench_summary_groups
     from .groups.flores import flores_summary_groups
+    from .groups.tydiqa import tydiqa_summary_groups
+    from .groups.xiezhi import xiezhi_summary_groups

 summarizer = dict(
     summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),

configs/summarizers/groups/cmmlu.py (new file, 104 lines)
@@ -0,0 +1,104 @@
subcategories = {
    "agronomy": ['other'],
    "anatomy": ['biology'],
    "ancient_chinese": ['linguistics', 'china specific'],
    "arts": ['arts'],
    "astronomy": ['physics'],
    "business_ethics": ['business'],
    "chinese_civil_service_exam": ['politics', 'china specific'],
    "chinese_driving_rule": ['other', 'china specific'],
    "chinese_food_culture": ['culture', 'china specific'],
    "chinese_foreign_policy": ['politics', 'china specific'],
    "chinese_history": ['history', 'china specific'],
    "chinese_literature": ['literature', 'china specific'],
    "chinese_teacher_qualification": ['education', 'china specific'],
    "college_actuarial_science": ['math'],
    "college_education": ['education'],
    "college_engineering_hydrology": ['engineering'],
    "college_law": ['law'],
    "college_mathematics": ['math'],
    "college_medical_statistics": ['statistics'],
    "clinical_knowledge": ['other'],
    "college_medicine": ['other'],
    "computer_science": ['computer science'],
    "computer_security": ['other'],
    "conceptual_physics": ['physics'],
    "construction_project_management": ['other', 'china specific'],
    "economics": ['economics'],
    "education": ['education'],
    "elementary_chinese": ['linguistics', 'china specific'],
    "elementary_commonsense": ['other', 'china specific'],
    "elementary_information_and_technology": ['other'],
    "electrical_engineering": ['engineering'],
    "elementary_mathematics": ['math'],
    "ethnology": ['culture', 'china specific'],
    "food_science": ['other'],
    "genetics": ['biology'],
    "global_facts": ['global'],
    "high_school_biology": ['biology'],
    "high_school_chemistry": ['chemistry'],
    "high_school_geography": ['geography'],
    "high_school_mathematics": ['math'],
    "high_school_physics": ['physics'],
    "high_school_politics": ['politics', 'china specific'],
    "human_sexuality": ['other'],
    "international_law": ['law'],
    "journalism": ['sociology'],
    "jurisprudence": ['law'],
    "legal_and_moral_basis": ['other'],
    "logical": ['philosophy'],
    "machine_learning": ['computer science'],
    "management": ['business'],
    "marketing": ['business'],
    "marxist_theory": ['philosophy'],
    "modern_chinese": ['linguistics', 'china specific'],
    "nutrition": ['other'],
    "philosophy": ['philosophy'],
    "professional_accounting": ['business'],
    "professional_law": ['law'],
    "professional_medicine": ['other'],
    "professional_psychology": ['psychology'],
    "public_relations": ['politics'],
    "security_study": ['politics'],
    "sociology": ['culture'],
    "sports_science": ['other'],
    "traditional_chinese_medicine": ['other', 'china specific'],
    "virology": ['biology'],
    "world_history": ['history'],
    "world_religions": ['global'],
}

categories = {
    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"],
    "Humanities": ["history", "philosophy", "law", "arts", "literature", "global"],
    "Social Science": ["linguistics", "business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"],
    "Other": ["other"],
    "China specific": ["china specific"],
}

category2subject = {}
for k, v in categories.items():
    for subject, subcat in subcategories.items():
        for c in subcat:
            if c in v:
                category2subject.setdefault(k, []).append(subject)

cmmlu_summary_groups = []

_cmmlu_humanities = ['cmmlu-' + s for s in category2subject['Humanities']]
cmmlu_summary_groups.append({'name': 'cmmlu-humanities', 'subsets': _cmmlu_humanities})

_cmmlu_stem = ['cmmlu-' + s for s in category2subject['STEM']]
cmmlu_summary_groups.append({'name': 'cmmlu-stem', 'subsets': _cmmlu_stem})

_cmmlu_social_science = ['cmmlu-' + s for s in category2subject['Social Science']]
cmmlu_summary_groups.append({'name': 'cmmlu-social-science', 'subsets': _cmmlu_social_science})

_cmmlu_other = ['cmmlu-' + s for s in category2subject['Other']]
cmmlu_summary_groups.append({'name': 'cmmlu-other', 'subsets': _cmmlu_other})

_cmmlu_china_specific = ['cmmlu-' + s for s in category2subject['China specific']]
cmmlu_summary_groups.append({'name': 'cmmlu-china-specific', 'subsets': _cmmlu_china_specific})

_cmmlu_all = ['cmmlu-' + s for s in subcategories.keys()]
cmmlu_summary_groups.append({'name': 'cmmlu', 'subsets': _cmmlu_all})
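
The nested loop above inverts the subject-to-tag map into category2subject. One property the per-category groups implicitly depend on is that every CMMLU subject lands in at least one category; a quick check (hypothetical, runnable at the end of this config):

# Sketch: assert the category groups jointly cover all CMMLU subjects.
covered = {s for subjects in category2subject.values() for s in subjects}
assert covered == set(subcategories), set(subcategories) - covered
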
configs/summarizers/groups/tydiqa.py (new file, 5 lines)
@@ -0,0 +1,5 @@
tydiqa_summary_groups = []

_tydiqa = ['arabic', 'bengali', 'english', 'finnish', 'indonesian', 'japanese', 'korean', 'russian', 'swahili', 'telugu', 'thai']
_tydiqa = ['tyidqa-goldp_' + s for s in _tydiqa]
tydiqa_summary_groups.append({'name': 'tydiqa-goldp', 'subsets': _tydiqa})

configs/summarizers/groups/xiezhi.py (new file, 4 lines)
@@ -0,0 +1,4 @@
xiezhi_summary_groups = []

_xiezhi = ["xiezhi-spec_eng", "xiezhi-spec_chn", "xiezhi-inter_eng", "xiezhi-inter_chn"]
xiezhi_summary_groups.append({'name': 'xiezhi', 'subsets': _xiezhi})

configs/summarizers/leaderboard.py (new file, 89 lines)
@@ -0,0 +1,89 @@
from mmengine.config import read_base

with read_base():
    from .groups.agieval import agieval_summary_groups
    from .groups.mmlu import mmlu_summary_groups
    from .groups.cmmlu import cmmlu_summary_groups
    from .groups.ceval import ceval_summary_groups
    from .groups.bbh import bbh_summary_groups
    from .groups.GaokaoBench import GaokaoBench_summary_groups
    from .groups.flores import flores_summary_groups
    from .groups.tydiqa import tydiqa_summary_groups
    from .groups.xiezhi import xiezhi_summary_groups


summarizer = dict(
    dataset_abbrs=[
        '--------- 考试 Exam ---------',  # category
        # 'Mixed',  # subcategory
        'ceval',
        'agieval',
        'mmlu',
        'cmmlu',
        'GaokaoBench',
        'ARC-c',
        'ARC-e',
        '--------- 语言 Language ---------',  # category
        # '字词释义' (word definitions),  # subcategory
        'WiC',
        # '成语习语' (idioms),  # subcategory
        'chid-dev',
        # '语义相似度' (semantic similarity),  # subcategory
        'afqmc-dev',
        # '指代消解' (coreference resolution),  # subcategory
        'WSC',
        # '多语种问答' (multilingual QA),  # subcategory
        'tydiqa-goldp',
        # '翻译' (translation),  # subcategory
        'flores_100',
        '--------- 知识 Knowledge ---------',  # category
        # '知识问答' (knowledge QA),  # subcategory
        'BoolQ',
        'commonsense_qa',
        'triviaqa',
        'nq',
        '--------- 理解 Understanding ---------',  # category
        # '阅读理解' (reading comprehension),  # subcategory
        'C3',
        'race-middle',
        'race-high',
        'openbookqa_fact',
        # '内容总结' (summarization),  # subcategory
        'csl_dev',
        'lcsts',
        'Xsum',
        # '内容分析' (content analysis),  # subcategory
        'eprstmt-dev',
        'lambada',
        '--------- 推理 Reasoning ---------',  # category
        # '文本蕴含' (textual entailment),  # subcategory
        'cmnli',
        'ocnli',
        'AX_b',
        'AX_g',
        'RTE',
        # '常识推理' (commonsense reasoning),  # subcategory
        'COPA',
        'ReCoRD',
        'hellaswag',
        'piqa',
        'siqa',
        # '数学推理' (mathematical reasoning),  # subcategory
        'math',
        'gsm8k',
        # '定理应用' (theorem application),  # subcategory
        # '阅读理解' (reading comprehension),  # subcategory
        'drop',
        # '代码' (code),  # subcategory
        'openai_humaneval',
        'mbpp',
        # '综合推理' (comprehensive reasoning),  # subcategory
        'bbh',
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], []),
    prompt_db=dict(
        database_path='configs/datasets/log.json',
        config_dir='configs/datasets',
        blacklist='.promptignore'),
)

@@ -3,11 +3,14 @@ from mmengine.config import read_base
 with read_base():
     from .groups.agieval import agieval_summary_groups
     from .groups.mmlu import mmlu_summary_groups
+    from .groups.cmmlu import cmmlu_summary_groups
     from .groups.ceval import ceval_summary_groups
     from .groups.bbh import bbh_summary_groups
     from .groups.GaokaoBench import GaokaoBench_summary_groups
     from .groups.flores import flores_summary_groups
     from .groups.jigsaw_multilingual import jigsaw_multilingual_summary_groups
+    from .groups.tydiqa import tydiqa_summary_groups
+    from .groups.xiezhi import xiezhi_summary_groups

 summarizer = dict(
     dataset_abbrs=[

@@ -3,10 +3,13 @@ from mmengine.config import read_base
 with read_base():
     from .groups.agieval import agieval_summary_groups
     from .groups.mmlu import mmlu_summary_groups
+    from .groups.cmmlu import cmmlu_summary_groups
     from .groups.ceval import ceval_summary_groups
     from .groups.bbh import bbh_summary_groups
     from .groups.GaokaoBench import GaokaoBench_summary_groups
     from .groups.flores import flores_summary_groups
+    from .groups.tydiqa import tydiqa_summary_groups
+    from .groups.xiezhi import xiezhi_summary_groups

 summarizer = dict(
     dataset_abbrs = [

@@ -43,6 +43,33 @@ class ReCoRDDataset(BaseDataset):
        return dataset


class ReCoRDDataset_V2(BaseDataset):

    @staticmethod
    def load(path: str):
        with open(path, 'r', errors='ignore') as in_f:
            rows = []
            for i, line in enumerate(in_f):
                sample = json.loads(line.strip())
                text = sample['passage']['text'].replace(
                    '@highlight', '').replace('\n\n', '\n')
                for qas_dict in sample['qas']:
                    query = qas_dict['query'].replace('@placeholder', '____')
                    answers = [
                        answer_dict['text']
                        for answer_dict in qas_dict['answers']
                    ]
                    rows.append({
                        'text': text,
                        'question': query,
                        'answers': answers
                    })

            dataset = Dataset.from_list(rows)
            return dataset


@TEXT_POSTPROCESSORS.register_module('ReCoRD')
def ReCoRD_postprocess(text: str) -> str:
    text = text.strip().split('\n')[0].replace('Answer: ', '').strip()
    return text
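
For reference, load() above only touches a few fields of each val.jsonl line; the shape it assumes looks like this (field names taken from the code, values invented):

# Sketch: one ReCoRD record as consumed by ReCoRDDataset_V2.load().
sample = {
    'passage': {'text': 'Storm Debby hit Florida.\n\n@highlight\nDebby weakened'},
    'qas': [{
        'query': '@placeholder made landfall on Monday.',
        'answers': [{'text': 'Debby'}],
    }],
}
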
@@ -154,7 +154,7 @@ class SizePartitioner(BasePartitioner):
                 fnmatch(dataset_abbr, pattern)
                 for pattern in ('bbh*', 'gsm8k*', 'math*', 'strategyqa*',
                                 'agieval-jec*', 'agieval-gaokao-mathcloze',
-                                'agieval-math')):
+                                'agieval-math', '*professional_law')):
             factor *= 10

         return factor
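
The one-line change widens the set of globs whose datasets get a 10x size factor, so the partitioner splits them into smaller chunks; presumably this targets the slow professional_law subsets, such as the CMMLU one added in this commit. In isolation the check behaves like this (my sketch):

# Sketch: '*professional_law' now matches the CMMLU law subset's abbr.
from fnmatch import fnmatch

patterns = ('bbh*', 'gsm8k*', 'math*', 'strategyqa*', 'agieval-jec*',
            'agieval-gaokao-mathcloze', 'agieval-math', '*professional_law')
print(any(fnmatch('cmmlu-professional_law', p) for p in patterns))  # True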