mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Feature] Add qwen & qwen-chat support (#286)
* add and apply update suffix tool * add tool doc * add qwen configs * add cmmlu * rename bbh * update datasets * delete * update hf_qwen_7b.py
This commit is contained in:
parent
fd389e2d78
commit
7ca6ba625e
@ -0,0 +1,35 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import EMEvaluator
from opencompass.datasets import ReCoRDDataset_V2, ReCoRD_postprocess

# Reader: each sample provides the passage ('text') and the cloze-style query
# ('question'); the gold answer spans live in 'answers'.
ReCoRD_reader_cfg = dict(
    input_columns=['question', 'text'], output_column='answers')

# Zero-shot generation: ask the model which entity the '____' placeholder
# refers to, then extract the answer with ReCoRD_postprocess.
ReCoRD_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN", prompt="Passage:\n{text}\nResult:\n{question}\nQuestion:\nWhat entity does ____ refer to in the Result?\nAnswer:"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

# Exact-match scoring on the model ('BOT') turn after post-processing.
ReCoRD_eval_cfg = dict(
    evaluator=dict(type=EMEvaluator),
    pred_role='BOT',
    pred_postprocessor=dict(type=ReCoRD_postprocess))

ReCoRD_datasets = [
    dict(
        type=ReCoRDDataset_V2,
        abbr='ReCoRD',
        path='./data/SuperGLUE/ReCoRD/val.jsonl',
        reader_cfg=ReCoRD_reader_cfg,
        infer_cfg=ReCoRD_infer_cfg,
        eval_cfg=ReCoRD_eval_cfg)
]
|
105
configs/datasets/bbh/bbh_gen_e3d13a.py
Normal file
105
configs/datasets/bbh/bbh_gen_e3d13a.py
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
from os.path import exists

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess

bbh_reader_cfg = dict(input_columns=["input"], output_column="target")

_path_prefix = "./data/BBH"

# BBH tasks whose answers are multiple-choice labels (scored with accuracy
# after bbh_mcq_postprocess).
bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
# BBH tasks with free-form answers (scored with BBHEvaluator).
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]

bbh_datasets = []
for _name in bbh_multiple_choice_sets:
    _hint = None
    # Few-shot CoT exemplars shipped alongside the BBH data. When the prompt
    # library is absent _hint stays None, so the literal string "None" lands
    # in the prompt — kept as-is to preserve existing behaviour.
    if exists(f"{_path_prefix}/lib_prompt/{_name}.txt"):
        with open(f"{_path_prefix}/lib_prompt/{_name}.txt", 'r') as _f:
            _hint = _f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role="HUMAN",
                    prompt=
                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_role="BOT",
        pred_postprocessor=dict(type=bbh_mcq_postprocess),
        dataset_postprocessor=dict(type=bbh_mcq_postprocess))

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path=f"{_path_prefix}/data",
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

for _name in bbh_free_form_sets:
    _hint = None
    if exists(f"{_path_prefix}/lib_prompt/{_name}.txt"):
        with open(f"{_path_prefix}/lib_prompt/{_name}.txt", 'r') as _f:
            _hint = _f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role="HUMAN",
                    prompt=
                    f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: "
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))
    bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role="BOT")

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path=f"{_path_prefix}/data",
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

# Keep the module namespace clean: OpenCompass collects every top-level name,
# so loop temporaries must not leak.
del _name, _hint, _path_prefix
|
51
configs/datasets/collections/leaderboard/qwen.py
Normal file
51
configs/datasets/collections/leaderboard/qwen.py
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
from mmengine.config import read_base

# Leaderboard dataset collection for the qwen base model. Imports are grouped
# by leaderboard category: Exam / Language / Knowledge / Understanding /
# Reasoning.
with read_base():
    from ...ceval.ceval_ppl_578f8d import ceval_datasets
    from ...agieval.agieval_mixed_2f14ad import agieval_datasets
    from ...mmlu.mmlu_ppl_ac766d import mmlu_datasets
    from ...cmmlu.cmmlu_ppl_8b9c76 import cmmlu_datasets
    from ...GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
    from ...ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
    from ...ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets

    from ...SuperGLUE_WiC.SuperGLUE_WiC_ppl_312de9 import WiC_datasets
    from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
    from ...CLUE_afqmc.CLUE_afqmc_ppl_6507d7 import afqmc_datasets
    from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
    from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
    from ...flores.flores_gen_806ede import flores_datasets

    from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
    from ...commonsenseqa.commonsenseqa_ppl_5545e2 import commonsenseqa_datasets
    from ...triviaqa.triviaqa_gen_0356ec import triviaqa_datasets
    from ...nq.nq_gen_0356ec import nq_datasets

    from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
    from ...race.race_ppl_5831a0 import race_datasets
    from ...obqa.obqa_gen_9069e4 import obqa_datasets
    from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
    from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
    from ...Xsum.Xsum_gen_31397e import Xsum_datasets
    from ...FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
    from ...lambada.lambada_gen_217e11 import lambada_datasets

    from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
    from ...CLUE_ocnli.CLUE_ocnli_gen_c4cb6c import ocnli_datasets
    from ...SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
    from ...SuperGLUE_AX_g.SuperGLUE_AX_g_gen_68aac7 import AX_g_datasets
    from ...SuperGLUE_RTE.SuperGLUE_RTE_gen_68aac7 import RTE_datasets
    from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
    from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_a69961 import ReCoRD_datasets
    from ...hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
    from ...piqa.piqa_gen_1194eb import piqa_datasets
    from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
    from ...math.math_gen_265cce import math_datasets
    from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from ...drop.drop_gen_599f07 import drop_datasets
    from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
    from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
    from ...bbh.bbh_gen_e3d13a import bbh_datasets


# Concatenate every imported *_datasets list into the single flat list that
# OpenCompass expects under the name 'datasets'.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
51
configs/datasets/collections/leaderboard/qwen_chat.py
Normal file
51
configs/datasets/collections/leaderboard/qwen_chat.py
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
from mmengine.config import read_base

# Leaderboard dataset collection for the qwen chat model (gen/ppl variants
# chosen for chat-style evaluation). Imports are grouped by leaderboard
# category: Exam / Language / Knowledge / Understanding / Reasoning.
with read_base():
    from ...ceval.ceval_gen_5f30c7 import ceval_datasets
    from ...agieval.agieval_mixed_2f14ad import agieval_datasets
    from ...mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from ...cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
    from ...GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
    from ...ARC_c.ARC_c_ppl_2ef631 import ARC_c_datasets
    from ...ARC_e.ARC_e_ppl_2ef631 import ARC_e_datasets

    from ...SuperGLUE_WiC.SuperGLUE_WiC_gen_d06864 import WiC_datasets
    from ...FewCLUE_chid.FewCLUE_chid_ppl_8f2872 import chid_datasets
    from ...CLUE_afqmc.CLUE_afqmc_gen_901306 import afqmc_datasets
    from ...SuperGLUE_WSC.SuperGLUE_WSC_ppl_003529 import WSC_datasets
    from ...tydiqa.tydiqa_gen_978d2a import tydiqa_datasets
    from ...flores.flores_gen_806ede import flores_datasets

    from ...SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
    from ...commonsenseqa.commonsenseqa_gen_c946f2 import commonsenseqa_datasets
    from ...triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
    from ...nq.nq_gen_c788f6 import nq_datasets

    from ...CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
    from ...race.race_gen_69ee4f import race_datasets
    from ...obqa.obqa_ppl_6aac9e import obqa_datasets
    from ...FewCLUE_csl.FewCLUE_csl_ppl_841b62 import csl_datasets
    from ...lcsts.lcsts_gen_8ee1fe import lcsts_datasets
    from ...Xsum.Xsum_gen_31397e import Xsum_datasets
    from ...FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
    from ...lambada.lambada_gen_217e11 import lambada_datasets

    from ...CLUE_cmnli.CLUE_cmnli_ppl_fdc6de import cmnli_datasets
    from ...CLUE_ocnli.CLUE_ocnli_ppl_fdc6de import ocnli_datasets
    from ...SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
    from ...SuperGLUE_AX_g.SuperGLUE_AX_g_ppl_66caf3 import AX_g_datasets
    from ...SuperGLUE_RTE.SuperGLUE_RTE_ppl_66caf3 import RTE_datasets
    from ...SuperGLUE_COPA.SuperGLUE_COPA_gen_91ca53 import COPA_datasets
    from ...SuperGLUE_ReCoRD.SuperGLUE_ReCoRD_gen_30dea0 import ReCoRD_datasets
    from ...hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets
    from ...piqa.piqa_ppl_0cfff2 import piqa_datasets
    from ...siqa.siqa_ppl_e8d8c5 import siqa_datasets
    from ...math.math_gen_265cce import math_datasets
    from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from ...drop.drop_gen_599f07 import drop_datasets
    from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
    from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
    from ...bbh.bbh_gen_6bd693 import bbh_datasets


# Concatenate every imported *_datasets list into the single flat list that
# OpenCompass expects under the name 'datasets'.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
58
configs/eval_qwen_7b.py
Normal file
58
configs/eval_qwen_7b.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
from mmengine.config import read_base

# Top-level evaluation entry for qwen-7b (base): model + leaderboard dataset
# collection + leaderboard summarizer.
with read_base():
    from .models.hf_qwen_7b import models
    from .datasets.collections.leaderboard.qwen import datasets
    from .summarizers.leaderboard import summarizer

# Reference results recorded for this config (kept for regression comparison).
'''
dataset                                 version    metric            mode      qwen-7b-hf
--------------------------------------  ---------  ----------------  ------  ------------
--------- 考试 Exam ---------           -          -                 -       -
ceval                                   -          naive_average     ppl     58.65
agieval                                 -          naive_average     mixed   40.49
mmlu                                    -          naive_average     ppl     57.78
cmmlu                                   -          naive_average     ppl     58.57
GaokaoBench                             -          weighted_average  mixed   51.76
ARC-c                                   72cf91     accuracy          gen     83.73
ARC-e                                   72cf91     accuracy          gen     90.65
--------- 语言 Language ---------       -          -                 -       -
WiC                                     ce62e6     accuracy          ppl     51.10
chid-dev                                25f3d3     accuracy          ppl     86.63
afqmc-dev                               cc328c     accuracy          ppl     69.00
WSC                                     678cb5     accuracy          ppl     63.46
tydiqa-goldp                            -          naive_average     gen     19.98
flores_100                              -          naive_average     gen     3.20
--------- 知识 Knowledge ---------      -          -                 -       -
BoolQ                                   463fee     accuracy          ppl     83.00
commonsense_qa                          0d8e25     accuracy          ppl     67.49
triviaqa                                b6904f     score             gen     40.45
nq                                      b6904f     score             gen     14.16
--------- 理解 Understanding ---------  -          -                 -       -
C3                                      e6778d     accuracy          gen     75.29
race-middle                             73bdec     accuracy          ppl     90.53
race-high                               73bdec     accuracy          ppl     87.71
openbookqa_fact                         fa871c     accuracy          gen     92.20
csl_dev                                 3c4211     accuracy          ppl     56.25
lcsts                                   0b3969     rouge1            gen     12.38
Xsum                                    207e69     rouge1            gen     36.00
eprstmt-dev                             101429     accuracy          gen     89.38
lambada                                 de1af2     accuracy          gen     67.88
--------- 推理 Reasoning ---------      -          -                 -       -
cmnli                                   15e783     accuracy          ppl     54.85
ocnli                                   1471e7     accuracy          gen     42.34
AX_b                                    793c72     accuracy          gen     58.61
AX_g                                    c4c886     accuracy          gen     69.10
RTE                                     c4c886     accuracy          gen     57.76
COPA                                    59f42c     accuracy          gen     88.00
ReCoRD                                  3e0689     score             gen     27.78
hellaswag                               06a1e2     accuracy          gen     92.47
piqa                                    24369d     accuracy          gen     78.02
siqa                                    ea30d1     accuracy          ppl     75.03
math                                    2c0b9e     accuracy          gen     11.06
gsm8k                                   4c7f6e     accuracy          gen     50.87
drop                                    53a0a7     score             gen     44.95
openai_humaneval                        dd0dff     humaneval_pass@1  gen     23.78
mbpp                                    60ca11     score             gen     31.20
bbh                                     -          naive_average     gen     40.03
'''
|
58
configs/eval_qwen_7b_chat.py
Normal file
58
configs/eval_qwen_7b_chat.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
from mmengine.config import read_base

# Top-level evaluation entry for qwen-7b-chat: model + chat-oriented
# leaderboard dataset collection + leaderboard summarizer.
with read_base():
    from .models.hf_qwen_7b_chat import models
    from .datasets.collections.leaderboard.qwen_chat import datasets
    from .summarizers.leaderboard import summarizer

# Reference results recorded for this config (kept for regression comparison).
'''
dataset                                 version    metric            mode      qwen-7b-chat-hf
--------------------------------------  ---------  ----------------  ------  -----------------
--------- 考试 Exam ---------           -          -                 -       -
ceval                                   -          naive_average     gen     56.07
agieval                                 -          naive_average     mixed   39.51
mmlu                                    -          naive_average     gen     53.49
cmmlu                                   -          naive_average     gen     55.29
GaokaoBench                             -          weighted_average  gen     48.01
ARC-c                                   ca1e8e     accuracy          ppl     74.92
ARC-e                                   ca1e8e     accuracy          ppl     85.71
--------- 语言 Language ---------       -          -                 -       -
WiC                                     efbd01     accuracy          gen     51.41
chid-dev                                25f3d3     accuracy          ppl     77.72
afqmc-dev                               4a1636     accuracy          gen     69.00
WSC                                     678cb5     accuracy          ppl     67.31
tydiqa-goldp                            -          naive_average     gen     15.32
flores_100                              -          naive_average     gen     10.00
--------- 知识 Knowledge ---------      -          -                 -       -
BoolQ                                   463fee     accuracy          ppl     83.18
commonsense_qa                          ddaabf     accuracy          gen     76.41
triviaqa                                b6904f     score             gen     43.25
nq                                      23dc1a     score             gen     16.26
--------- 理解 Understanding ---------  -          -                 -       -
C3                                      e6778d     accuracy          gen     81.53
race-middle                             e0908b     accuracy          gen     83.01
race-high                               e0908b     accuracy          gen     77.79
openbookqa_fact                         49689a     accuracy          ppl     86.40
csl_dev                                 3c4211     accuracy          ppl     64.38
lcsts                                   0b3969     rouge1            gen     12.75
Xsum                                    207e69     rouge1            gen     20.21
eprstmt-dev                             ed0c5d     accuracy          ppl     85.00
lambada                                 de1af2     accuracy          gen     59.19
--------- 推理 Reasoning ---------      -          -                 -       -
cmnli                                   15e783     accuracy          ppl     48.08
ocnli                                   15e783     accuracy          ppl     51.40
AX_b                                    689df1     accuracy          ppl     65.67
AX_g                                    808a19     accuracy          ppl     76.12
RTE                                     808a19     accuracy          ppl     68.95
COPA                                    59f42c     accuracy          gen     92.00
ReCoRD                                  6f7cfc     score             gen     0.16
hellaswag                               8d79e0     accuracy          ppl     69.28
piqa                                    34eee7     accuracy          ppl     72.20
siqa                                    ea30d1     accuracy          ppl     72.88
math                                    2c0b9e     accuracy          gen     7.84
gsm8k                                   4c7f6e     accuracy          gen     45.41
drop                                    53a0a7     score             gen     39.62
openai_humaneval                        dd0dff     humaneval_pass@1  gen     10.98
mbpp                                    60ca11     score             gen     20.60
bbh                                     -          naive_average     gen     42.61
'''
|
32
configs/models/hf_qwen_7b.py
Normal file
32
configs/models/hf_qwen_7b.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
from opencompass.models import HuggingFaceCausalLM

# Please note that we have specified the revision here. Recently (on 20230827),
# during our evaluations, we found that the newer revision models have a drop
# of more than 5 points on datasets like GaokaoBench / mbpp.
# We are not yet sure whether this drop is due to incorrect logic in OpenCompass
# calling qwen or some other reasons. We would like to highlight this.

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen-7b-hf',
        path="Qwen/Qwen-7B",
        tokenizer_path='Qwen/Qwen-7B',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
            # Pin the tokenizer to the evaluated snapshot (see note above).
            revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09'
        ),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
            # Same pinned snapshot for the model weights.
            revision='39fc5fdcb95c8c367bbdb3bfc0db71d96266de09'
        ),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
|
29
configs/models/hf_qwen_7b_chat.py
Normal file
29
configs/models/hf_qwen_7b_chat.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
from opencompass.models import HuggingFaceCausalLM

# ChatML-style dialogue wrapper used by Qwen chat models: each turn is framed
# with <|im_start|>{role} ... <|im_end|> markers; generation starts at the
# assistant turn.
_meta_template = dict(
    round=[
        dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'),
        dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True),
    ],
)

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen-7b-chat-hf',
        path="Qwen/Qwen-7B-Chat",
        tokenizer_path='Qwen/Qwen-7B-Chat',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,),
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        meta_template=_meta_template,
        model_kwargs=dict(device_map='auto', trust_remote_code=True),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
|
@ -3,10 +3,13 @@ from mmengine.config import read_base
|
|||||||
with read_base():
|
with read_base():
|
||||||
from .groups.agieval import agieval_summary_groups
|
from .groups.agieval import agieval_summary_groups
|
||||||
from .groups.mmlu import mmlu_summary_groups
|
from .groups.mmlu import mmlu_summary_groups
|
||||||
|
from .groups.cmmlu import cmmlu_summary_groups
|
||||||
from .groups.ceval import ceval_summary_groups
|
from .groups.ceval import ceval_summary_groups
|
||||||
from .groups.bbh import bbh_summary_groups
|
from .groups.bbh import bbh_summary_groups
|
||||||
from .groups.GaokaoBench import GaokaoBench_summary_groups
|
from .groups.GaokaoBench import GaokaoBench_summary_groups
|
||||||
from .groups.flores import flores_summary_groups
|
from .groups.flores import flores_summary_groups
|
||||||
|
from .groups.tydiqa import tydiqa_summary_groups
|
||||||
|
from .groups.xiezhi import xiezhi_summary_groups
|
||||||
|
|
||||||
summarizer = dict(
|
summarizer = dict(
|
||||||
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
|
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
|
||||||
|
104
configs/summarizers/groups/cmmlu.py
Normal file
104
configs/summarizers/groups/cmmlu.py
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
# Maps each CMMLU subject to its subcategory tag(s); subjects tagged
# 'china specific' additionally feed the China-specific summary group.
subcategories = {
    "agronomy": ['other'],
    "anatomy": ['biology'],
    "ancient_chinese": ['linguistics', 'china specific'],
    "arts": ['arts'],
    "astronomy": ['physics'],
    "business_ethics": ['business'],
    "chinese_civil_service_exam": ['politics', 'china specific'],
    "chinese_driving_rule": ['other', 'china specific'],
    "chinese_food_culture": ['culture', 'china specific'],
    "chinese_foreign_policy": ['politics', 'china specific'],
    "chinese_history": ['history', 'china specific'],
    "chinese_literature": ['literature', 'china specific'],
    "chinese_teacher_qualification": ['education', 'china specific'],
    "college_actuarial_science": ['math'],
    "college_education": ['education'],
    "college_engineering_hydrology": ['engineering'],
    "college_law": ['law'],
    "college_mathematics": ['math'],
    "college_medical_statistics": ['statistics'],
    "clinical_knowledge": ['other'],
    "college_medicine": ['other'],
    "computer_science": ['computer science'],
    "computer_security": ['other'],
    "conceptual_physics": ['physics'],
    "construction_project_management": ['other', 'china specific'],
    "economics": ['economics'],
    "education": ['education'],
    "elementary_chinese": ['linguistics', 'china specific'],
    "elementary_commonsense": ['other', 'china specific'],
    "elementary_information_and_technology": ['other'],
    "electrical_engineering": ['engineering'],
    "elementary_mathematics": ['math'],
    "ethnology": ['culture', 'china specific'],
    "food_science": ['other'],
    "genetics": ['biology'],
    "global_facts": ['global'],
    "high_school_biology": ['biology'],
    "high_school_chemistry": ['chemistry'],
    "high_school_geography": ['geography'],
    "high_school_mathematics": ['math'],
    "high_school_physics": ['physics'],
    "high_school_politics": ['politics', 'china specific'],
    "human_sexuality": ['other'],
    "international_law": ['law'],
    "journalism": ['sociology'],
    "jurisprudence": ['law'],
    "legal_and_moral_basis": ['other'],
    "logical": ['philosophy'],
    "machine_learning": ['computer science'],
    "management": ['business'],
    "marketing": ['business'],
    "marxist_theory": ['philosophy'],
    "modern_chinese": ['linguistics', 'china specific'],
    "nutrition": ['other'],
    "philosophy": ['philosophy'],
    "professional_accounting": ['business'],
    "professional_law": ['law'],
    "professional_medicine": ['other'],
    "professional_psychology": ['psychology'],
    "public_relations": ['politics'],
    "security_study": ['politics'],
    "sociology": ['culture'],
    "sports_science": ['other'],
    "traditional_chinese_medicine": ['other', 'china specific'],
    "virology": ['biology'],
    "world_history": ['history'],
    "world_religions": ['global'],
}

# Top-level categories, each listing the subcategory tags it aggregates.
categories = {
    "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"],
    "Humanities": ["history", "philosophy", "law", "arts", "literature", "global"],
    "Social Science": ['linguistics', "business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"],
    "Other": ["other"],
    "China specific": ["china specific"],
}

# Invert the two maps: category name -> list of subjects (a subject appears
# under every category that contains one of its tags).
category2subject = {}
for k, v in categories.items():
    for subject, subcat in subcategories.items():
        for c in subcat:
            if c in v:
                category2subject.setdefault(k, []).append(subject)

cmmlu_summary_groups = []

_cmmlu_humanities = ['cmmlu-' + s for s in category2subject['Humanities']]
cmmlu_summary_groups.append({'name': 'cmmlu-humanities', 'subsets': _cmmlu_humanities})

_cmmlu_stem = ['cmmlu-' + s for s in category2subject['STEM']]
cmmlu_summary_groups.append({'name': 'cmmlu-stem', 'subsets': _cmmlu_stem})

_cmmlu_social_science = ['cmmlu-' + s for s in category2subject['Social Science']]
cmmlu_summary_groups.append({'name': 'cmmlu-social-science', 'subsets': _cmmlu_social_science})

_cmmlu_other = ['cmmlu-' + s for s in category2subject['Other']]
cmmlu_summary_groups.append({'name': 'cmmlu-other', 'subsets': _cmmlu_other})

_cmmlu_china_specific = ['cmmlu-' + s for s in category2subject['China specific']]
cmmlu_summary_groups.append({'name': 'cmmlu-china-specific', 'subsets': _cmmlu_china_specific})

# The overall 'cmmlu' group averages every subject.
_cmmlu_all = ['cmmlu-' + s for s in subcategories.keys()]
cmmlu_summary_groups.append({'name': 'cmmlu', 'subsets': _cmmlu_all})
5
configs/summarizers/groups/tydiqa.py
Normal file
5
configs/summarizers/groups/tydiqa.py
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
# Summary group averaging the per-language TyDiQA-GoldP scores.
tydiqa_summary_groups = []

_tydiqa = ['arabic', 'bengali', 'english', 'finnish', 'indonesian', 'japanese', 'korean', 'russian', 'swahili', 'telugu', 'thai']
# NOTE(review): 'tyidqa' looks like a typo for 'tydiqa', but this prefix must
# match the abbrs emitted by the tydiqa dataset configs — confirm both sides
# before renaming, otherwise the summary group silently matches nothing.
_tydiqa = ['tyidqa-goldp_' + s for s in _tydiqa]
tydiqa_summary_groups.append({'name': 'tydiqa-goldp', 'subsets': _tydiqa})
|
4
configs/summarizers/groups/xiezhi.py
Normal file
4
configs/summarizers/groups/xiezhi.py
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
# Summary group averaging the four Xiezhi splits
# (specialized/interdisciplinary x English/Chinese).
xiezhi_summary_groups = []

_xiezhi = ["xiezhi-spec_eng", "xiezhi-spec_chn", "xiezhi-inter_eng", "xiezhi-inter_chn"]
xiezhi_summary_groups.append({'name': 'xiezhi', 'subsets': _xiezhi})
|
89
configs/summarizers/leaderboard.py
Normal file
89
configs/summarizers/leaderboard.py
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
from mmengine.config import read_base

with read_base():
    from .groups.agieval import agieval_summary_groups
    from .groups.mmlu import mmlu_summary_groups
    from .groups.cmmlu import cmmlu_summary_groups
    from .groups.ceval import ceval_summary_groups
    from .groups.bbh import bbh_summary_groups
    from .groups.GaokaoBench import GaokaoBench_summary_groups
    from .groups.flores import flores_summary_groups
    from .groups.tydiqa import tydiqa_summary_groups
    from .groups.xiezhi import xiezhi_summary_groups


# Leaderboard report layout: dataset_abbrs fixes the row order; the
# '---------' entries are visual category separators, not datasets.
summarizer = dict(
    dataset_abbrs=[
        '--------- 考试 Exam ---------',  # category
        # 'Mixed',  # subcategory
        "ceval",
        'agieval',
        'mmlu',
        'cmmlu',
        "GaokaoBench",
        'ARC-c',
        'ARC-e',
        '--------- 语言 Language ---------',  # category
        # '字词释义',  # subcategory
        'WiC',
        # '成语习语',  # subcategory
        'chid-dev',
        # '语义相似度',  # subcategory
        'afqmc-dev',
        # '指代消解',  # subcategory
        'WSC',
        # '多语种问答',  # subcategory
        'tydiqa-goldp',
        # '翻译',  # subcategory
        'flores_100',
        '--------- 知识 Knowledge ---------',  # category
        # '知识问答',  # subcategory
        'BoolQ',
        'commonsense_qa',
        'triviaqa',
        'nq',
        '--------- 理解 Understanding ---------',  # category
        # '阅读理解',  # subcategory
        'C3',
        'race-middle',
        'race-high',
        'openbookqa_fact',
        # '内容总结',  # subcategory
        'csl_dev',
        'lcsts',
        'Xsum',
        # '内容分析',  # subcategory
        'eprstmt-dev',
        'lambada',
        '--------- 推理 Reasoning ---------',  # category
        # '文本蕴含',  # subcategory
        'cmnli',
        'ocnli',
        'AX_b',
        'AX_g',
        'RTE',
        # '常识推理',  # subcategory
        'COPA',
        'ReCoRD',
        'hellaswag',
        'piqa',
        'siqa',
        # '数学推理',  # subcategory
        'math',
        'gsm8k',
        # '定理应用',  # subcategory
        # '阅读理解',  # subcategory
        'drop',
        # '代码',  # subcategory
        'openai_humaneval',
        'mbpp',
        # '综合推理',  # subcategory
        "bbh",
    ],
    # Collect every imported *_summary_groups list into one flat list.
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], []),
    prompt_db=dict(
        database_path='configs/datasets/log.json',
        config_dir='configs/datasets',
        blacklist='.promptignore'),
)
|
@ -3,11 +3,14 @@ from mmengine.config import read_base
|
|||||||
with read_base():
|
with read_base():
|
||||||
from .groups.agieval import agieval_summary_groups
|
from .groups.agieval import agieval_summary_groups
|
||||||
from .groups.mmlu import mmlu_summary_groups
|
from .groups.mmlu import mmlu_summary_groups
|
||||||
|
from .groups.cmmlu import cmmlu_summary_groups
|
||||||
from .groups.ceval import ceval_summary_groups
|
from .groups.ceval import ceval_summary_groups
|
||||||
from .groups.bbh import bbh_summary_groups
|
from .groups.bbh import bbh_summary_groups
|
||||||
from .groups.GaokaoBench import GaokaoBench_summary_groups
|
from .groups.GaokaoBench import GaokaoBench_summary_groups
|
||||||
from .groups.flores import flores_summary_groups
|
from .groups.flores import flores_summary_groups
|
||||||
from .groups.jigsaw_multilingual import jigsaw_multilingual_summary_groups
|
from .groups.jigsaw_multilingual import jigsaw_multilingual_summary_groups
|
||||||
|
from .groups.tydiqa import tydiqa_summary_groups
|
||||||
|
from .groups.xiezhi import xiezhi_summary_groups
|
||||||
|
|
||||||
summarizer = dict(
|
summarizer = dict(
|
||||||
dataset_abbrs=[
|
dataset_abbrs=[
|
||||||
|
@ -3,10 +3,13 @@ from mmengine.config import read_base
|
|||||||
with read_base():
|
with read_base():
|
||||||
from .groups.agieval import agieval_summary_groups
|
from .groups.agieval import agieval_summary_groups
|
||||||
from .groups.mmlu import mmlu_summary_groups
|
from .groups.mmlu import mmlu_summary_groups
|
||||||
|
from .groups.cmmlu import cmmlu_summary_groups
|
||||||
from .groups.ceval import ceval_summary_groups
|
from .groups.ceval import ceval_summary_groups
|
||||||
from .groups.bbh import bbh_summary_groups
|
from .groups.bbh import bbh_summary_groups
|
||||||
from .groups.GaokaoBench import GaokaoBench_summary_groups
|
from .groups.GaokaoBench import GaokaoBench_summary_groups
|
||||||
from .groups.flores import flores_summary_groups
|
from .groups.flores import flores_summary_groups
|
||||||
|
from .groups.tydiqa import tydiqa_summary_groups
|
||||||
|
from .groups.xiezhi import xiezhi_summary_groups
|
||||||
|
|
||||||
summarizer = dict(
|
summarizer = dict(
|
||||||
dataset_abbrs = [
|
dataset_abbrs = [
|
||||||
|
@ -43,6 +43,33 @@ class ReCoRDDataset(BaseDataset):
|
|||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
|
|
||||||
|
class ReCoRDDataset_V2(BaseDataset):
    """ReCoRD loaded directly from the SuperGLUE jsonl dump.

    Each input line holds one passage plus its cloze-style queries; every
    query becomes its own row carrying the cleaned passage ('text'), the
    query with '@placeholder' rewritten to '____' ('question'), and the
    list of gold answer spans ('answers').
    """

    @staticmethod
    def load(path: str):
        rows = []
        # errors='ignore': the raw dump may contain undecodable bytes.
        with open(path, 'r', errors='ignore') as in_f:
            for line in in_f:
                sample = json.loads(line.strip())
                # '@highlight' markers separate bullet points in the source
                # passage; drop them and collapse the double newlines.
                text = sample['passage']['text'].replace('@highlight',
                                                         '').replace(
                                                             '\n\n', '\n')
                for qas_dict in sample['qas']:
                    query = qas_dict['query'].replace('@placeholder', '____')
                    answers = [
                        answer_dict['text']
                        for answer_dict in qas_dict['answers']
                    ]
                    rows.append({
                        'text': text,
                        'question': query,
                        'answers': answers
                    })

        dataset = Dataset.from_list(rows)
        return dataset
||||||
|
|
||||||
|
|
||||||
@TEXT_POSTPROCESSORS.register_module('ReCoRD')
|
@TEXT_POSTPROCESSORS.register_module('ReCoRD')
|
||||||
def ReCoRD_postprocess(text: str) -> str:
|
def ReCoRD_postprocess(text: str) -> str:
|
||||||
text = text.strip().split('\n')[0].replace('Answer: ', '').strip()
|
text = text.strip().split('\n')[0].replace('Answer: ', '').strip()
|
||||||
|
@ -154,7 +154,7 @@ class SizePartitioner(BasePartitioner):
|
|||||||
fnmatch(dataset_abbr, pattern)
|
fnmatch(dataset_abbr, pattern)
|
||||||
for pattern in ('bbh*', 'gsm8k*', 'math*', 'strategyqa*',
|
for pattern in ('bbh*', 'gsm8k*', 'math*', 'strategyqa*',
|
||||||
'agieval-jec*', 'agieval-gaokao-mathcloze',
|
'agieval-jec*', 'agieval-gaokao-mathcloze',
|
||||||
'agieval-math')):
|
'agieval-math', '*professional_law')):
|
||||||
factor *= 10
|
factor *= 10
|
||||||
|
|
||||||
return factor
|
return factor
|
||||||
|
Loading…
Reference in New Issue
Block a user