2023-08-31 11:29:05 +08:00
|
|
|
from mmengine.config import read_base
|
|
|
|
|
|
|
|
with read_base():
|
2025-01-20 19:17:38 +08:00
|
|
|
from opencompass.configs.datasets.collections.leaderboard.qwen_chat import \
|
|
|
|
datasets
|
2024-08-22 14:48:45 +08:00
|
|
|
from opencompass.configs.models.qwen.hf_qwen_7b_chat import models
|
|
|
|
from opencompass.configs.summarizers.leaderboard import summarizer
|
2023-08-31 11:29:05 +08:00
|
|
|
'''
|
|
|
|
dataset version metric mode qwen-7b-chat-hf
|
|
|
|
-------------------------------------- --------- ---------------- ------ -----------------
|
|
|
|
--------- 考试 Exam --------- - - - -
|
|
|
|
ceval - naive_average gen 56.07
|
|
|
|
agieval - naive_average mixed 39.51
|
|
|
|
mmlu - naive_average gen 53.49
|
|
|
|
cmmlu - naive_average gen 55.29
|
|
|
|
GaokaoBench - weighted_average gen 48.01
|
|
|
|
ARC-c ca1e8e accuracy ppl 74.92
|
|
|
|
ARC-e ca1e8e accuracy ppl 85.71
|
|
|
|
--------- 语言 Language --------- - - - -
|
|
|
|
WiC efbd01 accuracy gen 51.41
|
|
|
|
chid-dev 25f3d3 accuracy ppl 77.72
|
|
|
|
afqmc-dev 4a1636 accuracy gen 69.00
|
|
|
|
WSC 678cb5 accuracy ppl 67.31
|
|
|
|
tydiqa-goldp - naive_average gen 15.32
|
|
|
|
flores_100 - naive_average gen 10.00
|
|
|
|
--------- 知识 Knowledge --------- - - - -
|
|
|
|
BoolQ 463fee accuracy ppl 83.18
|
|
|
|
commonsense_qa ddaabf accuracy gen 76.41
|
|
|
|
triviaqa b6904f score gen 43.25
|
|
|
|
nq 23dc1a score gen 16.26
|
|
|
|
--------- 理解 Understanding --------- - - - -
|
|
|
|
C3 e6778d accuracy gen 81.53
|
|
|
|
race-middle e0908b accuracy gen 83.01
|
|
|
|
race-high e0908b accuracy gen 77.79
|
|
|
|
openbookqa_fact 49689a accuracy ppl 86.40
|
|
|
|
csl_dev 3c4211 accuracy ppl 64.38
|
|
|
|
lcsts 0b3969 rouge1 gen 12.75
|
|
|
|
Xsum 207e69 rouge1 gen 20.21
|
|
|
|
eprstmt-dev ed0c5d accuracy ppl 85.00
|
|
|
|
lambada de1af2 accuracy gen 59.19
|
|
|
|
--------- 推理 Reasoning --------- - - - -
|
|
|
|
cmnli 15e783 accuracy ppl 48.08
|
|
|
|
ocnli 15e783 accuracy ppl 51.40
|
|
|
|
AX_b 689df1 accuracy ppl 65.67
|
|
|
|
AX_g 808a19 accuracy ppl 76.12
|
|
|
|
RTE 808a19 accuracy ppl 68.95
|
|
|
|
COPA 59f42c accuracy gen 92.00
|
|
|
|
ReCoRD 6f7cfc score gen 0.16
|
|
|
|
hellaswag 8d79e0 accuracy ppl 69.28
|
|
|
|
piqa 34eee7 accuracy ppl 72.20
|
|
|
|
siqa ea30d1 accuracy ppl 72.88
|
|
|
|
math 2c0b9e accuracy gen 7.84
|
|
|
|
gsm8k 4c7f6e accuracy gen 45.41
|
|
|
|
drop 53a0a7 score gen 39.62
|
|
|
|
openai_humaneval dd0dff humaneval_pass@1 gen 10.98
|
|
|
|
mbpp 60ca11 score gen 20.60
|
|
|
|
bbh - naive_average gen 42.61
|
|
|
|
'''
|