Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Compare commits: 519 commits
.github/scripts/eval_regression_api.py (new file, vendored, 42 lines)

```python
from mmengine.config import read_base

from opencompass.models.openai_api import OpenAISDK

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
        gsm8k_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.race.race_gen import \
        race_datasets  # noqa: F401, E501

datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

models = [
    dict(
        abbr='lmdeploy-api-test',
        type=OpenAISDK,
        key='EMPTY',
        openai_api_base='http://localhost:23333/v1',
        path='internlm3',
        tokenizer_path='internlm/internlm3-8b-instruct',
        rpm_verbose=True,
        meta_template=api_meta_template,
        query_per_second=128,
        max_out_len=1024,
        max_seq_len=4096,
        temperature=0.01,
        batch_size=128,
        retry=20,
    )
]

for d in datasets:
    d['reader_cfg']['test_range'] = '[0:16]'
```
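All five scripts in this change rely on the same aggregation idiom: modules imported inside `read_base()` bind names such as `gsm8k_datasets` into the config namespace, and a `locals()` scan then concatenates every list whose name ends in `_datasets` (and, later, `_model`). A minimal self-contained sketch of the idiom, using toy stand-in lists rather than real OpenCompass configs:

```python
# Toy stand-ins for the lists that the read_base() imports would bind.
gsm8k_datasets = [dict(abbr='gsm8k', reader_cfg=dict())]
race_datasets = [dict(abbr='race-high', reader_cfg=dict())]

# Concatenate every list whose variable name ends with '_datasets';
# sum(list_of_lists, []) flattens them into one flat list.
datasets = sum([v for k, v in locals().items()
                if k.endswith('_datasets')], [])
print([d['abbr'] for d in datasets])  # ['gsm8k', 'race-high']
```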
.github/scripts/eval_regression_base_fullbench.py (new file, vendored, 210 lines)

```python
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.ARC_c.ARC_c_few_shot_ppl import \
        ARC_c_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.bbh.bbh_gen_98fba6 import \
        bbh_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.cmmlu.cmmlu_ppl_041cbf import \
        cmmlu_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.dingo.dingo_gen import \
        datasets as dingo_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.drop.drop_gen_a2697c import \
        drop_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d21e37 import \
        GaokaoBench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.gpqa.gpqa_few_shot_ppl_4b5a83 import \
        gpqa_datasets  # noqa: F401, E501
    # Corebench v1.7
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
        gsm8k_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.hellaswag.hellaswag_10shot_ppl_59c85e import \
        hellaswag_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.humaneval.internal_humaneval_gen_ce6b06 import \
        humaneval_datasets as humaneval_v2_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.humaneval.internal_humaneval_gen_d2537e import \
        humaneval_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.math.math_4shot_base_gen_43d5b6 import \
        math_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.MathBench.mathbench_2024_few_shot_mixed_4a3fd4 import \
        mathbench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.mbpp.sanitized_mbpp_gen_742f0c import \
        sanitized_mbpp_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.mmlu.mmlu_ppl_ac766d import \
        mmlu_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_few_shot_gen_bfaf90 import \
        mmlu_pro_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.nq.nq_open_1shot_gen_20a989 import \
        nq_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.race.race_few_shot_ppl import \
        race_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_few_shot_ppl import \
        BoolQ_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
        TheoremQA_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_20a989 import \
        triviaqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.wikibench.wikibench_few_shot_ppl_c23d79 import \
        wikibench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
        winogrande_datasets  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
        models as hf_internlm2_5_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
        models as lmdeploy_internlm2_5_7b_model  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.bbh import \
        bbh_summary_groups  # noqa: F401, E501
    # Summary Groups
    from opencompass.configs.summarizers.groups.cmmlu import \
        cmmlu_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.GaokaoBench import \
        GaokaoBench_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
        mathbench_2024_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.mmlu import \
        mmlu_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.mmlu_pro import \
        mmlu_pro_summary_groups  # noqa: F401, E501

    from ...volc import infer as volc_infer  # noqa: F401, E501

race_datasets = [race_datasets[1]]  # Only take RACE-High
humaneval_v2_datasets[0]['abbr'] = 'openai_humaneval_v2'
bbh_datasets = [
    x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr']
    or 'multistep_arithmetic_two' in x['abbr']
]
cmmlu_datasets = [
    x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [
        'ancient_chinese', 'chinese_civil_service_exam',
        'chinese_driving_rule', 'chinese_food_culture',
        'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
        'chinese_teacher_qualification', 'construction_project_management',
        'elementary_chinese', 'elementary_commonsense', 'ethnology',
        'high_school_politics', 'modern_chinese',
        'traditional_chinese_medicine'
    ]
]
mmlu_datasets = [
    x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
        'business_ethics', 'clinical_knowledge', 'college_medicine',
        'global_facts', 'human_aging', 'management', 'marketing',
        'medical_genetics', 'miscellaneous', 'nutrition',
        'professional_accounting', 'professional_medicine', 'virology'
    ]
]
mmlu_pro_datasets = [mmlu_pro_datasets[0]]
mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
GaokaoBench_datasets = [
    x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
    or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr']
]
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
summary_groups.append(
    {
        'name': 'Mathbench',
        'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
    }, )

summarizer = dict(
    dataset_abbrs=[
        'Language',
        ['race-high', 'accuracy'],
        ['ARC-c', 'accuracy'],
        ['BoolQ', 'accuracy'],
        ['triviaqa_wiki_1shot', 'score'],
        ['nq_open_1shot', 'score'],
        '',
        'General Reasoning',
        ['drop', 'accuracy'],
        ['bbh', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
        ['hellaswag', 'accuracy'],
        ['TheoremQA', 'score'],
        ['winogrande', 'accuracy'],
        '',
        'Math Calculation',
        ['gsm8k', 'accuracy'],
        ['GaokaoBench', 'weighted_average'],
        'GaokaoBench_2010-2022_Math_II_MCQs',
        'GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank',
        ['math', 'accuracy'],
        ['Mathbench', 'naive_average'],
        '',
        'Knowledge',
        ['wikibench-wiki-single_choice_cncircular', 'perf_4'],
        ['cmmlu', 'naive_average'],
        ['mmlu', 'naive_average'],
        ['mmlu_pro', 'naive_average'],
        '',
        'Code',
        ['openai_humaneval', 'humaneval_pass@1'],
        ['openai_humaneval_v2', 'humaneval_pass@1'],
        ['sanitized_mbpp', 'score'],
        '',
        ['dingo_en_192', 'score'],
        ['dingo_zh_170', 'score'],
        '',
        'mmlu',
        'mmlu-stem',
        'mmlu-social-science',
        'mmlu-humanities',
        ['mmlu-other', 'accuracy'],
        '',
        'cmmlu',
        'cmmlu-stem',
        'cmmlu-social-science',
        'cmmlu-humanities',
        'cmmlu-other',
        ['cmmlu-china-specific', 'accuracy'],
        '',
        'mmlu_pro',
        'mmlu_pro_biology',
        'mmlu_pro_business',
        'mmlu_pro_chemistry',
        'mmlu_pro_computer_science',
        'mmlu_pro_economics',
        'mmlu_pro_engineering',
        'mmlu_pro_health',
        'mmlu_pro_history',
        'mmlu_pro_law',
        'mmlu_pro_math',
        'mmlu_pro_philosophy',
        'mmlu_pro_physics',
        'mmlu_pro_psychology',
        'mmlu_pro_other',
        '',
        'bbh-logical_deduction_seven_objects',
        'bbh-multistep_arithmetic_two',
        '###### MathBench-A: Application Part ######',
        'college',
        'high',
        'middle',
        'primary',
        'arithmetic',
        'mathbench-a (average)',
        '###### MathBench-T: Theory Part ######',
        'college_knowledge',
        'high_knowledge',
        'middle_knowledge',
        'primary_knowledge',
        'mathbench-t (average)',
    ],
    summary_groups=summary_groups,
)

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

for d in datasets:
    d['reader_cfg']['test_range'] = '[0:16]'

for m in models:
    m['abbr'] = m['abbr'] + '_fullbench'
    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
        m['engine_config']['max_batch_size'] = 1
        m['batch_size'] = 1

models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
```
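Note the `test_range = '[0:16]'` (or `'[0:32]'` below) applied to every dataset's `reader_cfg`: it restricts the regression run to a small slice of each benchmark rather than the full test split. The string is a Python-style slice that OpenCompass's dataset reader applies on load; a sketch of the equivalent effect on a plain list (illustrative only, not the reader's actual code):

```python
# Toy rows standing in for a loaded test split.
rows = [dict(question=f'q{i}') for i in range(100)]

test_range = '[0:16]'  # same format as in the configs above

# Parse the '[start:stop]' string and slice the split accordingly.
start, stop = (int(x) for x in test_range.strip('[]').split(':'))
smoke_rows = rows[start:stop]
print(len(smoke_rows))  # 16
```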
.github/scripts/eval_regression_base_models.py (new file, vendored, 129 lines)

```python
from mmengine.config import read_base

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
        gpqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.gsm8k.gsm8k_gen_17d0dc import \
        gsm8k_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.race.race_ppl import \
        race_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \
        winogrande_datasets  # noqa: F401, E501
    # read hf models - base models
    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \
        models as lmdeploy_glm4_9b_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \
        models as hf_deepseek_7b_base_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \
        models as lmdeploy_deepseek_7b_base_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \
        models as lmdeploy_deepseek_67b_base_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2 import \
        lmdeploy_deepseek_v2_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.vllm_deepseek_moe_16b_base import \
        models as vllm_deepseek_moe_16b_base_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma2_2b import \
        models as hf_gemma2_2b_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma2_9b import \
        models as hf_gemma2_9b_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma_2b import \
        models as hf_gemma_2b_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma_7b import \
        models as hf_gemma_7b_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.lmdeploy_gemma_9b import \
        models as lmdeploy_gemma_9b_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.vllm_gemma_2b import \
        models as vllm_gemma_2b_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.vllm_gemma_7b import \
        models as vllm_gemma_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b import \
        models as hf_internlm2_5_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_7b import \
        models as hf_internlm2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \
        models as lmdeploy_internlm2_1_8b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \
        models as lmdeploy_internlm2_5_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_7b import \
        models as lmdeploy_internlm2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_20b import \
        models as lmdeploy_internlm2_20b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \
        models as lmdeploy_internlm2_base_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_20b import \
        models as lmdeploy_internlm2_base_20b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama2_7b import \
        models as hf_llama2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \
        models as hf_llama3_1_8b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_8b import \
        models as hf_llama3_8b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \
        models as lmdeploy_llama3_1_8b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \
        models as lmdeploy_llama3_8b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_70b import \
        models as lmdeploy_llama3_70b_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_7b_v0_3 import \
        models as hf_mistral_7b_v0_3_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.hf_qwen_2_5_7b import \
        models as hf_qwen_2_5_7b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.hf_qwen_2_5_14b import \
        models as hf_qwen_2_5_14b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.hf_qwen_2_5_32b import \
        models as hf_qwen_2_5_32b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_1_5b import \
        models as lmdeploy_qwen2_5_1_5b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b import \
        models as lmdeploy_qwen2_5_7b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b import \
        models as lmdeploy_qwen2_5_32b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b import \
        models as lmdeploy_qwen2_5_72b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen1_5_moe_a2_7b import \
        models as hf_qwen1_5_moe_a2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_0_5b import \
        models as hf_qwen2_0_5b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_1_5b import \
        models as hf_qwen2_1_5b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_7b import \
        models as hf_qwen2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b import \
        models as lmdeploy_qwen2_1_5b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b import \
        models as lmdeploy_qwen2_7b_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.vllm_qwen1_5_0_5b import \
        models as vllm_qwen1_5_0_5b_model  # noqa: F401, E501
    from opencompass.configs.models.yi.hf_yi_1_5_6b import \
        models as hf_yi_1_5_6b_model  # noqa: F401, E501
    from opencompass.configs.models.yi.hf_yi_1_5_9b import \
        models as hf_yi_1_5_9b_model  # noqa: F401, E501
    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b import \
        models as lmdeploy_yi_1_5_9b_model  # noqa: F401, E501

    from ...volc import infer as volc_infer  # noqa: F401, E501

race_datasets = [race_datasets[1]]
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

for d in datasets:
    d['reader_cfg']['test_range'] = '[0:32]'

for m in models:
    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
        m['engine_config']['max_batch_size'] = 1
        m['batch_size'] = 1

models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])

summarizer = dict(
    dataset_abbrs=[
        ['gsm8k', 'accuracy'],
        ['GPQA_diamond', 'accuracy'],
        ['race-high', 'accuracy'],
        ['winogrande', 'accuracy'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
```
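`read_base()` (from mmengine) is what lets these scripts be pure composition: inside the `with` block, importing a config module binds its top-level variables (dataset lists, model lists, summary groups) directly into this file's namespace. A rough, self-contained mimic of that behavior, for intuition only; mmengine's real implementation hooks the import machinery and does considerably more:

```python
import runpy
import tempfile

# Write a tiny stand-in "base config" to disk for the demo.
with tempfile.NamedTemporaryFile('w', suffix='.py', delete=False) as f:
    f.write("gsm8k_datasets = [dict(abbr='gsm8k', reader_cfg=dict())]\n")
    base_path = f.name

# Execute the file and merge its top-level names into this namespace:
# roughly the effect that `with read_base(): from ... import ...` has.
base_vars = {k: v for k, v in runpy.run_path(base_path).items()
             if not k.startswith('_')}
globals().update(base_vars)

datasets = sum([v for k, v in globals().items()
                if k.endswith('_datasets')], [])
print(datasets)  # [{'abbr': 'gsm8k', 'reader_cfg': {}}]
```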
.github/scripts/eval_regression_chat_models.py (new file, vendored, 193 lines)

```python
from mmengine.config import read_base

with read_base():
    # choose a list of datasets
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import \
        gsm8k_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.race.race_gen import \
        race_datasets  # noqa: F401, E501
    # read hf models - chat models
    from opencompass.configs.models.chatglm.hf_glm4_9b_chat import \
        models as hf_glm4_9b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.chatglm.lmdeploy_glm4_9b_chat import \
        models as lmdeploy_glm4_9b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.chatglm.vllm_glm4_9b_chat import \
        models as vllm_glm4_9b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \
        models as hf_deepseek_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_chat import \
        models as lmdeploy_deepseek_67b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_8b import \
        models as \
        lmdeploy_deepseek_r1_distill_llama_8b_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_70b import \
        models as \
        lmdeploy_deepseek_r1_distill_llama_70b_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_1_5b import \
        models as \
        lmdeploy_deepseek_r1_distill_qwen_1_5b_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \
        models as \
        lmdeploy_deepseek_r1_distill_qwen_32b_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \
        models as lmdeploy_deepseek_v2_5_1210_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \
        models as lmdeploy_deepseek_v2_lite_model  # noqa: F401, E501
    from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \
        models as vllm_deepseek_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma2_2b_it import \
        models as hf_gemma2_2b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma2_9b_it import \
        models as hf_gemma2_9b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma2_27b_it import \
        models as hf_gemma2_27b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma_2b_it import \
        models as hf_gemma_2b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.hf_gemma_7b_it import \
        models as hf_gemma_7b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.lmdeploy_gemma_9b_it import \
        models as lmdeploy_gemma_9b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.lmdeploy_gemma_27b_it import \
        models as lmdeploy_gemma_27b_it_model  # noqa: F401, E501
    from opencompass.configs.models.gemma.vllm_gemma_7b_it import \
        models as vllm_gemma_7b_it_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
        models as hf_internlm2_5_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \
        models as hf_internlm2_5_20b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \
        models as hf_internlm3_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \
        models as lmdeploy_internlm2_5_20b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b import \
        models as lmdeploy_internlm2_chat_1_8b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_1_8b_sft import \
        models as lmdeploy_internlm2_chat_1_8b_sft_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b import \
        models as lmdeploy_internlm2_chat_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \
        models as lmdeploy_internlm2_chat_7b_sft_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
        models as lmdeploy_internlm3_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \
        models as vllm_internlm2_chat_7b_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \
        models as hf_llama3_1_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_2_3b_instruct import \
        models as hf_llama3_2_3b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.hf_llama3_8b_instruct import \
        models as hf_llama3_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama2_7b_chat import \
        models as lmdeploy_llama2_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b_instruct import \
        models as lmdeploy_llama3_1_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_2_3b_instruct import \
        models as lmdeploy_llama3_2_3b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_3_70b_instruct import \
        models as lmdeploy_llama3_3_70b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b_instruct import \
        models as lmdeploy_llama3_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_2 import \
        models as hf_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_7b_instruct_v0_3 import \
        models as hf_mistral_7b_instruct_v0_3_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_nemo_instruct_2407 import \
        models as hf_mistral_nemo_instruct_2407_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \
        models as hf_mistral_small_instruct_2409_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \
        models as \
        lmdeploy_mistral_large_instruct_2411_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.lmdeploy_mistral_nemo_instruct_2407 import \
        models as lmdeploy_mistral_nemo_instruct_2407_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \
        models as \
        lmdeploy_mistral_small_instruct_2409_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \
        models as \
        lmdeploy_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \
        models as vllm_mistral_7b_instruct_v0_1_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
        models as vllm_mistral_7b_instruct_v0_2_model  # noqa: F401, E501
    from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \
        models as vllm_mixtral_8x22b_instruct_v0_1_model  # noqa: F401, E501
    from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
        models as lmdeploy_nemotron_70b_instruct_hf_model  # noqa: F401, E501
    from opencompass.configs.models.phi.hf_phi_4 import \
        models as hf_phi_4_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \
        models as hf_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \
        models as hf_qwen2_5_3b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.hf_qwen2_5_14b_instruct import \
        models as hf_qwen2_5_14b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_0_5b_instruct import \
        models as lmdeploy_qwen2_5_0_5b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_3b_instruct import \
        models as lmdeploy_qwen2_5_3b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import \
        models as lmdeploy_qwen2_5_14b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import \
        models as lmdeploy_qwen2_5_72b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen1_5_0_5b_chat import \
        models as hf_qwen1_5_0_5b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_1_5b_instruct import \
        models as hf_qwen2_1_5b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.hf_qwen2_7b_instruct import \
        models as hf_qwen2_7b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen2_1_5b_instruct import \
        models as lmdeploy_qwen2_1_5b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.lmdeploy_qwen2_7b_instruct import \
        models as lmdeploy_qwen2_7b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.qwen.vllm_qwen1_5_0_5b_chat import \
        models as vllm_qwen1_5_0_5b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.yi.hf_yi_1_5_6b_chat import \
        models as hf_yi_1_5_6b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.yi.hf_yi_1_5_9b_chat import \
        models as hf_yi_1_5_9b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.yi.lmdeploy_yi_1_5_6b_chat import \
        models as lmdeploy_yi_1_5_6b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b_chat import \
        models as lmdeploy_yi_1_5_9b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.yi.lmdeploy_yi_1_5_34b_chat import \
        models as lmdeploy_yi_1_5_34b_chat_model  # noqa: F401, E501

    from ...volc import infer as volc_infer  # noqa: F401, E501

hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'

race_datasets = [race_datasets[1]]
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

for d in datasets:
    d['reader_cfg']['test_range'] = '[0:32]'

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

for m in models:
    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
        m['engine_config']['max_batch_size'] = 1
        m['batch_size'] = 1

models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])

summarizer = dict(
    dataset_abbrs=[
        'gsm8k',
        'race-middle',
        'race-high',
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
```
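The recurring `api_meta_template` declares the conversation roles OpenCompass uses when talking to an API backend: a HUMAN/BOT pair per round (the BOT turn marked `generate=True` is the one the model must produce) plus a reserved SYSTEM role. For an OpenAI-compatible server this amounts to a role rename; a rough sketch of that mapping (`to_openai_messages` is a hypothetical helper for illustration, not OpenCompass's actual code):

```python
# Map OpenCompass api_roles onto OpenAI-style chat roles.
ROLE_MAP = {'HUMAN': 'user', 'BOT': 'assistant', 'SYSTEM': 'system'}

def to_openai_messages(prompt_items):
    """prompt_items: dicts like {'role': 'HUMAN', 'prompt': '...'}."""
    return [{'role': ROLE_MAP[item['role']], 'content': item['prompt']}
            for item in prompt_items]

print(to_openai_messages([
    {'role': 'SYSTEM', 'prompt': 'You are a helpful assistant.'},
    {'role': 'HUMAN', 'prompt': 'What is 12 * 12?'},
]))
# [{'role': 'system', ...}, {'role': 'user', ...}]
```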
317
.github/scripts/eval_regression_chat_obj_fullbench.py
vendored
Normal file
317
.github/scripts/eval_regression_chat_obj_fullbench.py
vendored
Normal file
@ -0,0 +1,317 @@
|
|||||||
|
from mmengine.config import read_base
|
||||||
|
|
||||||
|
with read_base():
|
||||||
|
# read hf models - chat models
|
||||||
|
# Dataset
|
||||||
|
from opencompass.configs.datasets.aime2024.aime2024_gen_6e39a4 import \
|
||||||
|
aime2024_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.ARC_c.ARC_c_cot_gen_926652 import \
|
||||||
|
ARC_c_datasets # noqa: F401, E501
|
||||||
|
# remove because of oom
|
||||||
|
# from opencompass.configs.datasets.ARC_Prize_Public_Evaluation.arc_prize_public_evaluation_gen_872059 import arc_prize_public_evaluation_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.bbh.bbh_gen_5b92b0 import \
|
||||||
|
bbh_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_complete_gen_faf748 import \
|
||||||
|
bigcodebench_hard_complete_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen_8815eb import \
|
||||||
|
bigcodebench_hard_instruct_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.cmmlu.cmmlu_0shot_cot_gen_305931 import \
|
||||||
|
cmmlu_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_ace24b import \
|
||||||
|
cmo_fib_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.drop.drop_openai_simple_evals_gen_3857b0 import \
|
||||||
|
drop_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import \
|
||||||
|
ds1000_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import \
|
||||||
|
GaokaoBench_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
|
||||||
|
gpqa_datasets # noqa: F401, E501
|
||||||
|
# new datasets in Fullbench v1.1
|
||||||
|
from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_6e39a4 import \
|
||||||
|
gsm8k_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.hellaswag.hellaswag_10shot_gen_e42710 import \
|
||||||
|
hellaswag_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
|
||||||
|
humaneval_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.humanevalx.humanevalx_gen_3d84a3 import \
|
||||||
|
humanevalx_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
|
||||||
|
ifeval_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.korbench.korbench_single_0_shot_gen import \
|
||||||
|
korbench_0shot_single_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.livecodebench.livecodebench_gen_b2b0fd import \
|
||||||
|
LCB_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.math.math_0shot_gen_11c4b5 import \
|
||||||
|
math_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.MathBench.mathbench_2024_gen_50a320 import \
|
||||||
|
mathbench_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.mbpp.sanitized_mbpp_mdblock_gen_a447ff import \
|
||||||
|
sanitized_mbpp_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.mmlu.mmlu_openai_simple_evals_gen_b618ea import \
|
||||||
|
mmlu_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
|
||||||
|
mmlu_pro_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.mmmlu_lite.mmmlu_lite_gen_c51a84 import \
|
||||||
|
mmmlu_lite_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.musr.musr_gen_3622bb import \
|
||||||
|
musr_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.nq.nq_open_1shot_gen_2e45e5 import \
|
||||||
|
nq_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.race.race_cot_gen_d95929 import \
|
||||||
|
race_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.scicode.scicode_gen_085b98 import \
|
||||||
|
SciCode_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_cot_gen_1d56df import \
|
||||||
|
BoolQ_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.teval.teval_en_gen_1ac254 import \
|
||||||
|
teval_datasets as teval_en_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.teval.teval_zh_gen_1ac254 import \
|
||||||
|
teval_datasets as teval_zh_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import \
|
||||||
|
TheoremQA_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_bc5f21 import \
|
||||||
|
triviaqa_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.datasets.wikibench.wikibench_gen_0978ad import \
|
||||||
|
wikibench_datasets # noqa: F401, E501
|
||||||
|
from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
|
||||||
|
models as hf_internlm2_5_7b_chat_model # noqa: F401, E501
|
||||||
|
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
|
||||||
|
models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501
|
||||||
|
# Summary Groups
|
||||||
|
# Summary Groups
|
||||||
|
from opencompass.configs.summarizers.groups.bbh import \
|
||||||
|
bbh_summary_groups # noqa: F401, E501
|
||||||
|
from opencompass.configs.summarizers.groups.cmmlu import \
|
||||||
|
cmmlu_summary_groups # noqa: F401, E501
|
||||||
|
from opencompass.configs.summarizers.groups.ds1000 import \
|
||||||
|
ds1000_summary_groups # noqa: F401, E501
|
||||||
|
from opencompass.configs.summarizers.groups.GaokaoBench import \
|
||||||
|
GaokaoBench_summary_groups # noqa: F401, E501
|
||||||
|
from opencompass.configs.summarizers.groups.humanevalx import \
|
||||||
|
humanevalx_summary_groups # noqa: F401, E501
|
||||||
|
from opencompass.configs.summarizers.groups.korbench import \
|
||||||
|
korbench_summary_groups # noqa: F401, E501
|
||||||
|
from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
|
||||||
|
mathbench_2024_summary_groups # noqa: F401, E501
|
||||||
|
from opencompass.configs.summarizers.groups.mmlu import \
|
||||||
|
mmlu_summary_groups # noqa: F401, E501
|
||||||
|
from opencompass.configs.summarizers.groups.mmlu_pro import \
|
||||||
|
mmlu_pro_summary_groups # noqa: F401, E501
|
||||||
|
from opencompass.configs.summarizers.groups.musr_average import \
|
||||||
|
summarizer as musr_summarizer # noqa: F401, E501
|
||||||
|
from opencompass.configs.summarizers.groups.scicode import \
|
||||||
|
scicode_summary_groups # noqa: F401, E501
|
||||||
|
from opencompass.configs.summarizers.groups.teval import \
|
||||||
|
teval_summary_groups # noqa: F401, E501
|
||||||
|
from opencompass.configs.summarizers.mmmlu_lite import \
|
||||||
|
mmmlu_summary_groups # noqa: F401, E501
|
||||||
|
|
||||||
|
from ...volc import infer as volc_infer # noqa: F401, E501
|
||||||
|
|
||||||
|
# For HumanEval-X Evaluation
|
||||||
|
# Apply the evaluator ip_address and port
|
||||||
|
race_datasets = [race_datasets[1]]
|
||||||
|
for item in humanevalx_datasets:
|
||||||
|
item['eval_cfg']['evaluator'][
|
||||||
|
'ip_address'] = 'codeeval.opencompass.org.cn/humanevalx'
|
||||||
|
item['eval_cfg']['evaluator']['port'] = ''
|
||||||
|
|
||||||
|
# For DS-1000 Evaluation
|
||||||
|
# Apply the evaluator ip_address and port
|
||||||
|
for item in ds1000_datasets:
|
||||||
|
item['eval_cfg']['evaluator'][
|
||||||
|
'ip_address'] = 'codeeval.opencompass.org.cn/ds1000'
|
||||||
|
item['eval_cfg']['evaluator']['port'] = ''
|
||||||
|
|
||||||
|
bbh_datasets = [
|
||||||
|
x for x in bbh_datasets if 'logical_deduction_seven_objects' in x['abbr']
|
||||||
|
or 'multistep_arithmetic_two' in x['abbr']
|
||||||
|
]
|
||||||
|
cmmlu_datasets = [
|
||||||
|
x for x in cmmlu_datasets if x['abbr'].replace('cmmlu-', '') in [
|
||||||
|
'ancient_chinese', 'chinese_civil_service_exam',
|
||||||
|
'chinese_driving_rule', 'chinese_food_culture',
|
||||||
|
'chinese_foreign_policy', 'chinese_history', 'chinese_literature',
|
||||||
|
'chinese_teacher_qualification', 'construction_project_management',
|
||||||
|
'elementary_chinese', 'elementary_commonsense', 'ethnology',
|
||||||
|
'high_school_politics', 'modern_chinese',
|
||||||
|
'traditional_chinese_medicine'
|
||||||
|
]
|
||||||
|
]
|
||||||
|
mmlu_datasets = [
|
||||||
|
x for x in mmlu_datasets if x['abbr'].replace('lukaemon_mmlu_', '') in [
|
||||||
|
'business_ethics', 'clinical_knowledge', 'college_medicine',
|
||||||
|
'global_facts', 'human_aging', 'management', 'marketing',
|
||||||
|
'medical_genetics', 'miscellaneous', 'nutrition',
|
||||||
|
'professional_accounting', 'professional_medicine', 'virology'
|
||||||
|
]
|
||||||
|
]
|
||||||
|
|
||||||
|
mmlu_pro_datasets = [mmlu_pro_datasets[0]]
|
||||||
|
|
||||||
|
mmmlu_lite_datasets = [
|
||||||
|
x for x in mmmlu_lite_datasets if 'mmlu_lite_AR-XY' in x['abbr']
|
||||||
|
]
|
||||||
|
mathbench_datasets = [x for x in mathbench_datasets if 'college' in x['abbr']]
|
||||||
|
GaokaoBench_datasets = [
|
||||||
|
x for x in GaokaoBench_datasets if '2010-2022_Math_II_MCQs' in x['abbr']
|
||||||
|
or '2010-2022_Math_II_Fill-in-the-Blank' in x['abbr']
|
||||||
|
]
|
||||||
|
|
||||||
|
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')
     and 'scicode' not in k.lower() and 'teval' not in k),
    [],
)
datasets += teval_en_datasets
datasets += teval_zh_datasets
# datasets += SciCode_datasets

musr_summary_groups = musr_summarizer['summary_groups']
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
summary_groups.append(
    {
        'name': 'Mathbench',
        'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
    }, )

# Summarizer
summarizer = dict(
    dataset_abbrs=[
        'Language',
        ['race-high', 'accuracy'],
        ['ARC-c', 'accuracy'],
        ['BoolQ', 'accuracy'],
        ['triviaqa_wiki_1shot', 'score'],
        ['nq_open_1shot', 'score'],
        ['mmmlu_lite', 'naive_average'],
        '',
        'Instruction Following',
        ['IFEval', 'Prompt-level-strict-accuracy'],
        '',
        'General Reasoning',
        ['drop', 'accuracy'],
        ['bbh', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
        ['hellaswag', 'accuracy'],
        ['TheoremQA', 'score'],
        ['musr_average', 'naive_average'],
        ['korbench_single', 'naive_average'],
        ['ARC_Prize_Public_Evaluation', 'accuracy'],
        '',
        'Math Calculation',
        ['gsm8k', 'accuracy'],
        ['GaokaoBench', 'weighted_average'],
        ['math', 'accuracy'],
        ['cmo_fib', 'accuracy'],
        ['aime2024', 'accuracy'],
        ['Mathbench', 'naive_average'],
        '',
        'Knowledge',
        ['wikibench-wiki-single_choice_cncircular', 'perf_4'],
        ['cmmlu', 'naive_average'],
        ['mmlu', 'naive_average'],
        ['mmlu_pro', 'naive_average'],
        '',
        'Code',
        ['openai_humaneval', 'humaneval_pass@1'],
        ['sanitized_mbpp', 'score'],
        ['humanevalx', 'naive_average'],
        ['ds1000', 'naive_average'],
        ['lcb_code_generation', 'pass@1'],
        ['lcb_code_execution', 'pass@1'],
        ['lcb_test_output', 'pass@1'],
        ['bigcodebench_hard_instruct', 'pass@1'],
        ['bigcodebench_hard_complete', 'pass@1'],
        '',
        'Agent',
        ['teval', 'naive_average'],
        ['SciCode', 'accuracy'],
        ['SciCode', 'sub_accuracy'],
        '',
        'bbh-logical_deduction_seven_objects',
        'bbh-multistep_arithmetic_two',
        '',
        'mmlu',
        'mmlu-stem',
        'mmlu-social-science',
        'mmlu-humanities',
        'mmlu-other',
        '',
        'cmmlu',
        'cmmlu-stem',
        'cmmlu-social-science',
        'cmmlu-humanities',
        'cmmlu-other',
        'cmmlu-china-specific',
        '',
        'mmlu_pro',
        'mmlu_pro_biology',
        'mmlu_pro_business',
        'mmlu_pro_chemistry',
        'mmlu_pro_computer_science',
        'mmlu_pro_economics',
        'mmlu_pro_engineering',
        'mmlu_pro_health',
        'mmlu_pro_history',
        'mmlu_pro_law',
        'mmlu_pro_math',
        'mmlu_pro_philosophy',
        'mmlu_pro_physics',
        'mmlu_pro_psychology',
        'mmlu_pro_other',
        '',
        'ds1000_Pandas',
        'ds1000_Numpy',
        'ds1000_Tensorflow',
        'ds1000_Scipy',
        'ds1000_Sklearn',
        'ds1000_Pytorch',
        'ds1000_Matplotlib',
        '',
        'mmmlu_lite',
        'openai_mmmlu_lite_AR-XY',
        'openai_mmmlu_lite_BN-BD',
        'openai_mmmlu_lite_DE-DE',
        'openai_mmmlu_lite_ES-LA',
        'openai_mmmlu_lite_FR-FR',
        'openai_mmmlu_lite_HI-IN',
        'openai_mmmlu_lite_ID-ID',
        'openai_mmmlu_lite_IT-IT',
        'openai_mmmlu_lite_JA-JP',
        'openai_mmmlu_lite_KO-KR',
        'openai_mmmlu_lite_PT-BR',
        'openai_mmmlu_lite_SW-KE',
        'openai_mmmlu_lite_YO-NG',
        'openai_mmmlu_lite_ZH-CN',
        '',
        '###### MathBench-A: Application Part ######',
        'college',
        'high',
        'middle',
        'primary',
        'arithmetic',
        'mathbench-a (average)',
        '###### MathBench-T: Theory Part ######',
        'college_knowledge',
        'high_knowledge',
        'middle_knowledge',
        'primary_knowledge',
        'mathbench-t (average)',
    ],
    summary_groups=summary_groups,
)

for d in datasets:
    d['reader_cfg']['test_range'] = '[0:16]'

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
    m['abbr'] = m['abbr'] + '_fullbench'
    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
        m['engine_config']['max_batch_size'] = 1
        m['batch_size'] = 1

models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])
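Throughout these configs, `datasets`, `summary_groups`, and `models` are assembled by scanning `locals()` for names with a known suffix; this is also why the imports carry `# noqa: F401`: the names look unused to the linter but are picked up dynamically. A minimal, self-contained sketch of the pattern (the `demo_*` names are invented for illustration):

# Illustrative only: any list bound to a *_datasets name is swept up.
demo_gsm8k_datasets = [{'abbr': 'demo_gsm8k'}]
demo_math_datasets = [{'abbr': 'demo_math'}]

collected = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')), [])
print([d['abbr'] for d in collected])  # ['demo_gsm8k', 'demo_math']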
.github/scripts/eval_regression_chat_sub_fullbench.py (vendored, new file, 182 lines)
@@ -0,0 +1,182 @@
from copy import deepcopy

from mmengine.config import read_base

from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import DefaultSubjectiveSummarizer
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

with read_base():
    # read hf models - chat models
    # Dataset
    from opencompass.configs.datasets.chinese_simpleqa.chinese_simpleqa_gen import \
        csimpleqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.SimpleQA.simpleqa_gen_0283c3 import \
        simpleqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.alignbench.alignbench_v1_1_judgeby_critiquellm_new import \
        alignbench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4_new import \
        alpacav2_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.arena_hard.arena_hard_compare_new import \
        arenahard_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.compassarena.compassarena_compare_new import \
        compassarena_datasets  # noqa: F401, E501
    # from opencompass.configs.datasets.subjective.fofo.fofo_bilingual_judge_new import fofo_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.followbench.followbench_llmeval_new import \
        followbench_llmeval_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.multiround.mtbench101_judge_new import \
        mtbench101_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.subjective.wildbench.wildbench_pair_judge_new import \
        wildbench_datasets  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
        models as hf_internlm2_5_7b_chat_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as lmdeploy_internlm2_5_7b_chat_model  # noqa: F401, E501

    from ...volc import infer as volc_infer  # noqa: F401, E501

datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')
                and 'mtbench101' not in k and 'wildbench' not in k), [])
datasets += mtbench101_datasets  # noqa: F401, E501
datasets += wildbench_datasets  # noqa: F401, E501

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
    m['abbr'] = m['abbr'] + '_fullbench'
    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
        m['engine_config']['max_batch_size'] = 1
        m['batch_size'] = 1

models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])

judge_models = deepcopy([models[1]])
judge_models[0]['abbr'] = judge_models[0]['abbr'] + '-judge'

eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=SubjectiveEvalTask)),
)

summary_groups = []
summary_groups.append({
    'name': 'compassarena_language',
    'subsets': [
        ['compassarena_language', '内容总结'],
    ],
})
summary_groups.append({
    'name': 'compassarena_knowledge',
    'subsets': [
        ['compassarena_knowledge', '生活常识_ZH'],
    ],
})
summary_groups.append({
    'name': 'compassarena_reason_v2',
    'subsets': [
        ['compassarena_reason_v2', 'reasoning'],
    ],
})
summary_groups.append({
    'name': 'compassarena_math_v2',
    'subsets': [
        ['compassarena_math_v2', '高等数学_ZH'],
    ],
})
summary_groups.append({
    'name': 'compassarena_creationv2_zh',
    'subsets': [
        ['compassarena_creationv2_zh', '内容扩写_ZH'],
    ],
})
summary_groups.append({
    'name':
    'CompassArena',
    'subsets': [
        'compassarena_language',
        'compassarena_knowledge',
        'compassarena_reason_v2',
        'compassarena_math_v2',
        'compassarena_creationv2_zh',
    ],
})
summary_groups.append({
    'name':
    'FoFo',
    'subsets': [['fofo_test_prompts', 'overall'],
                ['fofo_test_prompts_cn', 'overall']],
})
summary_groups.append({
    'name':
    'Followbench',
    'subsets': [
        ['followbench_llmeval_en', 'HSR_AVG'],
        ['followbench_llmeval_en', 'SSR_AVG'],
    ],
})

# Summarizer
summarizer = dict(
    dataset_abbrs=[
        ['alignment_bench_v1_1', '总分'],
        ['alpaca_eval', 'total'],
        ['arenahard', 'score'],
        ['Followbench', 'naive_average'],
        ['CompassArena', 'naive_average'],
        ['FoFo', 'naive_average'],
        ['mtbench101', 'avg'],
        ['wildbench', 'average'],
        ['simpleqa', 'accuracy_given_attempted'],
        ['chinese_simpleqa', 'given_attempted_accuracy'],
        '',
        ['alignment_bench_v1_1', '专业能力'],
        ['alignment_bench_v1_1', '数学计算'],
        ['alignment_bench_v1_1', '基本任务'],
        ['alignment_bench_v1_1', '逻辑推理'],
        ['alignment_bench_v1_1', '中文理解'],
        ['alignment_bench_v1_1', '文本写作'],
        ['alignment_bench_v1_1', '角色扮演'],
        ['alignment_bench_v1_1', '综合问答'],
        ['alpaca_eval', 'helpful_base'],
        ['alpaca_eval', 'koala'],
        ['alpaca_eval', 'oasst'],
        ['alpaca_eval', 'selfinstruct'],
        ['alpaca_eval', 'vicuna'],
        ['compassarena_language', 'naive_average'],
        ['compassarena_knowledge', 'naive_average'],
        ['compassarena_reason_v2', 'naive_average'],
        ['compassarena_math_v2', 'naive_average'],
        ['compassarena_creationv2_zh', 'naive_average'],
        ['fofo_test_prompts', 'overall'],
        ['fofo_test_prompts_cn', 'overall'],
        ['followbench_llmeval_en', 'HSR_AVG'],
        ['followbench_llmeval_en', 'SSR_AVG'],
        ['followbench_llmeval_en', 'HSR_L1'],
        ['followbench_llmeval_en', 'HSR_L2'],
        ['followbench_llmeval_en', 'HSR_L3'],
        ['followbench_llmeval_en', 'HSR_L4'],
        ['followbench_llmeval_en', 'HSR_L5'],
        ['followbench_llmeval_en', 'SSR_L1'],
        ['followbench_llmeval_en', 'SSR_L2'],
        ['followbench_llmeval_en', 'SSR_L3'],
        ['followbench_llmeval_en', 'SSR_L4'],
        ['followbench_llmeval_en', 'SSR_L5'],
        ['simpleqa', 'f1'],
    ],
    type=DefaultSubjectiveSummarizer,
    summary_groups=summary_groups,
)
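The judge model above is a deep copy of one of the evaluated models (`models[1]` after sorting), relabelled with a '-judge' suffix so its abbr does not collide with the candidate entry. A minimal sketch of why the `deepcopy` matters (toy dicts, illustrative only):

from copy import deepcopy

# Illustrative only: a shallow copy would share the nested config dict.
model = {'abbr': 'demo-chat_fullbench', 'run_cfg': {'num_gpus': 1}}

judge = deepcopy(model)
judge['abbr'] += '-judge'
judge['run_cfg']['num_gpus'] = 8  # does not leak back into `model`

assert model['run_cfg']['num_gpus'] == 1
assert judge['abbr'] == 'demo-chat_fullbench-judge'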
.github/scripts/oc_score_assert.py (vendored, 345 changed lines)
@@ -6,11 +6,29 @@ import yaml
 
 output_path = 'regression_result_daily'
 
-model_list = ['internlm-7b-hf', 'internlm-chat-7b-hf', 'chatglm3-6b-base-hf']
-dataset_list = [
-    'ARC-c', 'chid-dev', 'chid-test', 'openai_humaneval', 'openbookqa',
-    'openbookqa_fact'
-]
+
+def model_list(type):
+    config_path = '.github/scripts/oc_score_baseline_testrange.yaml'
+    with open(config_path) as f:
+        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
+    return config.get(type).keys()
+
+
+def dataset_list(model, type):
+    config_path = '.github/scripts/oc_score_baseline_fullbench.yaml'
+    with open(config_path) as f:
+        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
+    return config.get(model).get(type).keys()
+
+
+@pytest.fixture()
+def baseline_scores_testrange(request):
+    config_path = os.path.join(
+        request.config.rootdir,
+        '.github/scripts/oc_score_baseline_testrange.yaml')
+    with open(config_path) as f:
+        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
+    return config
+
+
 @pytest.fixture()
@@ -22,6 +40,16 @@ def baseline_scores(request):
     return config
 
 
+@pytest.fixture()
+def baseline_scores_fullbench(request):
+    config_path = os.path.join(
+        request.config.rootdir,
+        '.github/scripts/oc_score_baseline_fullbench.yaml')
+    with open(config_path) as f:
+        config = yaml.load(f.read(), Loader=yaml.SafeLoader)
+    return config
+
+
 @pytest.fixture()
 def result_scores():
     file = find_csv_files(output_path)
@@ -31,55 +59,314 @@ def result_scores():
 
 
 @pytest.mark.usefixtures('result_scores')
-@pytest.mark.usefixtures('baseline_scores')
+@pytest.mark.usefixtures('baseline_scores_testrange')
+@pytest.mark.chat_models
 class TestChat:
     """Test cases for chat model."""
 
-    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in model_list
-                             for p2 in dataset_list])
-    def test_model_dataset_score(self, baseline_scores, result_scores, model,
-                                 dataset):
+    @pytest.mark.parametrize(
+        'model, dataset', [(p1, p2) for p1 in model_list('chat')
+                           for p2 in ['gsm8k_accuracy', 'race-high_accuracy']])
+    def test_model_dataset_score(self, baseline_scores_testrange,
+                                 result_scores, model, dataset):
+        base_score = baseline_scores_testrange.get('chat').get(model).get(
+            dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model, result_score, base_score, dataset)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores_testrange')
+@pytest.mark.base_models
+class TestBase:
+    """Test cases for base model."""
+
+    @pytest.mark.parametrize('model, dataset',
+                             [(p1, p2) for p1 in model_list('base') for p2 in [
+                                 'gsm8k_accuracy', 'GPQA_diamond_accuracy',
+                                 'race-high_accuracy', 'winogrande_accuracy'
+                             ]])
+    def test_model_dataset_score(self, baseline_scores_testrange,
+                                 result_scores, model, dataset):
+        if model in ['gemma-2b-vllm', 'gemma-7b-vllm'
+                     ] and dataset != 'gsm8k_accuracy':
+            return
+        base_score = baseline_scores_testrange.get('base').get(model).get(
+            dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model, result_score, base_score, dataset)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores_fullbench')
+@pytest.mark.chat_obj_fullbench
+class TestChatObjFullbench:
+    """Test cases for chat model."""
+
+    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
+        'internlm2_5-7b-chat-hf_fullbench',
+        'internlm2_5-7b-chat-turbomind_fullbench'
+    ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'objective')])
+    def test_model_dataset_score(self, baseline_scores_fullbench,
+                                 result_scores, model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model, result_score, base_score, dataset)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores_fullbench')
+@pytest.mark.chat_sub_fullbench
+class TestChatSubFullbench:
+    """Test cases for chat model."""
+
+    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
+        'internlm2_5-7b-chat-hf_fullbench',
+        'internlm2_5-7b-chat-turbomind_fullbench'
+    ] for p2 in dataset_list('internlm2_5-7b-chat-hf_fullbench', 'subjective')]
+                             )
+    def test_model_dataset_score(self, baseline_scores_fullbench,
+                                 result_scores, model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get(
+            'subjective').get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model, result_score, base_score, dataset)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores_fullbench')
+@pytest.mark.base_fullbench
+class TestBaseFullbench:
+    """Test cases for chat model."""
+
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in
+         ['internlm2_5-7b-hf_fullbench', 'internlm2_5-7b-turbomind_fullbench']
+         for p2 in dataset_list('internlm2_5-7b-hf_fullbench', 'objective')])
+    def test_model_dataset_score(self, baseline_scores_fullbench,
+                                 result_scores, model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model, result_score, base_score, dataset)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores')
+@pytest.mark.api
+class TestApibench:
+    """Test cases for chat model."""
+
+    @pytest.mark.parametrize('model, dataset',
+                             [('lmdeploy-api-test', 'race-middle_accuracy'),
+                              ('lmdeploy-api-test', 'race-high_accuracy'),
+                              ('lmdeploy-api-test', 'gsm8k_accuracy')])
+    def test_api(self, baseline_scores, result_scores, model, dataset):
         base_score = baseline_scores.get(model).get(dataset)
         result_score = result_scores.get(model).get(dataset)
-        assert_score(result_score, base_score)
+        assert_score(model + '_batch', result_score, base_score, dataset)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores_fullbench')
+@pytest.mark.volc_fullbench
+class TestVolcFullbench:
+    """Test cases for chat model."""
+
+    @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [
+        'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind',
+        'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch',
+        'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch'
+    ] for p2 in dataset_list(p1, 'objective')])
+    @pytest.mark.chat_objective
+    def test_chat_objective(self, baseline_scores_fullbench, result_scores,
+                            model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)
+
+    @pytest.mark.parametrize('model, dataset', [
+        (p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind']
+        for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'subjective')
+    ])
+    @pytest.mark.chat_subjective
+    def test_chat_subjective(self, baseline_scores_fullbench, result_scores,
+                             model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get(
+            'subjective').get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)
+
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
+         for p2 in dataset_list('internlm2_5-7b-turbomind', 'objective')])
+    @pytest.mark.base_objective
+    def test_base_objective(self, baseline_scores_fullbench, result_scores,
+                            model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get('objective').get(
+            dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)
+
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2) for p1 in ['internlm2_5-7b-turbomind']
+         for p2 in dataset_list('internlm2_5-7b-turbomind', 'long_context')])
+    @pytest.mark.base_long_context
+    def test_base_long_context(self, baseline_scores_fullbench, result_scores,
+                               model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get(
+            'long_context').get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)
+
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [(p1, p2)
+         for p1 in ['internlm2_5-7b-chat-1m-turbomind'] for p2 in dataset_list(
+             'internlm2_5-7b-chat-1m-turbomind', 'long_context')])
+    @pytest.mark.chat_long_context
+    def test_chat_long_context(self, baseline_scores_fullbench, result_scores,
+                               model, dataset):
+        base_score = baseline_scores_fullbench.get(model).get(
+            'long_context').get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)
+
+
+@pytest.mark.usefixtures('result_scores')
+@pytest.mark.usefixtures('baseline_scores')
+class TestCmdCase:
+
+    @pytest.mark.case1
+    @pytest.mark.parametrize('model, dataset',
+                             [('internlm2_5-7b-hf', 'race-middle_accuracy'),
+                              ('internlm2_5-7b-hf', 'race-high_accuracy'),
+                              ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')])
+    def test_cmd_case1(self, baseline_scores, result_scores, model, dataset):
+        base_score = baseline_scores.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model, result_score, base_score, dataset)
+
+    @pytest.mark.case2
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'),
+         ('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'),
+         ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'),
+         ('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'),
+         ('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'),
+         ('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')])
+    def test_cmd_case2(self, baseline_scores, result_scores, model, dataset):
+        base_score = baseline_scores.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)
+
+    @pytest.mark.case3
+    @pytest.mark.parametrize('model, dataset',
+                             [('internlm2_5-7b_hf', 'race-middle_accuracy'),
+                              ('internlm2_5-7b_hf', 'race-high_accuracy'),
+                              ('internlm2_5-7b_hf', 'demo_gsm8k_accuracy')])
+    def test_cmd_case3(self, baseline_scores, result_scores, model, dataset):
+        base_score = baseline_scores.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model, result_score, base_score, dataset)
+
+    @pytest.mark.case4
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'),
+         ('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'),
+         ('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')])
+    def test_cmd_case4(self, baseline_scores, result_scores, model, dataset):
+        base_score = baseline_scores.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)
+
+    @pytest.mark.case5
+    @pytest.mark.parametrize(
+        'model, dataset',
+        [('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'),
+         ('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'),
+         ('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')])
+    def test_cmd_case5(self, baseline_scores, result_scores, model, dataset):
+        base_score = baseline_scores.get(model).get(dataset)
+        result_score = result_scores.get(model).get(dataset)
+        assert_score(model + '_batch', result_score, base_score, dataset)
 
 
-def assert_score(score, baseline):
+def assert_score(model_type, score, baseline, dataset: str = ''):
     if score is None or score == '-':
         assert False, 'value is none'
-    if float(score) < (baseline * 1.03) and float(score) > (baseline * 0.97):
-        print(score + ' between ' + str(baseline * 0.97) + ' and ' +
-              str(baseline * 1.03))
-        assert True
-    else:
-        assert False, score + ' not between ' + str(
-            baseline * 0.97) + ' and ' + str(baseline * 1.03)
+
+    if 'batch' not in model_type:
+        if float(score) <= (baseline + 0.01) and float(score) >= (baseline -
+                                                                  0.01):
+            print(' '.join([score, 'is equal', str(baseline)]))
+            assert True
+        else:
+            print(' '.join([score, 'is not equal', str(baseline)]))
+            assert False, ' '.join([score, 'is not equal', str(baseline)])
+    else:
+        if dataset.startswith('dingo') or dataset.startswith(
+                'GPQA') or dataset.startswith('high') or dataset.startswith(
+                    'mmlu_pro_') or dataset.startswith(
+                        'alpaca_eval') or dataset.startswith('compassarena_'):
+            threshold = 5
+        elif dataset.startswith('humanevalx') or dataset == 'large_threshold':
+            threshold = 10
+        else:
+            threshold = 3
+        if float(score) <= (baseline + threshold) and float(score) >= (
+                baseline - threshold):
+            print(' '.join([
+                score, 'is between',
+                str(baseline - threshold), 'and',
+                str(baseline + threshold)
+            ]))
+            assert True
+        else:
+            print(' '.join([
+                score, 'is not between',
+                str(baseline - threshold), 'and',
+                str(baseline + threshold)
+            ]))
+            assert False, ' '.join([
+                score, 'is not between',
+                str(baseline - threshold), 'and',
+                str(baseline + threshold)
+            ])
 
 
 def find_csv_files(directory):
     csv_files = []
     for root, dirs, files in os.walk(directory):
         for file in files:
-            if file.endswith('.csv'):
+            if file.endswith('.csv') and file.startswith('summary'):
                 csv_files.append(os.path.join(root, file))
-    if len(csv_files) > 1:
-        raise 'have more than 1 result file, please check the result manually'
-    if len(csv_files) == 0:
-        return None
-    return csv_files[0]
+
+    csv_files_with_time = {f: os.path.getctime(f) for f in csv_files}
+    sorted_csv_files = sorted(csv_files_with_time.items(), key=lambda x: x[1])
+    latest_csv_file = sorted_csv_files[-1][0]
+    return latest_csv_file
 
 
 def read_csv_file(file_path):
     with open(file_path, 'r') as csvfile:
         reader = csv.DictReader(csvfile)
         filtered_data = []
+
         for row in reader:
-            filtered_row = {
-                k: v
-                for k, v in row.items()
-                if k not in ['version', 'metric', 'mode']
-            }
-            filtered_data.append(filtered_row)
+            if row['metric'] is not None and 'bpb' not in row[
+                    'metric'] and '_' != row['metric']:
+                filtered_row = row
+                filtered_row['dataset'] = row['dataset'] + '_' + row['metric']
+                del filtered_row['version']
+                del filtered_row['metric']
+                del filtered_row['mode']
+                filtered_data.append(filtered_row)
 
     result = {}
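The reworked assert_score replaces the old ±3% relative check with absolute tolerance bands: near-exact equality (±0.01) for non-batched runs, and a per-dataset absolute threshold (3, 5, or 10 points) for batched runs. A small self-contained sketch of the threshold selection, mirroring the logic in the diff above:

def pick_threshold(dataset: str) -> int:
    """Mirror of the per-dataset tolerance used by assert_score."""
    noisy_prefixes = ('dingo', 'GPQA', 'high', 'mmlu_pro_', 'alpaca_eval',
                      'compassarena_')
    if dataset.startswith(noisy_prefixes):
        return 5
    if dataset.startswith('humanevalx') or dataset == 'large_threshold':
        return 10
    return 3


# Usage: a batched gsm8k score of 85.0 passes against a baseline of 86.73,
# since abs(85.0 - 86.73) <= 3.
assert abs(85.0 - 86.73) <= pick_threshold('gsm8k_accuracy')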
.github/scripts/oc_score_baseline.yaml (vendored, 58 changed lines)
@@ -1,23 +1,39 @@
-internlm-7b-hf:
-  ARC-c: 36.27
-  chid-dev: 81.68
-  chid-test: 83.67
-  openai_humaneval: 10.37
-  openbookqa: 44.4
-  openbookqa_fact: 73.2
+internlm2_5-7b-hf:
+  demo_gsm8k_accuracy: 42.19
+  race-middle_accuracy: 91.78
+  race-high_accuracy: 90.02
 
-internlm-chat-7b-hf:
-  ARC-c: 36.95
-  chid-dev: 71.78
-  chid-test: 76.87
-  openai_humaneval: 21.34
-  openbookqa: 66.6
-  openbookqa_fact: 80.4
+internlm2_5-7b_hf:
+  demo_gsm8k_accuracy: 42.19
+  race-middle_accuracy: 91.78
+  race-high_accuracy: 90.02
 
-chatglm3-6b-base-hf:
-  ARC-c: 43.05
-  chid-dev: 80.2
-  chid-test: 80.77
-  openai_humaneval: 20.73
-  openbookqa: 79.8
-  openbookqa_fact: 92.2
+internlm2_5-7b-chat-lmdeploy:
+  demo_gsm8k_accuracy: 84.38
+  race-middle_accuracy: 92.76
+  race-high_accuracy: 90.54
+
+internlm3-8b-instruct-lmdeploy:
+  demo_gsm8k_accuracy: 73.44
+  race-middle_accuracy: 93.38
+  race-high_accuracy: 90.34
+
+internlm3-8b-instruct_hf-lmdeploy:
+  demo_gsm8k_accuracy: 73.44
+  race-middle_accuracy: 93.38
+  race-high_accuracy: 90.34
+
+internlm3-8b-instruct_hf-vllm:
+  demo_gsm8k_accuracy: 78.12
+  race-middle_accuracy: 92.20
+  race-high_accuracy: 89.88
+
+internlm2_5-7b-chat_hf:
+  demo_gsm8k_accuracy: 87.50
+  race-middle_accuracy: 92.76
+  race-high_accuracy: 90.48
+
+lmdeploy-api-test:
+  gsm8k_accuracy: 68.75
+  race-middle_accuracy: 93.75
+  race-high_accuracy: 93.75
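The baseline_scores fixture loads this file and the tests index into it by model abbr and metric name. A minimal sketch of the lookup (assumes it is run from the repository root, as the fixture's rootdir-relative path implies):

import yaml

# Illustrative only: load the baseline file and fetch one expected score.
with open('.github/scripts/oc_score_baseline.yaml') as f:
    baseline = yaml.safe_load(f)

base_score = baseline.get('internlm2_5-7b-hf').get('demo_gsm8k_accuracy')
print(base_score)  # 42.19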
983
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
Normal file
983
.github/scripts/oc_score_baseline_fullbench.yaml
vendored
Normal file
@ -0,0 +1,983 @@
|
|||||||
|
internlm2_5-7b-chat-hf_fullbench:
|
||||||
|
objective:
|
||||||
|
race-high_accuracy: 93.75
|
||||||
|
ARC-c_accuracy: 93.75
|
||||||
|
BoolQ_accuracy: 81.25
|
||||||
|
triviaqa_wiki_1shot_score: 50
|
||||||
|
nq_open_1shot_score: 25
|
||||||
|
IFEval_Prompt-level-strict-accuracy: 50
|
||||||
|
drop_accuracy: 81.25
|
||||||
|
GPQA_diamond_accuracy: 25
|
||||||
|
hellaswag_accuracy: 87.5
|
||||||
|
TheoremQA_score: 12.50
|
||||||
|
musr_average_naive_average: 39.58
|
||||||
|
korbench_single_naive_average: 40
|
||||||
|
gsm8k_accuracy: 62.50
|
||||||
|
math_accuracy: 75
|
||||||
|
cmo_fib_accuracy: 6.25
|
||||||
|
aime2024_accuracy: 6.25
|
||||||
|
wikibench-wiki-single_choice_cncircular_perf_4: 50
|
||||||
|
sanitized_mbpp_score: 68.75
|
||||||
|
ds1000_naive_average: 16.96
|
||||||
|
lcb_code_generation_pass@1: 12.5
|
||||||
|
lcb_code_execution_pass@1: 43.75
|
||||||
|
lcb_test_output_pass@1: 18.75
|
||||||
|
bbh-logical_deduction_seven_objects_score: 50
|
||||||
|
bbh-multistep_arithmetic_two_score: 68.75
|
||||||
|
mmlu-other_accuracy: 72.6
|
||||||
|
cmmlu-china-specific_accuracy: 76.25
|
||||||
|
mmlu_pro_math_accuracy: 25
|
||||||
|
ds1000_Pandas_accuracy: 12.5
|
||||||
|
ds1000_Numpy_accuracy: 0
|
||||||
|
ds1000_Tensorflow_accuracy: 12.5
|
||||||
|
ds1000_Scipy_accuracy: 18.75
|
||||||
|
ds1000_Sklearn_accuracy: 18.75
|
||||||
|
ds1000_Pytorch_accuracy: 12.5
|
||||||
|
ds1000_Matplotlib_accuracy: 43.75
|
||||||
|
openai_mmmlu_lite_AR-XY_accuracy: 37.5
|
||||||
|
college_naive_average: 12.5
|
||||||
|
college_knowledge_naive_average: 87.5
|
||||||
|
subjective:
|
||||||
|
alignment_bench_v1_1_总分: 0.66
|
||||||
|
alpaca_eval_total: 20.00
|
||||||
|
arenahard_score: 56.82
|
||||||
|
Followbench_naive_average: 1
|
||||||
|
CompassArena_naive_average: 43
|
||||||
|
mtbench101_avg: 7.60
|
||||||
|
wildbench_average: -14.58
|
||||||
|
simpleqa_accuracy_given_attempted: 1.00
|
||||||
|
chinese_simpleqa_given_attempted_accuracy: 0.90
|
||||||
|
alignment_bench_v1_1_专业能力: 7.90
|
||||||
|
alignment_bench_v1_1_数学计算: 0
|
||||||
|
alignment_bench_v1_1_基本任务: 0
|
||||||
|
alignment_bench_v1_1_逻辑推理: 0
|
||||||
|
alignment_bench_v1_1_中文理解: 0
|
||||||
|
alignment_bench_v1_1_文本写作: 0
|
||||||
|
alignment_bench_v1_1_角色扮演: 0
|
||||||
|
alignment_bench_v1_1_综合问答: 0
|
||||||
|
alpaca_eval_helpful_base: 20.00
|
||||||
|
compassarena_language_naive_average: 35
|
||||||
|
compassarena_knowledge_naive_average: 60.00
|
||||||
|
compassarena_reason_v2_naive_average: 40
|
||||||
|
compassarena_math_v2_naive_average: 50.00
|
||||||
|
compassarena_creationv2_zh_naive_average: 30
|
||||||
|
followbench_llmeval_en_HSR_AVG: 1
|
||||||
|
followbench_llmeval_en_SSR_AVG: 1
|
||||||
|
followbench_llmeval_en_HSR_L1: 1
|
||||||
|
followbench_llmeval_en_HSR_L2: 1
|
||||||
|
followbench_llmeval_en_HSR_L3: 1
|
||||||
|
followbench_llmeval_en_HSR_L4: 1
|
||||||
|
followbench_llmeval_en_HSR_L5: 1
|
||||||
|
followbench_llmeval_en_SSR_L1: 1
|
||||||
|
followbench_llmeval_en_SSR_L2: 1
|
||||||
|
followbench_llmeval_en_SSR_L3: 1
|
||||||
|
followbench_llmeval_en_SSR_L4: 1
|
||||||
|
followbench_llmeval_en_SSR_L5: 1
|
||||||
|
simpleqa_f1: 0.12
|
||||||
|
|
||||||
|
internlm2_5-7b-chat-turbomind_fullbench:
|
||||||
|
objective:
|
||||||
|
race-high_accuracy: 93.75
|
||||||
|
ARC-c_accuracy: 93.75
|
||||||
|
BoolQ_accuracy: 75.00
|
||||||
|
triviaqa_wiki_1shot_score: 50
|
||||||
|
nq_open_1shot_score: 25
|
||||||
|
IFEval_Prompt-level-strict-accuracy: 56.25
|
||||||
|
drop_accuracy: 75
|
||||||
|
GPQA_diamond_accuracy: 37.50
|
||||||
|
hellaswag_accuracy: 81.25
|
||||||
|
TheoremQA_score: 12.5
|
||||||
|
musr_average_naive_average: 39.58
|
||||||
|
korbench_single_naive_average: 40
|
||||||
|
gsm8k_accuracy: 68.75
|
||||||
|
math_accuracy: 68.75
|
||||||
|
cmo_fib_accuracy: 6.25
|
||||||
|
aime2024_accuracy: 6.25
|
||||||
|
wikibench-wiki-single_choice_cncircular_perf_4: 25
|
||||||
|
sanitized_mbpp_score: 68.75
|
||||||
|
ds1000_naive_average: 15.18
|
||||||
|
lcb_code_generation_pass@1: 12.5
|
||||||
|
lcb_code_execution_pass@1: 43.75
|
||||||
|
lcb_test_output_pass@1: 0.00
|
||||||
|
bbh-logical_deduction_seven_objects_score: 62.50
|
||||||
|
bbh-multistep_arithmetic_two_score: 62.50
|
||||||
|
mmlu-other_accuracy: 73.08
|
||||||
|
cmmlu-china-specific_accuracy: 75.42
|
||||||
|
mmlu_pro_math_accuracy: 25.00
|
||||||
|
ds1000_Pandas_accuracy: 0.00
|
||||||
|
ds1000_Numpy_accuracy: 0
|
||||||
|
ds1000_Tensorflow_accuracy: 12.5
|
||||||
|
ds1000_Scipy_accuracy: 18.75
|
||||||
|
ds1000_Sklearn_accuracy: 18.75
|
||||||
|
ds1000_Pytorch_accuracy: 12.50
|
||||||
|
ds1000_Matplotlib_accuracy: 43.75
|
||||||
|
openai_mmmlu_lite_AR-XY_accuracy: 37.5
|
||||||
|
college_naive_average: 12.50
|
||||||
|
college_knowledge_naive_average: 87.5
|
||||||
|
subjective:
|
||||||
|
alignment_bench_v1_1_总分: 0.72
|
||||||
|
alpaca_eval_total: 20.00
|
||||||
|
arenahard_score: 55.77
|
||||||
|
Followbench_naive_average: 1
|
||||||
|
CompassArena_naive_average: 39.00
|
||||||
|
mtbench101_avg: 7.90
|
||||||
|
wildbench_average: 0.00
|
||||||
|
simpleqa_accuracy_given_attempted: 1.00
|
||||||
|
chinese_simpleqa_given_attempted_accuracy: 1
|
||||||
|
alignment_bench_v1_1_专业能力: 8.70
|
||||||
|
alignment_bench_v1_1_数学计算: 0
|
||||||
|
alignment_bench_v1_1_基本任务: 0
|
||||||
|
alignment_bench_v1_1_逻辑推理: 0
|
||||||
|
alignment_bench_v1_1_中文理解: 0
|
||||||
|
alignment_bench_v1_1_文本写作: 0
|
||||||
|
alignment_bench_v1_1_角色扮演: 0
|
||||||
|
alignment_bench_v1_1_综合问答: 0
|
||||||
|
alpaca_eval_helpful_base: 20.00
|
||||||
|
compassarena_language_naive_average: 25.00
|
||||||
|
compassarena_knowledge_naive_average: 55.00
|
||||||
|
compassarena_reason_v2_naive_average: 35.00
|
||||||
|
compassarena_math_v2_naive_average: 55.00
|
||||||
|
compassarena_creationv2_zh_naive_average: 25.00
|
||||||
|
followbench_llmeval_en_HSR_AVG: 1
|
||||||
|
followbench_llmeval_en_SSR_AVG: 1
|
||||||
|
followbench_llmeval_en_HSR_L1: 1
|
||||||
|
followbench_llmeval_en_HSR_L2: 1
|
||||||
|
followbench_llmeval_en_HSR_L3: 1
|
||||||
|
followbench_llmeval_en_HSR_L4: 1
|
||||||
|
followbench_llmeval_en_HSR_L5: 1
|
||||||
|
followbench_llmeval_en_SSR_L1: 1
|
||||||
|
followbench_llmeval_en_SSR_L2: 1
|
||||||
|
followbench_llmeval_en_SSR_L3: 1
|
||||||
|
followbench_llmeval_en_SSR_L4: 1
|
||||||
|
followbench_llmeval_en_SSR_L5: 1
|
||||||
|
simpleqa_f1: 0.12
|
||||||
|
|
||||||
|
internlm2_5-7b-hf_fullbench:
|
||||||
|
objective:
|
||||||
|
race-high_accuracy: 100
|
||||||
|
ARC-c_accuracy: 68.75
|
||||||
|
BoolQ_accuracy: 87.5
|
||||||
|
triviaqa_wiki_1shot_score: 43.75
|
||||||
|
nq_open_1shot_score: 43.75
|
||||||
|
drop_accuracy: 62.5
|
||||||
|
GPQA_diamond_accuracy: 62.5
|
||||||
|
hellaswag_accuracy: 93.75
|
||||||
|
TheoremQA_score: 18.75
|
||||||
|
winogrande_accuracy: 75
|
||||||
|
gsm8k_accuracy: 37.5
|
||||||
|
GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5
|
||||||
|
GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
|
||||||
|
math_accuracy: 12.5
|
||||||
|
wikibench-wiki-single_choice_cncircular_perf_4: 25
|
||||||
|
sanitized_mbpp_score: 56.25
|
||||||
|
dingo_en_192_score: 37.5
|
||||||
|
dingo_zh_170_score: 100
|
||||||
|
mmlu-other_accuracy: 76.92
|
||||||
|
cmmlu-china-specific_accuracy: 84.17
|
||||||
|
mmlu_pro_math_accuracy: 18.75
|
||||||
|
bbh-logical_deduction_seven_objects_score: 43.75
|
||||||
|
bbh-multistep_arithmetic_two_score: 56.25
|
||||||
|
college_naive_average: 12.5
|
||||||
|
college_knowledge_naive_average: 87.5
|
||||||
|
|
||||||
|
internlm2_5-7b-turbomind_fullbench:
|
||||||
|
objective:
|
||||||
|
race-high_accuracy: 100
|
||||||
|
ARC-c_accuracy: 68.75
|
||||||
|
BoolQ_accuracy: 87.5
|
||||||
|
triviaqa_wiki_1shot_score: 43.75
|
||||||
|
nq_open_1shot_score: 43.75
|
||||||
|
drop_accuracy: 62.5
|
||||||
|
GPQA_diamond_accuracy: 68.75
|
||||||
|
hellaswag_accuracy: 93.75
|
||||||
|
TheoremQA_score: 18.75
|
||||||
|
winogrande_accuracy: 87.5
|
||||||
|
gsm8k_accuracy: 62.50
|
||||||
|
GaokaoBench_2010-2022_Math_II_MCQs_score: 93.75
|
||||||
|
GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0
|
||||||
|
math_accuracy: 6.25
|
||||||
|
wikibench-wiki-single_choice_cncircular_perf_4: 0.00
|
||||||
|
sanitized_mbpp_score: 62.50
|
||||||
|
dingo_en_192_score: 37.50
|
||||||
|
dingo_zh_170_score: 100.00
|
||||||
|
mmlu-other_accuracy: 78.37
|
||||||
|
cmmlu-china-specific_accuracy: 83.33
|
||||||
|
mmlu_pro_math_accuracy: 18.75
|
||||||
|
bbh-logical_deduction_seven_objects_score: 62.50
|
||||||
|
bbh-multistep_arithmetic_two_score: 50.00
|
||||||
|
college_naive_average: 12.5
|
||||||
|
college_knowledge_naive_average: 87.5
|
||||||
|
|
||||||
|
internlm2_5-7b-turbomind:
|
||||||
|
objective:
|
||||||
|
race-high_accuracy: 89.28
|
||||||
|
ARC-c_accuracy: 52.2
|
||||||
|
BoolQ_accuracy: 89.72
|
||||||
|
triviaqa_wiki_1shot_score: 65.88
|
||||||
|
nq_open_1shot_score: 34.82
|
||||||
|
drop_accuracy: 68.1
|
||||||
|
bbh_naive_average: 72.15
|
||||||
|
GPQA_diamond_accuracy: 32.83
|
||||||
|
hellaswag_accuracy: 88.36
|
||||||
|
TheoremQA_score: 25
|
||||||
|
winogrande_accuracy: 81.29
|
||||||
|
gsm8k_accuracy: 74.68
|
||||||
|
GaokaoBench_weighted_average: 58.19
|
||||||
|
math_accuracy: 33.98
|
||||||
|
Mathbench_naive_average: 48.38
|
||||||
|
wikibench-wiki-single_choice_cncircular_perf_4: 29.1
|
||||||
|
cmmlu_naive_average: 78.94
|
||||||
|
mmlu_naive_average: 71.44
|
||||||
|
mmlu_pro_naive_average: 38.18
|
||||||
|
openai_humaneval_humaneval_pass@1: 59.76
|
||||||
|
openai_humaneval_v2_humaneval_pass@1: 57.93
|
||||||
|
sanitized_mbpp_score: 55.25
|
||||||
|
dingo_en_192_score: 60.94
|
||||||
|
dingo_zh_170_score: 67.65
|
||||||
|
mmlu-stem_accuracy: 63.72
|
||||||
|
mmlu-social-science_accuracy: 80.15
|
||||||
|
mmlu-humanities_accuracy: 74.27
|
||||||
|
mmlu-other_accuracy: 71.85
|
||||||
|
cmmlu-stem_accuracy: 67.07
|
||||||
|
cmmlu-social-science_accuracy: 81.49
|
||||||
|
cmmlu-humanities_accuracy: 85.84
|
||||||
|
cmmlu-other_accuracy: 82.69
|
||||||
|
cmmlu-china-specific_accuracy: 79.88
|
||||||
|
mmlu_pro_biology_accuracy: 58.58
|
||||||
|
mmlu_pro_business_accuracy: 28.01
|
||||||
|
mmlu_pro_chemistry_accuracy: 22.79
|
||||||
|
mmlu_pro_computer_science_accuracy: 39.02
|
||||||
|
mmlu_pro_economics_accuracy: 53.08
|
||||||
|
mmlu_pro_engineering_accuracy: 25.7
|
||||||
|
mmlu_pro_health_accuracy: 46.94
|
||||||
|
mmlu_pro_history_accuracy: 43.04
|
||||||
|
mmlu_pro_law_accuracy: 29.7
|
||||||
|
mmlu_pro_math_accuracy: 24.2
|
||||||
|
mmlu_pro_philosophy_accuracy: 42.48
|
||||||
|
mmlu_pro_physics_accuracy: 26.02
|
||||||
|
mmlu_pro_psychology_accuracy: 52.76
|
||||||
|
mmlu_pro_other_accuracy: 42.21
|
||||||
|
college_naive_average: 7.00
|
||||||
|
high_naive_average: 6.67
|
||||||
|
middle_naive_average: 26.67
|
||||||
|
primary_naive_average: 64.00
|
||||||
|
arithmetic_naive_average: 55
|
||||||
|
mathbench-a (average)_naive_average: 31.8
|
||||||
|
college_knowledge_naive_average: 58.23
|
||||||
|
high_knowledge_naive_average: 52.51
|
||||||
|
middle_knowledge_naive_average: 71.15
|
||||||
|
primary_knowledge_naive_average: 60.48
|
||||||
|
mathbench-t (average)_naive_average: 60.19
|
||||||
|
long_context:
|
||||||
|
Single-Needle-Retrieval(S-RT)-32000_naive_average: 100
|
||||||
|
Single-Needle-Retrieval-EN-32000_naive_average: 100
|
||||||
|
Single-Needle-Retrieval-ZH-32000_naive_average: 100
|
||||||
|
Single-Needle-Retrieval(S-RT)-100000_naive_average: 100
|
||||||
|
Single-Needle-Retrieval-EN-100000_naive_average: 100
|
||||||
|
Single-Needle-Retrieval-ZH-100000_naive_average: 100
|
||||||
|
Single-Needle-Retrieval(S-RT)-200000_naive_average: 100
|
||||||
|
Single-Needle-Retrieval-EN-200000_naive_average: 100
|
||||||
|
Single-Needle-Retrieval-ZH-200000_naive_average: 100
|
||||||
|
longbench_naive_average: 46.19
|
||||||
|
longbench_zh_naive_average: 49.3
|
||||||
|
longbench_en_naive_average: 43.97
|
||||||
|
longbench_single-document-qa_score: 42.84
|
||||||
|
longbench_multi-document-qa_score: 41.25
|
||||||
|
longbench_summarization_score: 23.21
|
||||||
|
longbench_few-shot-learning_score: 61.67
|
||||||
|
longbench_synthetic-tasks_score: 60.05
|
||||||
|
longbench_code-completion_score: 52.09
|
||||||
|
|
||||||
|
internlm2_5-7b-chat-turbomind:
|
||||||
|
objective:
|
||||||
|
race-high_accuracy: 86.16
|
||||||
|
ARC-c_accuracy: 90.17
|
||||||
|
BoolQ_accuracy: 87.89
|
||||||
|
triviaqa_wiki_1shot_score: 64.91
|
||||||
|
nq_open_1shot_score: 22.69
|
||||||
|
mmmlu_lite_naive_average: 44.96
|
||||||
|
IFEval_Prompt-level-strict-accuracy: 58.04
|
||||||
|
drop_accuracy: 77.68
|
||||||
|
bbh_naive_average: 73.14
|
||||||
|
GPQA_diamond_accuracy: 31.06
|
||||||
|
hellaswag_accuracy: 94.79
|
||||||
|
TheoremQA_score: 22.25
|
||||||
|
musr_average_naive_average: 50.89
|
||||||
|
korbench_single_naive_average: 32.16
|
||||||
|
ARC_Prize_Public_Evaluation_accuracy: 0.02
|
||||||
|
gsm8k_accuracy: 86.73
|
||||||
|
GaokaoBench_weighted_average: 78.6
|
||||||
|
math_accuracy: 61
|
||||||
|
cmo_fib_accuracy: 11
|
||||||
|
aime2024_accuracy: 3.33
|
||||||
|
Mathbench_naive_average: 64.23
|
||||||
|
wikibench-wiki-single_choice_cncircular_perf_4: 31.32
|
||||||
|
cmmlu_naive_average: 74.3
|
||||||
|
mmlu_naive_average: 70.84
|
||||||
|
mmlu_pro_naive_average: 44.98
|
||||||
|
openai_humaneval_humaneval_pass@1: 69.8
|
||||||
|
sanitized_mbpp_score: 64.4
|
||||||
|
humanevalx_naive_average: 33.35
|
||||||
|
ds1000_naive_average: 14.15
|
||||||
|
lcb_code_generation_pass@1: 17.75
|
||||||
|
lcb_code_execution_pass@1: 32.57
|
||||||
|
lcb_test_output_pass@1: 26.13
|
||||||
|
bigcodebench_hard_instruct_pass@1: 3.38
|
||||||
|
bigcodebench_hard_complete_pass@1: 5.06
|
||||||
|
teval_naive_average: 80
|
||||||
|
SciCode_sub_accuracy: 5.56
|
||||||
|
qa_dingo_cn_score: 99.01
|
||||||
|
mmlu-stem_accuracy: 68.2
|
||||||
|
mmlu-social-science_accuracy: 75.8
|
||||||
|
mmlu-humanities_accuracy: 69.3
|
||||||
|
mmlu-other_accuracy: 71.3
|
||||||
|
cmmlu-stem_accuracy: 66.64
|
||||||
|
cmmlu-social-science_accuracy: 76
|
||||||
|
cmmlu-humanities_accuracy: 77.9
|
||||||
|
cmmlu-other_accuracy: 77.25
|
||||||
|
cmmlu-china-specific_accuracy: 73.6
|
||||||
|
mmlu_pro_biology_accuracy: 66.67
|
||||||
|
mmlu_pro_business_accuracy: 47.91
|
||||||
|
mmlu_pro_chemistry_accuracy: 35
|
||||||
|
mmlu_pro_computer_science_accuracy: 48.9
|
||||||
|
mmlu_pro_economics_accuracy: 55.87
|
||||||
|
mmlu_pro_engineering_accuracy: 29.62
|
||||||
|
mmlu_pro_health_accuracy: 45
|
||||||
|
mmlu_pro_history_accuracy: 40.8
|
||||||
|
mmlu_pro_law_accuracy: 25.79
|
||||||
|
mmlu_pro_math_accuracy: 53.48
|
||||||
|
mmlu_pro_philosophy_accuracy: 38.38
|
||||||
|
mmlu_pro_physics_accuracy: 37.79
|
||||||
|
mmlu_pro_psychology_accuracy: 58.39
|
||||||
|
mmlu_pro_other_accuracy: 46.27
|
||||||
|
humanevalx-python_pass@1: 53.66
|
||||||
|
humanevalx-cpp_pass@1: 22.56
|
||||||
|
humanevalx-go_pass@1: 0
|
||||||
|
humanevalx-js_pass@1: 54.88
|
||||||
|
ds1000_Pandas_accuracy: 10.65
|
||||||
|
ds1000_Numpy_accuracy: 3.63
|
||||||
|
ds1000_Tensorflow_accuracy: 13.33
|
||||||
|
ds1000_Scipy_accuracy: 8.96
|
||||||
|
ds1000_Sklearn_accuracy: 6.96
|
||||||
|
ds1000_Pytorch_accuracy: 6.62
|
||||||
|
ds1000_Matplotlib_accuracy: 49.35
|
||||||
|
openai_mmmlu_lite_AR-XY_accuracy: 17.19
|
||||||
|
openai_mmmlu_lite_BN-BD_accuracy: 26.78
|
||||||
|
openai_mmmlu_lite_DE-DE_accuracy: 51.27
|
||||||
|
openai_mmmlu_lite_ES-LA_accuracy: 56.94
|
||||||
|
openai_mmmlu_lite_FR-FR_accuracy: 58.22
|
||||||
|
openai_mmmlu_lite_HI-IN_accuracy: 30.75
|
||||||
|
openai_mmmlu_lite_ID-ID_accuracy: 50.6
|
||||||
|
openai_mmmlu_lite_IT-IT_accuracy: 50.6
|
||||||
|
openai_mmmlu_lite_JA-JP_accuracy: 51.13
|
||||||
|
openai_mmmlu_lite_KO-KR_accuracy: 45
|
||||||
|
openai_mmmlu_lite_PT-BR_accuracy: 57.68
|
||||||
|
openai_mmmlu_lite_SW-KE_accuracy: 32.56
|
||||||
|
openai_mmmlu_lite_YO-NG_accuracy: 32.42
|
||||||
|
openai_mmmlu_lite_ZH-CN_accuracy: 65.4
|
||||||
|
college_naive_average: 19.17
|
||||||
|
high_naive_average: 46.5
|
||||||
|
middle_naive_average: 61.34
|
||||||
|
primary_naive_average: 73.34
|
||||||
|
arithmetic_naive_average: 61.67
|
||||||
|
mathbench-a (average)_naive_average: 52.58
|
||||||
|
college_knowledge_naive_average: 67.1
|
||||||
|
high_knowledge_naive_average: 70
|
||||||
|
middle_knowledge_naive_average: 80
|
||||||
|
primary_knowledge_naive_average: 90.12
|
||||||
|
mathbench-t (average)_naive_average: 76
|
||||||
|
subjective:
|
||||||
|
alignment_bench_v1_1_总分: 5.68
|
||||||
|
alpaca_eval_total: 25.96
|
||||||
|
arenahard_score: 17.15
|
||||||
|
Followbench_naive_average: 0.81
|
||||||
|
CompassArena_naive_average: 39.49
|
||||||
|
FoFo_naive_average: 0.38
|
||||||
|
mtbench101_avg: 8.01
|
||||||
|
wildbench_average: -10.49
|
||||||
|
simpleqa_accuracy_given_attempted: 0.04
|
||||||
|
chinese_simpleqa_given_attempted_accuracy: 0.34
|
||||||
|
alignment_bench_v1_1_专业能力: 6.05
|
||||||
|
alignment_bench_v1_1_数学计算: 5.87
|
||||||
|
alignment_bench_v1_1_基本任务: 6.01
|
||||||
|
alignment_bench_v1_1_逻辑推理: 4.48
|
||||||
|
alignment_bench_v1_1_中文理解: 6.17
|
||||||
|
alignment_bench_v1_1_文本写作: 6.06
|
||||||
|
alignment_bench_v1_1_角色扮演: 6.3
|
||||||
|
alignment_bench_v1_1_综合问答: 6.45
|
||||||
|
alpaca_eval_helpful_base: 17.83
|
||||||
|
alpaca_eval_koala: 28.21
|
||||||
|
alpaca_eval_oasst: 23.4
|
||||||
|
alpaca_eval_selfinstruct: 30.95
|
||||||
|
alpaca_eval_vicuna: 25.00
|
||||||
|
compassarena_language_naive_average: 53.00
|
||||||
|
compassarena_knowledge_naive_average: 36
|
||||||
|
compassarena_reason_v2_naive_average: 35
|
||||||
|
compassarena_math_v2_naive_average: 16.07
|
||||||
|
compassarena_creationv2_zh_naive_average: 43.64
|
||||||
|
fofo_test_prompts_overall: 0.35
|
||||||
|
fofo_test_prompts_cn_overall: 0.41
|
||||||
|
followbench_llmeval_en_HSR_AVG: 0.73
|
||||||
|
followbench_llmeval_en_SSR_AVG: 0.88
|
||||||
|
followbench_llmeval_en_HSR_L1: 0.94
|
||||||
|
followbench_llmeval_en_HSR_L2: 0.77
|
||||||
|
followbench_llmeval_en_HSR_L3: 0.73
|
||||||
|
followbench_llmeval_en_HSR_L4: 0.68
|
||||||
|
followbench_llmeval_en_HSR_L5: 0.54
|
||||||
|
followbench_llmeval_en_SSR_L1: 0.94
|
||||||
|
followbench_llmeval_en_SSR_L2: 0.88
|
||||||
|
followbench_llmeval_en_SSR_L3: 0.87
|
||||||
|
followbench_llmeval_en_SSR_L4: 0.87
|
||||||
|
followbench_llmeval_en_SSR_L5: 0.85
|
||||||
|
simpleqa_f1: 0.04
|
||||||
|
|
||||||
|
internlm2_5-7b-chat-1m-turbomind:
|
||||||
|
long_context:
|
||||||
|
ruler_8k_naive_average: 88.53
|
||||||
|
ruler_32k_naive_average: 83.84
|
||||||
|
ruler_128k_naive_average: 70.94
|
||||||
|
NeedleBench-Overall-Score-8K_weighted_average: 91.89
|
||||||
|
NeedleBench-Overall-Score-32K_weighted_average: 91.42
|
||||||
|
NeedleBench-Overall-Score-128K_weighted_average: 88.57
|
||||||
|
longbench_naive_average: 46.44
|
||||||
|
longbench_zh_naive_average: 45.19
|
||||||
|
longbench_en_naive_average: 45.71
|
||||||
|
babilong_0k_naive_average: 79.3
|
||||||
|
babilong_4k_naive_average: 67
|
||||||
|
babilong_16k_naive_average: 52.7
|
||||||
|
babilong_32k_naive_average: 48.9
|
||||||
|
babilong_128k_naive_average: 40.8
|
||||||
|
babilong_256k_naive_average: 23.5
|
||||||
|
longbench_single-document-qa_score: 43.56
|
||||||
|
longbench_multi-document-qa_score: 46.24
|
||||||
|
longbench_summarization_score: 24.32
|
||||||
|
longbench_few-shot-learning_score: 51.67
|
||||||
|
longbench_synthetic-tasks_score: 66.83
|
||||||
|
longbench_code-completion_score: 45.99
|
||||||
|
|
||||||
|
|
||||||
|
qwen2.5-7b-instruct-turbomind:
|
||||||
|
objective:
|
||||||
|
race-high_accuracy: 84.99
|
||||||
|
ARC-c_accuracy: 92.2
|
||||||
|
BoolQ_accuracy: 86.7
|
||||||
|
triviaqa_wiki_1shot_score: 53.06
|
||||||
|
nq_open_1shot_score: 17.51
|
||||||
|
mmmlu_lite_naive_average: 54.96
|
||||||
|
IFEval_Prompt-level-strict-accuracy: 71.53
|
||||||
|
drop_accuracy: 80.07
|
||||||
|
bbh_naive_average: 68.81
|
||||||
|
GPQA_diamond_accuracy: 34.34
|
||||||
|
hellaswag_accuracy: 85.42
|
||||||
|
TheoremQA_score: 18.38
|
||||||
|
musr_average_naive_average: 43.44
|
||||||
|
korbench_single_naive_average: 39.44
|
||||||
|
ARC_Prize_Public_Evaluation_accuracy: 0
|
||||||
|
gsm8k_accuracy: 92.57
|
||||||
|
GaokaoBench_weighted_average: 80.14
|
||||||
|
math_accuracy: 73.58
|
||||||
|
cmo_fib_accuracy: 25
|
||||||
|
aime2024_accuracy: 16.67
|
||||||
|
Mathbench_naive_average: 77.33
|
||||||
|
wikibench-wiki-single_choice_cncircular_perf_4: 34.9
|
||||||
|
cmmlu_naive_average: 75.97
|
||||||
|
mmlu_naive_average: 76.01
|
||||||
|
mmlu_pro_naive_average: 56.12
|
||||||
|
openai_humaneval_humaneval_pass@1: 83.54
|
||||||
|
sanitized_mbpp_score: 74.71
|
||||||
|
humanevalx_naive_average: 48.29
|
||||||
|
ds1000_naive_average: 18.66
|
||||||
|
lcb_code_generation_pass@1: 39.5
|
||||||
|
lcb_code_execution_pass@1: 42.38
|
||||||
|
lcb_test_output_pass@1: 50.68
|
||||||
|
bigcodebench_hard_instruct_pass@1: 16.22
|
||||||
|
bigcodebench_hard_complete_pass@1: 11.49
|
||||||
|
teval_naive_average: 79.72
|
||||||
|
SciCode_sub_accuracy: 10.76
|
||||||
|
qa_dingo_cn_score: 99.01
|
||||||
|
mmlu_accuracy: 76.01
|
||||||
|
mmlu-stem_accuracy: 77.59
|
||||||
|
mmlu-social-science_accuracy: 79.02
|
||||||
|
mmlu-humanities_accuracy: 72.07
|
||||||
|
mmlu-other_accuracy: 74.86
|
||||||
|
cmmlu_accuracy: 75.97
|
||||||
|
cmmlu-stem_accuracy: 73.09
|
||||||
|
cmmlu-social-science_accuracy: 75.95
|
||||||
|
cmmlu-humanities_accuracy: 76.53
|
||||||
|
cmmlu-other_accuracy: 78.79
|
||||||
|
cmmlu-china-specific_accuracy: 73.17
|
||||||
|
mmlu_pro_accuracy: 56.12
|
||||||
|
mmlu_pro_biology_accuracy: 71.41
|
||||||
|
mmlu_pro_business_accuracy: 67.68
|
||||||
|
mmlu_pro_chemistry_accuracy: 54.59
|
||||||
|
mmlu_pro_computer_science_accuracy: 58.29
|
||||||
|
mmlu_pro_economics_accuracy: 66.82
|
||||||
|
mmlu_pro_engineering_accuracy: 42.41
|
||||||
|
mmlu_pro_health_accuracy: 55.87
|
||||||
|
mmlu_pro_history_accuracy: 46.46
|
||||||
|
mmlu_pro_law_accuracy: 28.97
|
||||||
|
mmlu_pro_math_accuracy: 73.13
|
||||||
|
mmlu_pro_philosophy_accuracy: 44.89
|
||||||
|
mmlu_pro_physics_accuracy: 58.43
|
||||||
|
mmlu_pro_psychology_accuracy: 63.16
|
||||||
|
mmlu_pro_other_accuracy: 53.57
|
||||||
|
humanevalx-python_pass@1: 50
|
||||||
|
humanevalx-cpp_pass@1: 42.07
|
||||||
|
humanevalx-go_pass@1: 0
|
||||||
|
humanevalx-java_pass@1: 53.05
|
||||||
|
humanevalx-js_pass@1: 75
|
||||||
|
ds1000_Pandas_accuracy: 14.09
|
||||||
|
ds1000_Numpy_accuracy: 8.18
|
||||||
|
ds1000_Tensorflow_accuracy: 17.78
|
||||||
|
ds1000_Scipy_accuracy: 15.09
|
||||||
|
ds1000_Sklearn_accuracy: 10.43
|
||||||
|
ds1000_Pytorch_accuracy: 4.41
|
||||||
|
ds1000_Matplotlib_accuracy: 60.65
|
||||||
|
mmmlu_lite_accuracy: 54.96
|
||||||
|
openai_mmmlu_lite_AR-XY_accuracy: 42.32
|
||||||
|
openai_mmmlu_lite_BN-BD_accuracy: 42.25
|
||||||
|
openai_mmmlu_lite_DE-DE_accuracy: 59.93
|
||||||
|
openai_mmmlu_lite_ES-LA_accuracy: 66.53
|
||||||
|
openai_mmmlu_lite_FR-FR_accuracy: 66.88
|
||||||
|
openai_mmmlu_lite_HI-IN_accuracy: 49.26
|
||||||
|
openai_mmmlu_lite_ID-ID_accuracy: 61.26
|
||||||
|
openai_mmmlu_lite_IT-IT_accuracy: 65.47
|
||||||
|
openai_mmmlu_lite_JA-JP_accuracy: 61.54
|
||||||
|
openai_mmmlu_lite_KO-KR_accuracy: 60.28
|
||||||
|
openai_mmmlu_lite_PT-BR_accuracy: 55.51
|
||||||
|
openai_mmmlu_lite_SW-KE_accuracy: 36.42
|
||||||
|
openai_mmmlu_lite_YO-NG_accuracy: 32.14
|
||||||
|
openai_mmmlu_lite_ZH-CN_accuracy: 69.61
|
||||||
|
college_naive_average: 44.33
|
||||||
|
high_naive_average: 59
|
||||||
|
middle_naive_average: 78
|
||||||
|
primary_naive_average: 85.67
|
||||||
|
arithmetic_naive_average: 75.67
|
||||||
|
mathbench-a (average)_naive_average: 69.27
|
||||||
|
college_knowledge_naive_average: 83.86
|
||||||
|
high_knowledge_naive_average: 80.29
|
||||||
|
middle_knowledge_naive_average: 84.26
|
||||||
|
primary_knowledge_naive_average: 93.16
|
||||||
|
mathbench-t (average)_naive_average: 85.39
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|

internlm2_5-7b-chat-pytorch:
  objective:
    race-high_accuracy: 86.39
    ARC-c_accuracy: 90.51
    BoolQ_accuracy: 88.01
    triviaqa_wiki_1shot_score: 64.77
    nq_open_1shot_score: 22.71
    mmmlu_lite_naive_average: 45.02
    IFEval_Prompt-level-strict-accuracy: 56.56
    drop_accuracy: 75.46
    bbh_naive_average: 73.34
    GPQA_diamond_accuracy: 32.83
    hellaswag_accuracy: 94.81
    TheoremQA_score: 23.88
    musr_average_naive_average: 51.31
    korbench_single_naive_average: 32
    ARC_Prize_Public_Evaluation_accuracy: 0.01
    gsm8k_accuracy: 86.96
    GaokaoBench_weighted_average: 78.05
    math_accuracy: 60.34
    cmo_fib_accuracy: 12.98
    aime2024_accuracy: 3.33
    Mathbench_naive_average: 64.82
    wikibench-wiki-single_choice_cncircular_perf_4: 31.7
    cmmlu_naive_average: 74.24
    mmlu_naive_average: 70.2
    mmlu_pro_naive_average: 45.39
    openai_humaneval_humaneval_pass@1: 70.12
    sanitized_mbpp_score: 64.59
    humanevalx_naive_average: 38.78
    ds1000_naive_average: 14.19
    lcb_code_generation_pass@1: 16.5
    lcb_code_execution_pass@1: 33.82
    lcb_test_output_pass@1: 22.62
    bigcodebench_hard_instruct_pass@1: 6.08
    bigcodebench_hard_complete_pass@1: 6.76
    teval_naive_average: 79.73
    SciCode_sub_accuracy: 3.47
    qa_dingo_cn_score: 100
    mmlu_accuracy: 70.2
    mmlu-stem_accuracy: 67.73
    mmlu-social-science_accuracy: 75.49
    mmlu-humanities_accuracy: 68.56
    mmlu-other_accuracy: 70.58
    cmmlu_accuracy: 74.24
    cmmlu-stem_accuracy: 66.7
    cmmlu-social-science_accuracy: 75.88
    cmmlu-humanities_accuracy: 77.56
    cmmlu-other_accuracy: 77.52
    cmmlu-china-specific_accuracy: 73.46
    mmlu_pro_accuracy: 45.39
    mmlu_pro_biology_accuracy: 65.83
    mmlu_pro_business_accuracy: 51.96
    mmlu_pro_chemistry_accuracy: 36.84
    mmlu_pro_computer_science_accuracy: 48.29
    mmlu_pro_economics_accuracy: 56.16
    mmlu_pro_engineering_accuracy: 29.1
    mmlu_pro_health_accuracy: 44.5
    mmlu_pro_history_accuracy: 42.26
    mmlu_pro_law_accuracy: 24.98
    mmlu_pro_math_accuracy: 54.85
    mmlu_pro_philosophy_accuracy: 39.28
    mmlu_pro_physics_accuracy: 37.41
    mmlu_pro_psychology_accuracy: 58.27
    mmlu_pro_other_accuracy: 45.78
    humanevalx-python_pass@1: 56.1
    humanevalx-cpp_pass@1: 20.73
    humanevalx-go_pass@1: 0
    humanevalx-java_pass@1: 59.15
    humanevalx-js_pass@1: 57.93
    ds1000_Pandas_accuracy: 8.93
    ds1000_Numpy_accuracy: 4.09
    ds1000_Tensorflow_accuracy: 11.11
    ds1000_Scipy_accuracy: 7.55
    ds1000_Sklearn_accuracy: 7.83
    ds1000_Pytorch_accuracy: 8.82
    ds1000_Matplotlib_accuracy: 50.97
    mmmlu_lite_accuracy: 45.02
    openai_mmmlu_lite_AR-XY_accuracy: 18.6
    openai_mmmlu_lite_BN-BD_accuracy: 27.58
    openai_mmmlu_lite_DE-DE_accuracy: 51.23
    openai_mmmlu_lite_ES-LA_accuracy: 56.63
    openai_mmmlu_lite_FR-FR_accuracy: 58.11
    openai_mmmlu_lite_HI-IN_accuracy: 33.82
    openai_mmmlu_lite_ID-ID_accuracy: 50.39
    openai_mmmlu_lite_IT-IT_accuracy: 50.39
    openai_mmmlu_lite_JA-JP_accuracy: 50.95
    openai_mmmlu_lite_KO-KR_accuracy: 45.05
    openai_mmmlu_lite_PT-BR_accuracy: 57.89
    openai_mmmlu_lite_SW-KE_accuracy: 32.14
    openai_mmmlu_lite_YO-NG_accuracy: 32.14
    openai_mmmlu_lite_ZH-CN_accuracy: 65.33
    college_naive_average: 21
    high_naive_average: 47
    middle_naive_average: 59.67
    primary_naive_average: 72.33
    arithmetic_naive_average: 62
    mathbench-a (average)_naive_average: 53.13
    college_knowledge_naive_average: 68.99
    high_knowledge_naive_average: 70.06
    middle_knowledge_naive_average: 78.53
    primary_knowledge_naive_average: 88.49
    mathbench-t (average)_naive_average: 76.51

qwen2.5-7b-instruct-pytorch:
  objective:
    race-high_accuracy: 85.16
    ARC-c_accuracy: 90.85
    BoolQ_accuracy: 86.61
    triviaqa_wiki_1shot_score: 52.96
    nq_open_1shot_score: 17.62
    mmmlu_lite_naive_average: 54.7
    IFEval_Prompt-level-strict-accuracy: 71.35
    drop_accuracy: 80.23
    bbh_naive_average: 68.88
    GPQA_diamond_accuracy: 36.36
    hellaswag_accuracy: 85.49
    TheoremQA_score: 18.38
    musr_average_naive_average: 43.3
    korbench_single_naive_average: 39.44
    ARC_Prize_Public_Evaluation_accuracy: 0
    gsm8k_accuracy: 91.66
    GaokaoBench_weighted_average: 80.02
    math_accuracy: 73.74
    cmo_fib_accuracy: 22.60
    aime2024_accuracy: 13.33
    Mathbench_naive_average: 77.08
    wikibench-wiki-single_choice_cncircular_perf_4: 34
    cmmlu_naive_average: 75.9
    mmlu_naive_average: 76.27
    mmlu_pro_naive_average: 56.14
    openai_humaneval_humaneval_pass@1: 84.76
    sanitized_mbpp_score: 74.71
    humanevalx_naive_average: 48.17
    ds1000_naive_average: 18.57
    lcb_code_generation_pass@1: 38.75
    lcb_code_execution_pass@1: 42.38
    lcb_test_output_pass@1: 50.45
    bigcodebench_hard_instruct_pass@1: 16.89
    bigcodebench_hard_complete_pass@1: 12.16
    teval_naive_average: 79.46
    SciCode_sub_accuracy: 10.42
    qa_dingo_cn_score: 100
    mmlu_accuracy: 76.27
    mmlu-stem_accuracy: 77.75
    mmlu-social-science_accuracy: 78.65
    mmlu-humanities_accuracy: 73.12
    mmlu-other_accuracy: 75.05
    cmmlu_accuracy: 75.9
    cmmlu-stem_accuracy: 73.41
    cmmlu-social-science_accuracy: 75.97
    cmmlu-humanities_accuracy: 76.42
    cmmlu-other_accuracy: 78.15
    cmmlu-china-specific_accuracy: 73.27
    mmlu_pro_accuracy: 56.14
    mmlu_pro_biology_accuracy: 72.25
    mmlu_pro_business_accuracy: 66.16
    mmlu_pro_chemistry_accuracy: 55.65
    mmlu_pro_computer_science_accuracy: 60.24
    mmlu_pro_economics_accuracy: 66.82
    mmlu_pro_engineering_accuracy: 41.38
    mmlu_pro_health_accuracy: 54.89
    mmlu_pro_history_accuracy: 46.46
    mmlu_pro_law_accuracy: 29.06
    mmlu_pro_math_accuracy: 73.58
    mmlu_pro_philosophy_accuracy: 44.89
    mmlu_pro_physics_accuracy: 60.05
    mmlu_pro_psychology_accuracy: 61.9
    mmlu_pro_other_accuracy: 52.6
    humanevalx-python_pass@1: 51.83
    humanevalx-cpp_pass@1: 42.68
    humanevalx-go_pass@1: 0
    humanevalx-java_pass@1: 73.78
    humanevalx-js_pass@1: 72.56
    ds1000_Pandas_accuracy: 14.09
    ds1000_Numpy_accuracy: 8.64
    ds1000_Tensorflow_accuracy: 17.78
    ds1000_Scipy_accuracy: 15.09
    ds1000_Sklearn_accuracy: 8.7
    ds1000_Pytorch_accuracy: 4.41
    ds1000_Matplotlib_accuracy: 61.29
    mmmlu_lite_accuracy: 54.7
    openai_mmmlu_lite_AR-XY_accuracy: 42.32
    openai_mmmlu_lite_BN-BD_accuracy: 42.18
    openai_mmmlu_lite_DE-DE_accuracy: 60
    openai_mmmlu_lite_ES-LA_accuracy: 66.18
    openai_mmmlu_lite_FR-FR_accuracy: 66.88
    openai_mmmlu_lite_HI-IN_accuracy: 48.63
    openai_mmmlu_lite_ID-ID_accuracy: 61.26
    openai_mmmlu_lite_IT-IT_accuracy: 65.26
    openai_mmmlu_lite_JA-JP_accuracy: 60.7
    openai_mmmlu_lite_KO-KR_accuracy: 60.63
    openai_mmmlu_lite_PT-BR_accuracy: 54.46
    openai_mmmlu_lite_SW-KE_accuracy: 36
    openai_mmmlu_lite_YO-NG_accuracy: 31.86
    openai_mmmlu_lite_ZH-CN_accuracy: 69.4
    college_naive_average: 48.33
    high_naive_average: 59.33
    middle_naive_average: 76.67
    primary_naive_average: 86.67
    arithmetic_naive_average: 74.33
    mathbench-a (average)_naive_average: 69.07
    college_knowledge_naive_average: 83.54
    high_knowledge_naive_average: 80.82
    middle_knowledge_naive_average: 83.79
    primary_knowledge_naive_average: 92.22
    mathbench-t (average)_naive_average: 85.1

internlm3-8b-instruct-turbomind:
  objective:
    race-high_accuracy: 89.22
    ARC-c_accuracy: 92.54
    BoolQ_accuracy: 86.45
    triviaqa_wiki_1shot_score: 60.72
    nq_open_1shot_score: 20.25
    mmmlu_lite_naive_average: 41.82
    IFEval_Prompt-level-strict-accuracy: 77.45
    drop_accuracy: 83.27
    bbh_naive_average: 55.22
    GPQA_diamond_accuracy: 37.88
    hellaswag_accuracy: 91.28
    TheoremQA_score: 20.12
    musr_average_naive_average: 36.86
    korbench_single_naive_average: 41.2
    ARC_Prize_Public_Evaluation_accuracy: 0.06
    gsm8k_accuracy: 91.28
    GaokaoBench_weighted_average: 86.59
    math_accuracy: 76.96
    cmo_fib_accuracy: 38.46
    aime2024_accuracy: 13.33
    Mathbench_naive_average: 78.96
    wikibench-wiki-single_choice_cncircular_perf_4: 37.45
    cmmlu_naive_average: 83.33
    mmlu_naive_average: 76.21
    mmlu_pro_naive_average: 57.96
    openai_humaneval_humaneval_pass@1: 81.71
    sanitized_mbpp_score: 69.65
    humanevalx_naive_average: 40.73
    ds1000_naive_average: 27.23
    lcb_code_generation_pass@1: 34.75
    lcb_code_execution_pass@1: 49.9
    lcb_test_output_pass@1: 48.19
    bigcodebench_hard_instruct_pass@1: 13.51
    bigcodebench_hard_complete_pass@1: 15.54
    teval_naive_average: 82.86
    SciCode_sub_accuracy: 11.11
    qa_dingo_cn_score: 100
    mmlu_accuracy: 76.21
    mmlu-stem_accuracy: 77.7
    mmlu-social-science_accuracy: 80.98
    mmlu-humanities_accuracy: 70.83
    mmlu-other_accuracy: 75.01
    cmmlu_accuracy: 83.33
    cmmlu-stem_accuracy: 79.66
    cmmlu-social-science_accuracy: 83.39
    cmmlu-humanities_accuracy: 84.73
    cmmlu-other_accuracy: 86.2
    cmmlu-china-specific_accuracy: 81.77
    mmlu_pro_accuracy: 57.96
    mmlu_pro_biology_accuracy: 75.45
    mmlu_pro_business_accuracy: 64.64
    mmlu_pro_chemistry_accuracy: 59.81
    mmlu_pro_computer_science_accuracy: 60.24
    mmlu_pro_economics_accuracy: 68.6
    mmlu_pro_engineering_accuracy: 44.79
    mmlu_pro_health_accuracy: 58.31
    mmlu_pro_history_accuracy: 49.87
    mmlu_pro_law_accuracy: 32.43
    mmlu_pro_math_accuracy: 70.17
    mmlu_pro_philosophy_accuracy: 46.89
    mmlu_pro_physics_accuracy: 59.58
    mmlu_pro_psychology_accuracy: 66.29
    mmlu_pro_other_accuracy: 54.33
    humanevalx-python_pass@1: 43.9
    humanevalx-cpp_pass@1: 20.12
    humanevalx-go_pass@1: 0
    humanevalx-java_pass@1: 40.85
    humanevalx-js_pass@1: 65.24
    ds1000_Pandas_accuracy: 16.49
    ds1000_Numpy_accuracy: 34.09
    ds1000_Tensorflow_accuracy: 26.67
    ds1000_Scipy_accuracy: 17.92
    ds1000_Sklearn_accuracy: 20.87
    ds1000_Pytorch_accuracy: 19.12
    ds1000_Matplotlib_accuracy: 55.48
    mmmlu_lite_accuracy: 41.82
    openai_mmmlu_lite_AR-XY_accuracy: 32.56
    openai_mmmlu_lite_BN-BD_accuracy: 4.56
    openai_mmmlu_lite_DE-DE_accuracy: 24.91
    openai_mmmlu_lite_ES-LA_accuracy: 51.09
    openai_mmmlu_lite_FR-FR_accuracy: 61.68
    openai_mmmlu_lite_HI-IN_accuracy: 24.98
    openai_mmmlu_lite_ID-ID_accuracy: 44.56
    openai_mmmlu_lite_IT-IT_accuracy: 52.35
    openai_mmmlu_lite_JA-JP_accuracy: 51.02
    openai_mmmlu_lite_KO-KR_accuracy: 47.93
    openai_mmmlu_lite_PT-BR_accuracy: 53.89
    openai_mmmlu_lite_SW-KE_accuracy: 33.47
    openai_mmmlu_lite_YO-NG_accuracy: 33.47
    openai_mmmlu_lite_ZH-CN_accuracy: 69.05
    college_naive_average: 45.67
    high_naive_average: 64.67
    middle_naive_average: 82.33
    primary_naive_average: 90.33
    arithmetic_naive_average: 74
    mathbench-a (average)_naive_average: 71.4
    college_knowledge_naive_average: 85.28
    high_knowledge_naive_average: 79.43
    middle_knowledge_naive_average: 87.9
    primary_knowledge_naive_average: 93.42
    mathbench-t (average)_naive_average: 86.51

internlm3-8b-instruct-pytorch:
  objective:
    race-high_accuracy: 89.02
    ARC-c_accuracy: 93.56
    BoolQ_accuracy: 86.67
    triviaqa_wiki_1shot_score: 60.54
    nq_open_1shot_score: 20.3
    mmmlu_lite_naive_average: 42.6
    IFEval_Prompt-level-strict-accuracy: 79.11
    drop_accuracy: 83.32
    bbh_naive_average: 54.76
    GPQA_diamond_accuracy: 33.84
    hellaswag_accuracy: 91.31
    TheoremQA_score: 18
    musr_average_naive_average: 36.62
    korbench_single_naive_average: 41.84
    ARC_Prize_Public_Evaluation_accuracy: 0.06
    gsm8k_accuracy: 90.67
    GaokaoBench_weighted_average: 86.27
    math_accuracy: 76.68
    cmo_fib_accuracy: 33.65
    aime2024_accuracy: 10
    Mathbench_naive_average: 78.92
    wikibench-wiki-single_choice_cncircular_perf_4: 37.35
    cmmlu_naive_average: 83.11
    mmlu_naive_average: 76.23
    mmlu_pro_naive_average: 58.16
    openai_humaneval_humaneval_pass@1: 82.32
    sanitized_mbpp_score: 70.04
    humanevalx_naive_average: 25.49
    ds1000_naive_average: 27.84
    lcb_code_generation_pass@1: 34.5
    lcb_code_execution_pass@1: 48.02
    lcb_test_output_pass@1: 47.74
    bigcodebench_hard_instruct_pass@1: 12.84
    bigcodebench_hard_complete_pass@1: 15.54
    teval_naive_average: 82.86
    SciCode_sub_accuracy: 9.38
    qa_dingo_cn_score: 100
    mmlu_accuracy: 76.23
    mmlu-stem_accuracy: 78.08
    mmlu-social-science_accuracy: 80.31
    mmlu-humanities_accuracy: 71.38
    mmlu-other_accuracy: 74.63
    cmmlu_accuracy: 83.11
    cmmlu-stem_accuracy: 79.42
    cmmlu-social-science_accuracy: 83.34
    cmmlu-humanities_accuracy: 83.95
    cmmlu-other_accuracy: 86.22
    cmmlu-china-specific_accuracy: 81.5
    mmlu_pro_accuracy: 58.16
    mmlu_pro_biology_accuracy: 74.62
    mmlu_pro_business_accuracy: 65.02
    mmlu_pro_chemistry_accuracy: 60.69
    mmlu_pro_computer_science_accuracy: 61.46
    mmlu_pro_economics_accuracy: 68.25
    mmlu_pro_engineering_accuracy: 45.3
    mmlu_pro_health_accuracy: 60.15
    mmlu_pro_history_accuracy: 50.66
    mmlu_pro_law_accuracy: 31.7
    mmlu_pro_math_accuracy: 70.32
    mmlu_pro_philosophy_accuracy: 47.7
    mmlu_pro_physics_accuracy: 59.51
    mmlu_pro_psychology_accuracy: 65.41
    mmlu_pro_other_accuracy: 53.46
    humanevalx-python_pass@1: 42.68
    humanevalx-cpp_pass@1: 19.51
    humanevalx-go_pass@1: 0
    humanevalx-java_pass@1: 0.00
    humanevalx-js_pass@1: 64.02
    ds1000_Pandas_accuracy: 14.09
    ds1000_Numpy_accuracy: 35
    ds1000_Tensorflow_accuracy: 24.44
    ds1000_Scipy_accuracy: 20.75
    ds1000_Sklearn_accuracy: 21.74
    ds1000_Pytorch_accuracy: 22.06
    ds1000_Matplotlib_accuracy: 56.77
    mmmlu_lite_accuracy: 42.6
    openai_mmmlu_lite_AR-XY_accuracy: 32.84
    openai_mmmlu_lite_BN-BD_accuracy: 10.46
    openai_mmmlu_lite_DE-DE_accuracy: 24.56
    openai_mmmlu_lite_ES-LA_accuracy: 50.95
    openai_mmmlu_lite_FR-FR_accuracy: 61.05
    openai_mmmlu_lite_HI-IN_accuracy: 30.6
    openai_mmmlu_lite_ID-ID_accuracy: 45.89
    openai_mmmlu_lite_IT-IT_accuracy: 51.79
    openai_mmmlu_lite_JA-JP_accuracy: 51.65
    openai_mmmlu_lite_KO-KR_accuracy: 48.77
    openai_mmmlu_lite_PT-BR_accuracy: 52.7
    openai_mmmlu_lite_SW-KE_accuracy: 32.91
    openai_mmmlu_lite_YO-NG_accuracy: 32.84
    openai_mmmlu_lite_ZH-CN_accuracy: 69.33
    college_naive_average: 47
    high_naive_average: 66.67
    middle_naive_average: 81.67
    primary_naive_average: 89.33
    arithmetic_naive_average: 73.67
    mathbench-a (average)_naive_average: 71.67
    college_knowledge_naive_average: 82.91
    high_knowledge_naive_average: 79.86
    middle_knowledge_naive_average: 88.92
    primary_knowledge_naive_average: 92.96
    mathbench-t (average)_naive_average: 86.16
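
A fullbench baseline like the one above is just a nested YAML mapping (model -> category -> metric: score). As a minimal, purely illustrative sketch (assuming PyYAML is installed; the helper name and hard-coded lookup are not from the repository's test code), it can be queried like this:

import yaml

def load_baseline(path='.github/scripts/oc_score_baseline_fullbench.yaml'):
    # Parse the nested baseline mapping into plain Python dicts.
    with open(path, 'r') as f:
        return yaml.safe_load(f)

baselines = load_baseline()
# Look up one metric for one model, e.g. the gsm8k score recorded above.
score = baselines['internlm3-8b-instruct-pytorch']['objective']['gsm8k_accuracy']
print(score)  # expected: 90.67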
.github/scripts/oc_score_baseline_testrange.yaml (vendored, new file, 432 lines)
@@ -0,0 +1,432 @@
chat:
  glm-4-9b-chat-hf:
    gsm8k_accuracy: 56.25
    race-high_accuracy: 84.38
  glm-4-9b-chat-turbomind:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 90.62
  glm-4-9b-chat-vllm:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 90.62
  deepseek-7b-chat-hf:
    gsm8k_accuracy: 46.88
    race-high_accuracy: 81.25
  deepseek-r1-distill-llama-8b-turbomind:
    gsm8k_accuracy: 34.38
    race-high_accuracy: 81.25
  deepseek-r1-distill-qwen-1_5b-turbomind:
    gsm8k_accuracy: 28.12
    race-high_accuracy: 53.12
  deepseek-7b-chat-vllm:
    gsm8k_accuracy: 56.25
    race-high_accuracy: 78.12
  gemma2-2b-it-hf:
    gsm8k_accuracy: 50
    race-high_accuracy: 75
  gemma2-9b-it-hf:
    gsm8k_accuracy: 68.75
    race-high_accuracy: 84.38
  gemma-2b-it-hf:
    gsm8k_accuracy: 3.12
    race-high_accuracy: 40.62
  gemma-7b-it-hf:
    gsm8k_accuracy: 40.62
    race-high_accuracy: 68.75
  gemma-2-9b-it-turbomind:
    gsm8k_accuracy: 68.75
    race-high_accuracy: 84.38
  gemma-2-27b-it-turbomind:
    gsm8k_accuracy: 78.12
    race-high_accuracy: 93.75
  gemma-7b-it-vllm:
    gsm8k_accuracy: 28.12
    race-high_accuracy: 68.75
  internlm2_5-7b-chat-hf:
    gsm8k_accuracy: 84.38
    race-high_accuracy: 90.62
  internlm3-8b-instruct-hf:
    gsm8k_accuracy: 65.62
    race-high_accuracy: 87.5
  internlm2_5-7b-chat-turbomind:
    gsm8k_accuracy: 81.25
    race-high_accuracy: 90.62
  internlm2-chat-1.8b-turbomind:
    gsm8k_accuracy: 25.00
    race-high_accuracy: 84.38
  internlm2-chat-1.8b-sft-turbomind:
    gsm8k_accuracy: 34.38
    race-high_accuracy: 84.38
  internlm2-chat-7b-lmdeploy:
    gsm8k_accuracy: 59.38
    race-high_accuracy: 87.50
  internlm2-chat-7b-sft-turbomind:
    gsm8k_accuracy: 56.25
    race-high_accuracy: 87.50
  internlm3-8b-instruct-turbomind:
    gsm8k_accuracy: 65.62
    race-high_accuracy: 87.5
  internlm2-chat-7b-vllm:
    gsm8k_accuracy: 53.12
    race-high_accuracy: 87.50
  llama-3_1-8b-instruct-hf:
    gsm8k_accuracy: 84.38
    race-high_accuracy: 90.62
  llama-3_2-3b-instruct-hf:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 81.25
  llama-3-8b-instruct-hf:
    gsm8k_accuracy: 68.75
    race-high_accuracy: 87.5
  llama-2-7b-chat-turbomind:
    gsm8k_accuracy: 18.75
    race-high_accuracy: 46.88
  llama-3_1-8b-instruct-turbomind:
    gsm8k_accuracy: 84.38
    race-high_accuracy: 90.62
  llama-3_2-3b-instruct-turbomind:
    gsm8k_accuracy: 65.62
    race-high_accuracy: 81.25
  llama-3-8b-instruct-turbomind:
    gsm8k_accuracy: 65.62
    race-high_accuracy: 84.38
  mistral-7b-instruct-v0.2-hf:
    gsm8k_accuracy: 40.62
    race-high_accuracy: 75
  mistral-7b-instruct-v0.3-hf:
    gsm8k_accuracy: 40.62
    race-high_accuracy: 75
  mistral-nemo-instruct-2407-hf:
    gsm8k_accuracy: 75
    race-high_accuracy: 81.25
  mistral-nemo-instruct-2407-turbomind:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 75
  mistral-7b-instruct-v0.1-vllm:
    gsm8k_accuracy: 34.38
    race-high_accuracy: 65.62
  mistral-7b-instruct-v0.2-vllm:
    gsm8k_accuracy: 28.12
    race-high_accuracy: 78.12
  qwen2.5-0.5b-instruct-hf:
    gsm8k_accuracy: 34.38
    race-high_accuracy: 46.88
  qwen2.5-3b-instruct-hf:
    gsm8k_accuracy: 53.12
    race-high_accuracy: 90.62
  qwen2.5-0.5b-instruct-turbomind:
    gsm8k_accuracy: 28.12
    race-high_accuracy: 43.75
  qwen2.5-3b-instruct-turbomind:
    gsm8k_accuracy: 56.25
    race-high_accuracy: 90.62
  qwen1.5-0.5b-chat-hf:
    gsm8k_accuracy: 0
    race-high_accuracy: 53.12
  qwen2-1.5b-instruct-hf:
    gsm8k_accuracy: 62.5
    race-high_accuracy: 84.38
  qwen2-7b-instruct-hf:
    gsm8k_accuracy: 68.75
    race-high_accuracy: 90.62
  qwen2-1.5b-instruct-turbomind:
    gsm8k_accuracy: 56.25
    race-high_accuracy: 84.38
  qwen2-7b-instruct-turbomind:
    gsm8k_accuracy: 75.00
    race-high_accuracy: 87.50
  qwen1.5-0.5b-chat-vllm:
    gsm8k_accuracy: 6.25
    race-high_accuracy: 53.12
  yi-1.5-6b-chat-hf:
    gsm8k_accuracy: 65.62
    race-high_accuracy: 84.38
  yi-1.5-9b-chat-hf:
    gsm8k_accuracy: 75
    race-high_accuracy: 93.75
  yi-1.5-6b-chat-turbomind:
    gsm8k_accuracy: 59.38
    race-high_accuracy: 84.38
  yi-1.5-9b-chat-turbomind:
    gsm8k_accuracy: 78.12
    race-high_accuracy: 93.75
  deepseek-v2_lite-chat-turbomind:
    gsm8k_accuracy: 43.75
    race-high_accuracy: 71.88
  gemma2-27b-it-hf:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 93.75
  internlm2_5-20b-chat-hf:
    gsm8k_accuracy: 84.38
    race-high_accuracy: 87.5
  internlm2_5-20b-chat-turbomind:
    gsm8k_accuracy: 87.50
    race-high_accuracy: 87.5
  mistral-small-instruct-2409-hf:
    gsm8k_accuracy: 81.25
    race-high_accuracy: 87.50
  mistral-small-instruct-2409-turbomind:
    gsm8k_accuracy: 78.12
    race-high_accuracy: 87.50
  phi-4:
    gsm8k_accuracy: 81.25
    race-high_accuracy: 87.50
  qwen2.5-14b-instruct-hf:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 96.88
  qwen2.5-14b-instruct-turbomind:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 96.88
  yi-1.5-34b-chat-turbomind:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 93.75
  deepseek-67b-chat-turbomind:
    gsm8k_accuracy: 71.88
    race-high_accuracy: 75.00
  deepseek-r1-distill-qwen-32b-turbomind:
    gsm8k_accuracy: 31.25
    race-high_accuracy: 90.62
  llama-3_3-70b-instruct-turbomind:
    gsm8k_accuracy: 93.75
    race-high_accuracy: 87.5
  mixtral-large-instruct-2411-turbomind:
    gsm8k_accuracy: 87.50
    race-high_accuracy: 93.75
  nvidia-3_1-Nemotron-70b-instruct-HF-turbomind:
    gsm8k_accuracy: 90.62
    race-high_accuracy: 53.12
  qwen2.5-72b-instruct-turbomind:
    gsm8k_accuracy: 78.12
    race-high_accuracy: 90.62
  deepseek-r1-distill-llama-70b-turbomind:
    gsm8k_accuracy: 50.00
    race-high_accuracy: 87.50
  deepseek-v2_5-1210-turbomind:
    gsm8k_accuracy: 90.62
    race-high_accuracy: 84.38
  mixtral-8x22b-instruct-v0.1-turbomind:
    gsm8k_accuracy: 75.00
    race-high_accuracy: 78.12
  mixtral-8x22b-instruct-v0.1-vllm:
    gsm8k_accuracy: 78.12
    race-high_accuracy: 78.12
base:
  glm-4-9b-turbomind:
    gsm8k_accuracy: 59.38
    GPQA_diamond_accuracy: 28.12
    race-high_accuracy: 93.75
    winogrande_accuracy: 84.38
  deepseek-7b-base-hf:
    gsm8k_accuracy: 25
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 46.88
    winogrande_accuracy: 71.88
  deepseek-7b-base-turbomind:
    gsm8k_accuracy: 18.75
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 50.00
    winogrande_accuracy: 84.38
  deepseek-moe-16b-base-vllm:
    gsm8k_accuracy: 25.00
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 25
    winogrande_accuracy: 68.75
  gemma2-2b-hf:
    gsm8k_accuracy: 31.25
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 56.25
    winogrande_accuracy: 75.00
  gemma2-9b-hf:
    gsm8k_accuracy: 75.00
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 84.38
    winogrande_accuracy: 81.25
  gemma-2b-hf:
    gsm8k_accuracy: 21.88
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 21.88
    winogrande_accuracy: 53.12
  gemma-7b-hf:
    gsm8k_accuracy: 56.25
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 65.62
    winogrande_accuracy: 71.88
  gemma-2-9b-turbomind:
    gsm8k_accuracy: 68.75
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 84.38
    winogrande_accuracy: 81.25
  gemma-2b-vllm:
    gsm8k_accuracy: 15.62
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 28.12
    winogrande_accuracy: 68.75
  gemma-7b-vllm:
    gsm8k_accuracy: 59.38
    GPQA_diamond_accuracy: 6.25
    race-high_accuracy: 81.25
    winogrande_accuracy: 81.25
  internlm2_5-7b-hf:
    gsm8k_accuracy: 37.5
    GPQA_diamond_accuracy: 25
    race-high_accuracy: 93.75
    winogrande_accuracy: 71.88
  internlm2-7b-hf:
    gsm8k_accuracy: 53.12
    GPQA_diamond_accuracy: 18.75
    race-high_accuracy: 62.5
    winogrande_accuracy: 78.12
  internlm2-1.8b-turbomind:
    gsm8k_accuracy: 12.50
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy: 71.88
    winogrande_accuracy: 75
  internlm2_5-7b-turbomind:
    gsm8k_accuracy: 62.5
    GPQA_diamond_accuracy: 31.25
    race-high_accuracy: 93.75
    winogrande_accuracy: 87.5
  internlm2-7b-turbomind:
    gsm8k_accuracy: 53.12
    GPQA_diamond_accuracy: 25.00
    race-high_accuracy: 78.12
    winogrande_accuracy: 71.88
  internlm2-base-7b-turbomind:
    gsm8k_accuracy: 25.00
    GPQA_diamond_accuracy: 34.38
    race-high_accuracy: 71.88
    winogrande_accuracy: 62.50
  llama-2-7b-hf:
    gsm8k_accuracy: 21.88
    GPQA_diamond_accuracy: 21.88
    race-high_accuracy: 40.62
    winogrande_accuracy: 71.88
  llama-3_1-8b-hf:
    gsm8k_accuracy: 78.12
    GPQA_diamond_accuracy: 25
    race-high_accuracy: 90.62
    winogrande_accuracy: 62.5
  llama-3-8b-hf:
    gsm8k_accuracy: 46.88
    GPQA_diamond_accuracy: 6.25
    race-high_accuracy: 65.62
    winogrande_accuracy: 65.62
  llama-3.1-8b-turbomind:
    gsm8k_accuracy: 56.25
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy: 78.12
    winogrande_accuracy: 78.12
  llama-3-8b-turbomind:
    gsm8k_accuracy: 46.88
    GPQA_diamond_accuracy: 12.50
    race-high_accuracy: 65.62
    winogrande_accuracy: 81.25
  mistral-7b-v0.3-hf:
    gsm8k_accuracy: 31.25
    GPQA_diamond_accuracy: 6.25
    race-high_accuracy: 62.5
    winogrande_accuracy: 59.38
  qwen2.5-7b-hf:
    gsm8k_accuracy: 81.25
    GPQA_diamond_accuracy: 18.75
    race-high_accuracy: 87.5
    winogrande_accuracy: 71.88
  qwen2.5-1.5b-turbomind:
    gsm8k_accuracy: 59.38
    GPQA_diamond_accuracy: 21.88
    race-high_accuracy: 78.12
    winogrande_accuracy: 71.88
  qwen2.5-7b-turbomind:
    gsm8k_accuracy: 78.12
    GPQA_diamond_accuracy: 21.88
    race-high_accuracy: 87.5
    winogrande_accuracy: 75.00
  qwen1.5-moe-a2.7b-hf:
    gsm8k_accuracy: 62.5
    GPQA_diamond_accuracy: 18.75
    race-high_accuracy: 84.38
    winogrande_accuracy: 75
  qwen2-0.5b-hf:
    gsm8k_accuracy: 25
    GPQA_diamond_accuracy: 0
    race-high_accuracy: 40.62
    winogrande_accuracy: 62.5
  qwen2-1.5b-hf:
    gsm8k_accuracy: 59.38
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy: 81.25
    winogrande_accuracy: 62.5
  qwen2-7b-hf:
    gsm8k_accuracy: 68.75
    GPQA_diamond_accuracy: 9.38
    race-high_accuracy: 87.5
    winogrande_accuracy: 68.75
  qwen2-1.5b-turbomind:
    gsm8k_accuracy: 56.25
    GPQA_diamond_accuracy: 12.50
    race-high_accuracy: 81.25
    winogrande_accuracy: 75
  qwen2-7b-turbomind:
    gsm8k_accuracy: 65.62
    GPQA_diamond_accuracy: 12.5
    race-high_accuracy: 87.5
    winogrande_accuracy: 75
  qwen1.5-0.5b-vllm:
    gsm8k_accuracy: 9.38
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 56.25
    winogrande_accuracy: 59.38
  yi-1.5-6b-hf:
    gsm8k_accuracy: 62.5
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 87.5
    winogrande_accuracy: 62.5
  yi-1.5-9b-hf:
    gsm8k_accuracy: 75
    GPQA_diamond_accuracy: 40.62
    race-high_accuracy: 87.5
    winogrande_accuracy: 59.38
  yi-1.5-9b-turbomind:
    gsm8k_accuracy: 75.00
    GPQA_diamond_accuracy: 40.62
    race-high_accuracy: 87.5
    winogrande_accuracy: 65.62
  internlm2-20b-turbomind:
    gsm8k_accuracy: 71.88
    GPQA_diamond_accuracy: 18.75
    race-high_accuracy: 68.75
    winogrande_accuracy: 81.25
  qwen2.5-14b-hf:
    gsm8k_accuracy: 75
    GPQA_diamond_accuracy: 37.5
    race-high_accuracy: 93.75
    winogrande_accuracy: 84.38
  qwen2.5-32b-hf:
    gsm8k_accuracy: 87.5
    GPQA_diamond_accuracy: 31.25
    race-high_accuracy: 93.75
    winogrande_accuracy: 78.12
  qwen2.5-32b-turbomind:
    gsm8k_accuracy: 90.62
    GPQA_diamond_accuracy: 31.25
    race-high_accuracy: 93.75
    winogrande_accuracy: 81.25
  deepseek-67b-base-turbomind:
    gsm8k_accuracy: 62.50
    GPQA_diamond_accuracy: 31.25
    race-high_accuracy: 78.12
    winogrande_accuracy: 81.25
  llama-3-70b-turbomind:
    gsm8k_accuracy: 56.25
    GPQA_diamond_accuracy: 15.62
    race-high_accuracy: 93.75
    winogrande_accuracy: 84.38
  qwen2.5-72b-turbomind:
    gsm8k_accuracy: 84.38
    GPQA_diamond_accuracy: 40.62
    race-high_accuracy: 93.75
    winogrande_accuracy: 87.5
  deepseek-v2-turbomind:
    gsm8k_accuracy: 65.62
    GPQA_diamond_accuracy: 3.12
    race-high_accuracy: 93.75
    winogrande_accuracy: 81.25
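
The testrange file is a two-level mapping (section -> model -> metric: baseline), which lends itself to flattening into test cases. A purely illustrative sketch (assuming PyYAML; none of these names come from the repository's test code):

import yaml

with open('.github/scripts/oc_score_baseline_testrange.yaml') as f:
    data = yaml.safe_load(f)

# Flatten into (section, model, metric, baseline) tuples, e.g. for
# parametrized assertions over every recorded score.
cases = [(section, model, metric, value)
         for section, models in data.items()
         for model, metrics in models.items()
         for metric, value in metrics.items()]
print(len(cases), cases[0])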
.github/scripts/pr_oc_score_assert.py (vendored, new file, 77 lines)
@@ -0,0 +1,77 @@
import csv
import os

import pytest

output_path = 'regression_result'
model = 'internlm2-chat-7b-hf'
dataset = 'siqa'


@pytest.fixture()
def result_scores():
    file = find_csv_files(output_path)
    if file is None:
        return None
    return read_csv_file(file)


@pytest.mark.usefixtures('result_scores')
class TestChatScore:
    """Test cases for chat model."""

    def test_model_dataset_score(self, result_scores):
        result_score = result_scores.get(model).get(dataset)
        assert_score(result_score, 79.53)


def assert_score(score, baseline):
    """Assert that a score lies within +/-3% of its baseline."""
    if score is None or score == '-':
        assert False, 'value is none'
    if baseline * 0.97 < float(score) < baseline * 1.03:
        print(score + ' between ' + str(baseline * 0.97) + ' and ' +
              str(baseline * 1.03))
    else:
        assert False, score + ' not between ' + str(
            baseline * 0.97) + ' and ' + str(baseline * 1.03)


def find_csv_files(directory):
    """Return the single summary CSV under `directory`, or None if absent."""
    csv_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.csv'):
                csv_files.append(os.path.join(root, file))
    if len(csv_files) > 1:
        # Raising a bare string is invalid in Python 3; use an exception type.
        raise RuntimeError(
            'have more than 1 result file, please check the result manually')
    if len(csv_files) == 0:
        return None
    return csv_files[0]


def read_csv_file(file_path):
    """Parse the summary CSV into a {model: {dataset: score}} mapping."""
    with open(file_path, 'r') as csvfile:
        reader = csv.DictReader(csvfile)
        filtered_data = []

        for row in reader:
            # Drop bookkeeping columns; keep only dataset and model scores.
            filtered_row = {
                k: v
                for k, v in row.items()
                if k not in ['version', 'metric', 'mode']
            }
            filtered_data.append(filtered_row)

    result = {}
    for data in filtered_data:
        dataset = data.get('dataset')
        for key in data.keys():
            if key == 'dataset':
                continue
            if key in result.keys():
                result.get(key)[dataset] = data.get(key)
            else:
                result[key] = {dataset: data.get(key)}
    return result
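
The +/-3% band in assert_score can also be expressed with the standard library. This is a sketch, not the repository's code; note that math.isclose scales the tolerance by the larger of the two values, so it is close to, but not identical to, the baseline-only bounds used above:

import math

def within_three_percent(score: float, baseline: float) -> bool:
    # Same open interval as assert_score: (0.97 * baseline, 1.03 * baseline).
    return baseline * 0.97 < score < baseline * 1.03

assert within_three_percent(79.9, 79.53)        # 77.14 < 79.9 < 81.92
assert not within_three_percent(75.0, 79.53)    # below the lower bound
assert math.isclose(79.9, 79.53, rel_tol=0.03)  # similar, slightly wider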
.github/workflows/daily-run-test.yml (vendored, 378 lines changed)
@@ -2,71 +2,347 @@ name: daily_run_test
 on:
   workflow_dispatch:
+    inputs:
+      repo_org:
+        required: false
+        description: 'Tested repository organization name. Default is open-compass/opencompass'
+        type: string
+        default: 'open-compass/opencompass'
+      repo_ref:
+        required: false
+        description: 'Set branch or tag or commit id. Default is "main"'
+        type: string
+        default: 'main'
+      build_lmdeploy:
+        required: false
+        description: 'whether to build lmdeploy'
+        type: boolean
+        default: false
+      repo_org_lmdeploy:
+        required: false
+        description: 'Tested repository organization name. Default is internlm/lmdeploy'
+        type: string
+        default: 'InternLM/lmdeploy'
+      repo_ref_lmdeploy:
+        required: false
+        description: 'Set branch or tag or commit id. Default is "main"'
+        type: string
+        default: 'main'
+      regression_func_volc:
+        required: true
+        description: 'regression functions'
+        type: string
+        default: "['chat_models','base_models', 'chat_obj_fullbench', 'base_fullbench']"
+      regression_func_local:
+        required: true
+        description: 'regression functions'
+        type: string
+        default: "['cmd', 'api', 'chat_sub_fullbench']"
+      fullbench_eval:
+        required: true
+        description: 'fullbench volc functions'
+        type: string
+        default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']"
   schedule:
-    - cron: '56 16 * * *'
+    - cron: '15 14 * * 0,3'
 
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
 
 env:
-  CONDA_ENV: opencompass_regression
-  PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
-  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
-  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
+  HF_DATASETS_OFFLINE: 1
+  HF_EVALUATE_OFFLINE: 1
+  TRANSFORMERS_OFFLINE: 1
+  VLLM_USE_MODELSCOPE: false
+  LMDEPLOY_USE_MODELSCOPE: false
+  HF_HUB_OFFLINE: 1
+  OUTPUT_FOLDER: cuda12.1_dist_${{ github.run_id }}
+  CONDA_PATH: ${{ secrets.WORKSPACE_PREFIX }}/miniconda3
+  PIP_CACHE_PATH: ${{ secrets.WORKSPACE_PREFIX }}/.cache/pip
+  REPORT_ROOT: ${{ secrets.WORKSPACE_PREFIX }}/eval_report/regression
+  COMPASS_DATA_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/datasets/compass_data_cache
+  HUGGINGFACE_HUB_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/models/opencompass_hf_hub
+  HF_HUB_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/models/opencompass_hf_hub
+  HF_DATASETS_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/datasets/hf_datasets_cache
+  HF_ENDPOINT: https://hf-mirror.com
+  CONDA_ENV: regression_test
+  export VLLM_WORKER_MULTIPROC_METHOD: spawn
 
 jobs:
-  daily_run_test:
-    runs-on: self-hosted
-    environment: 'prod'
-    timeout-minutes: 240 #4hours
+  build-pypi:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+      - name: Build lagent
+        run: |
+          pip install wheel setuptools
+          python setup.py sdist bdist_wheel
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          if-no-files-found: error
+          path: dist/*
+          retention-days: 1
+          name: my-artifact-${{ github.run_id }}
+
+  build-pypi-lmdeploy:
+    if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.build_lmdeploy)}}
+    strategy:
+      matrix:
+        pyver: [py310]
+    runs-on: ubuntu-latest
+    env:
+      PYTHON_VERSION: ${{ matrix.pyver }}
+      PLAT_NAME: manylinux2014_x86_64
+      DOCKER_TAG: cuda12.1
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          repository: ${{ github.event.inputs.repo_org_lmdeploy || 'InternLM/lmdeploy' }}
+          ref: ${{github.event.inputs.repo_ref_lmdeploy || 'main'}}
+      - name: Build
+        run: |
+          echo ${PYTHON_VERSION}
+          echo ${PLAT_NAME}
+          echo ${DOCKER_TAG}
+          echo ${OUTPUT_FOLDER}
+          echo ${GITHUB_RUN_ID}
+          # remove -it
+          sed -i 's/docker run --rm -it/docker run --rm/g' builder/manywheel/build_wheel.sh
+          bash builder/manywheel/build_wheel.sh ${PYTHON_VERSION} ${PLAT_NAME} ${DOCKER_TAG} ${OUTPUT_FOLDER}
+      - name: Upload Artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          if-no-files-found: error
+          path: builder/manywheel/${{ env.OUTPUT_FOLDER }}
+          retention-days: 1
+          name: my-artifact-${{ github.run_id }}-${{ matrix.pyver }}
+
+  prepare_env:
+    if: ${{!cancelled()}}
+    needs: ['build-pypi', 'build-pypi-lmdeploy']
+    runs-on: volc_cu12
+    timeout-minutes: 120 #2hours
     steps:
       - name: Clone repository
         uses: actions/checkout@v2
-      - name: Prepare - create conda env and install torch
-        run: |
-          eval "$(conda shell.bash hook)"
-          conda create -y --name ${{env.CONDA_ENV}} python=3.10
-          conda activate ${{env.CONDA_ENV}}
-          pip install torch torchvision torchaudio --cache-dir ${{env.PIP_CACHE_PATH}} --index-url https://download.pytorch.org/whl/cu118
-          conda info --envs
-      - name: Prepare - Pip install code
-        run: |
-          eval "$(conda shell.bash hook)"
-          conda activate ${{env.CONDA_ENV}}
-          pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip install human_eval transformers==4.33.0 --cache-dir ${{env.PIP_CACHE_PATH}}
-          conda info --envs
-      - name: Prepare - prepare data and hf model
-        run: |
-          cp -r ${{env.USERSPACE_PREFIX}}/data .
-          rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
-          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
-          export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1;
-      - name: Run test
-        run: |
-          eval "$(conda shell.bash hook)"
-          conda activate ${{env.CONDA_ENV}}
-          conda info --envs
-          rm -rf regression_result_daily
-          export from_tf=TRUE
-          python3 run.py --models hf_internlm_chat_7b hf_internlm_7b hf_chatglm3_6b_base hf_chatglm3_6b hf_qwen_7b_chat hf_qwen_7b --datasets FewCLUE_chid_ppl humaneval_gen ARC_c_ppl obqa_ppl --work-dir regression_result_daily
-      - name: Get result
-        run: |
-          eval "$(conda shell.bash hook)"
-          pip install pytest --cache-dir ${{env.PIP_CACHE_PATH}}
-          python -m pytest -s -v --color=yes .github/scripts/oc_score_assert.py
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: Download Artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: my-artifact-${{ github.run_id }}
       - name: Remove Conda Env
         if: always()
         run: |
-          eval "$(conda shell.bash hook)"
-          conda env remove --name ${{env.CONDA_ENV}}
+          . ${{ secrets.WORKSPACE_PREFIX }}/miniconda3/bin/activate
+          conda env remove -y --name ${{env.CONDA_ENV}}
           conda info --envs
+      - name: Prepare - create conda env and install torch - cu12
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 3
+          timeout_minutes: 120
+          command: |
+            . ${{env.CONDA_PATH}}/bin/activate
+            conda create -y --name ${{env.CONDA_ENV}} python=3.10
+            conda activate ${{env.CONDA_ENV}}
+            pip install -r ${{ secrets.WORKSPACE_PREFIX }}/config/requirements.txt --cache-dir ${{env.PIP_CACHE_PATH}}
+            pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
+            pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
+            pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
+            pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}}
+            pip install opencompass[api] --cache-dir ${{env.PIP_CACHE_PATH}}
+            pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --cache-dir ${{env.PIP_CACHE_PATH}}
+            FLASH_ATTENTION_FORCE_BUILD=TRUE pip install ${{ secrets.WORKSPACE_PREFIX }}/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+            pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
+            cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
+      - name: Prepare - reinstall lmdeploy - cu12
+        if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
+        uses: actions/download-artifact@v4
+        with:
+          name: my-artifact-${{ github.run_id }}-py310
+      - name: Prepare - reinstall lmdeploy - cu12
+        if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          pip uninstall -y lmdeploy
+          pip install lmdeploy-*.whl --no-deps
+      - name: conda env
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          pip list
+
+  daily_run_test_volc:
+    if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}}
+    needs: prepare_env
+    strategy:
+      fail-fast: false
+      matrix:
+        regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
+    runs-on: volc_cu12_daily
+    timeout-minutes: 180 #3hours
+    steps:
+      - name: Clone repository
+        uses: actions/checkout@v2
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: conda env
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          pip list
+      - name: modify config
+        if: matrix.regression_func != 'chat_sub_fullbench'
+        run: |
+          cp -r ${{ secrets.WORKSPACE_PREFIX }}/ocplayground/template/configs_cluster/volc.py .
+          cat ${{ secrets.WORKSPACE_PREFIX }}/config/test_config.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
+      - name: Run test
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 1
+          timeout_minutes: 180
+          command: |
+            . ${{env.CONDA_PATH}}/bin/activate
+            conda activate ${{env.CONDA_ENV}}
+            conda info --envs
+            opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details
+            rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}}/*/summary regression_result_daily
+            python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py
+
+  daily_run_test_local:
+    if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}}
+    needs: prepare_env
+    strategy:
+      fail-fast: false
+      matrix:
+        regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}}
+    runs-on: volc_cu12_local
+    timeout-minutes: 480 #6hours
+    steps:
+      - name: Clone repository
+        uses: actions/checkout@v2
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: conda env
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          pip list
+      - name: modify config
+        if: matrix.regression_func == 'chat_sub_fullbench'
+        run: |
+          cp -r ${{ secrets.WORKSPACE_PREFIX }}/ocplayground/template/configs_cluster/volc.py .
+          cat ${{ secrets.WORKSPACE_PREFIX }}/config/test_config_sub.txt >> .github/scripts/eval_regression_${{matrix.regression_func}}.py
+      - name: Run command testcase
+        if: matrix.regression_func == 'cmd'
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          export from_tf=TRUE
+          python tools/list_configs.py internlm2_5 mmlu
+          opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily
+          python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py
+          opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily
+          python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py
+          opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily
+          python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py
+          opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily
+          python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py
+          opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily
+          python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py
+      - name: Run model test - api
+        if: matrix.regression_func == 'api'
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          lmdeploy serve api_server internlm/internlm3-8b-instruct --max-batch-size 256 --model-name internlm3 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
+          echo "restful_pid=$!" >> "$GITHUB_ENV"
+          sleep 180s
+          env | grep PROXY
+          env | grep proxy
+          unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy;
+          opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
+          python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
+      - name: Run model test - api kill
+        if: always() && matrix.regression_func == 'api'
+        run: |
+          kill -15 "$restful_pid"
+      - name: Run testcase
+        if: matrix.regression_func == 'chat_sub_fullbench'
+        env:
+          COMPASS_DATA_CACHE: ${{ secrets.SHARESPACE_PREFIX }}/datasets/compass_data_cache_subset
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          export from_tf=TRUE
+          opencompass .github/scripts/eval_regression_${{matrix.regression_func}}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}} --reuse --dump-eval-details
+          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{matrix.regression_func}}/*/summary regression_result_daily
+          python -m pytest -m ${{matrix.regression_func}} -s -v --color=yes .github/scripts/oc_score_assert.py
+
+  fullbench_run_test:
+    if: ${{!cancelled() && contains(needs.prepare_env.result, 'success')}}
+    needs: prepare_env
+    strategy:
+      fail-fast: false
+      matrix:
+        function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}}
+    runs-on: volc_cu12
+    timeout-minutes: 480 #6hours
+    steps:
+      - name: Clone repository
+        uses: actions/checkout@v2
+        with:
+          repository: ${{ github.event.inputs.repo_org || 'open-compass/opencompass' }}
+          ref: ${{github.event.inputs.repo_ref || 'main'}}
+      - name: conda env
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          pip list
+      - name: Run testcase
+        uses: nick-fields/retry@v3
+        with:
+          max_attempts: 1
+          timeout_minutes: 480
+          command: |
+            . ${{env.CONDA_PATH}}/bin/activate
+            conda activate ${{env.CONDA_ENV}}
+            conda info --envs
+            export from_tf=TRUE
+            opencompass ${{ secrets.WORKSPACE_PREFIX }}/ocplayground/template/regression/eval_${{ matrix.function_type }}.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }} --reuse
+            rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/${{ matrix.function_type }}/*/summary regression_result_daily
+            python -m pytest -m ${{ matrix.function_type }} -s -v --color=yes .github/scripts/oc_score_assert.py
 
   notify_to_feishu:
-    if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
-    needs: [daily_run_test]
-    environment: 'prod'
+    if: ${{ always() && github.event_name == 'schedule' && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
+    needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
     timeout-minutes: 5
     runs-on: self-hosted
     steps:
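
Each matrix entry above is produced by fromJSON over the workflow-dispatch input (or its default). As a local, purely illustrative sketch of how such a default expands into per-job values, Python's ast.literal_eval can be used, since the defaults quote entries with single quotes, which strict json.loads would reject:

import ast

default = "['chat_models','base_models', 'chat_obj_fullbench', 'base_fullbench']"
for func in ast.literal_eval(default):
    # One matrix job per entry, each running its own regression config.
    print(f"would run: eval_regression_{func}.py")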
.github/workflows/link-check.yml (vendored, 13 lines changed)
@@ -5,13 +5,22 @@ on:
|
|||||||
# check links at 01:30 a.m. every day
|
# check links at 01:30 a.m. every day
|
||||||
- cron: '30 1 * * *'
|
- cron: '30 1 * * *'
|
||||||
|
|
||||||
|
workflow_dispatch: # allow manual trigger
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
link-check:
|
link-check:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
# - uses: actions/checkout@v3
|
# - uses: actions/checkout@v3
|
||||||
|
|
||||||
- name: linkchecker
|
- name: Install linkchecker
|
||||||
run: |
|
run: |
|
||||||
pip install linkchecker
|
pip install linkchecker
|
||||||
linkchecker https://opencompass.readthedocs.io/ --no-robots -t 30
|
|
||||||
|
- name: Run linkchecker
|
||||||
|
run: |
|
||||||
|
linkchecker https://opencompass.readthedocs.io/ --no-robots -t 30 --no-warnings \
|
||||||
|
--ignore-url "https://opencompass.readthedocs.io/.*/static/images/opencompass_logo.svg" \
|
||||||
|
--ignore-url "https://opencompass.readthedocs.io/.*/_static/images/icon-menu-dots.svg" \
|
||||||
|
--ignore-url "https://opencompass.readthedocs.io/policy" \
|
||||||
|
--ignore-url "https://opencompass.readthedocs.io/(en|zh_CN)/[0-9a-f]{40}/.*"
|
||||||
.github/workflows/lint.yml (vendored, 2 changed lines)

@@ -17,7 +17,7 @@ jobs:
           python-version: '3.10'
       - name: Install pre-commit hook
         run: |
-          pip install pre-commit mmengine
+          pip install pre-commit==3.8.0 mmengine==0.10.5
           pre-commit install
       - name: Linting
         run: pre-commit run --all-files
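Pinning `pre-commit==3.8.0` and `mmengine==0.10.5` keeps the lint job reproducible across runners. The same checks can be reproduced locally before opening a PR; a sketch assuming a clean virtualenv:

```bash
pip install pre-commit==3.8.0 mmengine==0.10.5
pre-commit install             # register the git hook once per clone
pre-commit run --all-files     # exactly what the CI Linting step runs
```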
.github/workflows/pr-run-test.yml (vendored, 70 changed lines)

@@ -8,62 +8,88 @@ on:
       - 'docs/**'
       - 'configs/**'
       - 'tools/**'

   workflow_dispatch:
   schedule:
     - cron: '56 22 * * *'

 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true

 env:
-  CONDA_ENV: opencompass_base
-  USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
-  HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
+  CONDA_ENV: pr_test
+  HF_DATASETS_OFFLINE: 1
+  HF_EVALUATE_OFFLINE: 1
+  TRANSFORMERS_OFFLINE: 1
+  VLLM_USE_MODELSCOPE: false
+  LMDEPLOY_USE_MODELSCOPE: false
+  HF_HUB_OFFLINE: 1
+  CONDA_PATH: /fs-computility/llm/qa-llm-cicd/miniconda3
+  PIP_CACHE_PATH: /fs-computility/llm/qa-llm-cicd/.cache/pip
+  REPORT_ROOT: /fs-computility/llm/qa-llm-cicd/eval_report/prtest
+  COMPASS_DATA_CACHE: /fs-computility/llm/shared/llmeval/datasets/compass_data_cache
+  HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub
+  HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub

 jobs:
   pr_run_test:
-    runs-on: self-hosted
+    runs-on: volc_cu12_local
     environment: 'prod'
     timeout-minutes: 30
     steps:
-      - name: Clone repository
+      - name: Checkout repository
         uses: actions/checkout@v2
       - name: Prepare - Install opencompass
         run: |
-          eval "$(conda shell.bash hook)"
+          . ${{env.CONDA_PATH}}/bin/activate
           conda activate ${{env.CONDA_ENV}}
           python3 -m pip uninstall opencompass -y
-          python3 -m pip install -e . --cache-dir ${{env.USERSPACE_PREFIX}}/.cache/pip
+          python3 -m pip install -e ".[full]" --cache-dir ${{env.PIP_CACHE_PATH}}
           conda info --envs
-      - name: Prepare - prepare data and hf model
+      - name: conda env
         run: |
-          cp -r ${{env.USERSPACE_PREFIX}}/data .
-          rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
-          ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
-          export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1;
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          conda info --envs
+          pip list
+          lmdeploy check_env
       - name: Run test
         run: |
-          eval "$(conda shell.bash hook)"
+          . ${{env.CONDA_PATH}}/bin/activate
           conda activate ${{env.CONDA_ENV}}
           conda info --envs
           rm -rf regression_result
-          python3 run.py --models hf_internlm_chat_7b --datasets siqa_gen --work-dir regression_result --debug
+          opencompass --models hf_internlm2_5_20b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1 --debug
+          opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2 --debug --max-num-workers 2
+          opencompass --models hf_internlm2_5_7b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3 --debug --max-num-workers 2
       - name: Get result
         run: |
-          score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}')
+          score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result1/*/summary/*.csv | awk -F ',' '{print $NF}')
-          if (( ${score%.*} >= 70 && ${score%.*} <= 75 )); then
+          if (( ${score%.*} >= 88 && ${score%.*} <= 89 )); then
-            echo "score is $score between 70 and 75"
+            echo "score is $score between 88 and 89"
           else
-            echo "score is $score not between 70 and 75"
+            echo "score is $score not between 88 and 89"
+            exit 1
+          fi
+          score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result2/*/summary/*.csv | awk -F ',' '{print $NF}')
+          if (( ${score%.*} >= 87 && ${score%.*} <= 88 )); then
+            echo "score is $score between 87 and 88"
+          else
+            echo "score is $score not between 87 and 88"
+            exit 1
+          fi
+          score=$(sed -n '$p' ${{env.REPORT_ROOT}}/${{ github.run_id }}/regression_result3/*/summary/*.csv | awk -F ',' '{print $NF}')
+          if (( ${score%.*} >= 87 && ${score%.*} <= 91 )); then
+            echo "score is $score between 87 and 91"
+          else
+            echo "score is $score not between 87 and 91"
             exit 1
           fi
-          rm -rf regression_result
       - name: Uninstall opencompass
         if: always()
         run: |
-          eval "$(conda shell.bash hook)"
+          . ${{env.CONDA_PATH}}/bin/activate
           conda activate ${{env.CONDA_ENV}}
           python3 -m pip uninstall opencompass -y
           conda info --envs

@@ -71,9 +97,9 @@ jobs:
   notify_to_feishu:
     if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
     needs: [pr_run_test]
-    environment: 'prod'
     timeout-minutes: 5
     runs-on: self-hosted
+    environment: 'prod'
     steps:
       - name: notify
         run: |
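The `Get result` step above is a score gate: it takes the last row of each run's summary CSV, keeps the final comma-separated column, and fails the job when the integer part of that score leaves an expected band. A hedged restatement as a reusable shell function (`check_score` and its argument names are illustrative, not part of the repo):

```bash
#!/usr/bin/env bash
# Fail (return 1) unless the last summary row's final column is in [low, high].
check_score() {
  local csv_glob=$1 low=$2 high=$3 score
  # sed '$p' prints only the last line; awk keeps the final CSV field.
  score=$(sed -n '$p' ${csv_glob} | awk -F ',' '{print $NF}')
  if (( ${score%.*} >= low && ${score%.*} <= high )); then
    echo "score is $score between $low and $high"
  else
    echo "score is $score not between $low and $high"
    return 1
  fi
}

# Usage mirroring the three gates in the workflow (bands taken from the diff):
check_score "regression_result1/*/summary/*.csv" 88 89 &&
check_score "regression_result2/*/summary/*.csv" 87 88 &&
check_score "regression_result3/*/summary/*.csv" 87 91
```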
.github/workflows/pr-stage-check.yml (vendored, 18 changed lines)

@@ -20,7 +20,7 @@ jobs:
       matrix:
         python-version: ['3.10']
         include:
-          - torch: 2.0.0
+          - torch: 2.5.1
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python ${{ matrix.python-version }}

@@ -30,7 +30,7 @@ jobs:
       - name: Upgrade pip
         run: python -m pip install --upgrade pip
       - name: Install PyTorch
-        run: pip install torch==${{matrix.torch}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
+        run: pip install torch==${{matrix.torch}} -f https://download.pytorch.org/whl/cpu/torch_stable.html
       - name: Install system dependencies
         run: |
           sudo sed -i '$ a deb http://th.archive.ubuntu.com/ubuntu jammy main' /etc/apt/sources.list

@@ -44,8 +44,8 @@ jobs:
         run: python -m pip install -e .
       - name: Prepare dataset
         run: |
-          wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
+          wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
-          unzip OpenCompassData-core-20231110.zip
+          unzip OpenCompassData-core-20240207.zip
       - name: Dry run test
         run: |
           python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run

@@ -85,8 +85,8 @@ jobs:
         run: python -m pip install -e .
       - name: Prepare dataset
         run: |
-          wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
+          wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
-          unzip OpenCompassData-core-20231110.zip
+          unzip OpenCompassData-core-20240207.zip
       - name: Dry run test
         run: |
           python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run

@@ -106,7 +106,7 @@ jobs:
       - name: Upgrade pip
         run: python -m pip install pip --upgrade
       - name: Install PyTorch
-        run: pip install torch==2.0.0+${{matrix.platform}} -f https://download.pytorch.org/whl/${{matrix.platform}}/torch_stable.html
+        run: pip install torch==2.5.1 -f https://download.pytorch.org/whl/cpu/torch_stable.html
       - name: Install opencompass dependencies
         run: |
           pip install -r requirements.txt

@@ -114,8 +114,8 @@ jobs:
         run: pip install -e .
       - name: Prepare dataset
         run: |
-          Invoke-WebRequest -Uri https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip -OutFile OpenCompassData-core-20231110.zip
+          Invoke-WebRequest -Uri https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip -OutFile OpenCompassData-core-20240207.zip
-          unzip OpenCompassData-core-20231110.zip
+          unzip OpenCompassData-core-20240207.zip
       - name: Dry run test
         run: |
           python run.py --models hf_opt_125m --datasets siqa_gen winograd_ppl --dry-run
.github/workflows/publish-to-pypi.yml (vendored, 23 changed lines)

@@ -1,21 +1,26 @@
 name: deploy

-on: push
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
+on:
+  push:
+  workflow_dispatch:
+    inputs:
+      confirm_publish:
+        description: 'Type YES to confirm publishing to PyPI'
+        required: true
+        type: string

 jobs:
   build-n-publish:
     runs-on: ubuntu-latest
-    if: startsWith(github.event.ref, 'refs/tags')
+    if: |
+      github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') ||
+      (github.event_name == 'workflow_dispatch' && inputs.confirm_publish == 'YES')
     steps:
       - uses: actions/checkout@v2
-      - name: Set up Python 3.7
-        uses: actions/setup-python@v1
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v4
         with:
-          python-version: 3.7
+          python-version: '3.10'
       - name: Build lagent
         run: |
           pip install wheel
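Two publish paths now exist: a tag push, or a manual `workflow_dispatch` that must be confirmed with the literal string YES. Assuming the GitHub CLI is available, the manual path could be triggered roughly like this (illustrative invocation, not from the repo):

```bash
# Tag-driven release: pushing a tag still publishes automatically.
git tag v0.4.0 && git push origin v0.4.0

# Manual release: the job's `if:` guard only passes when the input is exactly YES.
gh workflow run deploy -f confirm_publish=YES
```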
.gitignore (vendored, 15 changed lines)

@@ -1,6 +1,7 @@
+.DS_Store
 output_*/
 outputs/
+scripts/
 icl_inference_output/
 .vscode/
 tmp/

@@ -10,8 +11,11 @@ configs/secrets.py
 configs/datasets/log.json
 configs/eval_debug*.py
 configs/viz_*.py
+configs/**/*_bkup.py
+opencompass/**/*_bkup.py
 data
 work_dirs
+outputs
 models/*
 configs/internal/
 # Byte-compiled / optimized / DLL files

@@ -91,9 +95,14 @@ docs/zh_cn/_build/

 # sft config ignore list
 configs/sft_cfg/*B_*
+configs/sft_cfg/1B/*
 configs/sft_cfg/7B/*
 configs/sft_cfg/20B/*
+configs/sft_cfg/60B/*
+configs/sft_cfg/100B/*

 configs/cky/
+configs/_internal_legacy*
 # in case llama clone in the opencompass
 llama/

@@ -117,6 +126,10 @@ turbomind/
 *.txt
 *.jpg
 *.json
+*.jsonl
 *.csv
 *.npy
 *.c

+# aliyun
+core.*
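With the broader ignore list (`*.jsonl`, `core.*`, the extra `configs/sft_cfg/` trees), `git check-ignore -v` is a quick way to confirm which rule claims a given path. The paths below are made-up examples:

```bash
# -v prints the .gitignore file and line number that matched each path.
git check-ignore -v \
    outputs/run1/predictions.jsonl \
    core.12345 \
    configs/sft_cfg/100B/exp.py
```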
@@ -7,8 +7,8 @@ assign:
   scedule:
     '*/1 * * * *'
   assignees:
-    - Leymore
     - bittersweet1999
-    - yingfhu
+    - liushz
-    - kennymckormick
+    - MaiziXiao
+    - acylam
     - tonysy
@@ -1,6 +1,7 @@
 exclude: |
     (?x)^(
         tests/data/|
+        tests/dataset/|
         opencompass/models/internal/|
         opencompass/utils/internal/|
         opencompass/openicl/icl_evaluator/hf_metrics/|

@@ -9,24 +10,41 @@ exclude: |
         opencompass/datasets/medbench/|
         opencompass/datasets/teval/|
         opencompass/datasets/NPHardEval/|
-        docs/zh_cn/advanced_guides/compassbench_intro.md
+        opencompass/datasets/TheoremQA|
+        opencompass/datasets/subjective/mtbench101.py|
+        docs/zh_cn/advanced_guides/compassbench_intro.md |
+        docs/zh_cn/advanced_guides/compassbench_v2_0.md |
+        opencompass/utils/datasets.py |
+        opencompass/utils/datasets_info.py
     )
 repos:
   - repo: https://gitee.com/openmmlab/mirrors-flake8
     rev: 5.0.4
     hooks:
       - id: flake8
-        exclude: configs/
+        exclude: |
+          (?x)^(
+          opencompass/configs/|
+          examples/
+          )
   - repo: https://gitee.com/openmmlab/mirrors-isort
     rev: 5.11.5
     hooks:
       - id: isort
-        exclude: configs/
+        exclude: |
+          (?x)^(
+          opencompass/configs/|
+          examples/
+          )
   - repo: https://gitee.com/openmmlab/mirrors-yapf
     rev: v0.32.0
     hooks:
       - id: yapf
-        exclude: configs/
+        exclude: |
+          (?x)^(
+          opencompass/configs/|
+          examples/
+          )
   - repo: https://gitee.com/openmmlab/mirrors-codespell
     rev: v2.2.1
     hooks:

@@ -34,7 +52,9 @@ repos:
         exclude: |
           (?x)^(
               .*\.jsonl|
-              configs/
+              .*\.md.template|
+              opencompass/configs/ |
+              examples/
           )
   - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
     rev: v4.3.0

@@ -44,7 +64,6 @@ repos:
           (?x)^(
               dicts/|
               projects/.*?/dicts/|
-              configs/
           )
       - id: check-yaml
       - id: end-of-file-fixer

@@ -52,18 +71,14 @@ repos:
           (?x)^(
               dicts/|
               projects/.*?/dicts/|
-              configs/
           )
       - id: requirements-txt-fixer
       - id: double-quote-string-fixer
-        exclude: configs/
       - id: check-merge-conflict
       - id: fix-encoding-pragma
         args: ["--remove"]
       - id: mixed-line-ending
         args: ["--fix=lf"]
-      - id: mixed-line-ending
-        args: ["--fix=lf"]
   - repo: https://gitee.com/openmmlab/mirrors-mdformat
     rev: 0.7.9
     hooks:

@@ -87,7 +102,25 @@ repos:
         language: script
         pass_filenames: true
         require_serial: true
-        files: ^configs/datasets
+        files: ^opencompass/configs/datasets
+  - repo: local
+    hooks:
+      - id: update-dataset-suffix-pacakge
+        name: dataset suffix updater(package)
+        entry: ./tools/update_dataset_suffix.py
+        language: script
+        pass_filenames: false
+        # require_serial: true
+        # files: ^opencompass/configs/datasets
+        args:
+          - --root_folder
+          - opencompass/configs/datasets
+  - repo: https://gitee.com/mirrors/gitleaks
+    rev: v8.23.1
+    hooks:
+      - id: gitleaks
+        entry: "gitleaks dir"
+        args: ["--verbose", "--redact=50"]
 # - repo: https://github.com/open-mmlab/pre-commit-hooks
 #   rev: v0.2.0 # Use the ref you want to point at
 #   hooks:
@@ -1,32 +1,51 @@
 exclude: |
     (?x)^(
         tests/data/|
+        tests/dataset/|
         opencompass/models/internal/|
         opencompass/utils/internal/|
         opencompass/openicl/icl_evaluator/hf_metrics/|
         opencompass/datasets/lawbench/utils|
         opencompass/datasets/lawbench/evaluation_functions/|
         opencompass/datasets/medbench/|
+        opencompass/datasets/matbench/|
         opencompass/datasets/teval/|
         opencompass/datasets/NPHardEval/|
-        docs/zh_cn/advanced_guides/compassbench_intro.md
+        opencompass/datasets/TheoremQA|
+        opencompass/datasets/subjective/mtbench101.py|
+        docs/zh_cn/advanced_guides/compassbench_intro.md |
+        docs/zh_cn/advanced_guides/compassbench_v2_0.md |
+        opencompass/utils/datasets.py |
+        opencompass/utils/datasets_info.py
     )
 repos:
   - repo: https://github.com/PyCQA/flake8
     rev: 5.0.4
     hooks:
       - id: flake8
-        exclude: configs/
+        exclude: |
+          (?x)^(
+          opencompass/configs/|
+          examples/
+          )
   - repo: https://github.com/PyCQA/isort
     rev: 5.11.5
     hooks:
       - id: isort
-        exclude: configs/
+        exclude: |
+          (?x)^(
+          opencompass/configs/|
+          examples/
+          )
   - repo: https://github.com/pre-commit/mirrors-yapf
     rev: v0.32.0
     hooks:
       - id: yapf
-        exclude: configs/
+        exclude: |
+          (?x)^(
+          opencompass/configs/|
+          examples/
+          )
   - repo: https://github.com/codespell-project/codespell
     rev: v2.2.1
     hooks:

@@ -34,7 +53,9 @@ repos:
         exclude: |
           (?x)^(
               .*\.jsonl|
-              configs/
+              .*\.md.template|
+              opencompass/configs/ |
+              examples/
           )
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.3.0

@@ -44,7 +65,6 @@ repos:
           (?x)^(
               dicts/|
               projects/.*?/dicts/|
-              configs/
           )
       - id: check-yaml
      - id: end-of-file-fixer

@@ -52,18 +72,14 @@ repos:
           (?x)^(
               dicts/|
               projects/.*?/dicts/|
-              configs/
           )
       - id: requirements-txt-fixer
       - id: double-quote-string-fixer
-        exclude: configs/
       - id: check-merge-conflict
       - id: fix-encoding-pragma
         args: ["--remove"]
       - id: mixed-line-ending
         args: ["--fix=lf"]
-      - id: mixed-line-ending
-        args: ["--fix=lf"]
   - repo: https://github.com/executablebooks/mdformat
     rev: 0.7.9
     hooks:

@@ -87,7 +103,25 @@ repos:
         language: script
         pass_filenames: true
         require_serial: true
-        files: ^configs/datasets
+        files: ^opencompass/configs/datasets
+  - repo: local
+    hooks:
+      - id: update-dataset-suffix-pacakge
+        name: dataset suffix updater(package)
+        entry: ./tools/update_dataset_suffix.py
+        language: script
+        pass_filenames: false
+        # require_serial: true
+        # files: ^opencompass/configs/datasets
+        args:
+          - --root_folder
+          - opencompass/configs/datasets
+  - repo: https://github.com/gitleaks/gitleaks
+    rev: v8.23.1
+    hooks:
+      - id: gitleaks
+        entry: "gitleaks dir"
+        args: ["--verbose", "--redact=50"]
 # - repo: https://github.com/open-mmlab/pre-commit-hooks
 #   rev: v0.2.0 # Use the ref you want to point at
 #   hooks:
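The newly added gitleaks hook scans the working tree for committed secrets and redacts most of each finding in its output. It can also be exercised on its own, which is handy when auditing a branch; the commands follow the hook's `entry` and `args` from the diff above:

```bash
# Run only the secret scanner through pre-commit:
pre-commit run gitleaks --all-files

# Equivalent direct invocation, matching the hook definition:
gitleaks dir --verbose --redact=50
```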
@@ -1,14 +0,0 @@
-version: 2
-
-# Set the version of Python and other tools you might need
-build:
-  os: ubuntu-22.04
-  tools:
-    python: "3.8"
-
-formats:
-  - epub
-
-python:
-  install:
-    - requirements: requirements/docs.txt
MANIFEST.in (new file, 3 lines)

@@ -0,0 +1,3 @@
+recursive-include opencompass/configs *.py *.yml *.json *.txt *.md
+recursive-include opencompass/openicl/icl_evaluator/hf_metrics *.py
+recursive-include opencompass/datasets *.py *.yml *.json *.txt *.md *.yaml
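`MANIFEST.in` only affects source distributions, so a quick sanity check is to build an sdist and confirm the packaged configs are actually inside. A sketch (`python -m build` requires the `build` package, and the tarball name depends on the current version):

```bash
pip install build
python -m build --sdist
# The bundled configs should now appear inside the tarball:
tar -tzf dist/opencompass-*.tar.gz | grep 'opencompass/configs' | head
```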
README.md (674 changed lines)

@@ -3,37 +3,44 @@
 <br />
 <br />

-[![docs](https://readthedocs.org/projects/opencompass/badge)](https://opencompass.readthedocs.io/en)
-[![license](https://img.shields.io/github/license/InternLM/opencompass.svg)](https://github.com/open-compass/opencompass/blob/main/LICENSE)
+[![][github-release-shield]][github-release-link]
+[![][github-releasedate-shield]][github-releasedate-link]
+[![][github-contributors-shield]][github-contributors-link]<br>
+[![][github-forks-shield]][github-forks-link]
+[![][github-stars-shield]][github-stars-link]
+[![][github-issues-shield]][github-issues-link]
+[![][github-license-shield]][github-license-link]

 <!-- [![PyPI](https://badge.fury.io/py/opencompass.svg)](https://pypi.org/project/opencompass/) -->

 [🌐Website](https://opencompass.org.cn/) |
+[📖CompassHub](https://hub.opencompass.org.cn/home) |
+[📊CompassRank](https://rank.opencompass.org.cn/home) |
 [📘Documentation](https://opencompass.readthedocs.io/en/latest/) |
 [🛠️Installation](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) |
 [🤔Reporting Issues](https://github.com/open-compass/opencompass/issues/new/choose)

 English | [简体中文](README_zh-CN.md)

+[![][github-trending-shield]][github-trending-url]
+
 </div>

 <p align="center">
     👋 join us on <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=opencompass" target="_blank">WeChat</a>
 </p>

-## 📣 OpenCompass 2023 LLM Annual Leaderboard
-
-We are honored to have witnessed the tremendous progress of artificial general intelligence together with the community in the past year, and we are also very pleased that **OpenCompass** can help numerous developers and users.
-
-We announce the launch of the **OpenCompass 2023 LLM Annual Leaderboard** plan. We expect to release the annual leaderboard of the LLMs in January 2024, systematically evaluating the performance of LLMs in various capabilities such as language, knowledge, reasoning, creation, long-text, and agents.
-
-At that time, we will release rankings for both open-source models and commercial API models, aiming to provide a comprehensive, objective, and neutral reference for the industry and research community.
-
-We sincerely invite various large models to join the OpenCompass to showcase their performance advantages in different fields. At the same time, we also welcome researchers and developers to provide valuable suggestions and contributions to jointly promote the development of the LLMs. If you have any questions or needs, please feel free to [contact us](mailto:opencompass@pjlab.org.cn). In addition, relevant evaluation contents, performance statistics, and evaluation methods will be open-source along with the leaderboard release.
-
-We have provided the more details of the CompassBench 2023 in [Doc](docs/zh_cn/advanced_guides/compassbench_intro.md).
-
-Let's look forward to the release of the OpenCompass 2023 LLM Annual Leaderboard!
+> \[!IMPORTANT\]
+>
+> **Star Us**, You will receive all release notifications from GitHub without any delay ~ ⭐️
+
+<details>
+  <summary><kbd>Star History</kbd></summary>
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&theme=dark&type=Date">
+    <img width="100%" src="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&type=Date">
+  </picture>
+</details>

 ## 🧭 Welcome

@@ -46,21 +53,232 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
 🔥🔥🔥 We are delighted to announce that **the OpenCompass has been recommended by the Meta AI**, click [Get Started](https://ai.meta.com/llama/get-started/#validation) of Llama for more information.

 > **Attention**<br />
-> We launch the OpenCompass Collaboration project, welcome to support diverse evaluation benchmarks into OpenCompass!
-> Clike [Issue](https://github.com/open-compass/opencompass/issues/248) for more information.
-> Let's work together to build a more powerful OpenCompass toolkit!
+> Breaking Change Notice: In version 0.4.0, we are consolidating all AMOTIC configuration files (previously located in ./configs/datasets, ./configs/models, and ./configs/summarizers) into the opencompass package. Users are advised to update their configuration references to reflect this structural change.

 ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>

-- **\[2024.01.17\]** We supported the evaluation of [InternLM2](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_keyset.py) and [InternLM2-Chat](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_chat_keyset.py), InternLM2 showed extremely strong performance in these tests, welcome to try! 🔥🔥🔥.
-- **\[2024.01.17\]** We supported the needle in a haystack test with multiple needles, more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html#id8) 🔥🔥🔥.
-- **\[2023.12.28\]** We have enabled seamless evaluation of all models developed using [LLaMA2-Accessory](https://github.com/Alpha-VLLM/LLaMA2-Accessory), a powerful toolkit for comprehensive LLM development. 🔥🔥🔥.
-- **\[2023.12.22\]** We have released [T-Eval](https://github.com/open-compass/T-Eval), a step-by-step evaluation benchmark to gauge your LLMs on tool utilization. Welcome to our [Leaderboard](https://open-compass.github.io/T-Eval/leaderboard.html) for more details! 🔥🔥🔥.
-- **\[2023.12.10\]** We have released [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), a toolkit for evaluating vision-language models (VLMs), currently support 20+ VLMs and 7 multi-modal benchmarks (including MMBench series).
-- **\[2023.12.10\]** We have supported Mistral AI's MoE LLM: **Mixtral-8x7B-32K**. Welcome to [MixtralKit](https://github.com/open-compass/MixtralKit) for more details about inference and evaluation.
+- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, a flexible evaluation mechanism that allows multiple evaluators to work in sequence. This enables creating customized evaluation pipelines for complex assessment scenarios. Check out the [documentation](docs/en/advanced_guides/llm_judge.md) for more details! 🔥🔥🔥
+- **\[2025.03.11\]** We have supported evaluation for `SuperGPQA` which is a great benchmark for measuring LLM knowledge ability 🔥🔥🔥
+- **\[2025.02.28\]** We have added a tutorial for `DeepSeek-R1` series model, please check [Evaluating Reasoning Model](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
+- **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHVerifyEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥
+- **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks.
+- **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](examples/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it.
+- **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](examples/eval_musr.py) and give it a spin! 🔥🔥🔥
+- **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](examples/eval_babilong.py) and give it a try! 🔥🔥🔥
+- **\[2024.10.14\]** We now support the OpenAI multilingual QA dataset [MMMLU](https://huggingface.co/datasets/openai/MMMLU). Feel free to give it a try! 🔥🔥🔥
+- **\[2024.09.19\]** We now support [Qwen2.5](https://huggingface.co/Qwen)(0.5B to 72B) with multiple backend(huggingface/vllm/lmdeploy). Feel free to give them a try! 🔥🔥🔥
+- **\[2024.09.17\]** We now support OpenAI o1(`o1-mini-2024-09-12` and `o1-preview-2024-09-12`). Feel free to give them a try! 🔥🔥🔥
+- **\[2024.09.05\]** We now support answer extraction through model post-processing to provide a more accurate representation of the model's capabilities. As part of this update, we have integrated [XFinder](https://github.com/IAAR-Shanghai/xFinder) as our first post-processing model. For more detailed information, please refer to the [documentation](opencompass/utils/postprocessors/xfinder/README.md), and give it a try! 🔥🔥🔥
+- **\[2024.08.20\]** OpenCompass now supports the [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists. 🔥🔥🔥
+- **\[2024.08.16\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [RULER](https://arxiv.org/pdf/2404.06654). RULER provides an evaluation of long-context including retrieval, multi-hop tracing, aggregation, and question answering through flexible configurations. Check out the [RULER](configs/datasets/ruler/README.md) evaluation config now! 🔥🔥🔥
+- **\[2024.08.09\]** We have released the example data and configuration for the CompassBench-202408, welcome to [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) for more details. 🔥🔥🔥
+- **\[2024.08.01\]** We supported the [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) models. Welcome to try! 🔥🔥🔥
+- **\[2024.07.23\]** We supported the [ModelScope](www.modelscope.cn) datasets, you can load them on demand without downloading all the data to your local disk. Welcome to try! 🔥🔥🔥
+- **\[2024.07.17\]** We are excited to announce the release of NeedleBench's [technical report](http://arxiv.org/abs/2407.11963). We invite you to visit our [support documentation](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html) for detailed evaluation guidelines. 🔥🔥🔥
+- **\[2024.07.04\]** OpenCompass now supports InternLM2.5, which has **outstanding reasoning capability**, **1M Context window and** and **stronger tool use**, you can try the models in [OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) and [InternLM](https://github.com/InternLM/InternLM) .🔥🔥🔥.
+- **\[2024.06.20\]** OpenCompass now supports one-click switching between inference acceleration backends, enhancing the efficiency of the evaluation process. In addition to the default HuggingFace inference backend, it now also supports popular backends [LMDeploy](https://github.com/InternLM/lmdeploy) and [vLLM](https://github.com/vllm-project/vllm). This feature is available via a simple command-line switch and through deployment APIs. For detailed usage, see the [documentation](docs/en/advanced_guides/accelerator_intro.md).🔥🔥🔥.

 > [More](docs/en/notes/news.md)

+## 📊 Leaderboard
+
+We provide [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.
+
+You can also refer to [CompassAcademic](configs/eval_academic_leaderboard_202412.py) to quickly reproduce the leaderboard results. The currently selected datasets include Knowledge Reasoning (MMLU-Pro/GPQA Diamond), Logical Reasoning (BBH), Mathematical Reasoning (MATH-500, AIME), Code Generation (LiveCodeBench, HumanEval), and Instruction Following (IFEval)."
+
+<p align="right"><a href="#top">🔝Back to top</a></p>
+
+## 🛠️ Installation
+
+Below are the steps for quick installation and datasets preparation.
+
+### 💻 Environment Setup
+
+We highly recommend using conda to manage your python environment.
+
+- #### Create your virtual environment
+
+  ```bash
+  conda create --name opencompass python=3.10 -y
+  conda activate opencompass
+  ```
+
+- #### Install OpenCompass via pip
+
+  ```bash
+  pip install -U opencompass
+
+  ## Full installation (with support for more datasets)
+  # pip install "opencompass[full]"
+
+  ## Environment with model acceleration frameworks
+  ## Manage different acceleration frameworks using virtual environments
+  ## since they usually have dependency conflicts with each other.
+  # pip install "opencompass[lmdeploy]"
+  # pip install "opencompass[vllm]"
+
+  ## API evaluation (i.e. Openai, Qwen)
+  # pip install "opencompass[api]"
+  ```
+
+- #### Install OpenCompass from source
+
+  If you want to use opencompass's latest features, or develop new features, you can also build it from source
+
+  ```bash
+  git clone https://github.com/open-compass/opencompass opencompass
+  cd opencompass
+  pip install -e .
+  # pip install -e ".[full]"
+  # pip install -e ".[vllm]"
+  ```
+
+### 📂 Data Preparation
+
+You can choose one for the following method to prepare datasets.
+
+#### Offline Preparation
+
+You can download and extract the datasets with the following commands:
+
+```bash
+# Download dataset to data/ folder
+wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
+unzip OpenCompassData-core-20240207.zip
+```
+
+#### Automatic Download from OpenCompass
+
+We have supported download datasets automatic from the OpenCompass storage server. You can run the evaluation with extra `--dry-run` to download these datasets.
+Currently, the supported datasets are listed in [here](https://github.com/open-compass/opencompass/blob/main/opencompass/utils/datasets_info.py#L259). More datasets will be uploaded recently.
+
+#### (Optional) Automatic Download with ModelScope
+
+Also you can use the [ModelScope](www.modelscope.cn) to load the datasets on demand.
+
+Installation:
+
+```bash
+pip install modelscope[framework]
+export DATASET_SOURCE=ModelScope
+```
+
+Then submit the evaluation task without downloading all the data to your local disk. Available datasets include:
+
+```bash
+humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ceval, math, LCSTS, Xsum, winogrande, openbookqa, AGIEval, gsm8k, nq, race, siqa, mbpp, mmlu, hellaswag, ARC, BBH, xstory_cloze, summedits, GAOKAO-BENCH, OCNLI, cmnli
+```
+
+Some third-party features, like Humaneval and Llama, may require additional steps to work properly, for detailed steps please refer to the [Installation Guide](https://opencompass.readthedocs.io/en/latest/get_started/installation.html).
+
+<p align="right"><a href="#top">🔝Back to top</a></p>
+
+## 🏗️ ️Evaluation
+
+After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared. Now you can start your first evaluation using OpenCompass!
+
+### Your first evaluation with OpenCompass!
+
+OpenCompass support setting your configs via CLI or a python script. For simple evaluation settings we recommend using CLI, for more complex evaluation, it is suggested using the script way. You can find more example scripts under the configs folder.
+
+```bash
+# CLI
+opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen
+
+# Python scripts
+opencompass examples/eval_chat_demo.py
+```
+
+You can find more script examples under [examples](./examples) folder.
+
+### API evaluation
+
+OpenCompass, by its design, does not really discriminate between open-source models and API models. You can evaluate both model types in the same way or even in one settings.
+
+```bash
+export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
+# CLI
+opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen
+
+# Python scripts
+opencompass examples/eval_api_demo.py
+
+# You can use o1_mini_2024_09_12/o1_preview_2024_09_12 for o1 models, we set max_completion_tokens=8192 as default.
+```
+
+### Accelerated Evaluation
+
+Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:
+
+```bash
+# CLI
+opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
+
+# Python scripts
+opencompass examples/eval_lmdeploy_demo.py
+```
+
+### Supported Models and Datasets
+
+OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
+
+```bash
+# List all configurations
+python tools/list_configs.py
+# List all configurations related to llama and mmlu
+python tools/list_configs.py llama mmlu
+```
+
+#### Supported Models
+
+If the model is not on the list but supported by Huggingface AutoModel class or encapsulation of inference engine based on OpenAI interface (see [docs](https://opencompass.readthedocs.io/en/latest/advanced_guides/new_model.html) for details), you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists.
+
+```bash
+opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
+```
+
+#### Supported Datasets
+
+Currently, OpenCompass have provided standard recommended configurations for datasets. Generally, config files ending with `_gen.py` or `_llm_judge_gen.py` will point to the recommended config we provide for this dataset. You can refer to [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for more details.
+
+```bash
+# Recommended Evaluation Config based on Rules
+opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat
+
+# Recommended Evaluation Config based on LLM Judge
+opencompass --datasets aime2024_llmjudge_gen --models hf_internlm2_5_1_8b_chat
+```
+
+If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`.
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
+```
+
+> \[!TIP\]
+>
+> `--hf-num-gpus` is used for model parallel(huggingface format), `--max-num-worker` is used for data parallel.
+
+> \[!TIP\]
+>
+> configuration with `_ppl` is designed for base model typically.
+> configuration with `_gen` can be used for both base model and chat model.
+
+Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/en/latest/get_started/quick_start.html) to learn how to run an evaluation task.
+
+<p align="right"><a href="#top">🔝Back to top</a></p>
+
+## 📣 OpenCompass 2.0
+
+We are thrilled to introduce OpenCompass 2.0, an advanced suite featuring three key components: [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home).
+![oc20](https://github.com/tonysy/opencompass/assets/7881589/90dbe1c0-c75b-4269-b7a7-dc9c1d3c2483)
+
+**CompassRank** has been significantly enhanced into the leaderboards that now incorporates both open-source benchmarks and proprietary benchmarks. This upgrade allows for a more comprehensive evaluation of models across the industry.
+
+**CompassHub** presents a pioneering benchmark browser interface, designed to simplify and expedite the exploration and utilization of an extensive array of benchmarks for researchers and practitioners alike. To enhance the visibility of your own benchmark within the community, we warmly invite you to contribute it to CompassHub. You may initiate the submission process by clicking [here](https://hub.opencompass.org.cn/dataset-submit).
+
+**CompassKit** is a powerful collection of evaluation toolkits specifically tailored for Large Language Models and Large Vision-language Models. It provides an extensive set of tools to assess and measure the performance of these complex models effectively. Welcome to try our toolkits for in your research and products.
+
 ## ✨ Introduction

 ![image](https://github.com/open-compass/opencompass/assets/22607038/f45fe125-4aed-4f8c-8fe8-df4efb41a8ea)
@ -77,350 +295,15 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide
|
|||||||
|
|
||||||
- **Experiment management and reporting mechanism**: Use config files to fully record each experiment, and support real-time reporting of results.
|
- **Experiment management and reporting mechanism**: Use config files to fully record each experiment, and support real-time reporting of results.
|
||||||
|
|
||||||
## 📊 Leaderboard
|
|
||||||
|
|
||||||
We provide [OpenCompass Leaderboard](https://opencompass.org.cn/rank) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `opencompass@pjlab.org.cn`.
|
|
||||||
|
|
||||||
<p align="right"><a href="#top">🔝Back to top</a></p>
|
|
||||||
|
|
||||||
## 🛠️ Installation
|
|
||||||
|
|
||||||
Below are the steps for quick installation and datasets preparation.
|
|
||||||
|
|
||||||
### 💻 Environment Setup
|
|
||||||
|
|
||||||
#### Open-source Models with GPU
|
|
||||||
|
|
||||||
```bash
|
|
||||||
conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y
|
|
||||||
conda activate opencompass
|
|
||||||
git clone https://github.com/open-compass/opencompass opencompass
|
|
||||||
cd opencompass
|
|
||||||
pip install -e .
|
|
||||||
```
|
|
||||||
|
|
||||||
#### API Models with CPU-only
|
|
||||||
|
|
||||||
```bash
|
|
||||||
conda create -n opencompass python=3.10 pytorch torchvision torchaudio cpuonly -c pytorch -y
|
|
||||||
conda activate opencompass
|
|
||||||
git clone https://github.com/open-compass/opencompass opencompass
|
|
||||||
cd opencompass
|
|
||||||
pip install -e .
|
|
||||||
# also please install requiresments packages via `pip install -r requirements/api.txt` for API models if needed.
|
|
||||||
```
|
|
||||||
|
|
||||||
### 📂 Data Preparation
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Download dataset to data/ folder
|
|
||||||
wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
|
|
||||||
unzip OpenCompassData-core-20231110.zip
|
|
||||||
```
|
|
||||||
|
|
||||||
Some third-party features, like Humaneval and Llama, may require additional steps to work properly, for detailed steps please refer to the [Installation Guide](https://opencompass.readthedocs.io/en/latest/get_started/installation.html).
|
|
||||||
|
|
||||||
<p align="right"><a href="#top">🔝Back to top</a></p>
|
|
||||||
|
|
||||||
## 🏗️ ️Evaluation
|
|
||||||
|
|
||||||
After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared, you can evaluate the performance of the LLaMA-7b model on the MMLU and C-Eval datasets using the following command:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl
|
|
||||||
```
|
|
||||||
|
|
||||||
OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# List all configurations
|
|
||||||
python tools/list_configs.py
|
|
||||||
# List all configurations related to llama and mmlu
|
|
||||||
python tools/list_configs.py llama mmlu
|
|
||||||
```
|
|
||||||
|
|
||||||
You can also evaluate other HuggingFace models via command line. Taking LLaMA-7b as an example:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python run.py --datasets ceval_ppl mmlu_ppl \
|
|
||||||
--hf-path huggyllama/llama-7b \ # HuggingFace model path
|
|
||||||
--model-kwargs device_map='auto' \ # Arguments for model construction
|
|
||||||
--tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # Arguments for tokenizer construction
|
|
||||||
--max-out-len 100 \ # Maximum number of tokens generated
|
|
||||||
--max-seq-len 2048 \ # Maximum sequence length the model can accept
|
|
||||||
--batch-size 8 \ # Batch size
|
|
||||||
--no-batch-padding \ # Don't enable batch padding, infer through for loop to avoid performance loss
|
|
||||||
--num-gpus 1 # Number of minimum required GPUs
|
|
||||||
```
|
|
||||||
|
|
||||||
> **Note**<br />
|
|
||||||
> To run the command above, you will need to remove the comments starting from `# ` first.
|
|
||||||
|
|
||||||
Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/en/latest/get_started/quick_start.html) to learn how to run an evaluation task.
|
|
||||||
|
|
||||||
<p align="right"><a href="#top">🔝Back to top</a></p>
|
|
||||||
|
|
||||||
## 📖 Dataset Support

We have supported a statistical list of all datasets that can be used on this platform in the documentation on the OpenCompass website.

You can quickly find the dataset you need from the list through sorting, filtering, and searching functions.

In addition, we provide a recommended configuration for each dataset, and some datasets also support LLM Judge-based configurations.

Please refer to the dataset statistics chapter of [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for details.

<table align="center">
  <tbody>
    <tr align="center" valign="bottom">
      <td>
        <b>Language</b>
      </td>
      <td>
        <b>Knowledge</b>
      </td>
      <td>
        <b>Reasoning</b>
      </td>
      <td>
        <b>Examination</b>
      </td>
    </tr>
    <tr valign="top">
      <td>
<details open>
<summary><b>Word Definition</b></summary>

- WiC
- SummEdits

</details>

<details open>
<summary><b>Idiom Learning</b></summary>

- CHID

</details>

<details open>
<summary><b>Semantic Similarity</b></summary>

- AFQMC
- BUSTM

</details>

<details open>
<summary><b>Coreference Resolution</b></summary>

- CLUEWSC
- WSC
- WinoGrande

</details>

<details open>
<summary><b>Translation</b></summary>

- Flores
- IWSLT2017

</details>

<details open>
<summary><b>Multi-language Question Answering</b></summary>

- TyDi-QA
- XCOPA

</details>

<details open>
<summary><b>Multi-language Summary</b></summary>

- XLSum

</details>
      </td>
      <td>
<details open>
<summary><b>Knowledge Question Answering</b></summary>

- BoolQ
- CommonSenseQA
- NaturalQuestions
- TriviaQA

</details>
      </td>
      <td>
<details open>
<summary><b>Textual Entailment</b></summary>

- CMNLI
- OCNLI
- OCNLI_FC
- AX-b
- AX-g
- CB
- RTE
- ANLI

</details>

<details open>
<summary><b>Commonsense Reasoning</b></summary>

- StoryCloze
- COPA
- ReCoRD
- HellaSwag
- PIQA
- SIQA

</details>

<details open>
<summary><b>Mathematical Reasoning</b></summary>

- MATH
- GSM8K

</details>

<details open>
<summary><b>Theorem Application</b></summary>

- TheoremQA
- StrategyQA
- SciBench

</details>

<details open>
<summary><b>Comprehensive Reasoning</b></summary>

- BBH

</details>
      </td>
      <td>
<details open>
<summary><b>Junior High, High School, University, Professional Examinations</b></summary>

- C-Eval
- AGIEval
- MMLU
- GAOKAO-Bench
- CMMLU
- ARC
- Xiezhi

</details>

<details open>
<summary><b>Medical Examinations</b></summary>

- CMB

</details>
      </td>
    </tr>
  </tbody>
  <tbody>
    <tr align="center" valign="bottom">
      <td>
        <b>Understanding</b>
      </td>
      <td>
        <b>Long Context</b>
      </td>
      <td>
        <b>Safety</b>
      </td>
      <td>
        <b>Code</b>
      </td>
    </tr>
    <tr valign="top">
      <td>
<details open>
<summary><b>Reading Comprehension</b></summary>

- C3
- CMRC
- DRCD
- MultiRC
- RACE
- DROP
- OpenBookQA
- SQuAD2.0

</details>

<details open>
<summary><b>Content Summary</b></summary>

- CSL
- LCSTS
- XSum
- SummScreen

</details>

<details open>
<summary><b>Content Analysis</b></summary>

- EPRSTMT
- LAMBADA
- TNEWS

</details>
      </td>
      <td>
<details open>
<summary><b>Long Context Understanding</b></summary>

- LEval
- LongBench
- GovReports
- NarrativeQA
- Qasper

</details>
      </td>
      <td>
<details open>
<summary><b>Safety</b></summary>

- CivilComments
- CrowsPairs
- CValues
- JigsawMultilingual
- TruthfulQA

</details>

<details open>
<summary><b>Robustness</b></summary>

- AdvGLUE

</details>
      </td>
      <td>
<details open>
<summary><b>Code</b></summary>

- HumanEval
- HumanEvalX
- MBPP
- APPs
- DS1000

</details>
      </td>
    </tr>
  </tbody>
</table>

## OpenCompass Ecosystem

<p align="right"><a href="#top">🔝Back to top</a></p>

<tr valign="top">
<td>

- [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
- [Baichuan](https://github.com/baichuan-inc)
- [BlueLM](https://github.com/vivo-ai-lab/BlueLM)
- [ChatGLM2](https://github.com/THUDM/ChatGLM2-6B)
- [ChatGLM3](https://github.com/THUDM/ChatGLM3-6B)
- [Gemma](https://huggingface.co/google/gemma-7b)
- [InternLM](https://github.com/InternLM/InternLM)
- [LLaMA](https://github.com/facebookresearch/llama)
- [LLaMA3](https://github.com/meta-llama/llama3)
- [Qwen](https://github.com/QwenLM/Qwen)
- [TigerBot](https://github.com/TigerResearch/TigerBot)
- [Vicuna](https://github.com/lm-sys/FastChat)
- [WizardLM](https://github.com/nlpxucan/WizardLM)
- [Yi](https://github.com/01-ai/Yi)
- ...

</td>
<td>

- OpenAI
- Gemini
- Claude
- ZhipuAI(ChatGLM)
- Baichuan
## 🔜 Roadmap

- [x] Subjective Evaluation
  - [x] Release CompassArena.
  - [x] Subjective evaluation dataset.
- [x] Long-context
  - [x] Long-context evaluation with extensive datasets.
  - [ ] Long-context leaderboard.
- [x] Coding
  - [ ] Coding evaluation leaderboard.
  - [x] Non-Python language evaluation service.
- [x] Agent
  - [ ] Support various agent frameworks.
  - [x] Evaluation of tool use of the LLMs.
- [x] Robustness
  - [x] Support various attack methods.

## 👷‍♂️ Contributing

We appreciate all contributions to improving OpenCompass. Please refer to the [contributing guideline](https://opencompass.readthedocs.io/en/latest/notes/contribution_guide.html) for the best practice.

<!-- Copy-paste in your Readme.md file -->

<!-- Made with [OSS Insight](https://ossinsight.io/) -->

<a href="https://github.com/open-compass/opencompass/graphs/contributors" target="_blank">
  <table>
    <tr>
      <th colspan="2">
        <br><img src="https://contrib.rocks/image?repo=open-compass/opencompass"><br><br>
      </th>
    </tr>
  </table>
</a>

## 🤝 Acknowledgements

Some code in this project is cited and modified from [OpenICL](https://github.com/Shark-NLP/OpenICL).

Some datasets and prompt implementations are modified from [chain-of-thought-hub](https://github.com/FranxYao/chain-of-thought-hub).

<p align="right"><a href="#top">🔝Back to top</a></p>

[github-contributors-link]: https://github.com/open-compass/opencompass/graphs/contributors
[github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/opencompass?color=c4f042&labelColor=black&style=flat-square
[github-forks-link]: https://github.com/open-compass/opencompass/network/members
[github-forks-shield]: https://img.shields.io/github/forks/open-compass/opencompass?color=8ae8ff&labelColor=black&style=flat-square
[github-issues-link]: https://github.com/open-compass/opencompass/issues
[github-issues-shield]: https://img.shields.io/github/issues/open-compass/opencompass?color=ff80eb&labelColor=black&style=flat-square
[github-license-link]: https://github.com/open-compass/opencompass/blob/main/LICENSE
[github-license-shield]: https://img.shields.io/github/license/open-compass/opencompass?color=white&labelColor=black&style=flat-square
[github-release-link]: https://github.com/open-compass/opencompass/releases
[github-release-shield]: https://img.shields.io/github/v/release/open-compass/opencompass?color=369eff&labelColor=black&logo=github&style=flat-square
[github-releasedate-link]: https://github.com/open-compass/opencompass/releases
[github-releasedate-shield]: https://img.shields.io/github/release-date/open-compass/opencompass?labelColor=black&style=flat-square
[github-stars-link]: https://github.com/open-compass/opencompass/stargazers
[github-stars-shield]: https://img.shields.io/github/stars/open-compass/opencompass?color=ffcb47&labelColor=black&style=flat-square
[github-trending-shield]: https://trendshift.io/api/badge/repositories/6630
[github-trending-url]: https://trendshift.io/repositories/6630

README_zh-CN.md

<br />
<br />

[![][github-release-shield]][github-release-link]
[![][github-releasedate-shield]][github-releasedate-link]
[![][github-contributors-shield]][github-contributors-link]<br>
[![][github-forks-shield]][github-forks-link]
[![][github-stars-shield]][github-stars-link]
[![][github-issues-shield]][github-issues-link]
[![][github-license-shield]][github-license-link]

<!-- [![PyPI](https://badge.fury.io/py/opencompass.svg)](https://pypi.org/project/opencompass/) -->

[🌐Website](https://opencompass.org.cn/) |
[📖Dataset Community](https://hub.opencompass.org.cn/home) |
[📊Leaderboard](https://rank.opencompass.org.cn/home) |
[📘Documentation](https://opencompass.readthedocs.io/zh_CN/latest/index.html) |
[🛠️Installation](https://opencompass.readthedocs.io/zh_CN/latest/get_started/installation.html) |
[🤔Reporting Issues](https://github.com/open-compass/opencompass/issues/new/choose)

[English](/README.md) | 简体中文

[![][github-trending-shield]][github-trending-url]

</div>

<p align="center">
👋 Join our <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=opencompass" target="_blank">WeChat community</a>
</p>

> \[!IMPORTANT\]
>
> **Star the project** and you will be notified of OpenCompass's latest updates right away! ⭐️

<details>
<summary><kbd>Star History</kbd></summary>

<picture>
  <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&theme=dark&type=Date">
  <img width="100%" src="https://api.star-history.com/svg?repos=open-compass%2Fopencompass&type=Date">
</picture>

</details>

## 🧭 Welcome

🚩🚩🚩 Welcome to OpenCompass! We are currently **hiring full-time researchers/engineers and interns**. If you are passionate about LLMs and OpenCompass, feel free to contact us via [email](mailto:zhangsongyang@pjlab.org.cn). We look forward to hearing from you!

🔥🔥🔥 We are delighted that **OpenCompass has been officially recommended by Meta AI as a standard evaluation tool for large models**. Click the Llama [get-started page](https://ai.meta.com/llama/get-started/#validation) for more information.

> **Note**<br />
> Important notice: starting from v0.4.0, all AMOTIC configuration files under ./configs/datasets, ./configs/models, and ./configs/summarizers will be migrated into the opencompass package. Please update your configuration file paths in time.

## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>

- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, which lets multiple evaluators work in sequence so that custom evaluation pipelines can be built for more complex scenarios. See the [documentation](docs/zh_cn/advanced_guides/llm_judge.md) for usage! 🔥🔥🔥
- **\[2025.03.11\]** `SuperGPQA`, a knowledge benchmark covering 285 graduate-level disciplines, is now supported. Give it a try! 🔥🔥🔥
- **\[2025.02.28\]** We added a tutorial for the `DeepSeek-R1` series of models; see [Evaluating Reasoning Models](docs/zh_cn/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
- **\[2025.02.15\]** We added two practical evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluation and `MATHVerifyEvaluator` for mathematical reasoning evaluation. See the [LLM Judge](docs/zh_cn/advanced_guides/llm_judge.md) and [Math Evaluation](docs/zh_cn/advanced_guides/general_math.md) documentation for more details! 🔥🔥🔥
- **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model, which achieves the best performance in its size class on reasoning and knowledge tasks. Give it a try.
- **\[2024.12.17\]** We provide the December CompassAcademic leaderboard evaluation script [CompassAcademic](configs/eval_academic_leaderboard_202412.py); you can reproduce the official evaluation results with a simple configuration.
- **\[2024.10.14\]** OpenAI's multilingual QA dataset [MMMLU](https://huggingface.co/datasets/openai/MMMLU) is now supported. Give it a try! 🔥🔥🔥
- **\[2024.09.19\]** [Qwen2.5](https://huggingface.co/Qwen) (0.5B to 72B) is now supported with multiple inference backends (huggingface/vllm/lmdeploy). Give it a try! 🔥🔥🔥
- **\[2024.09.05\]** The OpenAI o1 models (`o1-mini-2024-09-12` and `o1-preview-2024-09-12`) are now supported. Give them a try! 🔥🔥🔥
- **\[2024.09.05\]** OpenCompass now supports answer extraction through model post-processing to present model capabilities more accurately. As part of this update, we integrated [XFinder](https://github.com/IAAR-Shanghai/xFinder) as the first post-processing model. See the [documentation](opencompass/utils/postprocessors/xfinder/README.md) for details, and give it a try! 🔥🔥🔥
- **\[2024.08.20\]** OpenCompass now supports [SciCode](https://github.com/scicode-bench/SciCode): A Research Coding Benchmark Curated by Scientists. 🔥🔥🔥
- **\[2024.08.16\]** OpenCompass now supports [RULER](https://arxiv.org/pdf/2404.06654), a brand-new long-context language-model evaluation benchmark. Through flexible configuration, RULER evaluates long-context task types including retrieval, multi-hop tracing, aggregation, and question answering; see [RULER](configs/datasets/ruler/README.md). 🔥🔥🔥
- **\[2024.07.23\]** We support the [Gemma2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315) models. Give them a try! 🔥🔥🔥
- **\[2024.07.23\]** We support [ModelScope](www.modelscope.cn) datasets, which can be loaded on demand without downloading all the data locally first. Give it a try! 🔥🔥🔥
- **\[2024.07.17\]** We released the sample data and evaluation rules of the CompassBench-202407 leaderboard; visit [CompassBench](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/compassbench_intro.html) for more information. 🔥🔥🔥
- **\[2024.07.17\]** We officially released the NeedleBench [technical report](http://arxiv.org/abs/2407.11963). You are welcome to visit our [documentation](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/needleinahaystack_eval.html) for evaluation. 🔥🔥🔥
- **\[2024.07.04\]** OpenCompass now supports InternLM2.5, which features outstanding reasoning performance, effective support for million-token ultra-long contexts, and an overall upgrade of tool-calling capability; see [OpenCompass Config](https://github.com/open-compass/opencompass/tree/main/configs/models/hf_internlm) and [InternLM](https://github.com/InternLM/InternLM). 🔥🔥🔥
- **\[2024.06.20\]** OpenCompass now supports one-click switching of accelerated inference backends, making evaluation more efficient. Besides the default HuggingFace backend, the commonly used [LMDeploy](https://github.com/InternLM/lmdeploy) and [vLLM](https://github.com/vllm-project/vllm) are supported, either via a one-click command-line switch or by deploying an accelerated API service; see the [documentation](docs/zh_cn/advanced_guides/accelerator_intro.md). Give it a try! 🔥🔥🔥

> [More](docs/zh_cn/notes/news.md)

## 📊 Leaderboard

We will keep providing performance leaderboards for open-source and API models; see the [OpenCompass Leaderboard](https://rank.opencompass.org.cn/home). To join the evaluation, please provide a model repository URL or a standard API interface to `opencompass@pjlab.org.cn`.

You can also refer to [CompassAcademic](configs/eval_academic_leaderboard_202412.py) to quickly reproduce the leaderboard results. The currently selected datasets cover comprehensive knowledge reasoning (MMLU-Pro/GPQA Diamond), logical reasoning (BBH), mathematical reasoning (MATH-500, AIME), code generation (LiveCodeBench, HumanEval), and instruction following (IFEval).
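
A sketch of reproducing that run — assuming the config file can be passed directly to the `opencompass` entry point, as with the script examples later in this README:

```bash
# Reproduce the December 2024 CompassAcademic leaderboard results from the
# bundled config; direct invocation of this config is an assumption here.
opencompass configs/eval_academic_leaderboard_202412.py
```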

<p align="right"><a href="#top">🔝Back to top</a></p>

## 🛠️ Installation Guide

Below are quick steps for installation and dataset preparation.

### 💻 Environment Setup

We strongly recommend using `conda` to manage your Python environment.

- #### Create a virtual environment

  ```bash
  conda create --name opencompass python=3.10 -y
  conda activate opencompass
  ```

- #### Install OpenCompass via pip

  ```bash
  # Supports the vast majority of datasets and models
  pip install -U opencompass

  # Full installation (supports more datasets)
  # pip install "opencompass[full]"

  # Model inference backends. Since these backends often have dependency
  # conflicts, we recommend managing them in separate virtual environments.
  # pip install "opencompass[lmdeploy]"
  # pip install "opencompass[vllm]"

  # API testing (e.g. OpenAI, Qwen)
  # pip install "opencompass[api]"
  ```

- #### Install OpenCompass from source

  If you want to use the latest features of OpenCompass, you can also build it from source:

  ```bash
  git clone https://github.com/open-compass/opencompass opencompass
  cd opencompass
  pip install -e .
  # pip install -e ".[full]"
  # pip install -e ".[vllm]"
  ```
### 📂 Data Preparation

#### Offline download in advance

OpenCompass supports evaluation with local datasets. The datasets can be downloaded and unpacked with:

```bash
# Download the datasets to the data/ directory
wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
unzip OpenCompassData-core-20240207.zip
```

#### Automatic download from OpenCompass

We support automatic downloading of datasets from the OpenCompass storage server. You can run an evaluation with the extra `--dry-run` argument to download these datasets.
The list of currently supported datasets is [here](https://github.com/open-compass/opencompass/blob/main/opencompass/utils/datasets_info.py#L259). More datasets will be uploaded soon.
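
As a sketch of such a dry run — the demo model and dataset configs below are borrowed from the examples elsewhere in this README, purely for illustration:

```bash
# --dry-run fetches the required datasets from the OpenCompass storage
# server; rerun without the flag to perform the actual evaluation.
opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen --dry-run
```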

#### (Optional) Automatic download with ModelScope

Alternatively, you can use [ModelScope](www.modelscope.cn) to load the datasets.
Environment setup:

```bash
pip install modelscope
export DATASET_SOURCE=ModelScope
```

With the environment configured, you can submit evaluation tasks directly, without downloading all the data beforehand. The datasets currently supported are:

```bash
humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ceval, math, LCSTS, Xsum, winogrande, openbookqa, AGIEval, gsm8k, nq, race, siqa, mbpp, mmlu, hellaswag, ARC, BBH, xstory_cloze, summedits, GAOKAO-BENCH, OCNLI, cmnli
```

Some third-party features, such as Humaneval and Llama, may require extra steps to work properly; see the [installation guide](https://opencompass.readthedocs.io/zh_CN/latest/get_started/installation.html) for details.

<p align="right"><a href="#top">🔝Back to top</a></p>

## 🏗️ Evaluation

After ensuring that OpenCompass is installed correctly and the datasets are prepared according to the steps above, you can start your first evaluation with OpenCompass!

- ### Your first evaluation

  OpenCompass supports setting up configurations via the command-line interface (CLI) or Python scripts. For simple evaluation setups we recommend the CLI; for more complex evaluations, the script-based approach is recommended.

  ```bash
  # Command-line interface (CLI)
  opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen

  # Python script
  opencompass examples/eval_chat_demo.py
  ```

  You can find more script examples in the [examples](./examples) folder.

- ### API evaluation

  OpenCompass makes no distinction between open-source models and API models by design. You can evaluate both kinds of models in the same way, or even in the same setup.

  ```bash
  export OPENAI_API_KEY="YOUR_OPEN_API_KEY"
  # Command-line interface (CLI)
  opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen

  # Python script
  opencompass examples/eval_api_demo.py

  # The o1_mini_2024_09_12/o1_preview_2024_09_12 models are now supported;
  # max_completion_tokens=8192 by default.
  ```

- ### Inference backends

  Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Make sure you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/zh_cn/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy:

  ```bash
  opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy
  ```

- ### Supported models and datasets

  OpenCompass predefines configurations for many models and datasets. You can list all available model and dataset configurations with the [tools](./docs/zh_cn/tools.md#ListConfigs).

  ```bash
  # List all configurations
  python tools/list_configs.py
  # List all configurations related to llama and mmlu
  python tools/list_configs.py llama mmlu
  ```

  #### Supported models

  If a model is not on the list but is supported by the Huggingface AutoModel class, or provides an inference-engine wrapper with an OpenAI-compatible interface (see the [official docs](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/new_model.html)), you can still evaluate it with OpenCompass. You are welcome to contribute to the list of models and datasets maintained by OpenCompass.

  ```bash
  opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat
  ```

  #### Supported datasets

  Currently, OpenCompass provides standard recommended configurations for the datasets. In general, configuration files ending in `_gen.py` or `_llm_judge_gen.py` point to the recommended configuration we provide for that dataset. See the dataset statistics chapter of the [official docs](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) for details.

  ```bash
  # Recommended rule-based configuration
  opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat

  # Recommended LLM-Judge-based configuration
  opencompass --datasets aime2024_llmjudge_gen --models hf_internlm2_5_1_8b_chat
  ```

  Additionally, if you want to run inference with a model on multiple GPUs, you can use the `--max-num-worker` argument.

  ```bash
  CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2
  ```

> \[!TIP\]
>
> `--hf-num-gpus` is used for model parallelism (huggingface format), while `--max-num-worker` is used for data parallelism.
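
Following the tip above, a sketch of combining the two flags — the 4-GPU layout below is an assumed example for illustration, not taken from the docs:

```bash
# 4 GPUs total: --hf-num-gpus 2 shards the model across 2 GPUs per worker
# (model parallelism), while --max-num-worker 2 runs 2 such workers in
# parallel over the dataset (data parallelism).
CUDA_VISIBLE_DEVICES=0,1,2,3 opencompass --datasets demo_gsm8k_chat_gen \
    --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat \
    --hf-num-gpus 2 --max-num-worker 2
```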

> \[!TIP\]
>
> Configurations with `_ppl` are typically designed for base models.
> Configurations with `_gen` can be used for both base models and chat models.

Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/zh_CN/latest/get_started/quick_start.html) to learn how to run an evaluation task.

For more tutorials, see our [documentation](https://opencompass.readthedocs.io/zh_CN/latest/index.html).

<p align="right"><a href="#top">🔝Back to top</a></p>

## 📣 OpenCompass 2.0

We are delighted to release OpenCompass 2.0, a large-model evaluation system built around three core modules: [CompassKit](https://github.com/open-compass), [CompassHub](https://hub.opencompass.org.cn/home), and [CompassRank](https://rank.opencompass.org.cn/home).

**CompassRank** has been significantly revamped into a comprehensive leaderboard system that covers both open-source benchmarks and proprietary ones, greatly broadening the possibilities for thorough, in-depth evaluation of models across the industry.

**CompassHub** introduces a benchmark navigation platform designed to simplify and speed up how researchers and industry practitioners search for and use benchmarks across a diverse collection. To help distinctive benchmarks reach a wider audience, we warmly welcome contributions of custom benchmark data to CompassHub; submissions can be started with a few clicks [here](https://hub.opencompass.org.cn/dataset-submit).

**CompassKit** is a collection of powerful evaluation toolkits built for large language models and large vision-language models, providing comprehensive tooling to measure and assess these complex models accurately and scientifically. We sincerely invite you to try our toolkits in your research and product development.

## ✨ Introduction



OpenCompass is a one-stop platform for large-model evaluation. Its main features are as follows:

- **Flexible extension**: Want to add a new model or dataset? Want to customize more advanced task-partitioning strategies, or even integrate a new cluster-management system? Everything in OpenCompass can be easily extended!

## 📊 Leaderboard

We will keep providing performance leaderboards for open-source and API models; see the [OpenCompass Leaderboard](https://opencompass.org.cn/rank). To join the evaluation, please provide a model repository URL or a standard API interface to `opencompass@pjlab.org.cn`.

<p align="right"><a href="#top">🔝Back to top</a></p>

## 🛠️ Installation

Below are the steps for quick installation and dataset preparation.

### 💻 Environment Setup

#### GPU environment for open-source models

```bash
conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y
conda activate opencompass
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
```

#### CPU environment for API model testing

```bash
conda create -n opencompass python=3.10 pytorch torchvision torchaudio cpuonly -c pytorch -y
conda activate opencompass
git clone https://github.com/open-compass/opencompass opencompass
cd opencompass
pip install -e .
# To use the API models, install the API-related dependencies with `pip install -r requirements/api.txt`
```

### 📂 Data Preparation

```bash
# Download the datasets to the data/ directory
wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
unzip OpenCompassData-core-20231110.zip
```

Some third-party features, such as Humaneval and Llama, may require extra steps to work properly; see the [installation guide](https://opencompass.readthedocs.io/zh_CN/latest/get_started/installation.html) for details.

<p align="right"><a href="#top">🔝Back to top</a></p>

## 🏗️ Evaluation

After ensuring that OpenCompass is installed correctly and the datasets are prepared according to the steps above, you can evaluate the performance of the LLaMA-7b model on the MMLU and C-Eval datasets with the following command:

```bash
python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl
```

OpenCompass predefines configurations for many models and datasets. You can list all available model and dataset configurations with the [tools](./docs/zh_cn/tools.md#ListConfigs).

```bash
# List all configurations
python tools/list_configs.py
# List all configurations related to llama and mmlu
python tools/list_configs.py llama mmlu
```

You can also evaluate other HuggingFace models from the command line. Taking LLaMA-7b as an example again:

```bash
python run.py --datasets ceval_ppl mmlu_ppl \
--hf-path huggyllama/llama-7b \  # HuggingFace model path
--model-kwargs device_map='auto' \  # Arguments for model construction
--tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \  # Arguments for tokenizer construction
--max-out-len 100 \  # Maximum number of tokens generated
--max-seq-len 2048 \  # Maximum sequence length the model can accept
--batch-size 8 \  # Batch size
--no-batch-padding \  # Don't enable batch padding; infer through a for loop to avoid accuracy loss
--num-gpus 1  # Minimum number of GPUs required
```

> **Note**<br />
> To run the command above, you will need to remove the comments starting from `# ` first.

Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/zh_CN/latest/get_started/quick_start.html) to learn how to run an evaluation task.

For more tutorials, see our [documentation](https://opencompass.readthedocs.io/zh_CN/latest/index.html).

<p align="right"><a href="#top">🔝Back to top</a></p>

## 📖 Dataset Support

We have supported a statistical list of all datasets that can be used on this platform in the documentation on the OpenCompass website.

You can quickly find the dataset you need from the list through sorting, filtering, and searching functions.

For details, see the dataset statistics chapter of the [official documentation](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html).

<p align="right"><a href="#top">🔝Back to top</a></p>

<tr valign="top">
<td>

- [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
- [Baichuan](https://github.com/baichuan-inc)
- [BlueLM](https://github.com/vivo-ai-lab/BlueLM)
- [ChatGLM2](https://github.com/THUDM/ChatGLM2-6B)
- [ChatGLM3](https://github.com/THUDM/ChatGLM3-6B)
- [Gemma](https://huggingface.co/google/gemma-7b)
- [InternLM](https://github.com/InternLM/InternLM)
- [LLaMA](https://github.com/facebookresearch/llama)
- [LLaMA3](https://github.com/meta-llama/llama3)
- [Qwen](https://github.com/QwenLM/Qwen)
- [TigerBot](https://github.com/TigerResearch/TigerBot)
- [Vicuna](https://github.com/lm-sys/FastChat)
- [WizardLM](https://github.com/nlpxucan/WizardLM)
- [Yi](https://github.com/01-ai/Yi)
- ...

</td>
<td>

- OpenAI
- Gemini
- Claude
- ZhipuAI(ChatGLM)
- Baichuan

## 🔜 Roadmap

- [x] Subjective Evaluation
  - [x] Release the subjective evaluation leaderboard.
  - [x] Release the subjective evaluation datasets.
- [x] Long-context
  - [x] Support a wide range of long-context benchmarks.
  - [ ] Release the long-context leaderboard.
- [x] Coding
  - [ ] Release the coding leaderboard.
  - [x] Provide a non-Python-language evaluation service.
- [x] Agent
  - [ ] Support a rich set of agent frameworks.
  - [x] Release the agent evaluation leaderboard.
- [x] Robustness
  - [x] Support various attack methods.

## 👷‍♂️ Contributing

We appreciate all contributors for their efforts to improve and enhance OpenCompass. Please refer to the [contributing guideline](https://opencompass.readthedocs.io/zh_CN/latest/notes/contribution_guide.html) for guidance on participating in the project.

<a href="https://github.com/open-compass/opencompass/graphs/contributors" target="_blank">
  <table>
    <tr>
      <th colspan="2">
        <br><img src="https://contrib.rocks/image?repo=open-compass/opencompass"><br><br>
      </th>
    </tr>
  </table>
</a>

## 🤝 Acknowledgements

Some code in this project is cited and modified from [OpenICL](https://github.com/Shark-NLP/OpenICL).

<p align="right"><a href="#top">🔝Back to top</a></p>

[github-contributors-link]: https://github.com/open-compass/opencompass/graphs/contributors
[github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/opencompass?color=c4f042&labelColor=black&style=flat-square
[github-forks-link]: https://github.com/open-compass/opencompass/network/members
[github-forks-shield]: https://img.shields.io/github/forks/open-compass/opencompass?color=8ae8ff&labelColor=black&style=flat-square
[github-issues-link]: https://github.com/open-compass/opencompass/issues
[github-issues-shield]: https://img.shields.io/github/issues/open-compass/opencompass?color=ff80eb&labelColor=black&style=flat-square
[github-license-link]: https://github.com/open-compass/opencompass/blob/main/LICENSE
[github-license-shield]: https://img.shields.io/github/license/open-compass/opencompass?color=white&labelColor=black&style=flat-square
[github-release-link]: https://github.com/open-compass/opencompass/releases
[github-release-shield]: https://img.shields.io/github/v/release/open-compass/opencompass?color=369eff&labelColor=black&logo=github&style=flat-square
[github-releasedate-link]: https://github.com/open-compass/opencompass/releases
[github-releasedate-shield]: https://img.shields.io/github/release-date/open-compass/opencompass?labelColor=black&style=flat-square
[github-stars-link]: https://github.com/open-compass/opencompass/stargazers
[github-stars-shield]: https://img.shields.io/github/stars/open-compass/opencompass?color=ffcb47&labelColor=black&style=flat-square
[github-trending-shield]: https://trendshift.io/api/badge/repositories/6630
[github-trending-url]: https://trendshift.io/repositories/6630
@@ -1,43 +0,0 @@
from mmengine.config import read_base
from opencompass.models import AI360GPT
from opencompass.partitioners import NaivePartitioner
from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from ..summarizers.medium import summarizer
    from ..datasets.ceval.ceval_gen import ceval_datasets

datasets = [
    *ceval_datasets,
]

models = [
    dict(
        abbr='360GPT_S2_V9',
        type=AI360GPT,
        path='360GPT_S2_V9',
        key="xxxxxxxxxxxx",
        generation_kwargs={
            'temperature': 0.9,
            'max_tokens': 2048,
            'top_p': 0.5,
            'top_k': 0,
            'repetition_penalty': 1.05,
        },
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalAPIRunner,
        max_num_workers=2,
        concurrent_users=2,
        task=dict(type=OpenICLInferTask)),
)

work_dir = "./output/api_360GPT_S2_V9"
@@ -1,44 +0,0 @@
from mmengine.config import read_base
from opencompass.models import BaiChuan
from opencompass.partitioners import NaivePartitioner
from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from ..summarizers.medium import summarizer
    from ..datasets.ceval.ceval_gen import ceval_datasets

datasets = [
    *ceval_datasets,
]

models = [
    dict(
        abbr='Baichuan2-53B',
        type=BaiChuan,
        path='Baichuan2-53B',
        api_key='xxxxxx',
        secret_key="xxxxx",
        url="xxxxx",
        generation_kwargs={
            'temperature': 0.3,
            'top_p': 0.85,
            'top_k': 5,
            'with_search_enhance': False,
        },
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalAPIRunner,
        max_num_workers=2,
        concurrent_users=2,
        task=dict(type=OpenICLInferTask)),
)

work_dir = "outputs/api_baichuan53b/"
@@ -1,42 +0,0 @@
from mmengine.config import read_base
from opencompass.models import ERNIEBot
from opencompass.partitioners import NaivePartitioner
from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from ..summarizers.medium import summarizer
    from ..datasets.ceval.ceval_gen import ceval_datasets

datasets = [
    *ceval_datasets,
]

models = [
    dict(
        abbr='erniebot',
        type=ERNIEBot,
        path='erniebot',
        key='xxxxxx',  # please provide your key
        secretkey='xxxxxxxxx',  # please provide your secret key
        url='xxxxxxxxx',
        generation_kwargs={
            'temperature': 0.8,
        },
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8
    ),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalAPIRunner,
        max_num_workers=2,
        concurrent_users=2,
        task=dict(type=OpenICLInferTask)),
)

work_dir = "outputs/api_erniebot/"
@@ -1,44 +0,0 @@
from mmengine.config import read_base
from opencompass.models import ByteDance
from opencompass.partitioners import NaivePartitioner
from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    # from .datasets.collections.chat_medium import datasets
    from ..summarizers.medium import summarizer
    from ..datasets.ceval.ceval_gen import ceval_datasets

datasets = [
    *ceval_datasets,
]

models = [
    dict(
        abbr='skylark-pro-public',
        type=ByteDance,
        path='skylark-pro-public',
        accesskey="xxxxxxx",
        secretkey="xxxxxxx",
        url='xxxxxx',
        generation_kwargs={
            'temperature': 0.7,
            'top_p': 0.9,
            'top_k': 0,
        },
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalAPIRunner,
        max_num_workers=2,
        concurrent_users=2,
        task=dict(type=OpenICLInferTask)),
)

work_dir = "outputs/api_bytedance/"
@@ -1,37 +0,0 @@
from mmengine.config import read_base
from opencompass.models import MiniMax
from opencompass.partitioners import NaivePartitioner
from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from ..summarizers.medium import summarizer
    from ..datasets.ceval.ceval_gen import ceval_datasets

datasets = [
    *ceval_datasets,
]

models = [
    dict(
        abbr='minimax_abab5.5-chat',
        type=MiniMax,
        path='abab5.5-chat',
        key='xxxxxxx',  # please provide your key
        group_id='xxxxxxxx',  # please provide your group_id
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalAPIRunner,
        max_num_workers=4,
        concurrent_users=4,
        task=dict(type=OpenICLInferTask)),
)

work_dir = "outputs/api_minimax/"
@@ -1,40 +0,0 @@
from mmengine.config import read_base
from opencompass.models import MoonShot
from opencompass.partitioners import NaivePartitioner
from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from ..summarizers.medium import summarizer
    from ..datasets.ceval.ceval_gen import ceval_datasets

datasets = [
    *ceval_datasets,
]

models = [
    dict(
        abbr='moonshot-v1-32k',
        type=MoonShot,
        path='moonshot-v1-32k',
        key='xxxxxxx',
        url='xxxxxxxx',
        system_prompt='你是 Kimi,由 Moonshot AI 提供的人工智能助手,你更擅长中文和英文的对话。'
                      '你会为用户提供安全,有帮助,准确的回答。同时,你会拒绝一些涉及恐怖主义,种族歧视,'
                      '黄色暴力等问题的回答。Moonshot AI 为专有名词,不可翻译成其他语言。',
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalAPIRunner,
        max_num_workers=4,
        concurrent_users=4,
        task=dict(type=OpenICLInferTask)),
)

work_dir = "outputs/api_moonshot/"
@@ -1,36 +0,0 @@
from mmengine.config import read_base
from opencompass.models import Nanbeige
from opencompass.partitioners import NaivePartitioner
from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from ..summarizers.medium import summarizer
    from ..datasets.ceval.ceval_gen import ceval_datasets

datasets = [
    *ceval_datasets,
]

models = [
    dict(
        abbr='nanbeige-plus',
        type=Nanbeige,
        path='nanbeige-plus',
        key="xxxxxx",
        query_per_second=1,
        max_out_len=2048,
        batch_size=8),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalAPIRunner,
        max_num_workers=2,
        concurrent_users=2,
        task=dict(type=OpenICLInferTask)),
)

work_dir = "./output/nanbeige-plus"
@@ -1,42 +0,0 @@
from mmengine.config import read_base
from opencompass.models import PanGu
from opencompass.partitioners import NaivePartitioner
from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from ..summarizers.medium import summarizer
    from ..datasets.ceval.ceval_gen import ceval_datasets

datasets = [
    *ceval_datasets,
]

models = [
    dict(
        abbr='pangu',
        type=PanGu,
        path='pangu',
        access_key="xxxxxx",
        secret_key="xxxxxx",
        url="xxxxxx",
        # URL of the token server, used to generate the token,
        # e.g. "https://xxxxxx.myhuaweicloud.com/v3/auth/tokens"
        token_url="xxxxxx",
        # scope-project-name, used to generate the token
        project_name="xxxxxx",
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalAPIRunner,
        max_num_workers=2,
        concurrent_users=2,
        task=dict(type=OpenICLInferTask)),
)

work_dir = "outputs/api_pangu/"
@@ -1,40 +0,0 @@
from mmengine.config import read_base
from opencompass.models import Qwen
from opencompass.partitioners import NaivePartitioner
from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from ..summarizers.medium import summarizer
    from ..datasets.ceval.ceval_gen import ceval_datasets

datasets = [
    *ceval_datasets,
]

models = [
    dict(
        abbr='qwen-max',
        type=Qwen,
        path='qwen-max',
        key='xxxxxxxxxxxxxxxx',  # please provide your key
        generation_kwargs={
            'enable_search': False,
        },
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8
    ),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalAPIRunner,
        max_num_workers=1,
        concurrent_users=1,
        task=dict(type=OpenICLInferTask)),
)

work_dir = "outputs/api_qwen/"
@@ -1,52 +0,0 @@
from mmengine.config import read_base
from opencompass.models import SenseTime
from opencompass.partitioners import NaivePartitioner
from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from ..summarizers.medium import summarizer
    from ..datasets.ceval.ceval_gen import ceval_datasets

datasets = [
    *ceval_datasets,
]

models = [
    dict(
        abbr='nova-ptc-xl-v1',
        type=SenseTime,
        path='nova-ptc-xl-v1',
        key='xxxxxxxxxxxxxx',
        url='xxxxxxxxxxx',
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8,
        parameters={
            "temperature": 0.8,
            "top_p": 0.7,
            "max_new_tokens": 1024,
            "repetition_penalty": 1.05,
            "know_ids": [],
            "stream": True,
            "user": "#*#***TestUser***#*#",
            "knowledge_config": {
                "control_level": "normal",
                "knowledge_base_result": False,
                "online_search_result": False
            }
        }
    )
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalAPIRunner,
        max_num_workers=2,
        concurrent_users=2,
        task=dict(type=OpenICLInferTask)),
)

work_dir = "outputs/api_sensetime/"
@@ -1,51 +0,0 @@
from mmengine.config import read_base
from opencompass.models.xunfei_api import XunFei
from opencompass.partitioners import NaivePartitioner
from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    # from .datasets.collections.chat_medium import datasets
    from ..summarizers.medium import summarizer
    from ..datasets.ceval.ceval_gen import ceval_datasets

datasets = [
    *ceval_datasets,
]

models = [
    dict(
        abbr='Spark-v1-1',
        type=XunFei,
        appid="xxxx",
        path='ws://spark-api.xf-yun.com/v1.1/chat',
        api_secret="xxxxxxx",
        api_key="xxxxxxx",
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8),
    dict(
        abbr='Spark-v3-1',
        type=XunFei,
        appid="xxxx",
        domain='generalv3',
        path='ws://spark-api.xf-yun.com/v3.1/chat',
        api_secret="xxxxxxxx",
        api_key="xxxxxxxxx",
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalAPIRunner,
        max_num_workers=2,
        concurrent_users=2,
        task=dict(type=OpenICLInferTask)),
)

work_dir = "outputs/api_xunfei/"
@@ -1,48 +0,0 @@
from mmengine.config import read_base
from opencompass.models import ZhiPuAI
from opencompass.partitioners import NaivePartitioner
from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    # from .datasets.collections.chat_medium import datasets
    from ..summarizers.medium import summarizer
    from ..datasets.ceval.ceval_gen import ceval_datasets

datasets = [
    *ceval_datasets,
]

# needs a special postprocessor for all
# except 'gsm8k' and 'strategyqa'
from opencompass.utils import general_eval_wrapper_postprocess
for _dataset in datasets:
    if _dataset['abbr'] not in ['gsm8k', 'strategyqa']:
        # eval_cfg is a plain dict, so test key membership;
        # hasattr() on a dict would always be False here
        if 'pred_postprocessor' in _dataset['eval_cfg']:
            _dataset['eval_cfg']['pred_postprocessor']['postprocess'] = _dataset['eval_cfg']['pred_postprocessor']['type']
            _dataset['eval_cfg']['pred_postprocessor']['type'] = general_eval_wrapper_postprocess
        else:
            _dataset['eval_cfg']['pred_postprocessor'] = {'type': general_eval_wrapper_postprocess}

models = [
    dict(
        abbr='chatglm_pro',
        type=ZhiPuAI,
        path='chatglm_pro',
        key='xxxxxxxxxxxx',
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8),
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalAPIRunner,
        max_num_workers=2,
        concurrent_users=2,
        task=dict(type=OpenICLInferTask)),
)

work_dir = "outputs/api_zhipu/"
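The postprocessor-wrapping loop above rewrites each dataset's pred_postprocessor so the original postprocess function survives as a `postprocess` argument while general_eval_wrapper_postprocess becomes the entry point. A minimal, self-contained sketch of that rewrite (the sample cfg and the string stand-ins for the callables are illustrative):

# Sketch of the rewrite performed by the loop above; strings stand in for
# the real callables, purely for illustration.
def wrap_postprocessor(eval_cfg, wrapper):
    pp = eval_cfg.get('pred_postprocessor')
    if pp is not None:
        pp['postprocess'] = pp['type']  # keep the original fn as an argument
        pp['type'] = wrapper            # the wrapper now runs first
    else:
        eval_cfg['pred_postprocessor'] = {'type': wrapper}

cfg = {'pred_postprocessor': {'type': 'first_capital_postprocess'}}
wrap_postprocessor(cfg, 'general_eval_wrapper_postprocess')
print(cfg)
# {'pred_postprocessor': {'type': 'general_eval_wrapper_postprocess',
#                         'postprocess': 'first_capital_postprocess'}}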
@@ -1,67 +0,0 @@
from mmengine.config import read_base
from opencompass.models import ZhiPuV2AI
from opencompass.partitioners import NaivePartitioner
from opencompass.runners.local_api import LocalAPIRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    # from .datasets.collections.chat_medium import datasets
    from ..summarizers.medium import summarizer
    from ..datasets.ceval.ceval_gen import ceval_datasets

datasets = [
    *ceval_datasets,
]

# needs a special postprocessor for all
# except 'gsm8k' and 'strategyqa'
from opencompass.utils import general_eval_wrapper_postprocess
for _dataset in datasets:
    if _dataset['abbr'] not in ['gsm8k', 'strategyqa']:
        # same key-membership check as in the ZhiPuAI config above
        if 'pred_postprocessor' in _dataset['eval_cfg']:
            _dataset['eval_cfg']['pred_postprocessor']['postprocess'] = _dataset['eval_cfg']['pred_postprocessor']['type']
            _dataset['eval_cfg']['pred_postprocessor']['type'] = general_eval_wrapper_postprocess
        else:
            _dataset['eval_cfg']['pred_postprocessor'] = {'type': general_eval_wrapper_postprocess}

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
)

models = [
    dict(
        abbr='glm4_notools',
        type=ZhiPuV2AI,
        path='glm-4',
        key='xxxxxx',
        generation_kwargs={
            'tools': [
                {
                    'type': 'web_search',
                    'web_search': {
                        'enable': False  # turn off the search
                    }
                }
            ]
        },
        meta_template=api_meta_template,
        query_per_second=1,
        max_out_len=2048,
        max_seq_len=2048,
        batch_size=8)
]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalAPIRunner,
        max_num_workers=2,
        concurrent_users=2,
        task=dict(type=OpenICLInferTask)),
)

work_dir = "outputs/api_zhipu_v2/"
@@ -1,54 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccContaminationEvaluator
from opencompass.datasets import ARCDatasetClean as ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            "A": dict(
                round=[
                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
                    dict(role="BOT", prompt="{textA}")
                ]),
            "B": dict(
                round=[
                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
                    dict(role="BOT", prompt="{textB}")
                ]),
            "C": dict(
                round=[
                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
                    dict(role="BOT", prompt="{textC}")
                ]),
            "D": dict(
                round=[
                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
                    dict(role="BOT", prompt="{textD}")
                ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_c_eval_cfg = dict(evaluator=dict(type=AccContaminationEvaluator),
                      analyze_contamination=True)

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c-test',
        path='./data/ARC/ARC-c/ARC-Challenge-Test.jsonl',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg)
]
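The per-option template above drives PPLInferencer's selection rule: each candidate answer is rendered into a full dialogue, the model assigns it a loss (negative log-likelihood), and the lowest-loss option is taken as the prediction. A minimal sketch with fabricated losses:

# Sketch of PPL-based option selection; the losses are fabricated and a
# real model would supply them.
def pick_answer(losses):
    return min(losses, key=losses.get)

fake_losses = {"A": 2.31, "B": 1.07, "C": 2.96, "D": 2.54}
assert pick_answer(fake_losses) == "B"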
@@ -1,43 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess

ARC_c_reader_cfg = dict(
    input_columns=["question", "textA", "textB", "textC", "textD"],
    output_column="answerKey")

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="Question: {question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}\nAnswer:"
            )
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ARC_c_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)

ARC_c_datasets = [
    dict(
        abbr="ARC-c",
        type=ARCDataset,
        path="./data/ARC/ARC-c/ARC-Challenge-Dev.jsonl",
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg,
    )
]
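In the generative variant above, first_option_postprocess(options='ABCD') reduces a free-form completion to a single option letter before accuracy is computed. A rough sketch of that behavior (not the library's implementation, which handles many more answer phrasings):

import re

# Illustrative re-implementation: grab the first standalone option letter.
def first_option(text, options="ABCD"):
    m = re.search(rf"[{options}]", text)
    return m.group(0) if m else ""

print(first_option("The answer is B because ..."))  # -> "B"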
@@ -1,53 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            "A": dict(
                round=[
                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
                    dict(role="BOT", prompt="{textA}")
                ]),
            "B": dict(
                round=[
                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
                    dict(role="BOT", prompt="{textB}")
                ]),
            "C": dict(
                round=[
                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
                    dict(role="BOT", prompt="{textC}")
                ]),
            "D": dict(
                round=[
                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
                    dict(role="BOT", prompt="{textD}")
                ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c',
        path='./data/ARC/ARC-c/ARC-Challenge-Dev.jsonl',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg)
]
@@ -1,33 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_c_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_c_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            "A": "Question: {question}\nAnswer: {textA}",
            "B": "Question: {question}\nAnswer: {textB}",
            "C": "Question: {question}\nAnswer: {textC}",
            "D": "Question: {question}\nAnswer: {textD}"
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_c_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_c_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-c',
        path='./data/ARC/ARC-c/ARC-Challenge-Dev.jsonl',
        reader_cfg=ARC_c_reader_cfg,
        infer_cfg=ARC_c_infer_cfg,
        eval_cfg=ARC_c_eval_cfg)
]
@@ -1,43 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset
from opencompass.utils.text_postprocessors import first_option_postprocess

ARC_e_reader_cfg = dict(
    input_columns=["question", "textA", "textB", "textC", "textD"],
    output_column="answerKey")

ARC_e_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="Question: {question}\nA. {textA}\nB. {textB}\nC. {textC}\nD. {textD}\nAnswer:"
            )
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ARC_e_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
)

ARC_e_datasets = [
    dict(
        abbr="ARC-e",
        type=ARCDataset,
        path="./data/ARC/ARC-e/ARC-Easy-Dev.jsonl",
        reader_cfg=ARC_e_reader_cfg,
        infer_cfg=ARC_e_infer_cfg,
        eval_cfg=ARC_e_eval_cfg,
    )
]
@@ -1,53 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_e_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_e_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            "A": dict(
                round=[
                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
                    dict(role="BOT", prompt="{textA}")
                ]),
            "B": dict(
                round=[
                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
                    dict(role="BOT", prompt="{textB}")
                ]),
            "C": dict(
                round=[
                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
                    dict(role="BOT", prompt="{textC}")
                ]),
            "D": dict(
                round=[
                    dict(role="HUMAN", prompt="Question: {question}\nAnswer: "),
                    dict(role="BOT", prompt="{textD}")
                ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_e_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_e_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-e',
        path='./data/ARC/ARC-e/ARC-Easy-Dev.jsonl',
        reader_cfg=ARC_e_reader_cfg,
        infer_cfg=ARC_e_infer_cfg,
        eval_cfg=ARC_e_eval_cfg)
]
@@ -1,33 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import ARCDataset

ARC_e_reader_cfg = dict(
    input_columns=['question', 'textA', 'textB', 'textC', 'textD'],
    output_column='answerKey')

ARC_e_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            "A": "Question: {question}\nAnswer: {textA}",
            "B": "Question: {question}\nAnswer: {textB}",
            "C": "Question: {question}\nAnswer: {textC}",
            "D": "Question: {question}\nAnswer: {textD}"
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ARC_e_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ARC_e_datasets = [
    dict(
        type=ARCDataset,
        abbr='ARC-e',
        path='./data/ARC/ARC-e/ARC-Easy-Dev.jsonl',
        reader_cfg=ARC_e_reader_cfg,
        infer_cfg=ARC_e_infer_cfg,
        eval_cfg=ARC_e_eval_cfg)
]
@@ -1,4 +0,0 @@
from mmengine.config import read_base

with read_base():
    from .CIBench_gen_8ab0dc import ci_datasets  # noqa: F401, F403
@@ -1,36 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer

from opencompass.datasets import CIBenchDataset, CIBenchEvaluator

cibench_reader_cfg = dict(
    input_columns=["questions"],
    output_column="references",
    train_split='test',
    test_split='test')

cibench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template="""{questions}""",
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer, infer_mode='every'),
)

libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")

cibench_datasets = [
    dict(
        abbr=f"cibench_generation_{lib}",
        type=CIBenchDataset,
        path=f"./data/cibench/{lib}",
        internet_check=False,
        reader_cfg=cibench_reader_cfg,
        infer_cfg=cibench_infer_cfg,
        eval_cfg=cibench_eval_cfg,
    ) for lib in libs
]
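The list comprehension above produces one dataset entry per library; the generated abbr/path pairs look like this:

# Quick illustration of the entries the comprehension above generates.
libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
for lib in libs:
    print(f"cibench_generation_{lib} -> ./data/cibench/{lib}")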
@@ -1,51 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import C3Dataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

C3_reader_cfg = dict(
    input_columns=[
        "question",
        "content",
        "choice0",
        "choice1",
        "choice2",
        "choice3",
        "choices",
    ],
    output_column="label",
)

C3_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="{content}\n问:{question}\nA. {choice0}\nB. {choice1}\nC. {choice2}\nD. {choice3}\n请从“A”,“B”,“C”,“D”中进行选择。\n答:",
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

C3_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

C3_datasets = [
    dict(
        abbr="C3",
        type=C3Dataset_V2,
        path="./data/CLUE/C3/dev_0.json",
        reader_cfg=C3_reader_cfg,
        infer_cfg=C3_infer_cfg,
        eval_cfg=C3_eval_cfg,
    )
]
@@ -1,36 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import C3Dataset

C3_reader_cfg = dict(
    input_columns=[
        'question', 'content', 'choice0', 'choice1', 'choice2', 'choice3',
        'choices'
    ],
    output_column='label')

C3_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            0: "文章:{content}\n问题:{question}\n答案:{choice0}",
            1: "文章:{content}\n问题:{question}\n答案:{choice1}",
            2: "文章:{content}\n问题:{question}\n答案:{choice2}",
            3: "文章:{content}\n问题:{question}\n答案:{choice3}"
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

C3_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

C3_datasets = [
    dict(
        type=C3Dataset,
        abbr='C3',
        path='./data/CLUE/C3/dev_0.json',
        reader_cfg=C3_reader_cfg,
        infer_cfg=C3_infer_cfg,
        eval_cfg=C3_eval_cfg)
]
@@ -1,43 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import AFQMCDataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

afqmc_reader_cfg = dict(
    input_columns=["sentence1", "sentence2"],
    output_column="label",
    test_split="train")

afqmc_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n语句一与语句二是关于蚂蚁金融产品的疑问,两者所询问的内容是否完全一致?\nA. 不完全一致\nB. 完全一致\n请从“A”,“B”中进行选择。\n答:",
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

afqmc_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

afqmc_datasets = [
    dict(
        abbr="afqmc-dev",
        type=AFQMCDataset_V2,
        path="./data/CLUE/AFQMC/dev.json",
        reader_cfg=afqmc_reader_cfg,
        infer_cfg=afqmc_infer_cfg,
        eval_cfg=afqmc_eval_cfg,
    ),
]
@@ -1,43 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import cmnliDataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

cmnli_reader_cfg = dict(
    input_columns=["sentence1", "sentence2"],
    output_column="label",
    test_split="train")

cmnli_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?\nA. 蕴含\nB. 矛盾\nC. 无关\n请从“A”,“B”,“C”中进行选择。\n答:"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

cmnli_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

cmnli_datasets = [
    dict(
        abbr="cmnli",
        type=cmnliDataset_V2,
        path="./data/CLUE/cmnli/cmnli_public/dev.json",
        reader_cfg=cmnli_reader_cfg,
        infer_cfg=cmnli_infer_cfg,
        eval_cfg=cmnli_eval_cfg,
    )
]
@@ -1,43 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import cmnliDataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

cmnli_reader_cfg = dict(
    input_columns=["sentence1", "sentence2"],
    output_column="label",
    test_split="train")

cmnli_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}\nA. 对\nB. 错\nC. 可能\n请从“A”,“B”,“C”中进行选择。\n答:"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

cmnli_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

cmnli_datasets = [
    dict(
        abbr="cmnli",
        type=cmnliDataset_V2,
        path="./data/CLUE/cmnli/cmnli_public/dev.json",
        reader_cfg=cmnli_reader_cfg,
        infer_cfg=cmnli_infer_cfg,
        eval_cfg=cmnli_eval_cfg,
    )
]
@@ -1,50 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import cmnliDataset

cmnli_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

cmnli_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'contradiction':
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?"),
                dict(role="BOT", prompt="错")
            ]),
            'entailment':
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?"),
                dict(role="BOT", prompt="对")
            ]),
            'neutral':
            dict(round=[
                dict(
                    role="HUMAN", prompt="如果{sentence1}为真,那么{sentence2}也为真吗?"),
                dict(role="BOT", prompt="可能")
            ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

cmnli_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

cmnli_datasets = [
    dict(
        abbr="cmnli",
        type=cmnliDataset,
        path='./data/CLUE/cmnli/cmnli_public/dev.json',
        reader_cfg=cmnli_reader_cfg,
        infer_cfg=cmnli_infer_cfg,
        eval_cfg=cmnli_eval_cfg)
]
@@ -1,54 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import cmnliDataset

cmnli_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

cmnli_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'contradiction':
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?"
                ),
                dict(role="BOT", prompt="矛盾")
            ]),
            'entailment':
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?"
                ),
                dict(role="BOT", prompt="蕴含")
            ]),
            'neutral':
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?"
                ),
                dict(role="BOT", prompt="无关")
            ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

cmnli_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

cmnli_datasets = [
    dict(
        abbr="cmnli",
        type=cmnliDataset,
        path='./data/CLUE/cmnli/cmnli_public/dev.json',
        reader_cfg=cmnli_reader_cfg,
        infer_cfg=cmnli_infer_cfg,
        eval_cfg=cmnli_eval_cfg)
]
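In the label-keyed PPL templates above, each gold label maps to a dialogue that ends in its Chinese verbalizer (矛盾/蕴含/无关); the label whose rendered dialogue the model finds most likely becomes the prediction. A minimal sketch with fabricated losses:

# Sketch of label-keyed PPL selection; losses are fabricated for illustration.
verbalizers = {'contradiction': '矛盾', 'entailment': '蕴含', 'neutral': '无关'}
fake_losses = {'contradiction': 3.2, 'entailment': 1.4, 'neutral': 2.8}
pred = min(fake_losses, key=fake_losses.get)
print(pred, verbalizers[pred])  # entailment 蕴含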
@@ -1,44 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import cmnliDataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

ocnli_reader_cfg = dict(
    input_columns=["sentence1", "sentence2"],
    output_column="label",
)

# TODO: two prompt templates for ocnli
ocnli_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}\nA. 对\nB. 错\nC. 可能\n请从“A”,“B”,“C”中进行选择。\n答:"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ocnli_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

ocnli_datasets = [
    dict(
        abbr="ocnli",
        type=cmnliDataset_V2,  # ocnli shares the same format with cmnli
        path="./data/CLUE/OCNLI/dev.json",
        reader_cfg=ocnli_reader_cfg,
        infer_cfg=ocnli_infer_cfg,
        eval_cfg=ocnli_eval_cfg,
    )
]
@@ -1,44 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import cmnliDataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

ocnli_reader_cfg = dict(
    input_columns=["sentence1", "sentence2"],
    output_column="label",
)

# TODO: two prompt templates for ocnli
ocnli_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?\nA. 蕴含\n B. 矛盾\n C. 无关\n请从“A”,“B”,“C”中进行选择。\n答:"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ocnli_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

ocnli_datasets = [
    dict(
        abbr="ocnli",
        type=cmnliDataset_V2,  # ocnli shares the same format with cmnli
        path="./data/CLUE/OCNLI/dev.json",
        reader_cfg=ocnli_reader_cfg,
        infer_cfg=ocnli_infer_cfg,
        eval_cfg=ocnli_eval_cfg,
    )
]
@@ -1,51 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

ocnli_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'], output_column='label')

# TODO: two prompt templates for ocnli
ocnli_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'contradiction':
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?"),
                dict(role="BOT", prompt="错")
            ]),
            'entailment':
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?"),
                dict(role="BOT", prompt="对")
            ]),
            'neutral':
            dict(round=[
                dict(
                    role="HUMAN", prompt="如果{sentence1}为真,那么{sentence2}也为真吗?"),
                dict(role="BOT", prompt="可能")
            ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ocnli_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ocnli_datasets = [
    dict(
        type=HFDataset,
        abbr='ocnli',
        path='json',
        split='train',
        data_files='./data/CLUE/OCNLI/dev.json',
        reader_cfg=ocnli_reader_cfg,
        infer_cfg=ocnli_infer_cfg,
        eval_cfg=ocnli_eval_cfg)
]
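The HFDataset entry above loads a local JSON file through the Hugging Face `datasets` backend; the loading step is roughly equivalent to this direct call (requires the `datasets` package):

# Roughly what path='json', split='train', data_files=... resolves to.
from datasets import load_dataset

ds = load_dataset('json',
                  data_files='./data/CLUE/OCNLI/dev.json',
                  split='train')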
@@ -1,55 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

ocnli_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'], output_column='label')

# TODO: two prompt templates for ocnli
ocnli_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'contradiction':
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?"
                ),
                dict(role="BOT", prompt="矛盾")
            ]),
            'entailment':
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?"
                ),
                dict(role="BOT", prompt="蕴含")
            ]),
            'neutral':
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n请问这两句话是什么关系?"
                ),
                dict(role="BOT", prompt="无关")
            ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ocnli_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ocnli_datasets = [
    dict(
        type=HFDataset,
        abbr='ocnli',
        path='json',
        split='train',
        data_files='./data/CLUE/OCNLI/dev.json',
        reader_cfg=ocnli_reader_cfg,
        infer_cfg=ocnli_infer_cfg,
        eval_cfg=ocnli_eval_cfg)
]
@@ -1,51 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import AFQMCDataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

bustm_reader_cfg = dict(
    input_columns=["sentence1", "sentence2"],
    output_column="label",
    test_split="train")

bustm_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="语句一:“{sentence1}”\n语句二:“{sentence2}”\n请判断语句一和语句二说的是否是一个意思?\nA. 无关\nB. 相关\n请从“A”,“B”中进行选择。\n答:",
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

bustm_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

bustm_datasets = [
    dict(
        abbr="bustm-dev",
        type=AFQMCDataset_V2,  # bustm shares the same format with AFQMC
        path="./data/FewCLUE/bustm/dev_few_all.json",
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg,
    ),
    dict(
        abbr="bustm-test",
        type=AFQMCDataset_V2,  # bustm shares the same format with AFQMC
        path="./data/FewCLUE/bustm/test_public.json",
        reader_cfg=bustm_reader_cfg,
        infer_cfg=bustm_infer_cfg,
        eval_cfg=bustm_eval_cfg,
    ),
]
@@ -1,51 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CHIDDataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

chid_reader_cfg = dict(
    input_columns=["content", "A", "B", "C", "D", "E", "F", "G"],
    output_column="answer",
)

chid_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="{content}\n请选择______处所填的词\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nE. {E}\nF. {F}\nG. {G}\n请从”A“,”B“,”C“,”D“,”E“,”F“,”G“中进行选择。答:",
            ),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

chid_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

chid_datasets = [
    dict(
        abbr="chid-dev",
        type=CHIDDataset_V2,
        path="./data/FewCLUE/chid/dev_few_all.json",
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg,
    ),
    dict(
        abbr="chid-test",
        type=CHIDDataset_V2,
        path="./data/FewCLUE/chid/test_public.json",
        reader_cfg=chid_reader_cfg,
        infer_cfg=chid_infer_cfg,
        eval_cfg=chid_eval_cfg,
    ),
]
@@ -1,51 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CluewscDataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

cluewsc_reader_cfg = dict(
    input_columns=["span1", "span2", "text", "new_text"],
    output_column="label",
)

cluewsc_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="{text}\n此处,“{span2}”是否指代“{span1}”?\nA. 是\nB. 否\n请从”A“,”B“中进行选择。\n答:",
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

cluewsc_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

cluewsc_datasets = [
    dict(
        abbr="cluewsc-dev",
        type=CluewscDataset_V2,
        path="./data/FewCLUE/cluewsc/dev_few_all.json",
        reader_cfg=cluewsc_reader_cfg,
        infer_cfg=cluewsc_infer_cfg,
        eval_cfg=cluewsc_eval_cfg,
    ),
    dict(
        abbr="cluewsc-test",
        type=CluewscDataset_V2,
        path="./data/FewCLUE/cluewsc/test_public.json",
        reader_cfg=cluewsc_reader_cfg,
        infer_cfg=cluewsc_infer_cfg,
        eval_cfg=cluewsc_eval_cfg,
    ),
]
@@ -1,51 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CslDataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

csl_reader_cfg = dict(
    input_columns=["abst", "keywords"],
    output_column="label",
)

csl_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="摘要是对论文内容不加注释和评论的简短陈述,要求扼要地说明研究工作的目的、研究方法和最终结论等。\n关键词是一篇学术论文的核心词汇,一般由一系列名词组成。关键词在全文中应有较高出现频率,且能起到帮助文献检索的作用。\n摘要:{abst}\n关键词:{keywords}\n请问上述关键词是否匹配摘要且符合要求?\nA. 否\nB. 是\n请从”A“,”B“中进行选择。\n答:"
            )
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

csl_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

csl_datasets = [
    dict(
        abbr="csl_dev",
        type=CslDataset_V2,
        path="./data/FewCLUE/csl/dev_few_all.json",
        reader_cfg=csl_reader_cfg,
        infer_cfg=csl_infer_cfg,
        eval_cfg=csl_eval_cfg,
    ),
    dict(
        abbr="csl_test",
        type=CslDataset_V2,
        path="./data/FewCLUE/csl/test_public.json",
        reader_cfg=csl_reader_cfg,
        infer_cfg=csl_infer_cfg,
        eval_cfg=csl_eval_cfg,
    ),
]
@@ -1,51 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CslDataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

csl_reader_cfg = dict(
    input_columns=["abst", "keywords"],
    output_column="label",
)

csl_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="摘要:{abst}\n关键词:{keywords}\n上述关键词出现在学术期刊中是否恰当?\nA. 否\nB. 是\n请从”A“,”B“中进行选择。\n答:"
            )
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

csl_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

csl_datasets = [
    dict(
        abbr="csl_dev",
        type=CslDataset_V2,
        path="./data/FewCLUE/csl/dev_few_all.json",
        reader_cfg=csl_reader_cfg,
        infer_cfg=csl_infer_cfg,
        eval_cfg=csl_eval_cfg,
    ),
    dict(
        abbr="csl_test",
        type=CslDataset_V2,
        path="./data/FewCLUE/csl/test_public.json",
        reader_cfg=csl_reader_cfg,
        infer_cfg=csl_infer_cfg,
        eval_cfg=csl_eval_cfg,
    ),
]
@@ -1,49 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import eprstmtDataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

eprstmt_reader_cfg = dict(
    input_columns=["sentence"], output_column="label", test_split="train")

eprstmt_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt='内容: "{sentence}"。请对上述内容进行情绪分类。\nA. 积极\nB. 消极\n请从”A“,”B“中进行选择。\n答:'
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

eprstmt_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

eprstmt_datasets = [
    dict(
        abbr="eprstmt-dev",
        type=eprstmtDataset_V2,
        path="./data/FewCLUE/eprstmt/dev_few_all.json",
        reader_cfg=eprstmt_reader_cfg,
        infer_cfg=eprstmt_infer_cfg,
        eval_cfg=eprstmt_eval_cfg,
    ),
    dict(
        abbr="eprstmt-test",
        type=eprstmtDataset_V2,
        path="./data/FewCLUE/eprstmt/test_public.json",
        reader_cfg=eprstmt_reader_cfg,
        infer_cfg=eprstmt_infer_cfg,
        eval_cfg=eprstmt_eval_cfg,
    ),
]
@@ -1,50 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import cmnliDataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

ocnli_fc_reader_cfg = dict(
    input_columns=["sentence1", "sentence2"],
    output_column="label",
    test_split="train")

ocnli_fc_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}\nA. 对\nB. 错\nC. 可能\n请从“A”,“B”,“C”中进行选择。\n答:"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ocnli_fc_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

ocnli_fc_datasets = [
    dict(
        abbr="ocnli_fc-dev",
        type=cmnliDataset_V2,  # ocnli_fc shares the same format with cmnli
        path="./data/FewCLUE/ocnli/dev_few_all.json",
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg,
    ),
    dict(
        abbr="ocnli_fc-test",
        type=cmnliDataset_V2,  # ocnli_fc shares the same format with cmnli
        path="./data/FewCLUE/ocnli/test_public.json",
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg,
    ),
]
@@ -1,60 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

ocnli_fc_reader_cfg = dict(
    input_columns=['sentence1', 'sentence2'],
    output_column='label',
    test_split='train')

ocnli_fc_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'contradiction':
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?"),
                dict(role="BOT", prompt="错")
            ]),
            'entailment':
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="阅读文章:{sentence1}\n根据上文,回答如下问题:{sentence2}?"),
                dict(role="BOT", prompt="对")
            ]),
            'neutral':
            dict(round=[
                dict(
                    role="HUMAN", prompt="如果{sentence1}为真,那么{sentence2}也为真吗?"),
                dict(role="BOT", prompt="可能")
            ]),
        }),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer))

ocnli_fc_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

ocnli_fc_datasets = [
    dict(
        type=HFDataset,
        abbr='ocnli_fc-dev',
        path='json',
        split='train',
        data_files='./data/FewCLUE/ocnli/dev_few_all.json',
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg),
    dict(
        type=HFDataset,
        abbr='ocnli_fc-test',
        path='json',
        split='train',
        data_files='./data/FewCLUE/ocnli/test_public.json',
        reader_cfg=ocnli_fc_reader_cfg,
        infer_cfg=ocnli_fc_infer_cfg,
        eval_cfg=ocnli_fc_eval_cfg)
]
@@ -1,75 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TNewsDataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

tnews_reader_cfg = dict(
    input_columns="sentence",
    output_column="label_desc2",
)

tnews_labels = [
    "农业新闻",  # news_agriculture
    "旅游新闻",  # news_travel
    "游戏新闻",  # news_game
    "科技类别公司新闻",  # news_tech
    "体育类别新闻",  # news_sports
    "初升高教育新闻",  # news_edu
    "娱乐圈新闻",  # news_entertainment
    "投资资讯",  # news_finance
    "军事类别常识",  # news_military
    "车辆新闻",  # news_car
    "楼市新闻",  # news_house
    "环球不含中国类别新闻",  # news_world
    "书籍文化历史类别新闻",  # news_culture
    "故事类别新闻",  # news_story
    "股票市场类别新闻",  # news_stock
]
_tnews_options_list_str = "\n".join(f'{chr(ord("A") + i)}. {tnews_labels[i]}'
                                    for i in range(len(tnews_labels)))
_tnews_options_range_str = ",".join(f'“{chr(ord("A") + i)}”'
                                    for i in range(len(tnews_labels)))

tnews_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt=f"{{sentence}}\n请判断上述内容属于什么新闻?\n{_tnews_options_list_str}\n请从{_tnews_options_range_str}中进行选择。\n答:",
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

tnews_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

tnews_datasets = [
    dict(
        abbr="tnews-dev",
        type=TNewsDataset_V2,
        path="./data/FewCLUE/tnews/dev_few_all.json",
        reader_cfg=tnews_reader_cfg,
        infer_cfg=tnews_infer_cfg,
        eval_cfg=tnews_eval_cfg,
    ),
    dict(
        abbr="tnews-test",
        type=TNewsDataset_V2,
        path="./data/FewCLUE/tnews/test_public.json",
        reader_cfg=tnews_reader_cfg,
        infer_cfg=tnews_infer_cfg,
        eval_cfg=tnews_eval_cfg,
    ),
]

del _tnews_options_list_str, _tnews_options_range_str
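For clarity, the two helper strings built above enumerate the labels as lettered options; a trimmed demonstration with the first three labels:

# Same construction as above, on a short label list.
labels = ["农业新闻", "旅游新闻", "游戏新闻"]
options = "\n".join(f'{chr(ord("A") + i)}. {labels[i]}'
                    for i in range(len(labels)))
print(options)
# A. 农业新闻
# B. 旅游新闻
# C. 游戏新闻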
@@ -1,4 +0,0 @@
from mmengine.config import read_base

with read_base():
    from .FinanceIQ_gen_e0e6b5 import FinanceIQ_datasets  # noqa: F401, F403
@ -1,77 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import FinanceIQDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess

financeIQ_subject_mapping_en = {
    'certified_public_accountant': '注册会计师(CPA)',
    'banking_qualification': '银行从业资格',
    'securities_qualification': '证券从业资格',
    'fund_qualification': '基金从业资格',
    'insurance_qualification': '保险从业资格CICE',
    'economic_analyst': '经济师',
    'taxation_practitioner': '税务师',
    'futures_qualification': '期货从业资格',
    'certified_fin_planner': '理财规划师',
    'actuary_fin_math': '精算师-金融数学',
}

financeIQ_subject_mapping = {
    '注册会计师(CPA)': '注册会计师(CPA)',
    '银行从业资格': '银行从业资格',
    '证券从业资格': '证券从业资格',
    '基金从业资格': '基金从业资格',
    '保险从业资格CICE': '保险从业资格CICE',
    '经济师': '经济师',
    '税务师': '税务师',
    '期货从业资格': '期货从业资格',
    '理财规划师': '理财规划师',
    '精算师-金融数学': '精算师-金融数学',
}

financeIQ_all_sets = list(financeIQ_subject_mapping.keys())

financeIQ_datasets = []
for _name in financeIQ_all_sets:
    _ch_name = financeIQ_subject_mapping[_name]
    financeIQ_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(
                begin="</E>",
                round=[
                    dict(
                        role="HUMAN",
                        prompt=f"以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}"
                    ),
                    dict(role="BOT", prompt='答案是: {answer}'),
                ]),
            ice_token="</E>",
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=GenInferencer),
    )

    financeIQ_eval_cfg = dict(
        evaluator=dict(type=AccEvaluator),
        pred_postprocessor=dict(type=first_capital_postprocess))

    financeIQ_datasets.append(
        dict(
            type=FinanceIQDataset,
            path="./data/FinanceIQ/",
            name=_name,
            abbr=f"FinanceIQ-{_name}",
            reader_cfg=dict(
                input_columns=["question", "A", "B", "C", "D"],
                output_column="answer",
                train_split="dev",
                test_split='test'),
            infer_cfg=financeIQ_infer_cfg,
            eval_cfg=financeIQ_eval_cfg,
        ))

del _name, _ch_name
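The generative FinanceIQ config above leans on two pieces of machinery: FixKRetriever always splices the dev-split items listed in fix_id_list into the `</E>` ice_token slot as five fixed shots, and first_capital_postprocess reduces the free-form generation to a single option letter before AccEvaluator compares it against the gold answer. A minimal standalone sketch of that postprocessing step (an approximation for illustration, not the OpenCompass implementation itself):

def first_capital_postprocess(text: str) -> str:
    # Scan the generation and keep only the first uppercase character,
    # which for this prompt format is the predicted option letter.
    for ch in text:
        if ch.isupper():
            return ch
    return ''

assert first_capital_postprocess('答案是: B,因为……') == 'B'
assert first_capital_postprocess('没有大写字母') == ''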
@@ -1,4 +0,0 @@
from mmengine.config import read_base

with read_base():
    from .FinanceIQ_ppl_42b9bd import FinanceIQ_datasets  # noqa: F401, F403
@@ -1,76 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import FinanceIQDataset

financeIQ_subject_mapping_en = {
    'certified_public_accountant': '注册会计师(CPA)',
    'banking_qualification': '银行从业资格',
    'securities_qualification': '证券从业资格',
    'fund_qualification': '基金从业资格',
    'insurance_qualification': '保险从业资格CICE',
    'economic_analyst': '经济师',
    'taxation_practitioner': '税务师',
    'futures_qualification': '期货从业资格',
    'certified_fin_planner': '理财规划师',
    'actuary_fin_math': '精算师-金融数学',
}

financeIQ_subject_mapping = {
    '注册会计师(CPA)': '注册会计师(CPA)',
    '银行从业资格': '银行从业资格',
    '证券从业资格': '证券从业资格',
    '基金从业资格': '基金从业资格',
    '保险从业资格CICE': '保险从业资格CICE',
    '经济师': '经济师',
    '税务师': '税务师',
    '期货从业资格': '期货从业资格',
    '理财规划师': '理财规划师',
    '精算师-金融数学': '精算师-金融数学',
}

financeIQ_all_sets = list(financeIQ_subject_mapping.keys())

financeIQ_datasets = []
for _name in financeIQ_all_sets:
    _ch_name = financeIQ_subject_mapping[_name]
    financeIQ_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template={
                answer: dict(
                    begin="</E>",
                    round=[
                        dict(
                            role="HUMAN",
                            prompt=f"以下是关于{_ch_name}的单项选择题,请直接给出正确答案的选项。\n题目:{{question}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}"
                        ),
                        dict(role="BOT", prompt=f'答案是: {answer}'),
                    ])
                for answer in ["A", "B", "C", "D"]
            },
            ice_token="</E>",
        ),
        retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
        inferencer=dict(type=PPLInferencer),
    )

    financeIQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

    financeIQ_datasets.append(
        dict(
            type=FinanceIQDataset,
            path="./data/FinanceIQ/",
            name=_name,
            abbr=f"FinanceIQ-{_name}",
            reader_cfg=dict(
                input_columns=["question", "A", "B", "C", "D"],
                output_column="answer",
                train_split="dev",
                test_split='test'),
            infer_cfg=financeIQ_infer_cfg,
            eval_cfg=financeIQ_eval_cfg,
        ))

del _name, _ch_name
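The perplexity-based variant differs from the generative one in a single place: the template is a dict comprehension keyed by the candidate answers, so PPLInferencer renders one complete dialogue per option and the option whose rendered text the model finds most probable wins. A toy sketch of that argmin-over-perplexity selection (the score values are made up; in OpenCompass they come from the model):

candidate_prompts = {
    answer: f"题目:1+1=?\nA. 1\nB. 2\nC. 3\nD. 4\n答案是: {answer}"
    for answer in ["A", "B", "C", "D"]
}

def pick_lowest_ppl(ppl_by_answer: dict) -> str:
    # Lower perplexity means the model considers that continuation more likely.
    return min(ppl_by_answer, key=ppl_by_answer.get)

assert pick_lowest_ppl({"A": 9.1, "B": 2.3, "C": 8.7, "D": 7.9}) == "B"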
@@ -1,51 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset


_hint = "The following are semantic matching questions. \n" \
    "Please determine whether the following two sentences are semantically equivalent: " \
    "0 means not equivalent, 1 means equivalent.\n"

MRPC_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template="Sentence one: {sentence1}\nSentence two: {sentence2}\nResult: {label}",
    ),
    prompt_template=dict(
        type=PromptTemplate,
        template={
            answer:
            f"{_hint}</E>Sentence one: {{sentence1}}\nSentence two: {{sentence2}}\nResult: {answer}"
            for answer in [0, 1]
        },
        ice_token='</E>',
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
    inferencer=dict(type=PPLInferencer))

MRPC_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )


MRPC_datasets = []
for _split in ["validation", "test"]:

    MRPC_reader_cfg = dict(
        input_columns=['sentence1', 'sentence2'],
        output_column='label',
        train_split="train",
        test_split=_split
    )

    MRPC_datasets.append(
        dict(
            abbr=f'MRPC-{_split}',
            type=HFDataset,
            path='glue',
            name='mrpc',
            reader_cfg=MRPC_reader_cfg,
            infer_cfg=MRPC_infer_cfg,
            eval_cfg=MRPC_eval_cfg
        )
    )
@@ -1,51 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset


_hint = "The following are semantic matching questions. \n" \
    "Please determine whether the following two sentences are semantically duplicate: " \
    "0 means not duplicate, 1 means duplicate.\n"

QQP_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template="Sentence one: {question1}\nSentence two: {question2}\nResult: {label}",
    ),
    prompt_template=dict(
        type=PromptTemplate,
        template={
            answer:
            f"{_hint}</E>Sentence one: {{question1}}\nSentence two: {{question2}}\nResult: {answer}"
            for answer in [0, 1]
        },
        ice_token='</E>',
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
    inferencer=dict(type=PPLInferencer))

QQP_eval_cfg = dict(evaluator=dict(type=AccEvaluator), )


QQP_datasets = []
for _split in ["validation", "test"]:

    QQP_reader_cfg = dict(
        input_columns=['question1', 'question2'],
        output_column='label',
        train_split="train",
        test_split=_split
    )

    QQP_datasets.append(
        dict(
            abbr=f'QQP-{_split}',
            type=HFDataset,
            path='glue',
            name='qqp',
            reader_cfg=QQP_reader_cfg,
            infer_cfg=QQP_infer_cfg,
            eval_cfg=QQP_eval_cfg
        )
    )
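One detail in the MRPC and QQP loops is easy to miss: the reader_cfg is rebuilt inside the loop so that each dataset entry captures the current value of _split; a single dict created outside the loop would leave both entries pointing at whichever split was assigned last. A small demonstration of the pattern (plain dicts, no OpenCompass imports):

entries = []
for _split in ["validation", "test"]:
    # A fresh dict per iteration pins test_split to this split.
    reader_cfg = dict(train_split="train", test_split=_split)
    entries.append(dict(abbr=f"QQP-{_split}", reader_cfg=reader_cfg))

assert entries[0]["reader_cfg"]["test_split"] == "validation"
assert entries[1]["reader_cfg"]["test_split"] == "test"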
@@ -1,301 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GaokaoBenchDataset

_MCQ_prompts = [
    {
        "type": "single_choice",
        "keyword": "2010-2022_Math_II_MCQs",
        "prefix_prompt":
        "请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "single_choice",
        "keyword": "2010-2022_Math_I_MCQs",
        "prefix_prompt":
        "请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "single_choice",
        "keyword": "2010-2022_History_MCQs",
        "prefix_prompt":
        "请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:"
    },
    {
        "type": "single_choice",
        "keyword": "2010-2022_Biology_MCQs",
        "prefix_prompt":
        "请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:"
    },
    {
        "type": "single_choice",
        "keyword": "2010-2022_Political_Science_MCQs",
        "prefix_prompt":
        "请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:"
    },
    {
        "type": "multi_choice",
        "keyword": "2010-2022_Physics_MCQs",
        "prefix_prompt":
        "请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n"
    },
    {
        "type": "single_choice",
        "keyword": "2010-2022_Chemistry_MCQs",
        "prefix_prompt":
        "请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:"
    },
    {
        "type": "single_choice",
        "keyword": "2010-2013_English_MCQs",
        "prefix_prompt":
        "请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:"
    },
    {
        "type": "multi_question_choice",
        "keyword": "2010-2022_Chinese_Modern_Lit",
        "prefix_prompt":
        "请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n"
    },
    {
        "type": "multi_question_choice",
        "keyword": "2010-2022_English_Fill_in_Blanks",
        "prefix_prompt":
        "请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n"
    },
    {
        "type": "five_out_of_seven",
        "keyword": "2012-2022_English_Cloze_Test",
        "prefix_prompt":
        "请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n"
    },
    {
        "type": "multi_question_choice",
        "keyword": "2010-2022_Geography_MCQs",
        "prefix_prompt":
        "请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n"
    },
    {
        "type": "multi_question_choice",
        "keyword": "2010-2022_English_Reading_Comp",
        "prefix_prompt":
        "请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n"
    },
    {
        "type": "multi_question_choice",
        "keyword": "2010-2022_Chinese_Lang_and_Usage_MCQs",
        "prefix_prompt":
        "请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:"
    },
]
_FBQ_prompts = [{
    "type": "cloze",
    "keyword": "2010-2022_Math_I_Fill-in-the-Blank",
    "prefix_prompt":
    "请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:",
    "comment": ""
}, {
    "type": "cloze",
    "keyword": "2010-2022_Math_II_Fill-in-the-Blank",
    "prefix_prompt":
    "请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:",
    "comment": ""
}, {
    "type": "cloze",
    "keyword":
    "2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation",
    "prefix_prompt":
    "请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
    "comment": ""
}, {
    "type": "cloze",
    "keyword": "2014-2022_English_Language_Cloze_Passage",
    "prefix_prompt":
    "请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
    "comment": ""
}]
_OEQ_prompts = [
    {
        "type": "subjective",
        "keyword": "2010-2022_Geography_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Chemistry_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Math_I_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_History_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Biology_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Math_II_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Physics_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Political_Science_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "correction",
        "keyword": "2012-2022_English_Language_Error_Correction",
        "prefix_prompt":
        "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
        # "prefix_prompt": [
        #     "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
        #     "请比较下面两篇短文,找到第二篇和第一篇的10处不同,每处不同只涉及一个单词,请将结果写在【答案】和<eoa>之间。例如:【答案】1. 将play改为plays\n 2.增加了the\n ... <eoa>\n 完整的题目回答格式如下:【答案】(1) ... \n (2) ...\n ...(10) ...\n<eoa>\n请你严格按照上述格式作答。\n短文如下:"
        # ],
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Chinese_Language_Ancient_Poetry_Reading",
        "prefix_prompt":
        "请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Chinese_Language_Practical_Text_Reading",
        "prefix_prompt":
        "请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Chinese_Language_Literary_Text_Reading",
        "prefix_prompt":
        "请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Chinese_Language_Classical_Chinese_Reading",
        "prefix_prompt":
        "请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword":
        "2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    }
]

GaokaoBench_datasets = []
for _folder, _prompts in [
    ("Multiple-choice_Questions", _MCQ_prompts),
    ("Fill-in-the-blank_Questions", _FBQ_prompts),
    ("Open-ended_Questions", _OEQ_prompts),
]:
    for _p in _prompts:
        _reader_cfg = {
            "input_columns": ['question'],
            "output_column": 'answer',
        }
        _infer_cfg = {
            "ice_template": {
                "type": PromptTemplate,
                "template": {
                    "round": [{
                        "role": "HUMAN",
                        "prompt": _p['prefix_prompt'] + '{question}'
                    }]
                },
                "ice_token": "</E>"
            },
            "retriever": {
                "type": ZeroRetriever
            },
            "inferencer": {
                "type": GenInferencer,
                "max_out_len": 1024,
            }
        }
        _eval_cfg = {
            "evaluator": {
                "type": "GaokaoBenchEvaluator" + "_" + _p['type'],
            },
            "pred_role": "BOT",
        }
        _base_path = './data/GAOKAO-BENCH/data'
        _dataset = {
            "type": GaokaoBenchDataset,
            "abbr": "GaokaoBench_" + _p['keyword'],
            "path": _base_path + '/' + _folder + '/' + _p['keyword'] + ".json",
            "reader_cfg": _reader_cfg,
            "infer_cfg": _infer_cfg,
            "eval_cfg": _eval_cfg,
        }

        GaokaoBench_datasets.append(_dataset)

_temporary_variables = [k for k in globals() if k.startswith('_')]
for _t in _temporary_variables:
    del globals()[_t]
del _temporary_variables, _t
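The closing sweep over globals() is a convention of these config files: mmengine executes the file and treats every remaining module-level name as part of the config, so the underscore-prefixed loop temporaries are deleted to keep them out of the parsed result. A standalone approximation of the idiom (dunder names are excluded here so the snippet also runs outside the config loader):

_scratch = 1
keep_me = 2

# Collect single-underscore temporaries and remove them from the module
# namespace, mirroring the cleanup block in the config above.
_temps = [k for k in list(globals())
          if k.startswith('_') and not k.startswith('__')]
for _t in _temps:
    del globals()[_t]
del _temps, _t

assert 'keep_me' in globals()
assert '_scratch' not in globals()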
@@ -1,4 +0,0 @@
from mmengine.config import read_base

with read_base():
    from .GaokaoBench_mixed_f2038e import GaokaoBench_datasets  # noqa: F401, F403
@@ -1,353 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.datasets import GaokaoBenchDataset

_MCQ_prompts = [
    {
        "type": "single_choice",
        "keyword": "2010-2022_Math_II_MCQs",
        "prefix_prompt":
        "请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "single_choice",
        "keyword": "2010-2022_Math_I_MCQs",
        "prefix_prompt":
        "请你做一道数学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "single_choice",
        "keyword": "2010-2022_History_MCQs",
        "prefix_prompt":
        "请你做一道历史选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:"
    },
    {
        "type": "single_choice",
        "keyword": "2010-2022_Biology_MCQs",
        "prefix_prompt":
        "请你做一道生物选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:"
    },
    {
        "type": "single_choice",
        "keyword": "2010-2022_Political_Science_MCQs",
        "prefix_prompt":
        "请你做一道政治选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:"
    },
    {
        "type": "multi_choice",
        "keyword": "2010-2022_Physics_MCQs",
        "prefix_prompt":
        "请你做一道物理选择题。\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出所有符合题意的答案,并写在【答案】和<eoa>之间。\n例如:【答案】 AB <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】... <eoa>\n请你严格按照上述格式作答。\n"
    },
    {
        "type": "single_choice",
        "keyword": "2010-2022_Chemistry_MCQs",
        "prefix_prompt":
        "请你做一道化学选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:"
    },
    {
        "type": "single_choice",
        "keyword": "2010-2013_English_MCQs",
        "prefix_prompt":
        "请你做一道英语选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。\n题目如下:"
    },
    {
        "type": "multi_question_choice",
        "keyword": "2010-2022_Chinese_Modern_Lit",
        "prefix_prompt":
        "请你做一道语文阅读理解题,其中包含三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n"
    },
    {
        "type": "multi_question_choice",
        "keyword": "2010-2022_English_Fill_in_Blanks",
        "prefix_prompt":
        "请你做一道英语完形填空题,其中包含二十个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n"
    },
    {
        "type": "five_out_of_seven",
        "keyword": "2012-2022_English_Cloze_Test",
        "prefix_prompt":
        "请回答下面的问题,将符合题意的五个选项的字母写在【答案】和<eoa>之间,例如“【答案】 A B C D E <eoa>\n请严格按照上述格式作答。\n"
    },
    {
        "type": "multi_question_choice",
        "keyword": "2010-2022_Geography_MCQs",
        "prefix_prompt":
        "请你做一道地理选择题,其中包含两到三个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n"
    },
    {
        "type": "multi_question_choice",
        "keyword": "2010-2022_English_Reading_Comp",
        "prefix_prompt":
        "请你做一道英语阅读理解题,其中包含三到五个小题。\n请你一步一步思考。每一题你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:(1)【答案】 A <eoa>\n(2)【答案】 B <eoa>\n请你严格按照上述格式作答。\n"
    },
    {
        "type": "multi_question_choice",
        "keyword": "2010-2022_Chinese_Lang_and_Usage_MCQs",
        "prefix_prompt":
        "请你做一道语文选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】: A <eoa>\n完整的题目回答的格式如下:\n(1)【解析】 ... <eoe>\n【答案】 ... <eoa>\n(2)【解析】 ... <eoe>\n【答案】 ... <eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答\n题目如下:"
    },
]
_FBQ_prompts = [{
    "type": "cloze",
    "keyword": "2010-2022_Math_I_Fill-in-the-Blank",
    "prefix_prompt":
    "请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:",
    "comment": ""
}, {
    "type": "cloze",
    "keyword": "2010-2022_Math_II_Fill-in-the-Blank",
    "prefix_prompt":
    "请解答下面的数学填空题\n仔细阅读题目,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:",
    "comment": ""
}, {
    "type": "cloze",
    "keyword":
    "2010-2022_Chinese_Language_Famous_Passages_and_Sentences_Dictation",
    "prefix_prompt":
    "请回答下面的语文填空题\n请你仔细阅读题目,先找到题目对应的中国名篇,再从名篇中找到合适的句子填写到题目的空白处。请你将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
    "comment": ""
}, {
    "type": "cloze",
    "keyword": "2014-2022_English_Language_Cloze_Passage",
    "prefix_prompt":
    "请回答下面的英语短文填词题\n仔细阅读题目,空白处请填入一个适当单词或者括号内单词的正确形式。请你一步步思考,将思考过程写在【解析】和<eoe>之间,将最终答案写在【答案】和<eoa>之间。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n(2)【解析】 ...<eoe>\n【答案】...<eoa>\n请严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
    "comment": ""
}]
_OEQ_prompts = [
    {
        "type": "subjective",
        "keyword": "2010-2022_Geography_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的地理解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。你的答案请写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Chemistry_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的化学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Math_I_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_History_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的历史解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Biology_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的生物解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,同一小题的答案用\t分隔开。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...\t...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Math_II_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的数学解答题\n仔细阅读题目并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间,答案需要有完整的解题步骤。\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Physics_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的物理解答题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Political_Science_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的政治解答题\n仔细阅读材料和题目,并充分结合你已有的知识,解答其中的问题,请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的答案写在【答案】和<eoa>之间\n完整的题目回答格式如下:\n(1)【解析】 ...<eoe>\n【答案】...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "correction",
        "keyword": "2012-2022_English_Language_Error_Correction",
        "prefix_prompt":
        "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
        # "prefix_prompt": [
        #     "请解答下面的英语短文改错题,仔细阅读题目并充分结合你你已有的知识,找出其中10处需要改动的地方。请你一步步思考,把修改后的短文写在【答案】和<eoa>之间。\n完整的题目回答格式如下:【答案】 ...<eoa>\n 请你严格按照上述格式作答。\n题目如下:",
        #     "请比较下面两篇短文,找到第二篇和第一篇的10处不同,每处不同只涉及一个单词,请将结果写在【答案】和<eoa>之间。例如:【答案】1. 将play改为plays\n 2.增加了the\n ... <eoa>\n 完整的题目回答格式如下:【答案】(1) ... \n (2) ...\n ...(10) ...\n<eoa>\n请你严格按照上述格式作答。\n短文如下:"
        # ],
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Chinese_Language_Ancient_Poetry_Reading",
        "prefix_prompt":
        "请解答下面的语文古代诗歌阅读题,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Chinese_Language_Practical_Text_Reading",
        "prefix_prompt":
        "请解答下面的语文实用类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Chinese_Language_Literary_Text_Reading",
        "prefix_prompt":
        "请解答下面的语文文学类文本阅读,仔细阅读题目,注意其中可能含有单选题和多选题。请你一步步思考并将最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword": "2010-2022_Chinese_Language_Classical_Chinese_Reading",
        "prefix_prompt":
        "请解答下面的语文文言文阅读,仔细阅读题目,前三题是单选题,最后一题要将文言文翻译为现代汉语。请你一步步思考并把最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。翻译题把翻译后的现代汉语句子写在【答案】后面,例如”【答案】今天天气很好 <eoa>”\n完整的题目回答格式如下:(1)[答案】 ...<eoa>\n (2)【答案】...<eoa>\n请你严格按照上述格式作答,如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    },
    {
        "type": "subjective",
        "keyword":
        "2010-2022_Chinese_Language_Language_and_Writing_Skills_Open-ended_Questions",
        "prefix_prompt":
        "请解答下面的语文解答题,仔细阅读题目,注意其中可能含有选择题。请你一步步思考并将思考过程写在【解析】和<eoe>之间。请把你的最终答案写在【答案】和<eoa>之间。选择题你要从选项中选出符合题意的答案,例如“【答案】A <eoa>”。\n完整的题目回答格式如下:(1)【解析】 ...<eoe>\n【答案】 ...<eoa>\n (2)【解析】 ...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。如果不止一道题,请分别作答。\n题目如下:",
        "comment": ""
    }
]

GaokaoBench_datasets = []
for _folder, _prompts in [
    ("Multiple-choice_Questions", _MCQ_prompts),
    ("Fill-in-the-blank_Questions", _FBQ_prompts),
    ("Open-ended_Questions", _OEQ_prompts),
]:
    for _p in _prompts:
        if _p['type'] == "single_choice":
            continue
        _reader_cfg = {
            "input_columns": ['question'],
            "output_column": 'answer',
        }
        _infer_cfg = {
            "ice_template": {
                "type": PromptTemplate,
                "template": {
                    "round": [{
                        "role": "HUMAN",
                        "prompt": _p['prefix_prompt'] + '{question}'
                    }]
                },
                "ice_token": "</E>"
            },
            "retriever": {
                "type": ZeroRetriever
            },
            "inferencer": {
                "type": GenInferencer,
                "max_out_len": 1024,
            }
        }
        _eval_cfg = {
            "evaluator": {
                "type": "GaokaoBenchEvaluator" + "_" + _p['type'],
            },
            "pred_role": "BOT",
        }
        _base_path = './data/GAOKAO-BENCH/data'
        _dataset = {
            "type": GaokaoBenchDataset,
            "abbr": "GaokaoBench_" + _p['keyword'],
            "path": _base_path + '/' + _folder + '/' + _p['keyword'] + ".json",
            "reader_cfg": _reader_cfg,
            "infer_cfg": _infer_cfg,
            "eval_cfg": _eval_cfg,
        }

        GaokaoBench_datasets.append(_dataset)

_folder = "Multiple-choice_Questions"
for _p in _MCQ_prompts:
    if _p['type'] != "single_choice":
        continue
    _reader_cfg = {
        "input_columns": ['question'],
        "output_column": 'answer',
    }
    _infer_cfg = {
        "ice_template": {
            "type": PromptTemplate,
            "template": {
                answer: {
                    "round": [{
                        "role": "HUMAN",
                        "prompt": _p['prefix_prompt'] + '{question}'
                    }, {
                        "role": "BOT",
                        "prompt": f"【答案】{answer} <eoa>"
                    }]
                }
                for answer in ['A', 'B', 'C', 'D']
            },
            "ice_token": "</E>"
        },
        "retriever": {
            "type": ZeroRetriever
        },
        "inferencer": {
            "type": PPLInferencer
        }
    }
    _eval_cfg = {
        "evaluator": {
            "type": "GaokaoBenchEvaluator" + "_" + _p['type'],
        },
        "pred_role": "BOT",
    }
    _base_path = './data/GAOKAO-BENCH/data'
    _dataset = {
        "type": GaokaoBenchDataset,
        "abbr": "GaokaoBench_" + _p['keyword'],
        "path": _base_path + '/' + _folder + '/' + _p['keyword'] + ".json",
        "reader_cfg": _reader_cfg,
        "infer_cfg": _infer_cfg,
        "eval_cfg": _eval_cfg,
    }

    GaokaoBench_datasets.append(_dataset)

_temporary_variables = [k for k in globals() if k.startswith('_')]
for _t in _temporary_variables:
    del globals()[_t]
del _temporary_variables, _t
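What makes this the "mixed" variant is the split between the two loops: the first skips every single_choice entry and evaluates the remaining question types generatively, while the second handles the single-choice MCQs with PPLInferencer by expanding the template into one candidate dialogue per option letter. A plain-dict sketch of that expansion (the prefix string is abbreviated here for illustration):

prefix = "请你做一道数学选择题……题目如下:"  # abbreviated stand-in
template = {
    answer: {
        "round": [
            {"role": "HUMAN", "prompt": prefix + "{question}"},
            # Each candidate dialogue ends with a different forced answer;
            # the PPL inferencer scores all four continuations.
            {"role": "BOT", "prompt": f"【答案】{answer} <eoa>"},
        ]
    }
    for answer in ['A', 'B', 'C', 'D']
}

assert set(template) == {'A', 'B', 'C', 'D'}
assert template['C']['round'][1]['prompt'] == '【答案】C <eoa>'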
@@ -1,4 +0,0 @@
from mmengine.config import read_base

with read_base():
    from .IFEval_gen_3321a3 import humaneval_datasets  # noqa: F401, F403
@@ -1,108 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess

single_choice_prompts = {
    "single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n",
    "single_choice_cn": "以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:",
    "single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
    "single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:",
}

cloze_prompts = {
    "cloze_cn": [
        dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'),
        dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
        dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
        dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
        dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?'),
        dict(role='BOT', prompt='A: 黎恩有32块巧克力,Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
        dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在Jason只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?'),
        dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
        dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
        dict(role='BOT', prompt='A: 她有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9\n'),
        dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
        dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29\n'),
        dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
        dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
        dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
        dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
        dict(role='HUMAN', prompt='Q: {question}'),
        dict(role='BOT', prompt='A: {answer}'),
    ],
    "cloze_en": [
        dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
        dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
        dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
        dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
        dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
        dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
        dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
        dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
        dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
        dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
        dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
        dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
        dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
        dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
        dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
        dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
        dict(role='HUMAN', prompt='Q: {question}'),
        dict(role='BOT', prompt='A: {answer}\n'),
    ],
}

mathbench_sets = {
    'college': ['single_choice_cn', 'single_choice_en'],
    'high': ['single_choice_cn', 'single_choice_en'],
    'middle': ['single_choice_cn', 'single_choice_en'],
    'primary': ['cloze_cn', 'cloze_en'],
    'calculate': ['cloze_en'],
}

# Generate reasoning path or not, only for single choice
with_reasoning = True

# Use circular evaluation or not
with_circular_eval = True

mathbench_datasets = []

for _split in list(mathbench_sets.keys()):
    for _name in mathbench_sets[_split]:
        mathbench_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(
                            role="HUMAN",
                            prompt=single_choice_prompts[_name + "_with_reasoning"] if with_reasoning else single_choice_prompts[_name],
                        ),
                        dict(role="BOT", prompt="{answer}")] if 'choice' in _name else cloze_prompts[_name],
                ),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=512),
        )

        mathbench_eval_cfg = dict(
            evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
            pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))

        mathbench_datasets.append(
            dict(
                abbr="mathbench-" + _split + '-' + _name,
                type=MathBenchDataset,
                path=f"./data/mathbench/{_split}",
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=dict(
                    input_columns=["question"],
                    output_column="answer"
                ),
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            ))
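The with_circular_eval flag above pairs with the dataset's with_circular option: as far as this config is concerned, each choice question is re-asked with its options circularly shifted, and CircularEvaluator then scores how consistently the model tracks the correct option across the rotations (the exact metrics live inside OpenCompass). A sketch of generating the rotations:

def circular_variants(options):
    # ABCD -> BCDA -> CDAB -> DABC: every circular shift of the option list.
    return [options[i:] + options[:i] for i in range(len(options))]

assert circular_variants(['A', 'B', 'C', 'D'])[1] == ['B', 'C', 'D', 'A']
assert len(circular_variants(['A', 'B', 'C', 'D'])) == 4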
@@ -1,128 +0,0 @@
|
|||||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
|
||||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
|
||||||
from opencompass.openicl.icl_inferencer import AgentInferencer
|
|
||||||
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
|
|
||||||
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
|
|
||||||
from opencompass.utils.text_postprocessors import first_option_postprocess
|
|
||||||
|
|
||||||
|
|
||||||
PROMPT_EN = {
|
|
||||||
"FEWSHOT_INSTRUCTION_CLOZE" : [
|
|
||||||
dict(role='HUMAN', prompt='Mark\'s basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What\'s the total number of points scored by both teams added together?'),
|
|
||||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result\n```'),
|
|
||||||
dict(role='SYSTEM', prompt='Response:210'),
|
|
||||||
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 210'),
|
|
||||||
|
|
||||||
dict(role='HUMAN', prompt='Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?'),
|
|
||||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n marbles = 60\n num_increased_marbles = marbles * 2 / 5\n num_total_marbles = marbles + num_increased_marbles\n frisbees = marbles / 2\n num_increased_frisbees = frisbees * 2 / 5\n num_total_frisbees = frisbees + num_increased_frisbees\n deck_cards = frisbees - 20\n num_increased_deck_cards = deck_cards * 2 / 5\n num_total_deck_cards = deck_cards + num_increased_deck_cards\n num_total = num_total_marbles + num_total_frisbees + num_total_deck_cards\n result = num_total\n return result\n```'),
|
|
||||||
dict(role='SYSTEM', prompt='Response:140'),
|
|
||||||
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 140'),
|
|
||||||
|
|
||||||
dict(role='HUMAN', prompt='A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?'),
|
|
||||||
dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n num_fruits_per_first_three_basket = 9 + 15 + 14\n num_fruits_first_three_basket = num_fruits_per_first_three_basket * 3\n num_apple_fourth_basket = 9 - 2\n num_orange_fourth_basket = 15 - 2\n num_banana_fourth_basket = 14 - 2\n num_fruits_fourth_basket = num_apple_fourth_basket + num_orange_fourth_basket + num_banana_fourth_basket\n num_fruits_total = num_fruits_first_three_basket + num_fruits_fourth_basket\n result = num_fruits_total\n return result\n```"""),
|
|
||||||
dict(role='SYSTEM', prompt='Response:146'),
|
|
||||||
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 146'),
|
|
||||||
|
|
||||||
dict(role='HUMAN', prompt='{question}'),
|
|
||||||
],
|
|
||||||
"FEWSHOT_INSTRUCTION_CHOICE" : [
|
|
||||||
dict(role='HUMAN', prompt='Given point P(-1,4) lies on the graph of the inverse proportionality function $y=\\frac{{k}}{{x}}$ (k≠0), what is the value of k? A. $-\\frac{{1}}{{4}}$ B. $\\frac{{1}}{{4}}$ C. $4$ D. $-4$'),
|
|
||||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\nfrom sympy import solve, symbols, Ne, Eq\ndef solution():\n k = symbols(\'k\')\n result = solve([Eq(4, k / (-1)), Ne(k, 0)], k, dict=True)\n return result\n```'),
|
|
||||||
dict(role='SYSTEM', prompt='Response:{{Eq(k, -4)}}'),
|
|
||||||
dict(role='BOT', prompt='Thought: Substituting point P into the function yields the value of k as $-4$\nFinalAnswer: D'),
|
|
||||||
|
|
||||||
dict(role='HUMAN', prompt='The graph of the power function $y=(x)$ passes through the point$ (2, \\dfrac {{1}}{{4}}) $, what is the value of $f(-3)$? A. $\\frac{{1}}{{9}}$ B. $\\frac{{1}}{{8}})=196-x$ C. $\\frac{{2}}{{9}}$ D. $\\frac{{1}}{{4}}$'),
|
|
||||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\nfrom sympy import *\ndef solution():\n x, y, k = symbols(\'x y k\')\n eq1 = Eq(2**k, Rational(1, 4))\n k_value = solve(eq1, k)[0]\n y = x**k_value\n result = y.subs(x, -3)\n return result\n```'),
|
|
||||||
dict(role='SYSTEM', prompt='Response:1/9'),
|
|
||||||
dict(role='BOT', prompt='Thought: The functional expression of the power function is $y=x^{{-2}}$. Substituting $x=-3$ yields $y=$\\frac{{1}}{{9}}$\nFinalAnswer: A'),
|
|
||||||
|
|
||||||
dict(role='HUMAN', prompt='If $3 x-y=12$, what is the value of $\\frac{8^{x}}{2^{y}} ?$\nA. The value cannot be determined from the information given.\nB. $2^{12}$\nC. 4\nD. $8^{2}$'),
|
|
||||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\nfrom sympy import symbols, Eq, solve\n\ndef sloution():\n x, y = symbols(\'x y\')\n equation = Eq(3*x - y, 12)\n\n y_in_terms_of_x = solve(equation, y)[0]\n expression = 8**x / 2**y_in_terms_of_x\n result = expression.simplify()\n return result\n```'),
|
|
||||||
dict(role='SYSTEM', prompt='Response:2**12'),
|
|
||||||
dict(role='BOT', prompt='Thought: The value of $\\frac{8^{x}}{2^{y}}$ is $2^{12}$\nFinalAnswer: B'),
|
|
||||||
|
|
||||||
dict(role='HUMAN', prompt='{question}'),
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
PROMPT_CN = {
|
|
||||||
"FEWSHOT_INSTRUCTION_CLOZE" : [
|
|
||||||
dict(role='HUMAN', prompt='Mark的篮球队得到25个2分球,8个3分球和10个罚球。他们的对手得到2分球的两倍,但3分球和罚球的一半。两队得分的总和是多少?'),
|
|
||||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result\n```'),
|
|
||||||
dict(role='SYSTEM', prompt='Response:210'),
|
|
||||||
dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 210'),
|
|

        dict(role='HUMAN', prompt='Bella有两倍于飞盘的弹珠。她还比卡片多20个飞盘。如果她买每种物品多2/5,她会有多少总数的物品,如果她现在有60颗弹珠?'),
        dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n    marbles = 60\n    num_increased_marbles = marbles * 2 / 5\n    num_total_marbles = marbles + num_increased_marbles\n    frisbees = marbles / 2\n    num_increased_frisbees = frisbees * 2 / 5\n    num_total_frisbees = frisbees + num_increased_frisbees\n    deck_cards = frisbees - 20\n    num_increased_deck_cards = deck_cards * 2 / 5\n    num_total_deck_cards = deck_cards + num_increased_deck_cards\n    num_total = num_total_marbles + num_total_frisbees + num_total_deck_cards\n    result = num_total\n    return result\n```'),
        dict(role='SYSTEM', prompt='Response:140'),
        dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 140'),

        dict(role='HUMAN', prompt='一个有4个水果篮子,前三个篮子里有9个苹果、15个橙子和14个香蕉,第四个篮子里每种水果都少2个。总共有多少水果?'),
        dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n    num_fruits_per_first_three_basket = 9 + 15 + 14\n    num_fruits_first_three_basket = num_fruits_per_first_three_basket * 3\n    num_apple_fourth_basket = 9 - 2\n    num_orange_fourth_basket = 15 - 2\n    num_banana_fourth_basket = 14 - 2\n    num_fruits_fourth_basket = num_apple_fourth_basket + num_orange_fourth_basket + num_banana_fourth_basket\n    num_fruits_total = num_fruits_first_three_basket + num_fruits_fourth_basket\n    result = num_fruits_total\n    return result\n```"""),
        dict(role='SYSTEM', prompt='Response:146'),
        dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 146'),

        dict(role='HUMAN', prompt='{question}'),
    ],
    "FEWSHOT_INSTRUCTION_CHOICE": [
        dict(role='HUMAN', prompt='已知点P(-1,4)在反比例函数$y=\\frac{{k}}{{x}}$ (k≠0)的图象上,则k的值是____'),
        dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\nfrom sympy import solve, symbols, Ne, Eq\ndef solution():\n    k = symbols(\'k\')\n    result = solve([Eq(4, k / (-1)), Ne(k, 0)], k, dict=True)\n    return result\n```'),
        dict(role='SYSTEM', prompt='Response:{{Eq(k, -4)}}'),
        dict(role='BOT', prompt='Thought: 将点 P 代入函数解出 k 的值为 $-4$\nFinalAnswer: D'),

        dict(role='HUMAN', prompt='幂函数$ y=f(x) $的图象经过点$ (2, \\dfrac {{1}}{{4}}) $,则$ f(-3) $的值为 ______ .'),
        dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\nfrom sympy import *\ndef solution():\n    x, y, k = symbols(\'x y k\')\n    eq1 = Eq(2**k, Rational(1, 4))\n    k_value = solve(eq1, k)[0]\n    y = x**k_value\n    result = y.subs(x, -3)\n    return result\n```'),
        dict(role='SYSTEM', prompt='Response:1/9'),
        dict(role='BOT', prompt='Thought: 求出幂函数的函数表达式为 $y=x^{{-2}}$,代入 $x=-3$ 得到 $y=\\frac{{1}}{{9}}$\nFinalAnswer: A'),

        dict(role='HUMAN', prompt='如果$3 x-y=12$,则$\\frac{8^{x}}{2^{y}}$的值是多少?\nA. 无法从给定的信息中确定值。\nB. $2^{12}$\nC. 4\nD. $8^{2}$'),
        dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\nfrom sympy import symbols, Eq, solve\n\ndef solution():\n    x, y = symbols(\'x y\')\n    equation = Eq(3*x - y, 12)\n\n    y_in_terms_of_x = solve(equation, y)[0]\n    expression = 8**x / 2**y_in_terms_of_x\n    result = expression.simplify()\n    return result\n```'),
        dict(role='SYSTEM', prompt='Response:2**12'),
        dict(role='BOT', prompt='Thought: $\\frac{8^{x}}{2^{y}}$的值是$2^{12}$\nFinalAnswer: B'),

        dict(role='HUMAN', prompt='{question}'),
    ]
}

mathbench_sets = {
    'college': ['single_choice_cn', 'cloze_en'],
    'high': ['single_choice_cn', 'single_choice_en'],
    'middle': ['single_choice_cn'],
    'primary': ['cloze_cn'],
    'primary_refine': ['refine_cloze_cn']
}

# Use circular evaluation or not
with_circular_eval = True

mathbench_agent_datasets = []

for _split in list(mathbench_sets.keys()):
    for _name in mathbench_sets[_split]:
        prompt_example = PROMPT_CN if '_cn' in _name else PROMPT_EN
        mathbench_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=prompt_example["FEWSHOT_INSTRUCTION_CLOZE"] if 'cloze' in _name else prompt_example["FEWSHOT_INSTRUCTION_CHOICE"])),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=AgentInferencer))

        mathbench_eval_cfg = dict(
            evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
            pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))

        mathbench_agent_datasets.append(
            dict(
                abbr="mathbench-" + _split + '-' + _name + '-agent',
                type=MathBenchDataset,
                path=f"./data/mathbench/{_split}",
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=dict(
                    input_columns=["question"],
                    output_column="answer"
                ),
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            ))
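# A minimal sketch of how `mathbench_agent_datasets` is consumed, assuming the
# usual mmengine pattern shown by the mathbench_gen entry later in this diff;
# the module path `.mathbench_agent_gen` is an illustrative name, not one
# fixed by these files:
#
#     from mmengine.config import read_base
#
#     with read_base():
#         from .mathbench_agent_gen import mathbench_agent_datasets  # noqa: F401
#
#     datasets = [*mathbench_agent_datasets]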
@@ -1,130 +0,0 @@
# GONNA BE DEPRECATED, DON'T USE IT

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess


PROMPT_EN = {
    "FEWSHOT_INSTRUCTION_CLOZE": [
        dict(role='HUMAN', prompt='Mark\'s basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What\'s the total number of points scored by both teams added together?'),
        dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n    mark_pointers_2 = 25 * 2\n    mark_pointers_3 = 8 * 3\n    mark_free_throws = 10 * 1\n    mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n    opponents_pointers_2 = mark_pointers_2 * 2\n    opponents_pointers_3 = mark_pointers_3 / 2\n    opponents_free_throws = mark_free_throws / 2\n    opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n    total_points_scored = mark_points_scored + opponents_points_scored\n    result = total_points_scored\n    return result'),
        dict(role='SYSTEM', prompt='Response:210'),
        dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 210'),

        dict(role='HUMAN', prompt='Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?'),
        dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n    marbles = 60\n    num_increased_marbles = marbles * 2 / 5\n    num_total_marbles = marbles + num_increased_marbles\n    frisbees = marbles / 2\n    num_increased_frisbees = frisbees * 2 / 5\n    num_total_frisbees = frisbees + num_increased_frisbees\n    deck_cards = frisbees - 20\n    num_increased_deck_cards = deck_cards * 2 / 5\n    num_total_deck_cards = deck_cards + num_increased_deck_cards\n    num_total = num_total_marbles + num_total_frisbees + num_total_deck_cards\n    result = num_total\n    return result'),
        dict(role='SYSTEM', prompt='Response:140'),
        dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 140'),

        dict(role='HUMAN', prompt='A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?'),
        dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():\n    num_fruits_per_first_three_basket = 9 + 15 + 14\n    num_fruits_first_three_basket = num_fruits_per_first_three_basket * 3\n    num_apple_fourth_basket = 9 - 2\n    num_orange_fourth_basket = 15 - 2\n    num_banana_fourth_basket = 14 - 2\n    num_fruits_fourth_basket = num_apple_fourth_basket + num_orange_fourth_basket + num_banana_fourth_basket\n    num_fruits_total = num_fruits_first_three_basket + num_fruits_fourth_basket\n    result = num_fruits_total\n    return result"""),
        dict(role='SYSTEM', prompt='Response:146'),
        dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 146'),

        dict(role='HUMAN', prompt='{question}'),
    ],
    "FEWSHOT_INSTRUCTION_CHOICE": [
        dict(role='HUMAN', prompt='Given point P(-1,4) lies on the graph of the inverse proportionality function $y=\\frac{{k}}{{x}}$ (k≠0), what is the value of k? A. $-\\frac{{1}}{{4}}$ B. $\\frac{{1}}{{4}}$ C. $4$ D. $-4$'),
        dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import solve, symbols, Ne, Eq\ndef solution():\n    k = symbols(\'k\')\n    result = solve([Eq(4, k / (-1)), Ne(k, 0)], k, dict=True)\n    return result'),
        dict(role='SYSTEM', prompt='Response:{{Eq(k, -4)}}'),
        dict(role='BOT', prompt='Thought: Substituting point P into the function yields the value of k as $-4$\nFinalAnswer: D'),

        dict(role='HUMAN', prompt='The graph of the power function $y=f(x)$ passes through the point $ (2, \\dfrac {{1}}{{4}}) $, what is the value of $f(-3)$? A. $\\frac{{1}}{{9}}$ B. $\\frac{{1}}{{8}}$ C. $\\frac{{2}}{{9}}$ D. $\\frac{{1}}{{4}}$'),
        dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import *\ndef solution():\n    x, y, k = symbols(\'x y k\')\n    eq1 = Eq(2**k, Rational(1, 4))\n    k_value = solve(eq1, k)[0]\n    y = x**k_value\n    result = y.subs(x, -3)\n    return result'),
        dict(role='SYSTEM', prompt='Response:1/9'),
        dict(role='BOT', prompt='Thought: The functional expression of the power function is $y=x^{{-2}}$. Substituting $x=-3$ yields $y=\\frac{{1}}{{9}}$\nFinalAnswer: A'),

        dict(role='HUMAN', prompt='If $3 x-y=12$, what is the value of $\\frac{8^{x}}{2^{y}} ?$\nA. The value cannot be determined from the information given.\nB. $2^{12}$\nC. 4\nD. $8^{2}$'),
        dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import symbols, Eq, solve\n\ndef solution():\n    x, y = symbols(\'x y\')\n    equation = Eq(3*x - y, 12)\n\n    y_in_terms_of_x = solve(equation, y)[0]\n    expression = 8**x / 2**y_in_terms_of_x\n    result = expression.simplify()\n    return result'),
        dict(role='SYSTEM', prompt='Response:2**12'),
        dict(role='BOT', prompt='Thought: The value of $\\frac{8^{x}}{2^{y}}$ is $2^{12}$\nFinalAnswer: B'),

        dict(role='HUMAN', prompt='{question}'),
    ]
}

PROMPT_CN = {
    "FEWSHOT_INSTRUCTION_CLOZE": [
        dict(role='HUMAN', prompt='Mark的篮球队得到25个2分球,8个3分球和10个罚球。他们的对手得到2分球的两倍,但3分球和罚球的一半。两队得分的总和是多少?'),
        dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n    mark_pointers_2 = 25 * 2\n    mark_pointers_3 = 8 * 3\n    mark_free_throws = 10 * 1\n    mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n    opponents_pointers_2 = mark_pointers_2 * 2\n    opponents_pointers_3 = mark_pointers_3 / 2\n    opponents_free_throws = mark_free_throws / 2\n    opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n    total_points_scored = mark_points_scored + opponents_points_scored\n    result = total_points_scored\n    return result'),
        dict(role='SYSTEM', prompt='Response:210'),
        dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 210'),

        dict(role='HUMAN', prompt='Bella有两倍于飞盘的弹珠。她还比卡片多20个飞盘。如果她买每种物品多2/5,她会有多少总数的物品,如果她现在有60颗弹珠?'),
        dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n    marbles = 60\n    num_increased_marbles = marbles * 2 / 5\n    num_total_marbles = marbles + num_increased_marbles\n    frisbees = marbles / 2\n    num_increased_frisbees = frisbees * 2 / 5\n    num_total_frisbees = frisbees + num_increased_frisbees\n    deck_cards = frisbees - 20\n    num_increased_deck_cards = deck_cards * 2 / 5\n    num_total_deck_cards = deck_cards + num_increased_deck_cards\n    num_total = num_total_marbles + num_total_frisbees + num_total_deck_cards\n    result = num_total\n    return result'),
        dict(role='SYSTEM', prompt='Response:140'),
        dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 140'),

        dict(role='HUMAN', prompt='一个有4个水果篮子,前三个篮子里有9个苹果、15个橙子和14个香蕉,第四个篮子里每种水果都少2个。总共有多少水果?'),
        dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():\n    num_fruits_per_first_three_basket = 9 + 15 + 14\n    num_fruits_first_three_basket = num_fruits_per_first_three_basket * 3\n    num_apple_fourth_basket = 9 - 2\n    num_orange_fourth_basket = 15 - 2\n    num_banana_fourth_basket = 14 - 2\n    num_fruits_fourth_basket = num_apple_fourth_basket + num_orange_fourth_basket + num_banana_fourth_basket\n    num_fruits_total = num_fruits_first_three_basket + num_fruits_fourth_basket\n    result = num_fruits_total\n    return result"""),
        dict(role='SYSTEM', prompt='Response:146'),
        dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 146'),

        dict(role='HUMAN', prompt='{question}'),
    ],
    "FEWSHOT_INSTRUCTION_CHOICE": [
        dict(role='HUMAN', prompt='已知点P(-1,4)在反比例函数$y=\\frac{{k}}{{x}}$ (k≠0)的图象上,则k的值是____'),
        dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import solve, symbols, Ne, Eq\ndef solution():\n    k = symbols(\'k\')\n    result = solve([Eq(4, k / (-1)), Ne(k, 0)], k, dict=True)\n    return result'),
        dict(role='SYSTEM', prompt='Response:{{Eq(k, -4)}}'),
        dict(role='BOT', prompt='Thought: 将点 P 代入函数解出 k 的值为 $-4$\nFinalAnswer: D'),

        dict(role='HUMAN', prompt='幂函数$ y=f(x) $的图象经过点$ (2, \\dfrac {{1}}{{4}}) $,则$ f(-3) $的值为 ______ .'),
        dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import *\ndef solution():\n    x, y, k = symbols(\'x y k\')\n    eq1 = Eq(2**k, Rational(1, 4))\n    k_value = solve(eq1, k)[0]\n    y = x**k_value\n    result = y.subs(x, -3)\n    return result'),
        dict(role='SYSTEM', prompt='Response:1/9'),
        dict(role='BOT', prompt='Thought: 求出幂函数的函数表达式为 $y=x^{{-2}}$,代入 $x=-3$ 得到 $y=\\frac{{1}}{{9}}$\nFinalAnswer: A'),

        dict(role='HUMAN', prompt='如果$3 x-y=12$,则$\\frac{8^{x}}{2^{y}}$的值是多少?\nA. 无法从给定的信息中确定值。\nB. $2^{12}$\nC. 4\nD. $8^{2}$'),
        dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import symbols, Eq, solve\n\ndef solution():\n    x, y = symbols(\'x y\')\n    equation = Eq(3*x - y, 12)\n\n    y_in_terms_of_x = solve(equation, y)[0]\n    expression = 8**x / 2**y_in_terms_of_x\n    result = expression.simplify()\n    return result'),
        dict(role='SYSTEM', prompt='Response:2**12'),
        dict(role='BOT', prompt='Thought: $\\frac{8^{x}}{2^{y}}$的值是$2^{12}$\nFinalAnswer: B'),

        dict(role='HUMAN', prompt='{question}'),
    ]
}

mathbench_sets = {
    'college': ['single_choice_cn', 'cloze_en'],
    'high': ['single_choice_cn', 'single_choice_en'],
    'middle': ['single_choice_cn'],
    'primary': ['cloze_cn'],
    'primary_refine': ['refine_cloze_cn']
}

# Use circular evaluation or not
with_circular_eval = True

mathbench_agent_datasets = []

for _split in list(mathbench_sets.keys()):
    for _name in mathbench_sets[_split]:
        prompt_example = PROMPT_CN if '_cn' in _name else PROMPT_EN
        mathbench_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=prompt_example["FEWSHOT_INSTRUCTION_CLOZE"] if 'cloze' in _name else prompt_example["FEWSHOT_INSTRUCTION_CHOICE"])),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=AgentInferencer))

        mathbench_eval_cfg = dict(
            evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
            pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))

        mathbench_agent_datasets.append(
            dict(
                abbr="mathbench-" + _split + '-' + _name + '-agent',
                type=MathBenchDataset,
                path=f"./data/mathbench/{_split}",
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=dict(
                    input_columns=["question"],
                    output_column="answer"
                ),
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            ))
@@ -1,58 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess

cloze_prompts = {
    "cloze_arith_en": [
        dict(role='HUMAN', prompt='Q: Calculate (341/11)/(9/(-6)*(-2)/3).'),
        dict(role='BOT', prompt='A: First, (9/(-6)*(-2)/3) can be simplified by: 9/(-6) = -1.5, -1.5 * (-2) = 3, 3 / 3 = 1. So, (9/(-6)*(-2)/3) is equal to 1. Now, we have `(341/11)/1` equals `341/11`. Finally, calculate `341/11 = 31`. The answer is 31.\n'),
        dict(role='HUMAN', prompt='Q: In base 14, what is 5 - 638d8d?'),
        dict(role='BOT', prompt='A: 5 - 638d8d = -638d88. The answer is -638d88.\n'),
        dict(role='HUMAN', prompt='Q: What is -491354 times -0.34?'),
        dict(role='BOT', prompt='A: The product of -491354 and -0.34 is 167060.36. The answer is 167060.36.\n'),
        dict(role='HUMAN', prompt='Q: What is the value of (-55)/(6930/(-382)) + (0 - 3)?'),
        dict(role='BOT', prompt='A: First, (-55)/(6930/(-382)) = (-55)/(-(6930/382)) = 55*382/6930 = 21010/6930 = 2101/693. Then, 2101/693 + (0 - 3) = 2101/693 - 3 = 2101/693 - 3*693/693 = (2101-2079)/693 = 22/693 = 2/63. The answer is 2/63.\n'),
        dict(role='HUMAN', prompt='Q: {question}'),
        dict(role='BOT', prompt='A: {answer}\n'),
    ]
}

mathbench_sets = {
    'arithmetic': ['cloze_arith_en'],
}

mathbench_datasets = []

for _split in list(mathbench_sets.keys()):
    for _name in mathbench_sets[_split]:
        mathbench_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=cloze_prompts[_name],
                ),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=512),
        )

        mathbench_eval_cfg = dict(
            evaluator=dict(type=AccEvaluator),
            pred_postprocessor=dict(type=mathbench_postprocess, name=_name))

        mathbench_datasets.append(
            dict(
                type=MathBenchDataset,
                path=f"./data/mathbench/{_split}",
                name=_name,
                with_circular=False,
                abbr="mathbench-arithmetic" + _split + '-' + _name,
                reader_cfg=dict(
                    input_columns=["question"],
                    output_column="answer"
                ),
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            ))
@@ -1,110 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess


single_choice_prompts = {
    "single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n",
    "single_choice_cn": "以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:",
    "single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
    "single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:",
}

cloze_prompts = {
    "cloze_cn": [
        dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'),
        dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
        dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
        dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
        dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?'),
        dict(role='BOT', prompt='A: 黎恩有32块巧克力,黎恩的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
        dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在杰森只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?'),
        dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
        dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
        dict(role='BOT', prompt='A: 他有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9\n'),
        dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
        dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29\n'),
        dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
        dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
        dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
        dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
        dict(role='HUMAN', prompt='Q: {question}'),
        dict(role='BOT', prompt='A: {answer}'),
    ],
    "cloze_en": [
        dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
        dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
        dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
        dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
        dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
        dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
        dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
        dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
        dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
        dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
        dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
        dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
        dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
        dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
        dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
        dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
        dict(role='HUMAN', prompt='Q: {question}'),
        dict(role='BOT', prompt='A: {answer}\n'),
    ]}
cloze_prompts['refine_cloze_cn'] = cloze_prompts['cloze_cn']

mathbench_sets = {
    'college': ['single_choice_cn', 'cloze_en'],
    'high': ['single_choice_cn', 'single_choice_en'],
    'middle': ['single_choice_cn'],
    'primary': ['cloze_cn'],
    'primary_refine': ['refine_cloze_cn']
}

# Generate reasoning path or not, only for single choice
with_reasoning = True

# Use circular evaluation or not
with_circular_eval = True

mathbench_datasets = []

for _split in list(mathbench_sets.keys()):
    for _name in mathbench_sets[_split]:
        mathbench_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(
                            role="HUMAN",
                            prompt=single_choice_prompts[_name + "_with_reasoning"] if with_reasoning else single_choice_prompts[_name],
                        ),
                        dict(role="BOT", prompt="{answer}")] if 'choice' in _name else cloze_prompts[_name],
                ),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=512),
        )

        mathbench_eval_cfg = dict(
            evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
            pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))

        mathbench_datasets.append(
            dict(
                abbr="mathbench-" + _split + '-' + _name,
                type=MathBenchDataset,
                path=f"./data/mathbench/{_split}",
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=dict(
                    input_columns=["question"],
                    output_column="answer"
                ),
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            ))
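# A note on the `with_circular_eval` switch used throughout these MathBench
# configs, under the assumption that CircularEvaluator behaves as its name
# suggests: each single-choice item is re-asked under rotations of its option
# order (ABCD -> BCDA -> CDAB -> DABC), and the evaluator scores items across
# all rotations, which damps option-position bias. An illustrative rotation
# step (hypothetical helper, not taken from these configs):
#
#     def rotate_options(options):
#         """[A, B, C, D] -> the 4 cyclic orderings of the same choices."""
#         return [options[i:] + options[:i] for i in range(len(options))]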
@@ -1,4 +0,0 @@
from mmengine.config import read_base

with read_base():
    from .mathbench_2024_gen_de9ff9 import mathbench_datasets  # noqa: F401, F403
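# Assuming the standard OpenCompass workflow, collection files like the one
# above are imported from a top-level config, which is then launched with the
# repository's entry script, e.g.:
#
#     python run.py configs/eval_mathbench.py   # config name is illustrative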
@@ -1,110 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess


single_choice_prompts = {
    "single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题,请你一步一步推理,并在最后用“所以答案为选项X”给出答案,其中“X”为选项A,B,C,D中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n",
    "single_choice_cn": "以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:",
    "single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', where 'X' is the correct option you think from A,B,C,D. Here is the question you need to answer:\n{question}\nLet's think step by step:",
    "single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:",
}

cloze_prompts = {
    "cloze_cn": [
        dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后,将有21棵树。林务工人员今天种植了多少棵树?'),
        dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以,他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
        dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
        dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
        dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?'),
        dict(role='BOT', prompt='A: 黎恩有32块巧克力,黎恩的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
        dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在杰森只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?'),
        dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
        dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具?'),
        dict(role='BOT', prompt='A: 他有5个玩具。他从妈妈那里得到了2个,所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个,所以总共他有7 + 2 = 9个玩具。答案是 9\n'),
        dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
        dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29\n'),
        dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二,他丢失了23个高尔夫球。星期三,他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
        dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
        dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
        dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
        dict(role='HUMAN', prompt='Q: {question}'),
        dict(role='BOT', prompt='A: {answer}'),
    ],
    "cloze_en": [
        dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
        dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
        dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
        dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
        dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
        dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
        dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
        dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
        dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
        dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
        dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
        dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
        dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
        dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
        dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
        dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
        dict(role='HUMAN', prompt='Q: {question}'),
        dict(role='BOT', prompt='A: {answer}\n'),
    ]}
cloze_prompts['refine_cloze_cn'] = cloze_prompts['cloze_cn']

mathbench_sets = {
    'college': ['single_choice_cn', 'cloze_en'],
    'high': ['single_choice_cn', 'single_choice_en'],
    'middle': ['single_choice_cn'],
    'primary': ['cloze_cn'],
    'primary_refine': ['refine_cloze_cn']
}

# Generate reasoning path or not, only for single choice
with_reasoning = False

# Use circular evaluation or not
with_circular_eval = True

mathbench_datasets = []

for _split in list(mathbench_sets.keys()):
    for _name in mathbench_sets[_split]:
        mathbench_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(
                            role="HUMAN",
                            prompt=single_choice_prompts[_name + "_with_reasoning"] if with_reasoning else single_choice_prompts[_name],
                        ),
                        dict(role="BOT", prompt="{answer}")] if 'choice' in _name else cloze_prompts[_name],
                ),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=512),
        )

        mathbench_eval_cfg = dict(
            evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
            pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))

        mathbench_datasets.append(
            dict(
                abbr="mathbench-" + _split + '-' + _name,
                type=MathBenchDataset,
                path=f"./data/mathbench/{_split}",
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=dict(
                    input_columns=["question"],
                    output_column="answer"
                ),
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            ))
@@ -1,59 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.NPHardEval import (
    hard_GCP_Dataset, hard_GCP_Evaluator,
    hard_TSP_Dataset, hard_TSP_Evaluator,
    hard_MSP_Dataset, hard_MSP_Evaluator,
    cmp_GCP_D_Dataset, cmp_GCP_D_Evaluator,
    cmp_TSP_D_Dataset, cmp_TSP_D_Evaluator,
    cmp_KSP_Dataset, cmp_KSP_Evaluator,
    p_BSP_Dataset, p_BSP_Evaluator,
    p_EDP_Dataset, p_EDP_Evaluator,
    p_SPP_Dataset, p_SPP_Evaluator,
)

NPHardEval_tasks = [
    ["hard_GCP", "GCP", hard_GCP_Dataset, hard_GCP_Evaluator],
    ["hard_TSP", "TSP", hard_TSP_Dataset, hard_TSP_Evaluator],
    ["hard_MSP", "MSP", hard_MSP_Dataset, hard_MSP_Evaluator],
    ["cmp_GCP_D", "GCP_Decision", cmp_GCP_D_Dataset, cmp_GCP_D_Evaluator],
    ["cmp_TSP_D", "TSP_Decision", cmp_TSP_D_Dataset, cmp_TSP_D_Evaluator],
    ["cmp_KSP", "KSP", cmp_KSP_Dataset, cmp_KSP_Evaluator],
    ["p_BSP", "BSP", p_BSP_Dataset, p_BSP_Evaluator],
    ["p_EDP", "EDP", p_EDP_Dataset, p_EDP_Evaluator],
    ["p_SPP", "SPP", p_SPP_Dataset, p_SPP_Evaluator],
]

NPHardEval_datasets = []
for name, path_name, dataset, evaluator in NPHardEval_tasks:
    NPHardEval_reader_cfg = dict(input_columns=["prompt", "level"], output_column="q")

    NPHardEval_infer_cfg = dict(
        ice_template=dict(
            type=PromptTemplate,
            template=dict(
                begin="</E>",
                round=[
                    dict(role="HUMAN", prompt="</E>{prompt}"),
                    dict(role="BOT", prompt=""),
                ],
            ),
            ice_token="</E>",
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer),
    )

    NPHardEval_eval_cfg = dict(evaluator=dict(type=evaluator), pred_role="BOT")

    NPHardEval_datasets.append(
        dict(
            type=dataset,
            abbr=name,
            path=f"./data/NPHardEval/{path_name}/",
            reader_cfg=NPHardEval_reader_cfg,
            infer_cfg=NPHardEval_infer_cfg,
            eval_cfg=NPHardEval_eval_cfg,
        )
    )
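# On the `ice_token="</E>"` used above: a note under the assumption that it
# behaves as elsewhere in OpenICL-style configs, i.e. "</E>" marks where
# retrieved in-context examples would be spliced into the prompt; with
# ZeroRetriever nothing is retrieved, so the marker simply drops out, e.g.:
#
#     prompt = "</E>{prompt}".replace("</E>", "")  # zero-shot: marker vanishes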
@@ -1,54 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import PJExamDataset, PJExamEvaluator

PJExam_datasets = []
for _name in [
    'gk-2022-v1', 'gk-2022-v1-math', 'gk-2023-v1', 'gk-2023-v1-math',
    'gk-2023-v2', 'gk-2023-v2-math', 'zk-2022-v1'
]:
    _hint = "请你做一道</major>选择题\n请你一步一步思考并将思考过程写在【解析】和<eoe>之间。你将从A,B,C,D中选出正确的答案,并写在【答案】和<eoa>之间。\n例如:【答案】A<eoa>\n完整的题目回答的格式如下:\n【解析】...<eoe>\n【答案】...<eoa>\n请你严格按照上述格式作答。\n题目如下:\n"
    _reader_cfg = {
        "input_columns": ['question'],
        "output_column": 'std_ans',
    }
    _infer_cfg = {
        "ice_template": {
            "type": PromptTemplate,
            "template": {
                "round": [{
                    "role": "HUMAN",
                    "prompt": _hint + "{question}",
                }]
            },
            "ice_token": "</E>"
        },
        "retriever": {
            "type": ZeroRetriever
        },
        "inferencer": {
            "type": GenInferencer,
            "max_out_len": 1024,
        }
    }
    _eval_cfg = {
        "evaluator": {
            "type": PJExamEvaluator
        },
        "pred_role": "BOT",
        "ds_column": "eval_infos"
    }
    _dataset = {
        "type": PJExamDataset,
        "abbr": "PJExamDataset-" + _name,
        "path": './data/PJExam',
        "name": _name,
        "reader_cfg": _reader_cfg,
        "infer_cfg": _infer_cfg,
        "eval_cfg": _eval_cfg,
    }

    PJExam_datasets.append(_dataset)

del _name, _hint, _reader_cfg, _infer_cfg, _eval_cfg, _dataset
@@ -1,43 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import AXDataset_V2
from opencompass.utils.text_postprocessors import first_option_postprocess

AX_b_reader_cfg = dict(
    input_columns=["sentence1", "sentence2"],
    output_column="label",
)

AX_b_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="{sentence1}\n{sentence2}\nIs the sentence below entailed by the sentence above?\nA. Yes\nB. No\nAnswer:"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

AX_b_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
)

AX_b_datasets = [
    dict(
        abbr="AX_b",
        type=AXDataset_V2,
        path="./data/SuperGLUE/AX-b/AX-b.jsonl",
        reader_cfg=AX_b_reader_cfg,
        infer_cfg=AX_b_infer_cfg,
        eval_cfg=AX_b_eval_cfg,
    )
]
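# The gen-style SuperGLUE configs above and below rely on
# first_option_postprocess to reduce free-form generations to a single option
# letter. A rough approximation of its behaviour (the real implementation
# lives in opencompass.utils.text_postprocessors and is more forgiving about
# phrasing):
#
#     import re
#
#     def first_option(text, options="AB"):
#         # return the first option letter that appears in the model output
#         match = re.search(rf"[{options}]", text)
#         return match.group(0) if match else ""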
@@ -1,53 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

AX_b_reader_cfg = dict(
    input_columns=["sentence1", "sentence2"],
    output_column="label",
    test_split="train")

AX_b_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            "entailment":
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="{sentence1}\n{sentence2}\nIs the sentence below entailed by the sentence above?"
                ),
                dict(role="BOT", prompt="Yes"),
            ]),
            "not_entailment":
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="{sentence1}\n{sentence2}\nIs the sentence below entailed by the sentence above?"
                ),
                dict(role="BOT", prompt="No"),
            ])
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)

AX_b_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

AX_b_datasets = [
    dict(
        type=HFDataset,
        abbr="AX_b",
        path="json",
        data_files="./data/SuperGLUE/AX-b/AX-b.jsonl",
        split="train",
        reader_cfg=AX_b_reader_cfg,
        infer_cfg=AX_b_infer_cfg,
        eval_cfg=AX_b_eval_cfg,
    )
]
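# For readers comparing this PPL-style config with the generative one above:
# PPLInferencer fills every labeled template ("entailment"/"not_entailment")
# with the same example and scores each completed prompt by model perplexity;
# the label whose template scores lowest is taken as the prediction. A rough
# illustration of that selection step (the scoring helper is assumed):
#
#     def ppl_predict(example, templates, score_ppl):
#         # score_ppl(text) -> perplexity under the model (assumed helper)
#         filled = {label: t.format(**example) for label, t in templates.items()}
#         return min(filled, key=lambda label: score_ppl(filled[label]))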
@@ -1,43 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import AXDataset_V2
from opencompass.utils.text_postprocessors import first_option_postprocess

AX_g_reader_cfg = dict(
    input_columns=["hypothesis", "premise"],
    output_column="label",
)

AX_g_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="{premise}\n{hypothesis}\nIs the sentence below entailed by the sentence above?\nA. Yes\nB. No\nAnswer:"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

AX_g_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
)

AX_g_datasets = [
    dict(
        abbr="AX_g",
        type=AXDataset_V2,
        path="./data/SuperGLUE/AX-g/AX-g.jsonl",
        reader_cfg=AX_g_reader_cfg,
        infer_cfg=AX_g_infer_cfg,
        eval_cfg=AX_g_eval_cfg,
    )
]
@@ -1,53 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

AX_g_reader_cfg = dict(
    input_columns=["hypothesis", "premise"],
    output_column="label",
    test_split="train")

AX_g_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            "entailment":
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="{premise}\n{hypothesis}\nIs the sentence below entailed by the sentence above?"
                ),
                dict(role="BOT", prompt="Yes"),
            ]),
            "not_entailment":
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="{premise}\n{hypothesis}\nIs the sentence below entailed by the sentence above?"
                ),
                dict(role="BOT", prompt="No"),
            ])
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)

AX_g_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

AX_g_datasets = [
    dict(
        type=HFDataset,
        abbr="AX_g",
        path="json",
        data_files="./data/SuperGLUE/AX-g/AX-g.jsonl",
        split="train",
        reader_cfg=AX_g_reader_cfg,
        infer_cfg=AX_g_infer_cfg,
        eval_cfg=AX_g_eval_cfg,
    )
]
@@ -1,41 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BoolQDataset_V2
from opencompass.utils.text_postprocessors import first_capital_postprocess

BoolQ_reader_cfg = dict(
    input_columns=["question", "passage"],
    output_column="label",
)

BoolQ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="{passage}\nQuestion: {question}\nA. Yes\nB. No\nAnswer:"),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

BoolQ_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_capital_postprocess),
)

BoolQ_datasets = [
    dict(
        abbr="BoolQ",
        type=BoolQDataset_V2,
        path="./data/SuperGLUE/BoolQ/val.jsonl",
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
        eval_cfg=BoolQ_eval_cfg,
    )
]
@@ -1,43 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BoolQDataset_V3

BoolQ_reader_cfg = dict(
    input_columns=["question", "passage"],
    output_column="label",
    test_split="train")

BoolQ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            'false':
            dict(round=[
                dict(role="HUMAN", prompt="Passage: {passage}\nQuestion: {question}?"),
                dict(role="BOT", prompt="Answer: No"),
            ]),
            'true':
            dict(round=[
                dict(role="HUMAN", prompt="Passage: {passage}\nQuestion: {question}?"),
                dict(role="BOT", prompt="Answer: Yes"),
            ]),
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)

BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

BoolQ_datasets = [
    dict(
        abbr="BoolQ",
        type=BoolQDataset_V3,
        path="./data/SuperGLUE/BoolQ/val.jsonl",
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
        eval_cfg=BoolQ_eval_cfg,
    )
]
@@ -1,45 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BoolQDataset

BoolQ_reader_cfg = dict(
    input_columns=["question", "passage"],
    output_column="answer",
    test_split="train")

BoolQ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            0:
            dict(round=[
                dict(role="HUMAN", prompt="{passage}\nQuestion: {question}?"),
                dict(role="BOT", prompt="No"),
            ]),
            1:
            dict(round=[
                dict(role="HUMAN", prompt="{passage}\nQuestion: {question}?"),
                dict(role="BOT", prompt="Yes"),
            ]),
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)

BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

BoolQ_datasets = [
    dict(
        type=BoolQDataset,
        abbr="BoolQ",
        path="json",
        data_files="./data/SuperGLUE/BoolQ/val.jsonl",
        split="train",
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
        eval_cfg=BoolQ_eval_cfg,
    )
]
@@ -1,45 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BoolQDataset

BoolQ_reader_cfg = dict(
    input_columns=["question", "passage"],
    output_column="answer",
    test_split="train")

BoolQ_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            0:
            dict(round=[
                dict(role="HUMAN", prompt="{passage}\nQuestion: {question}"),
                dict(role="BOT", prompt="No."),
            ]),
            1:
            dict(round=[
                dict(role="HUMAN", prompt="{passage}\nQuestion: {question}"),
                dict(role="BOT", prompt="Yes."),
            ]),
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)

BoolQ_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

BoolQ_datasets = [
    dict(
        type=BoolQDataset,
        abbr="BoolQ",
        path="json",
        data_files="./data/SuperGLUE/BoolQ/val.jsonl",
        split="train",
        reader_cfg=BoolQ_reader_cfg,
        infer_cfg=BoolQ_infer_cfg,
        eval_cfg=BoolQ_eval_cfg,
    )
]
@@ -1,44 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CBDataset_V2
from opencompass.utils.text_postprocessors import first_option_postprocess

CB_reader_cfg = dict(
    input_columns=["premise", "hypothesis"],
    output_column="label",
)

CB_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="{premise}\n{hypothesis}\nWhat is the relation between the two sentences?\nA. Contradiction\nB. Entailment\nC. Neutral\nAnswer:"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

CB_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_option_postprocess, options='ABC'),
)

CB_datasets = [
    dict(
        abbr="CB",
        type=CBDataset_V2,
        path="./data/SuperGLUE/CB/val.jsonl",
        reader_cfg=CB_reader_cfg,
        infer_cfg=CB_infer_cfg,
        eval_cfg=CB_eval_cfg,
    )
]
@@ -1,62 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

CB_reader_cfg = dict(
    input_columns=["premise", "hypothesis"],
    output_column="label",
)

CB_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            "contradiction":
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="{premise}\n{hypothesis}\nWhat is the relation between the two sentences?"
                ),
                dict(role="BOT", prompt="Contradiction"),
            ]),
            "entailment":
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="{premise}\n{hypothesis}\nWhat is the relation between the two sentences?"
                ),
                dict(role="BOT", prompt="Entailment"),
            ]),
            "neutral":
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="{premise}\n{hypothesis}\nWhat is the relation between the two sentences?"
                ),
                dict(role="BOT", prompt="Neutral"),
            ]),
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)

CB_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

CB_datasets = [
    dict(
        type=HFDataset,
        abbr="CB",
        path="json",
        split="train",
        data_files="./data/SuperGLUE/CB/val.jsonl",
        reader_cfg=CB_reader_cfg,
        infer_cfg=CB_infer_cfg,
        eval_cfg=CB_eval_cfg,
    )
]
@@ -1,44 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import COPADataset_V2
from opencompass.utils.text_postprocessors import first_option_postprocess

COPA_reader_cfg = dict(
    input_columns=["question", "premise", "choice1", "choice2"],
    output_column="label",
)

COPA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="{premise}\nQuestion: Which may be the {question}?\nA. {choice1}\nB. {choice2}\nAnswer:"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

COPA_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
)

COPA_datasets = [
    dict(
        abbr="COPA",
        type=COPADataset_V2,
        path="./data/SuperGLUE/COPA/val.jsonl",
        reader_cfg=COPA_reader_cfg,
        infer_cfg=COPA_infer_cfg,
        eval_cfg=COPA_eval_cfg,
    )
]
@@ -1,45 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

COPA_reader_cfg = dict(
    input_columns=["question", "premise", "choice1", "choice2"],
    output_column="label",
    test_split="train",
)

COPA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            0:
            dict(round=[
                dict(role="HUMAN", prompt="{premise}\nQuestion: {question}\nAnswer:"),
                dict(role="BOT", prompt="{choice1}"),
            ]),
            1:
            dict(round=[
                dict(role="HUMAN", prompt="{premise}\nQuestion: {question}\nAnswer:"),
                dict(role="BOT", prompt="{choice2}"),
            ]),
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)

COPA_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

COPA_datasets = [
    dict(
        type=HFDataset,
        abbr="COPA",
        path="json",
        data_files="./data/SuperGLUE/COPA/val.jsonl",
        split="train",
        reader_cfg=COPA_reader_cfg,
        infer_cfg=COPA_infer_cfg,
        eval_cfg=COPA_eval_cfg,
    )
]
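Because COPA's label column holds the integer 0 or 1, the template keys here are ints rather than strings; each key renders its own candidate dialogue for perplexity scoring. A quick illustration with an invented example row, filled via plain str.format the way the placeholders suggest:

row = {
    "premise": "The man broke his toe.",
    "question": "cause",
    "choice1": "He got a hole in his sock.",
    "choice2": "He dropped a hammer on his foot.",
}

for label, choice_key in ((0, "choice1"), (1, "choice2")):
    human = "{premise}\nQuestion: {question}\nAnswer:".format(**row)
    print(f"--- candidate rendering for label {label} ---")
    print(human, row[choice_key])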
@@ -1,49 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HFDataset

COPA_reader_cfg = dict(
    input_columns=["question", "premise", "choice1", "choice2"],
    output_column="label",
    test_split="train",
)

COPA_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            0:
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="{premise}\nQuestion: What may be the {question}?\nAnswer:"),
                dict(role="BOT", prompt="{choice1}"),
            ]),
            1:
            dict(round=[
                dict(
                    role="HUMAN",
                    prompt="{premise}\nQuestion: What may be the {question}?\nAnswer:"),
                dict(role="BOT", prompt="{choice2}"),
            ]),
        },
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLInferencer),
)

COPA_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

COPA_datasets = [
    dict(
        type=HFDataset,
        abbr="COPA",
        path="json",
        data_files="./data/SuperGLUE/COPA/val.jsonl",
        split="train",
        reader_cfg=COPA_reader_cfg,
        infer_cfg=COPA_infer_cfg,
        eval_cfg=COPA_eval_cfg,
    )
]
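The path="json" / data_files=... / split="train" triple in these HFDataset entries mirrors the Hugging Face datasets JSON loader, which files everything from data_files under a split named "train" regardless of the file's name; that is why the reader config sets test_split="train". The roughly equivalent direct call, assuming the jsonl exists at the configured path:

from datasets import load_dataset

copa_val = load_dataset(
    "json",
    data_files="./data/SuperGLUE/COPA/val.jsonl",
    split="train",  # the JSON loader places all data under "train"
)
print(copa_val[0]["premise"], copa_val[0]["label"])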
@@ -1,43 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MultiRCDataset_V2
from opencompass.utils.text_postprocessors import first_option_postprocess

MultiRC_reader_cfg = dict(
    input_columns=["question", "text", "answer"],
    output_column="label",
)

MultiRC_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt=
                "{text}\nQuestion: {question}\nAnswer: {answer}\nIs it true?\nA. Yes\nB. No\nAnswer:"
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

MultiRC_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
)

MultiRC_datasets = [
    dict(
        abbr="MultiRC",
        type=MultiRCDataset_V2,
        path="./data/SuperGLUE/MultiRC/val.jsonl",
        reader_cfg=MultiRC_reader_cfg,
        infer_cfg=MultiRC_infer_cfg,
        eval_cfg=MultiRC_eval_cfg,
    )
]
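To actually evaluate any of these fragments, the usual OpenCompass pattern is to pull the *_datasets lists into a top-level config via read_base() and concatenate them. A minimal sketch; the relative module paths are placeholders for wherever these files sit in your configs tree:

from mmengine.config import read_base

with read_base():
    # Placeholder module paths -- adjust to the actual file locations.
    from .SuperGLUE_CB_ppl import CB_datasets
    from .SuperGLUE_COPA_gen import COPA_datasets
    from .SuperGLUE_MultiRC_gen import MultiRC_datasets

datasets = [*CB_datasets, *COPA_datasets, *MultiRC_datasets]

Pointing run.py at a config assembled this way, together with a model config, evaluates every dataset in the list in one pass.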