This commit is contained in:
Myhs-phz 2025-03-19 01:16:59 +00:00
parent b9b69febc3
commit cc9761e882
4 changed files with 170 additions and 204 deletions

View File

@@ -1,739 +1,886 @@
- ifeval: - ifeval:
name: IFEval name: IFEval
category: Instruction Following category: Instruction Following
paper: https://arxiv.org/pdf/2311.07911 paper: https://arxiv.org/pdf/2311.07911
configpath: opencompass/configs/datasets/IFEval/IFEval configpath: opencompass/configs/datasets/IFEval/IFEval
configpath_llmjudge: ''
- nphard: - nphard:
name: NPHardEval name: NPHardEval
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2312.14890v2 paper: https://arxiv.org/pdf/2312.14890v2
configpath: opencompass/configs/datasets/NPHardEval configpath: opencompass/configs/datasets/NPHardEval
configpath_llmjudge: ''
- pmmeval: - pmmeval:
name: PMMEval name: PMMEval
category: Language category: Language
paper: https://arxiv.org/pdf/2411.09116v1 paper: https://arxiv.org/pdf/2411.09116v1
configpath: opencompass/configs/datasets/PMMEval configpath: opencompass/configs/datasets/PMMEval
configpath_llmjudge: ''
- theoremqa: - theoremqa:
name: TheoremQA name: TheoremQA
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2305.12524 paper: https://arxiv.org/pdf/2305.12524
configpath: opencompass/configs/datasets/TheoremQA configpath: opencompass/configs/datasets/TheoremQA
configpath_llmjudge: ''
- agieval: - agieval:
name: AGIEval name: AGIEval
category: Examination category: Examination
paper: https://arxiv.org/pdf/2304.06364 paper: https://arxiv.org/pdf/2304.06364
configpath: opencompass/configs/datasets/agieval configpath: opencompass/configs/datasets/agieval
configpath_llmjudge: ''
- babilong: - babilong:
name: BABILong name: BABILong
category: Long Context category: Long Context
paper: https://arxiv.org/pdf/2406.10149 paper: https://arxiv.org/pdf/2406.10149
configpath: opencompass/configs/datasets/babilong configpath: opencompass/configs/datasets/babilong
configpath_llmjudge: ''
- bigcodebench: - bigcodebench:
name: BigCodeBench name: BigCodeBench
category: Code category: Code
paper: https://arxiv.org/pdf/2406.15877 paper: https://arxiv.org/pdf/2406.15877
configpath: opencompass/configs/datasets/bigcodebench configpath: opencompass/configs/datasets/bigcodebench
configpath_llmjudge: ''
- calm: - calm:
name: CaLM name: CaLM
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2405.00622 paper: https://arxiv.org/pdf/2405.00622
configpath: opencompass/configs/datasets/calm configpath: opencompass/configs/datasets/calm
configpath_llmjudge: ''
- infinitebench: - infinitebench:
name: InfiniteBench (∞Bench) name: InfiniteBench (∞Bench)
category: Long Context category: Long Context
paper: https://aclanthology.org/2024.acl-long.814.pdf paper: https://aclanthology.org/2024.acl-long.814.pdf
configpath: opencompass/configs/datasets/infinitebench configpath: opencompass/configs/datasets/infinitebench
configpath_llmjudge: ''
- korbench: - korbench:
name: KOR-Bench name: KOR-Bench
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2410.06526v1 paper: https://arxiv.org/pdf/2410.06526v1
configpath: opencompass/configs/datasets/korbench configpath: opencompass/configs/datasets/korbench
configpath_llmjudge: ''
- lawbench: - lawbench:
name: LawBench name: LawBench
category: Knowledge / Law category: Knowledge / Law
paper: https://arxiv.org/pdf/2309.16289 paper: https://arxiv.org/pdf/2309.16289
configpath: opencompass/configs/datasets/lawbench configpath: opencompass/configs/datasets/lawbench
configpath_llmjudge: ''
- leval: - leval:
name: L-Eval name: L-Eval
category: Long Context category: Long Context
paper: https://arxiv.org/pdf/2307.11088v1 paper: https://arxiv.org/pdf/2307.11088v1
configpath: opencompass/configs/datasets/leval configpath: opencompass/configs/datasets/leval
configpath_llmjudge: ''
- livecodebench: - livecodebench:
name: LiveCodeBench name: LiveCodeBench
category: Code category: Code
paper: https://arxiv.org/pdf/2403.07974 paper: https://arxiv.org/pdf/2403.07974
configpath: opencompass/configs/datasets/livecodebench configpath: opencompass/configs/datasets/livecodebench
configpath_llmjudge: ''
- livemathbench: - livemathbench:
name: LiveMathBench name: LiveMathBench
category: Math category: Math
paper: https://arxiv.org/pdf/2412.13147 paper: https://arxiv.org/pdf/2412.13147
configpath: opencompass/configs/datasets/livemathbench configpath: opencompass/configs/datasets/livemathbench
configpath_llmjudge: ''
- longbench: - longbench:
name: LongBench name: LongBench
category: Long Context category: Long Context
paper: https://github.com/THUDM/LongBench paper: https://github.com/THUDM/LongBench
configpath: opencompass/configs/datasets/longbench configpath: opencompass/configs/datasets/longbench
configpath_llmjudge: ''
- lveval: - lveval:
name: LV-Eval name: LV-Eval
category: Long Context category: Long Context
paper: https://arxiv.org/pdf/2402.05136 paper: https://arxiv.org/pdf/2402.05136
configpath: opencompass/configs/datasets/lveval configpath: opencompass/configs/datasets/lveval
configpath_llmjudge: ''
- medbench: - medbench:
name: MedBench name: MedBench
category: Knowledge / Medicine category: Knowledge / Medicine
paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138 paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
configpath: opencompass/configs/datasets/MedBench configpath: opencompass/configs/datasets/MedBench
configpath_llmjudge: ''
- musr: - musr:
name: MuSR name: MuSR
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2310.16049 paper: https://arxiv.org/pdf/2310.16049
configpath: opencompass/configs/datasets/musr configpath: opencompass/configs/datasets/musr
configpath_llmjudge: ''
- needlebench: - needlebench:
name: NeedleBench name: NeedleBench
category: Long Context category: Long Context
paper: https://arxiv.org/pdf/2407.11963 paper: https://arxiv.org/pdf/2407.11963
configpath: opencompass/configs/datasets/needlebench configpath: opencompass/configs/datasets/needlebench
configpath_llmjudge: ''
- ruler: - ruler:
name: RULER name: RULER
category: Long Context category: Long Context
paper: https://arxiv.org/pdf/2404.06654 paper: https://arxiv.org/pdf/2404.06654
configpath: opencompass/configs/datasets/ruler configpath: opencompass/configs/datasets/ruler
configpath_llmjudge: ''
- alignment: - alignment:
name: AlignBench name: AlignBench
category: Subjective / Alignment category: Subjective / Alignment
paper: https://arxiv.org/pdf/2311.18743 paper: https://arxiv.org/pdf/2311.18743
configpath: opencompass/configs/datasets/subjective/alignbench configpath: opencompass/configs/datasets/subjective/alignbench
configpath_llmjudge: ''
- alpaca: - alpaca:
name: AlpacaEval name: AlpacaEval
category: Subjective / Instruction Following category: Subjective / Instruction Following
paper: https://github.com/tatsu-lab/alpaca_eval paper: https://github.com/tatsu-lab/alpaca_eval
configpath: opencompass/configs/datasets/subjective/alpaca_eval configpath: opencompass/configs/datasets/subjective/alpaca_eval
configpath_llmjudge: ''
- arenahard: - arenahard:
name: Arena-Hard name: Arena-Hard
category: Subjective / Chatbot category: Subjective / Chatbot
paper: https://lmsys.org/blog/2024-04-19-arena-hard/ paper: https://lmsys.org/blog/2024-04-19-arena-hard/
configpath: opencompass/configs/datasets/subjective/arena_hard configpath: opencompass/configs/datasets/subjective/arena_hard
configpath_llmjudge: ''
- flames: - flames:
name: FLAMES name: FLAMES
category: Subjective / Alignment category: Subjective / Alignment
paper: https://arxiv.org/pdf/2311.06899 paper: https://arxiv.org/pdf/2311.06899
configpath: opencompass/configs/datasets/subjective/flames configpath: opencompass/configs/datasets/subjective/flames
configpath_llmjudge: ''
- fofo: - fofo:
name: FOFO name: FOFO
category: Subjective / Format Following category: Subjective / Format Following
paper: https://arxiv.org/pdf/2402.18667 paper: https://arxiv.org/pdf/2402.18667
configpath: opencompass/configs/datasets/subjective/fofo configpath: opencompass/configs/datasets/subjective/fofo
configpath_llmjudge: ''
- followbench: - followbench:
name: FollowBench name: FollowBench
category: Subjective / Instruction Following category: Subjective / Instruction Following
paper: https://arxiv.org/pdf/2310.20410 paper: https://arxiv.org/pdf/2310.20410
configpath: opencompass/configs/datasets/subjective/followbench configpath: opencompass/configs/datasets/subjective/followbench
configpath_llmjudge: ''
- hellobench: - hellobench:
name: HelloBench name: HelloBench
category: Subjective / Long Context category: Subjective / Long Context
paper: https://arxiv.org/pdf/2409.16191 paper: https://arxiv.org/pdf/2409.16191
configpath: opencompass/configs/datasets/subjective/hellobench configpath: opencompass/configs/datasets/subjective/hellobench
configpath_llmjudge: ''
- judgerbench: - judgerbench:
name: JudgerBench name: JudgerBench
category: Subjective / Long Context category: Subjective / Long Context
paper: https://arxiv.org/pdf/2410.16256 paper: https://arxiv.org/pdf/2410.16256
configpath: opencompass/configs/datasets/subjective/judgerbench configpath: opencompass/configs/datasets/subjective/judgerbench
configpath_llmjudge: ''
- multiround: - multiround:
name: MT-Bench-101 name: MT-Bench-101
category: Subjective / Multi-Round category: Subjective / Multi-Round
paper: https://arxiv.org/pdf/2402.14762 paper: https://arxiv.org/pdf/2402.14762
configpath: opencompass/configs/datasets/subjective/multiround configpath: opencompass/configs/datasets/subjective/multiround
configpath_llmjudge: ''
- wildbench: - wildbench:
name: WildBench name: WildBench
category: Subjective / Real Task category: Subjective / Real Task
paper: https://arxiv.org/pdf/2406.04770 paper: https://arxiv.org/pdf/2406.04770
configpath: opencompass/configs/datasets/subjective/wildbench configpath: opencompass/configs/datasets/subjective/wildbench
configpath_llmjudge: ''
- teval: - teval:
name: T-Eval name: T-Eval
category: Tool Utilization category: Tool Utilization
paper: https://arxiv.org/pdf/2312.14033 paper: https://arxiv.org/pdf/2312.14033
configpath: opencompass/configs/datasets/teval configpath: opencompass/configs/datasets/teval
configpath_llmjudge: ''
- financeiq: - financeiq:
name: FinanceIQ name: FinanceIQ
category: Knowledge / Finance category: Knowledge / Finance
paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ
configpath: opencompass/configs/datasets/FinanceIQ configpath: opencompass/configs/datasets/FinanceIQ
configpath_llmjudge: ''
- gaokaobench: - gaokaobench:
name: GAOKAOBench name: GAOKAOBench
category: Examination category: Examination
paper: https://arxiv.org/pdf/2305.12474 paper: https://arxiv.org/pdf/2305.12474
configpath: opencompass/configs/datasets/GaokaoBench configpath: opencompass/configs/datasets/GaokaoBench
configpath_llmjudge: ''
- lcbench: - lcbench:
name: LCBench name: LCBench
category: Code category: Code
paper: https://github.com/open-compass/CodeBench/ paper: https://github.com/open-compass/CodeBench/
configpath: opencompass/configs/datasets/LCBench configpath: opencompass/configs/datasets/LCBench
configpath_llmjudge: ''
- MMLUArabic: - MMLUArabic:
name: ArabicMMLU name: ArabicMMLU
category: Language category: Language
paper: https://arxiv.org/pdf/2402.12840 paper: https://arxiv.org/pdf/2402.12840
configpath: opencompass/configs/datasets/MMLUArabic configpath: opencompass/configs/datasets/MMLUArabic
configpath_llmjudge: ''
- OpenFinData: - OpenFinData:
name: OpenFinData name: OpenFinData
category: Knowledge / Finance category: Knowledge / Finance
paper: https://github.com/open-compass/OpenFinData paper: https://github.com/open-compass/OpenFinData
configpath: opencompass/configs/datasets/OpenFinData configpath: opencompass/configs/datasets/OpenFinData
configpath_llmjudge: ''
- QuALITY: - QuALITY:
name: QuALITY name: QuALITY
category: Long Context category: Long Context
paper: https://arxiv.org/pdf/2112.08608 paper: https://arxiv.org/pdf/2112.08608
configpath: opencompass/configs/datasets/QuALITY configpath: opencompass/configs/datasets/QuALITY
configpath_llmjudge: ''
- advglue: - advglue:
name: Adversarial GLUE name: Adversarial GLUE
category: Safety category: Safety
paper: https://openreview.net/pdf?id=GF9cSKI3A_q paper: https://openreview.net/pdf?id=GF9cSKI3A_q
configpath: opencompass/configs/datasets/adv_glue configpath: opencompass/configs/datasets/adv_glue
configpath_llmjudge: ''
- afqmcd: - afqmcd:
name: CLUE / AFQMC name: CLUE / AFQMC
category: Language category: Language
paper: https://arxiv.org/pdf/2004.05986 paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_afqmc configpath: opencompass/configs/datasets/CLUE_afqmc
configpath_llmjudge: ''
- aime2024: - aime2024:
name: AIME2024 name: AIME2024
category: Examination category: Examination
paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024 paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024
configpath: opencompass/configs/datasets/aime2024 configpath: opencompass/configs/datasets/aime2024
configpath_llmjudge: ''
- anli: - anli:
name: Adversarial NLI name: Adversarial NLI
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/1910.14599v2 paper: https://arxiv.org/pdf/1910.14599v2
configpath: opencompass/configs/datasets/anli configpath: opencompass/configs/datasets/anli
configpath_llmjudge: ''
- anthropics_evals: - anthropics_evals:
name: Anthropics Evals name: Anthropics Evals
category: Safety category: Safety
paper: https://arxiv.org/pdf/2212.09251 paper: https://arxiv.org/pdf/2212.09251
configpath: opencompass/configs/datasets/anthropics_evals configpath: opencompass/configs/datasets/anthropics_evals
configpath_llmjudge: ''
- apps: - apps:
name: APPS name: APPS
category: Code category: Code
paper: https://arxiv.org/pdf/2105.09938 paper: https://arxiv.org/pdf/2105.09938
configpath: opencompass/configs/datasets/apps configpath: opencompass/configs/datasets/apps
configpath_llmjudge: ''
- arc: - arc:
name: ARC name: ARC
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/1803.05457 paper: https://arxiv.org/pdf/1803.05457
configpath: [opencompass/configs/datasets/ARC_c, opencompass/configs/datasets/ARC_e] configpath:
- opencompass/configs/datasets/ARC_c
- opencompass/configs/datasets/ARC_e
configpath_llmjudge: ''
- arc_prize_public_eval: - arc_prize_public_eval:
name: ARC Prize name: ARC Prize
category: ARC-AGI category: ARC-AGI
paper: https://arcprize.org/guide#private paper: https://arcprize.org/guide#private
configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation
configpath_llmjudge: ''
- ax: - ax:
name: SuperGLUE / AX name: SuperGLUE / AX
category: Reasoning category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: [opencompass/configs/datasets/SuperGLUE_AX_b, opencompass/configs/datasets/SuperGLUE_AX_g] configpath:
- opencompass/configs/datasets/SuperGLUE_AX_b
- opencompass/configs/datasets/SuperGLUE_AX_g
configpath_llmjudge: ''
- bbh: - bbh:
name: BIG-Bench Hard name: BIG-Bench Hard
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2210.09261 paper: https://arxiv.org/pdf/2210.09261
configpath: opencompass/configs/datasets/bbh configpath: opencompass/configs/datasets/bbh
configpath_llmjudge: ''
- BoolQ: - BoolQ:
name: SuperGLUE / BoolQ name: SuperGLUE / BoolQ
category: Knowledge category: Knowledge
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_BoolQ configpath: opencompass/configs/datasets/SuperGLUE_BoolQ
configpath_llmjudge: ''
- c3: - c3:
name: CLUE / C3 (C³) name: CLUE / C3 (C³)
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2004.05986 paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_C3 configpath: opencompass/configs/datasets/CLUE_C3
configpath_llmjudge: ''
- cb: - cb:
name: SuperGLUE / CB name: SuperGLUE / CB
category: Reasoning category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_CB configpath: opencompass/configs/datasets/SuperGLUE_CB
configpath_llmjudge: ''
- ceval: - ceval:
name: C-EVAL name: C-EVAL
category: Examination category: Examination
paper: https://arxiv.org/pdf/2305.08322v1 paper: https://arxiv.org/pdf/2305.08322v1
configpath: opencompass/configs/datasets/ceval configpath: opencompass/configs/datasets/ceval
configpath_llmjudge: ''
- charm: - charm:
name: CHARM name: CHARM
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2403.14112 paper: https://arxiv.org/pdf/2403.14112
configpath: opencompass/configs/datasets/CHARM configpath: opencompass/configs/datasets/CHARM
configpath_llmjudge: ''
- chembench: - chembench:
name: ChemBench name: ChemBench
category: Knowledge / Chemistry category: Knowledge / Chemistry
paper: https://arxiv.org/pdf/2404.01475 paper: https://arxiv.org/pdf/2404.01475
configpath: opencompass/configs/datasets/ChemBench configpath: opencompass/configs/datasets/ChemBench
configpath_llmjudge: ''
- chid: - chid:
name: FewCLUE / CHID name: FewCLUE / CHID
category: Language category: Language
paper: https://arxiv.org/pdf/2107.07498 paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_chid configpath: opencompass/configs/datasets/FewCLUE_chid
configpath_llmjudge: ''
- chinese_simpleqa: - chinese_simpleqa:
name: Chinese SimpleQA name: Chinese SimpleQA
category: Knowledge category: Knowledge
paper: https://arxiv.org/pdf/2411.07140 paper: https://arxiv.org/pdf/2411.07140
configpath: opencompass/configs/datasets/chinese_simpleqa configpath: opencompass/configs/datasets/chinese_simpleqa
configpath_llmjudge: ''
- cibench: - cibench:
name: CIBench name: CIBench
category: Code category: Code
paper: https://www.arxiv.org/pdf/2407.10499 paper: https://www.arxiv.org/pdf/2407.10499
configpath: opencompass/configs/datasets/CIBench configpath: opencompass/configs/datasets/CIBench
configpath_llmjudge: ''
- civilcomments: - civilcomments:
name: CivilComments name: CivilComments
category: Safety category: Safety
paper: https://arxiv.org/pdf/1903.04561 paper: https://arxiv.org/pdf/1903.04561
configpath: opencompass/configs/datasets/civilcomments configpath: opencompass/configs/datasets/civilcomments
configpath_llmjudge: ''
- clozeTest_maxmin: - clozeTest_maxmin:
name: Cloze Test-max/min name: Cloze Test-max/min
category: Code category: Code
paper: https://arxiv.org/pdf/2102.04664 paper: https://arxiv.org/pdf/2102.04664
configpath: opencompass/configs/datasets/clozeTest_maxmin configpath: opencompass/configs/datasets/clozeTest_maxmin
configpath_llmjudge: ''
- cluewsc: - cluewsc:
name: FewCLUE / CLUEWSC name: FewCLUE / CLUEWSC
category: Language / WSC category: Language / WSC
paper: https://arxiv.org/pdf/2107.07498 paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_cluewsc configpath: opencompass/configs/datasets/FewCLUE_cluewsc
configpath_llmjudge: ''
- cmb: - cmb:
name: CMB name: CMB
category: Knowledge / Medicine category: Knowledge / Medicine
paper: https://arxiv.org/pdf/2308.08833 paper: https://arxiv.org/pdf/2308.08833
configpath: opencompass/configs/datasets/cmb configpath: opencompass/configs/datasets/cmb
configpath_llmjudge: ''
- cmmlu: - cmmlu:
name: CMMLU name: CMMLU
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2306.09212 paper: https://arxiv.org/pdf/2306.09212
configpath: opencompass/configs/datasets/cmmlu configpath: opencompass/configs/datasets/cmmlu
configpath_llmjudge: ''
- cmnli: - cmnli:
name: CLUE / CMNLI name: CLUE / CMNLI
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2004.05986 paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_cmnli configpath: opencompass/configs/datasets/CLUE_cmnli
configpath_llmjudge: ''
- cmo_fib: - cmo_fib:
name: cmo_fib name: cmo_fib
category: Examination category: Examination
paper: "" paper: ''
configpath: opencompass/configs/datasets/cmo_fib configpath: opencompass/configs/datasets/cmo_fib
configpath_llmjudge: ''
- cmrc: - cmrc:
name: CLUE / CMRC name: CLUE / CMRC
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2004.05986 paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_CMRC configpath: opencompass/configs/datasets/CLUE_CMRC
configpath_llmjudge: ''
- commonsenseqa: - commonsenseqa:
name: CommonSenseQA name: CommonSenseQA
category: Knowledge category: Knowledge
paper: https://arxiv.org/pdf/1811.00937v2 paper: https://arxiv.org/pdf/1811.00937v2
configpath: opencompass/configs/datasets/commonsenseqa configpath: opencompass/configs/datasets/commonsenseqa
configpath_llmjudge: ''
- commonsenseqa_cn: - commonsenseqa_cn:
name: CommonSenseQA-CN name: CommonSenseQA-CN
category: Knowledge category: Knowledge
paper: "" paper: ''
configpath: opencompass/configs/datasets/commonsenseqa_cn configpath: opencompass/configs/datasets/commonsenseqa_cn
configpath_llmjudge: ''
- copa: - copa:
name: SuperGLUE / COPA name: SuperGLUE / COPA
category: Reasoning category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_COPA configpath: opencompass/configs/datasets/SuperGLUE_COPA
configpath_llmjudge: ''
- crowspairs: - crowspairs:
name: CrowsPairs name: CrowsPairs
category: Safety category: Safety
paper: https://arxiv.org/pdf/2010.00133 paper: https://arxiv.org/pdf/2010.00133
configpath: opencompass/configs/datasets/crowspairs configpath: opencompass/configs/datasets/crowspairs
configpath_llmjudge: ''
- crowspairs_cn: - crowspairs_cn:
name: CrowsPairs-CN name: CrowsPairs-CN
category: Safety category: Safety
paper: "" paper: ''
configpath: opencompass/configs/datasets/crowspairs_cn configpath: opencompass/configs/datasets/crowspairs_cn
configpath_llmjudge: ''
- cvalues: - cvalues:
name: CVALUES name: CVALUES
category: Safety category: Safety
paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf
configpath: opencompass/configs/datasets/cvalues configpath: opencompass/configs/datasets/cvalues
configpath_llmjudge: ''
- drcd: - drcd:
name: CLUE / DRCD name: CLUE / DRCD
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2004.05986 paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_DRCD configpath: opencompass/configs/datasets/CLUE_DRCD
configpath_llmjudge: ''
- drop: - drop:
name: DROP (DROP Simple Eval) name: DROP (DROP Simple Eval)
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/1903.00161 paper: https://arxiv.org/pdf/1903.00161
configpath: opencompass/configs/datasets/drop configpath: opencompass/configs/datasets/drop
configpath_llmjudge: ''
- ds1000: - ds1000:
name: DS-1000 name: DS-1000
category: Code category: Code
paper: https://arxiv.org/pdf/2211.11501 paper: https://arxiv.org/pdf/2211.11501
configpath: opencompass/configs/datasets/ds1000 configpath: opencompass/configs/datasets/ds1000
configpath_llmjudge: ''
- eprstmt: - eprstmt:
name: FewCLUE / EPRSTMT name: FewCLUE / EPRSTMT
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2107.07498 paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_eprstmt configpath: opencompass/configs/datasets/FewCLUE_eprstmt
configpath_llmjudge: ''
- flores: - flores:
name: Flores name: Flores
category: Language category: Language
paper: https://aclanthology.org/D19-1632.pdf paper: https://aclanthology.org/D19-1632.pdf
configpath: opencompass/configs/datasets/flores configpath: opencompass/configs/datasets/flores
configpath_llmjudge: ''
- game24: - game24:
name: Game24 name: Game24
category: Math category: Math
paper: https://huggingface.co/datasets/nlile/24-game paper: https://huggingface.co/datasets/nlile/24-game
configpath: opencompass/configs/datasets/game24 configpath: opencompass/configs/datasets/game24
configpath_llmjudge: ''
- govrepcrs: - govrepcrs:
name: Government Report Dataset name: Government Report Dataset
category: Long Context category: Long Context
paper: https://aclanthology.org/2021.naacl-main.112.pdf paper: https://aclanthology.org/2021.naacl-main.112.pdf
configpath: opencompass/configs/datasets/govrepcrs configpath: opencompass/configs/datasets/govrepcrs
configpath_llmjudge: ''
- gpqa: - gpqa:
name: GPQA name: GPQA
category: Knowledge category: Knowledge
paper: https://arxiv.org/pdf/2311.12022v1 paper: https://arxiv.org/pdf/2311.12022v1
configpath: opencompass/configs/datasets/gpqa configpath: opencompass/configs/datasets/gpqa
configpath_llmjudge: ''
- gsm8k: - gsm8k:
name: GSM8K name: GSM8K
category: Math category: Math
paper: https://arxiv.org/pdf/2110.14168v2 paper: https://arxiv.org/pdf/2110.14168v2
configpath: opencompass/configs/datasets/gsm8k configpath: opencompass/configs/datasets/gsm8k
configpath_llmjudge: ''
- gsm_hard: - gsm_hard:
name: GSM-Hard name: GSM-Hard
category: Math category: Math
paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf
configpath: opencompass/configs/datasets/gsm_hard configpath: opencompass/configs/datasets/gsm_hard
configpath_llmjudge: ''
- hle: - hle:
name: HLE(Humanity's Last Exam) name: HLE(Humanity's Last Exam)
category: Reasoning category: Reasoning
paper: https://lastexam.ai/paper paper: https://lastexam.ai/paper
configpath: opencompass/configs/datasets/HLE configpath: opencompass/configs/datasets/HLE
configpath_llmjudge: ''
- hellaswag: - hellaswag:
name: HellaSwag name: HellaSwag
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/1905.07830 paper: https://arxiv.org/pdf/1905.07830
configpath: opencompass/configs/datasets/hellaswag configpath: opencompass/configs/datasets/hellaswag
configpath_llmjudge: ''
- humaneval: - humaneval:
name: HumanEval name: HumanEval
category: Code category: Code
paper: https://arxiv.org/pdf/2107.03374v2 paper: https://arxiv.org/pdf/2107.03374v2
configpath: opencompass/configs/datasets/humaneval configpath: opencompass/configs/datasets/humaneval
configpath_llmjudge: ''
- humaneval_cn: - humaneval_cn:
name: HumanEval-CN name: HumanEval-CN
category: Code category: Code
paper: "" paper: ''
configpath: opencompass/configs/datasets/humaneval_cn configpath: opencompass/configs/datasets/humaneval_cn
configpath_llmjudge: ''
- humaneval_multi: - humaneval_multi:
name: Multi-HumanEval name: Multi-HumanEval
category: Code category: Code
paper: https://arxiv.org/pdf/2210.14868 paper: https://arxiv.org/pdf/2210.14868
configpath: opencompass/configs/datasets/humaneval_multi configpath: opencompass/configs/datasets/humaneval_multi
configpath_llmjudge: ''
- humanevalx: - humanevalx:
name: HumanEval-X name: HumanEval-X
category: Code category: Code
paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790 paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
configpath: opencompass/configs/datasets/humanevalx configpath: opencompass/configs/datasets/humanevalx
configpath_llmjudge: ''
- hungarian_math: - hungarian_math:
name: Hungarian_Math name: Hungarian_Math
category: Math category: Math
paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam
configpath: opencompass/configs/datasets/hungarian_exam configpath: opencompass/configs/datasets/hungarian_exam
configpath_llmjudge: ''
- iwslt2017: - iwslt2017:
name: IWSLT2017 name: IWSLT2017
category: Language category: Language
paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf
configpath: opencompass/configs/datasets/iwslt2017 configpath: opencompass/configs/datasets/iwslt2017
configpath_llmjudge: ''
- jigsawmultilingual: - jigsawmultilingual:
name: JigsawMultilingual name: JigsawMultilingual
category: Safety category: Safety
paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data
configpath: opencompass/configs/datasets/jigsawmultilingual configpath: opencompass/configs/datasets/jigsawmultilingual
configpath_llmjudge: ''
- lambada: - lambada:
name: LAMBADA name: LAMBADA
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/1606.06031 paper: https://arxiv.org/pdf/1606.06031
configpath: opencompass/configs/datasets/lambada configpath: opencompass/configs/datasets/lambada
configpath_llmjudge: ''
- lcsts: - lcsts:
name: LCSTS name: LCSTS
category: Understanding category: Understanding
paper: https://aclanthology.org/D15-1229.pdf paper: https://aclanthology.org/D15-1229.pdf
configpath: opencompass/configs/datasets/lcsts configpath: opencompass/configs/datasets/lcsts
configpath_llmjudge: ''
- livestembench: - livestembench:
name: LiveStemBench name: LiveStemBench
category: "" category: ''
paper: "" paper: ''
configpath: opencompass/configs/datasets/livestembench configpath: opencompass/configs/datasets/livestembench
configpath_llmjudge: ''
- llm_compression: - llm_compression:
name: LLM Compression name: LLM Compression
category: Bits Per Character (BPC) category: Bits Per Character (BPC)
paper: https://arxiv.org/pdf/2404.09937 paper: https://arxiv.org/pdf/2404.09937
configpath: opencompass/configs/datasets/llm_compression configpath: opencompass/configs/datasets/llm_compression
configpath_llmjudge: ''
- math: - math:
name: MATH name: MATH
category: Math category: Math
paper: https://arxiv.org/pdf/2103.03874 paper: https://arxiv.org/pdf/2103.03874
configpath: opencompass/configs/datasets/math configpath: opencompass/configs/datasets/math
configpath_llmjudge: ''
- math401: - math401:
name: MATH 401 name: MATH 401
category: Math category: Math
paper: https://arxiv.org/pdf/2304.02015 paper: https://arxiv.org/pdf/2304.02015
configpath: opencompass/configs/datasets/math401 configpath: opencompass/configs/datasets/math401
configpath_llmjudge: ''
- mathbench: - mathbench:
name: MathBench name: MathBench
category: Math category: Math
paper: https://arxiv.org/pdf/2405.12209 paper: https://arxiv.org/pdf/2405.12209
configpath: opencompass/configs/datasets/mathbench configpath: opencompass/configs/datasets/mathbench
configpath_llmjudge: ''
- mbpp: - mbpp:
name: MBPP name: MBPP
category: Code category: Code
paper: https://arxiv.org/pdf/2108.07732 paper: https://arxiv.org/pdf/2108.07732
configpath: opencompass/configs/datasets/mbpp configpath: opencompass/configs/datasets/mbpp
configpath_llmjudge: ''
- mbpp_cn: - mbpp_cn:
name: MBPP-CN name: MBPP-CN
category: Code category: Code
paper: "" paper: ''
configpath: opencompass/configs/datasets/mbpp_cn configpath: opencompass/configs/datasets/mbpp_cn
configpath_llmjudge: ''
- mbpp_plus: - mbpp_plus:
name: MBPP-PLUS name: MBPP-PLUS
category: Code category: Code
paper: "" paper: ''
configpath: opencompass/configs/datasets/mbpp_plus configpath: opencompass/configs/datasets/mbpp_plus
configpath_llmjudge: ''
- mgsm: - mgsm:
name: MGSM name: MGSM
category: Language / Math category: Language / Math
paper: https://arxiv.org/pdf/2210.03057 paper: https://arxiv.org/pdf/2210.03057
configpath: opencompass/configs/datasets/mgsm configpath: opencompass/configs/datasets/mgsm
configpath_llmjudge: ''
- mmlu: - mmlu:
name: MMLU name: MMLU
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2009.03300 paper: https://arxiv.org/pdf/2009.03300
configpath: opencompass/configs/datasets/mmlu configpath: opencompass/configs/datasets/mmlu
configpath_llmjudge: ''
- mmlu_cf: - mmlu_cf:
name: MMLU-CF name: MMLU-CF
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2412.15194 paper: https://arxiv.org/pdf/2412.15194
configpath: opencompass/configs/datasets/mmlu_cf configpath: opencompass/configs/datasets/mmlu_cf
configpath_llmjudge: ''
- mmlu_pro: - mmlu_pro:
name: MMLU-Pro name: MMLU-Pro
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2406.01574 paper: https://arxiv.org/pdf/2406.01574
configpath: opencompass/configs/datasets/mmlu_pro configpath: opencompass/configs/datasets/mmlu_pro
configpath_llmjudge: ''
- mmmlu: - mmmlu:
name: MMMLU name: MMMLU
category: Language / Understanding category: Language / Understanding
paper: https://huggingface.co/datasets/openai/MMMLU paper: https://huggingface.co/datasets/openai/MMMLU
configpath: opencompass/configs/datasets/mmmlu configpath: opencompass/configs/datasets/mmmlu
configpath_llmjudge: ''
- multirc: - multirc:
name: SuperGLUE / MultiRC name: SuperGLUE / MultiRC
category: Understanding category: Understanding
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_MultiRC configpath: opencompass/configs/datasets/SuperGLUE_MultiRC
configpath_llmjudge: ''
- narrativeqa: - narrativeqa:
name: NarrativeQA name: NarrativeQA
category: Understanding category: Understanding
paper: https://github.com/google-deepmind/narrativeqa paper: https://github.com/google-deepmind/narrativeqa
configpath: opencompass/configs/datasets/narrativeqa configpath: opencompass/configs/datasets/narrativeqa
configpath_llmjudge: ''
- natural_question: - natural_question:
name: NaturalQuestions name: NaturalQuestions
category: Knowledge category: Knowledge
paper: https://github.com/google-research-datasets/natural-questions paper: https://github.com/google-research-datasets/natural-questions
configpath: opencompass/configs/datasets/nq configpath: opencompass/configs/datasets/nq
configpath_llmjudge: ''
- natural_question_cn: - natural_question_cn:
name: NaturalQuestions-CN name: NaturalQuestions-CN
category: Knowledge category: Knowledge
paper: "" paper: ''
configpath: opencompass/configs/datasets/nq_cn configpath: opencompass/configs/datasets/nq_cn
configpath_llmjudge: ''
- obqa: - obqa:
name: OpenBookQA name: OpenBookQA
category: Knowledge category: Knowledge
paper: https://arxiv.org/pdf/1809.02789v1 paper: https://arxiv.org/pdf/1809.02789v1
configpath: opencompass/configs/datasets/obqa configpath: opencompass/configs/datasets/obqa
configpath_llmjudge: ''
- piqa: - piqa:
name: PIQA name: PIQA
category: Knowledge / Physics category: Knowledge / Physics
paper: https://arxiv.org/pdf/1911.11641v1 paper: https://arxiv.org/pdf/1911.11641v1
configpath: opencompass/configs/datasets/piqa configpath: opencompass/configs/datasets/piqa
configpath_llmjudge: ''
- py150: - py150:
name: py150 name: py150
category: Code category: Code
paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line
configpath: opencompass/configs/datasets/py150 configpath: opencompass/configs/datasets/py150
configpath_llmjudge: ''
- qasper: - qasper:
name: Qasper name: Qasper
category: Long Context category: Long Context
paper: https://arxiv.org/pdf/2105.03011 paper: https://arxiv.org/pdf/2105.03011
configpath: opencompass/configs/datasets/qasper configpath: opencompass/configs/datasets/qasper
configpath_llmjudge: ''
- qaspercut: - qaspercut:
name: Qasper-Cut name: Qasper-Cut
category: Long Context category: Long Context
paper: "" paper: ''
configpath: opencompass/configs/datasets/qaspercut configpath: opencompass/configs/datasets/qaspercut
configpath_llmjudge: ''
- race: - race:
name: RACE name: RACE
category: Examination category: Examination
paper: https://arxiv.org/pdf/1704.04683 paper: https://arxiv.org/pdf/1704.04683
configpath: opencompass/configs/datasets/race configpath: opencompass/configs/datasets/race
configpath_llmjudge: ''
- realtoxicprompts: - realtoxicprompts:
name: RealToxicPrompts name: RealToxicPrompts
category: Safety category: Safety
paper: https://arxiv.org/pdf/2009.11462 paper: https://arxiv.org/pdf/2009.11462
configpath: opencompass/configs/datasets/realtoxicprompts configpath: opencompass/configs/datasets/realtoxicprompts
configpath_llmjudge: ''
- record: - record:
name: SuperGLUE / ReCoRD name: SuperGLUE / ReCoRD
category: Understanding category: Understanding
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD
configpath_llmjudge: ''
- rte: - rte:
name: SuperGLUE / RTE name: SuperGLUE / RTE
category: Reasoning category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_RTE configpath: opencompass/configs/datasets/SuperGLUE_RTE
configpath_llmjudge: ''
- ocnli: - ocnli:
name: CLUE / OCNLI name: CLUE / OCNLI
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2004.05986 paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_ocnli configpath: opencompass/configs/datasets/CLUE_ocnli
configpath_llmjudge: ''
- rolebench: - rolebench:
name: RoleBench name: RoleBench
category: Role Play category: Role Play
paper: https://arxiv.org/pdf/2310.00746 paper: https://arxiv.org/pdf/2310.00746
configpath: opencompass/configs/datasets/rolebench configpath: opencompass/configs/datasets/rolebench
configpath_llmjudge: ''
- s3eval: - s3eval:
name: S3Eval name: S3Eval
category: Long Context category: Long Context
paper: https://aclanthology.org/2024.naacl-long.69.pdf paper: https://aclanthology.org/2024.naacl-long.69.pdf
configpath: opencompass/configs/datasets/s3eval configpath: opencompass/configs/datasets/s3eval
configpath_llmjudge: ''
- scibench: - scibench:
name: SciBench name: SciBench
category: Reasoning category: Reasoning
paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf
configpath: opencompass/configs/datasets/scibench configpath: opencompass/configs/datasets/scibench
configpath_llmjudge: ''
- scicode: - scicode:
name: SciCode name: SciCode
category: Code category: Code
paper: https://arxiv.org/pdf/2407.13168 paper: https://arxiv.org/pdf/2407.13168
configpath: opencompass/configs/datasets/scicode configpath: opencompass/configs/datasets/scicode
configpath_llmjudge: ''
- simpleqa: - simpleqa:
name: SimpleQA name: SimpleQA
category: Knowledge category: Knowledge
paper: https://arxiv.org/pdf/2411.04368 paper: https://arxiv.org/pdf/2411.04368
configpath: opencompass/configs/datasets/SimpleQA configpath: opencompass/configs/datasets/SimpleQA
configpath_llmjudge: ''
- siqa: - siqa:
name: SocialIQA name: SocialIQA
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/1904.09728 paper: https://arxiv.org/pdf/1904.09728
configpath: opencompass/configs/datasets/siqa configpath: opencompass/configs/datasets/siqa
configpath_llmjudge: ''
- squad20: - squad20:
name: SQuAD2.0 name: SQuAD2.0
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/1806.03822 paper: https://arxiv.org/pdf/1806.03822
configpath: opencompass/configs/datasets/squad20 configpath: opencompass/configs/datasets/squad20
configpath_llmjudge: ''
- storycloze: - storycloze:
name: StoryCloze name: StoryCloze
category: Reasoning category: Reasoning
paper: https://aclanthology.org/2022.emnlp-main.616.pdf paper: https://aclanthology.org/2022.emnlp-main.616.pdf
configpath: opencompass/configs/datasets/storycloze configpath: opencompass/configs/datasets/storycloze
configpath_llmjudge: ''
- strategyqa: - strategyqa:
name: StrategyQA name: StrategyQA
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2101.02235 paper: https://arxiv.org/pdf/2101.02235
configpath: opencompass/configs/datasets/strategyqa configpath: opencompass/configs/datasets/strategyqa
configpath_llmjudge: ''
- summedits: - summedits:
name: SummEdits name: SummEdits
category: Language category: Language
paper: https://aclanthology.org/2023.emnlp-main.600.pdf paper: https://aclanthology.org/2023.emnlp-main.600.pdf
configpath: opencompass/configs/datasets/summedits configpath: opencompass/configs/datasets/summedits
configpath_llmjudge: ''
- summscreen: - summscreen:
name: SummScreen name: SummScreen
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2104.07091v1 paper: https://arxiv.org/pdf/2104.07091v1
configpath: opencompass/configs/datasets/summscreen configpath: opencompass/configs/datasets/summscreen
configpath_llmjudge: ''
- svamp: - svamp:
name: SVAMP name: SVAMP
category: Math category: Math
paper: https://aclanthology.org/2021.naacl-main.168.pdf paper: https://aclanthology.org/2021.naacl-main.168.pdf
configpath: opencompass/configs/datasets/SVAMP configpath: opencompass/configs/datasets/SVAMP
configpath_llmjudge: ''
- tabmwp: - tabmwp:
name: TabMWP name: TabMWP
category: Math / Table category: Math / Table
paper: https://arxiv.org/pdf/2209.14610 paper: https://arxiv.org/pdf/2209.14610
configpath: opencompass/configs/datasets/TabMWP configpath: opencompass/configs/datasets/TabMWP
configpath_llmjudge: ''
- taco: - taco:
name: TACO name: TACO
category: Code category: Code
paper: https://arxiv.org/pdf/2312.14852 paper: https://arxiv.org/pdf/2312.14852
configpath: opencompass/configs/datasets/taco configpath: opencompass/configs/datasets/taco
configpath_llmjudge: ''
- tnews: - tnews:
name: FewCLUE / TNEWS name: FewCLUE / TNEWS
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2107.07498 paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_tnews configpath: opencompass/configs/datasets/FewCLUE_tnews
configpath_llmjudge: ''
- bustm: - bustm:
name: FewCLUE / BUSTM name: FewCLUE / BUSTM
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2107.07498 paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_bustm configpath: opencompass/configs/datasets/FewCLUE_bustm
configpath_llmjudge: ''
- csl: - csl:
name: FewCLUE / CSL name: FewCLUE / CSL
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2107.07498 paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_csl configpath: opencompass/configs/datasets/FewCLUE_csl
configpath_llmjudge: ''
- ocnli_fc: - ocnli_fc:
name: FewCLUE / OCNLI-FC name: FewCLUE / OCNLI-FC
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2107.07498 paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc
configpath_llmjudge: ''
- triviaqa: - triviaqa:
name: TriviaQA name: TriviaQA
category: Knowledge category: Knowledge
paper: https://arxiv.org/pdf/1705.03551v2 paper: https://arxiv.org/pdf/1705.03551v2
configpath: opencompass/configs/datasets/triviaqa configpath: opencompass/configs/datasets/triviaqa
configpath_llmjudge: ''
- triviaqarc: - triviaqarc:
name: TriviaQA-RC name: TriviaQA-RC
category: Knowledge / Understanding category: Knowledge / Understanding
paper: "" paper: ''
configpath: opencompass/configs/datasets/triviaqarc configpath: opencompass/configs/datasets/triviaqarc
configpath_llmjudge: ''
- truthfulqa: - truthfulqa:
name: TruthfulQA name: TruthfulQA
category: Safety category: Safety
paper: https://arxiv.org/pdf/2109.07958v2 paper: https://arxiv.org/pdf/2109.07958v2
configpath: opencompass/configs/datasets/truthfulqa configpath: opencompass/configs/datasets/truthfulqa
configpath_llmjudge: ''
- tydiqa: - tydiqa:
name: TyDi-QA name: TyDi-QA
category: Language category: Language
paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf
configpath: opencompass/configs/datasets/tydiqa configpath: opencompass/configs/datasets/tydiqa
configpath_llmjudge: ''
- wic: - wic:
name: SuperGLUE / WiC name: SuperGLUE / WiC
category: Language category: Language
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_WiC configpath: opencompass/configs/datasets/SuperGLUE_WiC
configpath_llmjudge: ''
- wsc: - wsc:
name: SuperGLUE / WSC name: SuperGLUE / WSC
category: Language / WSC category: Language / WSC
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_WSC configpath: opencompass/configs/datasets/SuperGLUE_WSC
configpath_llmjudge: ''
- winogrande: - winogrande:
name: WinoGrande name: WinoGrande
category: Language / WSC category: Language / WSC
paper: https://arxiv.org/pdf/1907.10641v2 paper: https://arxiv.org/pdf/1907.10641v2
configpath: opencompass/configs/datasets/winogrande configpath: opencompass/configs/datasets/winogrande
configpath_llmjudge: ''
- xcopa: - xcopa:
name: XCOPA name: XCOPA
category: Language category: Language
paper: https://arxiv.org/pdf/2005.00333 paper: https://arxiv.org/pdf/2005.00333
configpath: opencompass/configs/datasets/XCOPA configpath: opencompass/configs/datasets/XCOPA
configpath_llmjudge: ''
- xiezhi: - xiezhi:
name: Xiezhi name: Xiezhi
category: Knowledge category: Knowledge
paper: https://arxiv.org/pdf/2306.05783 paper: https://arxiv.org/pdf/2306.05783
configpath: opencompass/configs/datasets/xiezhi configpath: opencompass/configs/datasets/xiezhi
configpath_llmjudge: ''
- xlsum: - xlsum:
name: XLSum name: XLSum
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2106.13822v1 paper: https://arxiv.org/pdf/2106.13822v1
configpath: opencompass/configs/datasets/XLSum configpath: opencompass/configs/datasets/XLSum
configpath_llmjudge: ''
- xsum: - xsum:
name: Xsum name: Xsum
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/1808.08745 paper: https://arxiv.org/pdf/1808.08745
configpath: opencompass/configs/datasets/Xsum configpath: opencompass/configs/datasets/Xsum
configpath_llmjudge: ''

View File

@ -24,7 +24,7 @@ load_path = str(OC_ROOT / 'dataset-index.yml')
with open(load_path, 'r') as f2: with open(load_path, 'r') as f2:
data_list = yaml.load(f2, Loader=yaml.FullLoader) data_list = yaml.load(f2, Loader=yaml.FullLoader)
HEADER = ['name', 'category', 'paper', 'configpath'] HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
def table_format(data_list): def table_format(data_list):
@ -35,7 +35,7 @@ def table_format(data_list):
for index in HEADER: for index in HEADER:
if index == 'paper': if index == 'paper':
table_format_list_sub.append('[链接](' + i[j][index] + ')') table_format_list_sub.append('[链接](' + i[j][index] + ')')
elif index == 'configpath': elif index != 'name' and index != 'category':
if isinstance(i[j][index], list): if isinstance(i[j][index], list):
sub_list_text = '' sub_list_text = ''
for k in i[j][index]: for k in i[j][index]:
@ -60,7 +60,7 @@ def generate_table(data_list, title=None):
if title is not None: if title is not None:
f.write(f'\n{title}') f.write(f'\n{title}')
f.write("""\n```{table}\n:class: dataset\n""") f.write("""\n```{table}\n:class: dataset\n""")
header = ['数据集名称', '数据集类型', '原文或资源地址', '配置文件链接'] header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置(基于规则评估)', '推荐配置(基于LLM评估)']
table_cfg = dict(tablefmt='pipe', table_cfg = dict(tablefmt='pipe',
floatfmt='.2f', floatfmt='.2f',
numalign='right', numalign='right',

View File

@ -1,68 +1,4 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate from mmengine.config import read_base
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import RaceDataset
from opencompass.utils.text_postprocessors import (
first_option_postprocess,
)
QUERY_TEMPLATE = """ with read_base():
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering. from .race_gen_69ee4f import race_datasets # noqa: F401, F403
Article: {article}
Q: {question}
A. {A}
B. {B}
C. {C}
D. {D}
""".strip()
race_reader_cfg = dict(
input_columns=['article', 'question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='validation',
test_split='test',
)
race_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=QUERY_TEMPLATE),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
race_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
pred_role='BOT',
)
race_datasets = [
dict(
abbr='race-middle',
type=RaceDataset,
path='opencompass/race',
name='middle',
reader_cfg=race_reader_cfg,
infer_cfg=race_infer_cfg,
eval_cfg=race_eval_cfg,
),
dict(
abbr='race-high',
type=RaceDataset,
path='opencompass/race',
name='high',
reader_cfg=race_reader_cfg,
infer_cfg=race_infer_cfg,
eval_cfg=race_eval_cfg,
),
]

View File

@ -1,117 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import RaceDataset
from opencompass.utils.text_postprocessors import (
first_option_postprocess,
)
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess
# Prompt sent to the model under test: a four-option multiple-choice question
# about an article, with the answer requested on the last line as
# 'ANSWER: $LETTER'.
QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.
Article: {article}
Q: {question}
A. {A}
B. {B}
C. {C}
D. {D}
""".strip()

# Prompt sent to the judge model. The judge is asked for a single letter:
# "A" (CORRECT) or "B" (INCORRECT). The final instruction previously asked for
# the words CORRECT / INCORRECT, contradicting the letter-only format stated
# just above it; it now asks for the letter as well. The closing tag of the
# predicted-answer section now matches its opening tag.
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either "A" or "B". Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted Answer End>\n\n
Judging the correctness of candidates' answers:
""".strip()

# Columns read from each RACE record; 'answer' is the gold label.
race_reader_cfg = dict(
    input_columns=['article', 'question', 'A', 'B', 'C', 'D'],
    output_column='answer',
    train_split='validation',
    test_split='test',
)

# Zero-shot generation: one HUMAN turn built from QUERY_TEMPLATE.
race_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

race_datasets = []
for _subset in ['middle', 'high']:
    # The evaluator config is built per subset so that its dataset_cfg names
    # the same RACE split as the dataset entry it grades. It previously
    # pointed at './data/gpqa/' (left over from another dataset's config) and
    # carried no subset name.
    race_eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
                        )
                    ],
                    round=[
                        dict(role='HUMAN', prompt=GRADER_TEMPLATE),
                    ],
                ),
            ),
            dataset_cfg=dict(
                type=RaceDataset,
                path='opencompass/race',
                name=_subset,
                reader_cfg=race_reader_cfg,
            ),
            # Left empty here; presumably filled in with the judge model
            # settings by whoever runs the evaluation -- confirm against the
            # evaluator's documentation.
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
        pred_role='BOT',
    )

    race_datasets.append(
        dict(
            abbr=f'race-{_subset}',
            type=RaceDataset,
            path='opencompass/race',
            name=_subset,
            reader_cfg=race_reader_cfg,
            infer_cfg=race_infer_cfg,
            eval_cfg=race_eval_cfg,
        )
    )