This commit is contained in:
Myhs-phz 2025-03-19 03:37:23 +00:00
parent 716c02785c
commit ffe00a830d
4 changed files with 227 additions and 125 deletions

View File

@ -8,25 +8,25 @@
name: NPHardEval name: NPHardEval
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2312.14890v2 paper: https://arxiv.org/pdf/2312.14890v2
configpath: opencompass/configs/datasets/NPHardEval configpath: opencompass/configs/datasets/NPHardEval/NPHardEval_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- pmmeval: - pmmeval:
name: PMMEval name: PMMEval
category: Language category: Language
paper: https://arxiv.org/pdf/2411.09116v1 paper: https://arxiv.org/pdf/2411.09116v1
configpath: opencompass/configs/datasets/PMMEval configpath: opencompass/configs/datasets/PMMEval/pmmeval_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- theoremqa: - theoremqa:
name: TheoremQA name: TheoremQA
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2305.12524 paper: https://arxiv.org/pdf/2305.12524
configpath: opencompass/configs/datasets/TheroremQA configpath: opencompass/configs/datasets/TheoremQA/TheoremQA_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- agieval: - agieval:
name: AGIEval name: AGIEval
category: Examination category: Examination
paper: https://arxiv.org/pdf/2304.06364 paper: https://arxiv.org/pdf/2304.06364
configpath: opencompass/configs/datasets/agieval configpath: opencompass/configs/datasets/agieval/agieval_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- babilong: - babilong:
name: BABILong name: BABILong
@ -44,13 +44,13 @@
name: CaLM name: CaLM
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2405.00622 paper: https://arxiv.org/pdf/2405.00622
configpath: opencompass/configs/datasets/calm configpath: opencompass/configs/datasets/calm/calm.py
configpath_llmjudge: '' configpath_llmjudge: ''
- infinitebench: - infinitebench:
name: InfiniteBench (∞Bench) name: InfiniteBench (∞Bench)
category: Long Context category: Long Context
paper: https://aclanthology.org/2024.acl-long.814.pdf paper: https://aclanthology.org/2024.acl-long.814.pdf
configpath: opencompass/configs/datasets/infinitebench configpath: opencompass/configs/datasets/infinitebench/infinitebench.py
configpath_llmjudge: '' configpath_llmjudge: ''
- korbench: - korbench:
name: KOR-Bench name: KOR-Bench
@ -62,13 +62,15 @@
name: LawBench name: LawBench
category: Knowledge / Law category: Knowledge / Law
paper: https://arxiv.org/pdf/2309.16289 paper: https://arxiv.org/pdf/2309.16289
configpath: opencompass/configs/datasets/lawbench configpath:
- opencompass/configs/datasets/lawbench/lawbench_zero_shot_gen_002588.py
- opencompass/configs/datasets/lawbench/lawbench_one_shot_gen_002588.py
configpath_llmjudge: '' configpath_llmjudge: ''
- leval: - leval:
name: L-Eval name: L-Eval
category: Long Context category: Long Context
paper: https://arxiv.org/pdf/2307.11088v1 paper: https://arxiv.org/pdf/2307.11088v1
configpath: opencompass/configs/datasets/leval configpath: opencompass/configs/datasets/leval/leval.py
configpath_llmjudge: '' configpath_llmjudge: ''
- livecodebench: - livecodebench:
name: LiveCodeBench name: LiveCodeBench
@ -80,25 +82,39 @@
name: LiveMathBench name: LiveMathBench
category: Math category: Math
paper: https://arxiv.org/pdf/2412.13147 paper: https://arxiv.org/pdf/2412.13147
configpath: opencompass/configs/datasets/livemathbench configpath: opencompass/configs/datasets/livemathbench/livemathbench_gen.py
configpath_llmjudge: ''
- livereasonbench:
name: LiveReasonBench
category: Reasoning
paper: ''
configpath: opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- longbench: - longbench:
name: LongBench name: LongBench
category: Long Context category: Long Context
paper: https://github.com/THUDM/LongBench paper: https://github.com/THUDM/LongBench
configpath: opencompass/configs/datasets/livemathbench configpath:
- opencompass/configs/datasets/longbench/longbench.py
- opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- lveval: - lveval:
name: LV-Eval name: LV-Eval
category: Long Context category: Long Context
paper: https://arxiv.org/pdf/2402.05136 paper: https://arxiv.org/pdf/2402.05136
configpath: opencompass/configs/datasets/lveval configpath: opencompass/configs/datasets/lveval/lveval.py
configpath_llmjudge: ''
- mastermath2024v1:
name: Mastermath2024v1
category: Math
paper: ''
configpath: opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- medbench: - medbench:
name: MedBench name: MedBench
category: Knowledge / Medicine category: Knowledge / Medicine
paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138 paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
configpath: opencompass/configs/datasets/MedBench configpath: opencompass/configs/datasets/MedBench/medbench_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- musr: - musr:
name: MuSR name: MuSR
@ -140,7 +156,7 @@
name: FLAMES name: FLAMES
category: Subjective / Alignment category: Subjective / Alignment
paper: https://arxiv.org/pdf/2311.06899 paper: https://arxiv.org/pdf/2311.06899
configpath: opencompass/configs/datasets/subjective/flames configpath: opencompass/configs/datasets/subjective/flames/flames_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- fofo: - fofo:
name: FOFO name: FOFO
@ -182,55 +198,63 @@
name: T-Eval name: T-Eval
category: Tool Utilization category: Tool Utilization
paper: https://arxiv.org/pdf/2312.14033 paper: https://arxiv.org/pdf/2312.14033
configpath: opencompass/configs/datasets/teval configpath:
- opencompass/configs/datasets/teval/teval_en_gen.py
- opencompass/configs/datasets/teval/teval_zh_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- finalceiq: - finalceiq:
name: FinanceIQ name: FinanceIQ
category: Knowledge / Finance category: Knowledge / Finance
paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ
configpath: opencompass/configs/datasets/FinanceIQ configpath: opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- gaokaobench: - gaokaobench:
name: GAOKAOBench name: GAOKAOBench
category: Examination category: Examination
paper: https://arxiv.org/pdf/2305.12474 paper: https://arxiv.org/pdf/2305.12474
configpath: opencompass/configs/datasets/GaokaoBench configpath: opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- lcbench: - lcbench:
name: LCBench name: LCBench
category: Code category: Code
paper: https://github.com/open-compass/CodeBench/ paper: https://github.com/open-compass/CodeBench/
configpath: opencompass/configs/datasets/LCBench configpath: opencompass/configs/datasets/LCBench/lcbench_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- MMLUArabic: - MMLUArabic:
name: ArabicMMLU name: ArabicMMLU
category: Language category: Language
paper: https://arxiv.org/pdf/2402.12840 paper: https://arxiv.org/pdf/2402.12840
configpath: opencompass/configs/datasets/MMLUArabic configpath: opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- OpenFinData: - OpenFinData:
name: OpenFinData name: OpenFinData
category: Knowledge / Finance category: Knowledge / Finance
paper: https://github.com/open-compass/OpenFinData paper: https://github.com/open-compass/OpenFinData
configpath: opencompass/configs/datasets/OpenFinData configpath: opencompass/configs/datasets/OpenFinData/OpenFinData_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- QuALITY: - QuALITY:
name: QuALITY name: QuALITY
category: Long Context category: Long Context
paper: https://arxiv.org/pdf/2112.08608 paper: https://arxiv.org/pdf/2112.08608
configpath: opencompass/configs/datasets/QuALITY configpath: opencompass/configs/datasets/QuALITY/QuALITY_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- advglue: - advglue:
name: Adversarial GLUE name: Adversarial GLUE
category: Safety category: Safety
paper: https://openreview.net/pdf?id=GF9cSKI3A_q paper: https://openreview.net/pdf?id=GF9cSKI3A_q
configpath: opencompass/configs/datasets/adv_glue configpath:
- opencompass/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_qnli/adv_glue_qnli_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_qqp/adv_glue_qqp_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_rte/adv_glue_rte_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_sst2/adv_glue_sst2_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- afqmcd: - afqmcd:
name: CLUE / AFQMC name: CLUE / AFQMC
category: Language category: Language
paper: https://arxiv.org/pdf/2004.05986 paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_afqmc configpath: opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- aime2024: - aime2024:
name: AIME2024 name: AIME2024
@ -242,41 +266,46 @@
name: Adversarial NLI name: Adversarial NLI
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/1910.14599v2 paper: https://arxiv.org/pdf/1910.14599v2
configpath: opencompass/configs/datasets/anli configpath: opencompass/configs/datasets/anli/anli_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- anthropics_evals: - anthropics_evals:
name: Anthropics Evals name: Anthropics Evals
category: Safety category: Safety
paper: https://arxiv.org/pdf/2212.09251 paper: https://arxiv.org/pdf/2212.09251
configpath: opencompass/configs/datasets/anthropics_evals configpath:
- opencompass/configs/datasets/anthropics_evals/airisk_gen.py
- opencompass/configs/datasets/anthropics_evals/persona_gen.py
- opencompass/configs/datasets/anthropics_evals/sycophancy_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- apps: - apps:
name: APPS name: APPS
category: Code category: Code
paper: https://arxiv.org/pdf/2105.09938 paper: https://arxiv.org/pdf/2105.09938
configpath: opencompass/configs/datasets/apps configpath:
- opencompass/configs/datasets/apps/apps_gen.py
- opencompass/configs/datasets/apps/apps_mini_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- arc: - arc:
name: ARC name: ARC
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/1803.05457 paper: https://arxiv.org/pdf/1803.05457
configpath: configpath:
- opencompass/configs/datasets/ARC_c - opencompass/configs/datasets/ARC_c/ARC_c_gen.py
- opencompass/configs/datasets/ARC_e - opencompass/configs/datasets/ARC_e/ARC_e_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- arc_prize_public_eval: - arc_prize_public_eval:
name: ARC Prize name: ARC Prize
category: ARC-AGI category: ARC-AGI
paper: https://arcprize.org/guide#private paper: https://arcprize.org/guide#private
configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- ax: - ax:
name: SuperGLUE / AX name: SuperGLUE / AX
category: Reasoning category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: configpath:
- opencompass/configs/datasets/SuperGLUE_AX_b - opencompass/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen.py
- opencompass/configs/datasets/SuperGLUE_AX_g - opencompass/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- bbh: - bbh:
name: BIG-Bench Hard name: BIG-Bench Hard
@ -288,79 +317,82 @@
name: SuperGLUE / BoolQ name: SuperGLUE / BoolQ
category: Knowledge category: Knowledge
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_BoolQ configpath: opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- c3: - c3:
name: CLUE / C3 (C³) name: CLUE / C3 (C³)
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2004.05986 paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_C3 configpath: opencompass/configs/datasets/CLUE_C3/CLUE_C3_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- cb: - cb:
name: SuperGLUE / CB name: SuperGLUE / CB
category: Reasoning category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_CB configpath: opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- ceval: - ceval:
name: C-EVAL name: C-EVAL
category: Examination category: Examination
paper: https://arxiv.org/pdf/2305.08322v1 paper: https://arxiv.org/pdf/2305.08322v1
configpath: opencompass/configs/datasets/ceval configpath: opencompass/configs/datasets/ceval/ceval_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- charm: - charm:
name: CHARM name: CHARM
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2403.14112 paper: https://arxiv.org/pdf/2403.14112
configpath: opencompass/configs/datasets/CHARM configpath: opencompass/configs/datasets/CHARM/charm_reason_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- chembench: - chembench:
name: ChemBench name: ChemBench
category: Knowledge / Chemistry category: Knowledge / Chemistry
paper: https://arxiv.org/pdf/2404.01475 paper: https://arxiv.org/pdf/2404.01475
configpath: opencompass/configs/datasets/ChemBench configpath: opencompass/configs/datasets/ChemBench/ChemBench_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- chid: - chid:
name: FewCLUE / CHID name: FewCLUE / CHID
category: Language category: Language
paper: https://arxiv.org/pdf/2107.07498 paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_chid configpath: opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- chinese_simpleqa: - chinese_simpleqa:
name: Chinese SimpleQA name: Chinese SimpleQA
category: Knowledge category: Knowledge
paper: https://arxiv.org/pdf/2411.07140 paper: https://arxiv.org/pdf/2411.07140
configpath: opencompass/configs/datasets/chinese_simpleqa configpath: opencompass/configs/datasets/chinese_simpleqa/chinese_simpleqa_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- cibench: - cibench:
name: CIBench name: CIBench
category: Code category: Code
paper: https://www.arxiv.org/pdf/2407.10499 paper: https://www.arxiv.org/pdf/2407.10499
configpath: opencompass/configs/datasets/CIBench configpath:
- opencompass/configs/datasets/CIBench/CIBench_generation_gen_8ab0dc.py
- opencompass/configs/datasets/CIBench/CIBench_template_gen_e6b12a.py
- opencompass/configs/datasets/CIBench/CIBench_template_oracle_gen_fecda1.py
configpath_llmjudge: '' configpath_llmjudge: ''
- civilcomments: - civilcomments:
name: CivilComments name: CivilComments
category: Safety category: Safety
paper: https://arxiv.org/pdf/1903.04561 paper: https://arxiv.org/pdf/1903.04561
configpath: opencompass/configs/datasets/civilcomments configpath: opencompass/configs/datasets/civilcomments/civilcomments_clp.py
configpath_llmjudge: '' configpath_llmjudge: ''
- clozeTest_maxmin: - clozeTest_maxmin:
name: Cloze Test-max/min name: Cloze Test-max/min
category: Code category: Code
paper: https://arxiv.org/pdf/2102.04664 paper: https://arxiv.org/pdf/2102.04664
configpath: opencompass/configs/datasets/clozeTest_maxmin configpath: opencompass/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- cluewsc: - cluewsc:
name: FewCLUE / CLUEWSC name: FewCLUE / CLUEWSC
category: Language / WSC category: Language / WSC
paper: https://arxiv.org/pdf/2107.07498 paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_cluewsc configpath: opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- cmb: - cmb:
name: CMB name: CMB
category: Knowledge / Medicine category: Knowledge / Medicine
paper: https://arxiv.org/pdf/2308.08833 paper: https://arxiv.org/pdf/2308.08833
configpath: opencompass/configs/datasets/cmb configpath: opencompass/configs/datasets/cmb/cmb_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- cmmlu: - cmmlu:
name: CMMLU name: CMMLU
@ -372,61 +404,61 @@
name: CLUE / CMNLI name: CLUE / CMNLI
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2004.05986 paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_cmnli configpath: opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- cmo_fib: - cmo_fib:
name: cmo_fib name: cmo_fib
category: Examination category: Examination
paper: '' paper: ''
configpath: opencompass/configs/datasets/cmo_fib configpath: opencompass/configs/datasets/cmo_fib/cmo_fib_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- cmrc: - cmrc:
name: CLUE / CMRC name: CLUE / CMRC
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2004.05986 paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_CMRC configpath: opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- commonsenseqa: - commonsenseqa:
name: CommonSenseQA name: CommonSenseQA
category: Knowledge category: Knowledge
paper: https://arxiv.org/pdf/1811.00937v2 paper: https://arxiv.org/pdf/1811.00937v2
configpath: opencompass/configs/datasets/commonsenseqa configpath: opencompass/configs/datasets/commonsenseqa/commonsenseqa_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- commonsenseqa_cn: - commonsenseqa_cn:
name: CommonSenseQA-CN name: CommonSenseQA-CN
category: Knowledge category: Knowledge
paper: '' paper: ''
configpath: opencompass/configs/datasets/commonsenseqa_cn configpath: opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- copa: - copa:
name: SuperGLUE / COPA name: SuperGLUE / COPA
category: Reasoning category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_COPA configpath: opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- crowspairs: - crowspairs:
name: CrowsPairs name: CrowsPairs
category: Safety category: Safety
paper: https://arxiv.org/pdf/2010.00133 paper: https://arxiv.org/pdf/2010.00133
configpath: opencompass/configs/datasets/crowspairs configpath: opencompass/configs/datasets/crowspairs/crowspairs_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- crowspairs_cn: - crowspairs_cn:
name: CrowsPairs-CN name: CrowsPairs-CN
category: Safety category: Safety
paper: '' paper: ''
configpath: opencompass/configs/datasets/crowspairs_cn configpath: opencompass/configs/datasets/crowspairs_cn/crowspairscn_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- cvalues: - cvalues:
name: CVALUES name: CVALUES
category: Safety category: Safety
paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf
configpath: opencompass/configs/datasets/cvalues configpath: opencompass/configs/datasets/cvalues/cvalues_responsibility_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- drcd: - drcd:
name: CLUE / DRCD name: CLUE / DRCD
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2004.05986 paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_DRCD configpath: opencompass/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- drop: - drop:
name: DROP (DROP Simple Eval) name: DROP (DROP Simple Eval)
@ -438,31 +470,32 @@
name: DS-1000 name: DS-1000
category: Code category: Code
paper: https://arxiv.org/pdf/2211.11501 paper: https://arxiv.org/pdf/2211.11501
configpath: opencompass/configs/datasets/ds1000 configpath:
- opencompass/configs/datasets/ds1000/ds1000_gen_5c4bec.py
configpath_llmjudge: '' configpath_llmjudge: ''
- eprstmt: - eprstmt:
name: FewCLUE / EPRSTMT name: FewCLUE / EPRSTMT
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2107.07498 paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_eprstmt configpath: opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- flores: - flores:
name: Flores name: Flores
category: Language category: Language
paper: https://aclanthology.org/D19-1632.pdf paper: https://aclanthology.org/D19-1632.pdf
configpath: opencompass/configs/datasets/flores configpath: opencompass/configs/datasets/flores/flores_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- game24: - game24:
name: Game24 name: Game24
category: Math category: Math
paper: https://huggingface.co/datasets/nlile/24-game paper: https://huggingface.co/datasets/nlile/24-game
configpath: opencompass/configs/datasets/game24 configpath: opencompass/configs/datasets/game24/game24_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- govrepcrs: - govrepcrs:
name: Government Report Dataset name: Government Report Dataset
category: Long Context category: Long Context
paper: https://aclanthology.org/2021.naacl-main.112.pdf paper: https://aclanthology.org/2021.naacl-main.112.pdf
configpath: opencompass/configs/datasets/govrepcrs configpath: opencompass/configs/datasets/govrepcrs/govrepcrs_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- gpqa: - gpqa:
name: GPQA name: GPQA
@ -474,19 +507,19 @@
name: GSM8K name: GSM8K
category: Math category: Math
paper: https://arxiv.org/pdf/2110.14168v2 paper: https://arxiv.org/pdf/2110.14168v2
configpath: opencompass/configs/datasets/gsm8k configpath: opencompass/configs/datasets/gsm8k/gsm8k_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- gsm_hard: - gsm_hard:
name: GSM-Hard name: GSM-Hard
category: Math category: Math
paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf
configpath: opencompass/configs/datasets/gsm_hard configpath: opencompass/configs/datasets/gsm_hard/gsmhard_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- hle: - hle:
name: HLE(Humanity's Last Exam) name: HLE(Humanity's Last Exam)
category: Reasoning category: Reasoning
paper: https://lastexam.ai/paper paper: https://lastexam.ai/paper
configpath: opencompass/configs/datasets/HLE configpath: opencompass/configs/datasets/HLE/hle_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- hellaswag: - hellaswag:
name: HellaSwag name: HellaSwag
@ -504,61 +537,67 @@
name: HumanEval-CN name: HumanEval-CN
category: Code category: Code
paper: '' paper: ''
configpath: opencompass/configs/datasets/humaneval_cn configpath: opencompass/configs/datasets/humaneval_cn/humaneval_cn_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- humaneval_multi: - humaneval_multi:
name: Multi-HumanEval name: Multi-HumanEval
category: Code category: Code
paper: https://arxiv.org/pdf/2210.14868 paper: https://arxiv.org/pdf/2210.14868
configpath: opencompass/configs/datasets/humaneval_multi configpath: opencompass/configs/datasets/humaneval_multi/humaneval_multi_gen.py
configpath_llmjudge: ''
- humaneval_plus:
name: HumanEval+
category: Code
paper: https://arxiv.org/pdf/2305.01210
configpath: opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- humanevalx: - humanevalx:
name: HumanEval-X name: HumanEval-X
category: Code category: Code
paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790 paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
configpath: opencompass/configs/datasets/humanevalx configpath: opencompass/configs/datasets/humanevalx/humanevalx_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- hungarian_math: - hungarian_math:
name: Hungarian_Math name: Hungarian_Math
category: Math category: Math
paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam
configpath: opencompass/configs/datasets/hungarian_exam configpath: opencompass/configs/datasets/hungarian_exam/hungarian_exam_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- iwslt2017: - iwslt2017:
name: IWSLT2017 name: IWSLT2017
category: Language category: Language
paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf
configpath: opencompass/configs/datasets/iwslt2017 configpath: opencompass/configs/datasets/iwslt2017/iwslt2017_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- jigsawmultilingual: - jigsawmultilingual:
name: JigsawMultilingual name: JigsawMultilingual
category: Safety category: Safety
paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data
configpath: opencompass/configs/datasets/jigsawmultilingual configpath: opencompass/configs/datasets/jigsawmultilingual/jigsawmultilingual_clp.py
configpath_llmjudge: '' configpath_llmjudge: ''
- lambada: - lambada:
name: LAMBADA name: LAMBADA
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/1606.06031 paper: https://arxiv.org/pdf/1606.06031
configpath: opencompass/configs/datasets/lambada configpath: opencompass/configs/datasets/lambada/lambada_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- lcsts: - lcsts:
name: LCSTS name: LCSTS
category: Understanding category: Understanding
paper: https://aclanthology.org/D15-1229.pdf paper: https://aclanthology.org/D15-1229.pdf
configpath: opencompass/configs/datasets/lcsts configpath: opencompass/configs/datasets/lcsts/lcsts_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- livestembench: - livestembench:
name: LiveStemBench name: LiveStemBench
category: '' category: ''
paper: '' paper: ''
configpath: opencompass/configs/datasets/livestembench configpath: opencompass/configs/datasets/livestembench/livestembench_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- llm_compression: - llm_compression:
name: LLM Compression name: LLM Compression
category: Bits Per Character (BPC) category: Bits Per Character (BPC)
paper: https://arxiv.org/pdf/2404.09937 paper: https://arxiv.org/pdf/2404.09937
configpath: opencompass/configs/datasets/llm_compression configpath: opencompass/configs/datasets/llm_compression/llm_compression.py
configpath_llmjudge: '' configpath_llmjudge: ''
- math: - math:
name: MATH name: MATH
@ -576,37 +615,37 @@
name: MATH 401 name: MATH 401
category: Math category: Math
paper: https://arxiv.org/pdf/2304.02015 paper: https://arxiv.org/pdf/2304.02015
configpath: opencompass/configs/datasets/math401 configpath: opencompass/configs/datasets/math401/math401_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- mathbench: - mathbench:
name: MathBench name: MathBench
category: Math category: Math
paper: https://arxiv.org/pdf/2405.12209 paper: https://arxiv.org/pdf/2405.12209
configpath: opencompass/configs/datasets/mathbench configpath: opencompass/configs/datasets/mathbench/mathbench_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- mbpp: - mbpp:
name: MBPP name: MBPP
category: Code category: Code
paper: https://arxiv.org/pdf/2108.07732 paper: https://arxiv.org/pdf/2108.07732
configpath: opencompass/configs/datasets/mbpp configpath: opencompass/configs/datasets/mbpp/mbpp_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- mbpp_cn: - mbpp_cn:
name: MBPP-CN name: MBPP-CN
category: Code category: Code
paper: '' paper: ''
configpath: opencompass/configs/datasets/mbpp_cn configpath: opencompass/configs/datasets/mbpp_cn/mbpp_cn_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- mbpp_plus: - mbpp_plus:
name: MBPP-PLUS name: MBPP-PLUS
category: Code category: Code
paper: '' paper: ''
configpath: opencompass/configs/datasets/mbpp_plus configpath: opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- mgsm: - mgsm:
name: MGSM name: MGSM
category: Language / Math category: Language / Math
paper: https://arxiv.org/pdf/2210.03057 paper: https://arxiv.org/pdf/2210.03057
configpath: opencompass/configs/datasets/mgsm configpath: opencompass/configs/datasets/mgsm/mgsm_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- mmlu: - mmlu:
name: MMLU name: MMLU
@ -618,7 +657,7 @@
name: MMLU-CF name: MMLU-CF
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2412.15194 paper: https://arxiv.org/pdf/2412.15194
configpath: opencompass/configs/datasets/mmlu_cf configpath: opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- mmlu_pro: - mmlu_pro:
name: MMLU-Pro name: MMLU-Pro
@ -630,91 +669,99 @@
name: MMMLU name: MMMLU
category: Language / Understanding category: Language / Understanding
paper: https://huggingface.co/datasets/openai/MMMLU paper: https://huggingface.co/datasets/openai/MMMLU
configpath: opencompass/configs/datasets/mmmlu configpath:
- opencompass/configs/datasets/mmmlu/mmmlu_gen.py
- opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- multirc: - multirc:
name: SuperGLUE / MultiRC name: SuperGLUE / MultiRC
category: Understanding category: Understanding
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_MultiRC configpath: opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- narrativeqa: - narrativeqa:
name: NarrativeQA name: NarrativeQA
category: Understanding category: Understanding
paper: https://github.com/google-deepmind/narrativeqa paper: https://github.com/google-deepmind/narrativeqa
configpath: opencompass/configs/datasets/narrativeqa configpath: opencompass/configs/datasets/narrativeqa/narrativeqa_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- natural_question: - natural_question:
name: NaturalQuestions name: NaturalQuestions
category: Knowledge category: Knowledge
paper: https://github.com/google-research-datasets/natural-questions paper: https://github.com/google-research-datasets/natural-questions
configpath: opencompass/configs/datasets/nq configpath: opencompass/configs/datasets/nq/nq_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- natural_question_cn: - natural_question_cn:
name: NaturalQuestions-CN name: NaturalQuestions-CN
category: Knowledge category: Knowledge
paper: '' paper: ''
configpath: opencompass/configs/datasets/nq_cn configpath: opencompass/configs/datasets/nq_cn/nqcn_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- obqa: - obqa:
name: OpenBookQA name: OpenBookQA
category: Knowledge category: Knowledge
paper: https://arxiv.org/pdf/1809.02789v1 paper: https://arxiv.org/pdf/1809.02789v1
configpath: opencompass/configs/datasets/obqa configpath: opencompass/configs/datasets/obqa/obqa_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- piqa: - piqa:
name: PIQA name: PIQA
category: Knowledge / Physics category: Knowledge / Physics
paper: https://arxiv.org/pdf/1911.11641v1 paper: https://arxiv.org/pdf/1911.11641v1
configpath: opencompass/configs/datasets/piqa configpath: opencompass/configs/datasets/piqa/piqa_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- py150: - py150:
name: py150 name: py150
category: Code category: Code
paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line
configpath: opencompass/configs/datasets/py150 configpath: opencompass/configs/datasets/py150/py150_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- qasper: - qasper:
name: Qasper name: Qasper
category: Long Context category: Long Context
paper: https://arxiv.org/pdf/2105.03011 paper: https://arxiv.org/pdf/2105.03011
configpath: opencompass/configs/datasets/qasper configpath: opencompass/configs/datasets/qasper/qasper_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- qaspercut: - qaspercut:
name: Qasper-Cut name: Qasper-Cut
category: Long Context category: Long Context
paper: '' paper: ''
configpath: opencompass/configs/datasets/qaspercut configpath: opencompass/configs/datasets/qaspercut/qaspercut_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- race: - race:
name: RACE name: RACE
category: Examination category: Examination
paper: https://arxiv.org/pdf/1704.04683 paper: https://arxiv.org/pdf/1704.04683
configpath: opencompass/configs/datasets/race configpath: opencompass/configs/datasets/race/race_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- realtoxicprompts: - realtoxicprompts:
name: RealToxicPrompts name: RealToxicPrompts
category: Safety category: Safety
paper: https://arxiv.org/pdf/2009.11462 paper: https://arxiv.org/pdf/2009.11462
configpath: opencompass/configs/datasets/realtoxicprompts configpath: opencompass/configs/datasets/realtoxicprompts/realtoxicprompts_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- record: - record:
name: SuperGLUE / ReCoRD name: SuperGLUE / ReCoRD
category: Understanding category: Understanding
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- rte: - rte:
name: SuperGLUE / RTE name: SuperGLUE / RTE
category: Reasoning category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_RTE configpath: opencompass/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- ocnli: - ocnli:
name: CLUE / OCNLI name: CLUE / OCNLI
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2004.05986 paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_ocnli configpath: opencompass/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen.py
configpath_llmjudge: ''
- ocnlifc:
name: FewCLUE / OCNLI-FC
category: Reasoning
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- rolebench: - rolebench:
name: RoleBench name: RoleBench
@ -726,97 +773,97 @@
name: S3Eval name: S3Eval
category: Long Context category: Long Context
paper: https://aclanthology.org/2024.naacl-long.69.pdf paper: https://aclanthology.org/2024.naacl-long.69.pdf
configpath: opencompass/configs/datasets/s3eval configpath: opencompass/configs/datasets/s3eval/s3eval_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- scibench: - scibench:
name: SciBench name: SciBench
category: Reasoning category: Reasoning
paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf
configpath: opencompass/configs/datasets/scibench configpath: opencompass/configs/datasets/scibench/scibench_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- scicode: - scicode:
name: SciCode name: SciCode
category: Code category: Code
paper: https://arxiv.org/pdf/2407.13168 paper: https://arxiv.org/pdf/2407.13168
configpath: opencompass/configs/datasets/scicode configpath: opencompass/configs/datasets/scicode/scicode_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- simpleqa: - simpleqa:
name: SimpleQA name: SimpleQA
category: Knowledge category: Knowledge
paper: https://arxiv.org/pdf/2411.04368 paper: https://arxiv.org/pdf/2411.04368
configpath: opencompass/configs/datasets/SimpleQA configpath: opencompass/configs/datasets/SimpleQA/simpleqa_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- siqa: - siqa:
name: SocialIQA name: SocialIQA
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/1904.09728 paper: https://arxiv.org/pdf/1904.09728
configpath: opencompass/configs/datasets/siqa configpath: opencompass/configs/datasets/siqa/siqa_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- squad20: - squad20:
name: SQuAD2.0 name: SQuAD2.0
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/1806.03822 paper: https://arxiv.org/pdf/1806.03822
configpath: opencompass/configs/datasets/squad20 configpath: opencompass/configs/datasets/squad20/squad20_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- storycloze: - storycloze:
name: StoryCloze name: StoryCloze
category: Reasoning category: Reasoning
paper: https://aclanthology.org/2022.emnlp-main.616.pdf paper: https://aclanthology.org/2022.emnlp-main.616.pdf
configpath: opencompass/configs/datasets/storycloze configpath: opencompass/configs/datasets/storycloze/storycloze_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- strategyqa: - strategyqa:
name: StrategyQA name: StrategyQA
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2101.02235 paper: https://arxiv.org/pdf/2101.02235
configpath: opencompass/configs/datasets/strategyqa configpath: opencompass/configs/datasets/strategyqa/strategyqa_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- summedits: - summedits:
name: SummEdits name: SummEdits
category: Language category: Language
paper: https://aclanthology.org/2023.emnlp-main.600.pdf paper: https://aclanthology.org/2023.emnlp-main.600.pdf
configpath: opencompass/configs/datasets/summedits configpath: opencompass/configs/datasets/summedits/summedits_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- summscreen: - summscreen:
name: SummScreen name: SummScreen
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2104.07091v1 paper: https://arxiv.org/pdf/2104.07091v1
configpath: opencompass/configs/datasets/summscreen configpath: opencompass/configs/datasets/summscreen/summscreen_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- svamp: - svamp:
name: SVAMP name: SVAMP
category: Math category: Math
paper: https://aclanthology.org/2021.naacl-main.168.pdf paper: https://aclanthology.org/2021.naacl-main.168.pdf
configpath: opencompass/configs/datasets/SVAMP configpath: opencompass/configs/datasets/SVAMP/svamp_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- tabmwp: - tabmwp:
name: TabMWP name: TabMWP
category: Math / Table category: Math / Table
paper: https://arxiv.org/pdf/2209.14610 paper: https://arxiv.org/pdf/2209.14610
configpath: opencompass/configs/datasets/TabMWP configpath: opencompass/configs/datasets/TabMWP/TabMWP_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- taco: - taco:
name: TACO name: TACO
category: Code category: Code
paper: https://arxiv.org/pdf/2312.14852 paper: https://arxiv.org/pdf/2312.14852
configpath: opencompass/configs/datasets/taco configpath: opencompass/configs/datasets/taco/taco_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- tnews: - tnews:
name: FewCLUE / TNEWS name: FewCLUE / TNEWS
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2107.07498 paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_tnews configpath: opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- bustm: - bustm:
name: FewCLUE / BUSTM name: FewCLUE / BUSTM
category: Reasoning category: Reasoning
paper: https://arxiv.org/pdf/2107.07498 paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_bustm configpath: opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- csl: - csl:
name: FewCLUE / CSL name: FewCLUE / CSL
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2107.07498 paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_csl configpath: opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- ocnli_fc: - ocnli_fc:
name: FewCLUE / OCNLI-FC name: FewCLUE / OCNLI-FC
@ -828,65 +875,95 @@
name: TriviaQA name: TriviaQA
category: Knowledge category: Knowledge
paper: https://arxiv.org/pdf/1705.03551v2 paper: https://arxiv.org/pdf/1705.03551v2
configpath: opencompass/configs/datasets/triviaqa configpath: opencompass/configs/datasets/triviaqa/triviaqa_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- triviaqarc: - triviaqarc:
name: TriviaQA-RC name: TriviaQA-RC
category: Knowledge / Understanding category: Knowledge / Understanding
paper: '' paper: ''
configpath: opencompass/configs/datasets/triviaqarc configpath: opencompass/configs/datasets/triviaqarc/triviaqarc_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- truthfulqa: - truthfulqa:
name: TruthfulQA name: TruthfulQA
category: Safety category: Safety
paper: https://arxiv.org/pdf/2109.07958v2 paper: https://arxiv.org/pdf/2109.07958v2
configpath: opencompass/configs/datasets/truthfulqa configpath: opencompass/configs/datasets/truthfulqa/truthfulqa_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- tydiqa: - tydiqa:
name: TyDi-QA name: TyDi-QA
category: Language category: Language
paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf
configpath: opencompass/configs/datasets/tydiqa configpath: opencompass/configs/datasets/tydiqa/tydiqa_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- wic: - wic:
name: SuperGLUE / WiC name: SuperGLUE / WiC
category: Language category: Language
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_WiC configpath: opencompass/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- wsc: - wsc:
name: SuperGLUE / WSC name: SuperGLUE / WSC
category: Language / WSC category: Language / WSC
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_WSC configpath: opencompass/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- winogrande: - winogrande:
name: WinoGrande name: WinoGrande
category: Language / WSC category: Language / WSC
paper: https://arxiv.org/pdf/1907.10641v2 paper: https://arxiv.org/pdf/1907.10641v2
configpath: opencompass/configs/datasets/winogrande configpath: opencompass/configs/datasets/winogrande/winogrande_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- xcopa: - xcopa:
name: XCOPA name: XCOPA
category: Language category: Language
paper: https://arxiv.org/pdf/2005.00333 paper: https://arxiv.org/pdf/2005.00333
configpath: opencompass/configs/datasets/XCOPA configpath: opencompass/configs/datasets/XCOPA/XCOPA_ppl.py
configpath_llmjudge: '' configpath_llmjudge: ''
- xiezhi: - xiezhi:
name: Xiezhi name: Xiezhi
category: Knowledge category: Knowledge
paper: https://arxiv.org/pdf/2306.05783 paper: https://arxiv.org/pdf/2306.05783
configpath: opencompass/configs/datasets/xiezhi configpath: opencompass/configs/datasets/xiezhi/xiezhi_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- xlsum: - xlsum:
name: XLSum name: XLSum
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/2106.13822v1 paper: https://arxiv.org/pdf/2106.13822v1
configpath: opencompass/configs/datasets/XLSum configpath: opencompass/configs/datasets/XLSum/XLSum_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''
- xsum: - xsum:
name: Xsum name: Xsum
category: Understanding category: Understanding
paper: https://arxiv.org/pdf/1808.08745 paper: https://arxiv.org/pdf/1808.08745
configpath: opencompass/configs/datasets/Xsum configpath: opencompass/configs/datasets/Xsum/Xsum_gen.py
configpath_llmjudge: ''
- cola:
name: GLUE / CoLA
category: Understanding
paper: https://arxiv.org/pdf/1804.07461
configpath: opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl.py
configpath_llmjudge: ''
- mprc:
name: GLUE / MRPC name: GLUE / MRPC
category: Understanding
paper: https://arxiv.org/pdf/1804.07461
configpath: opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl.py
configpath_llmjudge: ''
- qqp:
name: GLUE / QQP
category: Understanding
paper: https://arxiv.org/pdf/1804.07461
configpath: opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl.py
configpath_llmjudge: ''
- omni_math:
name: Omni-MATH
category: Math
paper: https://omni-math.github.io/
configpath: opencompass/configs/datasets/omni_math/omni_math_gen.py
configpath_llmjudge: ''
- wikibench:
name: WikiBench
category: Knowledge
paper: ''
configpath: opencompass/configs/datasets/wikibench/wikibench_gen.py
configpath_llmjudge: '' configpath_llmjudge: ''

View File

@ -14,6 +14,12 @@ On this page, we have listed all the datasets supported by OpenCompass.
You can use sorting and search functions to find the dataset you need. You can use sorting and search functions to find the dataset you need.
We provide recommended running configurations for each dataset,
and for some datasets also offer recommended configurations based on LLM Judge.
You can quickly start evaluation tasks based on the recommended configurations.
However, please note that these configurations may be updated over time.
""" """
with open('dataset_statistics.md', 'w') as f: with open('dataset_statistics.md', 'w') as f:
@ -24,7 +30,7 @@ load_path = str(OC_ROOT / 'dataset-index.yml')
with open(load_path, 'r') as f2: with open(load_path, 'r') as f2:
data_list = yaml.load(f2, Loader=yaml.FullLoader) data_list = yaml.load(f2, Loader=yaml.FullLoader)
HEADER = ['name', 'category', 'paper', 'configpath'] HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
def table_format(data_list): def table_format(data_list):
@ -35,6 +41,13 @@ def table_format(data_list):
for index in HEADER: for index in HEADER:
if index == 'paper': if index == 'paper':
table_format_list_sub.append('[link](' + i[j][index] + ')') table_format_list_sub.append('[link](' + i[j][index] + ')')
elif index == 'configpath_llmjudge':
if i[j][index] == '':
table_format_list_sub.append(i[j][index])
else:
table_format_list_sub.append('[link](' +
GITHUB_PREFIX +
i[j][index] + ')')
elif index == 'configpath': elif index == 'configpath':
if isinstance(i[j][index], list): if isinstance(i[j][index], list):
sub_list_text = '' sub_list_text = ''
@ -61,7 +74,10 @@ def generate_table(data_list, title=None):
if title is not None: if title is not None:
f.write(f'\n{title}') f.write(f'\n{title}')
f.write("""\n```{table}\n:class: dataset\n""") f.write("""\n```{table}\n:class: dataset\n""")
header = ['Name', 'Category', 'Paper or Repository', 'Config File'] header = [
'Name', 'Category', 'Paper or Repository', 'Recommended Config',
'Recommended Config (LLM Judge)'
]
table_cfg = dict(tablefmt='pipe', table_cfg = dict(tablefmt='pipe',
floatfmt='.2f', floatfmt='.2f',
numalign='right', numalign='right',

View File

@ -14,6 +14,10 @@ DATASETZOO_TEMPLATE = """\
你可以使用排序和搜索功能找到需要的数据集。 你可以使用排序和搜索功能找到需要的数据集。
我们对每一个数据集都给出了推荐的运行配置，部分数据集中还提供了基于LLM Judge的推荐配置。
你可以基于推荐配置快速启动评测。但请注意，推荐配置可能随时间推移被更新。
""" """
with open('dataset_statistics.md', 'w') as f: with open('dataset_statistics.md', 'w') as f:
@ -35,7 +39,13 @@ def table_format(data_list):
for index in HEADER: for index in HEADER:
if index == 'paper': if index == 'paper':
table_format_list_sub.append('[链接](' + i[j][index] + ')') table_format_list_sub.append('[链接](' + i[j][index] + ')')
elif index != 'name' and index != 'category': elif index == 'configpath_llmjudge':
if i[j][index] == '':
table_format_list_sub.append(i[j][index])
else:
table_format_list_sub.append('[链接](' + GITHUB_PREFIX +
i[j][index] + ')')
elif index == 'configpath':
if isinstance(i[j][index], list): if isinstance(i[j][index], list):
sub_list_text = '' sub_list_text = ''
for k in i[j][index]: for k in i[j][index]:
@ -60,7 +70,7 @@ def generate_table(data_list, title=None):
if title is not None: if title is not None:
f.write(f'\n{title}') f.write(f'\n{title}')
f.write("""\n```{table}\n:class: dataset\n""") f.write("""\n```{table}\n:class: dataset\n""")
header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置(基于规则评估)', '推荐配置(基于LLM评估)'] header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置', '推荐配置(基于LLM评估)']
table_cfg = dict(tablefmt='pipe', table_cfg = dict(tablefmt='pipe',
floatfmt='.2f', floatfmt='.2f',
numalign='right', numalign='right',

View File

@ -78,7 +78,6 @@ def generic_llmjudge_postprocess(
f'No gold answer for {k}, use empty string as reference!') f'No gold answer for {k}, use empty string as reference!')
references.append('') references.append('')
results = get_final_results(judged_answers, references, origial_responses) results = get_final_results(judged_answers, references, origial_responses)
results['details'] = output results['details'] = output
return results return results