mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
feat
This commit is contained in:
parent
716c02785c
commit
ffe00a830d
@ -8,25 +8,25 @@
|
|||||||
name: NPHardEval
|
name: NPHardEval
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://arxiv.org/pdf/2312.14890v2
|
paper: https://arxiv.org/pdf/2312.14890v2
|
||||||
configpath: opencompass/configs/datasets/NPHardEval
|
configpath: opencompass/configs/datasets/NPHardEval/NPHardEval_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- pmmeval:
|
- pmmeval:
|
||||||
name: PMMEval
|
name: PMMEval
|
||||||
category: Language
|
category: Language
|
||||||
paper: https://arxiv.org/pdf/2411.09116v1
|
paper: https://arxiv.org/pdf/2411.09116v1
|
||||||
configpath: opencompass/configs/datasets/PMMEval
|
configpath: opencompass/configs/datasets/PMMEval/pmmeval_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- theoremqa:
|
- theoremqa:
|
||||||
name: TheoremQA
|
name: TheoremQA
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://arxiv.org/pdf/2305.12524
|
paper: https://arxiv.org/pdf/2305.12524
|
||||||
configpath: opencompass/configs/datasets/TheroremQA
|
configpath: opencompass/configs/datasets/TheroremQA/TheoremQA_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- agieval:
|
- agieval:
|
||||||
name: AGIEval
|
name: AGIEval
|
||||||
category: Examination
|
category: Examination
|
||||||
paper: https://arxiv.org/pdf/2304.06364
|
paper: https://arxiv.org/pdf/2304.06364
|
||||||
configpath: opencompass/configs/datasets/agieval
|
configpath: opencompass/configs/datasets/agieval/agieval_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- babilong:
|
- babilong:
|
||||||
name: BABILong
|
name: BABILong
|
||||||
@ -44,13 +44,13 @@
|
|||||||
name: CaLM
|
name: CaLM
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://arxiv.org/pdf/2405.00622
|
paper: https://arxiv.org/pdf/2405.00622
|
||||||
configpath: opencompass/configs/datasets/calm
|
configpath: opencompass/configs/datasets/calm/calm.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- infinitebench:
|
- infinitebench:
|
||||||
name: InfiniteBench (∞Bench)
|
name: InfiniteBench (∞Bench)
|
||||||
category: Long Context
|
category: Long Context
|
||||||
paper: https://aclanthology.org/2024.acl-long.814.pdf
|
paper: https://aclanthology.org/2024.acl-long.814.pdf
|
||||||
configpath: opencompass/configs/datasets/infinitebench
|
configpath: opencompass/configs/datasets/infinitebench/infinitebench.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- korbench:
|
- korbench:
|
||||||
name: KOR-Bench
|
name: KOR-Bench
|
||||||
@ -62,13 +62,15 @@
|
|||||||
name: LawBench
|
name: LawBench
|
||||||
category: Knowledge / Law
|
category: Knowledge / Law
|
||||||
paper: https://arxiv.org/pdf/2309.16289
|
paper: https://arxiv.org/pdf/2309.16289
|
||||||
configpath: opencompass/configs/datasets/lawbench
|
configpath:
|
||||||
|
- opencompass/configs/datasets/lawbench/lawbench_zero_shot_gen_002588.py
|
||||||
|
- opencompass/configs/datasets/lawbench/lawbench_one_shot_gen_002588.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- leval:
|
- leval:
|
||||||
name: L-Eval
|
name: L-Eval
|
||||||
category: Long Context
|
category: Long Context
|
||||||
paper: https://arxiv.org/pdf/2307.11088v1
|
paper: https://arxiv.org/pdf/2307.11088v1
|
||||||
configpath: opencompass/configs/datasets/leval
|
configpath: opencompass/configs/datasets/leval/leval.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- livecodebench:
|
- livecodebench:
|
||||||
name: LiveCodeBench
|
name: LiveCodeBench
|
||||||
@ -80,25 +82,39 @@
|
|||||||
name: LiveMathBench
|
name: LiveMathBench
|
||||||
category: Math
|
category: Math
|
||||||
paper: https://arxiv.org/pdf/2412.13147
|
paper: https://arxiv.org/pdf/2412.13147
|
||||||
configpath: opencompass/configs/datasets/livemathbench
|
configpath: opencompass/configs/datasets/livemathbench/livemathbench_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- livereasonbench:
|
||||||
|
name: LiveReasonBench
|
||||||
|
category: Reasoning
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- longbench:
|
- longbench:
|
||||||
name: LongBench
|
name: LongBench
|
||||||
category: Long Context
|
category: Long Context
|
||||||
paper: https://github.com/THUDM/LongBench
|
paper: https://github.com/THUDM/LongBench
|
||||||
configpath: opencompass/configs/datasets/livemathbench
|
configpath:
|
||||||
|
- opencompass/configs/datasets/longbench/longbench.py
|
||||||
|
- opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- lveval:
|
- lveval:
|
||||||
name: LV-Eval
|
name: LV-Eval
|
||||||
category: Long Context
|
category: Long Context
|
||||||
paper: https://arxiv.org/pdf/2402.05136
|
paper: https://arxiv.org/pdf/2402.05136
|
||||||
configpath: opencompass/configs/datasets/lveval
|
configpath: opencompass/configs/datasets/lveval/lveval.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- mastermath2024v1:
|
||||||
|
name: Mastermath2024v1
|
||||||
|
category: Math
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- medbench:
|
- medbench:
|
||||||
name: MedBench
|
name: MedBench
|
||||||
category: Knowledge / Medicine
|
category: Knowledge / Medicine
|
||||||
paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
|
paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
|
||||||
configpath: opencompass/configs/datasets/MedBench
|
configpath: opencompass/configs/datasets/MedBench/medbench_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- musr:
|
- musr:
|
||||||
name: MuSR
|
name: MuSR
|
||||||
@ -140,7 +156,7 @@
|
|||||||
name: FLAMES
|
name: FLAMES
|
||||||
category: Subjective / Alignment
|
category: Subjective / Alignment
|
||||||
paper: https://arxiv.org/pdf/2311.06899
|
paper: https://arxiv.org/pdf/2311.06899
|
||||||
configpath: opencompass/configs/datasets/subjective/flames
|
configpath: opencompass/configs/datasets/subjective/flames/flames_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- fofo:
|
- fofo:
|
||||||
name: FOFO
|
name: FOFO
|
||||||
@ -182,55 +198,63 @@
|
|||||||
name: T-Eval
|
name: T-Eval
|
||||||
category: Tool Utilization
|
category: Tool Utilization
|
||||||
paper: https://arxiv.org/pdf/2312.14033
|
paper: https://arxiv.org/pdf/2312.14033
|
||||||
configpath: opencompass/configs/datasets/teval
|
configpath:
|
||||||
|
- opencompass/configs/datasets/teval/teval_en_gen.py
|
||||||
|
- opencompass/configs/datasets/teval/teval_zh_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- finalceiq:
|
- finalceiq:
|
||||||
name: FinanceIQ
|
name: FinanceIQ
|
||||||
category: Knowledge / Finance
|
category: Knowledge / Finance
|
||||||
paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ
|
paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ
|
||||||
configpath: opencompass/configs/datasets/FinanceIQ
|
configpath: opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- gaokaobench:
|
- gaokaobench:
|
||||||
name: GAOKAOBench
|
name: GAOKAOBench
|
||||||
category: Examination
|
category: Examination
|
||||||
paper: https://arxiv.org/pdf/2305.12474
|
paper: https://arxiv.org/pdf/2305.12474
|
||||||
configpath: opencompass/configs/datasets/GaokaoBench
|
configpath: opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- lcbench:
|
- lcbench:
|
||||||
name: LCBench
|
name: LCBench
|
||||||
category: Code
|
category: Code
|
||||||
paper: https://github.com/open-compass/CodeBench/
|
paper: https://github.com/open-compass/CodeBench/
|
||||||
configpath: opencompass/configs/datasets/LCBench
|
configpath: opencompass/configs/datasets/LCBench/lcbench_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- MMLUArabic:
|
- MMLUArabic:
|
||||||
name: ArabicMMLU
|
name: ArabicMMLU
|
||||||
category: Language
|
category: Language
|
||||||
paper: https://arxiv.org/pdf/2402.12840
|
paper: https://arxiv.org/pdf/2402.12840
|
||||||
configpath: opencompass/configs/datasets/MMLUArabic
|
configpath: opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- OpenFinData:
|
- OpenFinData:
|
||||||
name: OpenFinData
|
name: OpenFinData
|
||||||
category: Knowledge / Finance
|
category: Knowledge / Finance
|
||||||
paper: https://github.com/open-compass/OpenFinData
|
paper: https://github.com/open-compass/OpenFinData
|
||||||
configpath: opencompass/configs/datasets/OpenFinData
|
configpath: opencompass/configs/datasets/OpenFinData/OpenFinData_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- QuALITY:
|
- QuALITY:
|
||||||
name: QuALITY
|
name: QuALITY
|
||||||
category: Long Context
|
category: Long Context
|
||||||
paper: https://arxiv.org/pdf/2112.08608
|
paper: https://arxiv.org/pdf/2112.08608
|
||||||
configpath: opencompass/configs/datasets/QuALITY
|
configpath: opencompass/configs/datasets/QuALITY/QuALITY_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- advglue:
|
- advglue:
|
||||||
name: Adversarial GLUE
|
name: Adversarial GLUE
|
||||||
category: Safety
|
category: Safety
|
||||||
paper: https://openreview.net/pdf?id=GF9cSKI3A_q
|
paper: https://openreview.net/pdf?id=GF9cSKI3A_q
|
||||||
configpath: opencompass/configs/datasets/adv_glue
|
configpath:
|
||||||
|
- opencompass/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen.py
|
||||||
|
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen.py
|
||||||
|
- opencompass/configs/datasets/adv_glue/adv_glue_qnli/adv_glue_qnli_gen.py
|
||||||
|
- opencompass/configs/datasets/adv_glue/adv_glue_qqp/adv_glue_qqp_gen.py
|
||||||
|
- opencompass/configs/datasets/adv_glue/adv_glue_rte/adv_glue_rte_gen.py
|
||||||
|
- opencompass/configs/datasets/adv_glue/adv_glue_sst2/adv_glue_sst2_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- afqmcd:
|
- afqmcd:
|
||||||
name: CLUE / AFQMC
|
name: CLUE / AFQMC
|
||||||
category: Language
|
category: Language
|
||||||
paper: https://arxiv.org/pdf/2004.05986
|
paper: https://arxiv.org/pdf/2004.05986
|
||||||
configpath: opencompass/configs/datasets/CLUE_afqmc
|
configpath: opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- aime2024:
|
- aime2024:
|
||||||
name: AIME2024
|
name: AIME2024
|
||||||
@ -242,41 +266,46 @@
|
|||||||
name: Adversarial NLI
|
name: Adversarial NLI
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://arxiv.org/pdf/1910.14599v2
|
paper: https://arxiv.org/pdf/1910.14599v2
|
||||||
configpath: opencompass/configs/datasets/anli
|
configpath: opencompass/configs/datasets/anli/anli_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- anthropics_evals:
|
- anthropics_evals:
|
||||||
name: Anthropics Evals
|
name: Anthropics Evals
|
||||||
category: Safety
|
category: Safety
|
||||||
paper: https://arxiv.org/pdf/2212.09251
|
paper: https://arxiv.org/pdf/2212.09251
|
||||||
configpath: opencompass/configs/datasets/anthropics_evals
|
configpath:
|
||||||
|
- opencompass/configs/datasets/anthropics_evals/airisk_gen.py
|
||||||
|
- opencompass/configs/datasets/anthropics_evals/persona_gen.py
|
||||||
|
- opencompass/configs/datasets/anthropics_evals/sycophancy_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- apps:
|
- apps:
|
||||||
name: APPS
|
name: APPS
|
||||||
category: Code
|
category: Code
|
||||||
paper: https://arxiv.org/pdf/2105.09938
|
paper: https://arxiv.org/pdf/2105.09938
|
||||||
configpath: opencompass/configs/datasets/apps
|
configpath:
|
||||||
|
- opencompass/configs/datasets/apps/apps_gen.py
|
||||||
|
- opencompass/configs/datasets/apps/apps_mini_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- arc:
|
- arc:
|
||||||
name: ARC
|
name: ARC
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://arxiv.org/pdf/1803.05457
|
paper: https://arxiv.org/pdf/1803.05457
|
||||||
configpath:
|
configpath:
|
||||||
- opencompass/configs/datasets/ARC_c
|
- opencompass/configs/datasets/ARC_c/ARC_c_gen.py
|
||||||
- opencompass/configs/datasets/ARC_e
|
- opencompass/configs/datasets/ARC_e/ARC_e_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- arc_prize_public_eval:
|
- arc_prize_public_eval:
|
||||||
name: ARC Prize
|
name: ARC Prize
|
||||||
category: ARC-AGI
|
category: ARC-AGI
|
||||||
paper: https://arcprize.org/guide#private
|
paper: https://arcprize.org/guide#private
|
||||||
configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation
|
configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- ax:
|
- ax:
|
||||||
name: SuperGLUE / AX
|
name: SuperGLUE / AX
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
configpath:
|
configpath:
|
||||||
- opencompass/configs/datasets/SuperGLUE_AX_b
|
- opencompass/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen.py
|
||||||
- opencompass/configs/datasets/SuperGLUE_AX_g
|
- opencompass/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- bbh:
|
- bbh:
|
||||||
name: BIG-Bench Hard
|
name: BIG-Bench Hard
|
||||||
@ -288,79 +317,82 @@
|
|||||||
name: SuperGLUE / BoolQ
|
name: SuperGLUE / BoolQ
|
||||||
category: Knowledge
|
category: Knowledge
|
||||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
configpath: opencompass/configs/datasets/SuperGLUE_BoolQ
|
configpath: opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- c3:
|
- c3:
|
||||||
name: CLUE / C3 (C³)
|
name: CLUE / C3 (C³)
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://arxiv.org/pdf/2004.05986
|
paper: https://arxiv.org/pdf/2004.05986
|
||||||
configpath: opencompass/configs/datasets/CLUE_C3
|
configpath: opencompass/configs/datasets/CLUE_C3/CLUE_C3_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- cb:
|
- cb:
|
||||||
name: SuperGLUE / CB
|
name: SuperGLUE / CB
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
configpath: opencompass/configs/datasets/SuperGLUE_CB
|
configpath: opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- ceval:
|
- ceval:
|
||||||
name: C-EVAL
|
name: C-EVAL
|
||||||
category: Examination
|
category: Examination
|
||||||
paper: https://arxiv.org/pdf/2305.08322v1
|
paper: https://arxiv.org/pdf/2305.08322v1
|
||||||
configpath: opencompass/configs/datasets/ceval
|
configpath: opencompass/configs/datasets/ceval/ceval_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- charm:
|
- charm:
|
||||||
name: CHARM
|
name: CHARM
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://arxiv.org/pdf/2403.14112
|
paper: https://arxiv.org/pdf/2403.14112
|
||||||
configpath: opencompass/configs/datasets/CHARM
|
configpath: opencompass/configs/datasets/CHARM/charm_reason_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- chembench:
|
- chembench:
|
||||||
name: ChemBench
|
name: ChemBench
|
||||||
category: Knowledge / Chemistry
|
category: Knowledge / Chemistry
|
||||||
paper: https://arxiv.org/pdf/2404.01475
|
paper: https://arxiv.org/pdf/2404.01475
|
||||||
configpath: opencompass/configs/datasets/ChemBench
|
configpath: opencompass/configs/datasets/ChemBench/ChemBench_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- chid:
|
- chid:
|
||||||
name: FewCLUE / CHID
|
name: FewCLUE / CHID
|
||||||
category: Language
|
category: Language
|
||||||
paper: https://arxiv.org/pdf/2107.07498
|
paper: https://arxiv.org/pdf/2107.07498
|
||||||
configpath: opencompass/configs/datasets/FewCLUE_chid
|
configpath: opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- chinese_simpleqa:
|
- chinese_simpleqa:
|
||||||
name: Chinese SimpleQA
|
name: Chinese SimpleQA
|
||||||
category: Knowledge
|
category: Knowledge
|
||||||
paper: https://arxiv.org/pdf/2411.07140
|
paper: https://arxiv.org/pdf/2411.07140
|
||||||
configpath: opencompass/configs/datasets/chinese_simpleqa
|
configpath: opencompass/configs/datasets/chinese_simpleqa/chinese_simpleqa_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- cibench:
|
- cibench:
|
||||||
name: CIBench
|
name: CIBench
|
||||||
category: Code
|
category: Code
|
||||||
paper: https://www.arxiv.org/pdf/2407.10499
|
paper: https://www.arxiv.org/pdf/2407.10499
|
||||||
configpath: opencompass/configs/datasets/CIBench
|
configpath:
|
||||||
|
- opencompass/configs/datasets/CIBench/CIBench_generation_gen_8ab0dc.py
|
||||||
|
- opencompass/configs/datasets/CIBench/CIBench_template_gen_e6b12a.py
|
||||||
|
- opencompass/configs/datasets/CIBench/CIBench_template_oracle_gen_fecda1.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- civilcomments:
|
- civilcomments:
|
||||||
name: CivilComments
|
name: CivilComments
|
||||||
category: Safety
|
category: Safety
|
||||||
paper: https://arxiv.org/pdf/1903.04561
|
paper: https://arxiv.org/pdf/1903.04561
|
||||||
configpath: opencompass/configs/datasets/civilcomments
|
configpath: opencompass/configs/datasets/civilcomments/civilcomments_clp.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- clozeTest_maxmin:
|
- clozeTest_maxmin:
|
||||||
name: Cloze Test-max/min
|
name: Cloze Test-max/min
|
||||||
category: Code
|
category: Code
|
||||||
paper: https://arxiv.org/pdf/2102.04664
|
paper: https://arxiv.org/pdf/2102.04664
|
||||||
configpath: opencompass/configs/datasets/clozeTest_maxmin
|
configpath: opencompass/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- cluewsc:
|
- cluewsc:
|
||||||
name: FewCLUE / CLUEWSC
|
name: FewCLUE / CLUEWSC
|
||||||
category: Language / WSC
|
category: Language / WSC
|
||||||
paper: https://arxiv.org/pdf/2107.07498
|
paper: https://arxiv.org/pdf/2107.07498
|
||||||
configpath: opencompass/configs/datasets/FewCLUE_cluewsc
|
configpath: opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- cmb:
|
- cmb:
|
||||||
name: CMB
|
name: CMB
|
||||||
category: Knowledge / Medicine
|
category: Knowledge / Medicine
|
||||||
paper: https://arxiv.org/pdf/2308.08833
|
paper: https://arxiv.org/pdf/2308.08833
|
||||||
configpath: opencompass/configs/datasets/cmb
|
configpath: opencompass/configs/datasets/cmb/cmb_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- cmmlu:
|
- cmmlu:
|
||||||
name: CMMLU
|
name: CMMLU
|
||||||
@ -372,61 +404,61 @@
|
|||||||
name: CLUE / CMNLI
|
name: CLUE / CMNLI
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://arxiv.org/pdf/2004.05986
|
paper: https://arxiv.org/pdf/2004.05986
|
||||||
configpath: opencompass/configs/datasets/CLUE_cmnli
|
configpath: opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- cmo_fib:
|
- cmo_fib:
|
||||||
name: cmo_fib
|
name: cmo_fib
|
||||||
category: Examination
|
category: Examination
|
||||||
paper: ''
|
paper: ''
|
||||||
configpath: opencompass/configs/datasets/cmo_fib
|
configpath: opencompass/configs/datasets/cmo_fib/cmo_fib_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- cmrc:
|
- cmrc:
|
||||||
name: CLUE / CMRC
|
name: CLUE / CMRC
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://arxiv.org/pdf/2004.05986
|
paper: https://arxiv.org/pdf/2004.05986
|
||||||
configpath: opencompass/configs/datasets/CLUE_CMRC
|
configpath: opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- commonsenseqa:
|
- commonsenseqa:
|
||||||
name: CommonSenseQA
|
name: CommonSenseQA
|
||||||
category: Knowledge
|
category: Knowledge
|
||||||
paper: https://arxiv.org/pdf/1811.00937v2
|
paper: https://arxiv.org/pdf/1811.00937v2
|
||||||
configpath: opencompass/configs/datasets/commonsenseqa
|
configpath: opencompass/configs/datasets/commonsenseqa/commonsenseqa_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- commonsenseqa_cn:
|
- commonsenseqa_cn:
|
||||||
name: CommonSenseQA-CN
|
name: CommonSenseQA-CN
|
||||||
category: Knowledge
|
category: Knowledge
|
||||||
paper: ''
|
paper: ''
|
||||||
configpath: opencompass/configs/datasets/commonsenseqa_cn
|
configpath: opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- copa:
|
- copa:
|
||||||
name: SuperGLUE / COPA
|
name: SuperGLUE / COPA
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
configpath: opencompass/configs/datasets/SuperGLUE_COPA
|
configpath: opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- crowspairs:
|
- crowspairs:
|
||||||
name: CrowsPairs
|
name: CrowsPairs
|
||||||
category: Safety
|
category: Safety
|
||||||
paper: https://arxiv.org/pdf/2010.00133
|
paper: https://arxiv.org/pdf/2010.00133
|
||||||
configpath: opencompass/configs/datasets/crowspairs
|
configpath: opencompass/configs/datasets/crowspairs/crowspairs_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- crowspairs_cn:
|
- crowspairs_cn:
|
||||||
name: CrowsPairs-CN
|
name: CrowsPairs-CN
|
||||||
category: Safety
|
category: Safety
|
||||||
paper: ''
|
paper: ''
|
||||||
configpath: opencompass/configs/datasets/crowspairs_cn
|
configpath: opencompass/configs/datasets/crowspairs_cn/crowspairscn_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- cvalues:
|
- cvalues:
|
||||||
name: CVALUES
|
name: CVALUES
|
||||||
category: Safety
|
category: Safety
|
||||||
paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf
|
paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf
|
||||||
configpath: opencompass/configs/datasets/cvalues
|
configpath: opencompass/configs/datasets/cvalues/cvalues_responsibility_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- drcd:
|
- drcd:
|
||||||
name: CLUE / DRCD
|
name: CLUE / DRCD
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://arxiv.org/pdf/2004.05986
|
paper: https://arxiv.org/pdf/2004.05986
|
||||||
configpath: opencompass/configs/datasets/CLUE_DRCD
|
configpath: opencompass/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- drop:
|
- drop:
|
||||||
name: DROP (DROP Simple Eval)
|
name: DROP (DROP Simple Eval)
|
||||||
@ -438,31 +470,32 @@
|
|||||||
name: DS-1000
|
name: DS-1000
|
||||||
category: Code
|
category: Code
|
||||||
paper: https://arxiv.org/pdf/2211.11501
|
paper: https://arxiv.org/pdf/2211.11501
|
||||||
configpath: opencompass/configs/datasets/ds1000
|
configpath:
|
||||||
|
- opencompass/configs/datasets/ds1000/ds1000_gen_5c4bec.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- eprstmt:
|
- eprstmt:
|
||||||
name: FewCLUE / EPRSTMT
|
name: FewCLUE / EPRSTMT
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://arxiv.org/pdf/2107.07498
|
paper: https://arxiv.org/pdf/2107.07498
|
||||||
configpath: opencompass/configs/datasets/FewCLUE_eprstmt
|
configpath: opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- flores:
|
- flores:
|
||||||
name: Flores
|
name: Flores
|
||||||
category: Language
|
category: Language
|
||||||
paper: https://aclanthology.org/D19-1632.pdf
|
paper: https://aclanthology.org/D19-1632.pdf
|
||||||
configpath: opencompass/configs/datasets/flores
|
configpath: opencompass/configs/datasets/flores/flores_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- game24:
|
- game24:
|
||||||
name: Game24
|
name: Game24
|
||||||
category: Math
|
category: Math
|
||||||
paper: https://huggingface.co/datasets/nlile/24-game
|
paper: https://huggingface.co/datasets/nlile/24-game
|
||||||
configpath: opencompass/configs/datasets/game24
|
configpath: opencompass/configs/datasets/game24/game24_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- govrepcrs:
|
- govrepcrs:
|
||||||
name: Government Report Dataset
|
name: Government Report Dataset
|
||||||
category: Long Context
|
category: Long Context
|
||||||
paper: https://aclanthology.org/2021.naacl-main.112.pdf
|
paper: https://aclanthology.org/2021.naacl-main.112.pdf
|
||||||
configpath: opencompass/configs/datasets/govrepcrs
|
configpath: opencompass/configs/datasets/govrepcrs/govrepcrs_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- gpqa:
|
- gpqa:
|
||||||
name: GPQA
|
name: GPQA
|
||||||
@ -474,19 +507,19 @@
|
|||||||
name: GSM8K
|
name: GSM8K
|
||||||
category: Math
|
category: Math
|
||||||
paper: https://arxiv.org/pdf/2110.14168v2
|
paper: https://arxiv.org/pdf/2110.14168v2
|
||||||
configpath: opencompass/configs/datasets/gsm8k
|
configpath: opencompass/configs/datasets/gsm8k/gsm8k_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- gsm_hard:
|
- gsm_hard:
|
||||||
name: GSM-Hard
|
name: GSM-Hard
|
||||||
category: Math
|
category: Math
|
||||||
paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf
|
paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf
|
||||||
configpath: opencompass/configs/datasets/gsm_hard
|
configpath: opencompass/configs/datasets/gsm_hard/gsmhard_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- hle:
|
- hle:
|
||||||
name: HLE(Humanity's Last Exam)
|
name: HLE(Humanity's Last Exam)
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://lastexam.ai/paper
|
paper: https://lastexam.ai/paper
|
||||||
configpath: opencompass/configs/datasets/HLE
|
configpath: opencompass/configs/datasets/HLE/hle_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- hellaswag:
|
- hellaswag:
|
||||||
name: HellaSwag
|
name: HellaSwag
|
||||||
@ -504,61 +537,67 @@
|
|||||||
name: HumanEval-CN
|
name: HumanEval-CN
|
||||||
category: Code
|
category: Code
|
||||||
paper: ''
|
paper: ''
|
||||||
configpath: opencompass/configs/datasets/humaneval_cn
|
configpath: opencompass/configs/datasets/humaneval_cn/humaneval_cn_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- humaneval_multi:
|
- humaneval_multi:
|
||||||
name: Multi-HumanEval
|
name: Multi-HumanEval
|
||||||
category: Code
|
category: Code
|
||||||
paper: https://arxiv.org/pdf/2210.14868
|
paper: https://arxiv.org/pdf/2210.14868
|
||||||
configpath: opencompass/configs/datasets/humaneval_multi
|
configpath: opencompass/configs/datasets/humaneval_multi/humaneval_multi_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- humaneval_plus:
|
||||||
|
name: HumanEval+
|
||||||
|
category: Code
|
||||||
|
paper: https://arxiv.org/pdf/2305.01210
|
||||||
|
configpath: opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- humanevalx:
|
- humanevalx:
|
||||||
name: HumanEval-X
|
name: HumanEval-X
|
||||||
category: Code
|
category: Code
|
||||||
paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
|
paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
|
||||||
configpath: opencompass/configs/datasets/humanevalx
|
configpath: opencompass/configs/datasets/humanevalx/humanevalx_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- hungarian_math:
|
- hungarian_math:
|
||||||
name: Hungarian_Math
|
name: Hungarian_Math
|
||||||
category: Math
|
category: Math
|
||||||
paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam
|
paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam
|
||||||
configpath: opencompass/configs/datasets/hungarian_exam
|
configpath: opencompass/configs/datasets/hungarian_exam/hungarian_exam_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- iwslt2017:
|
- iwslt2017:
|
||||||
name: IWSLT2017
|
name: IWSLT2017
|
||||||
category: Language
|
category: Language
|
||||||
paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf
|
paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf
|
||||||
configpath: opencompass/configs/datasets/iwslt2017
|
configpath: opencompass/configs/datasets/iwslt2017/iwslt2017_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- jigsawmultilingual:
|
- jigsawmultilingual:
|
||||||
name: JigsawMultilingual
|
name: JigsawMultilingual
|
||||||
category: Safety
|
category: Safety
|
||||||
paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data
|
paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data
|
||||||
configpath: opencompass/configs/datasets/jigsawmultilingual
|
configpath: opencompass/configs/datasets/jigsawmultilingual/jigsawmultilingual_clp.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- lambada:
|
- lambada:
|
||||||
name: LAMBADA
|
name: LAMBADA
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://arxiv.org/pdf/1606.06031
|
paper: https://arxiv.org/pdf/1606.06031
|
||||||
configpath: opencompass/configs/datasets/lambada
|
configpath: opencompass/configs/datasets/lambada/lambada_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- lcsts:
|
- lcsts:
|
||||||
name: LCSTS
|
name: LCSTS
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://aclanthology.org/D15-1229.pdf
|
paper: https://aclanthology.org/D15-1229.pdf
|
||||||
configpath: opencompass/configs/datasets/lcsts
|
configpath: opencompass/configs/datasets/lcsts/lcsts_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- livestembench:
|
- livestembench:
|
||||||
name: LiveStemBench
|
name: LiveStemBench
|
||||||
category: ''
|
category: ''
|
||||||
paper: ''
|
paper: ''
|
||||||
configpath: opencompass/configs/datasets/livestembench
|
configpath: opencompass/configs/datasets/livestembench/livestembench_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- llm_compression:
|
- llm_compression:
|
||||||
name: LLM Compression
|
name: LLM Compression
|
||||||
category: Bits Per Character (BPC)
|
category: Bits Per Character (BPC)
|
||||||
paper: https://arxiv.org/pdf/2404.09937
|
paper: https://arxiv.org/pdf/2404.09937
|
||||||
configpath: opencompass/configs/datasets/llm_compression
|
configpath: opencompass/configs/datasets/llm_compression/llm_compression.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- math:
|
- math:
|
||||||
name: MATH
|
name: MATH
|
||||||
@ -576,37 +615,37 @@
|
|||||||
name: MATH 401
|
name: MATH 401
|
||||||
category: Math
|
category: Math
|
||||||
paper: https://arxiv.org/pdf/2304.02015
|
paper: https://arxiv.org/pdf/2304.02015
|
||||||
configpath: opencompass/configs/datasets/math401
|
configpath: opencompass/configs/datasets/math401/math401_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- mathbench:
|
- mathbench:
|
||||||
name: MathBench
|
name: MathBench
|
||||||
category: Math
|
category: Math
|
||||||
paper: https://arxiv.org/pdf/2405.12209
|
paper: https://arxiv.org/pdf/2405.12209
|
||||||
configpath: opencompass/configs/datasets/mathbench
|
configpath: opencompass/configs/datasets/mathbench/mathbench_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- mbpp:
|
- mbpp:
|
||||||
name: MBPP
|
name: MBPP
|
||||||
category: Code
|
category: Code
|
||||||
paper: https://arxiv.org/pdf/2108.07732
|
paper: https://arxiv.org/pdf/2108.07732
|
||||||
configpath: opencompass/configs/datasets/mbpp
|
configpath: opencompass/configs/datasets/mbpp/mbpp_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- mbpp_cn:
|
- mbpp_cn:
|
||||||
name: MBPP-CN
|
name: MBPP-CN
|
||||||
category: Code
|
category: Code
|
||||||
paper: ''
|
paper: ''
|
||||||
configpath: opencompass/configs/datasets/mbpp_cn
|
configpath: opencompass/configs/datasets/mbpp_cn/mbpp_cn_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- mbpp_plus:
|
- mbpp_plus:
|
||||||
name: MBPP-PLUS
|
name: MBPP-PLUS
|
||||||
category: Code
|
category: Code
|
||||||
paper: ''
|
paper: ''
|
||||||
configpath: opencompass/configs/datasets/mbpp_plus
|
configpath: opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- mgsm:
|
- mgsm:
|
||||||
name: MGSM
|
name: MGSM
|
||||||
category: Language / Math
|
category: Language / Math
|
||||||
paper: https://arxiv.org/pdf/2210.03057
|
paper: https://arxiv.org/pdf/2210.03057
|
||||||
configpath: opencompass/configs/datasets/mgsm
|
configpath: opencompass/configs/datasets/mgsm/mgsm_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- mmlu:
|
- mmlu:
|
||||||
name: MMLU
|
name: MMLU
|
||||||
@ -618,7 +657,7 @@
|
|||||||
name: MMLU-CF
|
name: MMLU-CF
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://arxiv.org/pdf/2412.15194
|
paper: https://arxiv.org/pdf/2412.15194
|
||||||
configpath: opencompass/configs/datasets/mmlu_cf
|
configpath: opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- mmlu_pro:
|
- mmlu_pro:
|
||||||
name: MMLU-Pro
|
name: MMLU-Pro
|
||||||
@ -630,91 +669,99 @@
|
|||||||
name: MMMLU
|
name: MMMLU
|
||||||
category: Language / Understanding
|
category: Language / Understanding
|
||||||
paper: https://huggingface.co/datasets/openai/MMMLU
|
paper: https://huggingface.co/datasets/openai/MMMLU
|
||||||
configpath: opencompass/configs/datasets/mmmlu
|
configpath:
|
||||||
|
- opencompass/configs/datasets/mmmlu/mmmlu_gen.py
|
||||||
|
- opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- multirc:
|
- multirc:
|
||||||
name: SuperGLUE / MultiRC
|
name: SuperGLUE / MultiRC
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
configpath: opencompass/configs/datasets/SuperGLUE_MultiRC
|
configpath: opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- narrativeqa:
|
- narrativeqa:
|
||||||
name: NarrativeQA
|
name: NarrativeQA
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://github.com/google-deepmind/narrativeqa
|
paper: https://github.com/google-deepmind/narrativeqa
|
||||||
configpath: opencompass/configs/datasets/narrativeqa
|
configpath: opencompass/configs/datasets/narrativeqa/narrativeqa_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- natural_question:
|
- natural_question:
|
||||||
name: NaturalQuestions
|
name: NaturalQuestions
|
||||||
category: Knowledge
|
category: Knowledge
|
||||||
paper: https://github.com/google-research-datasets/natural-questions
|
paper: https://github.com/google-research-datasets/natural-questions
|
||||||
configpath: opencompass/configs/datasets/nq
|
configpath: opencompass/configs/datasets/nq/nq_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- natural_question_cn:
|
- natural_question_cn:
|
||||||
name: NaturalQuestions-CN
|
name: NaturalQuestions-CN
|
||||||
category: Knowledge
|
category: Knowledge
|
||||||
paper: ''
|
paper: ''
|
||||||
configpath: opencompass/configs/datasets/nq_cn
|
configpath: opencompass/configs/datasets/nq_cn/nqcn_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- obqa:
|
- obqa:
|
||||||
name: OpenBookQA
|
name: OpenBookQA
|
||||||
category: Knowledge
|
category: Knowledge
|
||||||
paper: https://arxiv.org/pdf/1809.02789v1
|
paper: https://arxiv.org/pdf/1809.02789v1
|
||||||
configpath: opencompass/configs/datasets/obqa
|
configpath: opencompass/configs/datasets/obqa/obqa_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- piqa:
|
- piqa:
|
||||||
name: PIQA
|
name: PIQA
|
||||||
category: Knowledge / Physics
|
category: Knowledge / Physics
|
||||||
paper: https://arxiv.org/pdf/1911.11641v1
|
paper: https://arxiv.org/pdf/1911.11641v1
|
||||||
configpath: opencompass/configs/datasets/piqa
|
configpath: opencompass/configs/datasets/piqa/piqa_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- py150:
|
- py150:
|
||||||
name: py150
|
name: py150
|
||||||
category: Code
|
category: Code
|
||||||
paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line
|
paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line
|
||||||
configpath: opencompass/configs/datasets/py150
|
configpath: opencompass/configs/datasets/py150/py150_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- qasper:
|
- qasper:
|
||||||
name: Qasper
|
name: Qasper
|
||||||
category: Long Context
|
category: Long Context
|
||||||
paper: https://arxiv.org/pdf/2105.03011
|
paper: https://arxiv.org/pdf/2105.03011
|
||||||
configpath: opencompass/configs/datasets/qasper
|
configpath: opencompass/configs/datasets/qasper/qasper_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- qaspercut:
|
- qaspercut:
|
||||||
name: Qasper-Cut
|
name: Qasper-Cut
|
||||||
category: Long Context
|
category: Long Context
|
||||||
paper: ''
|
paper: ''
|
||||||
configpath: opencompass/configs/datasets/qaspercut
|
configpath: opencompass/configs/datasets/qaspercut/qaspercut_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- race:
|
- race:
|
||||||
name: RACE
|
name: RACE
|
||||||
category: Examination
|
category: Examination
|
||||||
paper: https://arxiv.org/pdf/1704.04683
|
paper: https://arxiv.org/pdf/1704.04683
|
||||||
configpath: opencompass/configs/datasets/race
|
configpath: opencompass/configs/datasets/race/race_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- realtoxicprompts:
|
- realtoxicprompts:
|
||||||
name: RealToxicPrompts
|
name: RealToxicPrompts
|
||||||
category: Safety
|
category: Safety
|
||||||
paper: https://arxiv.org/pdf/2009.11462
|
paper: https://arxiv.org/pdf/2009.11462
|
||||||
configpath: opencompass/configs/datasets/realtoxicprompts
|
configpath: opencompass/configs/datasets/realtoxicprompts/realtoxicprompts_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- record:
|
- record:
|
||||||
name: SuperGLUE / ReCoRD
|
name: SuperGLUE / ReCoRD
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD
|
configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- rte:
|
- rte:
|
||||||
name: SuperGLUE / RTE
|
name: SuperGLUE / RTE
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
configpath: opencompass/configs/datasets/SuperGLUE_RTE
|
configpath: opencompass/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- ocnli:
|
- ocnli:
|
||||||
name: CLUE / OCNLI
|
name: CLUE / OCNLI
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://arxiv.org/pdf/2004.05986
|
paper: https://arxiv.org/pdf/2004.05986
|
||||||
configpath: opencompass/configs/datasets/CLUE_ocnli
|
configpath: opencompass/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- ocnlifc:
|
||||||
|
name: FewCLUE / OCNLI-FC
|
||||||
|
category: Reasoning
|
||||||
|
paper: https://arxiv.org/pdf/2107.07498
|
||||||
|
configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- rolebench:
|
- rolebench:
|
||||||
name: RoleBench
|
name: RoleBench
|
||||||
@ -726,97 +773,97 @@
|
|||||||
name: S3Eval
|
name: S3Eval
|
||||||
category: Long Context
|
category: Long Context
|
||||||
paper: https://aclanthology.org/2024.naacl-long.69.pdf
|
paper: https://aclanthology.org/2024.naacl-long.69.pdf
|
||||||
configpath: opencompass/configs/datasets/s3eval
|
configpath: opencompass/configs/datasets/s3eval/s3eval_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- scibench:
|
- scibench:
|
||||||
name: SciBench
|
name: SciBench
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf
|
paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf
|
||||||
configpath: opencompass/configs/datasets/scibench
|
configpath: opencompass/configs/datasets/scibench/scibench_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- scicode:
|
- scicode:
|
||||||
name: SciCode
|
name: SciCode
|
||||||
category: Code
|
category: Code
|
||||||
paper: https://arxiv.org/pdf/2407.13168
|
paper: https://arxiv.org/pdf/2407.13168
|
||||||
configpath: opencompass/configs/datasets/scicode
|
configpath: opencompass/configs/datasets/scicode/scicode_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- simpleqa:
|
- simpleqa:
|
||||||
name: SimpleQA
|
name: SimpleQA
|
||||||
category: Knowledge
|
category: Knowledge
|
||||||
paper: https://arxiv.org/pdf/2411.04368
|
paper: https://arxiv.org/pdf/2411.04368
|
||||||
configpath: opencompass/configs/datasets/SimpleQA
|
configpath: opencompass/configs/datasets/SimpleQA/simpleqa_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- siqa:
|
- siqa:
|
||||||
name: SocialIQA
|
name: SocialIQA
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://arxiv.org/pdf/1904.09728
|
paper: https://arxiv.org/pdf/1904.09728
|
||||||
configpath: opencompass/configs/datasets/siqa
|
configpath: opencompass/configs/datasets/siqa/siqa_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- squad20:
|
- squad20:
|
||||||
name: SQuAD2.0
|
name: SQuAD2.0
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://arxiv.org/pdf/1806.03822
|
paper: https://arxiv.org/pdf/1806.03822
|
||||||
configpath: opencompass/configs/datasets/squad20
|
configpath: opencompass/configs/datasets/squad20/squad20_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- storycloze:
|
- storycloze:
|
||||||
name: StoryCloze
|
name: StoryCloze
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://aclanthology.org/2022.emnlp-main.616.pdf
|
paper: https://aclanthology.org/2022.emnlp-main.616.pdf
|
||||||
configpath: opencompass/configs/datasets/storycloze
|
configpath: opencompass/configs/datasets/storycloze/storycloze_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- strategyqa:
|
- strategyqa:
|
||||||
name: StrategyQA
|
name: StrategyQA
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://arxiv.org/pdf/2101.02235
|
paper: https://arxiv.org/pdf/2101.02235
|
||||||
configpath: opencompass/configs/datasets/strategyqa
|
configpath: opencompass/configs/datasets/strategyqa/strategyqa_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- summedits:
|
- summedits:
|
||||||
name: SummEdits
|
name: SummEdits
|
||||||
category: Language
|
category: Language
|
||||||
paper: https://aclanthology.org/2023.emnlp-main.600.pdf
|
paper: https://aclanthology.org/2023.emnlp-main.600.pdf
|
||||||
configpath: opencompass/configs/datasets/summedits
|
configpath: opencompass/configs/datasets/summedits/summedits_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- summscreen:
|
- summscreen:
|
||||||
name: SummScreen
|
name: SummScreen
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://arxiv.org/pdf/2104.07091v1
|
paper: https://arxiv.org/pdf/2104.07091v1
|
||||||
configpath: opencompass/configs/datasets/summscreen
|
configpath: opencompass/configs/datasets/summscreen/summscreen_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- svamp:
|
- svamp:
|
||||||
name: SVAMP
|
name: SVAMP
|
||||||
category: Math
|
category: Math
|
||||||
paper: https://aclanthology.org/2021.naacl-main.168.pdf
|
paper: https://aclanthology.org/2021.naacl-main.168.pdf
|
||||||
configpath: opencompass/configs/datasets/SVAMP
|
configpath: opencompass/configs/datasets/SVAMP/svamp_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- tabmwp:
|
- tabmwp:
|
||||||
name: TabMWP
|
name: TabMWP
|
||||||
category: Math / Table
|
category: Math / Table
|
||||||
paper: https://arxiv.org/pdf/2209.14610
|
paper: https://arxiv.org/pdf/2209.14610
|
||||||
configpath: opencompass/configs/datasets/TabMWP
|
configpath: opencompass/configs/datasets/TabMWP/TabMWP_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- taco:
|
- taco:
|
||||||
name: TACO
|
name: TACO
|
||||||
category: Code
|
category: Code
|
||||||
paper: https://arxiv.org/pdf/2312.14852
|
paper: https://arxiv.org/pdf/2312.14852
|
||||||
configpath: opencompass/configs/datasets/taco
|
configpath: opencompass/configs/datasets/taco/taco_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- tnews:
|
- tnews:
|
||||||
name: FewCLUE / TNEWS
|
name: FewCLUE / TNEWS
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://arxiv.org/pdf/2107.07498
|
paper: https://arxiv.org/pdf/2107.07498
|
||||||
configpath: opencompass/configs/datasets/FewCLUE_tnews
|
configpath: opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- bustm:
|
- bustm:
|
||||||
name: FewCLUE / BUSTM
|
name: FewCLUE / BUSTM
|
||||||
category: Reasoning
|
category: Reasoning
|
||||||
paper: https://arxiv.org/pdf/2107.07498
|
paper: https://arxiv.org/pdf/2107.07498
|
||||||
configpath: opencompass/configs/datasets/FewCLUE_bustm
|
configpath: opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- csl:
|
- csl:
|
||||||
name: FewCLUE / CSL
|
name: FewCLUE / CSL
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://arxiv.org/pdf/2107.07498
|
paper: https://arxiv.org/pdf/2107.07498
|
||||||
configpath: opencompass/configs/datasets/FewCLUE_csl
|
configpath: opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- ocnli_fc:
|
- ocnli_fc:
|
||||||
name: FewCLUE / OCNLI-FC
|
name: FewCLUE / OCNLI-FC
|
||||||
@ -828,65 +875,95 @@
|
|||||||
name: TriviaQA
|
name: TriviaQA
|
||||||
category: Knowledge
|
category: Knowledge
|
||||||
paper: https://arxiv.org/pdf/1705.03551v2
|
paper: https://arxiv.org/pdf/1705.03551v2
|
||||||
configpath: opencompass/configs/datasets/triviaqa
|
configpath: opencompass/configs/datasets/triviaqa/triviaqa_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- triviaqarc:
|
- triviaqarc:
|
||||||
name: TriviaQA-RC
|
name: TriviaQA-RC
|
||||||
category: Knowledge / Understanding
|
category: Knowledge / Understanding
|
||||||
paper: ''
|
paper: ''
|
||||||
configpath: opencompass/configs/datasets/triviaqarc
|
configpath: opencompass/configs/datasets/triviaqarc/triviaqarc_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- truthfulqa:
|
- truthfulqa:
|
||||||
name: TruthfulQA
|
name: TruthfulQA
|
||||||
category: Safety
|
category: Safety
|
||||||
paper: https://arxiv.org/pdf/2109.07958v2
|
paper: https://arxiv.org/pdf/2109.07958v2
|
||||||
configpath: opencompass/configs/datasets/truthfulqa
|
configpath: opencompass/configs/datasets/truthfulqa/truthfulqa_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- tydiqa:
|
- tydiqa:
|
||||||
name: TyDi-QA
|
name: TyDi-QA
|
||||||
category: Language
|
category: Language
|
||||||
paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf
|
paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf
|
||||||
configpath: opencompass/configs/datasets/tydiqa
|
configpath: opencompass/configs/datasets/tydiqa/tydiqa_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- wic:
|
- wic:
|
||||||
name: SuperGLUE / WiC
|
name: SuperGLUE / WiC
|
||||||
category: Language
|
category: Language
|
||||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
configpath: opencompass/configs/datasets/SuperGLUE_WiC
|
configpath: opencompass/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- wsc:
|
- wsc:
|
||||||
name: SuperGLUE / WSC
|
name: SuperGLUE / WSC
|
||||||
category: Language / WSC
|
category: Language / WSC
|
||||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||||
configpath: opencompass/configs/datasets/SuperGLUE_WSC
|
configpath: opencompass/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- winogrande:
|
- winogrande:
|
||||||
name: WinoGrande
|
name: WinoGrande
|
||||||
category: Language / WSC
|
category: Language / WSC
|
||||||
paper: https://arxiv.org/pdf/1907.10641v2
|
paper: https://arxiv.org/pdf/1907.10641v2
|
||||||
configpath: opencompass/configs/datasets/winogrande
|
configpath: opencompass/configs/datasets/winogrande/winogrande_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- xcopa:
|
- xcopa:
|
||||||
name: XCOPA
|
name: XCOPA
|
||||||
category: Language
|
category: Language
|
||||||
paper: https://arxiv.org/pdf/2005.00333
|
paper: https://arxiv.org/pdf/2005.00333
|
||||||
configpath: opencompass/configs/datasets/XCOPA
|
configpath: opencompass/configs/datasets/XCOPA/XCOPA_ppl.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- xiezhi:
|
- xiezhi:
|
||||||
name: Xiezhi
|
name: Xiezhi
|
||||||
category: Knowledge
|
category: Knowledge
|
||||||
paper: https://arxiv.org/pdf/2306.05783
|
paper: https://arxiv.org/pdf/2306.05783
|
||||||
configpath: opencompass/configs/datasets/xiezhi
|
configpath: opencompass/configs/datasets/xiezhi/xiezhi_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- xlsum:
|
- xlsum:
|
||||||
name: XLSum
|
name: XLSum
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://arxiv.org/pdf/2106.13822v1
|
paper: https://arxiv.org/pdf/2106.13822v1
|
||||||
configpath: opencompass/configs/datasets/XLSum
|
configpath: opencompass/configs/datasets/XLSum/XLSum_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
- xsum:
|
- xsum:
|
||||||
name: Xsum
|
name: Xsum
|
||||||
category: Understanding
|
category: Understanding
|
||||||
paper: https://arxiv.org/pdf/1808.08745
|
paper: https://arxiv.org/pdf/1808.08745
|
||||||
configpath: opencompass/configs/datasets/Xsum
|
configpath: opencompass/configs/datasets/Xsum/Xsum_gen.py
|
||||||
configpath_llmjudge: ''
|
configpath_llmjudge: ''
|
||||||
|
- cola:
|
||||||
|
name: GLUE / CoLA
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/1804.07461
|
||||||
|
configpath: opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- mprc:
|
||||||
|
name: GLUE / MRPC
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/1804.07461
|
||||||
|
configpath: opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- qqp:
|
||||||
|
name: GLUE / QQP
|
||||||
|
category: Understanding
|
||||||
|
paper: https://arxiv.org/pdf/1804.07461
|
||||||
|
configpath: opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- omni_math:
|
||||||
|
name: Omni-MATH
|
||||||
|
category: Math
|
||||||
|
paper: https://omni-math.github.io/
|
||||||
|
configpath: opencompass/configs/datasets/omni_math/omni_math_gen.py
|
||||||
|
configpath_llmjudge: ''
|
||||||
|
- wikibench:
|
||||||
|
name: WikiBench
|
||||||
|
category: Knowledge
|
||||||
|
paper: ''
|
||||||
|
configpath: opencompass/configs/datasets/wikibench/wikibench_gen.py
|
||||||
|
configpath_llmjudge: ''
|
@ -14,6 +14,12 @@ On this page, we have listed all the datasets supported by OpenCompass.
|
|||||||
|
|
||||||
You can use sorting and search functions to find the dataset you need.
|
You can use sorting and search functions to find the dataset you need.
|
||||||
|
|
||||||
|
We provide recommended running configurations for each dataset,
|
||||||
|
and in some datasets also offer recommended configurations based on LLM Judge.
|
||||||
|
|
||||||
|
You can quickly start evaluation tasks based on the recommended configurations.
|
||||||
|
However, please note that these configurations may be updated over time.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
with open('dataset_statistics.md', 'w') as f:
|
with open('dataset_statistics.md', 'w') as f:
|
||||||
@ -24,7 +30,7 @@ load_path = str(OC_ROOT / 'dataset-index.yml')
|
|||||||
with open(load_path, 'r') as f2:
|
with open(load_path, 'r') as f2:
|
||||||
data_list = yaml.load(f2, Loader=yaml.FullLoader)
|
data_list = yaml.load(f2, Loader=yaml.FullLoader)
|
||||||
|
|
||||||
HEADER = ['name', 'category', 'paper', 'configpath']
|
HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
|
||||||
|
|
||||||
|
|
||||||
def table_format(data_list):
|
def table_format(data_list):
|
||||||
@ -35,6 +41,13 @@ def table_format(data_list):
|
|||||||
for index in HEADER:
|
for index in HEADER:
|
||||||
if index == 'paper':
|
if index == 'paper':
|
||||||
table_format_list_sub.append('[link](' + i[j][index] + ')')
|
table_format_list_sub.append('[link](' + i[j][index] + ')')
|
||||||
|
elif index == 'configpath_llmjudge':
|
||||||
|
if i[j][index] == '':
|
||||||
|
table_format_list_sub.append(i[j][index])
|
||||||
|
else:
|
||||||
|
table_format_list_sub.append('[link](' +
|
||||||
|
GITHUB_PREFIX +
|
||||||
|
i[j][index] + ')')
|
||||||
elif index == 'configpath':
|
elif index == 'configpath':
|
||||||
if isinstance(i[j][index], list):
|
if isinstance(i[j][index], list):
|
||||||
sub_list_text = ''
|
sub_list_text = ''
|
||||||
@ -61,7 +74,10 @@ def generate_table(data_list, title=None):
|
|||||||
if title is not None:
|
if title is not None:
|
||||||
f.write(f'\n{title}')
|
f.write(f'\n{title}')
|
||||||
f.write("""\n```{table}\n:class: dataset\n""")
|
f.write("""\n```{table}\n:class: dataset\n""")
|
||||||
header = ['Name', 'Category', 'Paper or Repository', 'Config File']
|
header = [
|
||||||
|
'Name', 'Category', 'Paper or Repository', 'Recommended Config',
|
||||||
|
'Recommended Config (LLM Judge)'
|
||||||
|
]
|
||||||
table_cfg = dict(tablefmt='pipe',
|
table_cfg = dict(tablefmt='pipe',
|
||||||
floatfmt='.2f',
|
floatfmt='.2f',
|
||||||
numalign='right',
|
numalign='right',
|
||||||
|
@ -14,6 +14,10 @@ DATASETZOO_TEMPLATE = """\
|
|||||||
|
|
||||||
你可以使用排序和搜索功能找到需要的数据集。
|
你可以使用排序和搜索功能找到需要的数据集。
|
||||||
|
|
||||||
|
我们对每一个数据集都给出了推荐的运行配置,部分数据集中还提供了基于LLM Judge的推荐配置。
|
||||||
|
|
||||||
|
你可以基于推荐配置快速启动评测。但请注意,推荐配置可能随时间推移被更新。
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
with open('dataset_statistics.md', 'w') as f:
|
with open('dataset_statistics.md', 'w') as f:
|
||||||
@ -35,7 +39,13 @@ def table_format(data_list):
|
|||||||
for index in HEADER:
|
for index in HEADER:
|
||||||
if index == 'paper':
|
if index == 'paper':
|
||||||
table_format_list_sub.append('[链接](' + i[j][index] + ')')
|
table_format_list_sub.append('[链接](' + i[j][index] + ')')
|
||||||
elif index != 'name' and index != 'category':
|
elif index == 'configpath_llmjudge':
|
||||||
|
if i[j][index] == '':
|
||||||
|
table_format_list_sub.append(i[j][index])
|
||||||
|
else:
|
||||||
|
table_format_list_sub.append('[链接](' + GITHUB_PREFIX +
|
||||||
|
i[j][index] + ')')
|
||||||
|
elif index == 'configpath':
|
||||||
if isinstance(i[j][index], list):
|
if isinstance(i[j][index], list):
|
||||||
sub_list_text = ''
|
sub_list_text = ''
|
||||||
for k in i[j][index]:
|
for k in i[j][index]:
|
||||||
@ -60,7 +70,7 @@ def generate_table(data_list, title=None):
|
|||||||
if title is not None:
|
if title is not None:
|
||||||
f.write(f'\n{title}')
|
f.write(f'\n{title}')
|
||||||
f.write("""\n```{table}\n:class: dataset\n""")
|
f.write("""\n```{table}\n:class: dataset\n""")
|
||||||
header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置(基于规则评估)', '推荐配置(基于LLM评估)']
|
header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置', '推荐配置(基于LLM评估)']
|
||||||
table_cfg = dict(tablefmt='pipe',
|
table_cfg = dict(tablefmt='pipe',
|
||||||
floatfmt='.2f',
|
floatfmt='.2f',
|
||||||
numalign='right',
|
numalign='right',
|
||||||
|
@ -78,7 +78,6 @@ def generic_llmjudge_postprocess(
|
|||||||
f'No gold answer for {k}, use empty string as reference!')
|
f'No gold answer for {k}, use empty string as reference!')
|
||||||
references.append('')
|
references.append('')
|
||||||
results = get_final_results(judged_answers, references, origial_responses)
|
results = get_final_results(judged_answers, references, origial_responses)
|
||||||
|
|
||||||
results['details'] = output
|
results['details'] = output
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user