This commit is contained in:
Myhs-phz 2025-03-19 03:37:23 +00:00
parent 716c02785c
commit ffe00a830d
4 changed files with 227 additions and 125 deletions

View File

@ -8,25 +8,25 @@
name: NPHardEval
category: Reasoning
paper: https://arxiv.org/pdf/2312.14890v2
configpath: opencompass/configs/datasets/NPHardEval
configpath: opencompass/configs/datasets/NPHardEval/NPHardEval_gen.py
configpath_llmjudge: ''
- pmmeval:
name: PMMEval
category: Language
paper: https://arxiv.org/pdf/2411.09116v1
configpath: opencompass/configs/datasets/PMMEval
configpath: opencompass/configs/datasets/PMMEval/pmmeval_gen.py
configpath_llmjudge: ''
- theoremqa:
name: TheoremQA
category: Reasoning
paper: https://arxiv.org/pdf/2305.12524
configpath: opencompass/configs/datasets/TheroremQA
configpath: opencompass/configs/datasets/TheoremQA/TheoremQA_gen.py
configpath_llmjudge: ''
- agieval:
name: AGIEval
category: Examination
paper: https://arxiv.org/pdf/2304.06364
configpath: opencompass/configs/datasets/agieval
configpath: opencompass/configs/datasets/agieval/agieval_gen.py
configpath_llmjudge: ''
- babilong:
name: BABILong
@ -44,13 +44,13 @@
name: CaLM
category: Reasoning
paper: https://arxiv.org/pdf/2405.00622
configpath: opencompass/configs/datasets/calm
configpath: opencompass/configs/datasets/calm/calm.py
configpath_llmjudge: ''
- infinitebench:
name: InfiniteBench (∞Bench)
category: Long Context
paper: https://aclanthology.org/2024.acl-long.814.pdf
configpath: opencompass/configs/datasets/infinitebench
configpath: opencompass/configs/datasets/infinitebench/infinitebench.py
configpath_llmjudge: ''
- korbench:
name: KOR-Bench
@ -62,13 +62,15 @@
name: LawBench
category: Knowledge / Law
paper: https://arxiv.org/pdf/2309.16289
configpath: opencompass/configs/datasets/lawbench
configpath:
- opencompass/configs/datasets/lawbench/lawbench_zero_shot_gen_002588.py
- opencompass/configs/datasets/lawbench/lawbench_one_shot_gen_002588.py
configpath_llmjudge: ''
- leval:
name: L-Eval
category: Long Context
paper: https://arxiv.org/pdf/2307.11088v1
configpath: opencompass/configs/datasets/leval
configpath: opencompass/configs/datasets/leval/leval.py
configpath_llmjudge: ''
- livecodebench:
name: LiveCodeBench
@ -80,25 +82,39 @@
name: LiveMathBench
category: Math
paper: https://arxiv.org/pdf/2412.13147
configpath: opencompass/configs/datasets/livemathbench
configpath: opencompass/configs/datasets/livemathbench/livemathbench_gen.py
configpath_llmjudge: ''
- livereasonbench:
name: LiveReasonBench
category: Reasoning
paper: ''
configpath: opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py
configpath_llmjudge: ''
- longbench:
name: LongBench
category: Long Context
paper: https://github.com/THUDM/LongBench
configpath: opencompass/configs/datasets/livemathbench
configpath:
- opencompass/configs/datasets/longbench/longbench.py
- opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py
configpath_llmjudge: ''
- lveval:
name: LV-Eval
category: Long Context
paper: https://arxiv.org/pdf/2402.05136
configpath: opencompass/configs/datasets/lveval
configpath: opencompass/configs/datasets/lveval/lveval.py
configpath_llmjudge: ''
- mastermath2024v1:
name: Mastermath2024v1
category: Math
paper: ''
configpath: opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py
configpath_llmjudge: ''
- medbench:
name: MedBench
category: Knowledge / Medicine
paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
configpath: opencompass/configs/datasets/MedBench
configpath: opencompass/configs/datasets/MedBench/medbench_gen.py
configpath_llmjudge: ''
- musr:
name: MuSR
@ -140,7 +156,7 @@
name: FLAMES
category: Subjective / Alignment
paper: https://arxiv.org/pdf/2311.06899
configpath: opencompass/configs/datasets/subjective/flames
configpath: opencompass/configs/datasets/subjective/flames/flames_gen.py
configpath_llmjudge: ''
- fofo:
name: FOFO
@ -182,55 +198,63 @@
name: T-Eval
category: Tool Utilization
paper: https://arxiv.org/pdf/2312.14033
configpath: opencompass/configs/datasets/teval
configpath:
- opencompass/configs/datasets/teval/teval_en_gen.py
- opencompass/configs/datasets/teval/teval_zh_gen.py
configpath_llmjudge: ''
- financeiq:
name: FinanceIQ
category: Knowledge / Finance
paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ
configpath: opencompass/configs/datasets/FinanceIQ
configpath: opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen.py
configpath_llmjudge: ''
- gaokaobench:
name: GAOKAOBench
category: Examination
paper: https://arxiv.org/pdf/2305.12474
configpath: opencompass/configs/datasets/GaokaoBench
configpath: opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py
configpath_llmjudge: ''
- lcbench:
name: LCBench
category: Code
paper: https://github.com/open-compass/CodeBench/
configpath: opencompass/configs/datasets/LCBench
configpath: opencompass/configs/datasets/LCBench/lcbench_gen.py
configpath_llmjudge: ''
- MMLUArabic:
name: ArabicMMLU
category: Language
paper: https://arxiv.org/pdf/2402.12840
configpath: opencompass/configs/datasets/MMLUArabic
configpath: opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py
configpath_llmjudge: ''
- OpenFinData:
name: OpenFinData
category: Knowledge / Finance
paper: https://github.com/open-compass/OpenFinData
configpath: opencompass/configs/datasets/OpenFinData
configpath: opencompass/configs/datasets/OpenFinData/OpenFinData_gen.py
configpath_llmjudge: ''
- QuALITY:
name: QuALITY
category: Long Context
paper: https://arxiv.org/pdf/2112.08608
configpath: opencompass/configs/datasets/QuALITY
configpath: opencompass/configs/datasets/QuALITY/QuALITY_gen.py
configpath_llmjudge: ''
- advglue:
name: Adversarial GLUE
category: Safety
paper: https://openreview.net/pdf?id=GF9cSKI3A_q
configpath: opencompass/configs/datasets/adv_glue
configpath:
- opencompass/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_qnli/adv_glue_qnli_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_qqp/adv_glue_qqp_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_rte/adv_glue_rte_gen.py
- opencompass/configs/datasets/adv_glue/adv_glue_sst2/adv_glue_sst2_gen.py
configpath_llmjudge: ''
- afqmcd:
name: CLUE / AFQMC
category: Language
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_afqmc
configpath: opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen.py
configpath_llmjudge: ''
- aime2024:
name: AIME2024
@ -242,41 +266,46 @@
name: Adversarial NLI
category: Reasoning
paper: https://arxiv.org/pdf/1910.14599v2
configpath: opencompass/configs/datasets/anli
configpath: opencompass/configs/datasets/anli/anli_gen.py
configpath_llmjudge: ''
- anthropics_evals:
name: Anthropics Evals
category: Safety
paper: https://arxiv.org/pdf/2212.09251
configpath: opencompass/configs/datasets/anthropics_evals
configpath:
- opencompass/configs/datasets/anthropics_evals/airisk_gen.py
- opencompass/configs/datasets/anthropics_evals/persona_gen.py
- opencompass/configs/datasets/anthropics_evals/sycophancy_gen.py
configpath_llmjudge: ''
- apps:
name: APPS
category: Code
paper: https://arxiv.org/pdf/2105.09938
configpath: opencompass/configs/datasets/apps
configpath:
- opencompass/configs/datasets/apps/apps_gen.py
- opencompass/configs/datasets/apps/apps_mini_gen.py
configpath_llmjudge: ''
- arc:
name: ARC
category: Reasoning
paper: https://arxiv.org/pdf/1803.05457
configpath:
- opencompass/configs/datasets/ARC_c
- opencompass/configs/datasets/ARC_e
- opencompass/configs/datasets/ARC_c/ARC_c_gen.py
- opencompass/configs/datasets/ARC_e/ARC_e_gen.py
configpath_llmjudge: ''
- arc_prize_public_eval:
name: ARC Prize
category: ARC-AGI
paper: https://arcprize.org/guide#private
configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation
configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py
configpath_llmjudge: ''
- ax:
name: SuperGLUE / AX
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath:
- opencompass/configs/datasets/SuperGLUE_AX_b
- opencompass/configs/datasets/SuperGLUE_AX_g
- opencompass/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen.py
- opencompass/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen.py
configpath_llmjudge: ''
- bbh:
name: BIG-Bench Hard
@ -288,79 +317,82 @@
name: SuperGLUE / BoolQ
category: Knowledge
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_BoolQ
configpath: opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py
configpath_llmjudge: ''
- c3:
name: CLUE / C3 (C³)
category: Understanding
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_C3
configpath: opencompass/configs/datasets/CLUE_C3/CLUE_C3_gen.py
configpath_llmjudge: ''
- cb:
name: SuperGLUE / CB
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_CB
configpath: opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py
configpath_llmjudge: ''
- ceval:
name: C-EVAL
category: Examination
paper: https://arxiv.org/pdf/2305.08322v1
configpath: opencompass/configs/datasets/ceval
configpath: opencompass/configs/datasets/ceval/ceval_gen.py
configpath_llmjudge: ''
- charm:
name: CHARM
category: Reasoning
paper: https://arxiv.org/pdf/2403.14112
configpath: opencompass/configs/datasets/CHARM
configpath: opencompass/configs/datasets/CHARM/charm_reason_gen.py
configpath_llmjudge: ''
- chembench:
name: ChemBench
category: Knowledge / Chemistry
paper: https://arxiv.org/pdf/2404.01475
configpath: opencompass/configs/datasets/ChemBench
configpath: opencompass/configs/datasets/ChemBench/ChemBench_gen.py
configpath_llmjudge: ''
- chid:
name: FewCLUE / CHID
category: Language
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_chid
configpath: opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen.py
configpath_llmjudge: ''
- chinese_simpleqa:
name: Chinese SimpleQA
category: Knowledge
paper: https://arxiv.org/pdf/2411.07140
configpath: opencompass/configs/datasets/chinese_simpleqa
configpath: opencompass/configs/datasets/chinese_simpleqa/chinese_simpleqa_gen.py
configpath_llmjudge: ''
- cibench:
name: CIBench
category: Code
paper: https://www.arxiv.org/pdf/2407.10499
configpath: opencompass/configs/datasets/CIBench
configpath:
- opencompass/configs/datasets/CIBench/CIBench_generation_gen_8ab0dc.py
- opencompass/configs/datasets/CIBench/CIBench_template_gen_e6b12a.py
- opencompass/configs/datasets/CIBench/CIBench_template_oracle_gen_fecda1.py
configpath_llmjudge: ''
- civilcomments:
name: CivilComments
category: Safety
paper: https://arxiv.org/pdf/1903.04561
configpath: opencompass/configs/datasets/civilcomments
configpath: opencompass/configs/datasets/civilcomments/civilcomments_clp.py
configpath_llmjudge: ''
- clozeTest_maxmin:
name: Cloze Test-max/min
category: Code
paper: https://arxiv.org/pdf/2102.04664
configpath: opencompass/configs/datasets/clozeTest_maxmin
configpath: opencompass/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen.py
configpath_llmjudge: ''
- cluewsc:
name: FewCLUE / CLUEWSC
category: Language / WSC
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_cluewsc
configpath: opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen.py
configpath_llmjudge: ''
- cmb:
name: CMB
category: Knowledge / Medicine
paper: https://arxiv.org/pdf/2308.08833
configpath: opencompass/configs/datasets/cmb
configpath: opencompass/configs/datasets/cmb/cmb_gen.py
configpath_llmjudge: ''
- cmmlu:
name: CMMLU
@ -372,61 +404,61 @@
name: CLUE / CMNLI
category: Reasoning
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_cmnli
configpath: opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen.py
configpath_llmjudge: ''
- cmo_fib:
name: cmo_fib
category: Examination
paper: ''
configpath: opencompass/configs/datasets/cmo_fib
configpath: opencompass/configs/datasets/cmo_fib/cmo_fib_gen.py
configpath_llmjudge: ''
- cmrc:
name: CLUE / CMRC
category: Understanding
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_CMRC
configpath: opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen.py
configpath_llmjudge: ''
- commonsenseqa:
name: CommonSenseQA
category: Knowledge
paper: https://arxiv.org/pdf/1811.00937v2
configpath: opencompass/configs/datasets/commonsenseqa
configpath: opencompass/configs/datasets/commonsenseqa/commonsenseqa_gen.py
configpath_llmjudge: ''
- commonsenseqa_cn:
name: CommonSenseQA-CN
category: Knowledge
paper: ''
configpath: opencompass/configs/datasets/commonsenseqa_cn
configpath: opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen.py
configpath_llmjudge: ''
- copa:
name: SuperGLUE / COPA
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_COPA
configpath: opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen.py
configpath_llmjudge: ''
- crowspairs:
name: CrowsPairs
category: Safety
paper: https://arxiv.org/pdf/2010.00133
configpath: opencompass/configs/datasets/crowspairs
configpath: opencompass/configs/datasets/crowspairs/crowspairs_gen.py
configpath_llmjudge: ''
- crowspairs_cn:
name: CrowsPairs-CN
category: Safety
paper: ''
configpath: opencompass/configs/datasets/crowspairs_cn
configpath: opencompass/configs/datasets/crowspairs_cn/crowspairscn_gen.py
configpath_llmjudge: ''
- cvalues:
name: CVALUES
category: Safety
paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf
configpath: opencompass/configs/datasets/cvalues
configpath: opencompass/configs/datasets/cvalues/cvalues_responsibility_gen.py
configpath_llmjudge: ''
- drcd:
name: CLUE / DRCD
category: Understanding
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_DRCD
configpath: opencompass/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen.py
configpath_llmjudge: ''
- drop:
name: DROP (DROP Simple Eval)
@ -438,31 +470,32 @@
name: DS-1000
category: Code
paper: https://arxiv.org/pdf/2211.11501
configpath: opencompass/configs/datasets/ds1000
configpath:
- opencompass/configs/datasets/ds1000/ds1000_gen_5c4bec.py
configpath_llmjudge: ''
- eprstmt:
name: FewCLUE / EPRSTMT
category: Understanding
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_eprstmt
configpath: opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen.py
configpath_llmjudge: ''
- flores:
name: Flores
category: Language
paper: https://aclanthology.org/D19-1632.pdf
configpath: opencompass/configs/datasets/flores
configpath: opencompass/configs/datasets/flores/flores_gen.py
configpath_llmjudge: ''
- game24:
name: Game24
category: Math
paper: https://huggingface.co/datasets/nlile/24-game
configpath: opencompass/configs/datasets/game24
configpath: opencompass/configs/datasets/game24/game24_gen.py
configpath_llmjudge: ''
- govrepcrs:
name: Government Report Dataset
category: Long Context
paper: https://aclanthology.org/2021.naacl-main.112.pdf
configpath: opencompass/configs/datasets/govrepcrs
configpath: opencompass/configs/datasets/govrepcrs/govrepcrs_gen.py
configpath_llmjudge: ''
- gpqa:
name: GPQA
@ -474,19 +507,19 @@
name: GSM8K
category: Math
paper: https://arxiv.org/pdf/2110.14168v2
configpath: opencompass/configs/datasets/gsm8k
configpath: opencompass/configs/datasets/gsm8k/gsm8k_gen.py
configpath_llmjudge: ''
- gsm_hard:
name: GSM-Hard
category: Math
paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf
configpath: opencompass/configs/datasets/gsm_hard
configpath: opencompass/configs/datasets/gsm_hard/gsmhard_gen.py
configpath_llmjudge: ''
- hle:
name: HLE (Humanity's Last Exam)
category: Reasoning
paper: https://lastexam.ai/paper
configpath: opencompass/configs/datasets/HLE
configpath: opencompass/configs/datasets/HLE/hle_gen.py
configpath_llmjudge: ''
- hellaswag:
name: HellaSwag
@ -504,61 +537,67 @@
name: HumanEval-CN
category: Code
paper: ''
configpath: opencompass/configs/datasets/humaneval_cn
configpath: opencompass/configs/datasets/humaneval_cn/humaneval_cn_gen.py
configpath_llmjudge: ''
- humaneval_multi:
name: Multi-HumanEval
category: Code
paper: https://arxiv.org/pdf/2210.14868
configpath: opencompass/configs/datasets/humaneval_multi
configpath: opencompass/configs/datasets/humaneval_multi/humaneval_multi_gen.py
configpath_llmjudge: ''
- humaneval_plus:
name: HumanEval+
category: Code
paper: https://arxiv.org/pdf/2305.01210
configpath: opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen.py
configpath_llmjudge: ''
- humanevalx:
name: HumanEval-X
category: Code
paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
configpath: opencompass/configs/datasets/humanevalx
configpath: opencompass/configs/datasets/humanevalx/humanevalx_gen.py
configpath_llmjudge: ''
- hungarian_math:
name: Hungarian_Math
category: Math
paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam
configpath: opencompass/configs/datasets/hungarian_exam
configpath: opencompass/configs/datasets/hungarian_exam/hungarian_exam_gen.py
configpath_llmjudge: ''
- iwslt2017:
name: IWSLT2017
category: Language
paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf
configpath: opencompass/configs/datasets/iwslt2017
configpath: opencompass/configs/datasets/iwslt2017/iwslt2017_gen.py
configpath_llmjudge: ''
- jigsawmultilingual:
name: JigsawMultilingual
category: Safety
paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data
configpath: opencompass/configs/datasets/jigsawmultilingual
configpath: opencompass/configs/datasets/jigsawmultilingual/jigsawmultilingual_clp.py
configpath_llmjudge: ''
- lambada:
name: LAMBADA
category: Understanding
paper: https://arxiv.org/pdf/1606.06031
configpath: opencompass/configs/datasets/lambada
configpath: opencompass/configs/datasets/lambada/lambada_gen.py
configpath_llmjudge: ''
- lcsts:
name: LCSTS
category: Understanding
paper: https://aclanthology.org/D15-1229.pdf
configpath: opencompass/configs/datasets/lcsts
configpath: opencompass/configs/datasets/lcsts/lcsts_gen.py
configpath_llmjudge: ''
- livestembench:
name: LiveStemBench
category: ''
paper: ''
configpath: opencompass/configs/datasets/livestembench
configpath: opencompass/configs/datasets/livestembench/livestembench_gen.py
configpath_llmjudge: ''
- llm_compression:
name: LLM Compression
category: Bits Per Character (BPC)
paper: https://arxiv.org/pdf/2404.09937
configpath: opencompass/configs/datasets/llm_compression
configpath: opencompass/configs/datasets/llm_compression/llm_compression.py
configpath_llmjudge: ''
- math:
name: MATH
@ -576,37 +615,37 @@
name: MATH 401
category: Math
paper: https://arxiv.org/pdf/2304.02015
configpath: opencompass/configs/datasets/math401
configpath: opencompass/configs/datasets/math401/math401_gen.py
configpath_llmjudge: ''
- mathbench:
name: MathBench
category: Math
paper: https://arxiv.org/pdf/2405.12209
configpath: opencompass/configs/datasets/mathbench
configpath: opencompass/configs/datasets/mathbench/mathbench_gen.py
configpath_llmjudge: ''
- mbpp:
name: MBPP
category: Code
paper: https://arxiv.org/pdf/2108.07732
configpath: opencompass/configs/datasets/mbpp
configpath: opencompass/configs/datasets/mbpp/mbpp_gen.py
configpath_llmjudge: ''
- mbpp_cn:
name: MBPP-CN
category: Code
paper: ''
configpath: opencompass/configs/datasets/mbpp_cn
configpath: opencompass/configs/datasets/mbpp_cn/mbpp_cn_gen.py
configpath_llmjudge: ''
- mbpp_plus:
name: MBPP-PLUS
category: Code
paper: ''
configpath: opencompass/configs/datasets/mbpp_plus
configpath: opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py
configpath_llmjudge: ''
- mgsm:
name: MGSM
category: Language / Math
paper: https://arxiv.org/pdf/2210.03057
configpath: opencompass/configs/datasets/mgsm
configpath: opencompass/configs/datasets/mgsm/mgsm_gen.py
configpath_llmjudge: ''
- mmlu:
name: MMLU
@ -618,7 +657,7 @@
name: MMLU-CF
category: Understanding
paper: https://arxiv.org/pdf/2412.15194
configpath: opencompass/configs/datasets/mmlu_cf
configpath: opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py
configpath_llmjudge: ''
- mmlu_pro:
name: MMLU-Pro
@ -630,91 +669,99 @@
name: MMMLU
category: Language / Understanding
paper: https://huggingface.co/datasets/openai/MMMLU
configpath: opencompass/configs/datasets/mmmlu
configpath:
- opencompass/configs/datasets/mmmlu/mmmlu_gen.py
- opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen.py
configpath_llmjudge: ''
- multirc:
name: SuperGLUE / MultiRC
category: Understanding
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_MultiRC
configpath: opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen.py
configpath_llmjudge: ''
- narrativeqa:
name: NarrativeQA
category: Understanding
paper: https://github.com/google-deepmind/narrativeqa
configpath: opencompass/configs/datasets/narrativeqa
configpath: opencompass/configs/datasets/narrativeqa/narrativeqa_gen.py
configpath_llmjudge: ''
- natural_question:
name: NaturalQuestions
category: Knowledge
paper: https://github.com/google-research-datasets/natural-questions
configpath: opencompass/configs/datasets/nq
configpath: opencompass/configs/datasets/nq/nq_gen.py
configpath_llmjudge: ''
- natural_question_cn:
name: NaturalQuestions-CN
category: Knowledge
paper: ''
configpath: opencompass/configs/datasets/nq_cn
configpath: opencompass/configs/datasets/nq_cn/nqcn_gen.py
configpath_llmjudge: ''
- obqa:
name: OpenBookQA
category: Knowledge
paper: https://arxiv.org/pdf/1809.02789v1
configpath: opencompass/configs/datasets/obqa
configpath: opencompass/configs/datasets/obqa/obqa_gen.py
configpath_llmjudge: ''
- piqa:
name: PIQA
category: Knowledge / Physics
paper: https://arxiv.org/pdf/1911.11641v1
configpath: opencompass/configs/datasets/piqa
configpath: opencompass/configs/datasets/piqa/piqa_gen.py
configpath_llmjudge: ''
- py150:
name: py150
category: Code
paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line
configpath: opencompass/configs/datasets/py150
configpath: opencompass/configs/datasets/py150/py150_gen.py
configpath_llmjudge: ''
- qasper:
name: Qasper
category: Long Context
paper: https://arxiv.org/pdf/2105.03011
configpath: opencompass/configs/datasets/qasper
configpath: opencompass/configs/datasets/qasper/qasper_gen.py
configpath_llmjudge: ''
- qaspercut:
name: Qasper-Cut
category: Long Context
paper: ''
configpath: opencompass/configs/datasets/qaspercut
configpath: opencompass/configs/datasets/qaspercut/qaspercut_gen.py
configpath_llmjudge: ''
- race:
name: RACE
category: Examination
paper: https://arxiv.org/pdf/1704.04683
configpath: opencompass/configs/datasets/race
configpath: opencompass/configs/datasets/race/race_gen.py
configpath_llmjudge: ''
- realtoxicprompts:
name: RealToxicPrompts
category: Safety
paper: https://arxiv.org/pdf/2009.11462
configpath: opencompass/configs/datasets/realtoxicprompts
configpath: opencompass/configs/datasets/realtoxicprompts/realtoxicprompts_gen.py
configpath_llmjudge: ''
- record:
name: SuperGLUE / ReCoRD
category: Understanding
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD
configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen.py
configpath_llmjudge: ''
- rte:
name: SuperGLUE / RTE
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_RTE
configpath: opencompass/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen.py
configpath_llmjudge: ''
- ocnli:
name: CLUE / OCNLI
category: Reasoning
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_ocnli
configpath: opencompass/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen.py
configpath_llmjudge: ''
- ocnlifc:
name: FewCLUE / OCNLI-FC
category: Reasoning
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen.py
configpath_llmjudge: ''
- rolebench:
name: RoleBench
@ -726,97 +773,97 @@
name: S3Eval
category: Long Context
paper: https://aclanthology.org/2024.naacl-long.69.pdf
configpath: opencompass/configs/datasets/s3eval
configpath: opencompass/configs/datasets/s3eval/s3eval_gen.py
configpath_llmjudge: ''
- scibench:
name: SciBench
category: Reasoning
paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf
configpath: opencompass/configs/datasets/scibench
configpath: opencompass/configs/datasets/scibench/scibench_gen.py
configpath_llmjudge: ''
- scicode:
name: SciCode
category: Code
paper: https://arxiv.org/pdf/2407.13168
configpath: opencompass/configs/datasets/scicode
configpath: opencompass/configs/datasets/scicode/scicode_gen.py
configpath_llmjudge: ''
- simpleqa:
name: SimpleQA
category: Knowledge
paper: https://arxiv.org/pdf/2411.04368
configpath: opencompass/configs/datasets/SimpleQA
configpath: opencompass/configs/datasets/SimpleQA/simpleqa_gen.py
configpath_llmjudge: ''
- siqa:
name: SocialIQA
category: Reasoning
paper: https://arxiv.org/pdf/1904.09728
configpath: opencompass/configs/datasets/siqa
configpath: opencompass/configs/datasets/siqa/siqa_gen.py
configpath_llmjudge: ''
- squad20:
name: SQuAD2.0
category: Understanding
paper: https://arxiv.org/pdf/1806.03822
configpath: opencompass/configs/datasets/squad20
configpath: opencompass/configs/datasets/squad20/squad20_gen.py
configpath_llmjudge: ''
- storycloze:
name: StoryCloze
category: Reasoning
paper: https://aclanthology.org/2022.emnlp-main.616.pdf
configpath: opencompass/configs/datasets/storycloze
configpath: opencompass/configs/datasets/storycloze/storycloze_gen.py
configpath_llmjudge: ''
- strategyqa:
name: StrategyQA
category: Reasoning
paper: https://arxiv.org/pdf/2101.02235
configpath: opencompass/configs/datasets/strategyqa
configpath: opencompass/configs/datasets/strategyqa/strategyqa_gen.py
configpath_llmjudge: ''
- summedits:
name: SummEdits
category: Language
paper: https://aclanthology.org/2023.emnlp-main.600.pdf
configpath: opencompass/configs/datasets/summedits
configpath: opencompass/configs/datasets/summedits/summedits_gen.py
configpath_llmjudge: ''
- summscreen:
name: SummScreen
category: Understanding
paper: https://arxiv.org/pdf/2104.07091v1
configpath: opencompass/configs/datasets/summscreen
configpath: opencompass/configs/datasets/summscreen/summscreen_gen.py
configpath_llmjudge: ''
- svamp:
name: SVAMP
category: Math
paper: https://aclanthology.org/2021.naacl-main.168.pdf
configpath: opencompass/configs/datasets/SVAMP
configpath: opencompass/configs/datasets/SVAMP/svamp_gen.py
configpath_llmjudge: ''
- tabmwp:
name: TabMWP
category: Math / Table
paper: https://arxiv.org/pdf/2209.14610
configpath: opencompass/configs/datasets/TabMWP
configpath: opencompass/configs/datasets/TabMWP/TabMWP_gen.py
configpath_llmjudge: ''
- taco:
name: TACO
category: Code
paper: https://arxiv.org/pdf/2312.14852
configpath: opencompass/configs/datasets/taco
configpath: opencompass/configs/datasets/taco/taco_gen.py
configpath_llmjudge: ''
- tnews:
name: FewCLUE / TNEWS
category: Understanding
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_tnews
configpath: opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen.py
configpath_llmjudge: ''
- bustm:
name: FewCLUE / BUSTM
category: Reasoning
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_bustm
configpath: opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py
configpath_llmjudge: ''
- csl:
name: FewCLUE / CSL
category: Understanding
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_csl
configpath: opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py
configpath_llmjudge: ''
- ocnli_fc:
name: FewCLUE / OCNLI-FC
@ -828,65 +875,95 @@
name: TriviaQA
category: Knowledge
paper: https://arxiv.org/pdf/1705.03551v2
configpath: opencompass/configs/datasets/triviaqa
configpath: opencompass/configs/datasets/triviaqa/triviaqa_gen.py
configpath_llmjudge: ''
- triviaqarc:
name: TriviaQA-RC
category: Knowledge / Understanding
paper: ''
configpath: opencompass/configs/datasets/triviaqarc
configpath: opencompass/configs/datasets/triviaqarc/triviaqarc_gen.py
configpath_llmjudge: ''
- truthfulqa:
name: TruthfulQA
category: Safety
paper: https://arxiv.org/pdf/2109.07958v2
configpath: opencompass/configs/datasets/truthfulqa
configpath: opencompass/configs/datasets/truthfulqa/truthfulqa_gen.py
configpath_llmjudge: ''
- tydiqa:
name: TyDi-QA
category: Language
paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf
configpath: opencompass/configs/datasets/tydiqa
configpath: opencompass/configs/datasets/tydiqa/tydiqa_gen.py
configpath_llmjudge: ''
- wic:
name: SuperGLUE / WiC
category: Language
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_WiC
configpath: opencompass/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen.py
configpath_llmjudge: ''
- wsc:
name: SuperGLUE / WSC
category: Language / WSC
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_WSC
configpath: opencompass/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen.py
configpath_llmjudge: ''
- winogrande:
name: WinoGrande
category: Language / WSC
paper: https://arxiv.org/pdf/1907.10641v2
configpath: opencompass/configs/datasets/winogrande
configpath: opencompass/configs/datasets/winogrande/winogrande_gen.py
configpath_llmjudge: ''
- xcopa:
name: XCOPA
category: Language
paper: https://arxiv.org/pdf/2005.00333
configpath: opencompass/configs/datasets/XCOPA
configpath: opencompass/configs/datasets/XCOPA/XCOPA_ppl.py
configpath_llmjudge: ''
- xiezhi:
name: Xiezhi
category: Knowledge
paper: https://arxiv.org/pdf/2306.05783
configpath: opencompass/configs/datasets/xiezhi
configpath: opencompass/configs/datasets/xiezhi/xiezhi_gen.py
configpath_llmjudge: ''
- xlsum:
name: XLSum
category: Understanding
paper: https://arxiv.org/pdf/2106.13822v1
configpath: opencompass/configs/datasets/XLSum
configpath: opencompass/configs/datasets/XLSum/XLSum_gen.py
configpath_llmjudge: ''
- xsum:
name: Xsum
category: Understanding
paper: https://arxiv.org/pdf/1808.08745
configpath: opencompass/configs/datasets/Xsum
configpath: opencompass/configs/datasets/Xsum/Xsum_gen.py
configpath_llmjudge: ''
- cola:
name: GLUE / CoLA
category: Understanding
paper: https://arxiv.org/pdf/1804.07461
configpath: opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl.py
configpath_llmjudge: ''
- mrpc:
name: GLUE / MRPC
category: Understanding
paper: https://arxiv.org/pdf/1804.07461
configpath: opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl.py
configpath_llmjudge: ''
- qqp:
name: GLUE / QQP
category: Understanding
paper: https://arxiv.org/pdf/1804.07461
configpath: opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl.py
configpath_llmjudge: ''
- omni_math:
name: Omni-MATH
category: Math
paper: https://omni-math.github.io/
configpath: opencompass/configs/datasets/omni_math/omni_math_gen.py
configpath_llmjudge: ''
- wikibench:
name: WikiBench
category: Knowledge
paper: ''
configpath: opencompass/configs/datasets/wikibench/wikibench_gen.py
configpath_llmjudge: ''

View File

@ -14,6 +14,12 @@ On this page, we have listed all the datasets supported by OpenCompass.
You can use sorting and search functions to find the dataset you need.
We provide recommended running configurations for each dataset,
and in some datasets also offer recommended configurations based on LLM Judge.
You can quickly start evaluation tasks based on the recommended configurations.
However, please note that these configurations may be updated over time.
"""
with open('dataset_statistics.md', 'w') as f:
@ -24,7 +30,7 @@ load_path = str(OC_ROOT / 'dataset-index.yml')
with open(load_path, 'r') as f2:
data_list = yaml.load(f2, Loader=yaml.FullLoader)
HEADER = ['name', 'category', 'paper', 'configpath']
HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']
def table_format(data_list):
@ -35,6 +41,13 @@ def table_format(data_list):
for index in HEADER:
if index == 'paper':
table_format_list_sub.append('[link](' + i[j][index] + ')')
elif index == 'configpath_llmjudge':
if i[j][index] == '':
table_format_list_sub.append(i[j][index])
else:
table_format_list_sub.append('[link](' +
GITHUB_PREFIX +
i[j][index] + ')')
elif index == 'configpath':
if isinstance(i[j][index], list):
sub_list_text = ''
@ -61,7 +74,10 @@ def generate_table(data_list, title=None):
if title is not None:
f.write(f'\n{title}')
f.write("""\n```{table}\n:class: dataset\n""")
header = ['Name', 'Category', 'Paper or Repository', 'Config File']
header = [
'Name', 'Category', 'Paper or Repository', 'Recommended Config',
'Recommended Config (LLM Judge)'
]
table_cfg = dict(tablefmt='pipe',
floatfmt='.2f',
numalign='right',

View File

@ -14,6 +14,10 @@ DATASETZOO_TEMPLATE = """\
你可以使用排序和搜索功能找到需要的数据集。
我们对每一个数据集都给出了推荐的运行配置,部分数据集中还提供了基于LLM Judge的推荐配置。
你可以基于推荐配置快速启动评测。但请注意,推荐配置可能随时间推移被更新。
"""
with open('dataset_statistics.md', 'w') as f:
@ -35,7 +39,13 @@ def table_format(data_list):
for index in HEADER:
if index == 'paper':
table_format_list_sub.append('[链接](' + i[j][index] + ')')
elif index != 'name' and index != 'category':
elif index == 'configpath_llmjudge':
if i[j][index] == '':
table_format_list_sub.append(i[j][index])
else:
table_format_list_sub.append('[链接](' + GITHUB_PREFIX +
i[j][index] + ')')
elif index == 'configpath':
if isinstance(i[j][index], list):
sub_list_text = ''
for k in i[j][index]:
@ -60,7 +70,7 @@ def generate_table(data_list, title=None):
if title is not None:
f.write(f'\n{title}')
f.write("""\n```{table}\n:class: dataset\n""")
header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置(基于规则评估)', '推荐配置(基于LLM评估)']
header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置', '推荐配置(基于LLM评估)']
table_cfg = dict(tablefmt='pipe',
floatfmt='.2f',
numalign='right',

View File

@ -78,7 +78,6 @@ def generic_llmjudge_postprocess(
f'No gold answer for {k}, use empty string as reference!')
references.append('')
results = get_final_results(judged_answers, references, origial_responses)
results['details'] = output
return results