diff --git a/dataset-index.yml b/dataset-index.yml index ef21f4c3..3d916cd9 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -8,25 +8,25 @@ name: NPHardEval category: Reasoning paper: https://arxiv.org/pdf/2312.14890v2 - configpath: opencompass/configs/datasets/NPHardEval + configpath: opencompass/configs/datasets/NPHardEval/NPHardEval_gen.py configpath_llmjudge: '' - pmmeval: name: PMMEval category: Language paper: https://arxiv.org/pdf/2411.09116v1 - configpath: opencompass/configs/datasets/PMMEval + configpath: opencompass/configs/datasets/PMMEval/pmmeval_gen.py configpath_llmjudge: '' - theoremqa: name: TheroremQA category: Reasoning paper: https://arxiv.org/pdf/2305.12524 - configpath: opencompass/configs/datasets/TheroremQA + configpath: opencompass/configs/datasets/TheroremQA/TheoremQA_gen.py configpath_llmjudge: '' - agieval: name: AGIEval category: Examination paper: https://arxiv.org/pdf/2304.06364 - configpath: opencompass/configs/datasets/agieval + configpath: opencompass/configs/datasets/agieval/agieval_gen.py configpath_llmjudge: '' - babilong: name: BABILong @@ -44,13 +44,13 @@ name: CaLM category: Reasoning paper: https://arxiv.org/pdf/2405.00622 - configpath: opencompass/configs/datasets/calm + configpath: opencompass/configs/datasets/calm/calm.py configpath_llmjudge: '' - infinitebench: name: InfiniteBench (∞Bench) category: Long Context paper: https://aclanthology.org/2024.acl-long.814.pdf - configpath: opencompass/configs/datasets/infinitebench + configpath: opencompass/configs/datasets/infinitebench/infinitebench.py configpath_llmjudge: '' - korbench: name: KOR-Bench @@ -62,13 +62,15 @@ name: LawBench category: Knowledge / Law paper: https://arxiv.org/pdf/2309.16289 - configpath: opencompass/configs/datasets/lawbench + configpath: + - opencompass/configs/datasets/lawbench/lawbench_zero_shot_gen_002588.py + - opencompass/configs/datasets/lawbench/lawbench_one_shot_gen_002588.py configpath_llmjudge: '' - leval: name: L-Eval category: 
Long Context paper: https://arxiv.org/pdf/2307.11088v1 - configpath: opencompass/configs/datasets/leval + configpath: opencompass/configs/datasets/leval/leval.py configpath_llmjudge: '' - livecodebench: name: LiveCodeBench @@ -80,25 +82,39 @@ name: LiveMathBench category: Math paper: https://arxiv.org/pdf/2412.13147 - configpath: opencompass/configs/datasets/livemathbench + configpath: opencompass/configs/datasets/livemathbench/livemathbench_gen.py + configpath_llmjudge: '' +- livereasonbench: + name: LiveReasonBench + category: Reasoning + paper: '' + configpath: opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py configpath_llmjudge: '' - longbench: name: LongBench category: Long Context paper: https://github.com/THUDM/LongBench - configpath: opencompass/configs/datasets/livemathbench + configpath: + - opencompass/configs/datasets/longbench/longbench.py + - opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py configpath_llmjudge: '' - lveval: name: LV-Eval category: Long Context paper: https://arxiv.org/pdf/2402.05136 - configpath: opencompass/configs/datasets/lveval + configpath: opencompass/configs/datasets/lveval/lveval.py + configpath_llmjudge: '' +- mastermath2024v1: + name: Mastermath2024v1 + category: Math + paper: '' + configpath: opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py configpath_llmjudge: '' - medbench: name: MedBench category: Knowledge / Medicine paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138 - configpath: opencompass/configs/datasets/MedBench + configpath: opencompass/configs/datasets/MedBench/medbench_gen.py configpath_llmjudge: '' - musr: name: MuSR @@ -140,7 +156,7 @@ name: FLAMES category: Subjective / Alignment paper: https://arxiv.org/pdf/2311.06899 - configpath: opencompass/configs/datasets/subjective/flames + configpath: opencompass/configs/datasets/subjective/flames/flames_gen.py configpath_llmjudge: '' - fofo: name: FOFO @@ -182,55 +198,63 @@ name: T-Eval 
category: Tool Utilization paper: https://arxiv.org/pdf/2312.14033 - configpath: opencompass/configs/datasets/teval + configpath: + - opencompass/configs/datasets/teval/teval_en_gen.py + - opencompass/configs/datasets/teval/teval_zh_gen.py configpath_llmjudge: '' - finalceiq: name: FinanceIQ category: Knowledge / Finance paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ - configpath: opencompass/configs/datasets/FinanceIQ + configpath: opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen.py configpath_llmjudge: '' - gaokaobench: name: GAOKAOBench category: Examination paper: https://arxiv.org/pdf/2305.12474 - configpath: opencompass/configs/datasets/GaokaoBench + configpath: opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py configpath_llmjudge: '' - lcbench: name: LCBench category: Code paper: https://github.com/open-compass/CodeBench/ - configpath: opencompass/configs/datasets/LCBench + configpath: opencompass/configs/datasets/LCBench/lcbench_gen.py configpath_llmjudge: '' - MMLUArabic: name: ArabicMMLU category: Language paper: https://arxiv.org/pdf/2402.12840 - configpath: opencompass/configs/datasets/MMLUArabic + configpath: opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py configpath_llmjudge: '' - OpenFinData: name: OpenFinData category: Knowledge / Finance paper: https://github.com/open-compass/OpenFinData - configpath: opencompass/configs/datasets/OpenFinData + configpath: opencompass/configs/datasets/OpenFinData/OpenFinData_gen.py configpath_llmjudge: '' - QuALITY: name: QuALITY category: Long Context paper: https://arxiv.org/pdf/2112.08608 - configpath: opencompass/configs/datasets/QuALITY + configpath: opencompass/configs/datasets/QuALITY/QuALITY_gen.py configpath_llmjudge: '' - advglue: name: Adversarial GLUE category: Safety paper: https://openreview.net/pdf?id=GF9cSKI3A_q - configpath: opencompass/configs/datasets/adv_glue + configpath: + - opencompass/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen.py + - 
opencompass/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen.py + - opencompass/configs/datasets/adv_glue/adv_glue_qnli/adv_glue_qnli_gen.py + - opencompass/configs/datasets/adv_glue/adv_glue_qqp/adv_glue_qqp_gen.py + - opencompass/configs/datasets/adv_glue/adv_glue_rte/adv_glue_rte_gen.py + - opencompass/configs/datasets/adv_glue/adv_glue_sst2/adv_glue_sst2_gen.py configpath_llmjudge: '' - afqmcd: name: CLUE / AFQMC category: Language paper: https://arxiv.org/pdf/2004.05986 - configpath: opencompass/configs/datasets/CLUE_afqmc + configpath: opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen.py configpath_llmjudge: '' - aime2024: name: AIME2024 @@ -242,41 +266,46 @@ name: Adversarial NLI category: Reasoning paper: https://arxiv.org/pdf/1910.14599v2 - configpath: opencompass/configs/datasets/anli + configpath: opencompass/configs/datasets/anli/anli_gen.py configpath_llmjudge: '' - anthropics_evals: name: Anthropics Evals category: Safety paper: https://arxiv.org/pdf/2212.09251 - configpath: opencompass/configs/datasets/anthropics_evals + configpath: + - opencompass/configs/datasets/anthropics_evals/airisk_gen.py + - opencompass/configs/datasets/anthropics_evals/persona_gen.py + - opencompass/configs/datasets/anthropics_evals/sycophancy_gen.py configpath_llmjudge: '' - apps: name: APPS category: Code paper: https://arxiv.org/pdf/2105.09938 - configpath: opencompass/configs/datasets/apps + configpath: + - opencompass/configs/datasets/apps/apps_gen.py + - opencompass/configs/datasets/apps/apps_mini_gen.py configpath_llmjudge: '' - arc: name: ARC category: Reasoning paper: https://arxiv.org/pdf/1803.05457 configpath: - - opencompass/configs/datasets/ARC_c - - opencompass/configs/datasets/ARC_e + - opencompass/configs/datasets/ARC_c/ARC_c_gen.py + - opencompass/configs/datasets/ARC_e/ARC_e_gen.py configpath_llmjudge: '' - arc_prize_public_eval: name: ARC Prize category: ARC-AGI paper: https://arcprize.org/guide#private - configpath: 
opencompass/configs/datasets/ARC_Prize_Public_Evaluation + configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py configpath_llmjudge: '' - ax: name: SuperGLUE / AX category: Reasoning paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf configpath: - - opencompass/configs/datasets/SuperGLUE_AX_b - - opencompass/configs/datasets/SuperGLUE_AX_g + - opencompass/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen.py + - opencompass/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen.py configpath_llmjudge: '' - bbh: name: BIG-Bench Hard @@ -288,79 +317,82 @@ name: SuperGLUE / BoolQ category: Knowledge paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_BoolQ + configpath: opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py configpath_llmjudge: '' - c3: name: CLUE / C3 (C³) category: Understanding paper: https://arxiv.org/pdf/2004.05986 - configpath: opencompass/configs/datasets/CLUE_C3 + configpath: opencompass/configs/datasets/CLUE_C3/CLUE_C3_gen.py configpath_llmjudge: '' - cb: name: SuperGLUE / CB category: Reasoning paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_CB + configpath: opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py configpath_llmjudge: '' - ceval: name: C-EVAL category: Examination paper: https://arxiv.org/pdf/2305.08322v1 - configpath: opencompass/configs/datasets/ceval + configpath: opencompass/configs/datasets/ceval/ceval_gen.py configpath_llmjudge: '' - charm: name: CHARM category: Reasoning paper: https://arxiv.org/pdf/2403.14112 - configpath: opencompass/configs/datasets/CHARM + configpath: opencompass/configs/datasets/CHARM/charm_reason_gen.py configpath_llmjudge: '' - chembench: name: ChemBench 
category: Knowledge / Chemistry paper: https://arxiv.org/pdf/2404.01475 - configpath: opencompass/configs/datasets/ChemBench + configpath: opencompass/configs/datasets/ChemBench/ChemBench_gen.py configpath_llmjudge: '' - chid: name: FewCLUE / CHID category: Language paper: https://arxiv.org/pdf/2107.07498 - configpath: opencompass/configs/datasets/FewCLUE_chid + configpath: opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen.py configpath_llmjudge: '' - chinese_simpleqa: name: Chinese SimpleQA category: Knowledge paper: https://arxiv.org/pdf/2411.07140 - configpath: opencompass/configs/datasets/chinese_simpleqa + configpath: opencompass/configs/datasets/chinese_simpleqa/chinese_simpleqa_gen.py configpath_llmjudge: '' - cibench: name: CIBench category: Code paper: https://www.arxiv.org/pdf/2407.10499 - configpath: opencompass/configs/datasets/CIBench + configpath: + - opencompass/configs/datasets/CIBench/CIBench_generation_gen_8ab0dc.py + - opencompass/configs/datasets/CIBench/CIBench_template_gen_e6b12a.py + - opencompass/configs/datasets/CIBench/CIBench_template_oracle_gen_fecda1.py configpath_llmjudge: '' - civilcomments: name: CivilComments category: Safety paper: https://arxiv.org/pdf/1903.04561 - configpath: opencompass/configs/datasets/civilcomments + configpath: opencompass/configs/datasets/civilcomments/civilcomments_clp.py configpath_llmjudge: '' - clozeTest_maxmin: name: Cloze Test-max/min category: Code paper: https://arxiv.org/pdf/2102.04664 - configpath: opencompass/configs/datasets/clozeTest_maxmin + configpath: opencompass/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen.py configpath_llmjudge: '' - cluewsc: name: FewCLUE / CLUEWSC category: Language / WSC paper: https://arxiv.org/pdf/2107.07498 - configpath: opencompass/configs/datasets/FewCLUE_cluewsc + configpath: opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen.py configpath_llmjudge: '' - cmb: name: CMB category: Knowledge / Medicine paper: 
https://arxiv.org/pdf/2308.08833 - configpath: opencompass/configs/datasets/cmb + configpath: opencompass/configs/datasets/cmb/cmb_gen.py configpath_llmjudge: '' - cmmlu: name: CMMLU @@ -372,61 +404,61 @@ name: CLUE / CMNLI category: Reasoning paper: https://arxiv.org/pdf/2004.05986 - configpath: opencompass/configs/datasets/CLUE_cmnli + configpath: opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen.py configpath_llmjudge: '' - cmo_fib: name: cmo_fib category: Examination paper: '' - configpath: opencompass/configs/datasets/cmo_fib + configpath: opencompass/configs/datasets/cmo_fib/cmo_fib_gen.py configpath_llmjudge: '' - cmrc: name: CLUE / CMRC category: Understanding paper: https://arxiv.org/pdf/2004.05986 - configpath: opencompass/configs/datasets/CLUE_CMRC + configpath: opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen.py configpath_llmjudge: '' - commonsenseqa: name: CommonSenseQA category: Knowledge paper: https://arxiv.org/pdf/1811.00937v2 - configpath: opencompass/configs/datasets/commonsenseqa + configpath: opencompass/configs/datasets/commonsenseqa/commonsenseqa_gen.py configpath_llmjudge: '' - commonsenseqa_cn: name: CommonSenseQA-CN category: Knowledge paper: '' - configpath: opencompass/configs/datasets/commonsenseqa_cn + configpath: opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen.py configpath_llmjudge: '' - copa: name: SuperGLUE / COPA category: Reasoning paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_COPA + configpath: opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen.py configpath_llmjudge: '' - crowspairs: name: CrowsPairs category: Safety paper: https://arxiv.org/pdf/2010.00133 - configpath: opencompass/configs/datasets/crowspairs + configpath: opencompass/configs/datasets/crowspairs/crowspairs_gen.py configpath_llmjudge: '' - crowspairs_cn: name: CrowsPairs-CN category: Safety paper: '' - 
configpath: opencompass/configs/datasets/crowspairs_cn + configpath: opencompass/configs/datasets/crowspairs_cn/crowspairscn_gen.py configpath_llmjudge: '' - cvalues: name: CVALUES category: Safety paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf - configpath: opencompass/configs/datasets/cvalues + configpath: opencompass/configs/datasets/cvalues/cvalues_responsibility_gen.py configpath_llmjudge: '' - drcd: name: CLUE / DRCD category: Understanding paper: https://arxiv.org/pdf/2004.05986 - configpath: opencompass/configs/datasets/CLUE_DRCD + configpath: opencompass/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen.py configpath_llmjudge: '' - drop: name: DROP (DROP Simple Eval) @@ -438,31 +470,32 @@ name: DS-1000 category: Code paper: https://arxiv.org/pdf/2211.11501 - configpath: opencompass/configs/datasets/ds1000 + configpath: + - opencompass/configs/datasets/ds1000/ds1000_gen_5c4bec.py configpath_llmjudge: '' - eprstmt: name: FewCLUE / EPRSTMT category: Understanding paper: https://arxiv.org/pdf/2107.07498 - configpath: opencompass/configs/datasets/FewCLUE_eprstmt + configpath: opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen.py configpath_llmjudge: '' - flores: name: Flores category: Language paper: https://aclanthology.org/D19-1632.pdf - configpath: opencompass/configs/datasets/flores + configpath: opencompass/configs/datasets/flores/flores_gen.py configpath_llmjudge: '' - game24: name: Game24 category: Math paper: https://huggingface.co/datasets/nlile/24-game - configpath: opencompass/configs/datasets/game24 + configpath: opencompass/configs/datasets/game24/game24_gen.py configpath_llmjudge: '' - govrepcrs: name: Government Report Dataset category: Long Context paper: https://aclanthology.org/2021.naacl-main.112.pdf - configpath: opencompass/configs/datasets/govrepcrs + configpath: opencompass/configs/datasets/govrepcrs/govrepcrs_gen.py configpath_llmjudge: '' - gpqa: name: GPQA @@ -474,19 +507,19 @@ 
name: GSM8K category: Math paper: https://arxiv.org/pdf/2110.14168v2 - configpath: opencompass/configs/datasets/gsm8k + configpath: opencompass/configs/datasets/gsm8k/gsm8k_gen.py configpath_llmjudge: '' - gsm_hard: name: GSM-Hard category: Math paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf - configpath: opencompass/configs/datasets/gsm_hard + configpath: opencompass/configs/datasets/gsm_hard/gsmhard_gen.py configpath_llmjudge: '' - hle: name: HLE(Humanity's Last Exam) category: Reasoning paper: https://lastexam.ai/paper - configpath: opencompass/configs/datasets/HLE + configpath: opencompass/configs/datasets/HLE/hle_gen.py configpath_llmjudge: '' - hellaswag: name: HellaSwag @@ -504,61 +537,67 @@ name: HumanEval-CN category: Code paper: '' - configpath: opencompass/configs/datasets/humaneval_cn + configpath: opencompass/configs/datasets/humaneval_cn/humaneval_cn_gen.py configpath_llmjudge: '' - humaneval_multi: name: Multi-HumanEval category: Code paper: https://arxiv.org/pdf/2210.14868 - configpath: opencompass/configs/datasets/humaneval_multi + configpath: opencompass/configs/datasets/humaneval_multi/humaneval_multi_gen.py + configpath_llmjudge: '' +- humaneval_plus: + name: HumanEval+ + category: Code + paper: https://arxiv.org/pdf/2305.01210 + configpath: opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen.py configpath_llmjudge: '' - humanevalx: name: HumanEval-X category: Code paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790 - configpath: opencompass/configs/datasets/humanevalx + configpath: opencompass/configs/datasets/humanevalx/humanevalx_gen.py configpath_llmjudge: '' - hungarian_math: name: Hungarian_Math category: Math paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam - configpath: opencompass/configs/datasets/hungarian_exam + configpath: opencompass/configs/datasets/hungarian_exam/hungarian_exam_gen.py configpath_llmjudge: '' - iwslt2017: name: IWSLT2017 category: Language paper: 
https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf - configpath: opencompass/configs/datasets/iwslt2017 + configpath: opencompass/configs/datasets/iwslt2017/iwslt2017_gen.py configpath_llmjudge: '' - jigsawmultilingual: name: JigsawMultilingual category: Safety paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data - configpath: opencompass/configs/datasets/jigsawmultilingual + configpath: opencompass/configs/datasets/jigsawmultilingual/jigsawmultilingual_clp.py configpath_llmjudge: '' - lambada: name: LAMBADA category: Understanding paper: https://arxiv.org/pdf/1606.06031 - configpath: opencompass/configs/datasets/lambada + configpath: opencompass/configs/datasets/lambada/lambada_gen.py configpath_llmjudge: '' - lcsts: name: LCSTS category: Understanding paper: https://aclanthology.org/D15-1229.pdf - configpath: opencompass/configs/datasets/lcsts + configpath: opencompass/configs/datasets/lcsts/lcsts_gen.py configpath_llmjudge: '' - livestembench: name: LiveStemBench category: '' paper: '' - configpath: opencompass/configs/datasets/livestembench + configpath: opencompass/configs/datasets/livestembench/livestembench_gen.py configpath_llmjudge: '' - llm_compression: name: LLM Compression category: Bits Per Character (BPC) paper: https://arxiv.org/pdf/2404.09937 - configpath: opencompass/configs/datasets/llm_compression + configpath: opencompass/configs/datasets/llm_compression/llm_compression.py configpath_llmjudge: '' - math: name: MATH @@ -576,37 +615,37 @@ name: MATH 401 category: Math paper: https://arxiv.org/pdf/2304.02015 - configpath: opencompass/configs/datasets/math401 + configpath: opencompass/configs/datasets/math401/math401_gen.py configpath_llmjudge: '' - mathbench: name: MathBench category: Math paper: https://arxiv.org/pdf/2405.12209 - configpath: opencompass/configs/datasets/mathbench + configpath: opencompass/configs/datasets/mathbench/mathbench_gen.py configpath_llmjudge: '' - mbpp: name: 
MBPP category: Code paper: https://arxiv.org/pdf/2108.07732 - configpath: opencompass/configs/datasets/mbpp + configpath: opencompass/configs/datasets/mbpp/mbpp_gen.py configpath_llmjudge: '' - mbpp_cn: name: MBPP-CN category: Code paper: '' - configpath: opencompass/configs/datasets/mbpp_cn + configpath: opencompass/configs/datasets/mbpp_cn/mbpp_cn_gen.py configpath_llmjudge: '' - mbpp_plus: name: MBPP-PLUS category: Code paper: '' - configpath: opencompass/configs/datasets/mbpp_plus + configpath: opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py configpath_llmjudge: '' - mgsm: name: MGSM category: Language / Math paper: https://arxiv.org/pdf/2210.03057 - configpath: opencompass/configs/datasets/mgsm + configpath: opencompass/configs/datasets/mgsm/mgsm_gen.py configpath_llmjudge: '' - mmlu: name: MMLU @@ -618,7 +657,7 @@ name: MMLU-CF category: Understanding paper: https://arxiv.org/pdf/2412.15194 - configpath: opencompass/configs/datasets/mmlu_cf + configpath: opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py configpath_llmjudge: '' - mmlu_pro: name: MMLU-Pro @@ -630,91 +669,99 @@ name: MMMLU category: Language / Understanding paper: https://huggingface.co/datasets/openai/MMMLU - configpath: opencompass/configs/datasets/mmmlu + configpath: + - opencompass/configs/datasets/mmmlu/mmmlu_gen.py + - opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen.py configpath_llmjudge: '' - multirc: name: SuperGLUE / MultiRC category: Understanding paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_MultiRC + configpath: opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen.py configpath_llmjudge: '' - narrativeqa: name: NarrativeQA category: Understanding paper: https://github.com/google-deepmind/narrativeqa - configpath: opencompass/configs/datasets/narrativeqa + configpath: opencompass/configs/datasets/narrativeqa/narrativeqa_gen.py 
configpath_llmjudge: '' - natural_question: name: NaturalQuestions category: Knowledge paper: https://github.com/google-research-datasets/natural-questions - configpath: opencompass/configs/datasets/nq + configpath: opencompass/configs/datasets/nq/nq_gen.py configpath_llmjudge: '' - natural_question_cn: name: NaturalQuestions-CN category: Knowledge paper: '' - configpath: opencompass/configs/datasets/nq_cn + configpath: opencompass/configs/datasets/nq_cn/nqcn_gen.py configpath_llmjudge: '' - obqa: name: OpenBookQA category: Knowledge paper: https://arxiv.org/pdf/1809.02789v1 - configpath: opencompass/configs/datasets/obqa + configpath: opencompass/configs/datasets/obqa/obqa_gen.py configpath_llmjudge: '' - piqa: name: OpenBookQA category: Knowledge / Physics paper: https://arxiv.org/pdf/1911.11641v1 - configpath: opencompass/configs/datasets/piqa + configpath: opencompass/configs/datasets/piqa/piqa_gen.py configpath_llmjudge: '' - py150: name: py150 category: Code paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line - configpath: opencompass/configs/datasets/py150 + configpath: opencompass/configs/datasets/py150/py150_gen.py configpath_llmjudge: '' - qasper: name: Qasper category: Long Context paper: https://arxiv.org/pdf/2105.03011 - configpath: opencompass/configs/datasets/qasper + configpath: opencompass/configs/datasets/qasper/qasper_gen.py configpath_llmjudge: '' - qaspercut: name: Qasper-Cut category: Long Context paper: '' - configpath: opencompass/configs/datasets/qaspercut + configpath: opencompass/configs/datasets/qaspercut/qaspercut_gen.py configpath_llmjudge: '' - race: name: RACE category: Examination paper: https://arxiv.org/pdf/1704.04683 - configpath: opencompass/configs/datasets/race + configpath: opencompass/configs/datasets/race/race_gen.py configpath_llmjudge: '' - realtoxicprompts: name: RealToxicPrompts category: Safety paper: https://arxiv.org/pdf/2009.11462 - configpath: 
opencompass/configs/datasets/realtoxicprompts + configpath: opencompass/configs/datasets/realtoxicprompts/realtoxicprompts_gen.py configpath_llmjudge: '' - record: name: SuperGLUE / ReCoRD category: Understanding paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD + configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen.py configpath_llmjudge: '' - rte: name: SuperGLUE / RTE category: Reasoning paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_RTE + configpath: opencompass/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen.py configpath_llmjudge: '' - ocnli: name: CLUE / OCNLI category: Reasoning paper: https://arxiv.org/pdf/2004.05986 - configpath: opencompass/configs/datasets/CLUE_ocnli + configpath: opencompass/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen.py + configpath_llmjudge: '' +- ocnlifc: + name: FewCLUE / OCNLI-FC + category: Reasoning + paper: https://arxiv.org/pdf/2107.07498 + configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen.py configpath_llmjudge: '' - rolebench: name: RoleBench @@ -726,97 +773,97 @@ name: S3Eval category: Long Context paper: https://aclanthology.org/2024.naacl-long.69.pdf - configpath: opencompass/configs/datasets/s3eval + configpath: opencompass/configs/datasets/s3eval/s3eval_gen.py configpath_llmjudge: '' - scibench: name: SciBench category: Reasoning paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf - configpath: opencompass/configs/datasets/scibench + configpath: opencompass/configs/datasets/scibench/scibench_gen.py configpath_llmjudge: '' - scicode: name: SciCode category: Code paper: https://arxiv.org/pdf/2407.13168 - configpath: opencompass/configs/datasets/scicode + configpath: 
opencompass/configs/datasets/scicode/scicode_gen.py configpath_llmjudge: '' - simpleqa: name: SimpleQA category: Knowledge paper: https://arxiv.org/pdf/2411.04368 - configpath: opencompass/configs/datasets/SimpleQA + configpath: opencompass/configs/datasets/SimpleQA/simpleqa_gen.py configpath_llmjudge: '' - siqa: name: SocialIQA category: Reasoning paper: https://arxiv.org/pdf/1904.09728 - configpath: opencompass/configs/datasets/siqa + configpath: opencompass/configs/datasets/siqa/siqa_gen.py configpath_llmjudge: '' - squad20: name: SQuAD2.0 category: Understanding paper: https://arxiv.org/pdf/1806.03822 - configpath: opencompass/configs/datasets/squad20 + configpath: opencompass/configs/datasets/squad20/squad20_gen.py configpath_llmjudge: '' - storycloze: name: StoryCloze category: Reasoning paper: https://aclanthology.org/2022.emnlp-main.616.pdf - configpath: opencompass/configs/datasets/storycloze + configpath: opencompass/configs/datasets/storycloze/storycloze_gen.py configpath_llmjudge: '' - strategyqa: name: StrategyQA category: Reasoning paper: https://arxiv.org/pdf/2101.02235 - configpath: opencompass/configs/datasets/strategyqa + configpath: opencompass/configs/datasets/strategyqa/strategyqa_gen.py configpath_llmjudge: '' - summedits: name: SummEdits category: Language paper: https://aclanthology.org/2023.emnlp-main.600.pdf - configpath: opencompass/configs/datasets/summedits + configpath: opencompass/configs/datasets/summedits/summedits_gen.py configpath_llmjudge: '' - summscreen: name: SummScreen category: Understanding paper: https://arxiv.org/pdf/2104.07091v1 - configpath: opencompass/configs/datasets/summscreen + configpath: opencompass/configs/datasets/summscreen/summscreen_gen.py configpath_llmjudge: '' - svamp: name: SVAMP category: Math paper: https://aclanthology.org/2021.naacl-main.168.pdf - configpath: opencompass/configs/datasets/SVAMP + configpath: opencompass/configs/datasets/SVAMP/svamp_gen.py configpath_llmjudge: '' - tabmwp: name: TabMWP 
category: Math / Table paper: https://arxiv.org/pdf/2209.14610 - configpath: opencompass/configs/datasets/TabMWP + configpath: opencompass/configs/datasets/TabMWP/TabMWP_gen.py configpath_llmjudge: '' - taco: name: TACO category: Code paper: https://arxiv.org/pdf/2312.14852 - configpath: opencompass/configs/datasets/taco + configpath: opencompass/configs/datasets/taco/taco_gen.py configpath_llmjudge: '' - tnews: name: FewCLUE / TNEWS category: Understanding paper: https://arxiv.org/pdf/2107.07498 - configpath: opencompass/configs/datasets/FewCLUE_tnews + configpath: opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen.py configpath_llmjudge: '' - bustm: name: FewCLUE / BUSTM category: Reasoning paper: https://arxiv.org/pdf/2107.07498 - configpath: opencompass/configs/datasets/FewCLUE_bustm + configpath: opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py configpath_llmjudge: '' - csl: name: FewCLUE / CSL category: Understanding paper: https://arxiv.org/pdf/2107.07498 - configpath: opencompass/configs/datasets/FewCLUE_csl + configpath: opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py configpath_llmjudge: '' - ocnli_fc: name: FewCLUE / OCNLI-FC @@ -828,65 +875,95 @@ name: TriviaQA category: Knowledge paper: https://arxiv.org/pdf/1705.03551v2 - configpath: opencompass/configs/datasets/triviaqa + configpath: opencompass/configs/datasets/triviaqa/triviaqa_gen.py configpath_llmjudge: '' - triviaqarc: name: TriviaQA-RC category: Knowledge / Understanding paper: '' - configpath: opencompass/configs/datasets/triviaqarc + configpath: opencompass/configs/datasets/triviaqarc/triviaqarc_gen.py configpath_llmjudge: '' - truthfulqa: name: TruthfulQA category: Safety paper: https://arxiv.org/pdf/2109.07958v2 - configpath: opencompass/configs/datasets/truthfulqa + configpath: opencompass/configs/datasets/truthfulqa/truthfulqa_gen.py configpath_llmjudge: '' - tydiqa: name: TyDi-QA category: Language paper: 
https://storage.googleapis.com/tydiqa/tydiqa.pdf - configpath: opencompass/configs/datasets/tydiqa + configpath: opencompass/configs/datasets/tydiqa/tydiqa_gen.py configpath_llmjudge: '' - wic: name: SuperGLUE / WiC category: Language paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_WiC + configpath: opencompass/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen.py configpath_llmjudge: '' - wsc: name: SuperGLUE / WSC category: Language / WSC paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_WSC + configpath: opencompass/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen.py configpath_llmjudge: '' - winogrande: name: WinoGrande category: Language / WSC paper: https://arxiv.org/pdf/1907.10641v2 - configpath: opencompass/configs/datasets/winogrande + configpath: opencompass/configs/datasets/winogrande/winogrande_gen.py configpath_llmjudge: '' - xcopa: name: XCOPA category: Language paper: https://arxiv.org/pdf/2005.00333 - configpath: opencompass/configs/datasets/XCOPA + configpath: opencompass/configs/datasets/XCOPA/XCOPA_ppl.py configpath_llmjudge: '' - xiezhi: name: Xiezhi category: Knowledge paper: https://arxiv.org/pdf/2306.05783 - configpath: opencompass/configs/datasets/xiezhi + configpath: opencompass/configs/datasets/xiezhi/xiezhi_gen.py configpath_llmjudge: '' - xlsum: name: XLSum category: Understanding paper: https://arxiv.org/pdf/2106.13822v1 - configpath: opencompass/configs/datasets/XLSum + configpath: opencompass/configs/datasets/XLSum/XLSum_gen.py configpath_llmjudge: '' - xsum: name: Xsum category: Understanding paper: https://arxiv.org/pdf/1808.08745 - configpath: opencompass/configs/datasets/Xsum + configpath: opencompass/configs/datasets/Xsum/Xsum_gen.py configpath_llmjudge: '' +- cola: + name: GLUE / CoLA + category: 
Understanding + paper: https://arxiv.org/pdf/1804.07461 + configpath: opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl.py + configpath_llmjudge: '' +- mrpc: + name: GLUE / MRPC + category: Understanding + paper: https://arxiv.org/pdf/1804.07461 + configpath: opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl.py + configpath_llmjudge: '' +- qqp: + name: GLUE / QQP + category: Understanding + paper: https://arxiv.org/pdf/1804.07461 + configpath: opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl.py + configpath_llmjudge: '' +- omni_math: + name: Omni-MATH + category: Math + paper: https://omni-math.github.io/ + configpath: opencompass/configs/datasets/omni_math/omni_math_gen.py + configpath_llmjudge: '' +- wikibench: + name: WikiBench + category: Knowledge + paper: '' + configpath: opencompass/configs/datasets/wikibench/wikibench_gen.py + configpath_llmjudge: '' \ No newline at end of file diff --git a/docs/en/statis.py b/docs/en/statis.py index a110c631..483ebf78 100755 --- a/docs/en/statis.py +++ b/docs/en/statis.py @@ -14,6 +14,12 @@ On this page, we have listed all the datasets supported by OpenCompass. You can use sorting and search functions to find the dataset you need. +We provide recommended running configurations for each dataset, +and in some datasets also offer recommended configurations based on LLM Judge. + +You can quickly start evaluation tasks based on the recommended configurations. +However, please note that these configurations may be updated over time. 
+ """ with open('dataset_statistics.md', 'w') as f: @@ -24,7 +30,7 @@ load_path = str(OC_ROOT / 'dataset-index.yml') with open(load_path, 'r') as f2: data_list = yaml.load(f2, Loader=yaml.FullLoader) -HEADER = ['name', 'category', 'paper', 'configpath'] +HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge'] def table_format(data_list): @@ -35,6 +41,13 @@ def table_format(data_list): for index in HEADER: if index == 'paper': table_format_list_sub.append('[link](' + i[j][index] + ')') + elif index == 'configpath_llmjudge': + if i[j][index] == '': + table_format_list_sub.append(i[j][index]) + else: + table_format_list_sub.append('[link](' + + GITHUB_PREFIX + + i[j][index] + ')') elif index == 'configpath': if isinstance(i[j][index], list): sub_list_text = '' @@ -61,7 +74,10 @@ def generate_table(data_list, title=None): if title is not None: f.write(f'\n{title}') f.write("""\n```{table}\n:class: dataset\n""") - header = ['Name', 'Category', 'Paper or Repository', 'Config File'] + header = [ + 'Name', 'Category', 'Paper or Repository', 'Recommended Config', + 'Recommended Config (LLM Judge)' + ] table_cfg = dict(tablefmt='pipe', floatfmt='.2f', numalign='right', diff --git a/docs/zh_cn/statis.py b/docs/zh_cn/statis.py index 4e68bae0..19d03bfd 100755 --- a/docs/zh_cn/statis.py +++ b/docs/zh_cn/statis.py @@ -14,6 +14,10 @@ DATASETZOO_TEMPLATE = """\ 你可以使用排序和搜索功能找到需要的数据集。 +我们对每一个数据集都给出了推荐的运行配置,部分数据集中还提供了基于LLM Judge的推荐配置。 + +你可以基于推荐配置快速启动评测。但请注意,推荐配置可能随时间推移被更新。 + """ with open('dataset_statistics.md', 'w') as f: @@ -35,7 +39,13 @@ def table_format(data_list): for index in HEADER: if index == 'paper': table_format_list_sub.append('[链接](' + i[j][index] + ')') - elif index != 'name' and index != 'category': + elif index == 'configpath_llmjudge': + if i[j][index] == '': + table_format_list_sub.append(i[j][index]) + else: + table_format_list_sub.append('[链接](' + GITHUB_PREFIX + + i[j][index] + ')') + elif index == 'configpath': if isinstance(i[j][index], 
list): sub_list_text = '' for k in i[j][index]: @@ -60,7 +70,7 @@ def generate_table(data_list, title=None): if title is not None: f.write(f'\n{title}') f.write("""\n```{table}\n:class: dataset\n""") - header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置(基于规则评估)', '推荐配置(基于LLM评估)'] + header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置', '推荐配置(基于LLM评估)'] table_cfg = dict(tablefmt='pipe', floatfmt='.2f', numalign='right', diff --git a/opencompass/datasets/generic.py b/opencompass/datasets/generic.py index 13c6d467..dc783167 100644 --- a/opencompass/datasets/generic.py +++ b/opencompass/datasets/generic.py @@ -78,7 +78,6 @@ def generic_llmjudge_postprocess( f'No gold answer for {k}, use empty string as reference!') references.append('') results = get_final_results(judged_answers, references, origial_responses) - results['details'] = output return results