diff --git a/dataset-index.yml b/dataset-index.yml index 0f15a382..6bd2f784 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -1,739 +1,886 @@ - - ifeval: name: IFEval category: Instruction Following paper: https://arxiv.org/pdf/2311.07911 configpath: opencompass/configs/datasets/IFEval/IFEval + configpath_llmjudge: '' - nphard: name: NPHardEval category: Reasoning paper: https://arxiv.org/pdf/2312.14890v2 configpath: opencompass/configs/datasets/NPHardEval + configpath_llmjudge: '' - pmmeval: name: PMMEval category: Language paper: https://arxiv.org/pdf/2411.09116v1 configpath: opencompass/configs/datasets/PMMEval + configpath_llmjudge: '' - theoremqa: name: TheroremQA category: Reasoning paper: https://arxiv.org/pdf/2305.12524 configpath: opencompass/configs/datasets/TheroremQA + configpath_llmjudge: '' - agieval: name: AGIEval category: Examination paper: https://arxiv.org/pdf/2304.06364 configpath: opencompass/configs/datasets/agieval + configpath_llmjudge: '' - babilong: name: BABILong category: Long Context paper: https://arxiv.org/pdf/2406.10149 configpath: opencompass/configs/datasets/babilong + configpath_llmjudge: '' - bigcodebench: name: BigCodeBench category: Code paper: https://arxiv.org/pdf/2406.15877 configpath: opencompass/configs/datasets/bigcodebench + configpath_llmjudge: '' - calm: name: CaLM category: Reasoning paper: https://arxiv.org/pdf/2405.00622 configpath: opencompass/configs/datasets/calm + configpath_llmjudge: '' - infinitebench: name: InfiniteBench (∞Bench) category: Long Context paper: https://aclanthology.org/2024.acl-long.814.pdf configpath: opencompass/configs/datasets/infinitebench + configpath_llmjudge: '' - korbench: name: KOR-Bench category: Reasoning paper: https://arxiv.org/pdf/2410.06526v1 configpath: opencompass/configs/datasets/korbench + configpath_llmjudge: '' - lawbench: name: LawBench category: Knowledge / Law paper: https://arxiv.org/pdf/2309.16289 configpath: opencompass/configs/datasets/lawbench + configpath_llmjudge: '' - leval: name: L-Eval category: Long Context paper: https://arxiv.org/pdf/2307.11088v1 configpath: opencompass/configs/datasets/leval + configpath_llmjudge: '' - livecodebench: name: LiveCodeBench category: Code paper: https://arxiv.org/pdf/2403.07974 configpath: opencompass/configs/datasets/livecodebench + configpath_llmjudge: '' - livemathbench: name: LiveMathBench category: Math paper: https://arxiv.org/pdf/2412.13147 configpath: opencompass/configs/datasets/livemathbench + configpath_llmjudge: '' - longbench: name: LongBench category: Long Context paper: https://github.com/THUDM/LongBench configpath: opencompass/configs/datasets/livemathbench + configpath_llmjudge: '' - lveval: name: LV-Eval category: Long Context paper: https://arxiv.org/pdf/2402.05136 configpath: opencompass/configs/datasets/lveval + configpath_llmjudge: '' - medbench: name: MedBench category: Knowledge / Medicine paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138 configpath: opencompass/configs/datasets/MedBench + configpath_llmjudge: '' - musr: name: MuSR category: Reasoning paper: https://arxiv.org/pdf/2310.16049 configpath: opencompass/configs/datasets/musr + configpath_llmjudge: '' - needlebench: name: NeedleBench category: Long Context paper: https://arxiv.org/pdf/2407.11963 configpath: opencompass/configs/datasets/needlebench + configpath_llmjudge: '' - ruler: name: RULER category: Long Context paper: https://arxiv.org/pdf/2404.06654 configpath: opencompass/configs/datasets/ruler + configpath_llmjudge: '' - alignment: name: AlignBench category: Subjective / Alignment paper: https://arxiv.org/pdf/2311.18743 configpath: opencompass/configs/datasets/subjective/alignbench + configpath_llmjudge: '' - alpaca: name: AlpacaEval category: Subjective / Instruction Following paper: https://github.com/tatsu-lab/alpaca_eval configpath: opencompass/configs/datasets/subjective/aplaca_eval + configpath_llmjudge: '' - arenahard: name: Arena-Hard category: Subjective / Chatbot paper: https://lmsys.org/blog/2024-04-19-arena-hard/ configpath: opencompass/configs/datasets/subjective/arena_hard + configpath_llmjudge: '' - flames: name: FLAMES category: Subjective / Alignment paper: https://arxiv.org/pdf/2311.06899 configpath: opencompass/configs/datasets/subjective/flames + configpath_llmjudge: '' - fofo: name: FOFO category: Subjective / Format Following paper: https://arxiv.org/pdf/2402.18667 configpath: opencompass/configs/datasets/subjective/fofo + configpath_llmjudge: '' - followbench: name: FollowBench category: Subjective / Instruction Following paper: https://arxiv.org/pdf/2310.20410 configpath: opencompass/configs/datasets/subjective/followbench + configpath_llmjudge: '' - hellobench: name: HelloBench category: Subjective / Long Context paper: https://arxiv.org/pdf/2409.16191 configpath: opencompass/configs/datasets/subjective/hellobench + configpath_llmjudge: '' - judgerbench: name: JudgerBench category: Subjective / Long Context paper: https://arxiv.org/pdf/2410.16256 configpath: opencompass/configs/datasets/subjective/judgerbench + configpath_llmjudge: '' - multiround: name: MT-Bench-101 category: Subjective / Multi-Round paper: https://arxiv.org/pdf/2402.14762 configpath: opencompass/configs/datasets/subjective/multiround + configpath_llmjudge: '' - wildbench: name: WildBench category: Subjective / Real Task paper: https://arxiv.org/pdf/2406.04770 configpath: opencompass/configs/datasets/subjective/wildbench + configpath_llmjudge: '' - teval: name: T-Eval category: Tool Utilization paper: https://arxiv.org/pdf/2312.14033 configpath: opencompass/configs/datasets/teval + configpath_llmjudge: '' - finalceiq: name: FinanceIQ category: Knowledge / Finance paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ configpath: opencompass/configs/datasets/FinanceIQ + configpath_llmjudge: '' - gaokaobench: name: GAOKAOBench category: Examination paper: https://arxiv.org/pdf/2305.12474 configpath: opencompass/configs/datasets/GaokaoBench + configpath_llmjudge: '' - lcbench: name: LCBench category: Code paper: https://github.com/open-compass/CodeBench/ configpath: opencompass/configs/datasets/LCBench + configpath_llmjudge: '' - MMLUArabic: name: ArabicMMLU category: Language paper: https://arxiv.org/pdf/2402.12840 configpath: opencompass/configs/datasets/MMLUArabic + configpath_llmjudge: '' - OpenFinData: name: OpenFinData category: Knowledge / Finance paper: https://github.com/open-compass/OpenFinData configpath: opencompass/configs/datasets/OpenFinData + configpath_llmjudge: '' - QuALITY: name: QuALITY category: Long Context paper: https://arxiv.org/pdf/2112.08608 configpath: opencompass/configs/datasets/QuALITY + configpath_llmjudge: '' - advglue: name: Adversarial GLUE category: Safety paper: https://openreview.net/pdf?id=GF9cSKI3A_q configpath: opencompass/configs/datasets/adv_glue + configpath_llmjudge: '' - afqmcd: name: CLUE / AFQMC category: Language paper: https://arxiv.org/pdf/2004.05986 configpath: opencompass/configs/datasets/CLUE_afqmc + configpath_llmjudge: '' - aime2024: name: AIME2024 category: Examination paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024 configpath: opencompass/configs/datasets/aime2024 + configpath_llmjudge: '' - anli: name: Adversarial NLI category: Reasoning paper: https://arxiv.org/pdf/1910.14599v2 configpath: opencompass/configs/datasets/anli + configpath_llmjudge: '' - anthropics_evals: name: Anthropics Evals category: Safety paper: https://arxiv.org/pdf/2212.09251 configpath: opencompass/configs/datasets/anthropics_evals + configpath_llmjudge: '' - apps: name: APPS category: Code paper: https://arxiv.org/pdf/2105.09938 configpath: opencompass/configs/datasets/apps + configpath_llmjudge: '' - arc: name: ARC category: Reasoning paper: https://arxiv.org/pdf/1803.05457 - configpath: [opencompass/configs/datasets/ARC_c, opencompass/configs/datasets/ARC_e] + configpath: + - opencompass/configs/datasets/ARC_c + - opencompass/configs/datasets/ARC_e + configpath_llmjudge: '' - arc_prize_public_eval: name: ARC Prize category: ARC-AGI paper: https://arcprize.org/guide#private configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation + configpath_llmjudge: '' - ax: name: SuperGLUE / AX category: Reasoning paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: [opencompass/configs/datasets/SuperGLUE_AX_b, opencompass/configs/datasets/SuperGLUE_AX_g] + configpath: + - opencompass/configs/datasets/SuperGLUE_AX_b + - opencompass/configs/datasets/SuperGLUE_AX_g + configpath_llmjudge: '' - bbh: name: BIG-Bench Hard category: Reasoning paper: https://arxiv.org/pdf/2210.09261 configpath: opencompass/configs/datasets/bbh + configpath_llmjudge: '' - BoolQ: name: SuperGLUE / BoolQ category: Knowledge paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf configpath: opencompass/configs/datasets/SuperGLUE_BoolQ + configpath_llmjudge: '' - c3: name: CLUE / C3 (C³) category: Understanding paper: https://arxiv.org/pdf/2004.05986 configpath: opencompass/configs/datasets/CLUE_C3 + configpath_llmjudge: '' - cb: name: SuperGLUE / CB category: Reasoning paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf configpath: opencompass/configs/datasets/SuperGLUE_CB + configpath_llmjudge: '' - ceval: name: C-EVAL category: Examination paper: https://arxiv.org/pdf/2305.08322v1 configpath: opencompass/configs/datasets/ceval + configpath_llmjudge: '' - charm: name: CHARM category: Reasoning paper: https://arxiv.org/pdf/2403.14112 configpath: opencompass/configs/datasets/CHARM + configpath_llmjudge: '' - chembench: name: ChemBench category: Knowledge / Chemistry paper: https://arxiv.org/pdf/2404.01475 configpath: opencompass/configs/datasets/ChemBench + configpath_llmjudge: '' - chid: name: FewCLUE / CHID category: Language paper: https://arxiv.org/pdf/2107.07498 configpath: opencompass/configs/datasets/FewCLUE_chid + configpath_llmjudge: '' - chinese_simpleqa: name: Chinese SimpleQA category: Knowledge paper: https://arxiv.org/pdf/2411.07140 configpath: opencompass/configs/datasets/chinese_simpleqa + configpath_llmjudge: '' - cibench: name: CIBench category: Code paper: https://www.arxiv.org/pdf/2407.10499 configpath: opencompass/configs/datasets/CIBench + configpath_llmjudge: '' - civilcomments: name: CivilComments category: Safety paper: https://arxiv.org/pdf/1903.04561 configpath: opencompass/configs/datasets/civilcomments + configpath_llmjudge: '' - clozeTest_maxmin: name: Cloze Test-max/min category: Code paper: https://arxiv.org/pdf/2102.04664 configpath: opencompass/configs/datasets/clozeTest_maxmin + configpath_llmjudge: '' - cluewsc: name: FewCLUE / CLUEWSC category: Language / WSC paper: https://arxiv.org/pdf/2107.07498 configpath: opencompass/configs/datasets/FewCLUE_cluewsc + configpath_llmjudge: '' - cmb: name: CMB category: Knowledge / Medicine paper: https://arxiv.org/pdf/2308.08833 configpath: opencompass/configs/datasets/cmb + configpath_llmjudge: '' - cmmlu: name: CMMLU category: Understanding paper: https://arxiv.org/pdf/2306.09212 configpath: opencompass/configs/datasets/cmmlu + configpath_llmjudge: '' - cmnli: name: CLUE / CMNLI category: Reasoning paper: https://arxiv.org/pdf/2004.05986 configpath: opencompass/configs/datasets/CLUE_cmnli + configpath_llmjudge: '' - cmo_fib: name: cmo_fib category: Examination - paper: "" + paper: '' configpath: opencompass/configs/datasets/cmo_fib + configpath_llmjudge: '' - cmrc: name: CLUE / CMRC category: Understanding paper: https://arxiv.org/pdf/2004.05986 configpath: opencompass/configs/datasets/CLUE_CMRC + configpath_llmjudge: '' - commonsenseqa: name: CommonSenseQA category: Knowledge paper: https://arxiv.org/pdf/1811.00937v2 configpath: opencompass/configs/datasets/commonsenseqa + configpath_llmjudge: '' - commonsenseqa_cn: name: CommonSenseQA-CN category: Knowledge - paper: "" + paper: '' configpath: opencompass/configs/datasets/commonsenseqa_cn + configpath_llmjudge: '' - copa: name: SuperGLUE / COPA category: Reasoning paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf configpath: opencompass/configs/datasets/SuperGLUE_COPA + configpath_llmjudge: '' - crowspairs: name: CrowsPairs category: Safety paper: https://arxiv.org/pdf/2010.00133 configpath: opencompass/configs/datasets/crowspairs + configpath_llmjudge: '' - crowspairs_cn: name: CrowsPairs-CN category: Safety - paper: "" + paper: '' configpath: opencompass/configs/datasets/crowspairs_cn + configpath_llmjudge: '' - cvalues: name: CVALUES category: Safety paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf configpath: opencompass/configs/datasets/cvalues + configpath_llmjudge: '' - drcd: name: CLUE / DRCD category: Understanding paper: https://arxiv.org/pdf/2004.05986 configpath: opencompass/configs/datasets/CLUE_DRCD + configpath_llmjudge: '' - drop: name: DROP (DROP Simple Eval) category: Understanding paper: https://arxiv.org/pdf/1903.00161 configpath: opencompass/configs/datasets/drop + configpath_llmjudge: '' - ds1000: name: DS-1000 category: Code paper: https://arxiv.org/pdf/2211.11501 configpath: opencompass/configs/datasets/ds1000 + configpath_llmjudge: '' - eprstmt: name: FewCLUE / EPRSTMT category: Understanding paper: https://arxiv.org/pdf/2107.07498 configpath: opencompass/configs/datasets/FewCLUE_eprstmt + configpath_llmjudge: '' - flores: name: Flores category: Language paper: https://aclanthology.org/D19-1632.pdf configpath: opencompass/configs/datasets/flores + configpath_llmjudge: '' - game24: name: Game24 category: Math paper: https://huggingface.co/datasets/nlile/24-game configpath: opencompass/configs/datasets/game24 + configpath_llmjudge: '' - govrepcrs: name: Government Report Dataset category: Long Context paper: https://aclanthology.org/2021.naacl-main.112.pdf configpath: opencompass/configs/datasets/govrepcrs + configpath_llmjudge: '' - gpqa: name: GPQA category: Knowledge paper: https://arxiv.org/pdf/2311.12022v1 configpath: opencompass/configs/datasets/gpqa + configpath_llmjudge: '' - gsm8k: name: GSM8K category: Math paper: https://arxiv.org/pdf/2110.14168v2 configpath: opencompass/configs/datasets/gsm8k + configpath_llmjudge: '' - gsm_hard: name: GSM-Hard category: Math paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf configpath: opencompass/configs/datasets/gsm_hard + configpath_llmjudge: '' - hle: name: HLE(Humanity's Last Exam) category: Reasoning paper: https://lastexam.ai/paper configpath: opencompass/configs/datasets/HLE + configpath_llmjudge: '' - hellaswag: name: HellaSwag category: Reasoning paper: https://arxiv.org/pdf/1905.07830 configpath: opencompass/configs/datasets/hellaswag + configpath_llmjudge: '' - humaneval: name: HumanEval category: Code paper: https://arxiv.org/pdf/2107.03374v2 configpath: opencompass/configs/datasets/humaneval + configpath_llmjudge: '' - humaneval_cn: name: HumanEval-CN category: Code - paper: "" + paper: '' configpath: opencompass/configs/datasets/humaneval_cn + configpath_llmjudge: '' - humaneval_multi: name: Multi-HumanEval category: Code paper: https://arxiv.org/pdf/2210.14868 configpath: opencompass/configs/datasets/humaneval_multi + configpath_llmjudge: '' - humanevalx: name: HumanEval-X category: Code paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790 configpath: opencompass/configs/datasets/humanevalx + configpath_llmjudge: '' - hungarian_math: name: Hungarian_Math category: Math paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam configpath: opencompass/configs/datasets/hungarian_exam + configpath_llmjudge: '' - iwslt2017: name: IWSLT2017 category: Language paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf configpath: opencompass/configs/datasets/iwslt2017 + configpath_llmjudge: '' - jigsawmultilingual: name: JigsawMultilingual category: Safety paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data configpath: opencompass/configs/datasets/jigsawmultilingual + configpath_llmjudge: '' - lambada: name: LAMBADA category: Understanding paper: https://arxiv.org/pdf/1606.06031 configpath: opencompass/configs/datasets/lambada + configpath_llmjudge: '' - lcsts: name: LCSTS category: Understanding paper: https://aclanthology.org/D15-1229.pdf configpath: opencompass/configs/datasets/lcsts + configpath_llmjudge: '' - livestembench: name: LiveStemBench - category: "" - paper: "" + category: '' + paper: '' configpath: opencompass/configs/datasets/livestembench + configpath_llmjudge: '' - llm_compression: name: LLM Compression category: Bits Per Character (BPC) paper: https://arxiv.org/pdf/2404.09937 configpath: opencompass/configs/datasets/llm_compression + configpath_llmjudge: '' - math: name: MATH category: Math paper: https://arxiv.org/pdf/2103.03874 configpath: opencompass/configs/datasets/math + configpath_llmjudge: '' - math401: name: MATH 401 category: Math paper: https://arxiv.org/pdf/2304.02015 configpath: opencompass/configs/datasets/math401 + configpath_llmjudge: '' - mathbench: name: MathBench category: Math paper: https://arxiv.org/pdf/2405.12209 configpath: opencompass/configs/datasets/mathbench + configpath_llmjudge: '' - mbpp: name: MBPP category: Code paper: https://arxiv.org/pdf/2108.07732 configpath: opencompass/configs/datasets/mbpp + configpath_llmjudge: '' - mbpp_cn: name: MBPP-CN category: Code - paper: "" + paper: '' configpath: opencompass/configs/datasets/mbpp_cn + configpath_llmjudge: '' - mbpp_plus: name: MBPP-PLUS category: Code - paper: "" + paper: '' configpath: opencompass/configs/datasets/mbpp_plus + configpath_llmjudge: '' - mgsm: name: MGSM category: Language / Math paper: https://arxiv.org/pdf/2210.03057 configpath: opencompass/configs/datasets/mgsm + configpath_llmjudge: '' - mmlu: name: MMLU category: Understanding paper: https://arxiv.org/pdf/2009.03300 configpath: opencompass/configs/datasets/mmlu + configpath_llmjudge: '' - mmlu_cf: name: MMLU-CF category: Understanding paper: https://arxiv.org/pdf/2412.15194 configpath: opencompass/configs/datasets/mmlu_cf + configpath_llmjudge: '' - mmlu_pro: name: MMLU-Pro category: Understanding paper: https://arxiv.org/pdf/2406.01574 configpath: opencompass/configs/datasets/mmlu_pro + configpath_llmjudge: '' - mmmlu: name: MMMLU category: Language / Understanding paper: https://huggingface.co/datasets/openai/MMMLU configpath: opencompass/configs/datasets/mmmlu + configpath_llmjudge: '' - multirc: name: SuperGLUE / MultiRC category: Understanding paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf configpath: opencompass/configs/datasets/SuperGLUE_MultiRC + configpath_llmjudge: '' - narrativeqa: name: NarrativeQA category: Understanding paper: https://github.com/google-deepmind/narrativeqa configpath: opencompass/configs/datasets/narrativeqa + configpath_llmjudge: '' - natural_question: name: NaturalQuestions category: Knowledge paper: https://github.com/google-research-datasets/natural-questions configpath: opencompass/configs/datasets/nq + configpath_llmjudge: '' - natural_question_cn: name: NaturalQuestions-CN category: Knowledge - paper: "" + paper: '' configpath: opencompass/configs/datasets/nq_cn + configpath_llmjudge: '' - obqa: name: OpenBookQA category: Knowledge paper: https://arxiv.org/pdf/1809.02789v1 configpath: opencompass/configs/datasets/obqa + configpath_llmjudge: '' - piqa: name: OpenBookQA category: Knowledge / Physics paper: https://arxiv.org/pdf/1911.11641v1 configpath: opencompass/configs/datasets/piqa + configpath_llmjudge: '' - py150: name: py150 category: Code paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line configpath: opencompass/configs/datasets/py150 + configpath_llmjudge: '' - qasper: name: Qasper category: Long Context paper: https://arxiv.org/pdf/2105.03011 configpath: opencompass/configs/datasets/qasper + configpath_llmjudge: '' - qaspercut: name: Qasper-Cut category: Long Context - paper: "" + paper: '' configpath: opencompass/configs/datasets/qaspercut + configpath_llmjudge: '' - race: name: RACE category: Examination paper: https://arxiv.org/pdf/1704.04683 configpath: opencompass/configs/datasets/race + configpath_llmjudge: '' - realtoxicprompts: name: RealToxicPrompts category: Safety paper: https://arxiv.org/pdf/2009.11462 configpath: opencompass/configs/datasets/realtoxicprompts + configpath_llmjudge: '' - record: name: SuperGLUE / ReCoRD category: Understanding paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD + configpath_llmjudge: '' - rte: name: SuperGLUE / RTE category: Reasoning paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf configpath: opencompass/configs/datasets/SuperGLUE_RTE + configpath_llmjudge: '' - ocnli: name: CLUE / OCNLI category: Reasoning paper: https://arxiv.org/pdf/2004.05986 configpath: opencompass/configs/datasets/CLUE_ocnli + configpath_llmjudge: '' - rolebench: name: RoleBench category: Role Play paper: https://arxiv.org/pdf/2310.00746 configpath: opencompass/configs/datasets/rolebench + configpath_llmjudge: '' - s3eval: name: S3Eval category: Long Context paper: https://aclanthology.org/2024.naacl-long.69.pdf configpath: opencompass/configs/datasets/s3eval + configpath_llmjudge: '' - scibench: name: SciBench category: Reasoning paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf configpath: opencompass/configs/datasets/scibench + configpath_llmjudge: '' - scicode: name: SciCode category: Code paper: https://arxiv.org/pdf/2407.13168 configpath: opencompass/configs/datasets/scicode + configpath_llmjudge: '' - simpleqa: name: SimpleQA category: Knowledge paper: https://arxiv.org/pdf/2411.04368 configpath: opencompass/configs/datasets/SimpleQA + configpath_llmjudge: '' - siqa: name: SocialIQA category: Reasoning paper: https://arxiv.org/pdf/1904.09728 configpath: opencompass/configs/datasets/siqa + configpath_llmjudge: '' - squad20: name: SQuAD2.0 category: Understanding paper: https://arxiv.org/pdf/1806.03822 configpath: opencompass/configs/datasets/squad20 + configpath_llmjudge: '' - storycloze: name: StoryCloze category: Reasoning paper: https://aclanthology.org/2022.emnlp-main.616.pdf configpath: opencompass/configs/datasets/storycloze + configpath_llmjudge: '' - strategyqa: name: StrategyQA category: Reasoning paper: https://arxiv.org/pdf/2101.02235 configpath: opencompass/configs/datasets/strategyqa + configpath_llmjudge: '' - summedits: name: SummEdits category: Language paper: https://aclanthology.org/2023.emnlp-main.600.pdf configpath: opencompass/configs/datasets/summedits + configpath_llmjudge: '' - summscreen: name: SummScreen category: Understanding paper: https://arxiv.org/pdf/2104.07091v1 configpath: opencompass/configs/datasets/summscreen + configpath_llmjudge: '' - svamp: name: SVAMP category: Math paper: https://aclanthology.org/2021.naacl-main.168.pdf configpath: opencompass/configs/datasets/SVAMP + configpath_llmjudge: '' - tabmwp: name: TabMWP category: Math / Table paper: https://arxiv.org/pdf/2209.14610 configpath: opencompass/configs/datasets/TabMWP + configpath_llmjudge: '' - taco: name: TACO category: Code paper: https://arxiv.org/pdf/2312.14852 configpath: opencompass/configs/datasets/taco + configpath_llmjudge: '' - tnews: name: FewCLUE / TNEWS category: Understanding paper: https://arxiv.org/pdf/2107.07498 configpath: opencompass/configs/datasets/FewCLUE_tnews + configpath_llmjudge: '' - bustm: name: FewCLUE / BUSTM category: Reasoning paper: https://arxiv.org/pdf/2107.07498 configpath: opencompass/configs/datasets/FewCLUE_bustm + configpath_llmjudge: '' - csl: name: FewCLUE / CSL category: Understanding paper: https://arxiv.org/pdf/2107.07498 configpath: opencompass/configs/datasets/FewCLUE_csl + configpath_llmjudge: '' - ocnli_fc: name: FewCLUE / OCNLI-FC category: Reasoning paper: https://arxiv.org/pdf/2107.07498 configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc + configpath_llmjudge: '' - triviaqa: name: TriviaQA category: Knowledge paper: https://arxiv.org/pdf/1705.03551v2 configpath: opencompass/configs/datasets/triviaqa + configpath_llmjudge: '' - triviaqarc: name: TriviaQA-RC category: Knowledge / Understanding - paper: "" + paper: '' configpath: opencompass/configs/datasets/triviaqarc + configpath_llmjudge: '' - truthfulqa: name: TruthfulQA category: Safety paper: https://arxiv.org/pdf/2109.07958v2 configpath: opencompass/configs/datasets/truthfulqa + configpath_llmjudge: '' - tydiqa: name: TyDi-QA category: Language paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf configpath: opencompass/configs/datasets/tydiqa + configpath_llmjudge: '' - wic: name: SuperGLUE / WiC category: Language paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf configpath: opencompass/configs/datasets/SuperGLUE_WiC + configpath_llmjudge: '' - wsc: name: SuperGLUE / WSC category: Language / WSC paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf configpath: opencompass/configs/datasets/SuperGLUE_WSC + configpath_llmjudge: '' - winogrande: name: WinoGrande category: Language / WSC paper: https://arxiv.org/pdf/1907.10641v2 configpath: opencompass/configs/datasets/winogrande + configpath_llmjudge: '' - xcopa: name: XCOPA category: Language paper: https://arxiv.org/pdf/2005.00333 configpath: opencompass/configs/datasets/XCOPA + configpath_llmjudge: '' - xiezhi: name: Xiezhi category: Knowledge paper: https://arxiv.org/pdf/2306.05783 configpath: opencompass/configs/datasets/xiezhi + configpath_llmjudge: '' - xlsum: name: XLSum category: Understanding paper: https://arxiv.org/pdf/2106.13822v1 configpath: opencompass/configs/datasets/XLSum + configpath_llmjudge: '' - xsum: name: Xsum category: Understanding paper: https://arxiv.org/pdf/1808.08745 configpath: opencompass/configs/datasets/Xsum - - - + configpath_llmjudge: '' diff --git a/docs/zh_cn/statis.py b/docs/zh_cn/statis.py index eb5dc7fe..4e68bae0 100755 --- a/docs/zh_cn/statis.py +++ b/docs/zh_cn/statis.py @@ -24,7 +24,7 @@ load_path = str(OC_ROOT / 'dataset-index.yml') with open(load_path, 'r') as f2: data_list = yaml.load(f2, Loader=yaml.FullLoader) -HEADER = ['name', 'category', 'paper', 'configpath'] +HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge'] def table_format(data_list): @@ -35,7 +35,7 @@ def table_format(data_list): for index in HEADER: if index == 'paper': table_format_list_sub.append('[链接](' + i[j][index] + ')') - elif index == 'configpath': + elif index != 'name' and index != 'category': if isinstance(i[j][index], list): sub_list_text = '' for k in i[j][index]: @@ -60,7 +60,7 @@ def generate_table(data_list, title=None): if title is not None: f.write(f'\n{title}') f.write("""\n```{table}\n:class: dataset\n""") - header = ['数据集名称', '数据集类型', '原文或资源地址', '配置文件链接'] + header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置(基于规则评估)', '推荐配置(基于LLM评估)'] table_cfg = dict(tablefmt='pipe', floatfmt='.2f', numalign='right', diff --git a/opencompass/configs/datasets/race/race_gen.py b/opencompass/configs/datasets/race/race_gen.py index 44059948..3fd646ca 100644 --- a/opencompass/configs/datasets/race/race_gen.py +++ b/opencompass/configs/datasets/race/race_gen.py @@ -1,68 +1,4 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import RaceDataset -from opencompass.utils.text_postprocessors import ( - first_option_postprocess, -) +from mmengine.config import read_base -QUERY_TEMPLATE = """ -Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering. - -Article: {article} - -Q: {question} - -A. {A} -B. {B} -C. {C} -D. {D} -""".strip() - -race_reader_cfg = dict( - input_columns=['article', 'question', 'A', 'B', 'C', 'D'], - output_column='answer', - train_split='validation', - test_split='test', -) - -race_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - round=[ - dict(role='HUMAN', prompt=QUERY_TEMPLATE), - ] - ), - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer), -) - -race_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), - pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'), - pred_role='BOT', -) - -race_datasets = [ - dict( - abbr='race-middle', - type=RaceDataset, - path='opencompass/race', - name='middle', - reader_cfg=race_reader_cfg, - infer_cfg=race_infer_cfg, - eval_cfg=race_eval_cfg, - ), - dict( - abbr='race-high', - type=RaceDataset, - path='opencompass/race', - name='high', - reader_cfg=race_reader_cfg, - infer_cfg=race_infer_cfg, - eval_cfg=race_eval_cfg, - ), -] \ No newline at end of file +with read_base(): + from .race_gen_69ee4f import race_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/race/race_llm_judge_gen.py b/opencompass/configs/datasets/race/race_llm_judge_gen.py deleted file mode 100644 index 6842d032..00000000 --- a/opencompass/configs/datasets/race/race_llm_judge_gen.py +++ /dev/null @@ -1,117 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import RaceDataset -from opencompass.utils.text_postprocessors import ( - first_option_postprocess, -) -from opencompass.evaluator import GenericLLMEvaluator -from opencompass.datasets import generic_llmjudge_postprocess - -QUERY_TEMPLATE = """ -Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering. - -Article: {article} - -Q: {question} - -A. {A} -B. {B} -C. {C} -D. {D} -""".strip() - -GRADER_TEMPLATE = """ - Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. - - Here are some evaluation criteria: - 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. - 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. - 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. - 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. - - Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: - A: CORRECT - B: INCORRECT - Just return the letters "A" or "B", with no text around it. - - Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. - - : {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n\n\n - : \n{answer}\n\n\n - : \n{prediction}\n\n\n - Judging the correctness of candidates' answers: -""".strip() - - -race_reader_cfg = dict( - input_columns=['article', 'question', 'A', 'B', 'C', 'D'], - output_column='answer', - train_split='validation', - test_split='test', -) - -race_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - round=[ - dict(role='HUMAN', prompt=QUERY_TEMPLATE), - ] - ), - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer), -) - -race_eval_cfg = dict( - evaluator=dict( - type=GenericLLMEvaluator, - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin=[ - dict( - role='SYSTEM', - fallback_role='HUMAN', - prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") - ], - round=[ - dict( - role='HUMAN', - prompt=GRADER_TEMPLATE - ), - ]), - ), - dataset_cfg=dict( - type=RaceDataset, - path='./data/gpqa/', - reader_cfg=race_reader_cfg, - ), - judge_cfg=dict(), - dict_postprocessor=dict(type=generic_llmjudge_postprocess), - ), - pred_role='BOT', -) - -race_datasets = [ - dict( - abbr='race-middle', - type=RaceDataset, - path='opencompass/race', - name='middle', - reader_cfg=race_reader_cfg, - infer_cfg=race_infer_cfg, - eval_cfg=race_eval_cfg, - ), - dict( - abbr='race-high', - type=RaceDataset, - path='opencompass/race', - name='high', - reader_cfg=race_reader_cfg, - infer_cfg=race_infer_cfg, - eval_cfg=race_eval_cfg, - ), -] \ No newline at end of file