Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Commit cc9761e882 (parent b9b69febc3): fix

@@ -1,739 +1,886 @@
- ifeval:
    name: IFEval
    category: Instruction Following
    paper: https://arxiv.org/pdf/2311.07911
    configpath: opencompass/configs/datasets/IFEval/IFEval
    configpath_llmjudge: ''
- nphard:
    name: NPHardEval
    category: Reasoning
    paper: https://arxiv.org/pdf/2312.14890v2
    configpath: opencompass/configs/datasets/NPHardEval
    configpath_llmjudge: ''
- pmmeval:
    name: PMMEval
    category: Language
    paper: https://arxiv.org/pdf/2411.09116v1
    configpath: opencompass/configs/datasets/PMMEval
    configpath_llmjudge: ''
- theoremqa:
    name: TheoremQA
    category: Reasoning
    paper: https://arxiv.org/pdf/2305.12524
    configpath: opencompass/configs/datasets/TheroremQA
    configpath_llmjudge: ''
- agieval:
    name: AGIEval
    category: Examination
    paper: https://arxiv.org/pdf/2304.06364
    configpath: opencompass/configs/datasets/agieval
    configpath_llmjudge: ''
- babilong:
    name: BABILong
    category: Long Context
    paper: https://arxiv.org/pdf/2406.10149
    configpath: opencompass/configs/datasets/babilong
    configpath_llmjudge: ''
- bigcodebench:
    name: BigCodeBench
    category: Code
    paper: https://arxiv.org/pdf/2406.15877
    configpath: opencompass/configs/datasets/bigcodebench
    configpath_llmjudge: ''
- calm:
    name: CaLM
    category: Reasoning
    paper: https://arxiv.org/pdf/2405.00622
    configpath: opencompass/configs/datasets/calm
    configpath_llmjudge: ''
- infinitebench:
    name: InfiniteBench (∞Bench)
    category: Long Context
    paper: https://aclanthology.org/2024.acl-long.814.pdf
    configpath: opencompass/configs/datasets/infinitebench
    configpath_llmjudge: ''
- korbench:
    name: KOR-Bench
    category: Reasoning
    paper: https://arxiv.org/pdf/2410.06526v1
    configpath: opencompass/configs/datasets/korbench
    configpath_llmjudge: ''
- lawbench:
    name: LawBench
    category: Knowledge / Law
    paper: https://arxiv.org/pdf/2309.16289
    configpath: opencompass/configs/datasets/lawbench
    configpath_llmjudge: ''
- leval:
    name: L-Eval
    category: Long Context
    paper: https://arxiv.org/pdf/2307.11088v1
    configpath: opencompass/configs/datasets/leval
    configpath_llmjudge: ''
- livecodebench:
    name: LiveCodeBench
    category: Code
    paper: https://arxiv.org/pdf/2403.07974
    configpath: opencompass/configs/datasets/livecodebench
    configpath_llmjudge: ''
- livemathbench:
    name: LiveMathBench
    category: Math
    paper: https://arxiv.org/pdf/2412.13147
    configpath: opencompass/configs/datasets/livemathbench
    configpath_llmjudge: ''
- longbench:
    name: LongBench
    category: Long Context
    paper: https://github.com/THUDM/LongBench
    configpath: opencompass/configs/datasets/longbench
    configpath_llmjudge: ''
- lveval:
    name: LV-Eval
    category: Long Context
    paper: https://arxiv.org/pdf/2402.05136
    configpath: opencompass/configs/datasets/lveval
    configpath_llmjudge: ''
- medbench:
    name: MedBench
    category: Knowledge / Medicine
    paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
    configpath: opencompass/configs/datasets/MedBench
    configpath_llmjudge: ''
- musr:
    name: MuSR
    category: Reasoning
    paper: https://arxiv.org/pdf/2310.16049
    configpath: opencompass/configs/datasets/musr
    configpath_llmjudge: ''
- needlebench:
    name: NeedleBench
    category: Long Context
    paper: https://arxiv.org/pdf/2407.11963
    configpath: opencompass/configs/datasets/needlebench
    configpath_llmjudge: ''
- ruler:
    name: RULER
    category: Long Context
    paper: https://arxiv.org/pdf/2404.06654
    configpath: opencompass/configs/datasets/ruler
    configpath_llmjudge: ''
- alignment:
    name: AlignBench
    category: Subjective / Alignment
    paper: https://arxiv.org/pdf/2311.18743
    configpath: opencompass/configs/datasets/subjective/alignbench
    configpath_llmjudge: ''
- alpaca:
    name: AlpacaEval
    category: Subjective / Instruction Following
    paper: https://github.com/tatsu-lab/alpaca_eval
    configpath: opencompass/configs/datasets/subjective/alpaca_eval
    configpath_llmjudge: ''
- arenahard:
    name: Arena-Hard
    category: Subjective / Chatbot
    paper: https://lmsys.org/blog/2024-04-19-arena-hard/
    configpath: opencompass/configs/datasets/subjective/arena_hard
    configpath_llmjudge: ''
- flames:
    name: FLAMES
    category: Subjective / Alignment
    paper: https://arxiv.org/pdf/2311.06899
    configpath: opencompass/configs/datasets/subjective/flames
    configpath_llmjudge: ''
- fofo:
    name: FOFO
    category: Subjective / Format Following
    paper: https://arxiv.org/pdf/2402.18667
    configpath: opencompass/configs/datasets/subjective/fofo
    configpath_llmjudge: ''
- followbench:
    name: FollowBench
    category: Subjective / Instruction Following
    paper: https://arxiv.org/pdf/2310.20410
    configpath: opencompass/configs/datasets/subjective/followbench
    configpath_llmjudge: ''
- hellobench:
    name: HelloBench
    category: Subjective / Long Context
    paper: https://arxiv.org/pdf/2409.16191
    configpath: opencompass/configs/datasets/subjective/hellobench
    configpath_llmjudge: ''
- judgerbench:
    name: JudgerBench
    category: Subjective / Long Context
    paper: https://arxiv.org/pdf/2410.16256
    configpath: opencompass/configs/datasets/subjective/judgerbench
    configpath_llmjudge: ''
- multiround:
    name: MT-Bench-101
    category: Subjective / Multi-Round
    paper: https://arxiv.org/pdf/2402.14762
    configpath: opencompass/configs/datasets/subjective/multiround
    configpath_llmjudge: ''
- wildbench:
    name: WildBench
    category: Subjective / Real Task
    paper: https://arxiv.org/pdf/2406.04770
    configpath: opencompass/configs/datasets/subjective/wildbench
    configpath_llmjudge: ''
- teval:
    name: T-Eval
    category: Tool Utilization
    paper: https://arxiv.org/pdf/2312.14033
    configpath: opencompass/configs/datasets/teval
    configpath_llmjudge: ''
- financeiq:
    name: FinanceIQ
    category: Knowledge / Finance
    paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ
    configpath: opencompass/configs/datasets/FinanceIQ
    configpath_llmjudge: ''
- gaokaobench:
    name: GAOKAOBench
    category: Examination
    paper: https://arxiv.org/pdf/2305.12474
    configpath: opencompass/configs/datasets/GaokaoBench
    configpath_llmjudge: ''
- lcbench:
    name: LCBench
    category: Code
    paper: https://github.com/open-compass/CodeBench/
    configpath: opencompass/configs/datasets/LCBench
    configpath_llmjudge: ''
- MMLUArabic:
    name: ArabicMMLU
    category: Language
    paper: https://arxiv.org/pdf/2402.12840
    configpath: opencompass/configs/datasets/MMLUArabic
    configpath_llmjudge: ''
- OpenFinData:
    name: OpenFinData
    category: Knowledge / Finance
    paper: https://github.com/open-compass/OpenFinData
    configpath: opencompass/configs/datasets/OpenFinData
    configpath_llmjudge: ''
- QuALITY:
    name: QuALITY
    category: Long Context
    paper: https://arxiv.org/pdf/2112.08608
    configpath: opencompass/configs/datasets/QuALITY
    configpath_llmjudge: ''
- advglue:
    name: Adversarial GLUE
    category: Safety
    paper: https://openreview.net/pdf?id=GF9cSKI3A_q
    configpath: opencompass/configs/datasets/adv_glue
    configpath_llmjudge: ''
- afqmcd:
    name: CLUE / AFQMC
    category: Language
    paper: https://arxiv.org/pdf/2004.05986
    configpath: opencompass/configs/datasets/CLUE_afqmc
    configpath_llmjudge: ''
- aime2024:
    name: AIME2024
    category: Examination
    paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024
    configpath: opencompass/configs/datasets/aime2024
    configpath_llmjudge: ''
- anli:
    name: Adversarial NLI
    category: Reasoning
    paper: https://arxiv.org/pdf/1910.14599v2
    configpath: opencompass/configs/datasets/anli
    configpath_llmjudge: ''
- anthropics_evals:
    name: Anthropics Evals
    category: Safety
    paper: https://arxiv.org/pdf/2212.09251
    configpath: opencompass/configs/datasets/anthropics_evals
    configpath_llmjudge: ''
- apps:
    name: APPS
    category: Code
    paper: https://arxiv.org/pdf/2105.09938
    configpath: opencompass/configs/datasets/apps
    configpath_llmjudge: ''
- arc:
    name: ARC
    category: Reasoning
    paper: https://arxiv.org/pdf/1803.05457
    configpath:
    - opencompass/configs/datasets/ARC_c
    - opencompass/configs/datasets/ARC_e
    configpath_llmjudge: ''
- arc_prize_public_eval:
    name: ARC Prize
    category: ARC-AGI
    paper: https://arcprize.org/guide#private
    configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation
    configpath_llmjudge: ''
- ax:
    name: SuperGLUE / AX
    category: Reasoning
    paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
    configpath:
    - opencompass/configs/datasets/SuperGLUE_AX_b
    - opencompass/configs/datasets/SuperGLUE_AX_g
    configpath_llmjudge: ''
- bbh:
    name: BIG-Bench Hard
    category: Reasoning
    paper: https://arxiv.org/pdf/2210.09261
    configpath: opencompass/configs/datasets/bbh
    configpath_llmjudge: ''
- BoolQ:
    name: SuperGLUE / BoolQ
    category: Knowledge
    paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
    configpath: opencompass/configs/datasets/SuperGLUE_BoolQ
    configpath_llmjudge: ''
- c3:
    name: CLUE / C3 (C³)
    category: Understanding
    paper: https://arxiv.org/pdf/2004.05986
    configpath: opencompass/configs/datasets/CLUE_C3
    configpath_llmjudge: ''
- cb:
    name: SuperGLUE / CB
    category: Reasoning
    paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
    configpath: opencompass/configs/datasets/SuperGLUE_CB
    configpath_llmjudge: ''
- ceval:
    name: C-EVAL
    category: Examination
    paper: https://arxiv.org/pdf/2305.08322v1
    configpath: opencompass/configs/datasets/ceval
    configpath_llmjudge: ''
- charm:
    name: CHARM
    category: Reasoning
    paper: https://arxiv.org/pdf/2403.14112
    configpath: opencompass/configs/datasets/CHARM
    configpath_llmjudge: ''
- chembench:
    name: ChemBench
    category: Knowledge / Chemistry
    paper: https://arxiv.org/pdf/2404.01475
    configpath: opencompass/configs/datasets/ChemBench
    configpath_llmjudge: ''
- chid:
    name: FewCLUE / CHID
    category: Language
    paper: https://arxiv.org/pdf/2107.07498
    configpath: opencompass/configs/datasets/FewCLUE_chid
    configpath_llmjudge: ''
- chinese_simpleqa:
    name: Chinese SimpleQA
    category: Knowledge
    paper: https://arxiv.org/pdf/2411.07140
    configpath: opencompass/configs/datasets/chinese_simpleqa
    configpath_llmjudge: ''
- cibench:
    name: CIBench
    category: Code
    paper: https://www.arxiv.org/pdf/2407.10499
    configpath: opencompass/configs/datasets/CIBench
    configpath_llmjudge: ''
- civilcomments:
    name: CivilComments
    category: Safety
    paper: https://arxiv.org/pdf/1903.04561
    configpath: opencompass/configs/datasets/civilcomments
    configpath_llmjudge: ''
- clozeTest_maxmin:
    name: Cloze Test-max/min
    category: Code
    paper: https://arxiv.org/pdf/2102.04664
    configpath: opencompass/configs/datasets/clozeTest_maxmin
    configpath_llmjudge: ''
- cluewsc:
    name: FewCLUE / CLUEWSC
    category: Language / WSC
    paper: https://arxiv.org/pdf/2107.07498
    configpath: opencompass/configs/datasets/FewCLUE_cluewsc
    configpath_llmjudge: ''
- cmb:
    name: CMB
    category: Knowledge / Medicine
    paper: https://arxiv.org/pdf/2308.08833
    configpath: opencompass/configs/datasets/cmb
    configpath_llmjudge: ''
- cmmlu:
    name: CMMLU
    category: Understanding
    paper: https://arxiv.org/pdf/2306.09212
    configpath: opencompass/configs/datasets/cmmlu
    configpath_llmjudge: ''
- cmnli:
    name: CLUE / CMNLI
    category: Reasoning
    paper: https://arxiv.org/pdf/2004.05986
    configpath: opencompass/configs/datasets/CLUE_cmnli
    configpath_llmjudge: ''
- cmo_fib:
    name: cmo_fib
    category: Examination
    paper: ''
    configpath: opencompass/configs/datasets/cmo_fib
    configpath_llmjudge: ''
- cmrc:
    name: CLUE / CMRC
    category: Understanding
    paper: https://arxiv.org/pdf/2004.05986
    configpath: opencompass/configs/datasets/CLUE_CMRC
    configpath_llmjudge: ''
- commonsenseqa:
    name: CommonSenseQA
    category: Knowledge
    paper: https://arxiv.org/pdf/1811.00937v2
    configpath: opencompass/configs/datasets/commonsenseqa
    configpath_llmjudge: ''
- commonsenseqa_cn:
    name: CommonSenseQA-CN
    category: Knowledge
    paper: ''
    configpath: opencompass/configs/datasets/commonsenseqa_cn
    configpath_llmjudge: ''
- copa:
    name: SuperGLUE / COPA
    category: Reasoning
    paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
    configpath: opencompass/configs/datasets/SuperGLUE_COPA
    configpath_llmjudge: ''
- crowspairs:
    name: CrowsPairs
    category: Safety
    paper: https://arxiv.org/pdf/2010.00133
    configpath: opencompass/configs/datasets/crowspairs
    configpath_llmjudge: ''
- crowspairs_cn:
    name: CrowsPairs-CN
    category: Safety
    paper: ''
    configpath: opencompass/configs/datasets/crowspairs_cn
    configpath_llmjudge: ''
- cvalues:
    name: CVALUES
    category: Safety
    paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf
    configpath: opencompass/configs/datasets/cvalues
    configpath_llmjudge: ''
- drcd:
    name: CLUE / DRCD
    category: Understanding
    paper: https://arxiv.org/pdf/2004.05986
    configpath: opencompass/configs/datasets/CLUE_DRCD
    configpath_llmjudge: ''
- drop:
    name: DROP (DROP Simple Eval)
    category: Understanding
    paper: https://arxiv.org/pdf/1903.00161
    configpath: opencompass/configs/datasets/drop
    configpath_llmjudge: ''
- ds1000:
    name: DS-1000
    category: Code
    paper: https://arxiv.org/pdf/2211.11501
    configpath: opencompass/configs/datasets/ds1000
    configpath_llmjudge: ''
- eprstmt:
    name: FewCLUE / EPRSTMT
    category: Understanding
    paper: https://arxiv.org/pdf/2107.07498
    configpath: opencompass/configs/datasets/FewCLUE_eprstmt
    configpath_llmjudge: ''
- flores:
    name: Flores
    category: Language
    paper: https://aclanthology.org/D19-1632.pdf
    configpath: opencompass/configs/datasets/flores
    configpath_llmjudge: ''
- game24:
    name: Game24
    category: Math
    paper: https://huggingface.co/datasets/nlile/24-game
    configpath: opencompass/configs/datasets/game24
    configpath_llmjudge: ''
- govrepcrs:
    name: Government Report Dataset
    category: Long Context
    paper: https://aclanthology.org/2021.naacl-main.112.pdf
    configpath: opencompass/configs/datasets/govrepcrs
    configpath_llmjudge: ''
- gpqa:
    name: GPQA
    category: Knowledge
    paper: https://arxiv.org/pdf/2311.12022v1
    configpath: opencompass/configs/datasets/gpqa
    configpath_llmjudge: ''
- gsm8k:
    name: GSM8K
    category: Math
    paper: https://arxiv.org/pdf/2110.14168v2
    configpath: opencompass/configs/datasets/gsm8k
    configpath_llmjudge: ''
- gsm_hard:
    name: GSM-Hard
    category: Math
    paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf
    configpath: opencompass/configs/datasets/gsm_hard
    configpath_llmjudge: ''
- hle:
    name: HLE (Humanity's Last Exam)
    category: Reasoning
    paper: https://lastexam.ai/paper
    configpath: opencompass/configs/datasets/HLE
    configpath_llmjudge: ''
- hellaswag:
    name: HellaSwag
    category: Reasoning
    paper: https://arxiv.org/pdf/1905.07830
    configpath: opencompass/configs/datasets/hellaswag
    configpath_llmjudge: ''
- humaneval:
    name: HumanEval
    category: Code
    paper: https://arxiv.org/pdf/2107.03374v2
    configpath: opencompass/configs/datasets/humaneval
    configpath_llmjudge: ''
- humaneval_cn:
    name: HumanEval-CN
    category: Code
    paper: ''
    configpath: opencompass/configs/datasets/humaneval_cn
    configpath_llmjudge: ''
- humaneval_multi:
    name: Multi-HumanEval
    category: Code
    paper: https://arxiv.org/pdf/2210.14868
    configpath: opencompass/configs/datasets/humaneval_multi
    configpath_llmjudge: ''
- humanevalx:
    name: HumanEval-X
    category: Code
    paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
    configpath: opencompass/configs/datasets/humanevalx
    configpath_llmjudge: ''
- hungarian_math:
    name: Hungarian_Math
    category: Math
    paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam
    configpath: opencompass/configs/datasets/hungarian_exam
    configpath_llmjudge: ''
- iwslt2017:
    name: IWSLT2017
    category: Language
    paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf
    configpath: opencompass/configs/datasets/iwslt2017
    configpath_llmjudge: ''
- jigsawmultilingual:
    name: JigsawMultilingual
    category: Safety
    paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data
    configpath: opencompass/configs/datasets/jigsawmultilingual
    configpath_llmjudge: ''
- lambada:
    name: LAMBADA
    category: Understanding
    paper: https://arxiv.org/pdf/1606.06031
    configpath: opencompass/configs/datasets/lambada
    configpath_llmjudge: ''
- lcsts:
    name: LCSTS
    category: Understanding
    paper: https://aclanthology.org/D15-1229.pdf
    configpath: opencompass/configs/datasets/lcsts
    configpath_llmjudge: ''
- livestembench:
    name: LiveStemBench
    category: ''
    paper: ''
    configpath: opencompass/configs/datasets/livestembench
    configpath_llmjudge: ''
- llm_compression:
    name: LLM Compression
    category: Bits Per Character (BPC)
    paper: https://arxiv.org/pdf/2404.09937
    configpath: opencompass/configs/datasets/llm_compression
    configpath_llmjudge: ''
- math:
    name: MATH
    category: Math
    paper: https://arxiv.org/pdf/2103.03874
    configpath: opencompass/configs/datasets/math
    configpath_llmjudge: ''
- math401:
    name: MATH 401
    category: Math
    paper: https://arxiv.org/pdf/2304.02015
    configpath: opencompass/configs/datasets/math401
    configpath_llmjudge: ''
- mathbench:
    name: MathBench
    category: Math
    paper: https://arxiv.org/pdf/2405.12209
    configpath: opencompass/configs/datasets/mathbench
    configpath_llmjudge: ''
- mbpp:
    name: MBPP
    category: Code
    paper: https://arxiv.org/pdf/2108.07732
    configpath: opencompass/configs/datasets/mbpp
    configpath_llmjudge: ''
- mbpp_cn:
    name: MBPP-CN
    category: Code
    paper: ''
    configpath: opencompass/configs/datasets/mbpp_cn
    configpath_llmjudge: ''
- mbpp_plus:
    name: MBPP-PLUS
    category: Code
    paper: ''
    configpath: opencompass/configs/datasets/mbpp_plus
    configpath_llmjudge: ''
- mgsm:
    name: MGSM
    category: Language / Math
    paper: https://arxiv.org/pdf/2210.03057
    configpath: opencompass/configs/datasets/mgsm
    configpath_llmjudge: ''
- mmlu:
    name: MMLU
    category: Understanding
    paper: https://arxiv.org/pdf/2009.03300
    configpath: opencompass/configs/datasets/mmlu
    configpath_llmjudge: ''
- mmlu_cf:
    name: MMLU-CF
    category: Understanding
    paper: https://arxiv.org/pdf/2412.15194
    configpath: opencompass/configs/datasets/mmlu_cf
    configpath_llmjudge: ''
- mmlu_pro:
    name: MMLU-Pro
    category: Understanding
    paper: https://arxiv.org/pdf/2406.01574
    configpath: opencompass/configs/datasets/mmlu_pro
    configpath_llmjudge: ''
- mmmlu:
    name: MMMLU
    category: Language / Understanding
    paper: https://huggingface.co/datasets/openai/MMMLU
    configpath: opencompass/configs/datasets/mmmlu
    configpath_llmjudge: ''
- multirc:
    name: SuperGLUE / MultiRC
    category: Understanding
    paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
    configpath: opencompass/configs/datasets/SuperGLUE_MultiRC
    configpath_llmjudge: ''
- narrativeqa:
    name: NarrativeQA
    category: Understanding
    paper: https://github.com/google-deepmind/narrativeqa
    configpath: opencompass/configs/datasets/narrativeqa
    configpath_llmjudge: ''
- natural_question:
    name: NaturalQuestions
    category: Knowledge
    paper: https://github.com/google-research-datasets/natural-questions
    configpath: opencompass/configs/datasets/nq
    configpath_llmjudge: ''
- natural_question_cn:
    name: NaturalQuestions-CN
    category: Knowledge
    paper: ''
    configpath: opencompass/configs/datasets/nq_cn
    configpath_llmjudge: ''
- obqa:
    name: OpenBookQA
    category: Knowledge
    paper: https://arxiv.org/pdf/1809.02789v1
    configpath: opencompass/configs/datasets/obqa
    configpath_llmjudge: ''
- piqa:
    name: PIQA
    category: Knowledge / Physics
    paper: https://arxiv.org/pdf/1911.11641v1
    configpath: opencompass/configs/datasets/piqa
    configpath_llmjudge: ''
- py150:
    name: py150
    category: Code
    paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line
    configpath: opencompass/configs/datasets/py150
    configpath_llmjudge: ''
- qasper:
    name: Qasper
    category: Long Context
    paper: https://arxiv.org/pdf/2105.03011
    configpath: opencompass/configs/datasets/qasper
    configpath_llmjudge: ''
- qaspercut:
    name: Qasper-Cut
    category: Long Context
    paper: ''
    configpath: opencompass/configs/datasets/qaspercut
    configpath_llmjudge: ''
- race:
    name: RACE
    category: Examination
    paper: https://arxiv.org/pdf/1704.04683
    configpath: opencompass/configs/datasets/race
    configpath_llmjudge: ''
- realtoxicprompts:
    name: RealToxicPrompts
    category: Safety
    paper: https://arxiv.org/pdf/2009.11462
    configpath: opencompass/configs/datasets/realtoxicprompts
    configpath_llmjudge: ''
- record:
    name: SuperGLUE / ReCoRD
    category: Understanding
    paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
    configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD
    configpath_llmjudge: ''
- rte:
    name: SuperGLUE / RTE
    category: Reasoning
    paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
    configpath: opencompass/configs/datasets/SuperGLUE_RTE
    configpath_llmjudge: ''
- ocnli:
    name: CLUE / OCNLI
    category: Reasoning
    paper: https://arxiv.org/pdf/2004.05986
    configpath: opencompass/configs/datasets/CLUE_ocnli
    configpath_llmjudge: ''
- rolebench:
    name: RoleBench
    category: Role Play
    paper: https://arxiv.org/pdf/2310.00746
    configpath: opencompass/configs/datasets/rolebench
    configpath_llmjudge: ''
- s3eval:
    name: S3Eval
    category: Long Context
    paper: https://aclanthology.org/2024.naacl-long.69.pdf
    configpath: opencompass/configs/datasets/s3eval
    configpath_llmjudge: ''
- scibench:
    name: SciBench
    category: Reasoning
    paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf
    configpath: opencompass/configs/datasets/scibench
    configpath_llmjudge: ''
- scicode:
    name: SciCode
    category: Code
    paper: https://arxiv.org/pdf/2407.13168
    configpath: opencompass/configs/datasets/scicode
    configpath_llmjudge: ''
- simpleqa:
    name: SimpleQA
    category: Knowledge
    paper: https://arxiv.org/pdf/2411.04368
    configpath: opencompass/configs/datasets/SimpleQA
    configpath_llmjudge: ''
- siqa:
    name: SocialIQA
    category: Reasoning
    paper: https://arxiv.org/pdf/1904.09728
    configpath: opencompass/configs/datasets/siqa
    configpath_llmjudge: ''
- squad20:
    name: SQuAD2.0
    category: Understanding
    paper: https://arxiv.org/pdf/1806.03822
    configpath: opencompass/configs/datasets/squad20
    configpath_llmjudge: ''
- storycloze:
    name: StoryCloze
    category: Reasoning
    paper: https://aclanthology.org/2022.emnlp-main.616.pdf
    configpath: opencompass/configs/datasets/storycloze
    configpath_llmjudge: ''
- strategyqa:
    name: StrategyQA
    category: Reasoning
    paper: https://arxiv.org/pdf/2101.02235
    configpath: opencompass/configs/datasets/strategyqa
    configpath_llmjudge: ''
- summedits:
    name: SummEdits
    category: Language
    paper: https://aclanthology.org/2023.emnlp-main.600.pdf
    configpath: opencompass/configs/datasets/summedits
    configpath_llmjudge: ''
- summscreen:
    name: SummScreen
    category: Understanding
    paper: https://arxiv.org/pdf/2104.07091v1
    configpath: opencompass/configs/datasets/summscreen
    configpath_llmjudge: ''
- svamp:
    name: SVAMP
    category: Math
    paper: https://aclanthology.org/2021.naacl-main.168.pdf
    configpath: opencompass/configs/datasets/SVAMP
    configpath_llmjudge: ''
- tabmwp:
    name: TabMWP
    category: Math / Table
    paper: https://arxiv.org/pdf/2209.14610
    configpath: opencompass/configs/datasets/TabMWP
    configpath_llmjudge: ''
- taco:
    name: TACO
    category: Code
    paper: https://arxiv.org/pdf/2312.14852
    configpath: opencompass/configs/datasets/taco
    configpath_llmjudge: ''
- tnews:
    name: FewCLUE / TNEWS
    category: Understanding
    paper: https://arxiv.org/pdf/2107.07498
    configpath: opencompass/configs/datasets/FewCLUE_tnews
    configpath_llmjudge: ''
- bustm:
    name: FewCLUE / BUSTM
    category: Reasoning
    paper: https://arxiv.org/pdf/2107.07498
    configpath: opencompass/configs/datasets/FewCLUE_bustm
    configpath_llmjudge: ''
- csl:
    name: FewCLUE / CSL
    category: Understanding
    paper: https://arxiv.org/pdf/2107.07498
    configpath: opencompass/configs/datasets/FewCLUE_csl
    configpath_llmjudge: ''
- ocnli_fc:
    name: FewCLUE / OCNLI-FC
    category: Reasoning
    paper: https://arxiv.org/pdf/2107.07498
    configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc
    configpath_llmjudge: ''
- triviaqa:
    name: TriviaQA
    category: Knowledge
    paper: https://arxiv.org/pdf/1705.03551v2
    configpath: opencompass/configs/datasets/triviaqa
    configpath_llmjudge: ''
- triviaqarc:
    name: TriviaQA-RC
    category: Knowledge / Understanding
    paper: ''
    configpath: opencompass/configs/datasets/triviaqarc
    configpath_llmjudge: ''
- truthfulqa:
    name: TruthfulQA
    category: Safety
    paper: https://arxiv.org/pdf/2109.07958v2
    configpath: opencompass/configs/datasets/truthfulqa
    configpath_llmjudge: ''
- tydiqa:
    name: TyDi-QA
    category: Language
    paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf
    configpath: opencompass/configs/datasets/tydiqa
    configpath_llmjudge: ''
- wic:
    name: SuperGLUE / WiC
    category: Language
    paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
    configpath: opencompass/configs/datasets/SuperGLUE_WiC
    configpath_llmjudge: ''
- wsc:
    name: SuperGLUE / WSC
    category: Language / WSC
    paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
    configpath: opencompass/configs/datasets/SuperGLUE_WSC
    configpath_llmjudge: ''
- winogrande:
    name: WinoGrande
    category: Language / WSC
    paper: https://arxiv.org/pdf/1907.10641v2
    configpath: opencompass/configs/datasets/winogrande
    configpath_llmjudge: ''
- xcopa:
    name: XCOPA
    category: Language
    paper: https://arxiv.org/pdf/2005.00333
    configpath: opencompass/configs/datasets/XCOPA
    configpath_llmjudge: ''
- xiezhi:
    name: Xiezhi
    category: Knowledge
    paper: https://arxiv.org/pdf/2306.05783
    configpath: opencompass/configs/datasets/xiezhi
    configpath_llmjudge: ''
- xlsum:
    name: XLSum
    category: Understanding
    paper: https://arxiv.org/pdf/2106.13822v1
    configpath: opencompass/configs/datasets/XLSum
    configpath_llmjudge: ''
- xsum:
    name: XSum
    category: Understanding
    paper: https://arxiv.org/pdf/1808.08745
    configpath: opencompass/configs/datasets/Xsum
    configpath_llmjudge: ''
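Every entry above now carries a configpath_llmjudge field alongside configpath. A minimal sketch of loading the index back with PyYAML and flagging entries that still lack the new field; this assumes dataset-index.yml sits in the working directory and is illustrative only, not part of the repo's tooling:

import yaml

# Minimal sketch: the index is a list of single-key mappings, e.g.
# {'ifeval': {'name': ..., 'configpath_llmjudge': ...}}.
with open('dataset-index.yml', 'r') as f:
    data_list = yaml.load(f, Loader=yaml.FullLoader)

REQUIRED = ('name', 'category', 'paper', 'configpath', 'configpath_llmjudge')

for item in data_list:
    for key, meta in item.items():
        missing = [field for field in REQUIRED if field not in meta]
        if missing:
            print(f'{key}: missing fields {missing}')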
@@ -24,7 +24,7 @@ load_path = str(OC_ROOT / 'dataset-index.yml')

with open(load_path, 'r') as f2:
    data_list = yaml.load(f2, Loader=yaml.FullLoader)

HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge']


def table_format(data_list):

@@ -35,7 +35,7 @@ def table_format(data_list):

            for index in HEADER:
                if index == 'paper':
                    table_format_list_sub.append('[链接](' + i[j][index] + ')')
                elif index != 'name' and index != 'category':
                    if isinstance(i[j][index], list):
                        sub_list_text = ''
                        for k in i[j][index]:

@@ -60,7 +60,7 @@ def generate_table(data_list, title=None):

    if title is not None:
        f.write(f'\n{title}')
    f.write("""\n```{table}\n:class: dataset\n""")
    header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置(基于规则评估)', '推荐配置(基于LLM评估)']
    table_cfg = dict(tablefmt='pipe',
                     floatfmt='.2f',
                     numalign='right',
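The widened elif routes both configpath and the new configpath_llmjudge through the same string-or-list rendering. A sketch of that branch in isolation; the function name and the '<br>' separator are illustrative assumptions, since the repo's loop builds the cell inline:

def render_config_cell(value):
    """Render one config-path cell the way the widened elif treats it.

    A list of paths becomes several links, a non-empty string becomes a
    single link, and an empty value ('') yields an empty cell.
    """
    if isinstance(value, list):
        return '<br>'.join('[链接](' + path + ')' for path in value)
    if value:
        return '[链接](' + value + ')'
    return ''

# e.g. render_config_cell(['opencompass/configs/datasets/ARC_c',
#                          'opencompass/configs/datasets/ARC_e'])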
@@ -1,68 +1,4 @@

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import RaceDataset
from opencompass.utils.text_postprocessors import (
    first_option_postprocess,
)
from mmengine.config import read_base

QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

Article: {article}

Q: {question}

A. {A}
B. {B}
C. {C}
D. {D}
""".strip()

race_reader_cfg = dict(
    input_columns=['article', 'question', 'A', 'B', 'C', 'D'],
    output_column='answer',
    train_split='validation',
    test_split='test',
)

race_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

race_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'),
    pred_role='BOT',
)

race_datasets = [
    dict(
        abbr='race-middle',
        type=RaceDataset,
        path='opencompass/race',
        name='middle',
        reader_cfg=race_reader_cfg,
        infer_cfg=race_infer_cfg,
        eval_cfg=race_eval_cfg,
    ),
    dict(
        abbr='race-high',
        type=RaceDataset,
        path='opencompass/race',
        name='high',
        reader_cfg=race_reader_cfg,
        infer_cfg=race_infer_cfg,
        eval_cfg=race_eval_cfg,
    ),
]

with read_base():
    from .race_gen_69ee4f import race_datasets  # noqa: F401, F403
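With the file reduced to a read_base alias, downstream eval configs keep importing race_datasets unchanged. A sketch of a top-level config under that assumption; the model config module named here is illustrative, not part of this commit:

from mmengine.config import read_base

with read_base():
    # The alias file re-exports race_datasets from race_gen_69ee4f.
    from opencompass.configs.datasets.race.race_gen import race_datasets
    # Illustrative model import; substitute any model config you use.
    from opencompass.configs.models.hf_internlm.hf_internlm2_5_7b_chat import \
        models as hf_internlm2_5_7b_chat_models

datasets = race_datasets
models = hf_internlm2_5_7b_chat_models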
@@ -1,117 +0,0 @@

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import RaceDataset
from opencompass.utils.text_postprocessors import (
    first_option_postprocess,
)
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess

QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

Article: {article}

Q: {question}

A. {A}
B. {B}
C. {C}
D. {D}
""".strip()

GRADER_TEMPLATE = """
Please act as a grading expert and judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may differ from the standard answer in form of expression, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct; be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is correct. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate must answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways; for example, one answer may be a mathematical expression and another a textual description, as long as the meaning expressed is the same. Some formulas are expressed in different ways but are equivalent and correct.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.

<Original Question Begin>: {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()


race_reader_cfg = dict(
    input_columns=['article', 'question', 'A', 'B', 'C', 'D'],
    output_column='answer',
    train_split='validation',
    test_split='test',
)

race_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=QUERY_TEMPLATE),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

race_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                ],
                round=[
                    dict(
                        role='HUMAN',
                        prompt=GRADER_TEMPLATE
                    ),
                ]),
        ),
        dataset_cfg=dict(
            type=RaceDataset,
            path='opencompass/race',
            reader_cfg=race_reader_cfg,
        ),
        judge_cfg=dict(),
        dict_postprocessor=dict(type=generic_llmjudge_postprocess),
    ),
    pred_role='BOT',
)

race_datasets = [
    dict(
        abbr='race-middle',
        type=RaceDataset,
        path='opencompass/race',
        name='middle',
        reader_cfg=race_reader_cfg,
        infer_cfg=race_infer_cfg,
        eval_cfg=race_eval_cfg,
    ),
    dict(
        abbr='race-high',
        type=RaceDataset,
        path='opencompass/race',
        name='high',
        reader_cfg=race_reader_cfg,
        infer_cfg=race_infer_cfg,
        eval_cfg=race_eval_cfg,
    ),
]