From 9c2e6a192c336cbe32375e1bb55a531ac1f744aa Mon Sep 17 00:00:00 2001 From: Pablo Hinojosa Date: Fri, 7 Feb 2025 08:41:08 +0100 Subject: [PATCH 1/9] [Fix] Update broken links in README.md (#1852) --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 559a0608..887fcb4c 100644 --- a/README.md +++ b/README.md @@ -58,9 +58,9 @@ Just like a compass guides us on our journey, OpenCompass will guide you through ## 🚀 What's New - **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks. -- **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](configs/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it. -- **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](configs/eval_musr.py) and give it a spin! 🔥🔥🔥 -- **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](configs/eval_babilong.py) and give it a try! 🔥🔥🔥 +- **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](examples/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it. +- **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](examples/eval_musr.py) and give it a spin! 🔥🔥🔥 +- **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](examples/eval_babilong.py) and give it a try! 🔥🔥🔥 - **\[2024.10.14\]** We now support the OpenAI multilingual QA dataset [MMMLU](https://huggingface.co/datasets/openai/MMMLU). Feel free to give it a try! 🔥🔥🔥 - **\[2024.09.19\]** We now support [Qwen2.5](https://huggingface.co/Qwen)(0.5B to 72B) with multiple backend(huggingface/vllm/lmdeploy). Feel free to give them a try! 🔥🔥🔥 - **\[2024.09.17\]** We now support OpenAI o1(`o1-mini-2024-09-12` and `o1-preview-2024-09-12`). Feel free to give them a try! 🔥🔥🔥 From 3fd8b4e0cdc76eef0f3deae7a24aaa03d299ffb4 Mon Sep 17 00:00:00 2001 From: Dongsheng Zhu <59612926+Zhudongsheng75@users.noreply.github.com> Date: Sat, 8 Feb 2025 15:15:47 +0800 Subject: [PATCH 2/9] [Update] Update BigCodeBench & LCBench load path (#1857) * BigCodeBench update * update LCBench * update LCBench 2 * update code --- opencompass/datasets/LCBench.py | 7 +++++-- opencompass/datasets/bigcodebench/bigcodebench.py | 6 +++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/opencompass/datasets/LCBench.py b/opencompass/datasets/LCBench.py index cd747788..9af4cadc 100644 --- a/opencompass/datasets/LCBench.py +++ b/opencompass/datasets/LCBench.py @@ -22,7 +22,10 @@ from .base import BaseDataset class LCDataset(BaseDataset): @staticmethod - def load(path: str, num_repeats: int = 1, difficulty='ALL'): + def load(path: str, + num_repeats: int = 1, + difficulty='ALL', + local_mode=False): """Load LC dataset for pass k mode. 
Note that you can use num_repeats > 1 when your model does not support @@ -38,7 +41,7 @@ class LCDataset(BaseDataset): num_repeats(int): Number of repetition for this dataset to get multiple responses in special cases. """ - path = get_data_path(path, local_mode=True) + path = get_data_path(path, local_mode=local_mode) def processing_test(example): example['test_case'] = example['test_list'] diff --git a/opencompass/datasets/bigcodebench/bigcodebench.py b/opencompass/datasets/bigcodebench/bigcodebench.py index 8d8e06f3..f1109b1d 100644 --- a/opencompass/datasets/bigcodebench/bigcodebench.py +++ b/opencompass/datasets/bigcodebench/bigcodebench.py @@ -73,6 +73,8 @@ class BigCodeBenchEvaluator(BaseEvaluator): eval_type='instruct', remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', # noqa dataset_version: str = 'full', + local_mode: bool = False, + path: str = 'opencompass/bigcodebench', pass_k: str = '1,5,10', parallel: int = -1, min_time_limit: float = 1, @@ -84,7 +86,9 @@ class BigCodeBenchEvaluator(BaseEvaluator): super().__init__() self.dataset = BigCodeBenchDataset.load( release_version=release_version, - dataset_version=dataset_version)['test'] + dataset_version=dataset_version, + local_mode=local_mode, + path=path)['test'] self.eval_type = eval_type self.remote_execute_api = remote_execute_api From 68a9838907ac2f20e59aa50cf771c43c960bc6c7 Mon Sep 17 00:00:00 2001 From: Myhs_phz Date: Fri, 14 Feb 2025 16:17:30 +0800 Subject: [PATCH 3/9] [Feature] Add list of supported datasets at html page (#1850) * feat dataset-index.yml and stat.py * fix * fix * fix * feat url of paper and config file * doc all supported dataset list * docs zh and en * docs README zh and en * docs new_dataset * docs new_dataset --- README.md | 258 +------- README_zh-CN.md | 258 +------- dataset-index.yml | 734 ++++++++++++++++++++++ docs/en/_static/js/custom.js | 16 +- docs/en/advanced_guides/new_dataset.md | 12 + docs/en/conf.py | 8 + docs/en/index.rst | 7 + docs/en/statis.py | 76 +++ docs/zh_cn/_static/js/custom.js | 16 +- docs/zh_cn/advanced_guides/new_dataset.md | 12 + docs/zh_cn/conf.py | 1 + docs/zh_cn/index.rst | 7 + docs/zh_cn/statis.py | 75 +++ 13 files changed, 965 insertions(+), 515 deletions(-) create mode 100644 dataset-index.yml create mode 100755 docs/en/statis.py create mode 100755 docs/zh_cn/statis.py diff --git a/README.md b/README.md index 887fcb4c..736968eb 100644 --- a/README.md +++ b/README.md @@ -279,263 +279,13 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide ## 📖 Dataset Support - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- Language - - Knowledge - - Reasoning - - Examination -
-
-Word Definition +The documentation on the OpenCompass website now provides a statistical list of all the datasets that can be used on this platform. -- WiC -- SummEdits +You can quickly find the dataset you need from the list through its sorting, filtering, and searching functions. -
+Please refer to the dataset statistics chapter of [official document](https://opencompass.org.cn/doc) for details. -
-Idiom Learning - -- CHID - -
- -
-Semantic Similarity - -- AFQMC -- BUSTM - -
- -
-Coreference Resolution - -- CLUEWSC -- WSC -- WinoGrande - -
- -
-Translation - -- Flores -- IWSLT2017 - -
- -
-Multi-language Question Answering - -- TyDi-QA -- XCOPA - -
- -
-Multi-language Summary - -- XLSum - -
-
-
-Knowledge Question Answering - -- BoolQ -- CommonSenseQA -- NaturalQuestions -- TriviaQA - -
-
-
-Textual Entailment - -- CMNLI -- OCNLI -- OCNLI_FC -- AX-b -- AX-g -- CB -- RTE -- ANLI - -
- -
-Commonsense Reasoning - -- StoryCloze -- COPA -- ReCoRD -- HellaSwag -- PIQA -- SIQA - -
- -
-Mathematical Reasoning - -- MATH -- GSM8K - -
- -
-Theorem Application - -- TheoremQA -- StrategyQA -- SciBench - -
- -
-Comprehensive Reasoning - -- BBH - -
-
-
-Junior High, High School, University, Professional Examinations - -- C-Eval -- AGIEval -- MMLU -- GAOKAO-Bench -- CMMLU -- ARC -- Xiezhi - -
- -
-Medical Examinations - -- CMB - -
-
- Understanding - - Long Context - - Safety - - Code -
-
-Reading Comprehension - -- C3 -- CMRC -- DRCD -- MultiRC -- RACE -- DROP -- OpenBookQA -- SQuAD2.0 - -
- -
-Content Summary - -- CSL -- LCSTS -- XSum -- SummScreen - -
- -
-Content Analysis - -- EPRSTMT -- LAMBADA -- TNEWS - -
-
-
-Long Context Understanding - -- LEval -- LongBench -- GovReports -- NarrativeQA -- Qasper - -
-
-
-Safety - -- CivilComments -- CrowsPairs -- CValues -- JigsawMultilingual -- TruthfulQA - -
-
-Robustness - -- AdvGLUE - -
-
-
-Code - -- HumanEval -- HumanEvalX -- MBPP -- APPs -- DS1000 - -
-
+

🔝Back to top

## 📖 Model Support diff --git a/README_zh-CN.md b/README_zh-CN.md index 5c889956..8d8ecd02 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -274,263 +274,11 @@ OpenCompass 是面向大模型评测的一站式平台。其主要特点如下 ## 📖 数据集支持 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- 语言 - - 知识 - - 推理 - - 考试 -
-
-字词释义 +我们已经在OpenCompass官网的文档中支持了所有可在本平台上使用的数据集的统计列表。 -- WiC -- SummEdits +您可以通过排序、筛选和搜索等功能从列表中快速找到您需要的数据集。 -
- -
-成语习语 - -- CHID - -
- -
-语义相似度 - -- AFQMC -- BUSTM - -
- -
-指代消解 - -- CLUEWSC -- WSC -- WinoGrande - -
- -
-翻译 - -- Flores -- IWSLT2017 - -
- -
-多语种问答 - -- TyDi-QA -- XCOPA - -
- -
-多语种总结 - -- XLSum - -
-
-
-知识问答 - -- BoolQ -- CommonSenseQA -- NaturalQuestions -- TriviaQA - -
-
-
-文本蕴含 - -- CMNLI -- OCNLI -- OCNLI_FC -- AX-b -- AX-g -- CB -- RTE -- ANLI - -
- -
-常识推理 - -- StoryCloze -- COPA -- ReCoRD -- HellaSwag -- PIQA -- SIQA - -
- -
-数学推理 - -- MATH -- GSM8K - -
- -
-定理应用 - -- TheoremQA -- StrategyQA -- SciBench - -
- -
-综合推理 - -- BBH - -
-
-
-初中/高中/大学/职业考试 - -- C-Eval -- AGIEval -- MMLU -- GAOKAO-Bench -- CMMLU -- ARC -- Xiezhi - -
- -
-医学考试 - -- CMB - -
-
- 理解 - - 长文本 - - 安全 - - 代码 -
-
-阅读理解 - -- C3 -- CMRC -- DRCD -- MultiRC -- RACE -- DROP -- OpenBookQA -- SQuAD2.0 - -
- -
-内容总结 - -- CSL -- LCSTS -- XSum -- SummScreen - -
- -
-内容分析 - -- EPRSTMT -- LAMBADA -- TNEWS - -
-
-
-长文本理解 - -- LEval -- LongBench -- GovReports -- NarrativeQA -- Qasper - -
-
-
-安全 - -- CivilComments -- CrowsPairs -- CValues -- JigsawMultilingual -- TruthfulQA - -
-
-健壮性 - -- AdvGLUE - -
-
-
-代码 - -- HumanEval -- HumanEvalX -- MBPP -- APPs -- DS1000 - -
-
+详情请参阅 [官方文档](https://opencompass.org.cn/doc) 的数据集统计章节。

🔝返回顶部

diff --git a/dataset-index.yml b/dataset-index.yml new file mode 100644 index 00000000..9fbde8bd --- /dev/null +++ b/dataset-index.yml @@ -0,0 +1,734 @@ + +- ifeval: + name: IFEval + category: Instruction Following + paper: https://arxiv.org/pdf/2311.07911 + configpath: opencompass/configs/datasets/IFEval +- nphard: + name: NPHardEval + category: Reasoning + paper: https://arxiv.org/pdf/2312.14890v2 + configpath: opencompass/configs/datasets/NPHardEval +- pmmeval: + name: PMMEval + category: Language + paper: https://arxiv.org/pdf/2411.09116v1 + configpath: opencompass/configs/datasets/PMMEval +- theoremqa: + name: TheoremQA + category: Reasoning + paper: https://arxiv.org/pdf/2305.12524 + configpath: opencompass/configs/datasets/TheoremQA +- agieval: + name: AGIEval + category: Examination + paper: https://arxiv.org/pdf/2304.06364 + configpath: opencompass/configs/datasets/agieval +- babilong: + name: BABILong + category: Long Context + paper: https://arxiv.org/pdf/2406.10149 + configpath: opencompass/configs/datasets/babilong +- bigcodebench: + name: BigCodeBench + category: Code + paper: https://arxiv.org/pdf/2406.15877 + configpath: opencompass/configs/datasets/bigcodebench +- calm: + name: CaLM + category: Reasoning + paper: https://arxiv.org/pdf/2405.00622 + configpath: opencompass/configs/datasets/calm +- infinitebench: + name: InfiniteBench (∞Bench) + category: Long Context + paper: https://aclanthology.org/2024.acl-long.814.pdf + configpath: opencompass/configs/datasets/infinitebench +- korbench: + name: KOR-Bench + category: Reasoning + paper: https://arxiv.org/pdf/2410.06526v1 + configpath: opencompass/configs/datasets/korbench +- lawbench: + name: LawBench + category: Knowledge / Law + paper: https://arxiv.org/pdf/2309.16289 + configpath: opencompass/configs/datasets/lawbench +- leval: + name: L-Eval + category: Long Context + paper: https://arxiv.org/pdf/2307.11088v1 + configpath: opencompass/configs/datasets/leval +- livecodebench: + name: LiveCodeBench + category: Code + paper: https://arxiv.org/pdf/2403.07974 + configpath: opencompass/configs/datasets/livecodebench +- livemathbench: + name: LiveMathBench + category: Math + paper: https://arxiv.org/pdf/2412.13147 + configpath: opencompass/configs/datasets/livemathbench +- longbench: + name: LongBench + category: Long Context + paper: https://github.com/THUDM/LongBench + configpath: opencompass/configs/datasets/longbench +- lveval: + name: LV-Eval + category: Long Context + paper: https://arxiv.org/pdf/2402.05136 + configpath: opencompass/configs/datasets/lveval +- medbench: + name: MedBench + category: Knowledge / Medicine + paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138 + configpath: opencompass/configs/datasets/MedBench +- musr: + name: MuSR + category: Reasoning + paper: https://arxiv.org/pdf/2310.16049 + configpath: opencompass/configs/datasets/musr +- needlebench: + name: NeedleBench + category: Long Context + paper: https://arxiv.org/pdf/2407.11963 + configpath: opencompass/configs/datasets/needlebench +- ruler: + name: RULER + category: Long Context + paper: https://arxiv.org/pdf/2404.06654 + configpath: opencompass/configs/datasets/ruler +- alignment: + name: AlignBench + category: Subjective / Alignment + paper: https://arxiv.org/pdf/2311.18743 + configpath: opencompass/configs/datasets/subjective/alignbench +- alpaca: + name: AlpacaEval + category: Subjective / Instruction Following + paper: https://github.com/tatsu-lab/alpaca_eval + configpath: 
opencompass/configs/datasets/subjective/alpaca_eval +- arenahard: + name: Arena-Hard + category: Subjective / Chatbot + paper: https://lmsys.org/blog/2024-04-19-arena-hard/ + configpath: opencompass/configs/datasets/subjective/arena_hard +- flames: + name: FLAMES + category: Subjective / Alignment + paper: https://arxiv.org/pdf/2311.06899 + configpath: opencompass/configs/datasets/subjective/flames +- fofo: + name: FOFO + category: Subjective / Format Following + paper: https://arxiv.org/pdf/2402.18667 + configpath: opencompass/configs/datasets/subjective/fofo +- followbench: + name: FollowBench + category: Subjective / Instruction Following + paper: https://arxiv.org/pdf/2310.20410 + configpath: opencompass/configs/datasets/subjective/followbench +- hellobench: + name: HelloBench + category: Subjective / Long Context + paper: https://arxiv.org/pdf/2409.16191 + configpath: opencompass/configs/datasets/subjective/hellobench +- judgerbench: + name: JudgerBench + category: Subjective / Long Context + paper: https://arxiv.org/pdf/2410.16256 + configpath: opencompass/configs/datasets/subjective/judgerbench +- multiround: + name: MT-Bench-101 + category: Subjective / Multi-Round + paper: https://arxiv.org/pdf/2402.14762 + configpath: opencompass/configs/datasets/subjective/multiround +- wildbench: + name: WildBench + category: Subjective / Real Task + paper: https://arxiv.org/pdf/2406.04770 + configpath: opencompass/configs/datasets/subjective/wildbench +- teval: + name: T-Eval + category: Tool Utilization + paper: https://arxiv.org/pdf/2312.14033 + configpath: opencompass/configs/datasets/teval +- financeiq: + name: FinanceIQ + category: Knowledge / Finance + paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ + configpath: opencompass/configs/datasets/FinanceIQ +- gaokaobench: + name: GAOKAOBench + category: Examination + paper: https://arxiv.org/pdf/2305.12474 + configpath: opencompass/configs/datasets/GaokaoBench +- lcbench: + name: LCBench + category: Code + paper: https://github.com/open-compass/CodeBench/ + configpath: opencompass/configs/datasets/LCBench +- MMLUArabic: + name: ArabicMMLU + category: Language + paper: https://arxiv.org/pdf/2402.12840 + configpath: opencompass/configs/datasets/MMLUArabic +- OpenFinData: + name: OpenFinData + category: Knowledge / Finance + paper: https://github.com/open-compass/OpenFinData + configpath: opencompass/configs/datasets/OpenFinData +- QuALITY: + name: QuALITY + category: Long Context + paper: https://arxiv.org/pdf/2112.08608 + configpath: opencompass/configs/datasets/QuALITY +- advglue: + name: Adversarial GLUE + category: Safety + paper: https://openreview.net/pdf?id=GF9cSKI3A_q + configpath: opencompass/configs/datasets/adv_glue +- afqmcd: + name: CLUE / AFQMC + category: Language + paper: https://arxiv.org/pdf/2004.05986 + configpath: opencompass/configs/datasets/CLUE_afqmc +- aime2024: + name: AIME2024 + category: Examination + paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024 + configpath: opencompass/configs/datasets/aime2024 +- anli: + name: Adversarial NLI + category: Reasoning + paper: https://arxiv.org/pdf/1910.14599v2 + configpath: opencompass/configs/datasets/anli +- anthropics_evals: + name: Anthropics Evals + category: Safety + paper: https://arxiv.org/pdf/2212.09251 + configpath: opencompass/configs/datasets/anthropics_evals +- apps: + name: APPS + category: Code + paper: https://arxiv.org/pdf/2105.09938 + configpath: opencompass/configs/datasets/apps +- arc: + name: ARC + category: Reasoning + paper: 
https://arxiv.org/pdf/1803.05457 + configpath: [opencompass/configs/datasets/ARC_c, opencompass/configs/datasets/ARC_e] +- arc_prize_public_eval: + name: ARC Prize + category: ARC-AGI + paper: https://arcprize.org/guide#private + configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation +- ax: + name: SuperGLUE / AX + category: Reasoning + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: [opencompass/configs/datasets/SuperGLUE_AX_b, opencompass/configs/datasets/SuperGLUE_AX_g] +- bbh: + name: BIG-Bench Hard + category: Reasoning + paper: https://arxiv.org/pdf/2210.09261 + configpath: opencompass/configs/datasets/bbh +- BoolQ: + name: SuperGLUE / BoolQ + category: Knowledge + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_BoolQ +- c3: + name: CLUE / C3 (C³) + category: Understanding + paper: https://arxiv.org/pdf/2004.05986 + configpath: opencompass/configs/datasets/CLUE_C3 +- cb: + name: SuperGLUE / CB + category: Reasoning + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_CB +- ceval: + name: C-EVAL + category: Examination + paper: https://arxiv.org/pdf/2305.08322v1 + configpath: opencompass/configs/datasets/ceval +- charm: + name: CHARM + category: Reasoning + paper: https://arxiv.org/pdf/2403.14112 + configpath: opencompass/configs/datasets/CHARM +- chembench: + name: ChemBench + category: Knowledge / Chemistry + paper: https://arxiv.org/pdf/2404.01475 + configpath: opencompass/configs/datasets/ChemBench +- chid: + name: FewCLUE / CHID + category: Language + paper: https://arxiv.org/pdf/2107.07498 + configpath: opencompass/configs/datasets/FewCLUE_chid +- chinese_simpleqa: + name: Chinese SimpleQA + category: Knowledge + paper: https://arxiv.org/pdf/2411.07140 + configpath: opencompass/configs/datasets/chinese_simpleqa +- cibench: + name: CIBench + category: Code + paper: https://www.arxiv.org/pdf/2407.10499 + configpath: opencompass/configs/datasets/CIBench +- civilcomments: + name: CivilComments + category: Safety + paper: https://arxiv.org/pdf/1903.04561 + configpath: opencompass/configs/datasets/civilcomments +- clozeTest_maxmin: + name: Cloze Test-max/min + category: Code + paper: https://arxiv.org/pdf/2102.04664 + configpath: opencompass/configs/datasets/clozeTest_maxmin +- cluewsc: + name: FewCLUE / CLUEWSC + category: Language / WSC + paper: https://arxiv.org/pdf/2107.07498 + configpath: opencompass/configs/datasets/FewCLUE_cluewsc +- cmb: + name: CMB + category: Knowledge / Medicine + paper: https://arxiv.org/pdf/2308.08833 + configpath: opencompass/configs/datasets/cmb +- cmmlu: + name: CMMLU + category: Understanding + paper: https://arxiv.org/pdf/2306.09212 + configpath: opencompass/configs/datasets/cmmlu +- cmnli: + name: CLUE / CMNLI + category: Reasoning + paper: https://arxiv.org/pdf/2004.05986 + configpath: opencompass/configs/datasets/CLUE_cmnli +- cmo_fib: + name: cmo_fib + category: Examination + paper: "" + configpath: opencompass/configs/datasets/cmo_fib +- cmrc: + name: CLUE / CMRC + category: Understanding + paper: https://arxiv.org/pdf/2004.05986 + configpath: opencompass/configs/datasets/CLUE_CMRC +- commonsenseqa: + name: CommonSenseQA + category: Knowledge + paper: https://arxiv.org/pdf/1811.00937v2 + configpath: 
opencompass/configs/datasets/commonsenseqa +- commonsenseqa_cn: + name: CommonSenseQA-CN + category: Knowledge + paper: "" + configpath: opencompass/configs/datasets/commonsenseqa_cn +- copa: + name: SuperGLUE / COPA + category: Reasoning + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_COPA +- crowspairs: + name: CrowsPairs + category: Safety + paper: https://arxiv.org/pdf/2010.00133 + configpath: opencompass/configs/datasets/crowspairs +- crowspairs_cn: + name: CrowsPairs-CN + category: Safety + paper: "" + configpath: opencompass/configs/datasets/crowspairs_cn +- cvalues: + name: CVALUES + category: Safety + paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf + configpath: opencompass/configs/datasets/cvalues +- drcd: + name: CLUE / DRCD + category: Understanding + paper: https://arxiv.org/pdf/2004.05986 + configpath: opencompass/configs/datasets/CLUE_DRCD +- drop: + name: DROP (DROP Simple Eval) + category: Understanding + paper: https://arxiv.org/pdf/1903.00161 + configpath: opencompass/configs/datasets/drop +- ds1000: + name: DS-1000 + category: Code + paper: https://arxiv.org/pdf/2211.11501 + configpath: opencompass/configs/datasets/ds1000 +- eprstmt: + name: FewCLUE / EPRSTMT + category: Understanding + paper: https://arxiv.org/pdf/2107.07498 + configpath: opencompass/configs/datasets/FewCLUE_eprstmt +- flores: + name: Flores + category: Language + paper: https://aclanthology.org/D19-1632.pdf + configpath: opencompass/configs/datasets/flores +- game24: + name: Game24 + category: Math + paper: https://huggingface.co/datasets/nlile/24-game + configpath: opencompass/configs/datasets/game24 +- govrepcrs: + name: Government Report Dataset + category: Long Context + paper: https://aclanthology.org/2021.naacl-main.112.pdf + configpath: opencompass/configs/datasets/govrepcrs +- gpqa: + name: GPQA + category: Knowledge + paper: https://arxiv.org/pdf/2311.12022v1 + configpath: opencompass/configs/datasets/gpqa +- gsm8k: + name: GSM8K + category: Math + paper: https://arxiv.org/pdf/2110.14168v2 + configpath: opencompass/configs/datasets/gsm8k +- gsm_hard: + name: GSM-Hard + category: Math + paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf + configpath: opencompass/configs/datasets/gsm_hard +- hellaswag: + name: HellaSwag + category: Reasoning + paper: https://arxiv.org/pdf/1905.07830 + configpath: opencompass/configs/datasets/hellaswag +- humaneval: + name: HumanEval + category: Code + paper: https://arxiv.org/pdf/2107.03374v2 + configpath: opencompass/configs/datasets/humaneval +- humaneval_cn: + name: HumanEval-CN + category: Code + paper: "" + configpath: opencompass/configs/datasets/humaneval_cn +- humaneval_multi: + name: Multi-HumanEval + category: Code + paper: https://arxiv.org/pdf/2210.14868 + configpath: opencompass/configs/datasets/humaneval_multi +- humanevalx: + name: HumanEval-X + category: Code + paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790 + configpath: opencompass/configs/datasets/humanevalx +- hungarian_math: + name: Hungarian_Math + category: Math + paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam + configpath: opencompass/configs/datasets/hungarian_exam +- iwslt2017: + name: IWSLT2017 + category: Language + paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf + configpath: opencompass/configs/datasets/iwslt2017 +- jigsawmultilingual: + name: 
JigsawMultilingual + category: Safety + paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data + configpath: opencompass/configs/datasets/jigsawmultilingual +- lambada: + name: LAMBADA + category: Understanding + paper: https://arxiv.org/pdf/1606.06031 + configpath: opencompass/configs/datasets/lambada +- lcsts: + name: LCSTS + category: Understanding + paper: https://aclanthology.org/D15-1229.pdf + configpath: opencompass/configs/datasets/lcsts +- livestembench: + name: LiveStemBench + category: "" + paper: "" + configpath: opencompass/configs/datasets/livestembench +- llm_compression: + name: LLM Compression + category: Bits Per Character (BPC) + paper: https://arxiv.org/pdf/2404.09937 + configpath: opencompass/configs/datasets/llm_compression +- math: + name: MATH + category: Math + paper: https://arxiv.org/pdf/2103.03874 + configpath: opencompass/configs/datasets/math +- math401: + name: MATH 401 + category: Math + paper: https://arxiv.org/pdf/2304.02015 + configpath: opencompass/configs/datasets/math401 +- mathbench: + name: MathBench + category: Math + paper: https://arxiv.org/pdf/2405.12209 + configpath: opencompass/configs/datasets/mathbench +- mbpp: + name: MBPP + category: Code + paper: https://arxiv.org/pdf/2108.07732 + configpath: opencompass/configs/datasets/mbpp +- mbpp_cn: + name: MBPP-CN + category: Code + paper: "" + configpath: opencompass/configs/datasets/mbpp_cn +- mbpp_plus: + name: MBPP-PLUS + category: Code + paper: "" + configpath: opencompass/configs/datasets/mbpp_plus +- mgsm: + name: MGSM + category: Language / Math + paper: https://arxiv.org/pdf/2210.03057 + configpath: opencompass/configs/datasets/mgsm +- mmlu: + name: MMLU + category: Understanding + paper: https://arxiv.org/pdf/2009.03300 + configpath: opencompass/configs/datasets/mmlu +- mmlu_cf: + name: MMLU-CF + category: Understanding + paper: https://arxiv.org/pdf/2412.15194 + configpath: opencompass/configs/datasets/mmlu_cf +- mmlu_pro: + name: MMLU-Pro + category: Understanding + paper: https://arxiv.org/pdf/2406.01574 + configpath: opencompass/configs/datasets/mmlu_pro +- mmmlu: + name: MMMLU + category: Language / Understanding + paper: https://huggingface.co/datasets/openai/MMMLU + configpath: opencompass/configs/datasets/mmmlu +- multirc: + name: SuperGLUE / MultiRC + category: Understanding + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_MultiRC +- narrativeqa: + name: NarrativeQA + category: Understanding + paper: https://github.com/google-deepmind/narrativeqa + configpath: opencompass/configs/datasets/narrativeqa +- natural_question: + name: NaturalQuestions + category: Knowledge + paper: https://github.com/google-research-datasets/natural-questions + configpath: opencompass/configs/datasets/nq +- natural_question_cn: + name: NaturalQuestions-CN + category: Knowledge + paper: "" + configpath: opencompass/configs/datasets/nq_cn +- obqa: + name: OpenBookQA + category: Knowledge + paper: https://arxiv.org/pdf/1809.02789v1 + configpath: opencompass/configs/datasets/obqa +- piqa: + name: PIQA + category: Knowledge / Physics + paper: https://arxiv.org/pdf/1911.11641v1 + configpath: opencompass/configs/datasets/piqa +- py150: + name: py150 + category: Code + paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line + configpath: opencompass/configs/datasets/py150 +- qasper: + name: Qasper + category: Long Context 
+ paper: https://arxiv.org/pdf/2105.03011 + configpath: opencompass/configs/datasets/qasper +- qaspercut: + name: Qasper-Cut + category: Long Context + paper: "" + configpath: opencompass/configs/datasets/qaspercut +- race: + name: RACE + category: Examination + paper: https://arxiv.org/pdf/1704.04683 + configpath: opencompass/configs/datasets/race +- realtoxicprompts: + name: RealToxicPrompts + category: Safety + paper: https://arxiv.org/pdf/2009.11462 + configpath: opencompass/configs/datasets/realtoxicprompts +- record: + name: SuperGLUE / ReCoRD + category: Understanding + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD +- rte: + name: SuperGLUE / RTE + category: Reasoning + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_RTE +- ocnli: + name: CLUE / OCNLI + category: Reasoning + paper: https://arxiv.org/pdf/2004.05986 + configpath: opencompass/configs/datasets/CLUE_ocnli +- rolebench: + name: RoleBench + category: Role Play + paper: https://arxiv.org/pdf/2310.00746 + configpath: opencompass/configs/datasets/rolebench +- s3eval: + name: S3Eval + category: Long Context + paper: https://aclanthology.org/2024.naacl-long.69.pdf + configpath: opencompass/configs/datasets/s3eval +- scibench: + name: SciBench + category: Reasoning + paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf + configpath: opencompass/configs/datasets/scibench +- scicode: + name: SciCode + category: Code + paper: https://arxiv.org/pdf/2407.13168 + configpath: opencompass/configs/datasets/scicode +- simpleqa: + name: SimpleQA + category: Knowledge + paper: https://arxiv.org/pdf/2411.04368 + configpath: opencompass/configs/datasets/SimpleQA +- siqa: + name: SocialIQA + category: Reasoning + paper: https://arxiv.org/pdf/1904.09728 + configpath: opencompass/configs/datasets/siqa +- squad20: + name: SQuAD2.0 + category: Understanding + paper: https://arxiv.org/pdf/1806.03822 + configpath: opencompass/configs/datasets/squad20 +- storycloze: + name: StoryCloze + category: Reasoning + paper: https://aclanthology.org/2022.emnlp-main.616.pdf + configpath: opencompass/configs/datasets/storycloze +- strategyqa: + name: StrategyQA + category: Reasoning + paper: https://arxiv.org/pdf/2101.02235 + configpath: opencompass/configs/datasets/strategyqa +- summedits: + name: SummEdits + category: Language + paper: https://aclanthology.org/2023.emnlp-main.600.pdf + configpath: opencompass/configs/datasets/summedits +- summscreen: + name: SummScreen + category: Understanding + paper: https://arxiv.org/pdf/2104.07091v1 + configpath: opencompass/configs/datasets/summscreen +- svamp: + name: SVAMP + category: Math + paper: https://aclanthology.org/2021.naacl-main.168.pdf + configpath: opencompass/configs/datasets/SVAMP +- tabmwp: + name: TabMWP + category: Math / Table + paper: https://arxiv.org/pdf/2209.14610 + configpath: opencompass/configs/datasets/TabMWP +- taco: + name: TACO + category: Code + paper: https://arxiv.org/pdf/2312.14852 + configpath: opencompass/configs/datasets/taco +- tnews: + name: FewCLUE / TNEWS + category: Understanding + paper: https://arxiv.org/pdf/2107.07498 + configpath: opencompass/configs/datasets/FewCLUE_tnews +- bustm: + name: FewCLUE / BUSTM + category: Reasoning + paper: https://arxiv.org/pdf/2107.07498 + configpath: 
opencompass/configs/datasets/FewCLUE_bustm +- csl: + name: FewCLUE / CSL + category: Understanding + paper: https://arxiv.org/pdf/2107.07498 + configpath: opencompass/configs/datasets/FewCLUE_csl +- ocnli_fc: + name: FewCLUE / OCNLI-FC + category: Reasoning + paper: https://arxiv.org/pdf/2107.07498 + configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc +- triviaqa: + name: TriviaQA + category: Knowledge + paper: https://arxiv.org/pdf/1705.03551v2 + configpath: opencompass/configs/datasets/triviaqa +- triviaqarc: + name: TriviaQA-RC + category: Knowledge / Understanding + paper: "" + configpath: opencompass/configs/datasets/triviaqarc +- truthfulqa: + name: TruthfulQA + category: Safety + paper: https://arxiv.org/pdf/2109.07958v2 + configpath: opencompass/configs/datasets/truthfulqa +- tydiqa: + name: TyDi-QA + category: Language + paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf + configpath: opencompass/configs/datasets/tydiqa +- wic: + name: SuperGLUE / WiC + category: Language + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_WiC +- wsc: + name: SuperGLUE / WSC + category: Language / WSC + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_WSC +- winogrande: + name: WinoGrande + category: Language / WSC + paper: https://arxiv.org/pdf/1907.10641v2 + configpath: opencompass/configs/datasets/winogrande +- xcopa: + name: XCOPA + category: Language + paper: https://arxiv.org/pdf/2005.00333 + configpath: opencompass/configs/datasets/XCOPA +- xiezhi: + name: Xiezhi + category: Knowledge + paper: https://arxiv.org/pdf/2306.05783 + configpath: opencompass/configs/datasets/xiezhi +- xlsum: + name: XLSum + category: Understanding + paper: https://arxiv.org/pdf/2106.13822v1 + configpath: opencompass/configs/datasets/XLSum +- xsum: + name: XSum + category: Understanding + paper: https://arxiv.org/pdf/1808.08745 + configpath: opencompass/configs/datasets/Xsum + + + diff --git a/docs/en/_static/js/custom.js b/docs/en/_static/js/custom.js index 84da69d4..9b9f2480 100644 --- a/docs/en/_static/js/custom.js +++ b/docs/en/_static/js/custom.js @@ -1,10 +1,20 @@ -var collapsedSections = []; +var collapsedSections = ['Dataset Statistics']; $(document).ready(function () { - $('.model-summary').DataTable({ + $('.dataset').DataTable({ "stateSave": false, "lengthChange": false, "pageLength": 20, - "order": [] + "order": [], + "language": { + "info": "Showing _START_ to _END_ of _TOTAL_ entries", + "infoFiltered": "(filtered from _MAX_ entries)", + "search": "Search:", + "zeroRecords": "No matching entries found", + "paginate": { + "next": "Next", + "previous": "Previous" + }, + } }); }); diff --git a/docs/en/advanced_guides/new_dataset.md b/docs/en/advanced_guides/new_dataset.md index 72f33318..e07e6868 100644 --- a/docs/en/advanced_guides/new_dataset.md +++ b/docs/en/advanced_guides/new_dataset.md @@ -90,4 +90,16 @@ Although OpenCompass has already included most commonly used datasets, users need return dataset ``` +3. After completing the dataset script and config file, you need to register the information of your new dataset in the `dataset-index.yml` file in the repository root directory, so that it can be added to the dataset statistics list on the OpenCompass website. 
+ + - The keys that need to be filled in include `name`: the name of your dataset, `category`: the category of your dataset, `paper`: the URL of the paper or project, and `configpath`: the path to the dataset config file. Here's an example: + + ``` + - mydataset: + name: MyDataset + category: Understanding + paper: https://arxiv.org/pdf/xxxxxxx + configpath: opencompass/configs/datasets/MyDataset + ``` + Detailed dataset configuration files and other required configuration files can be referred to in the [Configuration Files](../user_guides/config.md) tutorial. For guides on launching tasks, please refer to the [Quick Start](../get_started/quick_start.md) tutorial. diff --git a/docs/en/conf.py b/docs/en/conf.py index 64a3a83a..9101ba3f 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -220,3 +220,11 @@ autodoc_typehints = 'none' # The not found page notfound_template = '404.html' + + +def builder_inited_handler(app): + subprocess.run(['./statis.py']) + + +def setup(app): + app.connect('builder-inited', builder_inited_handler) \ No newline at end of file diff --git a/docs/en/index.rst b/docs/en/index.rst index fdad9c9e..7181c459 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -80,6 +80,13 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass. tools.md +.. _Dataset List: +.. toctree:: + :maxdepth: 1 + :caption: Dataset List + + dataset_statistics.md + .. _Notes: .. toctree:: :maxdepth: 1 diff --git a/docs/en/statis.py b/docs/en/statis.py new file mode 100755 index 00000000..a110c631 --- /dev/null +++ b/docs/en/statis.py @@ -0,0 +1,76 @@ +#! /usr/bin/env python + +from pathlib import Path + +import yaml +from tabulate import tabulate + +OC_ROOT = Path(__file__).absolute().parents[2] +GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/' +DATASETZOO_TEMPLATE = """\ +# Dataset Statistics + +On this page, we have listed all the datasets supported by OpenCompass. + +You can use sorting and search functions to find the dataset you need. 
+ +""" + +with open('dataset_statistics.md', 'w') as f: + f.write(DATASETZOO_TEMPLATE) + +load_path = str(OC_ROOT / 'dataset-index.yml') + +with open(load_path, 'r') as f2: + data_list = yaml.load(f2, Loader=yaml.FullLoader) + +HEADER = ['name', 'category', 'paper', 'configpath'] + + +def table_format(data_list): + table_format_list = [] + for i in data_list: + table_format_list_sub = [] + for j in i: + for index in HEADER: + if index == 'paper': + table_format_list_sub.append('[link](' + i[j][index] + ')') + elif index == 'configpath': + if isinstance(i[j][index], list): + sub_list_text = '' + for k in i[j][index]: + sub_list_text += ('[link](' + GITHUB_PREFIX + k + + ') / ') + table_format_list_sub.append(sub_list_text[:-2]) + else: + table_format_list_sub.append('[link](' + + GITHUB_PREFIX + + i[j][index] + ')') + else: + table_format_list_sub.append(i[j][index]) + table_format_list.append(table_format_list_sub) + return table_format_list + + +data_format_list = table_format(data_list) + + +def generate_table(data_list, title=None): + + with open('dataset_statistics.md', 'a') as f: + if title is not None: + f.write(f'\n{title}') + f.write("""\n```{table}\n:class: dataset\n""") + header = ['Name', 'Category', 'Paper or Repository', 'Config File'] + table_cfg = dict(tablefmt='pipe', + floatfmt='.2f', + numalign='right', + stralign='center') + f.write(tabulate(data_list, header, **table_cfg)) + f.write('\n```\n') + + +generate_table( + data_list=data_format_list, + title='## Supported Dataset List', +) diff --git a/docs/zh_cn/_static/js/custom.js b/docs/zh_cn/_static/js/custom.js index 84da69d4..ecbff47e 100644 --- a/docs/zh_cn/_static/js/custom.js +++ b/docs/zh_cn/_static/js/custom.js @@ -1,10 +1,20 @@ -var collapsedSections = []; +var collapsedSections = ['数据集统计']; $(document).ready(function () { - $('.model-summary').DataTable({ + $('.dataset').DataTable({ "stateSave": false, "lengthChange": false, "pageLength": 20, - "order": [] + "order": [], + "language": { + "info": "显示 _START_ 至 _END_ 条目(总计 _TOTAL_ )", + "infoFiltered": "(筛选自 _MAX_ 条目)", + "search": "搜索:", + "zeroRecords": "没有找到任何条目", + "paginate": { + "next": "下一页", + "previous": "上一页" + }, + } }); }); diff --git a/docs/zh_cn/advanced_guides/new_dataset.md b/docs/zh_cn/advanced_guides/new_dataset.md index 16e85f37..16921885 100644 --- a/docs/zh_cn/advanced_guides/new_dataset.md +++ b/docs/zh_cn/advanced_guides/new_dataset.md @@ -91,4 +91,16 @@ return dataset ``` +3. 在完成数据集脚本和配置文件的构建后,需要在OpenCompass主目录下的`dataset-index.yml`配置文件中登记新数据集的相关信息,以使其加入OpenCompass官网Doc的数据集统计列表中。 + + - 需要填写的字段包括数据集名称`name`、数据集类型`category`、原文或项目地址`paper`、以及数据集配置文件的路径`configpath`。具体示例如下: + + ``` + - mydataset: + name: MyDataset + category: Understanding + paper: https://arxiv.org/pdf/xxxxxxx + configpath: opencompass/configs/datasets/MyDataset + ``` + 详细的数据集配置文件以及其他需要的配置文件可以参考[配置文件](../user_guides/config.md)教程,启动任务相关的教程可以参考[快速开始](../get_started/quick_start.md)教程。 diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py index 640ea1d8..8910ead0 100644 --- a/docs/zh_cn/conf.py +++ b/docs/zh_cn/conf.py @@ -224,6 +224,7 @@ notfound_template = '404.html' def builder_inited_handler(app): subprocess.run(['./cp_origin_docs.sh']) + subprocess.run(['./statis.py']) def setup(app): diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 37a3bc0c..827c7d91 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -81,6 +81,13 @@ OpenCompass 上手路线 tools.md +.. _数据集列表: +.. toctree:: + :maxdepth: 1 + :caption: 数据集列表 + + dataset_statistics.md + .. _其他说明: .. 
toctree:: :maxdepth: 1 diff --git a/docs/zh_cn/statis.py b/docs/zh_cn/statis.py new file mode 100755 index 00000000..eb5dc7fe --- /dev/null +++ b/docs/zh_cn/statis.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python + +from pathlib import Path + +import yaml +from tabulate import tabulate + +OC_ROOT = Path(__file__).absolute().parents[2] +GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/' +DATASETZOO_TEMPLATE = """\ +# 数据集统计 + +在本页面中,我们列举了OpenCompass所支持的所有数据集。 + +你可以使用排序和搜索功能找到需要的数据集。 + +""" + +with open('dataset_statistics.md', 'w') as f: + f.write(DATASETZOO_TEMPLATE) + +load_path = str(OC_ROOT / 'dataset-index.yml') + +with open(load_path, 'r') as f2: + data_list = yaml.load(f2, Loader=yaml.FullLoader) + +HEADER = ['name', 'category', 'paper', 'configpath'] + + +def table_format(data_list): + table_format_list = [] + for i in data_list: + table_format_list_sub = [] + for j in i: + for index in HEADER: + if index == 'paper': + table_format_list_sub.append('[链接](' + i[j][index] + ')') + elif index == 'configpath': + if isinstance(i[j][index], list): + sub_list_text = '' + for k in i[j][index]: + sub_list_text += ('[链接](' + GITHUB_PREFIX + k + + ') / ') + table_format_list_sub.append(sub_list_text[:-2]) + else: + table_format_list_sub.append('[链接](' + GITHUB_PREFIX + + i[j][index] + ')') + else: + table_format_list_sub.append(i[j][index]) + table_format_list.append(table_format_list_sub) + return table_format_list + + +data_format_list = table_format(data_list) + + +def generate_table(data_list, title=None): + + with open('dataset_statistics.md', 'a') as f: + if title is not None: + f.write(f'\n{title}') + f.write("""\n```{table}\n:class: dataset\n""") + header = ['数据集名称', '数据集类型', '原文或资源地址', '配置文件链接'] + table_cfg = dict(tablefmt='pipe', + floatfmt='.2f', + numalign='right', + stralign='center') + f.write(tabulate(data_list, header, **table_cfg)) + f.write('\n```\n') + + +generate_table( + data_list=data_format_list, + title='## 支持数据集列表', +) From f407930475e4d7cf3338eb9a5b1ac4f03916d7f6 Mon Sep 17 00:00:00 2001 From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> Date: Thu, 20 Feb 2025 12:19:46 +0800 Subject: [PATCH 4/9] [Feature] Support subjective evaluation for reasoning model (#1868) * fix pip version * fix pip version * add subeval for reasoning model * add subeval for reasoning model * update configs * update config * update config * update config * update files --- .../alignbench_judgeby_critiquellm.py | 2 +- .../alignbench_judgeby_critiquellm_new.py | 2 +- .../alignbench_v1_1_judgeby_critiquellm.py | 2 +- .../alignbench_v1_1_judgeby_critiquellm_new.py | 2 +- .../alpaca_eval/alpacav2_judgeby_gpt4.py | 3 ++- .../alpacav2_judgeby_gpt4_bradleyterry.py | 2 +- .../alpaca_eval/alpacav2_judgeby_gpt4_new.py | 2 +- .../arena_hard/arena_hard_compare.py | 2 +- .../arena_hard_compare_bradleyterry.py | 2 +- .../arena_hard/arena_hard_compare_new.py | 2 +- .../compassarena/compassarena_compare.py | 2 +- .../compassarena_compare_bradleyterry.py | 4 ++-- .../compassarena/compassarena_compare_new.py | 2 +- .../subjective/fofo/fofo_bilingual_judge.py | 2 +- .../fofo/fofo_bilingual_judge_new.py | 2 +- .../datasets/subjective/fofo/fofo_judge.py | 2 +- .../datasets/subjective/fofo/fofo_judge_new.py | 2 +- .../followbench/followbench_llmeval.py | 2 +- .../followbench/followbench_llmeval_new.py | 2 +- .../subjective/multiround/mtbench101_judge.py | 2 +- .../multiround/mtbench101_judge_new.py | 2 +- .../openicl/icl_evaluator/lm_evaluator.py | 2 +- 
opencompass/tasks/subjective_eval.py | 18 +++++++++++++++--- opencompass/utils/text_postprocessors.py | 12 ++++++++++++ 24 files changed, 51 insertions(+), 26 deletions(-) diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py index 86c2a80b..0bc7df77 100644 --- a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py @@ -32,7 +32,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py index 20797b0f..d3f59b9f 100644 --- a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py @@ -31,7 +31,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py index 024f66a1..44f63f4f 100644 --- a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py @@ -32,7 +32,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py index 2ff09a3e..216e6ffa 100644 --- a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py @@ -31,7 +31,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py index 137e5ca0..ad0d4ef4 100644 --- a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py +++ b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py @@ -73,12 +73,13 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( evaluator=dict( type=LMEvaluator, + prompt_template=dict( type=PromptTemplate, template=dict( diff --git a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py 
b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py index 99f2e2be..19fe1559 100644 --- a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py +++ b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py @@ -74,7 +74,7 @@ for _name in subjective_all_sets: ), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py index 06a82efe..a0510f5c 100644 --- a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py +++ b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py @@ -72,7 +72,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py index 90837c7b..7446fdd7 100644 --- a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py +++ b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py @@ -38,7 +38,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py index 7a0e9ae8..dc4b250e 100644 --- a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py +++ b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py @@ -39,7 +39,7 @@ for _name in subjective_all_sets: ), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py index 08b27ca7..dbad40ef 100644 --- a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py +++ b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py @@ -37,7 +37,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py index 90141e66..47cc7b31 100644 --- a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py +++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py @@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items(): ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git 
a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py index 8a687889..38d7927a 100644 --- a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py +++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py @@ -1,6 +1,6 @@ from opencompass.datasets import ( CompassArenaDataset, - compassarena_bradleyterry_postprocess, + compassarena_bradleyterry_postprocess ) from opencompass.openicl.icl_evaluator import LMEvaluator from opencompass.openicl.icl_inferencer import GenInferencer @@ -127,7 +127,7 @@ for _name, _prompt in sub_map.items(): ), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py index a32691ad..83266765 100644 --- a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py +++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py @@ -105,7 +105,7 @@ for _name, _prompt in sub_map.items(): ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py index 089fd101..9516e074 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py @@ -91,7 +91,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py index 81e160b5..f732dba0 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py @@ -90,7 +90,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_judge.py b/opencompass/configs/datasets/subjective/fofo/fofo_judge.py index 89400892..8944be01 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_judge.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_judge.py @@ -59,7 +59,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py b/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py index 691aff2b..03dcf190 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py @@ -58,7 +58,7 @@ for _name in 
subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer,), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py index e601bda3..1c4203fd 100644 --- a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py +++ b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py @@ -29,7 +29,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py index b0aacd86..970605b6 100644 --- a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py +++ b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py @@ -28,7 +28,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py index 00924ecb..53ab1631 100644 --- a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py +++ b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py @@ -24,7 +24,7 @@ for _name in subjective_all_sets: template="""{dialogue}""", ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'), + inferencer=dict(type=ChatInferencer, infer_mode='last'), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py index 938f2f5e..01b9c12f 100644 --- a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py +++ b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py @@ -23,7 +23,7 @@ for _name in subjective_all_sets: template="""{dialogue}""", ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'), + inferencer=dict(type=ChatInferencer, infer_mode='last'), ) subjective_eval_cfg = dict( diff --git a/opencompass/openicl/icl_evaluator/lm_evaluator.py b/opencompass/openicl/icl_evaluator/lm_evaluator.py index 53814070..94f2cf94 100644 --- a/opencompass/openicl/icl_evaluator/lm_evaluator.py +++ b/opencompass/openicl/icl_evaluator/lm_evaluator.py @@ -329,4 +329,4 @@ class LMEvaluator: else: kwargs = self.dict_postprocessor proc = DICT_POSTPROCESSORS.get(kwargs.pop('type')) - return proc(output, self.output_path, **kwargs) + return proc(output, self.output_path, **kwargs) \ No newline at end of file diff --git a/opencompass/tasks/subjective_eval.py b/opencompass/tasks/subjective_eval.py index 417c5cdb..0ddd7b0c 100644 --- a/opencompass/tasks/subjective_eval.py +++ b/opencompass/tasks/subjective_eval.py @@ -198,14 +198,24 @@ class SubjectiveEvalTask(BaseTask): if fnmatch.fnmatch(ds_abbr, pattern): pred_postprocessor = model_postprocessors[pattern] break + if 
'pred_postprocessor' in eval_cfg['evaluator'] or pred_postprocessor: kwargs = pred_postprocessor or eval_cfg['evaluator'][ 'pred_postprocessor'] - proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type')) + temp_kwargs = copy.deepcopy(kwargs) + proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type')) self.logger.info('Get postprocessor {postprocessor}.') - pred_strs = [proc(s, **kwargs) for s in pred_strs] + pred_strs = [proc(s, **temp_kwargs) for s in pred_strs] else: - self.logger.info('No postprocessor found.') + self.logger.info('No dataset postprocessor found.') + + if 'pred_postprocessor' in model_cfg or pred_postprocessor: + kwargs = pred_postprocessor or model_cfg['pred_postprocessor'] + temp_kwargs = copy.deepcopy(kwargs) + proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type')) + pred_strs = [proc(s, **temp_kwargs) for s in pred_strs] + else: + self.logger.info('No model postprocessor found.') return { 'model_name': model_abbr_from_cfg(model_cfg), @@ -329,7 +339,9 @@ class SubjectiveEvalTask(BaseTask): if fnmatch.fnmatch(ds_abbr, pattern): pred_postprocessor = model_postprocessors[pattern] break + if 'pred_postprocessor' in eval_cfg or pred_postprocessor: + kwargs = pred_postprocessor or eval_cfg['pred_postprocessor'] proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type')) pred_strs = [proc(s, **kwargs) for s in pred_strs] diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index eb7469ab..7110e752 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -57,6 +57,18 @@ def last_capital_postprocess(text: str) -> str: return '' +@TEXT_POSTPROCESSORS.register_module('think_pred') +def think_pred_postprocess( + prediction: str, + re_pattern: str, +) -> str: + match = re.search(re_pattern, prediction) + if match: + return match.group(1).strip() + else: + return prediction + + def first_option_postprocess(text: str, options: str, cushion=True) -> str: """Find first valid option for text.""" From bc22749fd8c20d4f69c2c4ebb9e517bce2c4666a Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Thu, 20 Feb 2025 14:08:18 +0800 Subject: [PATCH 5/9] [CI] update daily test scores (#1870) * update * Update daily-run-test.yml * Update dlc.py --- .../scripts/oc_score_baseline_fullbench.yaml | 6 +++--- .../scripts/oc_score_baseline_testrange.yaml | 20 +++++++++---------- .github/workflows/daily-run-test.yml | 13 ++++++------ opencompass/runners/dlc.py | 2 +- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 6ab32832..5b0dee2b 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -366,7 +366,7 @@ internlm2_5-7b-chat-turbomind: openai_mmmlu_lite_DE-DE_accuracy: 51.27 openai_mmmlu_lite_ES-LA_accuracy: 56.94 openai_mmmlu_lite_FR-FR_accuracy: 58.22 - openai_mmmlu_lite_HI-IN_accuracy: 33.75 + openai_mmmlu_lite_HI-IN_accuracy: 30.75 openai_mmmlu_lite_ID-ID_accuracy: 50.6 openai_mmmlu_lite_IT-IT_accuracy: 50.6 openai_mmmlu_lite_JA-JP_accuracy: 51.13 @@ -394,7 +394,7 @@ internlm2_5-7b-chat-turbomind: CompassArena_naive_average: 34.61 FoFo_naive_average: 0.38 mtbench101_avg: 8.01 - wildbench_average: -15.69 + wildbench_average: -10.49 simpleqa_accuracy_given_attempted: 0.04 chinese_simpleqa_given_attempted_accuracy: 0.34 alignment_bench_v1_1_专业能力: 6.05 @@ -414,7 +414,7 @@ 
internlm2_5-7b-chat-turbomind: compassarena_knowledge_naive_average: 36 compassarena_reason_v2_naive_average: 35 compassarena_math_v2_naive_average: 19.91 - compassarena_creationv2_zh_naive_average: 35.81 + compassarena_creationv2_zh_naive_average: 43.64 fofo_test_prompts_overall: 0.35 fofo_test_prompts_cn_overall: 0.41 followbench_llmeval_en_HSR_AVG: 0.73 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index 45e20ddd..5f1121a7 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -15,13 +15,13 @@ chat: gsm8k_accuracy: 50 race-high_accuracy: 68.75 deepseek-7b-chat-vllm: - gsm8k_accuracy: 43.75 - race-high_accuracy: 75 + gsm8k_accuracy: 50 + race-high_accuracy: 78.12 gemma2-2b-it-hf: gsm8k_accuracy: 50 - race-high_accuracy: 71.88 + race-high_accuracy: 75 gemma2-9b-it-hf: - gsm8k_accuracy: 71.88 + gsm8k_accuracy: 68.75 race-high_accuracy: 84.38 gemma-2b-it-hf: gsm8k_accuracy: 3.12 @@ -36,7 +36,7 @@ chat: gsm8k_accuracy: 78.12 race-high_accuracy: 93.75 gemma-7b-it-vllm: - gsm8k_accuracy: 34.38 + gsm8k_accuracy: 46.88 race-high_accuracy: 68.75 internlm2_5-7b-chat-hf: gsm8k_accuracy: 84.38 @@ -57,7 +57,7 @@ chat: gsm8k_accuracy: 53.12 race-high_accuracy: 90.62 internlm2-chat-7b-vllm: - gsm8k_accuracy: 56.25 + gsm8k_accuracy: 43.75 race-high_accuracy: 84.38 llama-3_1-8b-instruct-hf: gsm8k_accuracy: 84.38 @@ -90,13 +90,13 @@ chat: gsm8k_accuracy: 75 race-high_accuracy: 81.25 mistral-nemo-instruct-2407-turbomind: - gsm8k_accuracy: 65.62 - race-high_accuracy: 87.50 + gsm8k_accuracy: 71.88 + race-high_accuracy: 78.12 mistral-7b-instruct-v0.1-vllm: gsm8k_accuracy: 34.38 race-high_accuracy: 68.75 mistral-7b-instruct-v0.2-vllm: - gsm8k_accuracy: 43.75 + gsm8k_accuracy: 31.25 race-high_accuracy: 75 phi-3-mini-4k-instruct-hf: gsm8k_accuracy: 81.25 @@ -177,7 +177,7 @@ chat: gsm8k_accuracy: 93.75 race-high_accuracy: 87.5 mixtral-8x7b-instruct-v0.1-hf: - gsm8k_accuracy: 56.25 + gsm8k_accuracy: 59.38 race-high_accuracy: 81.25 mixtral-large-instruct-2411-turbomind: gsm8k_accuracy: 90.62 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 0fa1f4a6..8aa1df16 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -17,7 +17,7 @@ on: required: false description: 'whether to build lmdeploy' type: boolean - default: false + default: true repo_org_lmdeploy: required: false description: 'Tested repository organization name. Default is internlm/lmdeploy' @@ -162,15 +162,16 @@ jobs: pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}} cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data - name: Prepare - reinstall lmdeploy - cu12 - if: ${{inputs.build_lmdeploy}} + if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}} uses: actions/download-artifact@v4 with: name: my-artifact-${{ github.run_id }}-py310 - name: Prepare - reinstall lmdeploy - cu12 - if: ${{inputs.build_lmdeploy}} + if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}} run: | . 
${{env.CONDA_PATH}}/bin/activate conda activate ${{env.CONDA_ENV}} + pip uninstall -y lmdeploy pip install lmdeploy-*.whl --no-deps - name: conda env run: | @@ -188,7 +189,7 @@ jobs: regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}} runs-on: volc_cu12_daily environment: 'prod' - timeout-minutes: 120 #2hours + timeout-minutes: 180 #3hours steps: - name: Clone repository uses: actions/checkout@v2 @@ -275,7 +276,7 @@ jobs: conda info --envs lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 & echo "restful_pid=$!" >> "$GITHUB_ENV" - sleep 120s + sleep 180s opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py @@ -334,7 +335,7 @@ jobs: notify_to_feishu: - if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }} + if: ${{ always() && github.event_name == 'schedule' && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }} needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test] timeout-minutes: 5 runs-on: self-hosted diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index 03045870..44e9fd00 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -286,7 +286,7 @@ class DLCRunner(BaseRunner): f'Failed to get job info for {job_id}') status = job_info['Status'] - if status == 'Failed': + if status == 'Failed' or status == 'Stopped': return -1 elif status == 'Succeeded': return 0 From 27c916661d94973430925c54783e5c08f8e1fb48 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Thu, 20 Feb 2025 19:32:12 +0800 Subject: [PATCH 6/9] [Feature] Math Verify with model post_processor (#1881) * update * [Feature] Update model post_processor * update * update * update --- examples/eval_math_verify.py | 77 +++++++++ .../configs/datasets/math/math_500_gen.py | 40 +++++ opencompass/datasets/custom.py | 58 ++++--- .../openicl/icl_evaluator/math_evaluator.py | 154 ++++++++++++++++++ opencompass/tasks/openicl_eval.py | 36 ++-- opencompass/utils/text_postprocessors.py | 42 +++++ requirements/extra.txt | 2 + 7 files changed, 369 insertions(+), 40 deletions(-) create mode 100644 examples/eval_math_verify.py create mode 100644 opencompass/configs/datasets/math/math_500_gen.py create mode 100644 opencompass/openicl/icl_evaluator/math_evaluator.py diff --git a/examples/eval_math_verify.py b/examples/eval_math_verify.py new file mode 100644 index 00000000..bbd1dbc3 --- /dev/null +++ b/examples/eval_math_verify.py @@ -0,0 +1,77 @@ +from mmengine.config import read_base +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +with read_base(): + from opencompass.configs.datasets.math.math_500_gen import math_datasets + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-llama-8b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', + engine_config=dict(session_len=32768, max_batch_size=8, tp=1), + 
gen_config=dict( + top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096 + ), + max_seq_len=32768, + max_out_len=32768, + batch_size=32, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content), + ), + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-7b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', + engine_config=dict(session_len=32768, max_batch_size=8, tp=1), + gen_config=dict( + temperature=0.6, + top_p=0.95, + max_new_tokens=32768, + do_sample=True, + ), + max_seq_len=32768, + max_out_len=32768, + batch_size=32, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content), + ), + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-1_5b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', + engine_config=dict(session_len=32768, max_batch_size=16, tp=1), + gen_config=dict( + top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096 + ), + max_seq_len=32768, + max_out_len=32768, + batch_size=32, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content), + ), + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-14b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', + engine_config=dict(session_len=32768, max_batch_size=16, tp=2), + gen_config=dict( + top_k=1, + temperature=0.6, + top_p=0.95, + max_new_tokens=32768, + do_sample=True, + ), + max_seq_len=32768, + max_out_len=32768, + batch_size=16, + run_cfg=dict(num_gpus=2), + pred_postprocessor=dict(type=extract_non_reasoning_content), + ), +] + +datasets = [*math_datasets] + + +work_dir = './outputs/math_500' diff --git a/opencompass/configs/datasets/math/math_500_gen.py b/opencompass/configs/datasets/math/math_500_gen.py new file mode 100644 index 00000000..79d2f3b0 --- /dev/null +++ b/opencompass/configs/datasets/math/math_500_gen.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import CustomDataset +from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator), +) + +math_datasets = [ + dict( + type=CustomDataset, + abbr='math-500', + path='opencompass/math', + file_name='test_prm800k_500.jsonl', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/datasets/custom.py b/opencompass/datasets/custom.py index ad3fbe2c..110cb72b 100644 --- a/opencompass/datasets/custom.py +++ b/opencompass/datasets/custom.py @@ -13,6 +13,7 @@ from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -114,7 +115,7 @@ class 
CircularOptionSimAccEvaluator(OptionSimAccEvaluator): circular_pattern = origin_item['circular_pattern'] for k in circular_patterns: if tuple(circular_pattern) in circular_patterns[k]: - tmp_metrics[f'correct_{k}'] += (1 if parsed == refr else 0) + tmp_metrics[f'correct_{k}'] += 1 if parsed == refr else 0 tmp_metrics[f'count_{k}'] += 1 for k in circular_patterns: @@ -164,7 +165,10 @@ class CircularOptionSimAccEvaluator(OptionSimAccEvaluator): class CustomDataset(BaseDataset): @staticmethod - def load(path): + def load(path, file_name=None, local_mode=False): + path = get_data_path(path, local_mode=local_mode) + if file_name is not None: + path = os.path.join(path, file_name) if path.endswith('.jsonl'): with open(path, 'r', encoding='utf-8-sig') as f: data = [json.loads(line) for line in f] @@ -222,9 +226,10 @@ def make_mcq_gen_config(meta): ) eval_cfg = dict( - evaluator=dict(type=meta.get('evaluator', OptionSimAccEvaluator), - **meta.get('evaluator_kwargs', - {'options': meta['options']})), + evaluator=dict( + type=meta.get('evaluator', OptionSimAccEvaluator), + **meta.get('evaluator_kwargs', {'options': meta['options']}), + ), pred_role='BOT', ) @@ -269,10 +274,10 @@ def make_circular_mcq_gen_config(meta): ) eval_cfg = dict( - evaluator=dict(type=meta.get('evaluator', - CircularOptionSimAccEvaluator), - **meta.get('evaluator_kwargs', - {'options': meta['options']})), + evaluator=dict( + type=meta.get('evaluator', CircularOptionSimAccEvaluator), + **meta.get('evaluator_kwargs', {'options': meta['options']}), + ), pred_role='BOT', ) @@ -320,8 +325,10 @@ def make_qa_gen_config(meta): ) eval_cfg = dict( - evaluator=dict(type=meta.get('evaluator', AccEvaluator), - **meta.get('evaluator_kwargs', {})), + evaluator=dict( + type=meta.get('evaluator', AccEvaluator), + **meta.get('evaluator_kwargs', {}), + ), pred_role='BOT', ) @@ -346,9 +353,11 @@ def make_mcq_ppl_config(meta): template = { answer: dict(round=[ dict(role='HUMAN', prompt=human_prompt), - dict(role='BOT', - prompt=bot_prompt.format( - **{meta['output_column']: answer})), + dict( + role='BOT', + prompt=bot_prompt.format( + **{meta['output_column']: answer}), + ), ], ) for answer in meta['options'] } @@ -370,8 +379,10 @@ def make_mcq_ppl_config(meta): inferencer=dict(type=PPLInferencer), ) - eval_cfg = dict(evaluator=dict(type=meta.get('evaluator', AccEvaluator), - **meta.get('evaluator_kwargs', {}))) + eval_cfg = dict(evaluator=dict( + type=meta.get('evaluator', AccEvaluator), + **meta.get('evaluator_kwargs', {}), + )) dataset = dict( abbr=meta['abbr'], @@ -394,9 +405,11 @@ def make_circular_mcq_ppl_config(meta): template = { answer: dict(round=[ dict(role='HUMAN', prompt=human_prompt), - dict(role='BOT', - prompt=bot_prompt.format( - **{meta['output_column']: answer})), + dict( + role='BOT', + prompt=bot_prompt.format( + **{meta['output_column']: answer}), + ), ], ) for answer in meta['options'] } @@ -418,9 +431,10 @@ def make_circular_mcq_ppl_config(meta): inferencer=dict(type=PPLInferencer), ) - eval_cfg = dict( - evaluator=dict(type=meta.get('evaluator', CircularEvaluator), - **meta.get('evaluator_kwargs', {}))) + eval_cfg = dict(evaluator=dict( + type=meta.get('evaluator', CircularEvaluator), + **meta.get('evaluator_kwargs', {}), + )) dataset = dict( abbr=meta['abbr'], diff --git a/opencompass/openicl/icl_evaluator/math_evaluator.py b/opencompass/openicl/icl_evaluator/math_evaluator.py new file mode 100644 index 00000000..c790c17b --- /dev/null +++ b/opencompass/openicl/icl_evaluator/math_evaluator.py @@ -0,0 +1,154 @@ +from 
latex2sympy2_extended import NormalizationConfig +from math_verify import (ExprExtractionConfig, LatexExtractionConfig, parse, + verify) + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS + + +@ICL_EVALUATORS.register_module() +class MATHEvaluator(BaseEvaluator): + + def score(self, predictions, references): + + self.is_num_equal(predictions, references) + + correct = 0 + count = 0 + details = [] + for i, j in zip(predictions, references): + count += 1 + gold_parsed = parse( + j, + extraction_mode='first_match', + extraction_config=[ + LatexExtractionConfig(), + ExprExtractionConfig(), + ], + ) + # If parsing result is empty, try adding LaTeX + # environment and parse again + if len(gold_parsed) == 0: + j_with_env = f'${j}$' + gold_parsed = parse( + j_with_env, + extraction_mode='first_match', + extraction_config=[ + LatexExtractionConfig(), + ExprExtractionConfig(), + ], + ) + if len(gold_parsed) != 0: + # We require the answer to be provided in correct + # latex (no malformed operators) + answer_parsed = parse( + i, + extraction_config=[ + LatexExtractionConfig( + normalization_config=NormalizationConfig( + nits=False, + malformed_operators=False, + basic_latex=True, + equations=True, + boxed='all', + units=True, + ), + # Ensures that boxed is tried first + boxed_match_priority=0, + try_extract_without_anchor=False, + ) + ], + extraction_mode='first_match', + ) + + answer_correct = float(verify(answer_parsed, gold_parsed)) + correct += answer_correct + detail = { + 'pred': str(answer_parsed), + 'answer': str(gold_parsed), + 'correct': True if answer_correct else False, + } + details.append(detail) + result = {'accuracy': 100 * correct / count, 'details': details} + return result + + +if __name__ == '__main__': + import sympy + + test_cases = [ + # 1. Basic arithmetic operations + r'Simple fraction: \boxed{\frac{1}{2}}', + r'Addition: \boxed{2 + 3}', + r'Multiplication: \boxed{2 \times 3}', + # 2. Algebraic expressions + r'Quadratic: \boxed{x^2 + 2x + 1}', + r'Polynomial: \boxed{3x^3 - 2x^2 + 4x - 5}', + # 3. Trigonometric functions + r'Trigonometry: \boxed{\sin(x) + \cos(x)}', + r'Complex trig: \boxed{\tan^2(x) + \sec^2(x)}', + # 4. Roots and exponents + r'Square root: \boxed{\sqrt{16}}', + r'Complex root: \boxed{\sqrt[3]{x^2 + 1}}', + # 5. Logarithms + r'Natural log: \boxed{\ln(e^2)}', + r'Log base: \boxed{\log_2(8)}', + # 6. Limits and summations + r'Limit: \boxed{\lim_{x \to 0} \frac{\sin(x)}{x}}', + r'Sum: \boxed{\sum_{i=1}^{n} i}', + # 7. Integrals + r'Integral: \boxed{\int_{0}^{1} x^2 dx}', + r'Double integral: \boxed{\int_{0}^{1}\int_{0}^{1} xy \,dx\,dy}', + # 8. Matrices + r'Matrix: \boxed{\begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix}}', + # 9. Complex combinations + r'Complex expr: \boxed{\frac{\sqrt{x^2 + 1}}{\ln(x)} + ' + r'\int_{0}^{x} t^2 dt}', + # 10. 
Error cases + r'Empty: \boxed{}', + r'Invalid: \boxed{\frac{1}}', # Missing denominator + r'Nested: \boxed{\boxed{1}}', # Nested boxed + ] + + def print_result(expr: str, result: list): + print('\n' + '=' * 50) + print(f'Input: {expr}') + print(f'Output type: {type(result)}') + print(f'Output: {result}') + + # If result is sympy expression, show more information + if result: + for item in result: + if isinstance(item, sympy.Basic): + print(f'Sympy repr: {repr(item)}') + try: + print(f'Evaluated: {item.evalf()}') + except Exception as e: + print(f'Cannot evaluate: {e}') + + # Test all cases + for test_expr in test_cases: + try: + result = parse(test_expr) + print_result(test_expr, result) + except Exception as e: + print(f'\nError processing {test_expr}: {e}') + + # Special test: verify numerical calculations + numerical_tests = [ + r'\boxed{2 + 2}', # Should equal 4 + r'\boxed{\frac{1}{2} + \frac{1}{3}}', # Should equal 5/6 + r'\boxed{\sqrt{16} + \sqrt{9}}', # Should equal 7 + ] + + print('\n' + '=' * 50 + '\nNumerical Verification Tests:') + for test_expr in numerical_tests: + try: + result = parse(test_expr) + if result and isinstance(result[0], sympy.Basic): + expr = result[0] + print(f'\nExpression: {test_expr}') + print(f'Symbolic: {expr}') + print(f'Numerical value: {float(expr.evalf())}') + except Exception as e: + print(f'\nError in numerical test {test_expr}: {e}') diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py index 5bec3603..a797459f 100644 --- a/opencompass/tasks/openicl_eval.py +++ b/opencompass/tasks/openicl_eval.py @@ -1,6 +1,5 @@ import argparse import copy -import fnmatch import math import os import os.path as osp @@ -18,9 +17,8 @@ from mmengine.utils import mkdir_or_exist from opencompass.registry import (ICL_EVALUATORS, MODELS, TASKS, TEXT_POSTPROCESSORS) from opencompass.tasks.base import BaseTask, extract_role_pred -from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg, - get_infer_output_path, get_logger, - task_abbr_from_cfg) +from opencompass.utils import (build_dataset_from_cfg, get_infer_output_path, + get_logger, task_abbr_from_cfg) @TASKS.register_module() @@ -60,19 +58,9 @@ class OpenICLEvalTask(BaseTask): self.dataset_cfg = dataset_cfg # Load Dataset - self.eval_cfg = self.dataset_cfg.get('eval_cfg') - self.output_column = dataset_cfg['reader_cfg']['output_column'] - - # overwrite postprocessor if the model has specified one - ds_abbr = dataset_abbr_from_cfg(self.dataset_cfg) - model_postprocessors = self.model_cfg.get( - 'pred_postprocessor', {}) - for pattern in model_postprocessors.keys(): - if fnmatch.fnmatch(ds_abbr, pattern): - self.eval_cfg[ - 'pred_postprocessor'] = model_postprocessors[ - pattern] # noqa - break + self.eval_cfg = copy.deepcopy(dataset_cfg.get('eval_cfg')) + self.output_column = copy.deepcopy( + dataset_cfg['reader_cfg']['output_column']) out_path = get_infer_output_path( self.model_cfg, self.dataset_cfg, @@ -155,8 +143,20 @@ class OpenICLEvalTask(BaseTask): ] # Postprocess predictions if necessary + # Model Specified Postprocessor + if 'pred_postprocessor' in self.model_cfg: + kwargs = copy.deepcopy(self.model_cfg['pred_postprocessor']) + proc = kwargs.pop('type') + if isinstance(proc, str): + proc = TEXT_POSTPROCESSORS.get(proc) + if pred_list_flag: + pred_strs = [[proc(s, **kwargs) for s in preds] + for preds in pred_strs] + else: + pred_strs = [proc(s, **kwargs) for s in pred_strs] + # Dataset Specified Postprocessor if 'pred_postprocessor' in self.eval_cfg: - kwargs = 
self.eval_cfg['pred_postprocessor'] + kwargs = copy.deepcopy(self.eval_cfg['pred_postprocessor']) proc = kwargs.pop('type') if isinstance(proc, str): proc = TEXT_POSTPROCESSORS.get(proc) diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index 7110e752..d21a06ab 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -37,6 +37,7 @@ def general_cn_postprocess(text: str) -> str: cleaned_text = re.sub(r'\s+', ' ', no_articles).strip() import jieba + cleaned_text = ' '.join(jieba.cut(text)) return cleaned_text @@ -241,3 +242,44 @@ def match_answer_pattern(response_text: str, answer_pattern: str): match = re.search(answer_pattern, response_text) extracted_answer = match.group(1) if match else '' return extracted_answer + + +@TEXT_POSTPROCESSORS.register_module('extract-non-reasoning-content') +def extract_non_reasoning_content( + text: str, + think_start_token: str = '', + think_end_token: str = '', +) -> str: + """Extract content after the last reasoning tag from text. + + When only end token is present, returns content after the end token. + When both tokens are present, removes all content between start and end tokens. + + Args: + text (str): Input text containing reasoning tags. + think_start_token (str, optional): Start token for reasoning section. Defaults to ''. + think_end_token (str, optional): End token for reasoning section. Defaults to ''. + + Returns: + str: Processed text after removing reasoning sections. + + Examples: + >>> # When only end token exists + >>> text = "This is a test. How are you?" + >>> extract_non_reasoning_content(text) + 'How are you?' + + >>> # When both tokens exist + >>> text = "Startreasoning here End" + >>> extract_non_reasoning_content(text) + 'Start End' + """ + # If text contains only end token, split by end token and take the last part + if think_start_token not in text and think_end_token in text: + return text.split(think_end_token)[-1].strip() + + # Original behavior for complete tag pairs + reasoning_regex = re.compile(rf'{think_start_token}(.*?){think_end_token}', + re.DOTALL) + non_reasoning_content = reasoning_regex.sub('', text).strip() + return non_reasoning_content diff --git a/requirements/extra.txt b/requirements/extra.txt index 7f04c9d0..a98b3bc8 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -15,6 +15,8 @@ langdetect latex2sympy2 # Lawbench, leval ltp +# Math +math-verify # Taco, apps Dataset pyext # Law Bench From d7daee6e259dc851d615e685225db27800fd9a9a Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Thu, 20 Feb 2025 19:33:25 +0800 Subject: [PATCH 7/9] [Update] OpenAI model update, bigcodebench update (#1879) * [Update] Openai model update, bigcodebench update * update --- .../datasets/bigcodebench/bigcodebench.py | 36 +++++++++- opencompass/models/openai_api.py | 70 ++++++++++--------- .../summarizers/subjective/compassbench.py | 38 ++++------ 3 files changed, 86 insertions(+), 58 deletions(-) diff --git a/opencompass/datasets/bigcodebench/bigcodebench.py b/opencompass/datasets/bigcodebench/bigcodebench.py index f1109b1d..f347e9e2 100644 --- a/opencompass/datasets/bigcodebench/bigcodebench.py +++ b/opencompass/datasets/bigcodebench/bigcodebench.py @@ -121,8 +121,40 @@ class BigCodeBenchEvaluator(BaseEvaluator): logger.info('Start to extract code from predictions') sanitized_predictions = [] for prediction, entrypoint in zip(predictions, entrypoints): - sanitized_prediction = extract_code_generation( - prediction, 
entrypoint=entrypoint) + try: + import signal + from contextlib import contextmanager + + @contextmanager + def timeout_handler(seconds): + + def _handle_timeout(signum, frame): + raise TimeoutError(f'Code extraction timed out' + f'after {seconds} seconds') + + original_handler = signal.signal(signal.SIGALRM, + _handle_timeout) + signal.alarm(seconds) + try: + yield + finally: + signal.alarm(0) + signal.signal(signal.SIGALRM, original_handler) + + with timeout_handler(10): + sanitized_prediction = extract_code_generation( + prediction, entrypoint=entrypoint) + + except TimeoutError as e: + logger.warning( + f'Code extraction timeout for entrypoint {entrypoint}: ' + f'{str(e)}') + sanitized_prediction = '' + except Exception as e: + logger.warning( + f'Code extraction failed for entrypoint {entrypoint}: ' + f'{str(e)}') + sanitized_prediction = '' sanitized_predictions.append(sanitized_prediction) # Prepare for submission diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index 2781d160..d5ac02d8 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -25,12 +25,7 @@ OPENAI_API_BASE = os.path.join( OPENAISDK_API_BASE = os.environ.get('OPENAI_BASE_URL', 'https://api.openai.com/v1/') -O1_MODEL_LIST = [ - 'o1-preview-2024-09-12', - 'o1-mini-2024-09-12', - 'o1-preview', - 'o1-mini', -] +O1_MODEL_LIST = ['o1', 'o3'] @MODELS.register_module() @@ -96,7 +91,6 @@ class OpenAI(BaseAPIModel): temperature: Optional[float] = None, tokenizer_path: Optional[str] = None, extra_body: Optional[Dict] = None, - max_completion_tokens: int = 16384, verbose: bool = False, ): @@ -151,9 +145,6 @@ class OpenAI(BaseAPIModel): self.proxy_url = openai_proxy_url self.path = path - self.max_completion_tokens = max_completion_tokens - self.logger.warning( - f'Max Completion tokens for {path} is {max_completion_tokens}') def generate( self, @@ -250,16 +241,15 @@ class OpenAI(BaseAPIModel): header['OpenAI-Organization'] = self.orgs[self.org_ctr] try: - if self.path in O1_MODEL_LIST: + if any(model in self.path for model in O1_MODEL_LIST): self.logger.warning( f"'max_token' is unsupported for model {self.path}") self.logger.warning( - f'We use max_completion_tokens: ' - f'{self.max_completion_tokens}for this query') + f'We use max_out_len: {max_out_len} for this query') data = dict( model=self.path, messages=messages, - max_completion_tokens=self.max_completion_tokens, + max_completion_tokens=max_out_len, n=1, logprobs=self.logprobs, top_logprobs=self.top_logprobs, @@ -440,7 +430,7 @@ class OpenAI(BaseAPIModel): if mode == 'front': cur_prompt = sep.join(words[-mid:]) elif mode == 'mid': - cur_prompt = (sep.join(words[:mid]) + sep.join(words[-mid:])) + cur_prompt = sep.join(words[:mid]) + sep.join(words[-mid:]) elif mode == 'rear': cur_prompt = sep.join(words[:mid]) @@ -480,7 +470,9 @@ class OpenAI(BaseAPIModel): """ # Check input length when mode is 'none' if mode == 'none': - input_len = get_token_len_func(str(input)) + input_len = (get_token_len_func(input) if isinstance( + input, str) else sum( + get_token_len_func(item['prompt']) for item in input)) if input_len > max_seq_len: raise ValueError( f'Input length ({input_len}) exceeds max_seq_len ' @@ -499,12 +491,15 @@ class OpenAI(BaseAPIModel): # Convert input to messages format if isinstance(input, str): messages = [{'role': 'user', 'content': input}] + input_len = get_token_len_func(input) else: messages = [] + processed_prompts = [] for item in input: input_content = item['prompt'] if mode != 'none': 
input_content = bin_trim_wrapper(input_content) + processed_prompts.append(input_content) msg = {'content': input_content} if item['role'] == 'HUMAN': msg['role'] = 'user' @@ -513,19 +508,18 @@ class OpenAI(BaseAPIModel): elif item['role'] == 'SYSTEM': msg['role'] = 'system' messages.append(msg) + input_len = sum( + get_token_len_func(prompt) for prompt in processed_prompts) # Adjust max_out_len if max_out_len is not None: original_max_out_len = max_out_len - max_out_len = min( - max_out_len, - max_seq_len - get_token_len_func(str(input)) - 100) + max_out_len = min(max_out_len, max_seq_len - input_len - 100) if max_out_len <= 0: raise ValueError( f'max_out_len ({max_out_len}) is less than or equal to 0. ' - f'This may be due to input length ' - f'({get_token_len_func(str(input))}) being too close to ' - f'max_seq_len ({max_seq_len}). Please either increase ' + f'This may be due to input length ({input_len}) being too ' + f'close to max_seq_len ({max_seq_len}). Please increase ' f'max_seq_len or use a truncation mode other than "none".') if max_out_len < original_max_out_len: self.logger.warning( @@ -555,7 +549,6 @@ class OpenAISDK(OpenAI): temperature: float | None = None, tokenizer_path: str | None = None, extra_body: Dict | None = None, - max_completion_tokens: int = 16384, verbose: bool = False, status_code_mappings: dict = {}, ): @@ -577,7 +570,6 @@ class OpenAISDK(OpenAI): tokenizer_path, extra_body, verbose=verbose, - max_completion_tokens=max_completion_tokens, ) from openai import OpenAI @@ -605,8 +597,23 @@ class OpenAISDK(OpenAI): self.logger.info(f'Used openai_client: {self.openai_client}') self.status_code_mappings = status_code_mappings - def _generate(self, input: PromptList | str, max_out_len: int, - temperature: float) -> str: + def _generate(self, + input: PromptList | str, + max_out_len: int, + temperature: float, + timeout: int = 3600) -> str: + """Generate results given a list of inputs. + + Args: + input (PromptType): A string or PromptDict. + max_out_len (int): The maximum length of the output. + temperature (float): What sampling temperature to use. + timeout (int, optional): Timeout in seconds for the API call. + Defaults to 3600 (60 minutes). + + Returns: + str: The generated string. 
+ """ from openai import APIStatusError, BadRequestError assert isinstance(input, (str, PromptList)) @@ -618,16 +625,14 @@ class OpenAISDK(OpenAI): num_retries = 0 while num_retries < self.retry: self.wait() - - if self.path in O1_MODEL_LIST: + if any(model in self.path for model in O1_MODEL_LIST): self.logger.warning( f"'max_token' is unsupported for model {self.path}") self.logger.warning( - f'We use max_completion_tokens: ' - f'{self.max_completion_tokens}for this query') + f'We use max_out_len: {max_out_len} for this query') query_data = dict( model=self.path, - max_completion_tokens=self.max_completion_tokens, + max_completion_tokens=max_out_len, n=1, messages=messages, extra_body=self.extra_body, @@ -646,7 +651,8 @@ class OpenAISDK(OpenAI): if self.verbose: self.logger.info('Start calling OpenAI API') responses = self.openai_client.chat.completions.create( - **query_data) + **query_data, timeout=timeout) # timeout in seconds + if self.verbose: self.logger.info( 'Successfully get response from OpenAI API') diff --git a/opencompass/summarizers/subjective/compassbench.py b/opencompass/summarizers/subjective/compassbench.py index 67c01243..7ed1ee53 100644 --- a/opencompass/summarizers/subjective/compassbench.py +++ b/opencompass/summarizers/subjective/compassbench.py @@ -34,39 +34,29 @@ MAP = { '总分', '中文总分', '英文总分', - 'instruct/compassbenchv1_4_IF_en_fofo_sub', - 'instruct/compassbenchv1_4_IF_zh_fofo_sub', + 'instruct/compassbench_2501_IF_en_chatIF_sub', + 'instruct/compassbench_2501_IF_en_functionalIF_sub', + 'instruct/compassbench_2501_IF_cn_chatIF_sub', + 'instruct/compassbench_2501_IF_cn_functionalIF_sub', ], 'language': [ '总分', '中文总分', '英文总分', - 'language/compassbenchv1_4_language_zh_chat_sub', - 'language/compassbenchv1_4_language_zh_creation_sub', - 'language/compassbenchv1_4_language_zh_NLP_sub', - 'language/compassbenchv1_4_language_en_chat_sub', - 'language/compassbenchv1_4_language_en_creation_sub', - 'language/compassbenchv1_4_language_en_NLP_sub', + 'language/compassbench_v2501_language_zh_chat_sub', + 'language/compassbench_v2501_language_zh_nlp_sub', + 'language/compassbench_v2501_language_zh_creation_sub', + 'language/compassbench_v2501_language_en_chat_sub', + 'language/compassbench_v2501_language_en_nlp_sub', + 'language/compassbench_v2501_language_en_creation_sub', ], - 'reasoning': [ + + 'code': [ '总分', '中文总分', '英文总分', - 'reasoning/compassbenchv1_4_reasoning_en_CommonSenseSense_sub', - 'reasoning/compassbenchv1_4_reasoning_en_Humanities_sub', - 'reasoning/compassbenchv1_4_reasoning_en_ScienceEngineering_sub', - 'reasoning/compassbenchv1_4_reasoning_en_Social_sub', - 'reasoning/compassbenchv1_4_reasoning_zh_CommonSenseSense_sub', - 'reasoning/compassbenchv1_4_reasoning_zh_Humanities_sub', - 'reasoning/compassbenchv1_4_reasoning_zh_ScienceEngineering_sub', - 'reasoning/compassbenchv1_4_reasoning_zh_Social_sub', - ], - 'coding': [ - '总分', - '中文总分', - '英文总分', - 'coding/compassbenchv1_4_coding_en_sub', - 'coding/compassbenchv1_4_coding_zh_sub', + 'code/compassbench_2501_code_arena_en_sub', + 'code/compassbench_2501_code_arena_zh_sub', ], } From 046b6f75c6ee0ff2f583b30b6f39d73b52929f56 Mon Sep 17 00:00:00 2001 From: Junnan Liu Date: Thu, 20 Feb 2025 19:47:04 +0800 Subject: [PATCH 8/9] [Update] Update Greedy Config & README of LiveMathBench (#1862) * support omni-math * update config * upload README * Delete opencompass/configs/datasets/omni_math/__init__.py * update greedy config & README of LiveMathBench * update intro for max_out_len * rename livemathbench greedy confi * 
delete greedy config --------- Co-authored-by: liushz --- .../configs/datasets/livemathbench/README.md | 71 +++++++++---------- .../livemathbench/livemathbench_greedy_gen.py | 4 ++ ....py => livemathbench_greedy_gen_9befbf.py} | 10 +-- 3 files changed, 43 insertions(+), 42 deletions(-) create mode 100644 opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py rename opencompass/configs/datasets/livemathbench/{livemathbench_greedy_gen_efb20d.py => livemathbench_greedy_gen_9befbf.py} (83%) diff --git a/opencompass/configs/datasets/livemathbench/README.md b/opencompass/configs/datasets/livemathbench/README.md index 84490c94..24949f20 100644 --- a/opencompass/configs/datasets/livemathbench/README.md +++ b/opencompass/configs/datasets/livemathbench/README.md @@ -1,36 +1,30 @@ # LiveMathBench -## Details of Datsets +## v202412 + +### Details of Datsets | dataset | language | #single-choice | #multiple-choice | #fill-in-the-blank | #problem-solving | | -- | -- | -- | -- | -- | -- | -| AIMC | cn | 0 | 0 | 0 | 46 | -| AIMC | en | 0 | 0 | 0 | 46 | -| CEE | cn | 0 | 0 | 13 | 40 | -| CEE | en | 0 | 0 | 13 | 40 | -| CMO | cn | 0 | 0 | 0 | 18 | -| CMO | en | 0 | 0 | 0 | 18 | -| MATH500 | en | 0 | 0 | 0 | 500 | -| AIME2024 | en | 0 | 0 | 0 | 44 | +| AMC | cn | 0 | 0 | 0 | 46 | +| AMC | en | 0 | 0 | 0 | 46 | +| CCEE | cn | 0 | 0 | 13 | 31 | +| CCEE | en | 0 | 0 | 13 | 31 | +| CNMO | cn | 0 | 0 | 0 | 18 | +| CNMO | en | 0 | 0 | 0 | 18 | +| WLPMC | cn | 0 | 0 | 0 | 11 | +| WLPMC | en | 0 | 0 | 0 | 11 | -## How to use - +### How to use +#### G-Pass@k ```python from mmengine.config import read_base with read_base(): - from opencompass.datasets.livemathbench import livemathbench_datasets + from opencompass.datasets.livemathbench_gen import livemathbench_datasets -livemathbench_datasets[0].update( - { - 'abbr': 'livemathbench_${k}x${n}' - 'path': '/path/to/data/dir', - 'k': 'k@pass', # the max value of k in k@pass - 'n': 'number of runs', # number of runs - } -) livemathbench_datasets[0]['eval_cfg']['evaluator'].update( { 'model_name': 'Qwen/Qwen2.5-72B-Instruct', @@ -40,38 +34,41 @@ livemathbench_datasets[0]['eval_cfg']['evaluator'].update( ] # set url of evaluation models } ) +livemathbench_dataset['infer_cfg']['inferencer'].update(dict( + max_out_len=32768 # for o1-like models you need to update max_out_len +)) ``` -> ❗️ At present, `extract_from_boxed` is used to extract answers from model responses, and one can also leverage LLM for extracting through the following parameters, but this part of the code has not been tested. - +#### Greedy ```python +from mmengine.config import read_base + +with read_base(): + from opencompass.datasets.livemathbench_greedy_gen import livemathbench_datasets + livemathbench_datasets[0]['eval_cfg']['evaluator'].update( { 'model_name': 'Qwen/Qwen2.5-72B-Instruct', 'url': [ 'http://0.0.0.0:23333/v1', '...' - ], # set url of evaluation models - - # for LLM-based extraction - 'use_extract_model': True, - 'post_model_name': 'oc-extractor', - 'post_url': [ - 'http://0.0.0.0:21006/v1, - '...' 
- ] + ] # set url of evaluation models } ) +livemathbench_dataset['infer_cfg']['inferencer'].update(dict( + max_out_len=32768 # for o1-like models you need to update max_out_len +)) + ``` -## Output Samples +### Output Samples | dataset | version | metric | mode | Qwen2.5-72B-Instruct | |----- | ----- | ----- | ----- | -----| -| LiveMathBench | caed8f | 1@pass | gen | 26.07 | -| LiveMathBench | caed8f | 1@pass/std | gen | xx.xx | -| LiveMathBench | caed8f | 2@pass | gen | xx.xx | -| LiveMathBench | caed8f | 2@pass/std | gen | xx.xx | -| LiveMathBench | caed8f | pass-rate | gen | xx.xx | +| LiveMathBench | 9befbf | G-Pass@16_0.0 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_0.25 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_0.5 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_0.75 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_1.0 | gen | xx.xx | diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py new file mode 100644 index 00000000..d311eeaf --- /dev/null +++ b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .livemathbench_greedy_gen_efb20d import livemathbench_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_efb20d.py b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py similarity index 83% rename from opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_efb20d.py rename to opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py index d6acd7c0..d8d8b79c 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_efb20d.py +++ b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py @@ -6,15 +6,15 @@ from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBen livemathbench_dataset = dict( - abbr='LiveMathBench-v202412-greedy', # If you change the K and replication, you need to change the dataset name. 
type=LiveMathBenchDataset, - path='opencompass/LiveMathBench', + path='', k=1, replication=1, dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'], dataset_languages=['cn', 'en'], - cot=False, + cot=True, version='202412', + abbr='LiveMathBench-v202412', reader_cfg=dict( input_columns=['prompt'], output_column='answer' @@ -31,7 +31,7 @@ livemathbench_dataset = dict( retriever=dict(type=ZeroRetriever), inferencer=dict( type=GenInferencer, - max_out_len=16384, + max_out_len=8192 ), ), eval_cfg=dict( @@ -44,7 +44,7 @@ livemathbench_dataset = dict( extract_model_name='', k=[1], replication=1, - thresholds=[0.0, 0.25, 0.5, 0.75, 1.0] + thresholds=[0.0] ) ) ) From 465e93e10e633183d4ec24a547e386927fd6e559 Mon Sep 17 00:00:00 2001 From: Dongsheng Zhu <59612926+Zhudongsheng75@users.noreply.github.com> Date: Mon, 24 Feb 2025 15:45:24 +0800 Subject: [PATCH 9/9] [Update] Academic bench llm judge update (#1876) * BigCodeBench update * update LCBench * update LCBench 2 * update code * academicBench update * academic bench ifeval&math update * generic_llmjudge_aime_academic_postprocess delete * aime delete * postprocessors update * ifeval delete * update work_dir * linting * linting double-quote-string-fixer * r1-distill out_len update * fix lint --------- Co-authored-by: MaiziXiao --- examples/eval_academic_leaderboard_202502.py | 137 +++++++++++++ ...0shot_nocot_genericllmeval_academic_gen.py | 98 +++++++++ .../bbh/bbh_0shot_nocot_academic_gen.py | 189 ++++++++++++++++++ ...math_prm800k_500_0shot_cot_academic_gen.py | 100 +++++++++ .../hf_deepseek_r1_distill_llama_70b.py | 14 ++ .../hf_deepseek_r1_distill_llama_8b.py | 14 ++ .../hf_deepseek_r1_distill_qwen_14b.py | 14 ++ .../hf_deepseek_r1_distill_qwen_1_5b.py | 14 ++ .../hf_deepseek_r1_distill_qwen_32b.py | 14 ++ .../hf_deepseek_r1_distill_qwen_7b.py | 14 ++ .../lmdeploy_deepseek_r1_distill_llama_70b.py | 20 ++ .../lmdeploy_deepseek_r1_distill_llama_8b.py | 20 ++ .../lmdeploy_deepseek_r1_distill_qwen_14b.py | 20 ++ .../lmdeploy_deepseek_r1_distill_qwen_1_5b.py | 20 ++ .../lmdeploy_deepseek_r1_distill_qwen_32b.py | 20 ++ .../lmdeploy_deepseek_r1_distill_qwen_7b.py | 20 ++ opencompass/datasets/generic.py | 29 ++- 17 files changed, 755 insertions(+), 2 deletions(-) create mode 100644 examples/eval_academic_leaderboard_202502.py create mode 100644 opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_academic_gen.py create mode 100644 opencompass/configs/datasets/bbh/bbh_0shot_nocot_academic_gen.py create mode 100644 opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_academic_gen.py create mode 100644 opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_70b.py create mode 100644 opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_8b.py create mode 100644 opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_14b.py create mode 100644 opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_1_5b.py create mode 100644 opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_32b.py create mode 100644 opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_7b.py create mode 100644 opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_70b.py create mode 100644 opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_8b.py create mode 100644 opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_14b.py create mode 100644 opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_1_5b.py create mode 100644 
opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_32b.py create mode 100644 opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_7b.py diff --git a/examples/eval_academic_leaderboard_202502.py b/examples/eval_academic_leaderboard_202502.py new file mode 100644 index 00000000..231e9a9f --- /dev/null +++ b/examples/eval_academic_leaderboard_202502.py @@ -0,0 +1,137 @@ +# flake8: noqa + +from mmengine.config import read_base + +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.runners import LocalRunner, VOLCRunner +from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask + +####################################################################### +# PART 0 Essential Configs # +####################################################################### +with read_base(): + # Datasets Part + # Knowledge + # Math + from opencompass.configs.datasets.aime2024.aime2024_0shot_nocot_genericllmeval_academic_gen import \ + aime2024_datasets + from opencompass.configs.datasets.bbh.bbh_0shot_nocot_academic_gen import \ + bbh_datasets + # General Reasoning + from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \ + gpqa_datasets + from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \ + humaneval_datasets + # Instruction Following + from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \ + ifeval_datasets + from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import \ + LCBCodeGeneration_dataset + from opencompass.configs.datasets.math.math_prm800k_500_0shot_cot_gen import \ + math_datasets + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \ + mmlu_pro_datasets + # Model List + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ + models as hf_internlm2_5_7b_chat_model + # Summary Groups + from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups + from opencompass.configs.summarizers.groups.mmlu_pro import \ + mmlu_pro_summary_groups + +####################################################################### +# PART 1 Datasets List # +####################################################################### +# datasets list for evaluation +# Only take LCB generation for evaluation +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), + []) + [LCBCodeGeneration_dataset] + +# LLM judge config: using LLM to evaluate predictions +judge_cfg = dict() +for dataset in datasets: + dataset['infer_cfg']['inferencer']['max_out_len'] = 32768 + if 'judge_cfg' in dataset['eval_cfg']['evaluator']: + dataset['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg + + +####################################################################### +# PART 2 Datset Summarizer # +####################################################################### + +core_summary_groups = [ + { + 'name': + 'core_average', + 'subsets': [ + ['IFEval', 'Prompt-level-strict-accuracy'], + ['bbh', 'naive_average'], + ['math_prm800k_500', 'accuracy'], + ['aime2024', 'accuracy'], + ['GPQA_diamond', 'accuracy'], + ['mmlu_pro', 'naive_average'], + ['openai_humaneval', 'humaneval_pass@1'], + ['lcb_code_generation', 'pass@1'], + ], + }, +] + +summarizer = dict( + dataset_abbrs=[ + ['core_average', 'naive_average'], + '', + 'Instruction Following', + ['IFEval', 'Prompt-level-strict-accuracy'], + '', + 'General Reasoning', + ['bbh', 'naive_average'], + ['GPQA_diamond', 'accuracy'], + '', + 
'Math Calculation', + ['math_prm800k_500', 'accuracy'], + ['aime2024', 'accuracy'], + '', + 'Knowledge', + ['mmlu_pro', 'naive_average'], + '', + 'Code', + ['openai_humaneval', 'humaneval_pass@1'], + ['lcb_code_generation', 'pass@1'], + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) + +####################################################################### +# PART 3 Models List # +####################################################################### + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +####################################################################### +# PART 4 Inference/Evaluation Configuaration # +####################################################################### + +# Local Runner +infer = dict( + partitioner=dict(type=NumWorkerPartitioner, num_worker=8), + runner=dict( + type=LocalRunner, + max_num_workers=16, + retry=0, # Modify if needed + task=dict(type=OpenICLInferTask), + ), +) + +# eval with local runner +eval = dict( + partitioner=dict(type=NaivePartitioner, n=10), + runner=dict(type=LocalRunner, + max_num_workers=16, + task=dict(type=OpenICLEvalTask)), +) + +####################################################################### +# PART 5 Utils Configuaration # +####################################################################### +work_dir = './outputs/oc_academic_202502' diff --git a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_academic_gen.py b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_academic_gen.py new file mode 100644 index 00000000..30da2b98 --- /dev/null +++ b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_academic_gen.py @@ -0,0 +1,98 @@ +# flake8: noqa + +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import Aime2024Dataset +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets.generic import generic_llmjudge_academic_postprocess + + +aime2024_reader_cfg = dict( + input_columns=['question'], + output_column='answer' +) + + +aime2024_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', + prompt='{question}\nRemember to put your final answer within \\boxed{}.'), + ], + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048) +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. 
Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{question}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + + +aime2024_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt=GRADER_TEMPLATE), + ]), + ), + dataset_cfg=dict( + type=Aime2024Dataset, + path='opencompass/aime2024', + reader_cfg=aime2024_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, + metric_name='accuracy'), + ), + pred_role='BOT', +) + +aime2024_datasets = [ + dict( + abbr='aime2024', + type=Aime2024Dataset, + path='opencompass/aime2024', + reader_cfg=aime2024_reader_cfg, + infer_cfg=aime2024_infer_cfg, + eval_cfg=aime2024_eval_cfg, + mode='singlescore', + ) +] diff --git a/opencompass/configs/datasets/bbh/bbh_0shot_nocot_academic_gen.py b/opencompass/configs/datasets/bbh/bbh_0shot_nocot_academic_gen.py new file mode 100644 index 00000000..f0698689 --- /dev/null +++ b/opencompass/configs/datasets/bbh/bbh_0shot_nocot_academic_gen.py @@ -0,0 +1,189 @@ +# flake8: noqa + +import os +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import BBHDataset +from opencompass.datasets.generic import generic_llmjudge_academic_postprocess + + +bbh_reader_cfg = dict(input_columns=['input'], output_column='target') + +bbh_multiple_choice_sets = [ + 'temporal_sequences', + 'disambiguation_qa', + 'date_understanding', + 'tracking_shuffled_objects_three_objects', + 'penguins_in_a_table', + 'geometric_shapes', + 'snarks', + 'ruin_names', + 'tracking_shuffled_objects_seven_objects', + 'tracking_shuffled_objects_five_objects', + 'logical_deduction_three_objects', + 'hyperbaton', + 'logical_deduction_five_objects', + 'logical_deduction_seven_objects', + 'movie_recommendation', + 
'salient_translation_error_detection', + 'reasoning_about_colored_objects', +] +bbh_free_form_sets = [ + 'multistep_arithmetic_two', + 'navigate', + 'dyck_languages', + 'word_sorting', + 'sports_understanding', + 'boolean_expressions', + 'object_counting', + 'formal_fallacies', + 'causal_judgement', + 'web_of_lies', +] + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + + : \n{input}\n\n\n + : \n{target}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + + +bbh_sets = bbh_multiple_choice_sets + bbh_free_form_sets + +# For zero shot inference in bbh +bbh_datasets = [] +for _name in bbh_sets: + bbh_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' " + ) + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + + bbh_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt=GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=BBHDataset, + name=_name, + path='opencompass/bbh', + reader_cfg=bbh_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'), + ), + pred_role='BOT', + ) + + bbh_datasets.append( + dict( + type=BBHDataset, + path='opencompass/bbh', + name=_name, + abbr='bbh-' + _name, + reader_cfg=bbh_reader_cfg, + infer_cfg=bbh_infer_cfg.copy(), + eval_cfg=bbh_eval_cfg.copy()) + ) + + +# For original 3 shot inference in bbh +bbh_3_shot_datasets = [] +for _name in bbh_sets: + with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f: + _hint = f.read() + bbh_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step." 
+ ) + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + + bbh_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt=GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=BBHDataset, + name=_name, + path='opencompass/bbh', + reader_cfg=bbh_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'), + ), + pred_role='BOT', + ) + + bbh_3_shot_datasets.append( + dict( + type=BBHDataset, + path='opencompass/bbh', + name=_name, + abbr='bbh-' + _name, + reader_cfg=bbh_reader_cfg, + infer_cfg=bbh_infer_cfg.copy(), + eval_cfg=bbh_eval_cfg.copy())) diff --git a/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_academic_gen.py b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_academic_gen.py new file mode 100644 index 00000000..c23bc136 --- /dev/null +++ b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_academic_gen.py @@ -0,0 +1,100 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + MATHDataset, + MATHEvaluator, + math_postprocess_v2, + normalize_final_answer, +) +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_academic_postprocess + + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. 
Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess), + ), + pred_role='BOT', +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500', + path='opencompass/math', + file_name='test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + mode='singlescore', + ) +] diff --git a/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_70b.py b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_70b.py new file mode 100644 index 00000000..15ac9f90 --- /dev/null +++ b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_70b.py @@ -0,0 +1,14 @@ +from opencompass.models import HuggingFacewithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='deepseek-r1-distill-llama-70b-hf', + path='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', + max_out_len=16384, + batch_size=8, + run_cfg=dict(num_gpus=8), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_8b.py b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_8b.py new file mode 100644 index 00000000..937c8bb2 --- /dev/null +++ b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_8b.py @@ -0,0 +1,14 @@ +from opencompass.models import HuggingFacewithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='deepseek-r1-distill-llama-8b-hf', + path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', + max_out_len=16384, + batch_size=8, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_14b.py 
b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_14b.py new file mode 100644 index 00000000..d4c97023 --- /dev/null +++ b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_14b.py @@ -0,0 +1,14 @@ +from opencompass.models import HuggingFacewithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='deepseek-r1-distill-qwen-14b-hf', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', + max_out_len=16384, + batch_size=16, + run_cfg=dict(num_gpus=4), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_1_5b.py b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_1_5b.py new file mode 100644 index 00000000..aa12591e --- /dev/null +++ b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_1_5b.py @@ -0,0 +1,14 @@ +from opencompass.models import HuggingFacewithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='deepseek-r1-distill-qwen-1.5b-hf', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', + max_out_len=16384, + batch_size=8, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_32b.py b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_32b.py new file mode 100644 index 00000000..d62d8085 --- /dev/null +++ b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_32b.py @@ -0,0 +1,14 @@ +from opencompass.models import HuggingFacewithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='deepseek-r1-distill-qwen-32b-hf', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', + max_out_len=16384, + batch_size=8, + run_cfg=dict(num_gpus=4), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_7b.py b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_7b.py new file mode 100644 index 00000000..9898843a --- /dev/null +++ b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_7b.py @@ -0,0 +1,14 @@ +from opencompass.models import HuggingFacewithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='deepseek-r1-distill-qwen-7b-hf', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', + max_out_len=16384, + batch_size=8, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_70b.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_70b.py new file mode 100644 index 00000000..1471be9b --- /dev/null +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_70b.py @@ -0,0 +1,20 @@ +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-llama-70b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', + engine_config=dict(session_len=32768, 
max_batch_size=8, tp=8), + gen_config=dict(top_k=1, + temperature=1e-6, + top_p=0.9, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=8, + run_cfg=dict(num_gpus=8), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_8b.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_8b.py new file mode 100644 index 00000000..46b521f2 --- /dev/null +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_8b.py @@ -0,0 +1,20 @@ +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-llama-8b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', + engine_config=dict(session_len=32768, max_batch_size=8, tp=1), + gen_config=dict(top_k=1, + temperature=1e-6, + top_p=0.9, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=8, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_14b.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_14b.py new file mode 100644 index 00000000..401299cd --- /dev/null +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_14b.py @@ -0,0 +1,20 @@ +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-14b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', + engine_config=dict(session_len=32768, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, + temperature=1e-6, + top_p=0.9, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=16, + run_cfg=dict(num_gpus=2), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_1_5b.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_1_5b.py new file mode 100644 index 00000000..d19ace4e --- /dev/null +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_1_5b.py @@ -0,0 +1,20 @@ +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-1_5b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', + engine_config=dict(session_len=32768, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, + temperature=1e-6, + top_p=0.9, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=16, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_32b.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_32b.py new file mode 100644 index 00000000..2ddef1d4 --- /dev/null +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_32b.py @@ -0,0 +1,20 @@ +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import 
extract_non_reasoning_content + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-32b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', + engine_config=dict(session_len=32768, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, + temperature=1e-6, + top_p=0.9, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=16, + run_cfg=dict(num_gpus=4), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_7b.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_7b.py new file mode 100644 index 00000000..69f9e50f --- /dev/null +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_7b.py @@ -0,0 +1,20 @@ +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-7b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', + engine_config=dict(session_len=32768, max_batch_size=8, tp=1), + gen_config=dict(top_k=1, + temperature=1e-6, + top_p=0.9, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=8, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/datasets/generic.py b/opencompass/datasets/generic.py index 867ba61f..28a37a02 100644 --- a/opencompass/datasets/generic.py +++ b/opencompass/datasets/generic.py @@ -1,7 +1,10 @@ import re -def get_final_results(judged_answers, references, origial_responses): +def get_final_results(judged_answers, + references, + origial_responses, + metric_name='accuracy'): count = 0 is_correct_count = 0 is_incorrect_count = 0 @@ -39,7 +42,7 @@ def get_final_results(judged_answers, references, origial_responses): is_correct) > 0 else 0 result = { # 'accuracy_given_attempted': accuracy_given_attempted, - 'accuracy': accuracy_given_attempted * 100, + metric_name: accuracy_given_attempted * 100, 'f1': f1, 'details': details } @@ -69,3 +72,25 @@ def generic_llmjudge_postprocess( results = get_final_results(judged_answers, references, origial_responses) results['details'] = output return results + + +def generic_llmjudge_academic_postprocess( + output: dict, + output_path: str, + metric_name: str = 'accuracy', +) -> dict: + judged_answers = [] + origial_responses = [] + references = [] + for k, v in output.items(): + origial_responses.append(v['prediction']) + processed_judge = _generic_llmjudge_postprocess(v['prediction']) + if processed_judge is not None: + judged_answers.append(processed_judge) + references.append(v['gold']) + results = get_final_results(judged_answers, references, origial_responses, + metric_name) + results['details'] = output + # For academic summarizer + results.pop('f1', None) + return results
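
Note on the postprocessor added in the generic.py hunk above: judging from the diff, `generic_llmjudge_academic_postprocess` differs from the existing `generic_llmjudge_postprocess` only in passing a configurable `metric_name` down to `get_final_results` and popping the `f1` field that the academic summarizer does not consume. That configurable key is what lets the configs earlier in this series report `accuracy` for AIME2024 and `score` for the BBH subsets while sharing one implementation. The snippet below is a minimal, self-contained sketch of that flow for readers who want to see it end to end. It is a toy illustration, not the OpenCompass code: `parse_judge_verdict`, `toy_academic_postprocess`, and the sample records are hypothetical, and the A/B parsing is a simplified stand-in for the library's internal `_generic_llmjudge_postprocess`.

import re


def parse_judge_verdict(prediction: str):
    """Return 'A' (correct) or 'B' (incorrect) if the judge reply contains one, else None.

    Simplified assumption: the judge answers with a bare letter, as the
    GRADER_TEMPLATE above requests ("Just return the letters 'A' or 'B'").
    """
    match = re.search(r'\b([AB])\b', prediction)
    return match.group(1) if match else None


def toy_academic_postprocess(output: dict, metric_name: str = 'accuracy') -> dict:
    """Summarize per-sample judge verdicts into one percentage under `metric_name`."""
    verdicts = []
    for record in output.values():
        verdict = parse_judge_verdict(record['prediction'])
        if verdict is not None:
            verdicts.append(verdict)
    correct = sum(1 for v in verdicts if v == 'A')
    score = 100.0 * correct / len(verdicts) if verdicts else 0.0
    # Mirror the patched behaviour in spirit: report under a configurable key,
    # keep the raw judge outputs as 'details', and omit the 'f1' field that
    # only the non-academic summarizer expects.
    return {metric_name: score, 'details': output}


if __name__ == '__main__':
    # Hypothetical judge outputs keyed by sample id, shaped like the
    # {'prediction': ..., 'gold': ...} records the real postprocessor iterates over.
    sample = {
        '0': {'prediction': 'A', 'gold': '42'},
        '1': {'prediction': 'B', 'gold': '7'},
    }
    print(toy_academic_postprocess(sample, metric_name='score'))
    # Prints {'score': 50.0, 'details': {...}}

In a dataset config this would correspond to the pattern already used above, e.g. dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='accuracy') for AIME2024 versus metric_name='score' for the BBH entries.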