From 68a9838907ac2f20e59aa50cf771c43c960bc6c7 Mon Sep 17 00:00:00 2001 From: Myhs_phz Date: Fri, 14 Feb 2025 16:17:30 +0800 Subject: [PATCH 01/58] [Feature] Add list of supported datasets at html page (#1850) * feat dataset-index.yml and stat.py * fix * fix * fix * feat url of paper and config file * doc all supported dataset list * docs zh and en * docs README zh and en * docs new_dataset * docs new_dataset --- README.md | 258 +------- README_zh-CN.md | 258 +------- dataset-index.yml | 734 ++++++++++++++++++++++ docs/en/_static/js/custom.js | 16 +- docs/en/advanced_guides/new_dataset.md | 12 + docs/en/conf.py | 8 + docs/en/index.rst | 7 + docs/en/statis.py | 76 +++ docs/zh_cn/_static/js/custom.js | 16 +- docs/zh_cn/advanced_guides/new_dataset.md | 12 + docs/zh_cn/conf.py | 1 + docs/zh_cn/index.rst | 7 + docs/zh_cn/statis.py | 75 +++ 13 files changed, 965 insertions(+), 515 deletions(-) create mode 100644 dataset-index.yml create mode 100755 docs/en/statis.py create mode 100755 docs/zh_cn/statis.py diff --git a/README.md b/README.md index 887fcb4c..736968eb 100644 --- a/README.md +++ b/README.md @@ -279,263 +279,13 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide ## 📖 Dataset Support - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-[Removed: HTML table listing the supported datasets by category]
-  Language: Word Definition (WiC, SummEdits); Idiom Learning (CHID); Semantic Similarity (AFQMC, BUSTM); Coreference Resolution (CLUEWSC, WSC, WinoGrande); Translation (Flores, IWSLT2017); Multi-language Question Answering (TyDi-QA, XCOPA); Multi-language Summary (XLSum)
-  Knowledge: Knowledge Question Answering (BoolQ, CommonSenseQA, NaturalQuestions, TriviaQA)
-  Reasoning: Textual Entailment (CMNLI, OCNLI, OCNLI_FC, AX-b, AX-g, CB, RTE, ANLI); Commonsense Reasoning (StoryCloze, COPA, ReCoRD, HellaSwag, PIQA, SIQA); Mathematical Reasoning (MATH, GSM8K); Theorem Application (TheoremQA, StrategyQA, SciBench); Comprehensive Reasoning (BBH)
-  Examination: Junior High, High School, University, Professional Examinations (C-Eval, AGIEval, MMLU, GAOKAO-Bench, CMMLU, ARC, Xiezhi); Medical Examinations (CMB)
-  Understanding: Reading Comprehension (C3, CMRC, DRCD, MultiRC, RACE, DROP, OpenBookQA, SQuAD2.0); Content Summary (CSL, LCSTS, XSum, SummScreen); Content Analysis (EPRSTMT, LAMBADA, TNEWS)
-  Long Context: Long Context Understanding (LEval, LongBench, GovReports, NarrativeQA, Qasper)
-  Safety: Safety (CivilComments, CrowsPairs, CValues, JigsawMultilingual, TruthfulQA); Robustness (AdvGLUE)
-  Code: Code (HumanEval, HumanEvalX, MBPP, APPs, DS1000)
+We have supported a statistical list of all datasets that can be used on this platform in the documentation on the OpenCompass website.
+
+You can quickly find the dataset you need from the list through sorting, filtering, and searching functions.
+
+Please refer to the dataset statistics chapter of [official document](https://opencompass.org.cn/doc) for details.
+

🔝Back to top

## 📖 Model Support

diff --git a/README_zh-CN.md b/README_zh-CN.md
index 5c889956..8d8ecd02 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -274,263 +274,11 @@ OpenCompass 是面向大模型评测的一站式平台。其主要特点如下

 ## 📖 数据集支持

-[Removed: HTML table listing the supported datasets by category]
-  语言:字词释义(WiC、SummEdits);成语习语(CHID);语义相似度(AFQMC、BUSTM);指代消解(CLUEWSC、WSC、WinoGrande);翻译(Flores、IWSLT2017);多语种问答(TyDi-QA、XCOPA);多语种总结(XLSum)
-  知识:知识问答(BoolQ、CommonSenseQA、NaturalQuestions、TriviaQA)
-  推理:文本蕴含(CMNLI、OCNLI、OCNLI_FC、AX-b、AX-g、CB、RTE、ANLI);常识推理(StoryCloze、COPA、ReCoRD、HellaSwag、PIQA、SIQA);数学推理(MATH、GSM8K);定理应用(TheoremQA、StrategyQA、SciBench);综合推理(BBH)
-  考试:初中/高中/大学/职业考试(C-Eval、AGIEval、MMLU、GAOKAO-Bench、CMMLU、ARC、Xiezhi);医学考试(CMB)
-  理解:阅读理解(C3、CMRC、DRCD、MultiRC、RACE、DROP、OpenBookQA、SQuAD2.0);内容总结(CSL、LCSTS、XSum、SummScreen);内容分析(EPRSTMT、LAMBADA、TNEWS)
-  长文本:长文本理解(LEval、LongBench、GovReports、NarrativeQA、Qasper)
-  安全:安全(CivilComments、CrowsPairs、CValues、JigsawMultilingual、TruthfulQA);健壮性(AdvGLUE)
-  代码:代码(HumanEval、HumanEvalX、MBPP、APPs、DS1000)
+我们已经在OpenCompass官网的文档中支持了所有可在本平台上使用的数据集的统计列表。
+
+您可以通过排序、筛选和搜索等功能从列表中快速找到您需要的数据集。
+
+详情请参阅 [官方文档](https://opencompass.org.cn/doc) 的数据集统计章节。

🔝返回顶部
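The `dataset-index.yml` file introduced below is the single source for this list: the `docs/en/statis.py` and `docs/zh_cn/statis.py` scripts added later in this patch load it at documentation build time (via a `builder-inited` hook registered in `conf.py`) and write `dataset_statistics.md`. The sketch below condenses that flow for reference; it is an illustration of the mechanism, not code taken from the patch itself.

```python
# Sketch: how the statis.py scripts in this patch turn dataset-index.yml
# into the sortable dataset table shown on the docs site.
import yaml
from tabulate import tabulate

GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/'

with open('dataset-index.yml', 'r') as f:
    entries = yaml.load(f, Loader=yaml.FullLoader)

rows = []
for item in entries:  # each item is {key: {name, category, paper, configpath}}
    for meta in item.values():
        paths = meta['configpath']
        if isinstance(paths, str):  # configpath may be a single path or a list (e.g. ARC)
            paths = [paths]
        config_links = ' / '.join(f'[link]({GITHUB_PREFIX}{p})' for p in paths)
        rows.append([meta['name'], meta['category'],
                     f"[link]({meta['paper']})", config_links])

headers = ['Name', 'Category', 'Paper or Repository', 'Config File']
table_md = tabulate(rows, headers, tablefmt='pipe')

# statis.py wraps this output in a MyST table directive with ":class: dataset",
# which the DataTables snippet in docs/*/_static/js/custom.js upgrades to a
# searchable, sortable, paginated table.
with open('dataset_statistics.md', 'a') as f:
    f.write(table_md + '\n')
```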

diff --git a/dataset-index.yml b/dataset-index.yml new file mode 100644 index 00000000..9fbde8bd --- /dev/null +++ b/dataset-index.yml @@ -0,0 +1,734 @@ + +- ifeval: + name: IFEval + category: Instruction Following + paper: https://arxiv.org/pdf/2311.07911 + configpath: opencompass/configs/datasets/IFEval +- nphard: + name: NPHardEval + category: Reasoning + paper: https://arxiv.org/pdf/2312.14890v2 + configpath: opencompass/configs/datasets/NPHardEval +- pmmeval: + name: PMMEval + category: Language + paper: https://arxiv.org/pdf/2411.09116v1 + configpath: opencompass/configs/datasets/PMMEval +- theoremqa: + name: TheroremQA + category: Reasoning + paper: https://arxiv.org/pdf/2305.12524 + configpath: opencompass/configs/datasets/TheroremQA +- agieval: + name: AGIEval + category: Examination + paper: https://arxiv.org/pdf/2304.06364 + configpath: opencompass/configs/datasets/agieval +- babilong: + name: BABILong + category: Long Context + paper: https://arxiv.org/pdf/2406.10149 + configpath: opencompass/configs/datasets/babilong +- bigcodebench: + name: BigCodeBench + category: Code + paper: https://arxiv.org/pdf/2406.15877 + configpath: opencompass/configs/datasets/bigcodebench +- calm: + name: CaLM + category: Reasoning + paper: https://arxiv.org/pdf/2405.00622 + configpath: opencompass/configs/datasets/calm +- infinitebench: + name: InfiniteBench (∞Bench) + category: Long Context + paper: https://aclanthology.org/2024.acl-long.814.pdf + configpath: opencompass/configs/datasets/infinitebench +- korbench: + name: KOR-Bench + category: Reasoning + paper: https://arxiv.org/pdf/2410.06526v1 + configpath: opencompass/configs/datasets/korbench +- lawbench: + name: LawBench + category: Knowledge / Law + paper: https://arxiv.org/pdf/2309.16289 + configpath: opencompass/configs/datasets/lawbench +- leval: + name: L-Eval + category: Long Context + paper: https://arxiv.org/pdf/2307.11088v1 + configpath: opencompass/configs/datasets/leval +- livecodebench: + name: LiveCodeBench + category: Code + paper: https://arxiv.org/pdf/2403.07974 + configpath: opencompass/configs/datasets/livecodebench +- livemathbench: + name: LiveMathBench + category: Math + paper: https://arxiv.org/pdf/2412.13147 + configpath: opencompass/configs/datasets/livemathbench +- longbench: + name: LongBench + category: Long Context + paper: https://github.com/THUDM/LongBench + configpath: opencompass/configs/datasets/livemathbench +- lveval: + name: LV-Eval + category: Long Context + paper: https://arxiv.org/pdf/2402.05136 + configpath: opencompass/configs/datasets/lveval +- medbench: + name: MedBench + category: Knowledge / Medicine + paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138 + configpath: opencompass/configs/datasets/MedBench +- musr: + name: MuSR + category: Reasoning + paper: https://arxiv.org/pdf/2310.16049 + configpath: opencompass/configs/datasets/musr +- needlebench: + name: NeedleBench + category: Long Context + paper: https://arxiv.org/pdf/2407.11963 + configpath: opencompass/configs/datasets/needlebench +- ruler: + name: RULER + category: Long Context + paper: https://arxiv.org/pdf/2404.06654 + configpath: opencompass/configs/datasets/ruler +- alignment: + name: AlignBench + category: Subjective / Alignment + paper: https://arxiv.org/pdf/2311.18743 + configpath: opencompass/configs/datasets/subjective/alignbench +- alpaca: + name: AlpacaEval + category: Subjective / Instruction Following + paper: https://github.com/tatsu-lab/alpaca_eval + configpath: 
opencompass/configs/datasets/subjective/aplaca_eval +- arenahard: + name: Arena-Hard + category: Subjective / Chatbot + paper: https://lmsys.org/blog/2024-04-19-arena-hard/ + configpath: opencompass/configs/datasets/subjective/arena_hard +- flames: + name: FLAMES + category: Subjective / Alignment + paper: https://arxiv.org/pdf/2311.06899 + configpath: opencompass/configs/datasets/subjective/flames +- fofo: + name: FOFO + category: Subjective / Format Following + paper: https://arxiv.org/pdf/2402.18667 + configpath: opencompass/configs/datasets/subjective/fofo +- followbench: + name: FollowBench + category: Subjective / Instruction Following + paper: https://arxiv.org/pdf/2310.20410 + configpath: opencompass/configs/datasets/subjective/followbench +- hellobench: + name: HelloBench + category: Subjective / Long Context + paper: https://arxiv.org/pdf/2409.16191 + configpath: opencompass/configs/datasets/subjective/hellobench +- judgerbench: + name: JudgerBench + category: Subjective / Long Context + paper: https://arxiv.org/pdf/2410.16256 + configpath: opencompass/configs/datasets/subjective/judgerbench +- multiround: + name: MT-Bench-101 + category: Subjective / Multi-Round + paper: https://arxiv.org/pdf/2402.14762 + configpath: opencompass/configs/datasets/subjective/multiround +- wildbench: + name: WildBench + category: Subjective / Real Task + paper: https://arxiv.org/pdf/2406.04770 + configpath: opencompass/configs/datasets/subjective/wildbench +- teval: + name: T-Eval + category: Tool Utilization + paper: https://arxiv.org/pdf/2312.14033 + configpath: opencompass/configs/datasets/teval +- finalceiq: + name: FinanceIQ + category: Knowledge / Finance + paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ + configpath: opencompass/configs/datasets/FinanceIQ +- gaokaobench: + name: GAOKAOBench + category: Examination + paper: https://arxiv.org/pdf/2305.12474 + configpath: opencompass/configs/datasets/GaokaoBench +- lcbench: + name: LCBench + category: Code + paper: https://github.com/open-compass/CodeBench/ + configpath: opencompass/configs/datasets/LCBench +- MMLUArabic: + name: ArabicMMLU + category: Language + paper: https://arxiv.org/pdf/2402.12840 + configpath: opencompass/configs/datasets/MMLUArabic +- OpenFinData: + name: OpenFinData + category: Knowledge / Finance + paper: https://github.com/open-compass/OpenFinData + configpath: opencompass/configs/datasets/OpenFinData +- QuALITY: + name: QuALITY + category: Long Context + paper: https://arxiv.org/pdf/2112.08608 + configpath: opencompass/configs/datasets/QuALITY +- advglue: + name: Adversarial GLUE + category: Safety + paper: https://openreview.net/pdf?id=GF9cSKI3A_q + configpath: opencompass/configs/datasets/adv_glue +- afqmcd: + name: CLUE / AFQMC + category: Language + paper: https://arxiv.org/pdf/2004.05986 + configpath: opencompass/configs/datasets/CLUE_afqmc +- aime2024: + name: AIME2024 + category: Examination + paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024 + configpath: opencompass/configs/datasets/aime2024 +- anli: + name: Adversarial NLI + category: Reasoning + paper: https://arxiv.org/pdf/1910.14599v2 + configpath: opencompass/configs/datasets/anli +- anthropics_evals: + name: Anthropics Evals + category: Safety + paper: https://arxiv.org/pdf/2212.09251 + configpath: opencompass/configs/datasets/anthropics_evals +- apps: + name: APPS + category: Code + paper: https://arxiv.org/pdf/2105.09938 + configpath: opencompass/configs/datasets/apps +- arc: + name: ARC + category: Reasoning + paper: 
https://arxiv.org/pdf/1803.05457 + configpath: [opencompass/configs/datasets/ARC_c, opencompass/configs/datasets/ARC_e] +- arc_prize_public_eval: + name: ARC Prize + category: ARC-AGI + paper: https://arcprize.org/guide#private + configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation +- ax: + name: SuperGLUE / AX + category: Reasoning + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: [opencompass/configs/datasets/SuperGLUE_AX_b, opencompass/configs/datasets/SuperGLUE_AX_g] +- bbh: + name: BIG-Bench Hard + category: Reasoning + paper: https://arxiv.org/pdf/2210.09261 + configpath: opencompass/configs/datasets/bbh +- BoolQ: + name: SuperGLUE / BoolQ + category: Knowledge + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_BoolQ +- c3: + name: CLUE / C3 (C³) + category: Understanding + paper: https://arxiv.org/pdf/2004.05986 + configpath: opencompass/configs/datasets/CLUE_C3 +- cb: + name: SuperGLUE / CB + category: Reasoning + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_CB +- ceval: + name: C-EVAL + category: Examination + paper: https://arxiv.org/pdf/2305.08322v1 + configpath: opencompass/configs/datasets/ceval +- charm: + name: CHARM + category: Reasoning + paper: https://arxiv.org/pdf/2403.14112 + configpath: opencompass/configs/datasets/CHARM +- chembench: + name: ChemBench + category: Knowledge / Chemistry + paper: https://arxiv.org/pdf/2404.01475 + configpath: opencompass/configs/datasets/ChemBench +- chid: + name: FewCLUE / CHID + category: Language + paper: https://arxiv.org/pdf/2107.07498 + configpath: opencompass/configs/datasets/FewCLUE_chid +- chinese_simpleqa: + name: Chinese SimpleQA + category: Knowledge + paper: https://arxiv.org/pdf/2411.07140 + configpath: opencompass/configs/datasets/chinese_simpleqa +- cibench: + name: CIBench + category: Code + paper: https://www.arxiv.org/pdf/2407.10499 + configpath: opencompass/configs/datasets/CIBench +- civilcomments: + name: CivilComments + category: Safety + paper: https://arxiv.org/pdf/1903.04561 + configpath: opencompass/configs/datasets/civilcomments +- clozeTest_maxmin: + name: Cloze Test-max/min + category: Code + paper: https://arxiv.org/pdf/2102.04664 + configpath: opencompass/configs/datasets/clozeTest_maxmin +- cluewsc: + name: FewCLUE / CLUEWSC + category: Language / WSC + paper: https://arxiv.org/pdf/2107.07498 + configpath: opencompass/configs/datasets/FewCLUE_cluewsc +- cmb: + name: CMB + category: Knowledge / Medicine + paper: https://arxiv.org/pdf/2308.08833 + configpath: opencompass/configs/datasets/cmb +- cmmlu: + name: CMMLU + category: Understanding + paper: https://arxiv.org/pdf/2306.09212 + configpath: opencompass/configs/datasets/cmmlu +- cmnli: + name: CLUE / CMNLI + category: Reasoning + paper: https://arxiv.org/pdf/2004.05986 + configpath: opencompass/configs/datasets/CLUE_cmnli +- cmo_fib: + name: cmo_fib + category: Examination + paper: "" + configpath: opencompass/configs/datasets/cmo_fib +- cmrc: + name: CLUE / CMRC + category: Understanding + paper: https://arxiv.org/pdf/2004.05986 + configpath: opencompass/configs/datasets/CLUE_CMRC +- commonsenseqa: + name: CommonSenseQA + category: Knowledge + paper: https://arxiv.org/pdf/1811.00937v2 + configpath: 
opencompass/configs/datasets/commonsenseqa +- commonsenseqa_cn: + name: CommonSenseQA-CN + category: Knowledge + paper: "" + configpath: opencompass/configs/datasets/commonsenseqa_cn +- copa: + name: SuperGLUE / COPA + category: Reasoning + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_COPA +- crowspairs: + name: CrowsPairs + category: Safety + paper: https://arxiv.org/pdf/2010.00133 + configpath: opencompass/configs/datasets/crowspairs +- crowspairs_cn: + name: CrowsPairs-CN + category: Safety + paper: "" + configpath: opencompass/configs/datasets/crowspairs_cn +- cvalues: + name: CVALUES + category: Safety + paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf + configpath: opencompass/configs/datasets/cvalues +- drcd: + name: CLUE / DRCD + category: Understanding + paper: https://arxiv.org/pdf/2004.05986 + configpath: opencompass/configs/datasets/CLUE_DRCD +- drop: + name: DROP (DROP Simple Eval) + category: Understanding + paper: https://arxiv.org/pdf/1903.00161 + configpath: opencompass/configs/datasets/drop +- ds1000: + name: DS-1000 + category: Code + paper: https://arxiv.org/pdf/2211.11501 + configpath: opencompass/configs/datasets/ds1000 +- eprstmt: + name: FewCLUE / EPRSTMT + category: Understanding + paper: https://arxiv.org/pdf/2107.07498 + configpath: opencompass/configs/datasets/FewCLUE_eprstmt +- flores: + name: Flores + category: Language + paper: https://aclanthology.org/D19-1632.pdf + configpath: opencompass/configs/datasets/flores +- game24: + name: Game24 + category: Math + paper: https://huggingface.co/datasets/nlile/24-game + configpath: opencompass/configs/datasets/game24 +- govrepcrs: + name: Government Report Dataset + category: Long Context + paper: https://aclanthology.org/2021.naacl-main.112.pdf + configpath: opencompass/configs/datasets/govrepcrs +- gpqa: + name: GPQA + category: Knowledge + paper: https://arxiv.org/pdf/2311.12022v1 + configpath: opencompass/configs/datasets/gpqa +- gsm8k: + name: GSM8K + category: Math + paper: https://arxiv.org/pdf/2110.14168v2 + configpath: opencompass/configs/datasets/gsm8k +- gsm_hard: + name: GSM-Hard + category: Math + paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf + configpath: opencompass/configs/datasets/gsm_hard +- hellaswag: + name: HellaSwag + category: Reasoning + paper: https://arxiv.org/pdf/1905.07830 + configpath: opencompass/configs/datasets/hellaswag +- humaneval: + name: HumanEval + category: Code + paper: https://arxiv.org/pdf/2107.03374v2 + configpath: opencompass/configs/datasets/humaneval +- humaneval_cn: + name: HumanEval-CN + category: Code + paper: "" + configpath: opencompass/configs/datasets/humaneval_cn +- humaneval_multi: + name: Multi-HumanEval + category: Code + paper: https://arxiv.org/pdf/2210.14868 + configpath: opencompass/configs/datasets/humaneval_multi +- humanevalx: + name: HumanEval-X + category: Code + paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790 + configpath: opencompass/configs/datasets/humanevalx +- hungarian_math: + name: Hungarian_Math + category: Math + paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam + configpath: opencompass/configs/datasets/hungarian_exam +- iwslt2017: + name: IWSLT2017 + category: Language + paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf + configpath: opencompass/configs/datasets/iwslt2017 +- jigsawmultilingual: + name: 
JigsawMultilingual + category: Safety + paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data + configpath: opencompass/configs/datasets/jigsawmultilingual +- lambada: + name: LAMBADA + category: Understanding + paper: https://arxiv.org/pdf/1606.06031 + configpath: opencompass/configs/datasets/lambada +- lcsts: + name: LCSTS + category: Understanding + paper: https://aclanthology.org/D15-1229.pdf + configpath: opencompass/configs/datasets/lcsts +- livestembench: + name: LiveStemBench + category: "" + paper: "" + configpath: opencompass/configs/datasets/livestembench +- llm_compression: + name: LLM Compression + category: Bits Per Character (BPC) + paper: https://arxiv.org/pdf/2404.09937 + configpath: opencompass/configs/datasets/llm_compression +- math: + name: MATH + category: Math + paper: https://arxiv.org/pdf/2103.03874 + configpath: opencompass/configs/datasets/math +- math401: + name: MATH 401 + category: Math + paper: https://arxiv.org/pdf/2304.02015 + configpath: opencompass/configs/datasets/math401 +- mathbench: + name: MathBench + category: Math + paper: https://arxiv.org/pdf/2405.12209 + configpath: opencompass/configs/datasets/mathbench +- mbpp: + name: MBPP + category: Code + paper: https://arxiv.org/pdf/2108.07732 + configpath: opencompass/configs/datasets/mbpp +- mbpp_cn: + name: MBPP-CN + category: Code + paper: "" + configpath: opencompass/configs/datasets/mbpp_cn +- mbpp_plus: + name: MBPP-PLUS + category: Code + paper: "" + configpath: opencompass/configs/datasets/mbpp_plus +- mgsm: + name: MGSM + category: Language / Math + paper: https://arxiv.org/pdf/2210.03057 + configpath: opencompass/configs/datasets/mgsm +- mmlu: + name: MMLU + category: Understanding + paper: https://arxiv.org/pdf/2009.03300 + configpath: opencompass/configs/datasets/mmlu +- mmlu_cf: + name: MMLU-CF + category: Understanding + paper: https://arxiv.org/pdf/2412.15194 + configpath: opencompass/configs/datasets/mmlu_cf +- mmlu_pro: + name: MMLU-Pro + category: Understanding + paper: https://arxiv.org/pdf/2406.01574 + configpath: opencompass/configs/datasets/mmlu_pro +- mmmlu: + name: MMMLU + category: Language / Understanding + paper: https://huggingface.co/datasets/openai/MMMLU + configpath: opencompass/configs/datasets/mmmlu +- multirc: + name: SuperGLUE / MultiRC + category: Understanding + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_MultiRC +- narrativeqa: + name: NarrativeQA + category: Understanding + paper: https://github.com/google-deepmind/narrativeqa + configpath: opencompass/configs/datasets/narrativeqa +- natural_question: + name: NaturalQuestions + category: Knowledge + paper: https://github.com/google-research-datasets/natural-questions + configpath: opencompass/configs/datasets/nq +- natural_question_cn: + name: NaturalQuestions-CN + category: Knowledge + paper: "" + configpath: opencompass/configs/datasets/nq_cn +- obqa: + name: OpenBookQA + category: Knowledge + paper: https://arxiv.org/pdf/1809.02789v1 + configpath: opencompass/configs/datasets/obqa +- piqa: + name: OpenBookQA + category: Knowledge / Physics + paper: https://arxiv.org/pdf/1911.11641v1 + configpath: opencompass/configs/datasets/piqa +- py150: + name: py150 + category: Code + paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line + configpath: opencompass/configs/datasets/py150 +- qasper: + name: Qasper + category: Long Context 
+ paper: https://arxiv.org/pdf/2105.03011 + configpath: opencompass/configs/datasets/qasper +- qaspercut: + name: Qasper-Cut + category: Long Context + paper: "" + configpath: opencompass/configs/datasets/qaspercut +- race: + name: RACE + category: Examination + paper: https://arxiv.org/pdf/1704.04683 + configpath: opencompass/configs/datasets/race +- realtoxicprompts: + name: RealToxicPrompts + category: Safety + paper: https://arxiv.org/pdf/2009.11462 + configpath: opencompass/configs/datasets/realtoxicprompts +- record: + name: SuperGLUE / ReCoRD + category: Understanding + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD +- rte: + name: SuperGLUE / RTE + category: Reasoning + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_RTE +- ocnli: + name: CLUE / OCNLI + category: Reasoning + paper: https://arxiv.org/pdf/2004.05986 + configpath: opencompass/configs/datasets/CLUE_ocnli +- rolebench: + name: RoleBench + category: Role Play + paper: https://arxiv.org/pdf/2310.00746 + configpath: opencompass/configs/datasets/rolebench +- s3eval: + name: S3Eval + category: Long Context + paper: https://aclanthology.org/2024.naacl-long.69.pdf + configpath: opencompass/configs/datasets/s3eval +- scibench: + name: SciBench + category: Reasoning + paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf + configpath: opencompass/configs/datasets/scibench +- scicode: + name: SciCode + category: Code + paper: https://arxiv.org/pdf/2407.13168 + configpath: opencompass/configs/datasets/scicode +- simpleqa: + name: SimpleQA + category: Knowledge + paper: https://arxiv.org/pdf/2411.04368 + configpath: opencompass/configs/datasets/SimpleQA +- siqa: + name: SocialIQA + category: Reasoning + paper: https://arxiv.org/pdf/1904.09728 + configpath: opencompass/configs/datasets/siqa +- squad20: + name: SQuAD2.0 + category: Understanding + paper: https://arxiv.org/pdf/1806.03822 + configpath: opencompass/configs/datasets/squad20 +- storycloze: + name: StoryCloze + category: Reasoning + paper: https://aclanthology.org/2022.emnlp-main.616.pdf + configpath: opencompass/configs/datasets/storycloze +- strategyqa: + name: StrategyQA + category: Reasoning + paper: https://arxiv.org/pdf/2101.02235 + configpath: opencompass/configs/datasets/strategyqa +- summedits: + name: SummEdits + category: Language + paper: https://aclanthology.org/2023.emnlp-main.600.pdf + configpath: opencompass/configs/datasets/summedits +- summscreen: + name: SummScreen + category: Understanding + paper: https://arxiv.org/pdf/2104.07091v1 + configpath: opencompass/configs/datasets/summscreen +- svamp: + name: SVAMP + category: Math + paper: https://aclanthology.org/2021.naacl-main.168.pdf + configpath: opencompass/configs/datasets/SVAMP +- tabmwp: + name: TabMWP + category: Math / Table + paper: https://arxiv.org/pdf/2209.14610 + configpath: opencompass/configs/datasets/TabMWP +- taco: + name: TACO + category: Code + paper: https://arxiv.org/pdf/2312.14852 + configpath: opencompass/configs/datasets/taco +- tnews: + name: FewCLUE / TNEWS + category: Understanding + paper: https://arxiv.org/pdf/2107.07498 + configpath: opencompass/configs/datasets/FewCLUE_tnews +- bustm: + name: FewCLUE / BUSTM + category: Reasoning + paper: https://arxiv.org/pdf/2107.07498 + configpath: 
opencompass/configs/datasets/FewCLUE_bustm +- csl: + name: FewCLUE / CSL + category: Understanding + paper: https://arxiv.org/pdf/2107.07498 + configpath: opencompass/configs/datasets/FewCLUE_csl +- ocnli_fc: + name: FewCLUE / OCNLI-FC + category: Reasoning + paper: https://arxiv.org/pdf/2107.07498 + configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc +- triviaqa: + name: TriviaQA + category: Knowledge + paper: https://arxiv.org/pdf/1705.03551v2 + configpath: opencompass/configs/datasets/triviaqa +- triviaqarc: + name: TriviaQA-RC + category: Knowledge / Understanding + paper: "" + configpath: opencompass/configs/datasets/triviaqarc +- truthfulqa: + name: TruthfulQA + category: Safety + paper: https://arxiv.org/pdf/2109.07958v2 + configpath: opencompass/configs/datasets/truthfulqa +- tydiqa: + name: TyDi-QA + category: Language + paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf + configpath: opencompass/configs/datasets/tydiqa +- wic: + name: SuperGLUE / WiC + category: Language + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_WiC +- wsc: + name: SuperGLUE / WSC + category: Language / WSC + paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf + configpath: opencompass/configs/datasets/SuperGLUE_WSC +- winogrande: + name: WinoGrande + category: Language / WSC + paper: https://arxiv.org/pdf/1907.10641v2 + configpath: opencompass/configs/datasets/winogrande +- xcopa: + name: XCOPA + category: Language + paper: https://arxiv.org/pdf/2005.00333 + configpath: opencompass/configs/datasets/XCOPA +- xiezhi: + name: Xiezhi + category: Knowledge + paper: https://arxiv.org/pdf/2306.05783 + configpath: opencompass/configs/datasets/xiezhi +- xlsum: + name: XLSum + category: Understanding + paper: https://arxiv.org/pdf/2106.13822v1 + configpath: opencompass/configs/datasets/XLSum +- xsum: + name: Xsum + category: Understanding + paper: https://arxiv.org/pdf/1808.08745 + configpath: opencompass/configs/datasets/Xsum + + + diff --git a/docs/en/_static/js/custom.js b/docs/en/_static/js/custom.js index 84da69d4..9b9f2480 100644 --- a/docs/en/_static/js/custom.js +++ b/docs/en/_static/js/custom.js @@ -1,10 +1,20 @@ -var collapsedSections = []; +var collapsedSections = ['Dataset Statistics']; $(document).ready(function () { - $('.model-summary').DataTable({ + $('.dataset').DataTable({ "stateSave": false, "lengthChange": false, "pageLength": 20, - "order": [] + "order": [], + "language": { + "info": "Show _START_ to _END_ Items(Totally _TOTAL_ )", + "infoFiltered": "(Filtered from _MAX_ Items)", + "search": "Search:", + "zeroRecords": "Item Not Found", + "paginate": { + "next": "Next", + "previous": "Previous" + }, + } }); }); diff --git a/docs/en/advanced_guides/new_dataset.md b/docs/en/advanced_guides/new_dataset.md index 72f33318..e07e6868 100644 --- a/docs/en/advanced_guides/new_dataset.md +++ b/docs/en/advanced_guides/new_dataset.md @@ -90,4 +90,16 @@ Although OpenCompass has already included most commonly used datasets, users nee return dataset ``` +3. After completing the dataset script and config file, you need to register the information of your new dataset in the file `dataset-index.yml` at the main directory, so that it can be added to the dataset statistics list on the OpenCompass website. 
+ + - The keys that need to be filled in include `name`: the name of your dataset, `category`: the category of your dataset, `paper`: the URL of the paper or project, and `configpath`: the path to the dataset config file. Here's an example: + + ``` + - mydataset: + name: MyDataset + category: Understanding + paper: https://arxiv.org/pdf/xxxxxxx + configpath: opencompass/configs/datasets/MyDataset + ``` + Detailed dataset configuration files and other required configuration files can be referred to in the [Configuration Files](../user_guides/config.md) tutorial. For guides on launching tasks, please refer to the [Quick Start](../get_started/quick_start.md) tutorial. diff --git a/docs/en/conf.py b/docs/en/conf.py index 64a3a83a..9101ba3f 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -220,3 +220,11 @@ autodoc_typehints = 'none' # The not found page notfound_template = '404.html' + + +def builder_inited_handler(app): + subprocess.run(['./statis.py']) + + +def setup(app): + app.connect('builder-inited', builder_inited_handler) \ No newline at end of file diff --git a/docs/en/index.rst b/docs/en/index.rst index fdad9c9e..7181c459 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -80,6 +80,13 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass. tools.md +.. _Dataset List: +.. toctree:: + :maxdepth: 1 + :caption: Dataset List + + dataset_statistics.md + .. _Notes: .. toctree:: :maxdepth: 1 diff --git a/docs/en/statis.py b/docs/en/statis.py new file mode 100755 index 00000000..a110c631 --- /dev/null +++ b/docs/en/statis.py @@ -0,0 +1,76 @@ +#! /usr/bin/env python + +from pathlib import Path + +import yaml +from tabulate import tabulate + +OC_ROOT = Path(__file__).absolute().parents[2] +GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/' +DATASETZOO_TEMPLATE = """\ +# Dataset Statistics + +On this page, we have listed all the datasets supported by OpenCompass. + +You can use sorting and search functions to find the dataset you need. 
+ +""" + +with open('dataset_statistics.md', 'w') as f: + f.write(DATASETZOO_TEMPLATE) + +load_path = str(OC_ROOT / 'dataset-index.yml') + +with open(load_path, 'r') as f2: + data_list = yaml.load(f2, Loader=yaml.FullLoader) + +HEADER = ['name', 'category', 'paper', 'configpath'] + + +def table_format(data_list): + table_format_list = [] + for i in data_list: + table_format_list_sub = [] + for j in i: + for index in HEADER: + if index == 'paper': + table_format_list_sub.append('[link](' + i[j][index] + ')') + elif index == 'configpath': + if isinstance(i[j][index], list): + sub_list_text = '' + for k in i[j][index]: + sub_list_text += ('[link](' + GITHUB_PREFIX + k + + ') / ') + table_format_list_sub.append(sub_list_text[:-2]) + else: + table_format_list_sub.append('[link](' + + GITHUB_PREFIX + + i[j][index] + ')') + else: + table_format_list_sub.append(i[j][index]) + table_format_list.append(table_format_list_sub) + return table_format_list + + +data_format_list = table_format(data_list) + + +def generate_table(data_list, title=None): + + with open('dataset_statistics.md', 'a') as f: + if title is not None: + f.write(f'\n{title}') + f.write("""\n```{table}\n:class: dataset\n""") + header = ['Name', 'Category', 'Paper or Repository', 'Config File'] + table_cfg = dict(tablefmt='pipe', + floatfmt='.2f', + numalign='right', + stralign='center') + f.write(tabulate(data_list, header, **table_cfg)) + f.write('\n```\n') + + +generate_table( + data_list=data_format_list, + title='## Supported Dataset List', +) diff --git a/docs/zh_cn/_static/js/custom.js b/docs/zh_cn/_static/js/custom.js index 84da69d4..ecbff47e 100644 --- a/docs/zh_cn/_static/js/custom.js +++ b/docs/zh_cn/_static/js/custom.js @@ -1,10 +1,20 @@ -var collapsedSections = []; +var collapsedSections = ['数据集统计']; $(document).ready(function () { - $('.model-summary').DataTable({ + $('.dataset').DataTable({ "stateSave": false, "lengthChange": false, "pageLength": 20, - "order": [] + "order": [], + "language": { + "info": "显示 _START_ 至 _END_ 条目(总计 _TOTAL_ )", + "infoFiltered": "(筛选自 _MAX_ 条目)", + "search": "搜索:", + "zeroRecords": "没有找到任何条目", + "paginate": { + "next": "下一页", + "previous": "上一页" + }, + } }); }); diff --git a/docs/zh_cn/advanced_guides/new_dataset.md b/docs/zh_cn/advanced_guides/new_dataset.md index 16e85f37..16921885 100644 --- a/docs/zh_cn/advanced_guides/new_dataset.md +++ b/docs/zh_cn/advanced_guides/new_dataset.md @@ -91,4 +91,16 @@ return dataset ``` +3. 在完成数据集脚本和配置文件的构建后,需要在OpenCompass主目录下的`dataset-index.yml`配置文件中登记新数据集的相关信息,以使其加入OpenCompass官网Doc的数据集统计列表中。 + + - 需要填写的字段包括数据集名称`name`、数据集类型`category`、原文或项目地址`paper`、以及数据集配置文件的路径`configpath`。具体示例如下: + + ``` + - mydataset: + name: MyDataset + category: Understanding + paper: https://arxiv.org/pdf/xxxxxxx + configpath: opencompass/configs/datasets/MyDataset + ``` + 详细的数据集配置文件以及其他需要的配置文件可以参考[配置文件](../user_guides/config.md)教程,启动任务相关的教程可以参考[快速开始](../get_started/quick_start.md)教程。 diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py index 640ea1d8..8910ead0 100644 --- a/docs/zh_cn/conf.py +++ b/docs/zh_cn/conf.py @@ -224,6 +224,7 @@ notfound_template = '404.html' def builder_inited_handler(app): subprocess.run(['./cp_origin_docs.sh']) + subprocess.run(['./statis.py']) def setup(app): diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 37a3bc0c..827c7d91 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -81,6 +81,13 @@ OpenCompass 上手路线 tools.md +.. _数据集列表: +.. toctree:: + :maxdepth: 1 + :caption: 数据集列表 + + dataset_statistics.md + .. _其他说明: .. 
toctree:: :maxdepth: 1 diff --git a/docs/zh_cn/statis.py b/docs/zh_cn/statis.py new file mode 100755 index 00000000..eb5dc7fe --- /dev/null +++ b/docs/zh_cn/statis.py @@ -0,0 +1,75 @@ +#! /usr/bin/env python + +from pathlib import Path + +import yaml +from tabulate import tabulate + +OC_ROOT = Path(__file__).absolute().parents[2] +GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/' +DATASETZOO_TEMPLATE = """\ +# 数据集统计 + +在本页面中,我们列举了OpenCompass所支持的所有数据集。 + +你可以使用排序和搜索功能找到需要的数据集。 + +""" + +with open('dataset_statistics.md', 'w') as f: + f.write(DATASETZOO_TEMPLATE) + +load_path = str(OC_ROOT / 'dataset-index.yml') + +with open(load_path, 'r') as f2: + data_list = yaml.load(f2, Loader=yaml.FullLoader) + +HEADER = ['name', 'category', 'paper', 'configpath'] + + +def table_format(data_list): + table_format_list = [] + for i in data_list: + table_format_list_sub = [] + for j in i: + for index in HEADER: + if index == 'paper': + table_format_list_sub.append('[链接](' + i[j][index] + ')') + elif index == 'configpath': + if isinstance(i[j][index], list): + sub_list_text = '' + for k in i[j][index]: + sub_list_text += ('[链接](' + GITHUB_PREFIX + k + + ') / ') + table_format_list_sub.append(sub_list_text[:-2]) + else: + table_format_list_sub.append('[链接](' + GITHUB_PREFIX + + i[j][index] + ')') + else: + table_format_list_sub.append(i[j][index]) + table_format_list.append(table_format_list_sub) + return table_format_list + + +data_format_list = table_format(data_list) + + +def generate_table(data_list, title=None): + + with open('dataset_statistics.md', 'a') as f: + if title is not None: + f.write(f'\n{title}') + f.write("""\n```{table}\n:class: dataset\n""") + header = ['数据集名称', '数据集类型', '原文或资源地址', '配置文件链接'] + table_cfg = dict(tablefmt='pipe', + floatfmt='.2f', + numalign='right', + stralign='center') + f.write(tabulate(data_list, header, **table_cfg)) + f.write('\n```\n') + + +generate_table( + data_list=data_format_list, + title='## 支持数据集列表', +) From f407930475e4d7cf3338eb9a5b1ac4f03916d7f6 Mon Sep 17 00:00:00 2001 From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> Date: Thu, 20 Feb 2025 12:19:46 +0800 Subject: [PATCH 02/58] [Feature] Support subjective evaluation for reasoning model (#1868) * fix pip version * fix pip version * add subeval for reasoning model * add subeval for reasoning model * update configs * update config * update config * update config * update files --- .../alignbench_judgeby_critiquellm.py | 2 +- .../alignbench_judgeby_critiquellm_new.py | 2 +- .../alignbench_v1_1_judgeby_critiquellm.py | 2 +- .../alignbench_v1_1_judgeby_critiquellm_new.py | 2 +- .../alpaca_eval/alpacav2_judgeby_gpt4.py | 3 ++- .../alpacav2_judgeby_gpt4_bradleyterry.py | 2 +- .../alpaca_eval/alpacav2_judgeby_gpt4_new.py | 2 +- .../arena_hard/arena_hard_compare.py | 2 +- .../arena_hard_compare_bradleyterry.py | 2 +- .../arena_hard/arena_hard_compare_new.py | 2 +- .../compassarena/compassarena_compare.py | 2 +- .../compassarena_compare_bradleyterry.py | 4 ++-- .../compassarena/compassarena_compare_new.py | 2 +- .../subjective/fofo/fofo_bilingual_judge.py | 2 +- .../fofo/fofo_bilingual_judge_new.py | 2 +- .../datasets/subjective/fofo/fofo_judge.py | 2 +- .../datasets/subjective/fofo/fofo_judge_new.py | 2 +- .../followbench/followbench_llmeval.py | 2 +- .../followbench/followbench_llmeval_new.py | 2 +- .../subjective/multiround/mtbench101_judge.py | 2 +- .../multiround/mtbench101_judge_new.py | 2 +- .../openicl/icl_evaluator/lm_evaluator.py | 2 +- 
opencompass/tasks/subjective_eval.py | 18 +++++++++++++++--- opencompass/utils/text_postprocessors.py | 12 ++++++++++++ 24 files changed, 51 insertions(+), 26 deletions(-) diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py index 86c2a80b..0bc7df77 100644 --- a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py @@ -32,7 +32,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py index 20797b0f..d3f59b9f 100644 --- a/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm_new.py @@ -31,7 +31,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py index 024f66a1..44f63f4f 100644 --- a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm.py @@ -32,7 +32,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py index 2ff09a3e..216e6ffa 100644 --- a/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py +++ b/opencompass/configs/datasets/subjective/alignbench/alignbench_v1_1_judgeby_critiquellm_new.py @@ -31,7 +31,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py index 137e5ca0..ad0d4ef4 100644 --- a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py +++ b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py @@ -73,12 +73,13 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( evaluator=dict( type=LMEvaluator, + prompt_template=dict( type=PromptTemplate, template=dict( diff --git a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py 
b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py index 99f2e2be..19fe1559 100644 --- a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py +++ b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_bradleyterry.py @@ -74,7 +74,7 @@ for _name in subjective_all_sets: ), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py index 06a82efe..a0510f5c 100644 --- a/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py +++ b/opencompass/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4_new.py @@ -72,7 +72,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py index 90837c7b..7446fdd7 100644 --- a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py +++ b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare.py @@ -38,7 +38,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py index 7a0e9ae8..dc4b250e 100644 --- a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py +++ b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_bradleyterry.py @@ -39,7 +39,7 @@ for _name in subjective_all_sets: ), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py index 08b27ca7..dbad40ef 100644 --- a/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py +++ b/opencompass/configs/datasets/subjective/arena_hard/arena_hard_compare_new.py @@ -37,7 +37,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py index 90141e66..47cc7b31 100644 --- a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py +++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare.py @@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items(): ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git 
a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py index 8a687889..38d7927a 100644 --- a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py +++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_bradleyterry.py @@ -1,6 +1,6 @@ from opencompass.datasets import ( CompassArenaDataset, - compassarena_bradleyterry_postprocess, + compassarena_bradleyterry_postprocess ) from opencompass.openicl.icl_evaluator import LMEvaluator from opencompass.openicl.icl_inferencer import GenInferencer @@ -127,7 +127,7 @@ for _name, _prompt in sub_map.items(): ), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py index a32691ad..83266765 100644 --- a/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py +++ b/opencompass/configs/datasets/subjective/compassarena/compassarena_compare_new.py @@ -105,7 +105,7 @@ for _name, _prompt in sub_map.items(): ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py index 089fd101..9516e074 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge.py @@ -91,7 +91,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py index 81e160b5..f732dba0 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_bilingual_judge_new.py @@ -90,7 +90,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_judge.py b/opencompass/configs/datasets/subjective/fofo/fofo_judge.py index 89400892..8944be01 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_judge.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_judge.py @@ -59,7 +59,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py b/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py index 691aff2b..03dcf190 100644 --- a/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py +++ b/opencompass/configs/datasets/subjective/fofo/fofo_judge_new.py @@ -58,7 +58,7 @@ for _name in 
subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=4096), + inferencer=dict(type=GenInferencer,), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py index e601bda3..1c4203fd 100644 --- a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py +++ b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval.py @@ -29,7 +29,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py index b0aacd86..970605b6 100644 --- a/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py +++ b/opencompass/configs/datasets/subjective/followbench/followbench_llmeval_new.py @@ -28,7 +28,7 @@ for _name in subjective_all_sets: ]), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=2048), + inferencer=dict(type=GenInferencer), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py index 00924ecb..53ab1631 100644 --- a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py +++ b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge.py @@ -24,7 +24,7 @@ for _name in subjective_all_sets: template="""{dialogue}""", ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'), + inferencer=dict(type=ChatInferencer, infer_mode='last'), ) subjective_eval_cfg = dict( diff --git a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py index 938f2f5e..01b9c12f 100644 --- a/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py +++ b/opencompass/configs/datasets/subjective/multiround/mtbench101_judge_new.py @@ -23,7 +23,7 @@ for _name in subjective_all_sets: template="""{dialogue}""", ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'), + inferencer=dict(type=ChatInferencer, infer_mode='last'), ) subjective_eval_cfg = dict( diff --git a/opencompass/openicl/icl_evaluator/lm_evaluator.py b/opencompass/openicl/icl_evaluator/lm_evaluator.py index 53814070..94f2cf94 100644 --- a/opencompass/openicl/icl_evaluator/lm_evaluator.py +++ b/opencompass/openicl/icl_evaluator/lm_evaluator.py @@ -329,4 +329,4 @@ class LMEvaluator: else: kwargs = self.dict_postprocessor proc = DICT_POSTPROCESSORS.get(kwargs.pop('type')) - return proc(output, self.output_path, **kwargs) + return proc(output, self.output_path, **kwargs) \ No newline at end of file diff --git a/opencompass/tasks/subjective_eval.py b/opencompass/tasks/subjective_eval.py index 417c5cdb..0ddd7b0c 100644 --- a/opencompass/tasks/subjective_eval.py +++ b/opencompass/tasks/subjective_eval.py @@ -198,14 +198,24 @@ class SubjectiveEvalTask(BaseTask): if fnmatch.fnmatch(ds_abbr, pattern): pred_postprocessor = model_postprocessors[pattern] break + if 
'pred_postprocessor' in eval_cfg['evaluator'] or pred_postprocessor: kwargs = pred_postprocessor or eval_cfg['evaluator'][ 'pred_postprocessor'] - proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type')) + temp_kwargs = copy.deepcopy(kwargs) + proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type')) self.logger.info('Get postprocessor {postprocessor}.') - pred_strs = [proc(s, **kwargs) for s in pred_strs] + pred_strs = [proc(s, **temp_kwargs) for s in pred_strs] else: - self.logger.info('No postprocessor found.') + self.logger.info('No dataset postprocessor found.') + + if 'pred_postprocessor' in model_cfg or pred_postprocessor: + kwargs = pred_postprocessor or model_cfg['pred_postprocessor'] + temp_kwargs = copy.deepcopy(kwargs) + proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type')) + pred_strs = [proc(s, **temp_kwargs) for s in pred_strs] + else: + self.logger.info('No model postprocessor found.') return { 'model_name': model_abbr_from_cfg(model_cfg), @@ -329,7 +339,9 @@ class SubjectiveEvalTask(BaseTask): if fnmatch.fnmatch(ds_abbr, pattern): pred_postprocessor = model_postprocessors[pattern] break + if 'pred_postprocessor' in eval_cfg or pred_postprocessor: + kwargs = pred_postprocessor or eval_cfg['pred_postprocessor'] proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type')) pred_strs = [proc(s, **kwargs) for s in pred_strs] diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index eb7469ab..7110e752 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -57,6 +57,18 @@ def last_capital_postprocess(text: str) -> str: return '' +@TEXT_POSTPROCESSORS.register_module('think_pred') +def think_pred_postprocess( + prediction: str, + re_pattern: str, +) -> str: + match = re.search(re_pattern, prediction) + if match: + return match.group(1).strip() + else: + return prediction + + def first_option_postprocess(text: str, options: str, cushion=True) -> str: """Find first valid option for text.""" From bc22749fd8c20d4f69c2c4ebb9e517bce2c4666a Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Thu, 20 Feb 2025 14:08:18 +0800 Subject: [PATCH 03/58] [CI] update daily test scores (#1870) * update * Update daily-run-test.yml * Update dlc.py --- .../scripts/oc_score_baseline_fullbench.yaml | 6 +++--- .../scripts/oc_score_baseline_testrange.yaml | 20 +++++++++---------- .github/workflows/daily-run-test.yml | 13 ++++++------ opencompass/runners/dlc.py | 2 +- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 6ab32832..5b0dee2b 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -366,7 +366,7 @@ internlm2_5-7b-chat-turbomind: openai_mmmlu_lite_DE-DE_accuracy: 51.27 openai_mmmlu_lite_ES-LA_accuracy: 56.94 openai_mmmlu_lite_FR-FR_accuracy: 58.22 - openai_mmmlu_lite_HI-IN_accuracy: 33.75 + openai_mmmlu_lite_HI-IN_accuracy: 30.75 openai_mmmlu_lite_ID-ID_accuracy: 50.6 openai_mmmlu_lite_IT-IT_accuracy: 50.6 openai_mmmlu_lite_JA-JP_accuracy: 51.13 @@ -394,7 +394,7 @@ internlm2_5-7b-chat-turbomind: CompassArena_naive_average: 34.61 FoFo_naive_average: 0.38 mtbench101_avg: 8.01 - wildbench_average: -15.69 + wildbench_average: -10.49 simpleqa_accuracy_given_attempted: 0.04 chinese_simpleqa_given_attempted_accuracy: 0.34 alignment_bench_v1_1_专业能力: 6.05 @@ -414,7 +414,7 @@ 
internlm2_5-7b-chat-turbomind: compassarena_knowledge_naive_average: 36 compassarena_reason_v2_naive_average: 35 compassarena_math_v2_naive_average: 19.91 - compassarena_creationv2_zh_naive_average: 35.81 + compassarena_creationv2_zh_naive_average: 43.64 fofo_test_prompts_overall: 0.35 fofo_test_prompts_cn_overall: 0.41 followbench_llmeval_en_HSR_AVG: 0.73 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index 45e20ddd..5f1121a7 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -15,13 +15,13 @@ chat: gsm8k_accuracy: 50 race-high_accuracy: 68.75 deepseek-7b-chat-vllm: - gsm8k_accuracy: 43.75 - race-high_accuracy: 75 + gsm8k_accuracy: 50 + race-high_accuracy: 78.12 gemma2-2b-it-hf: gsm8k_accuracy: 50 - race-high_accuracy: 71.88 + race-high_accuracy: 75 gemma2-9b-it-hf: - gsm8k_accuracy: 71.88 + gsm8k_accuracy: 68.75 race-high_accuracy: 84.38 gemma-2b-it-hf: gsm8k_accuracy: 3.12 @@ -36,7 +36,7 @@ chat: gsm8k_accuracy: 78.12 race-high_accuracy: 93.75 gemma-7b-it-vllm: - gsm8k_accuracy: 34.38 + gsm8k_accuracy: 46.88 race-high_accuracy: 68.75 internlm2_5-7b-chat-hf: gsm8k_accuracy: 84.38 @@ -57,7 +57,7 @@ chat: gsm8k_accuracy: 53.12 race-high_accuracy: 90.62 internlm2-chat-7b-vllm: - gsm8k_accuracy: 56.25 + gsm8k_accuracy: 43.75 race-high_accuracy: 84.38 llama-3_1-8b-instruct-hf: gsm8k_accuracy: 84.38 @@ -90,13 +90,13 @@ chat: gsm8k_accuracy: 75 race-high_accuracy: 81.25 mistral-nemo-instruct-2407-turbomind: - gsm8k_accuracy: 65.62 - race-high_accuracy: 87.50 + gsm8k_accuracy: 71.88 + race-high_accuracy: 78.12 mistral-7b-instruct-v0.1-vllm: gsm8k_accuracy: 34.38 race-high_accuracy: 68.75 mistral-7b-instruct-v0.2-vllm: - gsm8k_accuracy: 43.75 + gsm8k_accuracy: 31.25 race-high_accuracy: 75 phi-3-mini-4k-instruct-hf: gsm8k_accuracy: 81.25 @@ -177,7 +177,7 @@ chat: gsm8k_accuracy: 93.75 race-high_accuracy: 87.5 mixtral-8x7b-instruct-v0.1-hf: - gsm8k_accuracy: 56.25 + gsm8k_accuracy: 59.38 race-high_accuracy: 81.25 mixtral-large-instruct-2411-turbomind: gsm8k_accuracy: 90.62 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 0fa1f4a6..8aa1df16 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -17,7 +17,7 @@ on: required: false description: 'whether to build lmdeploy' type: boolean - default: false + default: true repo_org_lmdeploy: required: false description: 'Tested repository organization name. Default is internlm/lmdeploy' @@ -162,15 +162,16 @@ jobs: pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}} cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data - name: Prepare - reinstall lmdeploy - cu12 - if: ${{inputs.build_lmdeploy}} + if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}} uses: actions/download-artifact@v4 with: name: my-artifact-${{ github.run_id }}-py310 - name: Prepare - reinstall lmdeploy - cu12 - if: ${{inputs.build_lmdeploy}} + if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}} run: | . 
${{env.CONDA_PATH}}/bin/activate conda activate ${{env.CONDA_ENV}} + pip uninstall -y lmdeploy pip install lmdeploy-*.whl --no-deps - name: conda env run: | @@ -188,7 +189,7 @@ jobs: regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}} runs-on: volc_cu12_daily environment: 'prod' - timeout-minutes: 120 #2hours + timeout-minutes: 180 #3hours steps: - name: Clone repository uses: actions/checkout@v2 @@ -275,7 +276,7 @@ jobs: conda info --envs lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 & echo "restful_pid=$!" >> "$GITHUB_ENV" - sleep 120s + sleep 180s opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py @@ -334,7 +335,7 @@ jobs: notify_to_feishu: - if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }} + if: ${{ always() && github.event_name == 'schedule' && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }} needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test] timeout-minutes: 5 runs-on: self-hosted diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index 03045870..44e9fd00 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -286,7 +286,7 @@ class DLCRunner(BaseRunner): f'Failed to get job info for {job_id}') status = job_info['Status'] - if status == 'Failed': + if status == 'Failed' or status == 'Stopped': return -1 elif status == 'Succeeded': return 0 From 27c916661d94973430925c54783e5c08f8e1fb48 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Thu, 20 Feb 2025 19:32:12 +0800 Subject: [PATCH 04/58] [Feature] Math Verify with model post_processor (#1881) * update * [Feature] Update model post_processor * update * update * update --- examples/eval_math_verify.py | 77 +++++++++ .../configs/datasets/math/math_500_gen.py | 40 +++++ opencompass/datasets/custom.py | 58 ++++--- .../openicl/icl_evaluator/math_evaluator.py | 154 ++++++++++++++++++ opencompass/tasks/openicl_eval.py | 36 ++-- opencompass/utils/text_postprocessors.py | 42 +++++ requirements/extra.txt | 2 + 7 files changed, 369 insertions(+), 40 deletions(-) create mode 100644 examples/eval_math_verify.py create mode 100644 opencompass/configs/datasets/math/math_500_gen.py create mode 100644 opencompass/openicl/icl_evaluator/math_evaluator.py diff --git a/examples/eval_math_verify.py b/examples/eval_math_verify.py new file mode 100644 index 00000000..bbd1dbc3 --- /dev/null +++ b/examples/eval_math_verify.py @@ -0,0 +1,77 @@ +from mmengine.config import read_base +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +with read_base(): + from opencompass.configs.datasets.math.math_500_gen import math_datasets + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-llama-8b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', + engine_config=dict(session_len=32768, max_batch_size=8, tp=1), + 
gen_config=dict( + top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096 + ), + max_seq_len=32768, + max_out_len=32768, + batch_size=32, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content), + ), + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-7b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', + engine_config=dict(session_len=32768, max_batch_size=8, tp=1), + gen_config=dict( + temperature=0.6, + top_p=0.95, + max_new_tokens=32768, + do_sample=True, + ), + max_seq_len=32768, + max_out_len=32768, + batch_size=32, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content), + ), + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-1_5b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', + engine_config=dict(session_len=32768, max_batch_size=16, tp=1), + gen_config=dict( + top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096 + ), + max_seq_len=32768, + max_out_len=32768, + batch_size=32, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content), + ), + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-14b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', + engine_config=dict(session_len=32768, max_batch_size=16, tp=2), + gen_config=dict( + top_k=1, + temperature=0.6, + top_p=0.95, + max_new_tokens=32768, + do_sample=True, + ), + max_seq_len=32768, + max_out_len=32768, + batch_size=16, + run_cfg=dict(num_gpus=2), + pred_postprocessor=dict(type=extract_non_reasoning_content), + ), +] + +datasets = [*math_datasets] + + +work_dir = './outputs/math_500' diff --git a/opencompass/configs/datasets/math/math_500_gen.py b/opencompass/configs/datasets/math/math_500_gen.py new file mode 100644 index 00000000..79d2f3b0 --- /dev/null +++ b/opencompass/configs/datasets/math/math_500_gen.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import CustomDataset +from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator), +) + +math_datasets = [ + dict( + type=CustomDataset, + abbr='math-500', + path='opencompass/math', + file_name='test_prm800k_500.jsonl', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/datasets/custom.py b/opencompass/datasets/custom.py index ad3fbe2c..110cb72b 100644 --- a/opencompass/datasets/custom.py +++ b/opencompass/datasets/custom.py @@ -13,6 +13,7 @@ from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path from .base import BaseDataset @@ -114,7 +115,7 @@ class 
CircularOptionSimAccEvaluator(OptionSimAccEvaluator): circular_pattern = origin_item['circular_pattern'] for k in circular_patterns: if tuple(circular_pattern) in circular_patterns[k]: - tmp_metrics[f'correct_{k}'] += (1 if parsed == refr else 0) + tmp_metrics[f'correct_{k}'] += 1 if parsed == refr else 0 tmp_metrics[f'count_{k}'] += 1 for k in circular_patterns: @@ -164,7 +165,10 @@ class CircularOptionSimAccEvaluator(OptionSimAccEvaluator): class CustomDataset(BaseDataset): @staticmethod - def load(path): + def load(path, file_name=None, local_mode=False): + path = get_data_path(path, local_mode=local_mode) + if file_name is not None: + path = os.path.join(path, file_name) if path.endswith('.jsonl'): with open(path, 'r', encoding='utf-8-sig') as f: data = [json.loads(line) for line in f] @@ -222,9 +226,10 @@ def make_mcq_gen_config(meta): ) eval_cfg = dict( - evaluator=dict(type=meta.get('evaluator', OptionSimAccEvaluator), - **meta.get('evaluator_kwargs', - {'options': meta['options']})), + evaluator=dict( + type=meta.get('evaluator', OptionSimAccEvaluator), + **meta.get('evaluator_kwargs', {'options': meta['options']}), + ), pred_role='BOT', ) @@ -269,10 +274,10 @@ def make_circular_mcq_gen_config(meta): ) eval_cfg = dict( - evaluator=dict(type=meta.get('evaluator', - CircularOptionSimAccEvaluator), - **meta.get('evaluator_kwargs', - {'options': meta['options']})), + evaluator=dict( + type=meta.get('evaluator', CircularOptionSimAccEvaluator), + **meta.get('evaluator_kwargs', {'options': meta['options']}), + ), pred_role='BOT', ) @@ -320,8 +325,10 @@ def make_qa_gen_config(meta): ) eval_cfg = dict( - evaluator=dict(type=meta.get('evaluator', AccEvaluator), - **meta.get('evaluator_kwargs', {})), + evaluator=dict( + type=meta.get('evaluator', AccEvaluator), + **meta.get('evaluator_kwargs', {}), + ), pred_role='BOT', ) @@ -346,9 +353,11 @@ def make_mcq_ppl_config(meta): template = { answer: dict(round=[ dict(role='HUMAN', prompt=human_prompt), - dict(role='BOT', - prompt=bot_prompt.format( - **{meta['output_column']: answer})), + dict( + role='BOT', + prompt=bot_prompt.format( + **{meta['output_column']: answer}), + ), ], ) for answer in meta['options'] } @@ -370,8 +379,10 @@ def make_mcq_ppl_config(meta): inferencer=dict(type=PPLInferencer), ) - eval_cfg = dict(evaluator=dict(type=meta.get('evaluator', AccEvaluator), - **meta.get('evaluator_kwargs', {}))) + eval_cfg = dict(evaluator=dict( + type=meta.get('evaluator', AccEvaluator), + **meta.get('evaluator_kwargs', {}), + )) dataset = dict( abbr=meta['abbr'], @@ -394,9 +405,11 @@ def make_circular_mcq_ppl_config(meta): template = { answer: dict(round=[ dict(role='HUMAN', prompt=human_prompt), - dict(role='BOT', - prompt=bot_prompt.format( - **{meta['output_column']: answer})), + dict( + role='BOT', + prompt=bot_prompt.format( + **{meta['output_column']: answer}), + ), ], ) for answer in meta['options'] } @@ -418,9 +431,10 @@ def make_circular_mcq_ppl_config(meta): inferencer=dict(type=PPLInferencer), ) - eval_cfg = dict( - evaluator=dict(type=meta.get('evaluator', CircularEvaluator), - **meta.get('evaluator_kwargs', {}))) + eval_cfg = dict(evaluator=dict( + type=meta.get('evaluator', CircularEvaluator), + **meta.get('evaluator_kwargs', {}), + )) dataset = dict( abbr=meta['abbr'], diff --git a/opencompass/openicl/icl_evaluator/math_evaluator.py b/opencompass/openicl/icl_evaluator/math_evaluator.py new file mode 100644 index 00000000..c790c17b --- /dev/null +++ b/opencompass/openicl/icl_evaluator/math_evaluator.py @@ -0,0 +1,154 @@ +from 
latex2sympy2_extended import NormalizationConfig +from math_verify import (ExprExtractionConfig, LatexExtractionConfig, parse, + verify) + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS + + +@ICL_EVALUATORS.register_module() +class MATHEvaluator(BaseEvaluator): + + def score(self, predictions, references): + + self.is_num_equal(predictions, references) + + correct = 0 + count = 0 + details = [] + for i, j in zip(predictions, references): + count += 1 + gold_parsed = parse( + j, + extraction_mode='first_match', + extraction_config=[ + LatexExtractionConfig(), + ExprExtractionConfig(), + ], + ) + # If parsing result is empty, try adding LaTeX + # environment and parse again + if len(gold_parsed) == 0: + j_with_env = f'${j}$' + gold_parsed = parse( + j_with_env, + extraction_mode='first_match', + extraction_config=[ + LatexExtractionConfig(), + ExprExtractionConfig(), + ], + ) + if len(gold_parsed) != 0: + # We require the answer to be provided in correct + # latex (no malformed operators) + answer_parsed = parse( + i, + extraction_config=[ + LatexExtractionConfig( + normalization_config=NormalizationConfig( + nits=False, + malformed_operators=False, + basic_latex=True, + equations=True, + boxed='all', + units=True, + ), + # Ensures that boxed is tried first + boxed_match_priority=0, + try_extract_without_anchor=False, + ) + ], + extraction_mode='first_match', + ) + + answer_correct = float(verify(answer_parsed, gold_parsed)) + correct += answer_correct + detail = { + 'pred': str(answer_parsed), + 'answer': str(gold_parsed), + 'correct': True if answer_correct else False, + } + details.append(detail) + result = {'accuracy': 100 * correct / count, 'details': details} + return result + + +if __name__ == '__main__': + import sympy + + test_cases = [ + # 1. Basic arithmetic operations + r'Simple fraction: \boxed{\frac{1}{2}}', + r'Addition: \boxed{2 + 3}', + r'Multiplication: \boxed{2 \times 3}', + # 2. Algebraic expressions + r'Quadratic: \boxed{x^2 + 2x + 1}', + r'Polynomial: \boxed{3x^3 - 2x^2 + 4x - 5}', + # 3. Trigonometric functions + r'Trigonometry: \boxed{\sin(x) + \cos(x)}', + r'Complex trig: \boxed{\tan^2(x) + \sec^2(x)}', + # 4. Roots and exponents + r'Square root: \boxed{\sqrt{16}}', + r'Complex root: \boxed{\sqrt[3]{x^2 + 1}}', + # 5. Logarithms + r'Natural log: \boxed{\ln(e^2)}', + r'Log base: \boxed{\log_2(8)}', + # 6. Limits and summations + r'Limit: \boxed{\lim_{x \to 0} \frac{\sin(x)}{x}}', + r'Sum: \boxed{\sum_{i=1}^{n} i}', + # 7. Integrals + r'Integral: \boxed{\int_{0}^{1} x^2 dx}', + r'Double integral: \boxed{\int_{0}^{1}\int_{0}^{1} xy \,dx\,dy}', + # 8. Matrices + r'Matrix: \boxed{\begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix}}', + # 9. Complex combinations + r'Complex expr: \boxed{\frac{\sqrt{x^2 + 1}}{\ln(x)} + ' + r'\int_{0}^{x} t^2 dt}', + # 10. 
Error cases + r'Empty: \boxed{}', + r'Invalid: \boxed{\frac{1}}', # Missing denominator + r'Nested: \boxed{\boxed{1}}', # Nested boxed + ] + + def print_result(expr: str, result: list): + print('\n' + '=' * 50) + print(f'Input: {expr}') + print(f'Output type: {type(result)}') + print(f'Output: {result}') + + # If result is sympy expression, show more information + if result: + for item in result: + if isinstance(item, sympy.Basic): + print(f'Sympy repr: {repr(item)}') + try: + print(f'Evaluated: {item.evalf()}') + except Exception as e: + print(f'Cannot evaluate: {e}') + + # Test all cases + for test_expr in test_cases: + try: + result = parse(test_expr) + print_result(test_expr, result) + except Exception as e: + print(f'\nError processing {test_expr}: {e}') + + # Special test: verify numerical calculations + numerical_tests = [ + r'\boxed{2 + 2}', # Should equal 4 + r'\boxed{\frac{1}{2} + \frac{1}{3}}', # Should equal 5/6 + r'\boxed{\sqrt{16} + \sqrt{9}}', # Should equal 7 + ] + + print('\n' + '=' * 50 + '\nNumerical Verification Tests:') + for test_expr in numerical_tests: + try: + result = parse(test_expr) + if result and isinstance(result[0], sympy.Basic): + expr = result[0] + print(f'\nExpression: {test_expr}') + print(f'Symbolic: {expr}') + print(f'Numerical value: {float(expr.evalf())}') + except Exception as e: + print(f'\nError in numerical test {test_expr}: {e}') diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py index 5bec3603..a797459f 100644 --- a/opencompass/tasks/openicl_eval.py +++ b/opencompass/tasks/openicl_eval.py @@ -1,6 +1,5 @@ import argparse import copy -import fnmatch import math import os import os.path as osp @@ -18,9 +17,8 @@ from mmengine.utils import mkdir_or_exist from opencompass.registry import (ICL_EVALUATORS, MODELS, TASKS, TEXT_POSTPROCESSORS) from opencompass.tasks.base import BaseTask, extract_role_pred -from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg, - get_infer_output_path, get_logger, - task_abbr_from_cfg) +from opencompass.utils import (build_dataset_from_cfg, get_infer_output_path, + get_logger, task_abbr_from_cfg) @TASKS.register_module() @@ -60,19 +58,9 @@ class OpenICLEvalTask(BaseTask): self.dataset_cfg = dataset_cfg # Load Dataset - self.eval_cfg = self.dataset_cfg.get('eval_cfg') - self.output_column = dataset_cfg['reader_cfg']['output_column'] - - # overwrite postprocessor if the model has specified one - ds_abbr = dataset_abbr_from_cfg(self.dataset_cfg) - model_postprocessors = self.model_cfg.get( - 'pred_postprocessor', {}) - for pattern in model_postprocessors.keys(): - if fnmatch.fnmatch(ds_abbr, pattern): - self.eval_cfg[ - 'pred_postprocessor'] = model_postprocessors[ - pattern] # noqa - break + self.eval_cfg = copy.deepcopy(dataset_cfg.get('eval_cfg')) + self.output_column = copy.deepcopy( + dataset_cfg['reader_cfg']['output_column']) out_path = get_infer_output_path( self.model_cfg, self.dataset_cfg, @@ -155,8 +143,20 @@ class OpenICLEvalTask(BaseTask): ] # Postprocess predictions if necessary + # Model Specified Postprocessor + if 'pred_postprocessor' in self.model_cfg: + kwargs = copy.deepcopy(self.model_cfg['pred_postprocessor']) + proc = kwargs.pop('type') + if isinstance(proc, str): + proc = TEXT_POSTPROCESSORS.get(proc) + if pred_list_flag: + pred_strs = [[proc(s, **kwargs) for s in preds] + for preds in pred_strs] + else: + pred_strs = [proc(s, **kwargs) for s in pred_strs] + # Dataset Specified Postprocessor if 'pred_postprocessor' in self.eval_cfg: - kwargs = 
self.eval_cfg['pred_postprocessor'] + kwargs = copy.deepcopy(self.eval_cfg['pred_postprocessor']) proc = kwargs.pop('type') if isinstance(proc, str): proc = TEXT_POSTPROCESSORS.get(proc) diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index 7110e752..d21a06ab 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -37,6 +37,7 @@ def general_cn_postprocess(text: str) -> str: cleaned_text = re.sub(r'\s+', ' ', no_articles).strip() import jieba + cleaned_text = ' '.join(jieba.cut(text)) return cleaned_text @@ -241,3 +242,44 @@ def match_answer_pattern(response_text: str, answer_pattern: str): match = re.search(answer_pattern, response_text) extracted_answer = match.group(1) if match else '' return extracted_answer + + +@TEXT_POSTPROCESSORS.register_module('extract-non-reasoning-content') +def extract_non_reasoning_content( + text: str, + think_start_token: str = '', + think_end_token: str = '', +) -> str: + """Extract content after the last reasoning tag from text. + + When only end token is present, returns content after the end token. + When both tokens are present, removes all content between start and end tokens. + + Args: + text (str): Input text containing reasoning tags. + think_start_token (str, optional): Start token for reasoning section. Defaults to ''. + think_end_token (str, optional): End token for reasoning section. Defaults to ''. + + Returns: + str: Processed text after removing reasoning sections. + + Examples: + >>> # When only end token exists + >>> text = "This is a test. How are you?" + >>> extract_non_reasoning_content(text) + 'How are you?' + + >>> # When both tokens exist + >>> text = "Startreasoning here End" + >>> extract_non_reasoning_content(text) + 'Start End' + """ + # If text contains only end token, split by end token and take the last part + if think_start_token not in text and think_end_token in text: + return text.split(think_end_token)[-1].strip() + + # Original behavior for complete tag pairs + reasoning_regex = re.compile(rf'{think_start_token}(.*?){think_end_token}', + re.DOTALL) + non_reasoning_content = reasoning_regex.sub('', text).strip() + return non_reasoning_content diff --git a/requirements/extra.txt b/requirements/extra.txt index 7f04c9d0..a98b3bc8 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -15,6 +15,8 @@ langdetect latex2sympy2 # Lawbench, leval ltp +# Math +math-verify # Taco, apps Dataset pyext # Law Bench From d7daee6e259dc851d615e685225db27800fd9a9a Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Thu, 20 Feb 2025 19:33:25 +0800 Subject: [PATCH 05/58] [Update] OpenAI model update, bigcodebench update (#1879) * [Update] Openai model update, bigcodebench update * update --- .../datasets/bigcodebench/bigcodebench.py | 36 +++++++++- opencompass/models/openai_api.py | 70 ++++++++++--------- .../summarizers/subjective/compassbench.py | 38 ++++------ 3 files changed, 86 insertions(+), 58 deletions(-) diff --git a/opencompass/datasets/bigcodebench/bigcodebench.py b/opencompass/datasets/bigcodebench/bigcodebench.py index f1109b1d..f347e9e2 100644 --- a/opencompass/datasets/bigcodebench/bigcodebench.py +++ b/opencompass/datasets/bigcodebench/bigcodebench.py @@ -121,8 +121,40 @@ class BigCodeBenchEvaluator(BaseEvaluator): logger.info('Start to extract code from predictions') sanitized_predictions = [] for prediction, entrypoint in zip(predictions, entrypoints): - sanitized_prediction = extract_code_generation( - prediction, 
entrypoint=entrypoint) + try: + import signal + from contextlib import contextmanager + + @contextmanager + def timeout_handler(seconds): + + def _handle_timeout(signum, frame): + raise TimeoutError(f'Code extraction timed out' + f'after {seconds} seconds') + + original_handler = signal.signal(signal.SIGALRM, + _handle_timeout) + signal.alarm(seconds) + try: + yield + finally: + signal.alarm(0) + signal.signal(signal.SIGALRM, original_handler) + + with timeout_handler(10): + sanitized_prediction = extract_code_generation( + prediction, entrypoint=entrypoint) + + except TimeoutError as e: + logger.warning( + f'Code extraction timeout for entrypoint {entrypoint}: ' + f'{str(e)}') + sanitized_prediction = '' + except Exception as e: + logger.warning( + f'Code extraction failed for entrypoint {entrypoint}: ' + f'{str(e)}') + sanitized_prediction = '' sanitized_predictions.append(sanitized_prediction) # Prepare for submission diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index 2781d160..d5ac02d8 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -25,12 +25,7 @@ OPENAI_API_BASE = os.path.join( OPENAISDK_API_BASE = os.environ.get('OPENAI_BASE_URL', 'https://api.openai.com/v1/') -O1_MODEL_LIST = [ - 'o1-preview-2024-09-12', - 'o1-mini-2024-09-12', - 'o1-preview', - 'o1-mini', -] +O1_MODEL_LIST = ['o1', 'o3'] @MODELS.register_module() @@ -96,7 +91,6 @@ class OpenAI(BaseAPIModel): temperature: Optional[float] = None, tokenizer_path: Optional[str] = None, extra_body: Optional[Dict] = None, - max_completion_tokens: int = 16384, verbose: bool = False, ): @@ -151,9 +145,6 @@ class OpenAI(BaseAPIModel): self.proxy_url = openai_proxy_url self.path = path - self.max_completion_tokens = max_completion_tokens - self.logger.warning( - f'Max Completion tokens for {path} is {max_completion_tokens}') def generate( self, @@ -250,16 +241,15 @@ class OpenAI(BaseAPIModel): header['OpenAI-Organization'] = self.orgs[self.org_ctr] try: - if self.path in O1_MODEL_LIST: + if any(model in self.path for model in O1_MODEL_LIST): self.logger.warning( f"'max_token' is unsupported for model {self.path}") self.logger.warning( - f'We use max_completion_tokens: ' - f'{self.max_completion_tokens}for this query') + f'We use max_out_len: {max_out_len} for this query') data = dict( model=self.path, messages=messages, - max_completion_tokens=self.max_completion_tokens, + max_completion_tokens=max_out_len, n=1, logprobs=self.logprobs, top_logprobs=self.top_logprobs, @@ -440,7 +430,7 @@ class OpenAI(BaseAPIModel): if mode == 'front': cur_prompt = sep.join(words[-mid:]) elif mode == 'mid': - cur_prompt = (sep.join(words[:mid]) + sep.join(words[-mid:])) + cur_prompt = sep.join(words[:mid]) + sep.join(words[-mid:]) elif mode == 'rear': cur_prompt = sep.join(words[:mid]) @@ -480,7 +470,9 @@ class OpenAI(BaseAPIModel): """ # Check input length when mode is 'none' if mode == 'none': - input_len = get_token_len_func(str(input)) + input_len = (get_token_len_func(input) if isinstance( + input, str) else sum( + get_token_len_func(item['prompt']) for item in input)) if input_len > max_seq_len: raise ValueError( f'Input length ({input_len}) exceeds max_seq_len ' @@ -499,12 +491,15 @@ class OpenAI(BaseAPIModel): # Convert input to messages format if isinstance(input, str): messages = [{'role': 'user', 'content': input}] + input_len = get_token_len_func(input) else: messages = [] + processed_prompts = [] for item in input: input_content = item['prompt'] if mode != 'none': 
input_content = bin_trim_wrapper(input_content) + processed_prompts.append(input_content) msg = {'content': input_content} if item['role'] == 'HUMAN': msg['role'] = 'user' @@ -513,19 +508,18 @@ class OpenAI(BaseAPIModel): elif item['role'] == 'SYSTEM': msg['role'] = 'system' messages.append(msg) + input_len = sum( + get_token_len_func(prompt) for prompt in processed_prompts) # Adjust max_out_len if max_out_len is not None: original_max_out_len = max_out_len - max_out_len = min( - max_out_len, - max_seq_len - get_token_len_func(str(input)) - 100) + max_out_len = min(max_out_len, max_seq_len - input_len - 100) if max_out_len <= 0: raise ValueError( f'max_out_len ({max_out_len}) is less than or equal to 0. ' - f'This may be due to input length ' - f'({get_token_len_func(str(input))}) being too close to ' - f'max_seq_len ({max_seq_len}). Please either increase ' + f'This may be due to input length ({input_len}) being too ' + f'close to max_seq_len ({max_seq_len}). Please increase ' f'max_seq_len or use a truncation mode other than "none".') if max_out_len < original_max_out_len: self.logger.warning( @@ -555,7 +549,6 @@ class OpenAISDK(OpenAI): temperature: float | None = None, tokenizer_path: str | None = None, extra_body: Dict | None = None, - max_completion_tokens: int = 16384, verbose: bool = False, status_code_mappings: dict = {}, ): @@ -577,7 +570,6 @@ class OpenAISDK(OpenAI): tokenizer_path, extra_body, verbose=verbose, - max_completion_tokens=max_completion_tokens, ) from openai import OpenAI @@ -605,8 +597,23 @@ class OpenAISDK(OpenAI): self.logger.info(f'Used openai_client: {self.openai_client}') self.status_code_mappings = status_code_mappings - def _generate(self, input: PromptList | str, max_out_len: int, - temperature: float) -> str: + def _generate(self, + input: PromptList | str, + max_out_len: int, + temperature: float, + timeout: int = 3600) -> str: + """Generate results given a list of inputs. + + Args: + input (PromptType): A string or PromptDict. + max_out_len (int): The maximum length of the output. + temperature (float): What sampling temperature to use. + timeout (int, optional): Timeout in seconds for the API call. + Defaults to 3600 (60 minutes). + + Returns: + str: The generated string. 
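+
+        Note:
+            The timeout is forwarded to the underlying
+            ``chat.completions.create`` call as a per-request limit, so a
+            stalled request fails fast and is handled by the surrounding
+            retry loop instead of blocking the worker.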
+ """ from openai import APIStatusError, BadRequestError assert isinstance(input, (str, PromptList)) @@ -618,16 +625,14 @@ class OpenAISDK(OpenAI): num_retries = 0 while num_retries < self.retry: self.wait() - - if self.path in O1_MODEL_LIST: + if any(model in self.path for model in O1_MODEL_LIST): self.logger.warning( f"'max_token' is unsupported for model {self.path}") self.logger.warning( - f'We use max_completion_tokens: ' - f'{self.max_completion_tokens}for this query') + f'We use max_out_len: {max_out_len} for this query') query_data = dict( model=self.path, - max_completion_tokens=self.max_completion_tokens, + max_completion_tokens=max_out_len, n=1, messages=messages, extra_body=self.extra_body, @@ -646,7 +651,8 @@ class OpenAISDK(OpenAI): if self.verbose: self.logger.info('Start calling OpenAI API') responses = self.openai_client.chat.completions.create( - **query_data) + **query_data, timeout=timeout) # timeout in seconds + if self.verbose: self.logger.info( 'Successfully get response from OpenAI API') diff --git a/opencompass/summarizers/subjective/compassbench.py b/opencompass/summarizers/subjective/compassbench.py index 67c01243..7ed1ee53 100644 --- a/opencompass/summarizers/subjective/compassbench.py +++ b/opencompass/summarizers/subjective/compassbench.py @@ -34,39 +34,29 @@ MAP = { '总分', '中文总分', '英文总分', - 'instruct/compassbenchv1_4_IF_en_fofo_sub', - 'instruct/compassbenchv1_4_IF_zh_fofo_sub', + 'instruct/compassbench_2501_IF_en_chatIF_sub', + 'instruct/compassbench_2501_IF_en_functionalIF_sub', + 'instruct/compassbench_2501_IF_cn_chatIF_sub', + 'instruct/compassbench_2501_IF_cn_functionalIF_sub', ], 'language': [ '总分', '中文总分', '英文总分', - 'language/compassbenchv1_4_language_zh_chat_sub', - 'language/compassbenchv1_4_language_zh_creation_sub', - 'language/compassbenchv1_4_language_zh_NLP_sub', - 'language/compassbenchv1_4_language_en_chat_sub', - 'language/compassbenchv1_4_language_en_creation_sub', - 'language/compassbenchv1_4_language_en_NLP_sub', + 'language/compassbench_v2501_language_zh_chat_sub', + 'language/compassbench_v2501_language_zh_nlp_sub', + 'language/compassbench_v2501_language_zh_creation_sub', + 'language/compassbench_v2501_language_en_chat_sub', + 'language/compassbench_v2501_language_en_nlp_sub', + 'language/compassbench_v2501_language_en_creation_sub', ], - 'reasoning': [ + + 'code': [ '总分', '中文总分', '英文总分', - 'reasoning/compassbenchv1_4_reasoning_en_CommonSenseSense_sub', - 'reasoning/compassbenchv1_4_reasoning_en_Humanities_sub', - 'reasoning/compassbenchv1_4_reasoning_en_ScienceEngineering_sub', - 'reasoning/compassbenchv1_4_reasoning_en_Social_sub', - 'reasoning/compassbenchv1_4_reasoning_zh_CommonSenseSense_sub', - 'reasoning/compassbenchv1_4_reasoning_zh_Humanities_sub', - 'reasoning/compassbenchv1_4_reasoning_zh_ScienceEngineering_sub', - 'reasoning/compassbenchv1_4_reasoning_zh_Social_sub', - ], - 'coding': [ - '总分', - '中文总分', - '英文总分', - 'coding/compassbenchv1_4_coding_en_sub', - 'coding/compassbenchv1_4_coding_zh_sub', + 'code/compassbench_2501_code_arena_en_sub', + 'code/compassbench_2501_code_arena_zh_sub', ], } From 046b6f75c6ee0ff2f583b30b6f39d73b52929f56 Mon Sep 17 00:00:00 2001 From: Junnan Liu Date: Thu, 20 Feb 2025 19:47:04 +0800 Subject: [PATCH 06/58] [Update] Update Greedy Config & README of LiveMathBench (#1862) * support omni-math * update config * upload README * Delete opencompass/configs/datasets/omni_math/__init__.py * update greedy config & README of LiveMathBench * update intro for max_out_len * rename livemathbench greedy confi 
* delete greedy config --------- Co-authored-by: liushz --- .../configs/datasets/livemathbench/README.md | 71 +++++++++---------- .../livemathbench/livemathbench_greedy_gen.py | 4 ++ ....py => livemathbench_greedy_gen_9befbf.py} | 10 +-- 3 files changed, 43 insertions(+), 42 deletions(-) create mode 100644 opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py rename opencompass/configs/datasets/livemathbench/{livemathbench_greedy_gen_efb20d.py => livemathbench_greedy_gen_9befbf.py} (83%) diff --git a/opencompass/configs/datasets/livemathbench/README.md b/opencompass/configs/datasets/livemathbench/README.md index 84490c94..24949f20 100644 --- a/opencompass/configs/datasets/livemathbench/README.md +++ b/opencompass/configs/datasets/livemathbench/README.md @@ -1,36 +1,30 @@ # LiveMathBench -## Details of Datsets +## v202412 + +### Details of Datsets | dataset | language | #single-choice | #multiple-choice | #fill-in-the-blank | #problem-solving | | -- | -- | -- | -- | -- | -- | -| AIMC | cn | 0 | 0 | 0 | 46 | -| AIMC | en | 0 | 0 | 0 | 46 | -| CEE | cn | 0 | 0 | 13 | 40 | -| CEE | en | 0 | 0 | 13 | 40 | -| CMO | cn | 0 | 0 | 0 | 18 | -| CMO | en | 0 | 0 | 0 | 18 | -| MATH500 | en | 0 | 0 | 0 | 500 | -| AIME2024 | en | 0 | 0 | 0 | 44 | +| AMC | cn | 0 | 0 | 0 | 46 | +| AMC | en | 0 | 0 | 0 | 46 | +| CCEE | cn | 0 | 0 | 13 | 31 | +| CCEE | en | 0 | 0 | 13 | 31 | +| CNMO | cn | 0 | 0 | 0 | 18 | +| CNMO | en | 0 | 0 | 0 | 18 | +| WLPMC | cn | 0 | 0 | 0 | 11 | +| WLPMC | en | 0 | 0 | 0 | 11 | -## How to use - +### How to use +#### G-Pass@k ```python from mmengine.config import read_base with read_base(): - from opencompass.datasets.livemathbench import livemathbench_datasets + from opencompass.datasets.livemathbench_gen import livemathbench_datasets -livemathbench_datasets[0].update( - { - 'abbr': 'livemathbench_${k}x${n}' - 'path': '/path/to/data/dir', - 'k': 'k@pass', # the max value of k in k@pass - 'n': 'number of runs', # number of runs - } -) livemathbench_datasets[0]['eval_cfg']['evaluator'].update( { 'model_name': 'Qwen/Qwen2.5-72B-Instruct', @@ -40,38 +34,41 @@ livemathbench_datasets[0]['eval_cfg']['evaluator'].update( ] # set url of evaluation models } ) +livemathbench_dataset['infer_cfg']['inferencer'].update(dict( + max_out_len=32768 # for o1-like models you need to update max_out_len +)) ``` -> ❗️ At present, `extract_from_boxed` is used to extract answers from model responses, and one can also leverage LLM for extracting through the following parameters, but this part of the code has not been tested. - +#### Greedy ```python +from mmengine.config import read_base + +with read_base(): + from opencompass.datasets.livemathbench_greedy_gen import livemathbench_datasets + livemathbench_datasets[0]['eval_cfg']['evaluator'].update( { 'model_name': 'Qwen/Qwen2.5-72B-Instruct', 'url': [ 'http://0.0.0.0:23333/v1', '...' - ], # set url of evaluation models - - # for LLM-based extraction - 'use_extract_model': True, - 'post_model_name': 'oc-extractor', - 'post_url': [ - 'http://0.0.0.0:21006/v1, - '...' 
- ] + ] # set url of evaluation models } ) +livemathbench_dataset['infer_cfg']['inferencer'].update(dict( + max_out_len=32768 # for o1-like models you need to update max_out_len +)) + ``` -## Output Samples +### Output Samples | dataset | version | metric | mode | Qwen2.5-72B-Instruct | |----- | ----- | ----- | ----- | -----| -| LiveMathBench | caed8f | 1@pass | gen | 26.07 | -| LiveMathBench | caed8f | 1@pass/std | gen | xx.xx | -| LiveMathBench | caed8f | 2@pass | gen | xx.xx | -| LiveMathBench | caed8f | 2@pass/std | gen | xx.xx | -| LiveMathBench | caed8f | pass-rate | gen | xx.xx | +| LiveMathBench | 9befbf | G-Pass@16_0.0 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_0.25 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_0.5 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_0.75 | gen | xx.xx | +| LiveMathBench | caed8f | G-Pass@16_1.0 | gen | xx.xx | diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py new file mode 100644 index 00000000..d311eeaf --- /dev/null +++ b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .livemathbench_greedy_gen_efb20d import livemathbench_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_efb20d.py b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py similarity index 83% rename from opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_efb20d.py rename to opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py index d6acd7c0..d8d8b79c 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_efb20d.py +++ b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py @@ -6,15 +6,15 @@ from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBen livemathbench_dataset = dict( - abbr='LiveMathBench-v202412-greedy', # If you change the K and replication, you need to change the dataset name. 
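+    # Greedy setting: k=1 with replication=1 and a single 0.0 threshold
+    # (see the "Greedy" section of the README above).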
type=LiveMathBenchDataset, - path='opencompass/LiveMathBench', + path='', k=1, replication=1, dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'], dataset_languages=['cn', 'en'], - cot=False, + cot=True, version='202412', + abbr='LiveMathBench-v202412', reader_cfg=dict( input_columns=['prompt'], output_column='answer' @@ -31,7 +31,7 @@ livemathbench_dataset = dict( retriever=dict(type=ZeroRetriever), inferencer=dict( type=GenInferencer, - max_out_len=16384, + max_out_len=8192 ), ), eval_cfg=dict( @@ -44,7 +44,7 @@ livemathbench_dataset = dict( extract_model_name='', k=[1], replication=1, - thresholds=[0.0, 0.25, 0.5, 0.75, 1.0] + thresholds=[0.0] ) ) ) From 465e93e10e633183d4ec24a547e386927fd6e559 Mon Sep 17 00:00:00 2001 From: Dongsheng Zhu <59612926+Zhudongsheng75@users.noreply.github.com> Date: Mon, 24 Feb 2025 15:45:24 +0800 Subject: [PATCH 07/58] [Update] Academic bench llm judge update (#1876) * BigCodeBench update * update LCBench * update LCBench 2 * update code * academicBench update * academic bench ifeval&math update * generic_llmjudge_aime_academic_postprocess delete * aime delete * postprocessors update * ifeval delete * update work_dir * linting * linting double-quote-string-fixer * r1-distill out_len update * fix lint --------- Co-authored-by: MaiziXiao --- examples/eval_academic_leaderboard_202502.py | 137 +++++++++++++ ...0shot_nocot_genericllmeval_academic_gen.py | 98 +++++++++ .../bbh/bbh_0shot_nocot_academic_gen.py | 189 ++++++++++++++++++ ...math_prm800k_500_0shot_cot_academic_gen.py | 100 +++++++++ .../hf_deepseek_r1_distill_llama_70b.py | 14 ++ .../hf_deepseek_r1_distill_llama_8b.py | 14 ++ .../hf_deepseek_r1_distill_qwen_14b.py | 14 ++ .../hf_deepseek_r1_distill_qwen_1_5b.py | 14 ++ .../hf_deepseek_r1_distill_qwen_32b.py | 14 ++ .../hf_deepseek_r1_distill_qwen_7b.py | 14 ++ .../lmdeploy_deepseek_r1_distill_llama_70b.py | 20 ++ .../lmdeploy_deepseek_r1_distill_llama_8b.py | 20 ++ .../lmdeploy_deepseek_r1_distill_qwen_14b.py | 20 ++ .../lmdeploy_deepseek_r1_distill_qwen_1_5b.py | 20 ++ .../lmdeploy_deepseek_r1_distill_qwen_32b.py | 20 ++ .../lmdeploy_deepseek_r1_distill_qwen_7b.py | 20 ++ opencompass/datasets/generic.py | 29 ++- 17 files changed, 755 insertions(+), 2 deletions(-) create mode 100644 examples/eval_academic_leaderboard_202502.py create mode 100644 opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_academic_gen.py create mode 100644 opencompass/configs/datasets/bbh/bbh_0shot_nocot_academic_gen.py create mode 100644 opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_academic_gen.py create mode 100644 opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_70b.py create mode 100644 opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_8b.py create mode 100644 opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_14b.py create mode 100644 opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_1_5b.py create mode 100644 opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_32b.py create mode 100644 opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_7b.py create mode 100644 opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_70b.py create mode 100644 opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_8b.py create mode 100644 opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_14b.py create mode 100644 opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_1_5b.py create mode 100644 
opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_32b.py create mode 100644 opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_7b.py diff --git a/examples/eval_academic_leaderboard_202502.py b/examples/eval_academic_leaderboard_202502.py new file mode 100644 index 00000000..231e9a9f --- /dev/null +++ b/examples/eval_academic_leaderboard_202502.py @@ -0,0 +1,137 @@ +# flake8: noqa + +from mmengine.config import read_base + +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.runners import LocalRunner, VOLCRunner +from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask + +####################################################################### +# PART 0 Essential Configs # +####################################################################### +with read_base(): + # Datasets Part + # Knowledge + # Math + from opencompass.configs.datasets.aime2024.aime2024_0shot_nocot_genericllmeval_academic_gen import \ + aime2024_datasets + from opencompass.configs.datasets.bbh.bbh_0shot_nocot_academic_gen import \ + bbh_datasets + # General Reasoning + from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \ + gpqa_datasets + from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \ + humaneval_datasets + # Instruction Following + from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \ + ifeval_datasets + from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import \ + LCBCodeGeneration_dataset + from opencompass.configs.datasets.math.math_prm800k_500_0shot_cot_gen import \ + math_datasets + from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \ + mmlu_pro_datasets + # Model List + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ + models as hf_internlm2_5_7b_chat_model + # Summary Groups + from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups + from opencompass.configs.summarizers.groups.mmlu_pro import \ + mmlu_pro_summary_groups + +####################################################################### +# PART 1 Datasets List # +####################################################################### +# datasets list for evaluation +# Only take LCB generation for evaluation +datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), + []) + [LCBCodeGeneration_dataset] + +# LLM judge config: using LLM to evaluate predictions +judge_cfg = dict() +for dataset in datasets: + dataset['infer_cfg']['inferencer']['max_out_len'] = 32768 + if 'judge_cfg' in dataset['eval_cfg']['evaluator']: + dataset['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg + + +####################################################################### +# PART 2 Datset Summarizer # +####################################################################### + +core_summary_groups = [ + { + 'name': + 'core_average', + 'subsets': [ + ['IFEval', 'Prompt-level-strict-accuracy'], + ['bbh', 'naive_average'], + ['math_prm800k_500', 'accuracy'], + ['aime2024', 'accuracy'], + ['GPQA_diamond', 'accuracy'], + ['mmlu_pro', 'naive_average'], + ['openai_humaneval', 'humaneval_pass@1'], + ['lcb_code_generation', 'pass@1'], + ], + }, +] + +summarizer = dict( + dataset_abbrs=[ + ['core_average', 'naive_average'], + '', + 'Instruction Following', + ['IFEval', 'Prompt-level-strict-accuracy'], + '', + 'General Reasoning', + ['bbh', 'naive_average'], + ['GPQA_diamond', 'accuracy'], + '', + 
'Math Calculation', + ['math_prm800k_500', 'accuracy'], + ['aime2024', 'accuracy'], + '', + 'Knowledge', + ['mmlu_pro', 'naive_average'], + '', + 'Code', + ['openai_humaneval', 'humaneval_pass@1'], + ['lcb_code_generation', 'pass@1'], + ], + summary_groups=sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], []), +) + +####################################################################### +# PART 3 Models List # +####################################################################### + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +####################################################################### +# PART 4 Inference/Evaluation Configuaration # +####################################################################### + +# Local Runner +infer = dict( + partitioner=dict(type=NumWorkerPartitioner, num_worker=8), + runner=dict( + type=LocalRunner, + max_num_workers=16, + retry=0, # Modify if needed + task=dict(type=OpenICLInferTask), + ), +) + +# eval with local runner +eval = dict( + partitioner=dict(type=NaivePartitioner, n=10), + runner=dict(type=LocalRunner, + max_num_workers=16, + task=dict(type=OpenICLEvalTask)), +) + +####################################################################### +# PART 5 Utils Configuaration # +####################################################################### +work_dir = './outputs/oc_academic_202502' diff --git a/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_academic_gen.py b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_academic_gen.py new file mode 100644 index 00000000..30da2b98 --- /dev/null +++ b/opencompass/configs/datasets/aime2024/aime2024_0shot_nocot_genericllmeval_academic_gen.py @@ -0,0 +1,98 @@ +# flake8: noqa + +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import Aime2024Dataset +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets.generic import generic_llmjudge_academic_postprocess + + +aime2024_reader_cfg = dict( + input_columns=['question'], + output_column='answer' +) + + +aime2024_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', + prompt='{question}\nRemember to put your final answer within \\boxed{}.'), + ], + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=2048) +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. 
Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{question}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + + +aime2024_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt=GRADER_TEMPLATE), + ]), + ), + dataset_cfg=dict( + type=Aime2024Dataset, + path='opencompass/aime2024', + reader_cfg=aime2024_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, + metric_name='accuracy'), + ), + pred_role='BOT', +) + +aime2024_datasets = [ + dict( + abbr='aime2024', + type=Aime2024Dataset, + path='opencompass/aime2024', + reader_cfg=aime2024_reader_cfg, + infer_cfg=aime2024_infer_cfg, + eval_cfg=aime2024_eval_cfg, + mode='singlescore', + ) +] diff --git a/opencompass/configs/datasets/bbh/bbh_0shot_nocot_academic_gen.py b/opencompass/configs/datasets/bbh/bbh_0shot_nocot_academic_gen.py new file mode 100644 index 00000000..f0698689 --- /dev/null +++ b/opencompass/configs/datasets/bbh/bbh_0shot_nocot_academic_gen.py @@ -0,0 +1,189 @@ +# flake8: noqa + +import os +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import BBHDataset +from opencompass.datasets.generic import generic_llmjudge_academic_postprocess + + +bbh_reader_cfg = dict(input_columns=['input'], output_column='target') + +bbh_multiple_choice_sets = [ + 'temporal_sequences', + 'disambiguation_qa', + 'date_understanding', + 'tracking_shuffled_objects_three_objects', + 'penguins_in_a_table', + 'geometric_shapes', + 'snarks', + 'ruin_names', + 'tracking_shuffled_objects_seven_objects', + 'tracking_shuffled_objects_five_objects', + 'logical_deduction_three_objects', + 'hyperbaton', + 'logical_deduction_five_objects', + 'logical_deduction_seven_objects', + 'movie_recommendation', + 
'salient_translation_error_detection', + 'reasoning_about_colored_objects', +] +bbh_free_form_sets = [ + 'multistep_arithmetic_two', + 'navigate', + 'dyck_languages', + 'word_sorting', + 'sports_understanding', + 'boolean_expressions', + 'object_counting', + 'formal_fallacies', + 'causal_judgement', + 'web_of_lies', +] + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + + : \n{input}\n\n\n + : \n{target}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + + +bbh_sets = bbh_multiple_choice_sets + bbh_free_form_sets + +# For zero shot inference in bbh +bbh_datasets = [] +for _name in bbh_sets: + bbh_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' " + ) + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + + bbh_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt=GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=BBHDataset, + name=_name, + path='opencompass/bbh', + reader_cfg=bbh_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'), + ), + pred_role='BOT', + ) + + bbh_datasets.append( + dict( + type=BBHDataset, + path='opencompass/bbh', + name=_name, + abbr='bbh-' + _name, + reader_cfg=bbh_reader_cfg, + infer_cfg=bbh_infer_cfg.copy(), + eval_cfg=bbh_eval_cfg.copy()) + ) + + +# For original 3 shot inference in bbh +bbh_3_shot_datasets = [] +for _name in bbh_sets: + with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f: + _hint = f.read() + bbh_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step." 
+ ) + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + + bbh_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt=GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=BBHDataset, + name=_name, + path='opencompass/bbh', + reader_cfg=bbh_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'), + ), + pred_role='BOT', + ) + + bbh_3_shot_datasets.append( + dict( + type=BBHDataset, + path='opencompass/bbh', + name=_name, + abbr='bbh-' + _name, + reader_cfg=bbh_reader_cfg, + infer_cfg=bbh_infer_cfg.copy(), + eval_cfg=bbh_eval_cfg.copy())) diff --git a/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_academic_gen.py b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_academic_gen.py new file mode 100644 index 00000000..c23bc136 --- /dev/null +++ b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_academic_gen.py @@ -0,0 +1,100 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + MATHDataset, + MATHEvaluator, + math_postprocess_v2, + normalize_final_answer, +) +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_academic_postprocess + + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. 
Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess), + ), + pred_role='BOT', +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500', + path='opencompass/math', + file_name='test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + mode='singlescore', + ) +] diff --git a/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_70b.py b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_70b.py new file mode 100644 index 00000000..15ac9f90 --- /dev/null +++ b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_70b.py @@ -0,0 +1,14 @@ +from opencompass.models import HuggingFacewithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='deepseek-r1-distill-llama-70b-hf', + path='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', + max_out_len=16384, + batch_size=8, + run_cfg=dict(num_gpus=8), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_8b.py b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_8b.py new file mode 100644 index 00000000..937c8bb2 --- /dev/null +++ b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_llama_8b.py @@ -0,0 +1,14 @@ +from opencompass.models import HuggingFacewithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='deepseek-r1-distill-llama-8b-hf', + path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', + max_out_len=16384, + batch_size=8, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_14b.py 
b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_14b.py new file mode 100644 index 00000000..d4c97023 --- /dev/null +++ b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_14b.py @@ -0,0 +1,14 @@ +from opencompass.models import HuggingFacewithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='deepseek-r1-distill-qwen-14b-hf', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', + max_out_len=16384, + batch_size=16, + run_cfg=dict(num_gpus=4), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_1_5b.py b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_1_5b.py new file mode 100644 index 00000000..aa12591e --- /dev/null +++ b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_1_5b.py @@ -0,0 +1,14 @@ +from opencompass.models import HuggingFacewithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='deepseek-r1-distill-qwen-1.5b-hf', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', + max_out_len=16384, + batch_size=8, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_32b.py b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_32b.py new file mode 100644 index 00000000..d62d8085 --- /dev/null +++ b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_32b.py @@ -0,0 +1,14 @@ +from opencompass.models import HuggingFacewithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='deepseek-r1-distill-qwen-32b-hf', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', + max_out_len=16384, + batch_size=8, + run_cfg=dict(num_gpus=4), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_7b.py b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_7b.py new file mode 100644 index 00000000..9898843a --- /dev/null +++ b/opencompass/configs/models/deepseek/hf_deepseek_r1_distill_qwen_7b.py @@ -0,0 +1,14 @@ +from opencompass.models import HuggingFacewithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='deepseek-r1-distill-qwen-7b-hf', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', + max_out_len=16384, + batch_size=8, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_70b.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_70b.py new file mode 100644 index 00000000..1471be9b --- /dev/null +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_70b.py @@ -0,0 +1,20 @@ +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-llama-70b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Llama-70B', + engine_config=dict(session_len=32768, 
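+                          # 32768-token session to leave room for the long chain-of-thought outputs
+                          # typical of the R1-distilled models (max_new_tokens is also 32768 below),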
max_batch_size=8, tp=8), + gen_config=dict(top_k=1, + temperature=1e-6, + top_p=0.9, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=8, + run_cfg=dict(num_gpus=8), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_8b.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_8b.py new file mode 100644 index 00000000..46b521f2 --- /dev/null +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_llama_8b.py @@ -0,0 +1,20 @@ +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-llama-8b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B', + engine_config=dict(session_len=32768, max_batch_size=8, tp=1), + gen_config=dict(top_k=1, + temperature=1e-6, + top_p=0.9, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=8, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_14b.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_14b.py new file mode 100644 index 00000000..401299cd --- /dev/null +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_14b.py @@ -0,0 +1,20 @@ +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-14b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', + engine_config=dict(session_len=32768, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, + temperature=1e-6, + top_p=0.9, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=16, + run_cfg=dict(num_gpus=2), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_1_5b.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_1_5b.py new file mode 100644 index 00000000..d19ace4e --- /dev/null +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_1_5b.py @@ -0,0 +1,20 @@ +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-1_5b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B', + engine_config=dict(session_len=32768, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, + temperature=1e-6, + top_p=0.9, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=16, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_32b.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_32b.py new file mode 100644 index 00000000..2ddef1d4 --- /dev/null +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_32b.py @@ -0,0 +1,20 @@ +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import 
extract_non_reasoning_content + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-32b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', + engine_config=dict(session_len=32768, max_batch_size=16, tp=4), + gen_config=dict(top_k=1, + temperature=1e-6, + top_p=0.9, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=16, + run_cfg=dict(num_gpus=4), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_7b.py b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_7b.py new file mode 100644 index 00000000..69f9e50f --- /dev/null +++ b/opencompass/configs/models/deepseek/lmdeploy_deepseek_r1_distill_qwen_7b.py @@ -0,0 +1,20 @@ +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-7b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', + engine_config=dict(session_len=32768, max_batch_size=8, tp=1), + gen_config=dict(top_k=1, + temperature=1e-6, + top_p=0.9, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=8, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] diff --git a/opencompass/datasets/generic.py b/opencompass/datasets/generic.py index 867ba61f..28a37a02 100644 --- a/opencompass/datasets/generic.py +++ b/opencompass/datasets/generic.py @@ -1,7 +1,10 @@ import re -def get_final_results(judged_answers, references, origial_responses): +def get_final_results(judged_answers, + references, + origial_responses, + metric_name='accuracy'): count = 0 is_correct_count = 0 is_incorrect_count = 0 @@ -39,7 +42,7 @@ def get_final_results(judged_answers, references, origial_responses): is_correct) > 0 else 0 result = { # 'accuracy_given_attempted': accuracy_given_attempted, - 'accuracy': accuracy_given_attempted * 100, + metric_name: accuracy_given_attempted * 100, 'f1': f1, 'details': details } @@ -69,3 +72,25 @@ def generic_llmjudge_postprocess( results = get_final_results(judged_answers, references, origial_responses) results['details'] = output return results + + +def generic_llmjudge_academic_postprocess( + output: dict, + output_path: str, + metric_name: str = 'accuracy', +) -> dict: + judged_answers = [] + origial_responses = [] + references = [] + for k, v in output.items(): + origial_responses.append(v['prediction']) + processed_judge = _generic_llmjudge_postprocess(v['prediction']) + if processed_judge is not None: + judged_answers.append(processed_judge) + references.append(v['gold']) + results = get_final_results(judged_answers, references, origial_responses, + metric_name) + results['details'] = output + # For academic summarizer + results.pop('f1', None) + return results From 22a33d8759b5b5980faaa1afbf710a78d2dfce48 Mon Sep 17 00:00:00 2001 From: Junnan Liu Date: Tue, 25 Feb 2025 17:24:36 +0800 Subject: [PATCH 08/58] [Update] Update LiveMathBench Hard Configs (#1826) * support G-Pass@k and livemathbench * fix bugs * fix comments of GPassKEvaluator * update saved details of GPassKEvaluator * update saved details of GPassKEvaluator * fix eval api configs & update openai_api for ease of debugging * update huggingface path * fix method name of G-Pass@k * fix default value of eval_model_name * refactor G-Pass@k evaluator * log generation 
params for each backend * fix evaluation resume * add notimplementerror * update livemathbench-hard configs * remove max_out_len from livemathbench_hard_greedy_gen_9befbf.py * remove max_out_len from livemathbench_hard_gen_9befbf.py * rename livemathbench_hard_gen_9befbf.py to livemathbench_hard_gen_353ae7.py * rename livemathbench_hard_greedy_gen_9befbf.py to livemathbench_hard_greedy_gen_353ae7.py * update livemathbench_gen_9befbf.py * remove whitespace * upload livemathbench hard configs --- .../livemathbench/livemathbench_gen_9befbf.py | 2 +- .../livemathbench/livemathbench_gen_f1c095.py | 49 ------------------ .../livemathbench/livemathbench_greedy_gen.py | 2 +- .../livemathbench_hard_gen_353ae7.py | 50 +++++++++++++++++++ .../livemathbench_hard_greedy_gen_353ae7.py | 50 +++++++++++++++++++ .../datasets/livemathbench/livemathbench.py | 2 +- .../models/turbomind_with_tf_above_v4_33.py | 2 - 7 files changed, 103 insertions(+), 54 deletions(-) delete mode 100644 opencompass/configs/datasets/livemathbench/livemathbench_gen_f1c095.py create mode 100644 opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py create mode 100644 opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py b/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py index 3748c022..454e7d3e 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py +++ b/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py @@ -48,4 +48,4 @@ livemathbench_dataset = dict( ) ) ) -livemathbench_datasets = [livemathbench_dataset] \ No newline at end of file +livemathbench_datasets = [livemathbench_dataset] diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_gen_f1c095.py b/opencompass/configs/datasets/livemathbench/livemathbench_gen_f1c095.py deleted file mode 100644 index 56161095..00000000 --- a/opencompass/configs/datasets/livemathbench/livemathbench_gen_f1c095.py +++ /dev/null @@ -1,49 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer - -from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator - - -livemathbench_reader_cfg = dict( - input_columns=['prompt'], - output_column='answer' -) - -livemathbench_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - round=[ - dict(role='HUMAN', prompt='{prompt}'), - ] - ) - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict( - type=GenInferencer, - max_out_len=8192, - temperature=1.0 - ) -) - -livemathbench_eval_cfg = dict( - evaluator=dict( - type=LiveMathBenchEvaluator, - model_name='Qwen/Qwen2.5-72B-Instruct', - url=['http://172.30.40.154:23333/v1/'] #'https://api.openai.com/v1/' - ) -) - -livemathbench_datasets = [ - dict( - type=LiveMathBenchDataset, - abbr='LiveMathBench-k1-n1', - path='opencompass/LiveMathBench202412', - k=1, # K@Pass - n=1, # Run times - reader_cfg=livemathbench_reader_cfg, - infer_cfg=livemathbench_infer_cfg, - eval_cfg=livemathbench_eval_cfg - ) -] \ No newline at end of file diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py index d311eeaf..c1d72d15 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py +++ 
b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .livemathbench_greedy_gen_efb20d import livemathbench_datasets # noqa: F401, F403 \ No newline at end of file + from .livemathbench_greedy_gen_9befbf import livemathbench_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py b/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py new file mode 100644 index 00000000..e932d3c3 --- /dev/null +++ b/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py @@ -0,0 +1,50 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator + + +livemathbench_dataset = dict( + type=LiveMathBenchDataset, + path='', + k=16, + replication=3, + dataset_splits=['hard'], + dataset_languages=['cn', 'en'], + cot=True, + version='202412', + abbr='LiveMathBench-v202412-Hard', + reader_cfg=dict( + input_columns=['prompt'], + output_column='answer' + ), + infer_cfg=dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer + ), + ), + eval_cfg=dict( + evaluator=dict( + type=LiveMathBenchEvaluator, + model_name='', + url=[], + use_extract_model=False, + extract_url=[], + extract_model_name='', + k=[4, 8, 16], + replication=3, + thresholds=[0.0, 0.25, 0.5, 0.75, 1.0] + ) + ) +) +livemathbench_datasets = [livemathbench_dataset] \ No newline at end of file diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py b/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py new file mode 100644 index 00000000..830e55af --- /dev/null +++ b/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py @@ -0,0 +1,50 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBenchEvaluator + + +livemathbench_dataset = dict( + type=LiveMathBenchDataset, + path='', + k=1, + replication=1, + dataset_splits=['hard'], + dataset_languages=['cn', 'en'], + cot=True, + version='202412', + abbr='LiveMathBench-v202412-Hard', + reader_cfg=dict( + input_columns=['prompt'], + output_column='answer' + ), + infer_cfg=dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict( + type=GenInferencer + ), + ), + eval_cfg=dict( + evaluator=dict( + type=LiveMathBenchEvaluator, + model_name='', + url=[], + use_extract_model=False, + extract_url=[], + extract_model_name='', + k=[1], + replication=1, + thresholds=[0.0] + ) + ) +) +livemathbench_datasets = [livemathbench_dataset] \ No newline at end of file diff --git a/opencompass/datasets/livemathbench/livemathbench.py b/opencompass/datasets/livemathbench/livemathbench.py index d2b4b93b..13abf3aa 100644 --- a/opencompass/datasets/livemathbench/livemathbench.py +++ 
b/opencompass/datasets/livemathbench/livemathbench.py @@ -48,6 +48,7 @@ class LiveMathBenchDataset(BaseDataset): if path != '': path = get_data_path(path) path = os.path.join(path, version) + for split, language in product(dataset_splits, dataset_languages): dataset_info[f'{split}_{language}'] = { 'single-choice': 0, @@ -64,7 +65,6 @@ class LiveMathBenchDataset(BaseDataset): if path != '': file_path = os.path.join(path, f'{split}_{language}.jsonl') - if not os.path.exists(file_path): raise FileNotFoundError( f'File {file_path} does not exist, please check the ' diff --git a/opencompass/models/turbomind_with_tf_above_v4_33.py b/opencompass/models/turbomind_with_tf_above_v4_33.py index 88b605f9..7138974d 100644 --- a/opencompass/models/turbomind_with_tf_above_v4_33.py +++ b/opencompass/models/turbomind_with_tf_above_v4_33.py @@ -164,8 +164,6 @@ class TurboMindModelwithChatTemplate(BaseModel): self.logger.info('Generation Config of LMdeploy: ') self.logger.info(gen_config) - - results = [] outputs = self.pipe(messages, gen_config=gen_config, do_preprocess=False) for output in outputs: From fd6fbf01a244663e866369bbedb4a974dd0bd37e Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Tue, 25 Feb 2025 20:34:41 +0800 Subject: [PATCH 09/58] [Update] Support AIME-24 Evaluation for DeepSeek-R1 series (#1888) * Update * Update * Update * Update --- examples/eval_simpleqa.py | 4 +- opencompass/cli/main.py | 2 - ...lympiadBench_0shot_llmverify_gen_be8b13.py | 109 ++++++++++++++++++ .../aime2024_llmverify_repeat16_gen_bf7475.py | 96 +++++++++++++++ .../aime2024_llmverify_repeat8_gen_e8fcee.py | 96 +++++++++++++++ .../math_prm800k_500_llmverify_gen_6ff468.py | 99 ++++++++++++++++ ...rm800k_500_llmverify_repeat4_gen_97b203.py | 100 ++++++++++++++++ opencompass/datasets/base.py | 3 +- .../icl_retriever/icl_topk_retriever.py | 3 +- requirements/runtime.txt | 2 +- tools/list_configs.py | 8 +- 11 files changed, 512 insertions(+), 10 deletions(-) create mode 100644 opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_llmverify_gen_be8b13.py create mode 100644 opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat16_gen_bf7475.py create mode 100644 opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat8_gen_e8fcee.py create mode 100644 opencompass/configs/datasets/math/math_prm800k_500_llmverify_gen_6ff468.py create mode 100644 opencompass/configs/datasets/math/math_prm800k_500_llmverify_repeat4_gen_97b203.py diff --git a/examples/eval_simpleqa.py b/examples/eval_simpleqa.py index 193c63b8..ade38d38 100644 --- a/examples/eval_simpleqa.py +++ b/examples/eval_simpleqa.py @@ -36,8 +36,8 @@ infer = dict( eval = dict( partitioner=dict( type=SubjectiveNaivePartitioner, - models=[gpt_4o_2024_05_13_model], - judge_models=[gpt_4o_2024_05_13_model], + models=models, + judge_models=judge_models, ), runner=dict(type=LocalRunner, max_num_workers=256, diff --git a/opencompass/cli/main.py b/opencompass/cli/main.py index 63377371..21308e10 100644 --- a/opencompass/cli/main.py +++ b/opencompass/cli/main.py @@ -177,8 +177,6 @@ def parse_dlc_args(dlc_parser): type=str) - - def parse_hf_args(hf_parser): """These args are all for the quick construction of HuggingFace models.""" hf_parser.add_argument('--hf-type', type=str, choices=['base', 'chat'], default='chat', help='The type of the HuggingFace model, base or chat') diff --git a/opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_llmverify_gen_be8b13.py b/opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_llmverify_gen_be8b13.py 
new file mode 100644 index 00000000..e0e59a33 --- /dev/null +++ b/opencompass/configs/datasets/OlympiadBench/OlympiadBench_0shot_llmverify_gen_be8b13.py @@ -0,0 +1,109 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import OlympiadBenchDataset, OlympiadBenchEvaluator, olympiadbench_postprocess_v2 +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +with read_base(): + from .OlympiadBench_categories import categories + +# Create prompter instance for problems +olympiadbench_prompter_cfg = dict( + type='OlympiadBenchPrompter' +) + +olympiadbench_reader_cfg = dict( + input_columns=[ + 'problem', 'language', 'subject', 'question_type', + 'answer_type', 'is_multiple_answer', 'unit', 'questions' + ], + output_column='solution' +) + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + + +olympiadbench_datasets = [] +for _name in categories: + olympiadbench_infer_cfg = dict( + prompt_template=dict( + type='OlympiadBenchTemplate' + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + # olympiadbench_eval_cfg = dict( + # evaluator=dict(type=OlympiadBenchEvaluator, version='v2'), + # pred_postprocessor=dict(type=olympiadbench_postprocess_v2), + # ) + # Evaluation configuration + olympiadbench_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=OlympiadBenchDataset, + path='opencompass/OlympiadBench', + name=_name, + reader_cfg=olympiadbench_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', + ) + + olympiadbench_datasets.append( + dict( + type=OlympiadBenchDataset, + abbr=f'OlympiadBench_{_name}', + path='opencompass/OlympiadBench', + name=_name, + reader_cfg=olympiadbench_reader_cfg, + infer_cfg=olympiadbench_infer_cfg, + eval_cfg=olympiadbench_eval_cfg, + ) + ) + +del _name diff --git a/opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat16_gen_bf7475.py b/opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat16_gen_bf7475.py new file mode 100644 index 00000000..070a63bc --- /dev/null +++ b/opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat16_gen_bf7475.py @@ -0,0 +1,96 @@ +# CoT: No CoT +# K-Shot: 0-Shot +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2 +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +aime2024_reader_cfg = dict( + input_columns=['question'], + output_column='answer' +) + + +aime2024_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'), + ], + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. 
Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{question}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +aime2024_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=Aime2024Dataset, + path='opencompass/aime2024', + reader_cfg=aime2024_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + +aime2024_datasets = [ + dict( + abbr=f'aime2024-run{idx}', + type=Aime2024Dataset, + path='opencompass/aime2024', + reader_cfg=aime2024_reader_cfg, + infer_cfg=aime2024_infer_cfg, + eval_cfg=aime2024_eval_cfg, + mode='singlescore', + ) + for idx in range(16) +] \ No newline at end of file diff --git a/opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat8_gen_e8fcee.py b/opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat8_gen_e8fcee.py new file mode 100644 index 00000000..07fa6d1b --- /dev/null +++ b/opencompass/configs/datasets/aime2024/aime2024_llmverify_repeat8_gen_e8fcee.py @@ -0,0 +1,96 @@ +# CoT: No CoT +# K-Shot: 0-Shot +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2 +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +aime2024_reader_cfg = dict( + input_columns=['question'], + output_column='answer' +) + + +aime2024_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + 
template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\nRemember to put your final answer within \\boxed{}.'), + ], + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + + : \n{question}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +aime2024_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=Aime2024Dataset, + path='opencompass/aime2024', + reader_cfg=aime2024_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + +aime2024_datasets = [ + dict( + abbr=f'aime2024-run{idx}', + type=Aime2024Dataset, + path='opencompass/aime2024', + reader_cfg=aime2024_reader_cfg, + infer_cfg=aime2024_infer_cfg, + eval_cfg=aime2024_eval_cfg, + mode='singlescore', + ) + for idx in range(8) +] \ No newline at end of file diff --git a/opencompass/configs/datasets/math/math_prm800k_500_llmverify_gen_6ff468.py b/opencompass/configs/datasets/math/math_prm800k_500_llmverify_gen_6ff468.py new file mode 100644 index 00000000..78e66452 --- /dev/null +++ b/opencompass/configs/datasets/math/math_prm800k_500_llmverify_gen_6ff468.py @@ -0,0 +1,99 @@ +# CoT: No CoT +# K-Shot: 0-Shot +# Verify: LLM Verify +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import MATHDataset + + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. 
For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500-llmjudge', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + mode='singlescore', + ) +] diff --git a/opencompass/configs/datasets/math/math_prm800k_500_llmverify_repeat4_gen_97b203.py b/opencompass/configs/datasets/math/math_prm800k_500_llmverify_repeat4_gen_97b203.py new file mode 100644 index 00000000..a7e373e9 --- /dev/null +++ b/opencompass/configs/datasets/math/math_prm800k_500_llmverify_repeat4_gen_97b203.py @@ -0,0 +1,100 @@ +# CoT: No CoT +# K-Shot: 0-Shot +# Verify: LLM Verify +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import MATHDataset + + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. 
+ + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + + +math_datasets = [ + dict( + type=MATHDataset, + abbr=f'math_prm800k_500-llmjudge-run{idx}', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + mode='singlescore', + ) + for idx in range(4) +] diff --git a/opencompass/datasets/base.py b/opencompass/datasets/base.py index 5412ef4c..5dc0f073 100644 --- a/opencompass/datasets/base.py +++ b/opencompass/datasets/base.py @@ -1,4 +1,3 @@ -from abc import abstractstaticmethod from typing import Dict, Optional, Union from datasets import Dataset, DatasetDict @@ -23,6 +22,6 @@ class BaseDataset: def test(self): return self.reader.dataset['test'] - @abstractstaticmethod + @staticmethod def load(**kwargs) -> Union[Dataset, DatasetDict]: pass diff --git a/opencompass/openicl/icl_retriever/icl_topk_retriever.py b/opencompass/openicl/icl_retriever/icl_topk_retriever.py index c9ac8f81..9703a621 100644 --- a/opencompass/openicl/icl_retriever/icl_topk_retriever.py +++ b/opencompass/openicl/icl_retriever/icl_topk_retriever.py @@ -7,7 +7,6 @@ from typing import Any, Dict, List, Optional, Union import numpy as np import torch import tqdm -from sentence_transformers import SentenceTransformer from torch.utils.data import DataLoader from transformers import AutoTokenizer, BatchEncoding, PreTrainedTokenizerBase from transformers.file_utils import PaddingStrategy @@ -56,6 +55,8 @@ class TopkRetriever(BaseRetriever): tokenizer_name: Optional[str] = 'gpt2-xl', batch_size: Optional[int] = 1) -> None: super().__init__(dataset, ice_separator, ice_eos_token, ice_num) + from sentence_transformers import SentenceTransformer + self.device = 'cuda' if torch.cuda.is_available() else 'cpu' self.batch_size = batch_size self.tokenizer_name = tokenizer_name diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 47133f21..348df85d 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -9,7 +9,7 @@ fuzzywuzzy gradio-client h5py httpx==0.27.2 -huggingface_hub<=0.24.7 +huggingface_hub immutabledict importlib-metadata jieba diff --git a/tools/list_configs.py b/tools/list_configs.py index cc778aeb..9aa6cc49 100644 --- a/tools/list_configs.py +++ b/tools/list_configs.py @@ -19,11 +19,15 @@ def parse_args(): def main(): args = parse_args() - models = match_files('configs/models/', args.pattern, fuzzy=True) + models = match_files('opencompass/configs/models/', + args.pattern, + fuzzy=True) if models: table = [['Model', 'Config Path'], *models] print(tabulate.tabulate(table, headers='firstrow', tablefmt='psql')) - datasets = match_files('configs/datasets/', args.pattern, fuzzy=True) + datasets = match_files('opencompass/configs/datasets/', + args.pattern, + fuzzy=True) if 
datasets: table = [['Dataset', 'Config Path'], *datasets] print(tabulate.tabulate(table, headers='firstrow', tablefmt='psql')) From bdb2d46f5905a6aa23e977f34989c865671a8513 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Wed, 26 Feb 2025 15:08:50 +0800 Subject: [PATCH 10/58] [Feature] Add general math, llm judge evaluator (#1892) * update_doc * update llm_judge * update README * update md file name --- README.md | 1 + README_zh-CN.md | 1 + docs/en/advanced_guides/llm_judge.md | 252 ++++++++++++++++++++++ docs/en/advanced_guides/math_verify.md | 190 ++++++++++++++++ docs/en/index.rst | 10 +- docs/zh_cn/advanced_guides/llm_judge.md | 251 +++++++++++++++++++++ docs/zh_cn/advanced_guides/math_verify.md | 190 ++++++++++++++++ docs/zh_cn/index.rst | 11 +- examples/eval_llm_judge.py | 116 ++++++++++ opencompass/tasks/openicl_eval.py | 100 ++++++--- 10 files changed, 1075 insertions(+), 47 deletions(-) create mode 100644 docs/en/advanced_guides/llm_judge.md create mode 100644 docs/en/advanced_guides/math_verify.md create mode 100644 docs/zh_cn/advanced_guides/llm_judge.md create mode 100644 docs/zh_cn/advanced_guides/math_verify.md create mode 100644 examples/eval_llm_judge.py diff --git a/README.md b/README.md index 736968eb..93457e09 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through ## 🚀 What's New +- **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥 - **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks. - **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](examples/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it. - **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](examples/eval_musr.py) and give it a spin! 🔥🔥🔥 diff --git a/README_zh-CN.md b/README_zh-CN.md index 8d8ecd02..6d071ebc 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -57,6 +57,7 @@ ## 🚀 最新进展 +- **\[2025.02.15\]** 我们新增了两个实用的评测工具:用于LLM作为评判器的`GenericLLMEvaluator`和用于数学推理评估的`MATHEvaluator`。查看[LLM评判器](docs/zh_cn/advanced_guides/llm_judge.md)和[数学能力评测](docs/zh_cn/advanced_guides/general_math.md)文档了解更多详情!🔥🔥🔥 - **\[2025.01.16\]** 我们现已支持 [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) 模型,该模型在推理、知识类任务上取得同量级最优性能,欢迎尝试。 - **\[2024.12.17\]** 我们提供了12月CompassAcademic学术榜单评估脚本 [CompassAcademic](configs/eval_academic_leaderboard_202412.py),你可以通过简单地配置复现官方评测结果。 - **\[2024.10.14\]** 现已支持OpenAI多语言问答数据集[MMMLU](https://huggingface.co/datasets/openai/MMMLU),欢迎尝试! 
🔥🔥🔥 diff --git a/docs/en/advanced_guides/llm_judge.md b/docs/en/advanced_guides/llm_judge.md new file mode 100644 index 00000000..91a1a5bf --- /dev/null +++ b/docs/en/advanced_guides/llm_judge.md @@ -0,0 +1,252 @@ +# LLM as Judge Evaluation + +## Introduction + +The GenericLLMEvaluator is particularly useful for scenarios where rule-based methods (like regular expressions) cannot perfectly judge outputs, such as: + +- Cases where models output answer content without option identifiers +- Factual judgment datasets that are difficult to evaluate with rules +- Open-ended responses requiring complex understanding and reasoning +- Evaluation that requires a lot of rules to be designed + +OpenCompass provides the GenericLLMEvaluator component to facilitate LLM-as-judge evaluations. + +## Dataset Format + +The dataset for LLM judge evaluation should be in either JSON Lines (.jsonl) or CSV format. Each entry should contain at least: + +- A problem or question +- A reference answer or gold standard +- (The model's prediction will be generated during evaluation) + +Example JSONL format: + +```json +{"problem": "What is the capital of France?", "answer": "Paris"} +``` + +Example CSV format: + +```csv +problem,answer +"What is the capital of France?","Paris" +``` + +## Configuration + +To set up an LLM judge evaluation, you'll need to configure three main components: + +1. Dataset Reader Configuration + +```python +reader_cfg = dict( + input_columns=['problem'], # Column name for the question + output_column='answer' # Column name for the reference answer +) +``` + +2. Inference Configuration + +```python +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}', # Template for prompting the model + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) +``` + +3. 
Evaluation Configuration with LLM Judge + +```python +eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, # Using LLM as evaluator + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE), # Template for the judge + ], + ), + ), + dataset_cfg=dict( + type=CustomDataset, + path='path/to/your/dataset', + file_name='your_dataset.jsonl', + reader_cfg=reader_cfg, + ), + judge_cfg=YOUR_JUDGE_MODEL_CONFIG, # Configuration for the judge model + dict_postprocessor=dict(type=generic_llmjudge_postprocess), # Post-processing the judge's output + ), +) +``` + +## Using CustomDataset with GenericLLMEvaluator + +Here's how to set up a complete configuration for LLM judge evaluation: + +```python +from mmengine.config import read_base +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.datasets import CustomDataset +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +# Import your judge model configuration +with read_base(): + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import ( + models as judge_model, + ) + +# Define your judge template +JUDGE_TEMPLATE = """ +Please evaluate whether the following response correctly answers the question. +Question: {problem} +Reference Answer: {answer} +Model Response: {prediction} + +Is the model response correct? If correct, answer "A"; if incorrect, answer "B". +""".strip() + +# Dataset reader configuration +reader_cfg = dict(input_columns=['problem'], output_column='answer') + +# Inference configuration for the model being evaluated +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration with LLM judge +eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=JUDGE_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=CustomDataset, + path='path/to/your/dataset', + file_name='your_dataset.jsonl', + reader_cfg=reader_cfg, + ), + judge_cfg=judge_model[0], + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + +# Dataset configuration +datasets = [ + dict( + type=CustomDataset, + abbr='my-dataset', + path='path/to/your/dataset', + file_name='your_dataset.jsonl', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + ) +] + +# Model configuration for the model being evaluated +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='model-to-evaluate', + path='path/to/your/model', + # ... 
other model configurations + ) +] + +# Output directory +work_dir = './outputs/llm_judge_eval' +``` + +## GenericLLMEvaluator + +The GenericLLMEvaluator is designed to use an LLM as a judge for evaluating model outputs. Key features include: + +1. Flexible prompt templates for instructing the judge +2. Support for various judge models (local or API-based) +3. Customizable evaluation criteria through prompt engineering +4. Post-processing of judge outputs to extract structured evaluations + +**Important Note**: The current generic version of the judge template only supports outputs in the format of "A" (correct) or "B" (incorrect), and does not support other output formats (like "CORRECT" or "INCORRECT"). This is because the post-processing function `generic_llmjudge_postprocess` is specifically designed to parse this format. + +The evaluator works by: + +1. Taking the original problem, reference answer, and model prediction +2. Formatting them into a prompt for the judge model +3. Parsing the judge's response to determine the evaluation result (looking for "A" or "B") +4. Aggregating results across the dataset + +If you would like to see the full details of evaluation results, you can add `--dump-eval-details` to the command line when you start the job. +Example evaluation output: + +```python +{ + 'accuracy': 75.0, # Percentage of responses judged as correct + 'details': [ + { + 'origin_prompt': """ + Please evaluate whether the following response correctly answers the question. + Question: What is the capital of France? + Reference Answer: Paris + Model Response: Paris + Is the model response correct? If correct, answer "A"; if incorrect, answer "B". +""", + 'gold': 'Paris', + 'prediction': 'A', + }, + # ... more results + ] +} +``` + +## Complete Example + +For a complete working example, refer to the `eval_llm_judge.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving using an LLM judge. diff --git a/docs/en/advanced_guides/math_verify.md b/docs/en/advanced_guides/math_verify.md new file mode 100644 index 00000000..da9cfd2f --- /dev/null +++ b/docs/en/advanced_guides/math_verify.md @@ -0,0 +1,190 @@ +# General Math Evaluation Guidance + +## Introduction + +Mathematical reasoning is a crucial capability for large language models (LLMs). To evaluate a model's mathematical abilities, we need to test its capability to solve mathematical problems step by step and provide accurate final answers. OpenCompass provides a convenient way to evaluate mathematical reasoning through the CustomDataset and MATHEvaluator components. + +## Dataset Format + +The math evaluation dataset should be in either JSON Lines (.jsonl) or CSV format. Each problem should contain at least: + +- A problem statement +- A solution/answer (typically in LaTeX format with the final answer in \\boxed{}) + +Example JSONL format: + +```json +{"problem": "Find the value of x if 2x + 3 = 7", "solution": "Let's solve step by step:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\nTherefore, \\boxed{2}"} +``` + +Example CSV format: + +```csv +problem,solution +"Find the value of x if 2x + 3 = 7","Let's solve step by step:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\nTherefore, \\boxed{2}" +``` + +## Configuration + +To evaluate mathematical reasoning, you'll need to set up three main components: + +1. Dataset Reader Configuration + +```python +math_reader_cfg = dict( + input_columns=['problem'], # Column name for the question + output_column='solution' # Column name for the answer +) +``` + +2. 
Inference Configuration
+
+```python
+math_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(
+                    role='HUMAN',
+                    prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
+                ),
+            ]
+        ),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+```
+
+3. Evaluation Configuration
+
+```python
+math_eval_cfg = dict(
+    evaluator=dict(type=MATHEvaluator),
+)
+```
+
+## Using CustomDataset
+
+Here's how to set up a complete configuration for math evaluation:
+
+```python
+from mmengine.config import read_base
+from opencompass.models import TurboMindModelwithChatTemplate
+from opencompass.datasets import CustomDataset
+
+math_datasets = [
+    dict(
+        type=CustomDataset,
+        abbr='my-math-dataset',  # Dataset abbreviation
+        path='path/to/your/dataset',  # Path to your dataset file
+        reader_cfg=math_reader_cfg,
+        infer_cfg=math_infer_cfg,
+        eval_cfg=math_eval_cfg,
+    )
+]
+```
+
+## MATHEvaluator
+
+The MATHEvaluator is specifically designed to evaluate mathematical answers. It is built on top of the math_verify library, which provides mathematical expression parsing and verification capabilities, supporting extraction and equivalence verification for both LaTeX and general expressions.
+
+The MATHEvaluator:
+
+1. Extracts answers from both predictions and references using LaTeX extraction
+2. Handles various LaTeX formats and environments
+3. Verifies mathematical equivalence between predicted and reference answers
+4. Provides detailed evaluation results including:
+   - Accuracy score
+   - Detailed comparison between predictions and references
+   - Parse results of both predicted and reference answers
+
+The evaluator supports:
+
+- Basic arithmetic operations
+- Fractions and decimals
+- Algebraic expressions
+- Trigonometric functions
+- Roots and exponents
+- Mathematical symbols and operators
+
+Example evaluation output:
+
+```python
+{
+    'accuracy': 85.0,  # Percentage of correct answers
+    'details': [
+        {
+            'predictions': 'x = 2',  # Parsed prediction
+            'references': 'x = 2',  # Parsed reference
+            'correct': True  # Whether they match
+        },
+        # ... 
more results + ] +} +``` + +## Complete Example + +Here's a complete example of how to set up math evaluation: + +```python +from mmengine.config import read_base +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.datasets import CustomDataset +from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +# Dataset reader configuration +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +# Inference configuration +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator), +) + +# Dataset configuration +math_datasets = [ + dict( + type=CustomDataset, + abbr='my-math-dataset', + path='path/to/your/dataset.jsonl', # or .csv + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] + +# Model configuration +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='your-model-name', + path='your/model/path', + # ... other model configurations + ) +] + +# Output directory +work_dir = './outputs/math_eval' +``` diff --git a/docs/en/index.rst b/docs/en/index.rst index 7181c459..8c64f210 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -39,8 +39,6 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass. user_guides/evaluation.md user_guides/experimentation.md user_guides/metrics.md - user_guides/summarizer.md - user_guides/corebench.md .. _Prompt: .. toctree:: @@ -62,16 +60,12 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass. advanced_guides/custom_dataset.md advanced_guides/new_model.md advanced_guides/evaluation_lmdeploy.md - advanced_guides/evaluation_lightllm.md advanced_guides/accelerator_intro.md + advanced_guides/math_verify.md + advanced_guides/llm_judge.md advanced_guides/code_eval.md advanced_guides/code_eval_service.md - advanced_guides/prompt_attack.md - advanced_guides/longeval.md advanced_guides/subjective_evaluation.md - advanced_guides/circular_eval.md - advanced_guides/contamination_eval.md - advanced_guides/needleinahaystack_eval.md .. _Tools: .. toctree:: diff --git a/docs/zh_cn/advanced_guides/llm_judge.md b/docs/zh_cn/advanced_guides/llm_judge.md new file mode 100644 index 00000000..66d288a8 --- /dev/null +++ b/docs/zh_cn/advanced_guides/llm_judge.md @@ -0,0 +1,251 @@ +# LLM 作为评判器 + +## 简介 + +GenericLLMEvaluator组件特别适用于那些难以通过规则式方法(如正则表达式)进行完美判断的场景,例如: + +- 模型不输出选项标识而只输出选项内容的情况 +- 需要事实性判断的数据集 +- 需要复杂理解和推理的开放式回答 +- 需要设计大量规则的判断 + +OpenCompass提供了GenericLLMEvaluator组件来实现LLM作为评判器的评估。 + +## 数据集格式 + +用于LLM评判的数据集应该是JSON Lines (.jsonl)或CSV格式。每个条目至少应包含: + +- 问题或任务 +- 参考答案或标准答案 +- (模型的预测将在评估过程中生成) + +JSONL格式示例: + +```json +{"problem": "法国的首都是什么?", "answer": "巴黎"} +``` + +CSV格式示例: + +```csv +problem,answer +"法国的首都是什么?","巴黎" +``` + +## 配置说明 + +要设置LLM评判评估,你需要配置三个主要组件: + +1. 数据集读取配置 + +```python +reader_cfg = dict( + input_columns=['problem'], # 问题列的名称 + output_column='answer' # 参考答案列的名称 +) +``` + +2. 
推理配置 + +```python +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}', # 提示模型的模板 + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) +``` + +3. 使用LLM评判器的评估配置 + +```python +eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, # 使用LLM作为评估器 + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="你是一个负责评估模型输出正确性和质量的助手。", + ) + ], + round=[ + dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE), # 评判器的模板 + ], + ), + ), + dataset_cfg=dict( + type=CustomDataset, + path='path/to/your/dataset', + file_name='your_dataset.jsonl', + reader_cfg=reader_cfg, + ), + judge_cfg=YOUR_JUDGE_MODEL_CONFIG, # 评判模型的配置 + dict_postprocessor=dict(type=generic_llmjudge_postprocess), # 处理评判器输出的后处理器 + ), +) +``` + +## 使用CustomDataset和GenericLLMEvaluator + +以下是如何设置完整的LLM评判评估配置: + +```python +from mmengine.config import read_base +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.datasets import CustomDataset +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +# 导入评判模型配置 +with read_base(): + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import ( + models as judge_model, + ) + +# 定义评判模板 +JUDGE_TEMPLATE = """ +请评估以下回答是否正确地回答了问题。 +问题:{problem} +参考答案:{answer} +模型回答:{prediction} + +模型回答是否正确?如果正确,请回答"A";如果不正确,请回答"B"。 +""".strip() + +# 数据集读取配置 +reader_cfg = dict(input_columns=['problem'], output_column='answer') + +# 被评估模型的推理配置 +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# 使用LLM评判器的评估配置 +eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="你是一个负责评估模型输出正确性和质量的助手。", + ) + ], + round=[ + dict(role='HUMAN', prompt=JUDGE_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=CustomDataset, + path='path/to/your/dataset', + file_name='your_dataset.jsonl', + reader_cfg=reader_cfg, + ), + judge_cfg=judge_model[0], + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + +# 数据集配置 +datasets = [ + dict( + type=CustomDataset, + abbr='my-dataset', + path='path/to/your/dataset', + file_name='your_dataset.jsonl', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + ) +] + +# 被评估模型的配置 +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='model-to-evaluate', + path='path/to/your/model', + # ... 其他模型配置 + ) +] + +# 输出目录 +work_dir = './outputs/llm_judge_eval' +``` + +## GenericLLMEvaluator + +GenericLLMEvaluator专为使用LLM作为评判器评估模型输出而设计。主要特点包括: + +1. 灵活的提示模板,用于指导评判器 +2. 支持各种评判模型(本地或基于API) +3. 通过提示工程自定义评估标准 +4. 对评判器输出进行后处理以提取结构化评估 + +**重要说明**:目前通用版本的评判模板只支持输出"A"(正确)或"B"(不正确)的格式,不支持其他输出格式(如"正确"或"不正确")。这是因为后处理函数`generic_llmjudge_postprocess`专门设计为解析这种格式。 + +评估器的工作原理: + +1. 获取原始问题、参考答案和模型预测 +2. 将它们格式化为评判模型的提示 +3. 解析评判器的响应以确定评估结果(寻找"A"或"B") +4. 
汇总整个数据集的结果 + +如果需要查看评估的详细结果,可以在启动任务时添加`--dump-eval-details`到命令行。 +评估输出示例: + +```python +{ + 'accuracy': 75.0, # 被判断为正确的回答百分比 + 'details': [ + { + 'origin_prompt': """ + 请评估以下回答是否正确地回答了问题。 + 问题:法国的首都是什么? + 参考答案:巴黎 + 模型回答:法国的首都是巴黎。 + 模型回答是否正确?如果正确,请回答"A";如果不正确,请回答"B"。""", + 'gold': '巴黎', + 'prediction': 'A', + }, + # ... 更多结果 + ] +} +``` + +## 完整示例 + +有关完整的工作示例,请参考examples目录中的`eval_llm_judge.py`文件,该文件演示了如何使用LLM评判器评估数学问题解决能力。 diff --git a/docs/zh_cn/advanced_guides/math_verify.md b/docs/zh_cn/advanced_guides/math_verify.md new file mode 100644 index 00000000..8e8d2fa6 --- /dev/null +++ b/docs/zh_cn/advanced_guides/math_verify.md @@ -0,0 +1,190 @@ +# 数学能力评测 + +## 简介 + +数学推理能力是大语言模型(LLMs)的一项关键能力。为了评估模型的数学能力,我们需要测试其逐步解决数学问题并提供准确最终答案的能力。OpenCompass 通过 CustomDataset 和 MATHEvaluator 组件提供了一种便捷的数学推理评测方式。 + +## 数据集格式 + +数学评测数据集应该是 JSON Lines (.jsonl) 或 CSV 格式。每个问题至少应包含: + +- 问题陈述 +- 解答/答案(通常使用 LaTeX 格式,最终答案需要用 \\boxed{} 括起来) + +JSONL 格式示例: + +```json +{"problem": "求解方程 2x + 3 = 7", "solution": "让我们逐步解决:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\n因此,\\boxed{2}"} +``` + +CSV 格式示例: + +```csv +problem,solution +"求解方程 2x + 3 = 7","让我们逐步解决:\n2x + 3 = 7\n2x = 7 - 3\n2x = 4\nx = 2\n因此,\\boxed{2}" +``` + +## 配置说明 + +要进行数学推理评测,你需要设置三个主要组件: + +1. 数据集读取配置 + +```python +math_reader_cfg = dict( + input_columns=['problem'], # 问题列的名称 + output_column='solution' # 答案列的名称 +) +``` + +2. 推理配置 + +```python +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}\n请逐步推理,并将最终答案放在 \\boxed{} 中。', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) +``` + +3. 评测配置 + +```python +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator), +) +``` + +## 使用 CustomDataset + +以下是如何设置完整的数学评测配置: + +```python +from mmengine.config import read_base +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.datasets import CustomDataset + +math_datasets = [ + dict( + type=CustomDataset, + abbr='my-math-dataset', # 数据集简称 + path='path/to/your/dataset', # 数据集文件路径 + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] +``` + +## MATHEvaluator + +MATHEvaluator 是专门设计用于评估数学答案的评测器。它基于 math_verify 库进行开发,该库提供了数学表达式解析和验证功能,支持 LaTeX 和一般表达式的提取与等价性验证。 + +MATHEvaluator 具有以下功能: + +1. 使用 LaTeX 提取器从预测和参考答案中提取答案 +2. 处理各种 LaTeX 格式和环境 +3. 验证预测答案和参考答案之间的数学等价性 +4. 提供详细的评测结果,包括: + - 准确率分数 + - 预测和参考答案的详细比较 + - 预测和参考答案的解析结果 + +评测器支持: + +- 基本算术运算 +- 分数和小数 +- 代数表达式 +- 三角函数 +- 根式和指数 +- 数学符号和运算符 + +评测输出示例: + +```python +{ + 'accuracy': 85.0, # 正确答案的百分比 + 'details': [ + { + 'predictions': 'x = 2', # 解析后的预测答案 + 'references': 'x = 2', # 解析后的参考答案 + 'correct': True # 是否匹配 + }, + # ... 
更多结果 + ] +} +``` + +## 完整示例 + +以下是设置数学评测的完整示例: + +```python +from mmengine.config import read_base +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.datasets import CustomDataset +from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer + +# 数据集读取配置 +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +# 推理配置 +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}\n请逐步推理,并将最终答案放在 \\boxed{} 中。', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# 评测配置 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator), +) + +# 数据集配置 +math_datasets = [ + dict( + type=CustomDataset, + abbr='my-math-dataset', + path='path/to/your/dataset.jsonl', # 或 .csv + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] + +# 模型配置 +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='your-model-name', + path='your/model/path', + # ... 其他模型配置 + ) +] + +# 输出目录 +work_dir = './outputs/math_eval' +``` diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 827c7d91..2a2628c9 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -40,8 +40,6 @@ OpenCompass 上手路线 user_guides/evaluation.md user_guides/experimentation.md user_guides/metrics.md - user_guides/summarizer.md - user_guides/corebench.md .. _提示词: .. toctree:: @@ -62,17 +60,12 @@ OpenCompass 上手路线 advanced_guides/custom_dataset.md advanced_guides/new_model.md advanced_guides/evaluation_lmdeploy.md - advanced_guides/evaluation_lightllm.md advanced_guides/accelerator_intro.md + advanced_guides/math_verify.md + advanced_guides/llm_judge.md advanced_guides/code_eval.md advanced_guides/code_eval_service.md - advanced_guides/prompt_attack.md - advanced_guides/longeval.md advanced_guides/subjective_evaluation.md - advanced_guides/circular_eval.md - advanced_guides/contamination_eval.md - advanced_guides/compassbench_intro.md - advanced_guides/needleinahaystack_eval.md .. _工具: .. 
toctree:: diff --git a/examples/eval_llm_judge.py b/examples/eval_llm_judge.py new file mode 100644 index 00000000..b7e18463 --- /dev/null +++ b/examples/eval_llm_judge.py @@ -0,0 +1,116 @@ +from mmengine.config import read_base +from opencompass.models.openai_api import OpenAISDK + +# Import pre-configured models from OpenCompass +with read_base(): + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import ( + models as lmdeploy_qwen2_5_7b_instruct_model, + ) + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_14b_instruct import ( + models as lmdeploy_qwen2_5_14b_instruct_model, + ) +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import CustomDataset + + +# Dataset reader configuration +math_reader_cfg = dict(input_columns=['problem'], output_column='answer') + +# Inference configuration +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}\nRemember to put your final answer within \\boxed{}.', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +# Template for the LLM judge +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. 
Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration using LLM as judge +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=CustomDataset, + path='opencompass/math', + file_name='test_prm800k_500.jsonl', + reader_cfg=math_reader_cfg, + ), + judge_cfg=lmdeploy_qwen2_5_14b_instruct_model[0], + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), +) + +# Dataset configuration +datasets = [ + dict( + type=CustomDataset, + path='opencompass/math', + file_name='test_prm800k_500.jsonl', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] + +# Model to be evaluated +models = lmdeploy_qwen2_5_7b_instruct_model + +# Limiting test to first 8 examples for quick testing +math_reader_cfg['test_range'] = '[0:8]' + +# Output directory +work_dir = 'outputs/llm_judge' diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py index a797459f..65ea510f 100644 --- a/opencompass/tasks/openicl_eval.py +++ b/opencompass/tasks/openicl_eval.py @@ -3,6 +3,7 @@ import copy import math import os import os.path as osp +import random import statistics import sys import time @@ -37,18 +38,31 @@ class OpenICLEvalTask(BaseTask): super().__init__(cfg) self.logger = get_logger() self.num_gpus = max( - c.get('eval_cfg', {}).get('num_gpus', 0) + max( + c.get('eval_cfg', {}).get('num_gpus', 0), + c.get('eval_cfg', {}).get('evaluator', {}).get( + 'judge_cfg', {}).get('run_cfg', {}).get('num_gpus', 0), + ) for c in sum(self.dataset_cfgs, [])) + self.num_procs = max( + c.get('eval_cfg', {}).get('evaluator', {}).get( + 'judge_cfg', {}).get('run_cfg', {}).get('num_procs', 1) for c in sum(self.dataset_cfgs, [])) - self.dump_details = cfg.get('eval', {}).get('runner', {}).get( - 'task', {}).get('dump_details', False) - self.cal_extract_rate = cfg.get('eval', {}).get('runner', {}).get( - 'task', {}).get('cal_extract_rate', False) + self.dump_details = (cfg.get('eval', {}).get('runner', {}).get( + 'task', {}).get('dump_details', False)) + self.cal_extract_rate = (cfg.get('eval', {}).get('runner', {}).get( + 'task', {}).get('cal_extract_rate', False)) def get_command(self, cfg_path, template): sys.path.append(os.getcwd()) script_path = __file__ - python = sys.executable - command = f'{python} {script_path} {cfg_path}' + if self.num_gpus > 1: + port = random.randint(12000, 32000) + command = (f'torchrun --master_port={port} ' + f'--nproc_per_node {self.num_procs} ' + f'{script_path} {cfg_path}') + else: + python = sys.executable + command = f'{python} {script_path} {cfg_path}' return template.format(task_cmd=command) def run(self): @@ -63,8 +77,10 @@ class OpenICLEvalTask(BaseTask): dataset_cfg['reader_cfg']['output_column']) out_path = get_infer_output_path( - self.model_cfg, self.dataset_cfg, - osp.join(self.work_dir, 'results')) + self.model_cfg, + self.dataset_cfg, + osp.join(self.work_dir, 'results'), + ) if osp.exists(out_path): continue self._score() @@ -86,8 
+102,10 @@ class OpenICLEvalTask(BaseTask): # Load predictions filename = get_infer_output_path( - self.model_cfg, self.dataset_cfg, - osp.join(self.work_dir, 'predictions')) + self.model_cfg, + self.dataset_cfg, + osp.join(self.work_dir, 'predictions'), + ) # in case the prediction is partial root, ext = osp.splitext(filename) partial_filename = root + '_0' + ext @@ -123,6 +141,7 @@ class OpenICLEvalTask(BaseTask): and not MODELS.get(self.model_cfg['type']).is_api): # Create a prompt template for role config parsing from opencompass.models.base import LMTemplateParser + parser = LMTemplateParser(self.model_cfg['meta_template']) role = parser.roles[self.eval_cfg['pred_role']] if sc_size is not None: @@ -131,15 +150,19 @@ class OpenICLEvalTask(BaseTask): 'must be list.') if pred_list_flag: pred_strs = [[ - extract_role_pred(_pred, role.get('begin', None), - role.get('end', None)) - for _pred in pred + extract_role_pred( + _pred, + role.get('begin', None), + role.get('end', None), + ) for _pred in pred ] for pred in pred_strs] else: pred_strs = [ - extract_role_pred(pred, role.get('begin', None), - role.get('end', None)) - for pred in pred_strs + extract_role_pred( + pred, + role.get('begin', None), + role.get('end', None), + ) for pred in pred_strs ] # Postprocess predictions if necessary @@ -195,8 +218,10 @@ class OpenICLEvalTask(BaseTask): icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator']) # need results dir to save other files out_path = get_infer_output_path( - self.model_cfg, self.dataset_cfg, - osp.join(self.work_dir, 'results')) + self.model_cfg, + self.dataset_cfg, + osp.join(self.work_dir, 'results'), + ) icl_evaluator._out_dir = osp.splitext(out_path)[ 0] # strip extension @@ -235,9 +260,13 @@ class OpenICLEvalTask(BaseTask): details = result.get('details', None) try: result['details'] = self.format_details( - pred_strs, model_pred_strs, - test_set[self.output_column], details, model_details, - pred_dicts) + pred_strs, + model_pred_strs, + test_set[self.output_column], + details, + model_details, + pred_dicts, + ) self.logger.warning( f"result['details'] : {result['details']}"), result['type'] = result['details'].pop('type', None) @@ -247,8 +276,8 @@ class OpenICLEvalTask(BaseTask): if 'PPL' in str( self.dataset_cfg.infer_cfg.inferencer.type): - result['correct_bpb'], result['incorrect_bpb'] = \ - self.calculate_bpb(pred_dicts) + result['correct_bpb'], result['incorrect_bpb'] = ( + self.calculate_bpb(pred_dicts)) except Exception as e: self.logger.warning(f'Skip dumping details due to: {e}.') else: @@ -281,8 +310,11 @@ class OpenICLEvalTask(BaseTask): f'{task_abbr_from_cfg(self.cfg)}:{model_result_wo_details}') # Save result - out_path = get_infer_output_path(self.model_cfg, self.dataset_cfg, - osp.join(self.work_dir, 'results')) + out_path = get_infer_output_path( + self.model_cfg, + self.dataset_cfg, + osp.join(self.work_dir, 'results'), + ) mkdir_or_exist(osp.split(out_path)[0]) mmengine.dump(result, out_path, ensure_ascii=False, indent=4) @@ -305,8 +337,15 @@ class OpenICLEvalTask(BaseTask): success_rate = 100 - len(invalid_extractions) / len(details) * 100 return success_rate - def format_details(self, predictions, model_pred_strs, references, details, - model_details, pred_dicts): + def format_details( + self, + predictions, + model_pred_strs, + references, + details, + model_details, + pred_dicts, + ): """This function is responsible for formatting prediction details. 
Args: @@ -344,8 +383,9 @@ class OpenICLEvalTask(BaseTask): result['references'] = str(references[i]) result['correct'] = str(predictions[i]) == str(references[i]) elif details is not None and model_details is not None: - assert model_pred_strs != [], \ - 'Model details is not None, but model_pred_strs is empty' + assert ( + model_pred_strs != [] + ), 'Model details is not None, but model_pred_strs is empty' self.logger.info( f"model_details[i]['pred']: {model_details[i]['pred']}") results['type'] = 'GEN' From 6042b88e585b215785da184b18488cad59d57e6e Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 26 Feb 2025 19:04:01 +0800 Subject: [PATCH 11/58] [CI] update dailytest sceduler and baseline's score(#1898) --- .github/scripts/eval_regression_api.py | 2 +- .../scripts/oc_score_baseline_fullbench.yaml | 4 +-- .../scripts/oc_score_baseline_testrange.yaml | 32 +++++++++---------- .github/workflows/daily-run-test.yml | 16 +++++----- 4 files changed, 27 insertions(+), 27 deletions(-) diff --git a/.github/scripts/eval_regression_api.py b/.github/scripts/eval_regression_api.py index 29ec5406..ba1902a9 100644 --- a/.github/scripts/eval_regression_api.py +++ b/.github/scripts/eval_regression_api.py @@ -24,7 +24,7 @@ models = [ abbr='lmdeploy-api-test', type=OpenAISDK, key='EMPTY', - openai_api_base='http://localhost:23333/v1', + openai_api_base='http://0.0.0.0:23333/v1', path='internlm2', tokenizer_path='internlm/internlm2_5-7b-chat', rpm_verbose=True, diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 5b0dee2b..9f171a02 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -42,7 +42,7 @@ internlm2_5-7b-chat-hf_fullbench: alpaca_eval_total: 20 arenahard_score: 50 Followbench_naive_average: 1 - CompassArena_naive_average: 44.00 + CompassArena_naive_average: 43 mtbench101_avg: 7.8 wildbench_average: -12.78 simpleqa_accuracy_given_attempted: 0 @@ -58,7 +58,7 @@ internlm2_5-7b-chat-hf_fullbench: alpaca_eval_helpful_base: 20 compassarena_language_naive_average: 35 compassarena_knowledge_naive_average: 55 - compassarena_reason_v2_naive_average: 45.00 + compassarena_reason_v2_naive_average: 40 compassarena_math_v2_naive_average: 55 compassarena_creationv2_zh_naive_average: 30 followbench_llmeval_en_HSR_AVG: 1 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index 5f1121a7..45f74131 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -6,7 +6,7 @@ chat: gsm8k_accuracy: 71.88 race-high_accuracy: 90.62 glm-4-9b-chat-vllm: - gsm8k_accuracy: 65.62 + gsm8k_accuracy: 71.88 race-high_accuracy: 90.62 deepseek-7b-chat-hf: gsm8k_accuracy: 46.88 @@ -63,7 +63,7 @@ chat: gsm8k_accuracy: 84.38 race-high_accuracy: 90.62 llama-3_2-3b-instruct-hf: - gsm8k_accuracy: 65.62 + gsm8k_accuracy: 68.75 race-high_accuracy: 81.25 llama-3-8b-instruct-hf: gsm8k_accuracy: 68.75 @@ -75,7 +75,7 @@ chat: gsm8k_accuracy: 78.12 race-high_accuracy: 90.62 llama-3_2-3b-instruct-turbomind: - gsm8k_accuracy: 62.50 + gsm8k_accuracy: 65.62 race-high_accuracy: 81.25 llama-3-8b-instruct-turbomind: gsm8k_accuracy: 71.88 @@ -226,25 +226,25 @@ base: race-high_accuracy: 25 winogrande_accuracy: 68.75 gemma2-2b-hf: - gsm8k_accuracy: 28.12 + gsm8k_accuracy: 31.25 GPQA_diamond_accuracy: 3.12 race-high_accuracy: 56.25 - winogrande_accuracy: 
71.88 + winogrande_accuracy: 75.00 gemma2-9b-hf: - gsm8k_accuracy: 68.75 + gsm8k_accuracy: 75.00 GPQA_diamond_accuracy: 0 - race-high_accuracy: 81.25 - winogrande_accuracy: 84.38 + race-high_accuracy: 84.38 + winogrande_accuracy: 81.25 gemma-2b-hf: - gsm8k_accuracy: 18.75 + gsm8k_accuracy: 21.88 GPQA_diamond_accuracy: 3.12 - race-high_accuracy: 25 + race-high_accuracy: 21.88 winogrande_accuracy: 53.12 gemma-7b-hf: gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 6.25 + GPQA_diamond_accuracy: 3.12 race-high_accuracy: 65.62 - winogrande_accuracy: 78.12 + winogrande_accuracy: 71.88 gemma-2b-vllm: gsm8k_accuracy: 15.62 GPQA_diamond_accuracy: 3.12 @@ -441,10 +441,10 @@ base: race-high_accuracy: 93.75 winogrande_accuracy: 87.5 deepseek-v2-turbomind: - gsm8k_accuracy: 71.88 - GPQA_diamond_accuracy: 3.12 - race-high_accuracy: 81.25 - winogrande_accuracy: 75 + gsm8k_accuracy: 65.62 + GPQA_diamond_accuracy: 15.62 + race-high_accuracy: 93.75 + winogrande_accuracy: 84.38 llama-3-70b-hf: gsm8k_accuracy: 62.5 GPQA_diamond_accuracy: 3.12 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 8aa1df16..3cdb3a73 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -44,7 +44,7 @@ on: type: string default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']" schedule: - - cron: '15 14 * * *' + - cron: '15 14 * * 0,2' env: HF_DATASETS_OFFLINE: 1 @@ -87,7 +87,7 @@ jobs: name: my-artifact-${{ github.run_id }} build-pypi-lmdeploy: - if: ${{!cancelled() && (github.event_name != 'schedule' && inputs.build_lmdeploy)}} + if: ${{!cancelled() && (github.event_name == 'schedule' || inputs.build_lmdeploy)}} strategy: matrix: pyver: [py310] @@ -127,7 +127,7 @@ jobs: needs: ['build-pypi', 'build-pypi-lmdeploy'] runs-on: volc_cu12 environment: 'prod' - timeout-minutes: 240 #4hours + timeout-minutes: 120 #2hours steps: - name: Clone repository uses: actions/checkout@v2 @@ -148,7 +148,7 @@ jobs: uses: nick-fields/retry@v3 with: max_attempts: 1 - timeout_minutes: 240 + timeout_minutes: 120 command: | . ${{env.CONDA_PATH}}/bin/activate conda create -y --name ${{env.CONDA_ENV}} python=3.10 @@ -211,7 +211,7 @@ jobs: uses: nick-fields/retry@v3 with: max_attempts: 1 - timeout_minutes: 120 + timeout_minutes: 180 command: | . ${{env.CONDA_PATH}}/bin/activate conda activate ${{env.CONDA_ENV}} @@ -230,7 +230,7 @@ jobs: regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}} runs-on: volc_cu12_local environment: 'prod' - timeout-minutes: 240 #4hours + timeout-minutes: 480 #6hours steps: - name: Clone repository uses: actions/checkout@v2 @@ -306,7 +306,7 @@ jobs: function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}} runs-on: volc_cu12 environment: 'prod' - timeout-minutes: 360 #6hours + timeout-minutes: 480 #6hours steps: - name: Clone repository uses: actions/checkout@v2 @@ -323,7 +323,7 @@ jobs: uses: nick-fields/retry@v3 with: max_attempts: 1 - timeout_minutes: 360 + timeout_minutes: 480 command: | . 
${{env.CONDA_PATH}}/bin/activate conda activate ${{env.CONDA_ENV}} From 73c80953c658f70c61951279f0c8d6d8bcd769f2 Mon Sep 17 00:00:00 2001 From: Junnan Liu Date: Wed, 26 Feb 2025 19:43:12 +0800 Subject: [PATCH 12/58] [Feature] Support Dataset Repeat and G-Pass Compute for Each Evaluator (#1886) * support dataset repeat and g-pass compute for each evaluator * fix pre-commit errors * delete print * delete gpassk_evaluator and fix potential errors * change `repeat` to `n` * fix `repeat` to `n` in openicl_eval * update doc for multi-run and g-pass * update latex equation in doc * update eng doc for multi-run and g-pass * update datasets.md * update datasets.md * fix multi-line equation * fix multi-line equation * fix multi-line equation * fix multi-line equation * fix multi-line equation * fix multi-line equation * fix multi-line equation in zh_cn user_guides * mmodify pre-commit-zh-cn * recover pre-commit and edit math expr in doc * del [TIP] * del cite tag in doc * del extract_model param in livemathbench config --- docs/en/user_guides/datasets.md | 40 +++++ docs/zh_cn/user_guides/datasets.md | 39 +++++ .../livemathbench/livemathbench_gen_9befbf.py | 10 +- .../livemathbench_greedy_gen_9befbf.py | 10 +- opencompass/datasets/base.py | 38 +++- .../datasets/livemathbench/livemathbench.py | 80 ++------- opencompass/openicl/icl_evaluator/__init__.py | 1 - .../icl_evaluator/icl_base_evaluator.py | 159 +++++++++++++++++ .../icl_evaluator/icl_gpassk_evaluator.py | 163 ------------------ opencompass/tasks/openicl_eval.py | 9 +- opencompass/utils/build.py | 1 - 11 files changed, 300 insertions(+), 250 deletions(-) delete mode 100644 opencompass/openicl/icl_evaluator/icl_gpassk_evaluator.py diff --git a/docs/en/user_guides/datasets.md b/docs/en/user_guides/datasets.md index 222b303a..fb11b394 100644 --- a/docs/en/user_guides/datasets.md +++ b/docs/en/user_guides/datasets.md @@ -81,3 +81,43 @@ datasets += cmnli_datasets Users can choose different abilities, different datasets and different evaluation methods configuration files to build the part of the dataset in the evaluation script according to their needs. For information on how to start an evaluation task and how to evaluate self-built datasets, please refer to the relevant documents. + +### Multiple Evaluations on the Dataset + +In the dataset configuration, you can set the parameter `n` to perform multiple evaluations on the same dataset and return the average metrics, for example: + +```python +afqmc_datasets = [ + dict( + abbr="afqmc-dev", + type=AFQMCDatasetV2, + path="./data/CLUE/AFQMC/dev.json", + n=10, # Perform 10 evaluations + reader_cfg=afqmc_reader_cfg, + infer_cfg=afqmc_infer_cfg, + eval_cfg=afqmc_eval_cfg, + ), +] + +``` + +Additionally, for binary evaluation metrics (such as accuracy, pass-rate, etc.), you can also set the parameter `k` in conjunction with `n` for [G-Pass@k](http://arxiv.org/abs/2412.13147) evaluation. The formula for G-Pass@k is: + +```{math} +\text{G-Pass@}k_\tau=E_{\text{Data}}\left[ \sum_{j=\lceil \tau \cdot k \rceil}^c \frac{{c \choose j} \cdot {n - c \choose k - j}}{{n \choose k}} \right], +``` + +where $n$ is the number of evaluations, and $c$ is the number of times that passed or were correct out of $n$ runs. An example configuration is as follows: + +```python +aime2024_datasets = [ + dict( + abbr='aime2024', + type=Aime2024Dataset, + path='opencompass/aime2024', + k=[2, 4], # Return results for G-Pass@2 and G-Pass@4 + n=12, # 12 evaluations + ... 
+ ) +] +``` diff --git a/docs/zh_cn/user_guides/datasets.md b/docs/zh_cn/user_guides/datasets.md index 4a0ee8b2..b1494d3c 100644 --- a/docs/zh_cn/user_guides/datasets.md +++ b/docs/zh_cn/user_guides/datasets.md @@ -81,3 +81,42 @@ datasets += cmnli_datasets 用户可以根据需要,选择不同能力不同数据集以及不同评测方式的配置文件来构建评测脚本中数据集的部分。 有关如何启动评测任务,以及如何评测自建数据集可以参考相关文档。 + +### 数据集多次评测 + +在数据集配置中可以通过设置参数`n`来对同一数据集进行多次评测,最终返回平均指标,例如: + +```python +afqmc_datasets = [ + dict( + abbr="afqmc-dev", + type=AFQMCDatasetV2, + path="./data/CLUE/AFQMC/dev.json", + n=10, # 进行10次评测 + reader_cfg=afqmc_reader_cfg, + infer_cfg=afqmc_infer_cfg, + eval_cfg=afqmc_eval_cfg, + ), +] +``` + +另外,对于二值评测指标(例如accuracy,pass-rate等),还可以通过设置参数`k`配合`n`进行[G-Pass@k](http://arxiv.org/abs/2412.13147)评测。G-Pass@k计算公式为: + +```{math} +\text{G-Pass@}k_\tau=E_{\text{Data}}\left[ \sum_{j=\lceil \tau \cdot k \rceil}^c \frac{{c \choose j} \cdot {n - c \choose k - j}}{{n \choose k}} \right], +``` + +其中 $n$ 为评测次数, $c$ 为 $n$ 次运行中通过或正确的次数。配置例子如下: + +```python +aime2024_datasets = [ + dict( + abbr='aime2024', + type=Aime2024Dataset, + path='opencompass/aime2024', + k=[2, 4], # 返回 G-Pass@2和G-Pass@4的结果 + n=12, # 12次评测 + ... + ) +] +``` diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py b/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py index 454e7d3e..aa2da4ad 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py +++ b/opencompass/configs/datasets/livemathbench/livemathbench_gen_9befbf.py @@ -9,7 +9,7 @@ livemathbench_dataset = dict( type=LiveMathBenchDataset, path='', k=16, - replication=3, + n=48, dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'], dataset_languages=['cn', 'en'], cot=True, @@ -38,13 +38,7 @@ livemathbench_dataset = dict( evaluator=dict( type=LiveMathBenchEvaluator, model_name='', - url=[], - use_extract_model=False, - extract_url=[], - extract_model_name='', - k=[4, 8, 16], - replication=3, - thresholds=[0.0, 0.25, 0.5, 0.75, 1.0] + url=[] ) ) ) diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py index d8d8b79c..c8f66615 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py +++ b/opencompass/configs/datasets/livemathbench/livemathbench_greedy_gen_9befbf.py @@ -9,7 +9,7 @@ livemathbench_dataset = dict( type=LiveMathBenchDataset, path='', k=1, - replication=1, + n=1, dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'], dataset_languages=['cn', 'en'], cot=True, @@ -38,13 +38,7 @@ livemathbench_dataset = dict( evaluator=dict( type=LiveMathBenchEvaluator, model_name='', - url=[], - use_extract_model=False, - extract_url=[], - extract_model_name='', - k=[1], - replication=1, - thresholds=[0.0] + url=[] ) ) ) diff --git a/opencompass/datasets/base.py b/opencompass/datasets/base.py index 5dc0f073..7099b0c6 100644 --- a/opencompass/datasets/base.py +++ b/opencompass/datasets/base.py @@ -1,4 +1,5 @@ -from typing import Dict, Optional, Union +from copy import deepcopy +from typing import Dict, List, Optional, Union from datasets import Dataset, DatasetDict @@ -7,8 +8,39 @@ from opencompass.openicl import DatasetReader class BaseDataset: - def __init__(self, reader_cfg: Optional[Dict] = {}, **kwargs): - self.dataset = self.load(**kwargs) + def __init__(self, + reader_cfg: Optional[Dict] = {}, + k: Union[int, List[int]] = 1, + n: int = 1, + **kwargs): + abbr = kwargs.pop('abbr', 'dataset') + dataset = self.load(**kwargs) + # maybe 
duplicate + assert (max(k) if isinstance(k, List) else + k) <= n, 'Maximum value of `k` must less than or equal to `n`' + if isinstance(dataset, Dataset): + examples = [] + for idx, example in enumerate(dataset): + if 'subdivision' not in example: + example['subdivision'] = abbr + if 'idx' not in example: + example['idx'] = idx + examples.append(example) + examples = sum([deepcopy(examples) for _ in range(n)], []) + self.dataset = Dataset.from_list(examples) + else: + self.dataset = DatasetDict() + for key in dataset: + examples = [] + for idx, example in enumerate(dataset[key]): + if 'subdivision' not in example: + example['subdivision'] = f'{abbr}_{key}' + if 'idx' not in example: + example['idx'] = idx + examples.append(example) + print(abbr, key, len(examples)) + examples = sum([deepcopy(examples) for _ in range(n)], []) + self.dataset[key] = Dataset.from_list(examples) self._init_reader(**reader_cfg) def _init_reader(self, **kwargs): diff --git a/opencompass/datasets/livemathbench/livemathbench.py b/opencompass/datasets/livemathbench/livemathbench.py index 13abf3aa..57ec7048 100644 --- a/opencompass/datasets/livemathbench/livemathbench.py +++ b/opencompass/datasets/livemathbench/livemathbench.py @@ -1,11 +1,9 @@ import os import warnings -from collections import OrderedDict from concurrent.futures import ThreadPoolExecutor, as_completed -from copy import deepcopy from functools import partial from itertools import product -from typing import Any, Callable, Dict, List, Union +from typing import Any, Callable, Dict, List import jsonlines import mmengine @@ -14,7 +12,7 @@ from datasets import Dataset, load_dataset from opencompass.datasets.math import MATHAgentEvaluator, math_postprocess_v2 from opencompass.models import OpenAISDK -from opencompass.openicl.icl_evaluator import GPassKEvaluator +from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.openicl.icl_inferencer.icl_base_inferencer import \ dump_results_dict from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET, MODELS @@ -31,8 +29,6 @@ class LiveMathBenchDataset(BaseDataset): @staticmethod def load(path: str, - k: Union[int, List[int]], - replication: int, dataset_splits: List[str] = [ 'CNMO', 'CCEE', @@ -104,17 +100,13 @@ class LiveMathBenchDataset(BaseDataset): ('' if 'options' not in example else ' '.join(example['options']))), }) - max_k = k if isinstance(k, int) else max(k) - for idx in range(max_k * replication): - duplicated_example = deepcopy(example) - duplicated_example.update({'replication_idx': idx}) - dataset.append(duplicated_example) + dataset.append(example) return Dataset.from_list(dataset) @ICL_EVALUATORS.register_module() -class LiveMathBenchEvaluator(GPassKEvaluator): +class LiveMathBenchEvaluator(BaseEvaluator): api_meta_template = dict(round=[ dict(role='HUMAN', api_role='HUMAN'), dict(role='BOT', api_role='BOT', generate=True), @@ -125,11 +117,8 @@ class LiveMathBenchEvaluator(GPassKEvaluator): url, use_extract_model=False, extract_url=[], - extract_model_name='', - k: Union[int, List[int]] = 16, - replication: int = 3, - thresholds: List[float] = [0.0, 0.25, 0.5, 0.75, 1.0]): - super().__init__(k, replication, thresholds) + extract_model_name=''): + super().__init__() if isinstance(url, str): url = [url] @@ -310,55 +299,18 @@ class LiveMathBenchEvaluator(GPassKEvaluator): def preprocess(self, predictions, references, test_set): return self.judge(predictions, references, test_set) - def group(self, predictions, labels, test_set): - example2replications = {} - for example, 
label, prediction in zip(test_set, labels, predictions): - example_abbr = f"{example['subdivision']}_{example['idx']}" - if example_abbr not in example2replications: - example2replications[example_abbr] = [] - example.update({'prediction': prediction, 'label': label}) - example2replications[example_abbr].append(example) - for _, replications in example2replications.items(): - assert len(replications) == self.n, print(len(replications), - self.n) - return example2replications + def score(self, predictions, references, test_set) -> Dict[str, Any]: + labels = self.preprocess(predictions, references, test_set) + results = {'accuracy': 100 * np.mean(labels), 'details': []} - def reduce(self, details) -> Dict[str, Any]: - """Aggregate the overall metrics. + for pred, ref, label in zip(predictions, references, labels): + results['details'].append({ + 'pred': pred, + 'ref': ref, + 'correct': label + }) - Return: - A dict contains overall metrics, like: - {'details': details for each example, 'G-Pass@16': xxx} - """ - g_passk_details = OrderedDict() - g_passk_details['details'] = details - - all_dataset = set([detail['subdivision'] for detail in details]) - - for k in self.k: - for subdivision in sorted(list(all_dataset)): - for threshold in self.thresholds: - g_passk_details[ - f'{subdivision}/G-Pass@{k}_{threshold}'] = \ - 100. * np.mean( - [ - detail[f'G-Pass@{k}_{threshold}'] - for detail in details - if detail['subdivision'] == subdivision - ]) - g_passk_details[f'{subdivision}/mG-Pass@{k}'] = 100. * np.mean( - [ - detail[f'mG-Pass@{k}'] for detail in details - if detail['subdivision'] == subdivision - ]) - - for threshold in self.thresholds: - g_passk_details[f'G-Pass@{k}_{threshold}'] = 100. * np.mean( - [detail[f'G-Pass@{k}_{threshold}'] for detail in details]) - g_passk_details[f'mG-Pass@{k}'] = 100. 
* np.mean( - [detail[f'mG-Pass@{k}'] for detail in details]) - - return g_passk_details + return results class LiveMathBenchOutputHandler: diff --git a/opencompass/openicl/icl_evaluator/__init__.py b/opencompass/openicl/icl_evaluator/__init__.py index 5103c00d..1fd1683b 100644 --- a/opencompass/openicl/icl_evaluator/__init__.py +++ b/opencompass/openicl/icl_evaluator/__init__.py @@ -4,7 +4,6 @@ from .icl_base_evaluator import BaseEvaluator # noqa from .icl_bpc_evaluator import BPCEvaluator # noqa from .icl_circular_evaluator import CircularEvaluator # noqa from .icl_em_evaluator import EMEvaluator # noqa -from .icl_gpassk_evaluator import GPassKEvaluator # noqa from .icl_hf_evaluator import * # noqa from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator # noqa from .icl_misc_evaluator import AverageInferencePPLEvaluator # noqa diff --git a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py index 0b07cfaa..0956f498 100644 --- a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py @@ -1,4 +1,39 @@ """Base Evaluator.""" +from collections import OrderedDict +from copy import deepcopy +from typing import Any, Dict, Iterable, List, Union + +import numpy as np +from datasets import Dataset +from scipy.stats import hypergeom + + +def compute_pass_at_k(n, c, k): + if n - c < k: + return 1.0 + return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) + + +def _compute_g_pass_at_k(n, c, k, m): + if m > min(c, k) or k > n or c < 0 or n <= 0 or m < 0: + return 0.0 + return hypergeom.sf(m - 1, n, c, k) + + +def compute_g_pass_at_k(n, c, k, t): + m = max(int(np.ceil(k * t)), 1) + return _compute_g_pass_at_k(n, c, k, m) + + +def compute_mg_pass_at_k(n, c, k): + l, r = int(np.ceil(k * 0.5)), k + + mg_pass_at_k = 0.0 + for i in range(l + 1, r + 1): + mg_pass_at_k += _compute_g_pass_at_k(n, c, k, i) + mg_pass_at_k = 2 * mg_pass_at_k / k + + return mg_pass_at_k class BaseEvaluator: @@ -6,6 +41,130 @@ class BaseEvaluator: def __init__(self) -> None: pass + @property + def output_dir(self): + # please see opencompass/opencompass/tasks/openicl_eval.py Line 197-200 + return self._out_dir + + def group(self, n: int, details: List[Dict[str, Any]], + test_set: Dataset) -> Dict[str, Any]: + example2replications = {} + for detail, example in zip(details, test_set): + example_abbr = f"{example['subdivision']}_{example['idx']}" + if example_abbr not in example2replications: + example2replications[example_abbr] = [] + example.update({'detail': detail}) + example2replications[example_abbr].append(example) + for _, replications in example2replications.items(): + assert len(replications) == n, print(len(replications), n) + return example2replications + + def reduce(self, details: List[Dict[str, Any]]) -> Dict[str, Any]: + g_passk_details = OrderedDict() + all_subdivisions = set( + [detail['example_abbr'].split('_')[0] for detail in details]) + all_metrics = list(details[0].keys()) + + for subdivision in sorted(list(all_subdivisions)): + for metric in all_metrics: + if metric in ['predictions', 'example_abbr']: + continue + g_passk_details[f'{subdivision}/{metric}'] = 100 * np.mean([ + detail[metric] for detail in details + if detail['example_abbr'].split('_')[0] == subdivision + ]) + + for metric in all_metrics: + if metric in ['predictions', 'example_abbr']: + continue + g_passk_details[metric] = 100. 
* np.mean( + [detail[metric] for detail in details]) + return g_passk_details + + def evaluate(self, k: Union[int, List[int]], n: int, + original_dataset: Dataset, **score_kwargs): + real_size = len(original_dataset) // n + all_details = [] + all_results = [] + for i in range(n): + + def select_fn(i, real_size, x): + if isinstance(x, Dataset): + return x.select(range(i * real_size, (i + 1) * real_size)) + elif isinstance(x, Iterable): + return x[i * real_size:(i + 1) * real_size] + else: + return x + + results = self.score( + **{ + key: select_fn(i, real_size, value) + for key, value in score_kwargs.items() + }) + details = results.pop('details', None) + if details is not None: + if isinstance(details, Dict): + details = list(details.values()) + all_details.extend(details) + all_results.append(results) + + eval_results = {} + for single_results in all_results: + for key in single_results: + if key not in eval_results: + eval_results[key] = [] + eval_results[key].append(single_results[key]) + for key in deepcopy(eval_results): + if isinstance(eval_results[key][0], float) or isinstance( + eval_results[key][0], int): + if n > 1: + eval_results[key + f' ({n} runs average)'] = np.mean( + eval_results[key]) + eval_results.pop(key) + else: + eval_results[key] = np.mean(eval_results[key]) + else: + eval_results[key] = eval_results[key][0] + + grouped_examples = self.group(n, all_details, original_dataset) + can_calculate = False + if len(all_details) != 0: + eval_details = [] + for example_abbr, examples in grouped_examples.items(): + detail = {'predictions': [], 'example_abbr': example_abbr} + + c = 0 + for example in examples: + detail['predictions'].append(example['detail']) + # only compute G-Pass@k when details have correct labels + if example['detail'].get('correct', None) is not None: + can_calculate = True + c += int(example['detail']['correct']) + elif example['detail'].get('is_correct', None) is not None: + can_calculate = True + c += int(example['detail']['is_correct']) + + if can_calculate and n > 1 and k > 1: + thresholds = [0.0, 0.25, 0.5, 0.75, 1.0] + for _k in ([k] if isinstance(k, int) else k): + for threshold in thresholds: + g_pass = compute_g_pass_at_k(n=n, + c=c, + k=_k, + t=threshold) + detail[f'G-Pass@{_k}_{threshold}'] = g_pass + detail[f'mG-Pass@{_k}'] = compute_mg_pass_at_k(n=n, + c=c, + k=_k) + + eval_details.append(detail) + + if can_calculate and n > 1 and k > 1: + eval_results.update(self.reduce(eval_details)) + eval_results['details'] = eval_details + + return eval_results + def score(self): raise NotImplementedError("Method hasn't been implemented yet") diff --git a/opencompass/openicl/icl_evaluator/icl_gpassk_evaluator.py b/opencompass/openicl/icl_evaluator/icl_gpassk_evaluator.py deleted file mode 100644 index 80a59073..00000000 --- a/opencompass/openicl/icl_evaluator/icl_gpassk_evaluator.py +++ /dev/null @@ -1,163 +0,0 @@ -from abc import abstractmethod -from typing import Any, Dict, List, Union - -import numpy as np -from scipy.stats import hypergeom - -from opencompass.registry import ICL_EVALUATORS - -from .icl_base_evaluator import BaseEvaluator - - -def compute_pass_at_k(n, c, k): - if n - c < k: - return 1.0 - return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) - - -def _compute_g_pass_at_k(n, c, k, m): - if m > min(c, k) or k > n or c < 0 or n <= 0 or m < 0: - return 0.0 - return hypergeom.sf(m - 1, n, c, k) - - -def compute_g_pass_at_k(n, c, k, t): - m = max(int(np.ceil(k * t)), 1) - return _compute_g_pass_at_k(n, c, k, m) - - -def 
compute_mg_pass_at_k(n, c, k): - l, r = int(np.ceil(k * 0.5)), k - - mg_pass_at_k = 0.0 - for i in range(l + 1, r + 1): - mg_pass_at_k += _compute_g_pass_at_k(n, c, k, i) - mg_pass_at_k = 2 * mg_pass_at_k / k - - return mg_pass_at_k - - -@ICL_EVALUATORS.register_module() -class GPassKEvaluator(BaseEvaluator): - """Evaluator for computing the G-Pass@k Metric. - - This evaluator performs the following steps: - 1. Invokes task-specific `preprocess` on predictions to - assign a consistency label to each prediction and its - corresponding reference. - 2. Calculates metrics for each input example based on - these labels. - 3. Aggregates the overall metrics through a task-specific - `postprocess`. - - Args: - k (int or list of int): Number of predictions to be - considered in G-Pass@k. It can be a single integer - (e.g., `k=16` computes G-Pass@16) or a list of - integers (e.g., `[4, 8, 16]` computes G-Pass@4, - G-Pass@8, and G-Pass@16). - - replication (int): Controls the number of generations - used to estimate G-Pass@k. The total number of - generations is determined by multiplying the - maximum of `k` with `replication`. This parameter - should be a single integer. - - thresholds (list of float): A list of floating-point - numbers that define the thresholds for the G-Pass@k - metric. - """ - - def __init__( - self, - k: Union[int, List[int]] = 16, - replication: int = 3, - thresholds: List[float] = [0.0, 0.25, 0.5, 0.75, 1.0]) -> None: - super().__init__() - - if isinstance(k, int): - k = [k] - - self.k = k - self.replication = replication - self.n = max(k) * replication - self.thresholds = thresholds - - @property - def output_dir(self): - # please see opencompass/opencompass/tasks/openicl_eval.py Line 197-200 - return self._out_dir - - @abstractmethod - def preprocess(self, predictions, references, test_set) -> None: - """Perform operations on predictions before computing metrics, for - example, do answer_extraction and model_judge in mathematical reasoning - task. - - Return: - labels: A list contains the label which indicates whether - prediction is consistency with reference at each position. - """ - raise NotImplementedError - - @abstractmethod - def group(self, predictions, labels, test_set) -> Dict[str, Any]: - """Group the predictions and references. - - Return: - A dict contains the grouped predictions and references. - """ - raise NotImplementedError - - @abstractmethod - def reduce(self, details) -> Dict[str, Any]: - """Aggregate the overall metrics. - - Return: - A dict contains overall metrics, like: - {'details': details for each example, 'G-Pass@16': xxx} - """ - raise NotImplementedError - - def score(self, predictions, references, test_set) -> Dict[str, Any]: - """Compute G-Pass@k metrics. 
- - Return: - A dict contains metrics for each dataset sample and - overall metrics reduced by `self.reduce`, like: - {'details': details for each example, 'G-Pass@16': xxx} - """ - labels = self.preprocess(predictions, references, test_set) - grouped_examples = self.group(predictions, labels, test_set) - - details = [] - total_pass_num, count = 0, 0 - for example_abbr, examples in grouped_examples.items(): - detail = { - k: v - for k, v in examples[0].items() - if k not in ['prediction', 'label'] - } - detail.update({ - 'predictions': [{ - 'prediction': example['prediction'], - 'label': example['label'] - } for example in examples], - }) - - current_example_labels = [e['label'] for e in examples] - c = int(np.sum(current_example_labels)) - - for k in self.k: - for threshold in self.thresholds: - detail[f'G-Pass@{k}_{threshold}'] = compute_g_pass_at_k( - n=self.n, c=c, k=k, t=threshold) - detail[f'mG-Pass@{k}'] = compute_mg_pass_at_k(n=self.n, - c=c, - k=k) - count += self.n - total_pass_num += c - - details.append(detail) - - return self.reduce(details) diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py index 65ea510f..7c769060 100644 --- a/opencompass/tasks/openicl_eval.py +++ b/opencompass/tasks/openicl_eval.py @@ -240,7 +240,10 @@ class OpenICLEvalTask(BaseTask): k: preds[k] for k in signature(icl_evaluator.score).parameters } - result = icl_evaluator.score(**preds) + k = self.dataset_cfg.get('k', 1) + n = self.dataset_cfg.get('n', 1) + result = icl_evaluator.evaluate(k, n, copy.deepcopy(test_set), + **preds) # Get model postprocess result model_details = None @@ -248,7 +251,9 @@ class OpenICLEvalTask(BaseTask): if 'model_postprocessor' in self.eval_cfg: model_preds = copy.deepcopy(preds) model_preds['predictions'] = model_pred_strs - model_result = icl_evaluator.score(**model_preds) + model_result = icl_evaluator.evaluate(k, n, + copy.deepcopy(test_set), + **model_preds) for key in model_result: if key == 'details': model_details = model_result[key] diff --git a/opencompass/utils/build.py b/opencompass/utils/build.py index 14a66683..f0973d7f 100644 --- a/opencompass/utils/build.py +++ b/opencompass/utils/build.py @@ -9,7 +9,6 @@ def build_dataset_from_cfg(dataset_cfg: ConfigDict): dataset_cfg = copy.deepcopy(dataset_cfg) dataset_cfg.pop('infer_cfg', None) dataset_cfg.pop('eval_cfg', None) - dataset_cfg.pop('abbr', None) return LOAD_DATASET.build(dataset_cfg) From 6a573f671b35fa1ac848ac7468a181172d19d0a5 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 3 Mar 2025 15:35:57 +0800 Subject: [PATCH 13/58] [Fix] Fix compatible issue --- opencompass/datasets/TheoremQA/main.py | 2 +- opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py | 6 +++++- requirements/runtime.txt | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/opencompass/datasets/TheoremQA/main.py b/opencompass/datasets/TheoremQA/main.py index 7e06792e..4c145896 100644 --- a/opencompass/datasets/TheoremQA/main.py +++ b/opencompass/datasets/TheoremQA/main.py @@ -65,7 +65,7 @@ class TheoremQAEvaluatorV3(BaseEvaluator): { # "question": question, # "solution": output, - "correct": groundtruth, + # "correct": groundtruth, "pred": answer, "is_correct": is_correct, } diff --git a/opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py b/opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py index 61fdbd23..b0b73188 100644 --- a/opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_plugin_evaluator.py @@ -2,8 +2,12 @@ import 
json +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS -class TEvalEvaluator: + +@ICL_EVALUATORS.register_module() +class TEvalEvaluator(BaseEvaluator): """This module contains the following evaluators for evaluating the capabilities of the various dimensions of the LLM. diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 348df85d..6bd5e9a9 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -37,7 +37,7 @@ rouge_score sacrebleu scikit_learn==1.5.0 seaborn -sentence_transformers==2.2.2 +sentence_transformers tabulate tiktoken timeout_decorator From f0809fe6f62bf66cc1c3ed666c6d796caab2881f Mon Sep 17 00:00:00 2001 From: Junnan Liu Date: Mon, 3 Mar 2025 18:17:15 +0800 Subject: [PATCH 14/58] [Update] Fix Hard Configs With General GPassK (#1906) * support dataset repeat and g-pass compute for each evaluator * fix pre-commit errors * delete print * delete gpassk_evaluator and fix potential errors * change `repeat` to `n` * fix `repeat` to `n` in openicl_eval * update doc for multi-run and g-pass * update latex equation in doc * update eng doc for multi-run and g-pass * update datasets.md * update datasets.md * fix multi-line equation * fix multi-line equation * fix multi-line equation * fix multi-line equation * fix multi-line equation * fix multi-line equation * fix multi-line equation in zh_cn user_guides * mmodify pre-commit-zh-cn * recover pre-commit and edit math expr in doc * del [TIP] * del cite tag in doc * del extract_model param in livemathbench config * fix livemathbench hard configs --- .../livemathbench/livemathbench_hard_gen_353ae7.py | 10 ++-------- .../livemathbench_hard_greedy_gen_353ae7.py | 10 ++-------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py b/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py index e932d3c3..6b2f9f5a 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py +++ b/opencompass/configs/datasets/livemathbench/livemathbench_hard_gen_353ae7.py @@ -9,7 +9,7 @@ livemathbench_dataset = dict( type=LiveMathBenchDataset, path='', k=16, - replication=3, + n=48, dataset_splits=['hard'], dataset_languages=['cn', 'en'], cot=True, @@ -37,13 +37,7 @@ livemathbench_dataset = dict( evaluator=dict( type=LiveMathBenchEvaluator, model_name='', - url=[], - use_extract_model=False, - extract_url=[], - extract_model_name='', - k=[4, 8, 16], - replication=3, - thresholds=[0.0, 0.25, 0.5, 0.75, 1.0] + url=[] ) ) ) diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py b/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py index 830e55af..f956f83e 100644 --- a/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py +++ b/opencompass/configs/datasets/livemathbench/livemathbench_hard_greedy_gen_353ae7.py @@ -9,7 +9,7 @@ livemathbench_dataset = dict( type=LiveMathBenchDataset, path='', k=1, - replication=1, + n=1, dataset_splits=['hard'], dataset_languages=['cn', 'en'], cot=True, @@ -37,13 +37,7 @@ livemathbench_dataset = dict( evaluator=dict( type=LiveMathBenchEvaluator, model_name='', - url=[], - use_extract_model=False, - extract_url=[], - extract_model_name='', - k=[1], - replication=1, - thresholds=[0.0] + url=[] ) ) ) From c84bc18ac1a533f7b4ae430f2497a134b028635c Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Mon, 3 Mar 2025 
18:56:11 +0800 Subject: [PATCH 15/58] [Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard (#1899) * [Update] Support OlympiadBench-Math/OmniMath/LiveMathBench-Hard with LLM Verify * Update * Update * Update DeepSeek-R1 example * Update DeepSeek-R1 example * Update DeepSeek-R1 example --- README.md | 1 + README_zh-CN.md | 1 + docs/en/index.rst | 1 + docs/en/user_guides/deepseek_r1.md | 192 ++++++++++++++++ docs/zh_cn/index.rst | 1 + docs/zh_cn/user_guides/deepseek_r1.md | 192 ++++++++++++++++ examples/eval_deepseek_r1.py | 212 ++++++++++++++++++ ...iadBenchMath_0shot_llmverify_gen_9c22f2.py | 105 +++++++++ .../OlympiadBench/OlympiadBench_categories.py | 11 + ...hbench_hard_custom_llmverify_gen_85d0ef.py | 96 ++++++++ ...rm800k_500_llmverify_repeat4_gen_97b203.py | 2 +- .../omni_math_llmverify_gen_ccf9c0.py | 89 ++++++++ .../summarizers/groups/OlympiadBench.py | 11 + opencompass/datasets/generic.py | 10 +- .../datasets/livemathbench/livemathbench.py | 25 ++- .../models/turbomind_with_tf_above_v4_33.py | 1 + 16 files changed, 936 insertions(+), 14 deletions(-) create mode 100644 docs/en/user_guides/deepseek_r1.md create mode 100644 docs/zh_cn/user_guides/deepseek_r1.md create mode 100644 examples/eval_deepseek_r1.py create mode 100644 opencompass/configs/datasets/OlympiadBench/OlympiadBenchMath_0shot_llmverify_gen_9c22f2.py create mode 100644 opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_llmverify_gen_85d0ef.py create mode 100644 opencompass/configs/datasets/omni_math/omni_math_llmverify_gen_ccf9c0.py diff --git a/README.md b/README.md index 93457e09..93c2a5fd 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through ## 🚀 What's New +- **\[2025.02.28\]** We have added a tutorial for `DeepSeek-R1` series model, please check [Evaluating Reasoning Model](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥 - **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥 - **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks. - **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](examples/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it. 
diff --git a/README_zh-CN.md b/README_zh-CN.md index 6d071ebc..55c2faf5 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -57,6 +57,7 @@ ## 🚀 最新进展 +- **\[2025.02.28\]** 我们为 `DeepSeek-R1` 系列模型添加了教程,请查看 [评估推理模型](docs/en/user_guides/deepseek_r1.md) 了解更多详情!🔥🔥🔥 - **\[2025.02.15\]** 我们新增了两个实用的评测工具:用于LLM作为评判器的`GenericLLMEvaluator`和用于数学推理评估的`MATHEvaluator`。查看[LLM评判器](docs/zh_cn/advanced_guides/llm_judge.md)和[数学能力评测](docs/zh_cn/advanced_guides/general_math.md)文档了解更多详情!🔥🔥🔥 - **\[2025.01.16\]** 我们现已支持 [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) 模型,该模型在推理、知识类任务上取得同量级最优性能,欢迎尝试。 - **\[2024.12.17\]** 我们提供了12月CompassAcademic学术榜单评估脚本 [CompassAcademic](configs/eval_academic_leaderboard_202412.py),你可以通过简单地配置复现官方评测结果。 diff --git a/docs/en/index.rst b/docs/en/index.rst index 8c64f210..5ae2e173 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -39,6 +39,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass. user_guides/evaluation.md user_guides/experimentation.md user_guides/metrics.md + user_guides/deepseek_r1.md .. _Prompt: .. toctree:: diff --git a/docs/en/user_guides/deepseek_r1.md b/docs/en/user_guides/deepseek_r1.md new file mode 100644 index 00000000..923ea05b --- /dev/null +++ b/docs/en/user_guides/deepseek_r1.md @@ -0,0 +1,192 @@ +# Tutorial for Evaluating Reasoning Models + +OpenCompass provides an evaluation tutorial for DeepSeek R1 series reasoning models (mathematical datasets). + +- At the model level, we recommend using the sampling approach to reduce repetitions caused by greedy decoding +- For datasets with limited samples, we employ multiple evaluation runs and take the average +- For answer validation, we utilize LLM-based verification to reduce misjudgments from rule-based evaluation + +## Installation and Preparation + +Please follow OpenCompass's installation guide. + +## Evaluation Configuration Setup + +We provide example configurations in `example/eval_deepseek_r1.py`. Below is the configuration explanation: + +### Configuration Interpretation + +#### 1. Dataset and Validator Configuration + +```python +# Configuration supporting multiple runs (example) +from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets + +datasets = sum( + (v for k, v in locals().items() if k.endswith('_datasets')), + [], +) + +# LLM validator configuration. Users need to deploy API services via LMDeploy/vLLM/SGLang or use OpenAI-compatible endpoints +verifier_cfg = dict( + abbr='qwen2-5-32B-Instruct', + type=OpenAISDK, + path='Qwen/Qwen2.5-32B-Instruct', # Replace with actual path + key='YOUR_API_KEY', # Use real API key + openai_api_base=['http://your-api-endpoint'], # Replace with API endpoint + query_per_second=16, + batch_size=1024, + temperature=0.001, + max_out_len=16384 +) + +# Apply validator to all datasets +for item in datasets: + if 'judge_cfg' in item['eval_cfg']['evaluator']: + item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg +``` + +#### 2. 
Model Configuration + +We provided an example of evaluation based on LMDeploy as the reasoning model backend, users can modify path (i.e., HF path) + +```python +# LMDeploy model configuration example +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-7b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', + engine_config=dict(session_len=32768, max_batch_size=128, tp=1), + gen_config=dict( + do_sample=True, + temperature=0.6, + top_p=0.95, + max_new_tokens=32768 + ), + max_seq_len=32768, + batch_size=64, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ), + # Extendable 14B/32B configurations... +] +``` + +#### 3. Evaluation Process Configuration + +```python +# Inference configuration +infer = dict( + partitioner=dict(type=NumWorkerPartitioner, num_worker=1), + runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)) + +# Evaluation configuration +eval = dict( + partitioner=dict(type=NaivePartitioner, n=8), + runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask))) +``` + +#### 4. Summary Configuration + +```python +# Multiple runs results average configuration +summary_groups = [ + { + 'name': 'AIME2024-Aveage8', + 'subsets':[[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)] + }, + # Other dataset average configurations... +] + +summarizer = dict( + dataset_abbrs=[ + ['AIME2024-Aveage8', 'naive_average'], + # Other dataset metrics... + ], + summary_groups=summary_groups +) + +# Work directory configuration +work_dir = "outputs/deepseek_r1_reasoning" +``` + +## Evaluation Execution + +### Scenario 1: Model loaded on 1 GPU, data evaluated by 1 worker, using a total of 1 GPU + +```bash +opencompass example/eval_deepseek_r1.py --debug --dump-eval-details +``` + +Evaluation logs will be output in the command line. + +### Scenario 2: Model loaded on 1 GPU, data evaluated by 8 workers, using a total of 8 GPUs + +You need to modify the `infer` configuration in the configuration file and set `num_worker` to 8 + +```python +# Inference configuration +infer = dict( + partitioner=dict(type=NumWorkerPartitioner, num_worker=1), + runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)) +``` + +At the same time, remove the `--debug` parameter from the evaluation command + +```bash +opencompass example/eval_deepseek_r1.py --dump-eval-details +``` + +In this mode, OpenCompass will use multithreading to start `$num_worker` tasks. Specific logs will not be displayed in the command line, instead, detailed evaluation logs will be shown under `$work_dir`. 
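Concretely, the `infer` block for this scenario, with the worker count raised to 8 as described, would look like the sketch below (imports as in `examples/eval_deepseek_r1.py`; everything except `num_worker` matches the configuration shown earlier):

```python
# Scenario 2: each model instance uses 1 GPU, 8 inference workers run in parallel
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)),
)
```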
+ +### Scenario 3: Model loaded on 2 GPUs, data evaluated by 4 workers, using a total of 8 GPUs + +Note that in the model configuration, `num_gpus` in `run_cfg` needs to be set to 2 (if using an inference backend, parameters such as `tp` in LMDeploy also need to be modified accordingly to 2), and at the same time, set `num_worker` in the `infer` configuration to 4 + +```python +models += [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-14b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', + engine_config=dict(session_len=32768, max_batch_size=128, tp=2), + gen_config=dict( + do_sample=True, + temperature=0.6, + top_p=0.95, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=128, + run_cfg=dict(num_gpus=2), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ), +] +``` + +```python +# Inference configuration +infer = dict( + partitioner=dict(type=NumWorkerPartitioner, num_worker=4), + runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)) +``` + +### Evaluation Results + +The evaluation results are displayed as follows: + +```bash +dataset version metric mode deepseek-r1-distill-qwen-7b-turbomind ---------------------------------- --------- ------------- ------ --------------------------------------- MATH - - - AIME2024-Aveage8 - naive_average gen 56.25 + +``` + +## Performance Baseline + +Since the model uses Sampling for decoding, and the AIME dataset size is small, there may still be a performance fluctuation of 1-3 points even when averaging over 8 evaluations. + +| Model | Dataset | Metric | Value | +| ---------------------------- | -------- | -------- | ----- | +| DeepSeek-R1-Distill-Qwen-7B | AIME2024 | Accuracy | 56.3 | +| DeepSeek-R1-Distill-Qwen-14B | AIME2024 | Accuracy | 74.2 | +| DeepSeek-R1-Distill-Qwen-32B | AIME2024 | Accuracy | 74.2 | diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 2a2628c9..c9b6e8d3 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -40,6 +40,7 @@ OpenCompass 上手路线 user_guides/evaluation.md user_guides/experimentation.md user_guides/metrics.md + user_guides/deepseek_r1.md .. _提示词: .. toctree:: diff --git a/docs/zh_cn/user_guides/deepseek_r1.md b/docs/zh_cn/user_guides/deepseek_r1.md new file mode 100644 index 00000000..b5bb1a17 --- /dev/null +++ b/docs/zh_cn/user_guides/deepseek_r1.md @@ -0,0 +1,192 @@ +# 强推理模型评测教程 + +OpenCompass提供针对DeepSeek R1系列推理模型的评测教程(数学数据集)。 + +- 在模型层面,我们建议使用Sampling方式,以减少因为Greedy评测带来的大量重复 +- 在数据集层面,我们对数据量较小的评测基准,使用多次评测并取平均的方式。 +- 在答案验证层面,为了减少基于规则评测带来的误判,我们统一使用基于LLM验证的方式进行评测。 + +## 安装和准备 + +请按OpenCompass安装教程进行安装。 + +## 构建评测配置 + +我们在 `example/eval_deepseek_r1.py` 中提供了示例配置,以下对评测配置进行解读 + +### 评测配置解读 + +#### 1. 数据集与验证器配置 + +```python +# 支持多运行次数的数据集配置(示例) +from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets + +datasets = sum( + (v for k, v in locals().items() if k.endswith('_datasets')), + [], +) + +# 设置LLM验证器, 用户需事先通过LMDeploy/vLLM/SGLang等工具启动API 评测服务器,或者直接使用兼容OpenAI标准接口的模型服务 +verifier_cfg = dict( + abbr='qwen2-5-32B-Instruct', + type=OpenAISDK, + path='Qwen/Qwen2.5-32B-Instruct', # 需替换实际路径 + key='YOUR_API_KEY', # 需替换真实API Key + openai_api_base=['http://your-api-endpoint'], # 需替换API地址 + query_per_second=16, + batch_size=1024, + temperature=0.001, + max_out_len=16384 +) + +# 应用验证器到所有数据集 +for item in datasets: + if 'judge_cfg' in item['eval_cfg']['evaluator']: + item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg +``` + +#### 2. 
模型配置 + +我们提供了基于LMDeploy作为推理后端的评测示例,用户可以通过修改path(即HF路径) + +```python +# LMDeploy模型配置示例 +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-7b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', + engine_config=dict(session_len=32768, max_batch_size=128, tp=1), + gen_config=dict( + do_sample=True, + temperature=0.6, + top_p=0.95, + max_new_tokens=32768 + ), + max_seq_len=32768, + batch_size=64, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ), + # 可扩展14B/32B配置... +] +``` + +#### 3. 评估流程配置 + +```python +# 推理配置 +infer = dict( + partitioner=dict(type=NumWorkerPartitioner, num_worker=1), + runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)) + +# 评估配置 +eval = dict( + partitioner=dict(type=NaivePartitioner, n=8), + runner=dict(type=LocalRunner, task=dict(type=OpenICLEvalTask))) +``` + +#### 4. 结果汇总配置 + +```python +# 多运行结果平均配置 +summary_groups = [ + { + 'name': 'AIME2024-Aveage8', + 'subsets':[[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)] + }, + # 其他数据集平均配置... +] + +summarizer = dict( + dataset_abbrs=[ + ['AIME2024-Aveage8', 'naive_average'], + # 其他数据集指标... + ], + summary_groups=summary_groups +) + +# 工作目录设置 +work_dir = "outputs/deepseek_r1_reasoning" +``` + +## 执行评测 + +### 场景1:模型1卡加载,数据1个worker评测,共使用1个GPU + +```bash +opencompass example/eval_deepseek_r1.py --debug --dump-eval-details +``` + +评测日志会在命令行输出。 + +### 场景2:模型1卡加载,数据8个worker评测,共使用8个GPU + +需要修改配置文件中的infer配置,将num_worker设置为8 + +```python +# 推理配置 +infer = dict( + partitioner=dict(type=NumWorkerPartitioner, num_worker=1), + runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)) +``` + +同时评测命令去掉`--debug`参数 + +```bash +opencompass example/eval_deepseek_r1.py --dump-eval-details +``` + +此模式下,OpenCompass将使用多线程启动`$num_worker`个任务,命令行不展示具体日志,具体的评测日志将会在`$work_dir`下中展示。 + +### 场景3:模型2卡加载,数据4个worker评测,共使用8个GPU + +需要注意模型配置中,`run_cfg`中的`num_gpus`需要设置为2(如使用推理后端,则推理后端的参数也需要同步修改,比如LMDeploy中的tp需要设置为2),同时修改`infer`配置中的`num_worker`为4 + +```python +models += [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-14b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', + engine_config=dict(session_len=32768, max_batch_size=128, tp=2), + gen_config=dict( + do_sample=True, + temperature=0.6, + top_p=0.95, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=128, + run_cfg=dict(num_gpus=2), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ), +] +``` + +```python +# 推理配置 +infer = dict( + partitioner=dict(type=NumWorkerPartitioner, num_worker=4), + runner=dict(type=LocalRunner, task=dict(type=OpenICLInferTask)) +``` + +### 评测结果 + +评测结果展示如下: + +```bash +dataset version metric mode deepseek-r1-distill-qwen-7b-turbomind ---------------------------------- --------- ------------- ------ --------------------------------------- MATH - - - AIME2024-Aveage8 - naive_average gen 56.25 + +``` + +## 性能基线参考 + +由于模型使用Sampling进行解码,同时AIME数据量较小,使用8次评测取平均情况下,仍会出现1-3分的性能抖动 + +| 模型 | 数据集 | 指标 | 数值 | +| ---------------------------- | -------- | -------- | ---- | +| DeepSeek-R1-Distill-Qwen-7B | AIME2024 | Accuracy | 56.3 | +| DeepSeek-R1-Distill-Qwen-14B | AIME2024 | Accuracy | 74.2 | +| DeepSeek-R1-Distill-Qwen-32B | AIME2024 | Accuracy | 74.2 | diff --git a/examples/eval_deepseek_r1.py b/examples/eval_deepseek_r1.py new file mode 100644 index 00000000..11d8e473 --- /dev/null +++ b/examples/eval_deepseek_r1.py @@ -0,0 +1,212 @@ +# Support AIME-2024 with Repeat8 +# Support MATH-500 +# 
Support OlympiadBench +# Support OmniMath +# Support LiveMathBench-202412-Hard + +import os.path as osp +from itertools import product +from opencompass.models import OpenAISDK +from mmengine.config import read_base +from opencompass.utils.text_postprocessors import extract_non_reasoning_content +from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner +from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask +from opencompass.runners import LocalRunner +from opencompass.models import ( + TurboMindModelwithChatTemplate, +) + +####################################################################### +# PART 1 Datasets List # +####################################################################### +with read_base(): + # You can comment out the datasets you don't want to evaluate + + # Datasets + # from opencompass.configs.datasets.math.math_prm800k_500_llmverify_gen_6ff468 import math_datasets # 1 Run + from opencompass.configs.datasets.aime2024.aime2024_llmverify_repeat8_gen_e8fcee import aime2024_datasets # 8 Run + # from opencompass.configs.datasets.OlympiadBench.OlympiadBench_0shot_llmverify_gen_be8b13 import olympiadbench_datasets + # from opencompass.configs.datasets.omni_math.omni_math_llmverify_gen_ccf9c0 import omnimath_datasets # 1 Run + # from opencompass.configs.datasets.livemathbench.livemathbench_hard_custom_llmverify_gen_85d0ef import livemathbench_datasets + + + # Summarizer + from opencompass.configs.summarizers.groups.OlympiadBench import OlympiadBenchMath_summary_groups + +datasets = sum( + (v for k, v in locals().items() if k.endswith('_datasets')), + [], +) + +# Set LLM Verifier used for each dataset + +verifier_cfg = dict( + abbr='qwen2-5-32B-Instruct', + type=OpenAISDK, + path='Qwen/Qwen2.5-32B-Instruct', # You need to set your own judge model path + key='sk-1234', # You need to set your own API key + openai_api_base=[ + 'http://172.30.56.1:4000/v1', # You need to set your own API base + ], + meta_template=dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), + ], + ), + query_per_second=16, + batch_size=1024, + temperature=0.001, + tokenizer_path='gpt-4o-2024-05-13', + verbose=True, + max_out_len=16384, + # max_seq_len=32768, + max_seq_len=49152, +) + +for item in datasets: + # item['infer_cfg']['inferencer']['max_out_len'] = 32768 # You can unset this line if you want to avoid length cutoff + if 'judge_cfg' in item['eval_cfg']['evaluator']: + item['eval_cfg']['evaluator']['judge_cfg'] = verifier_cfg + + +####################################################################### +# PART 2 Model List # +####################################################################### + +models = sum([v for k, v in locals().items() if k.endswith('_model')], []) + +models += [ + # You can comment out the models you don't want to evaluate + # All models use sampling mode + dict( + type=TurboMindModelwithChatTemplate, + abbr='deepseek-r1-distill-qwen-7b-turbomind', + path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', + engine_config=dict(session_len=32768, max_batch_size=128, tp=1), + gen_config=dict( + do_sample=True, + temperature=0.6, + top_p=0.95, + max_new_tokens=32768), + max_seq_len=32768, + max_out_len=32768, + batch_size=64, + run_cfg=dict(num_gpus=1), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ), + # dict( + # type=TurboMindModelwithChatTemplate, + # abbr='deepseek-r1-distill-qwen-14b-turbomind', + # path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B', + # 
engine_config=dict(session_len=32768, max_batch_size=128, tp=2), + # gen_config=dict( + # do_sample=True, + # temperature=0.6, + # top_p=0.95, + # max_new_tokens=32768), + # max_seq_len=32768, + # max_out_len=32768, + # batch_size=128, + # run_cfg=dict(num_gpus=2), + # pred_postprocessor=dict(type=extract_non_reasoning_content) + # ), + # dict( + # type=TurboMindModelwithChatTemplate, + # abbr='deepseek-r1-distill-qwen-32b-turbomind', + # path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B', + # engine_config=dict(session_len=32768, max_batch_size=128, tp=4), + # gen_config=dict( + # do_sample=True, + # temperature=0.6, + # top_p=0.95, + # max_new_tokens=16384), + # max_seq_len=32768, + # max_out_len=16384, + # batch_size=128, + # run_cfg=dict(num_gpus=4), + # pred_postprocessor=dict(type=extract_non_reasoning_content) + # ), +] + +####################################################################### +# PART 3 Inference/Evaluation # +####################################################################### + +# Inference configuration +infer = dict( + partitioner=dict( + type=NumWorkerPartitioner, + num_worker=1 + # Similar with data-parallelism, how many workers for evaluation, + # each worker will evaluate a part of the dataset. Total GPUs = num_worker * num_gpus_per_worker + # For example, If you have 8 GPUs, for 7B model using 1 GPU for one instance, you can set num_worker=8 + # to max-utilize the GPUs. + # If you have 8 GPUs, for 14B model using 2 GPUs for one instance, you can set num_worker=4 + ), + runner=dict( + type=LocalRunner, + task=dict(type=OpenICLInferTask) + ), +) + +# Evaluation configuration +eval = dict( + partitioner=dict( + type=NaivePartitioner, n=8 + ), + runner=dict( + type=LocalRunner, + task=dict( + type=OpenICLEvalTask) + ), +) + + +####################################################################### +# PART 4 Summarizer # +####################################################################### + + +summary_groups = sum( + [v for k, v in locals().items() if k.endswith('_summary_groups')], [] +) + +summary_groups.extend([ + { + 'name': 'AIME2024-Aveage8', + 'subsets':[[f'aime2024-run{idx}', 'accuracy'] for idx in range(8)] + }, + { + 'name': 'LiveMathBench-v202412-Hard-Aveage8', + 'subsets':[[ + f'livemathbench_hard_custom_{split}_run{run_idx}', 'accuracy'] + for split, run_idx in product(['hard_cn', 'hard_en'], range(8)) + ] + } +]) + +# Summarizer +summarizer = dict( + dataset_abbrs=[ + 'MATH', + # ['LiveMathBench-k1-n1', 'pass@1'], + # ['LiveMathBench-v202412-greedy', 'G-Pass@1_0.0'], + # ['aime2024', 'accuracy'], + ['math_prm800k_500-llmjudge', 'accuracy'], + ['AIME2024-Aveage8', 'naive_average'], + ['LiveMathBench-v202412-Hard-Aveage8', 'naive_average'], + ['OlympiadBenchMath', 'accuracy'], + ['OmniMath', 'accuracy'], + ], + summary_groups=summary_groups, +) + + +####################################################################### +# PART 5 Utils # +####################################################################### + +work_dir = 'outputs/deepseek_r1_reasoning' + + diff --git a/opencompass/configs/datasets/OlympiadBench/OlympiadBenchMath_0shot_llmverify_gen_9c22f2.py b/opencompass/configs/datasets/OlympiadBench/OlympiadBenchMath_0shot_llmverify_gen_9c22f2.py new file mode 100644 index 00000000..03881387 --- /dev/null +++ b/opencompass/configs/datasets/OlympiadBench/OlympiadBenchMath_0shot_llmverify_gen_9c22f2.py @@ -0,0 +1,105 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_retriever import ZeroRetriever +from 
opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import OlympiadBenchDataset, OlympiadBenchEvaluator, olympiadbench_postprocess_v2 +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +with read_base(): + from .OlympiadBench_categories import math_categories as categories + +# Create prompter instance for problems +olympiadbench_prompter_cfg = dict( + type='OlympiadBenchPrompter' +) + +olympiadbench_reader_cfg = dict( + input_columns=[ + 'problem', 'language', 'subject', 'question_type', + 'answer_type', 'is_multiple_answer', 'unit', 'questions' + ], + output_column='solution' +) + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + + +olympiadbenchMath_datasets = [] +for _name in categories: + olympiadbench_infer_cfg = dict( + prompt_template=dict( + type='OlympiadBenchTemplate' + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + # Evaluation configuration + olympiadbench_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=OlympiadBenchDataset, + path='opencompass/OlympiadBench', + name=_name, + reader_cfg=olympiadbench_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', + ) + + olympiadbenchMath_datasets.append( + dict( + type=OlympiadBenchDataset, + abbr=f'OlympiadBench_{_name}', + path='opencompass/OlympiadBench', + name=_name, + reader_cfg=olympiadbench_reader_cfg, + infer_cfg=olympiadbench_infer_cfg, + eval_cfg=olympiadbench_eval_cfg, + ) + ) + +del _name diff --git a/opencompass/configs/datasets/OlympiadBench/OlympiadBench_categories.py b/opencompass/configs/datasets/OlympiadBench/OlympiadBench_categories.py index 818e5293..fdd4a56f 100644 --- a/opencompass/configs/datasets/OlympiadBench/OlympiadBench_categories.py +++ b/opencompass/configs/datasets/OlympiadBench/OlympiadBench_categories.py @@ -5,3 +5,14 @@ categories = [ 'OE_TO_physics_en_COMP', # OpenEnded - TextOnly - physics - COMP 'OE_TO_physics_zh_CEE' # OpenEnded - TextOnly - physics - CEE ] + +math_categories = [ + 'OE_TO_maths_en_COMP', # OpenEnded - TextOnly - maths - COMP + 'OE_TO_maths_zh_COMP', # OpenEnded - TextOnly - maths - COMP + 'OE_TO_maths_zh_CEE', # OpenEnded - TextOnly - maths - CEE +] + +physics_categories = [ + 'OE_TO_physics_en_COMP', # OpenEnded - TextOnly - physics - COMP + 'OE_TO_physics_zh_CEE' # OpenEnded - TextOnly - physics - CEE +] diff --git a/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_llmverify_gen_85d0ef.py b/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_llmverify_gen_85d0ef.py new file mode 100644 index 00000000..1e085273 --- /dev/null +++ b/opencompass/configs/datasets/livemathbench/livemathbench_hard_custom_llmverify_gen_85d0ef.py @@ -0,0 +1,96 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import CustomDataset +from opencompass.datasets import generic_llmjudge_postprocess +from itertools import product + +livemathbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + + +# Inference configuration +livemathbench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{question}\n', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +# Template for the LLM judge +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, 
whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ : \n{question}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + + + +splits = ['hard_cn', 'hard_en'] +# Dataset configuration +livemathbench_datasets = [ + dict( + type=CustomDataset, + abbr=f'livemathbench_hard_custom_{split}_run{run_idx}', + path='data/LiveMathBench', + local_mode=True, + file_name=f'202412/{split}.jsonl', + reader_cfg=livemathbench_reader_cfg, + infer_cfg=livemathbench_infer_cfg, + eval_cfg=dict( + # # Evaluation configuration using LLM as judge + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=CustomDataset, + path='data/LiveMathBench', + local_mode=True, + file_name=f'202412/{split}.jsonl', + reader_cfg=livemathbench_reader_cfg, + ), + judge_cfg={}, + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + ), + ) for split, run_idx in product(splits, range(8)) +] diff --git a/opencompass/configs/datasets/math/math_prm800k_500_llmverify_repeat4_gen_97b203.py b/opencompass/configs/datasets/math/math_prm800k_500_llmverify_repeat4_gen_97b203.py index a7e373e9..1ac43b7c 100644 --- a/opencompass/configs/datasets/math/math_prm800k_500_llmverify_repeat4_gen_97b203.py +++ b/opencompass/configs/datasets/math/math_prm800k_500_llmverify_repeat4_gen_97b203.py @@ -88,7 +88,7 @@ math_eval_cfg = dict( math_datasets = [ dict( type=MATHDataset, - abbr=f'math_prm800k_500-llmjudge-run{idx}', + abbr=f'math_prm800k_500-llmverify-run{idx}', path='opencompass/math', file_name = 'test_prm800k_500.json', reader_cfg=math_reader_cfg, diff --git a/opencompass/configs/datasets/omni_math/omni_math_llmverify_gen_ccf9c0.py b/opencompass/configs/datasets/omni_math/omni_math_llmverify_gen_ccf9c0.py new file mode 100644 index 00000000..2a4320b8 --- /dev/null +++ b/opencompass/configs/datasets/omni_math/omni_math_llmverify_gen_ccf9c0.py @@ -0,0 +1,89 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets.omni_math import OmniMathDataset + + +omnimath_reader_cfg = dict( + input_columns=['problem'], + output_column='answer' +) + +omnimath_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='please answer the following mathematical question, put your final answer in \\boxed{}.\n\n{problem}'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. 
You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +omnimath_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=OmniMathDataset, + reader_cfg=omnimath_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), +) +omnimath_datasets = [ + dict( + type=OmniMathDataset, + abbr='OmniMath', + reader_cfg=omnimath_reader_cfg, + infer_cfg=omnimath_infer_cfg, + eval_cfg=omnimath_eval_cfg + ) +] \ No newline at end of file diff --git a/opencompass/configs/summarizers/groups/OlympiadBench.py b/opencompass/configs/summarizers/groups/OlympiadBench.py index 12fb5807..fc57f603 100644 --- a/opencompass/configs/summarizers/groups/OlympiadBench.py +++ b/opencompass/configs/summarizers/groups/OlympiadBench.py @@ -9,3 +9,14 @@ categories = [ OlympiadBench_summary_groups = [ {'name': 'OlympiadBench', 'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in categories]}, ] + +math_categories = [ + 'OE_TO_maths_en_COMP', # OpenEnded - TextOnly - maths - COMP + 'OE_TO_maths_zh_COMP', # OpenEnded - TextOnly - maths - COMP + 'OE_TO_maths_zh_CEE', # OpenEnded - TextOnly - maths - CEE +] + + +OlympiadBenchMath_summary_groups = [ + {'name': 'OlympiadBenchMath', 'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in math_categories]}, +] diff --git a/opencompass/datasets/generic.py b/opencompass/datasets/generic.py index 28a37a02..dc783167 100644 --- a/opencompass/datasets/generic.py +++ b/opencompass/datasets/generic.py 
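All of the LLM-verify configurations above share the same contract: the judge model is instructed to reply with a bare letter, `A` for a correct candidate answer and `B` for an incorrect one, and `generic_llmjudge_postprocess` aggregates those verdicts into an accuracy score. The sketch below only illustrates that expected judge output; the helper name and normalisation are hypothetical and are not the actual implementation in `opencompass/datasets/generic.py`.

```python
# Hypothetical sketch of the A/B verdict convention used by the grader
# templates above ("A" = CORRECT, "B" = INCORRECT); the real postprocessor
# may normalise judge replies differently.
def parse_judge_verdict(judge_reply: str) -> bool:
    reply = judge_reply.strip().upper()
    if reply.startswith('A'):
        return True   # candidate answer judged consistent with the gold answer
    if reply.startswith('B'):
        return False  # candidate answer judged inconsistent
    raise ValueError(f'Unusable judge reply: {judge_reply!r}')
```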
@@ -1,5 +1,7 @@ import re +from opencompass.utils import get_logger + def get_final_results(judged_answers, references, @@ -68,7 +70,13 @@ def generic_llmjudge_postprocess( processed_judge = _generic_llmjudge_postprocess(v['prediction']) if processed_judge is not None: judged_answers.append(processed_judge) - references.append(v['gold']) + try: + references.append(v['gold']) + + except KeyError: + get_logger().warning( + f'No gold answer for {k}, use empty string as reference!') + references.append('') results = get_final_results(judged_answers, references, origial_responses) results['details'] = output return results diff --git a/opencompass/datasets/livemathbench/livemathbench.py b/opencompass/datasets/livemathbench/livemathbench.py index 57ec7048..2c6fdbe5 100644 --- a/opencompass/datasets/livemathbench/livemathbench.py +++ b/opencompass/datasets/livemathbench/livemathbench.py @@ -41,9 +41,8 @@ class LiveMathBenchDataset(BaseDataset): dataset = [] dataset_info = {} - if path != '': - path = get_data_path(path) - path = os.path.join(path, version) + # Use dataset mapping to generate path + data_dir = get_data_path(path) for split, language in product(dataset_splits, dataset_languages): dataset_info[f'{split}_{language}'] = { @@ -59,8 +58,17 @@ class LiveMathBenchDataset(BaseDataset): '问答': 'problem-solving' } - if path != '': - file_path = os.path.join(path, f'{split}_{language}.jsonl') + examples = [] + if data_dir.startswith('opencompass/'): + # Using HF Dataset + hf_dataset = load_dataset( + data_dir, f'v{version}_{split}_{language}')['test'] + for example in hf_dataset: + examples.append(example) + else: + file_path = os.path.join(data_dir, version, + f'{split}_{language}.jsonl') + if not os.path.exists(file_path): raise FileNotFoundError( f'File {file_path} does not exist, please check the ' @@ -69,13 +77,6 @@ class LiveMathBenchDataset(BaseDataset): with jsonlines.open(file_path, 'r') as file: for example in file: examples.append(example) - else: - hf_dataset = load_dataset( - 'opencompass/LiveMathBench', - f'v{version}_{split}_{language}')['test'] - examples = [] - for example in hf_dataset: - examples.append(example) for example_idx, example in enumerate(examples): dataset_info[f'{split}_{language}'][ diff --git a/opencompass/models/turbomind_with_tf_above_v4_33.py b/opencompass/models/turbomind_with_tf_above_v4_33.py index 7138974d..cbf14263 100644 --- a/opencompass/models/turbomind_with_tf_above_v4_33.py +++ b/opencompass/models/turbomind_with_tf_above_v4_33.py @@ -130,6 +130,7 @@ class TurboMindModelwithChatTemplate(BaseModel): if self.fastchat_template: messages = _format_with_fast_chat_template(messages, self.fastchat_template) else: + # NOTE: DeepSeek-R1 series model's chat template will add after the messages = [self.tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages] # LMDeploy tokenize prompts by AutoTokenizer with its default parameter "add_special_token=True" # OC add bos_token in the prompt, which requires tokenizing prompts using "add_speicial_token=False" From 198c08632eb69db98d1d5aadcb899f3e3e8867da Mon Sep 17 00:00:00 2001 From: liushz Date: Tue, 4 Mar 2025 16:42:37 +0800 Subject: [PATCH 16/58] [Feature] Add HLE (Humanity's Last Exam) dataset (#1902) * Support OlympiadBench Benchmark * Support OlympiadBench Benchmark * Support OlympiadBench Benchmark * update dataset path * Update olmpiadBench * Update olmpiadBench * Update olmpiadBench * Add HLE dataset * Add HLE dataset * Add HLE dataset --------- Co-authored-by: 
sudanl --- dataset-index.yml | 5 + opencompass/configs/datasets/HLE/hle_gen.py | 5 + .../datasets/HLE/hle_llmverify_gen_6ff468.py | 91 +++++++++++++++++++ opencompass/datasets/__init__.py | 1 + opencompass/datasets/hle.py | 17 ++++ 5 files changed, 119 insertions(+) create mode 100644 opencompass/configs/datasets/HLE/hle_gen.py create mode 100644 opencompass/configs/datasets/HLE/hle_llmverify_gen_6ff468.py create mode 100644 opencompass/datasets/hle.py diff --git a/dataset-index.yml b/dataset-index.yml index 9fbde8bd..b8ec7041 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -399,6 +399,11 @@ category: Math paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf configpath: opencompass/configs/datasets/gsm_hard +- hle: + name: HLE(Humanity's Last Exam) + category: Reasoning + paper: https://lastexam.ai/paper + configpath: opencompass/configs/datasets/HLE - hellaswag: name: HellaSwag category: Reasoning diff --git a/opencompass/configs/datasets/HLE/hle_gen.py b/opencompass/configs/datasets/HLE/hle_gen.py new file mode 100644 index 00000000..598f1dde --- /dev/null +++ b/opencompass/configs/datasets/HLE/hle_gen.py @@ -0,0 +1,5 @@ +from mmengine.config import read_base + +with read_base(): + # Default use LLM as a judge + from .hle_llmverify_gen_6ff468 import hle_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/HLE/hle_llmverify_gen_6ff468.py b/opencompass/configs/datasets/HLE/hle_llmverify_gen_6ff468.py new file mode 100644 index 00000000..bb6f40bf --- /dev/null +++ b/opencompass/configs/datasets/HLE/hle_llmverify_gen_6ff468.py @@ -0,0 +1,91 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import HLEDataset + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='answer') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. 
As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=HLEDataset, + path='cais/hle', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + + +hle_datasets = [ + dict( + type=HLEDataset, + abbr='hle_llmjudge', + path='cais/hle', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index b28f78ed..4052c630 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -57,6 +57,7 @@ from .gpqa import * # noqa: F401, F403 from .gsm8k import * # noqa: F401, F403 from .gsm_hard import * # noqa: F401, F403 from .hellaswag import * # noqa: F401, F403 +from .hle import * # noqa: F401, F403 from .huggingface import * # noqa: F401, F403 from .humaneval import * # noqa: F401, F403 from .humaneval_multi import * # noqa: F401, F403 diff --git a/opencompass/datasets/hle.py b/opencompass/datasets/hle.py new file mode 100644 index 00000000..2d7cf74b --- /dev/null +++ b/opencompass/datasets/hle.py @@ -0,0 +1,17 @@ +from datasets import load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class HLEDataset(BaseDataset): + + @staticmethod + def load(path: str): + dataset = load_dataset(path) + dataset['test'] = dataset['test'].filter(lambda x: x['image'] == '') + dataset['test'] = dataset['test'].rename_column('question', 'problem') + dataset['train'] = dataset['test'] + return dataset From 5547fd15924798214bddb343b3b9d565407e9d5d Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Tue, 4 Mar 2025 18:26:14 +0800 Subject: [PATCH 17/58] [Bump] Bump version to 0.4.1 --- opencompass/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencompass/__init__.py 
b/opencompass/__init__.py index abeeedbf..f0ede3d3 100644 --- a/opencompass/__init__.py +++ b/opencompass/__init__.py @@ -1 +1 @@ -__version__ = '0.4.0' +__version__ = '0.4.1' From fff2d51440010e7e50c6b1de59ac0e6b200e8916 Mon Sep 17 00:00:00 2001 From: Dongsheng Zhu <59612926+Zhudongsheng75@users.noreply.github.com> Date: Tue, 4 Mar 2025 18:49:38 +0800 Subject: [PATCH 18/58] [Update] Code evaluation alignment (#1909) * code alignment * update oss md5 * bigcodebench update * lint * lint_ * lint yapf --- .../bigcodebench_full_complete_gen_faf748.py | 58 +++--- .../bigcodebench_full_instruct_gen_8815eb.py | 58 +++--- .../bigcodebench_hard_complete_gen_faf748.py | 42 ++-- .../bigcodebench_hard_instruct_gen_8815eb.py | 42 ++-- .../livecodebench_time_split_gen.py | 132 ++++++++++++ .../datasets/bigcodebench/bigcodebench.py | 14 +- .../datasets/livecodebench/evaluator.py | 46 +++- .../datasets/livecodebench/livecodebench.py | 17 +- opencompass/utils/datasets_info.py | 196 ++++++++++++------ 9 files changed, 405 insertions(+), 200 deletions(-) create mode 100644 opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_complete_gen_faf748.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_complete_gen_faf748.py index f2edc098..6ae8a218 100644 --- a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_complete_gen_faf748.py +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_complete_gen_faf748.py @@ -1,53 +1,43 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import ( - BigCodeBenchDataset, - BigCodeBenchEvaluator -) - +from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator) bigcodebench_full_reader_cfg = dict( - input_columns=['complete_prompt'], - output_column='test', + input_columns=['complete_prompt'], + output_column='test', ) - -bigcodebench_full_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin=[dict(role='system', - fallback_role='HUMAN', - prompt='')], - round=[ - dict(role='HUMAN', prompt='{complete_prompt}'), - ] - ) - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=1024) -) +bigcodebench_full_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[dict(role='system', fallback_role='HUMAN', prompt='')], + round=[ + dict(role='HUMAN', prompt='{complete_prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, + max_out_len=1024)) bigcodebench_full_eval_cfg = dict( evaluator=dict( type=BigCodeBenchEvaluator, release_version='v0.1.2', eval_type='complete', - remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + remote_execute_api= + 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501 dataset_version='full', ), pred_role='BOT', ) bigcodebench_full_complete_datasets = [ - dict( - abbr='bigcodebench_full_complete', - type=BigCodeBenchDataset, - path='opencompass/bigcodebench', - reader_cfg=bigcodebench_full_reader_cfg, - infer_cfg=bigcodebench_full_infer_cfg, - eval_cfg=bigcodebench_full_eval_cfg, - release_version='v0.1.2' - ) -] \ No newline at end of file + dict(abbr='bigcodebench_full_complete', + 
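# NOTE: scoring for this entry is delegated to the remote BigCodeBench
# evaluator service set in `bigcodebench_full_eval_cfg` above; the upstream
# `bigcode-bigcodebench-evaluator` Space is kept there as a commented-out
# alternative to the OpenCompass-hosted endpoint.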
type=BigCodeBenchDataset, + path='opencompass/bigcodebench', + reader_cfg=bigcodebench_full_reader_cfg, + infer_cfg=bigcodebench_full_infer_cfg, + eval_cfg=bigcodebench_full_eval_cfg, + release_version='v0.1.2') +] diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_gen_8815eb.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_gen_8815eb.py index 88b410ae..eed4d04d 100644 --- a/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_gen_8815eb.py +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_full_instruct_gen_8815eb.py @@ -1,53 +1,43 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import ( - BigCodeBenchDataset, - BigCodeBenchEvaluator -) - +from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator) bigcodebench_full_reader_cfg = dict( - input_columns=['instruct_prompt'], - output_column='test', + input_columns=['instruct_prompt'], + output_column='test', ) - -bigcodebench_full_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin=[dict(role='system', - fallback_role='HUMAN', - prompt='')], - round=[ - dict(role='HUMAN', prompt='{instruct_prompt}'), - ] - ) - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=8192) -) +bigcodebench_full_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[dict(role='system', fallback_role='HUMAN', prompt='')], + round=[ + dict(role='HUMAN', prompt='{instruct_prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, + max_out_len=8192)) bigcodebench_full_eval_cfg = dict( evaluator=dict( type=BigCodeBenchEvaluator, release_version='v0.1.2', eval_type='instruct', - remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + remote_execute_api= + 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501 dataset_version='full', ), pred_role='BOT', ) bigcodebench_full_instruct_datasets = [ - dict( - abbr='bigcodebench_full_instruct', - type=BigCodeBenchDataset, - path='opencompass/bigcodebench', - reader_cfg=bigcodebench_full_reader_cfg, - infer_cfg=bigcodebench_full_infer_cfg, - eval_cfg=bigcodebench_full_eval_cfg, - release_version='v0.1.2' - ) -] \ No newline at end of file + dict(abbr='bigcodebench_full_instruct', + type=BigCodeBenchDataset, + path='opencompass/bigcodebench', + reader_cfg=bigcodebench_full_reader_cfg, + infer_cfg=bigcodebench_full_infer_cfg, + eval_cfg=bigcodebench_full_eval_cfg, + release_version='v0.1.2') +] diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_faf748.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_faf748.py index c0419774..c411f411 100644 --- a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_faf748.py +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_faf748.py @@ -1,40 +1,32 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import ( - BigCodeBenchDataset, - BigCodeBenchEvaluator -) - +from opencompass.datasets import 
(BigCodeBenchDataset, BigCodeBenchEvaluator) bigcodebench_hard_reader_cfg = dict( - input_columns=['complete_prompt'], - output_column='test', + input_columns=['complete_prompt'], + output_column='test', ) - -bigcodebench_hard_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin=[dict(role='system', - fallback_role='HUMAN', - prompt='')], - round=[ - dict(role='HUMAN', prompt='{complete_prompt}'), - ] - ) - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=1024) -) +bigcodebench_hard_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[dict(role='system', fallback_role='HUMAN', prompt='')], + round=[ + dict(role='HUMAN', prompt='{complete_prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, + max_out_len=1024)) bigcodebench_hard_eval_cfg = dict( evaluator=dict( type=BigCodeBenchEvaluator, release_version='v0.1.2', eval_type='complete', - remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + remote_execute_api= + 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501 dataset_version='hard', ), pred_role='BOT', @@ -51,4 +43,4 @@ bigcodebench_hard_complete_datasets = [ release_version='v0.1.2', dataset_version='hard', ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_8815eb.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_8815eb.py index 3d1cc82c..7187041e 100644 --- a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_8815eb.py +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_8815eb.py @@ -1,40 +1,32 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import ( - BigCodeBenchDataset, - BigCodeBenchEvaluator -) - +from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator) bigcodebench_hard_reader_cfg = dict( - input_columns=['instruct_prompt'], - output_column='test', + input_columns=['instruct_prompt'], + output_column='test', ) - -bigcodebench_hard_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - begin=[dict(role='system', - fallback_role='HUMAN', - prompt='')], - round=[ - dict(role='HUMAN', prompt='{instruct_prompt}'), - ] - ) - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=8192) -) +bigcodebench_hard_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[dict(role='system', fallback_role='HUMAN', prompt='')], + round=[ + dict(role='HUMAN', prompt='{instruct_prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, + max_out_len=8192)) bigcodebench_hard_eval_cfg = dict( evaluator=dict( type=BigCodeBenchEvaluator, release_version='v0.1.2', eval_type='instruct', - remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + remote_execute_api= + 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501 dataset_version='hard', ), pred_role='BOT', @@ -51,4 +43,4 @@ bigcodebench_hard_instruct_datasets = [ release_version='v0.1.2', 
dataset_version='hard', ) -] \ No newline at end of file +] diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py b/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py new file mode 100644 index 00000000..89bd9eb1 --- /dev/null +++ b/opencompass/configs/datasets/livecodebench/livecodebench_time_split_gen.py @@ -0,0 +1,132 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import (LCBCodeGenerationDataset, + LCBCodeExecutionDataset, + LCBTestOutputPredictionDataset, + LCBCodeGenerationEvaluator, + LCBCodeExecutionEvaluator, + LCBTestOutputEvaluator) + +lcb_code_generation_reader_cfg = dict( + input_columns=[ + 'question_content', + 'format_prompt', + ], + # output_column='evaluation_sample', + output_column='question_id', +) + +SYSTEM_MESSAGE_GENERIC = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' # noqa: E501 + +prompt_template = '### Question:\n{question_content}\n\n{format_prompt}' + \ + '### Answer: (use the provided format with backticks)\n\n' + +# Code Generation Tasks +lcb_code_generation_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict(round=[dict(role='HUMAN', prompt=prompt_template)])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +lcb_code_generation_eval_cfg = dict( + evaluator=dict(type=LCBCodeGenerationEvaluator, + num_process_evaluate=4, + timeout=6, + release_version='release_v5', + start_date='2024-08-01', + end_date='2025-02-01'), + pred_role='BOT', +) + +LCBCodeGeneration_dataset = dict( + type=LCBCodeGenerationDataset, + abbr='lcb_code_generation', + path='opencompass/code_generation_lite', + reader_cfg=lcb_code_generation_reader_cfg, + infer_cfg=lcb_code_generation_infer_cfg, + eval_cfg=lcb_code_generation_eval_cfg, + release_version='release_v5', +) + +# Code Execution Dataset +lcb_code_execution_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +lcb_code_execution_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt= + 'You are an expert at Python programming, code execution, test case generation, and fuzzing.' # noqa: E501 + ), + ], + round=[dict(role='HUMAN', prompt='{prompt}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +lcb_code_execution_eval_cfg = dict( + evaluator=dict(type=LCBCodeExecutionEvaluator, ), + pred_role='BOT', +) + +LCBCodeExecution_dataset = dict( + type=LCBCodeExecutionDataset, + abbr='lcb_code_execution', + path='opencompass/execution-v2', + reader_cfg=lcb_code_execution_reader_cfg, + infer_cfg=lcb_code_execution_infer_cfg, + eval_cfg=lcb_code_execution_eval_cfg, +) + +# TestOuputput Dataset +lcb_test_output_reader_cfg = dict( + input_columns=[ + 'prompt', + ], + output_column='evaluation_sample', +) + +system_prompt = 'You are an expert Python programmer. You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. You will NOT return anything except for the program.' 
# noqa: E501 + +lcb_test_output_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + # begin=[ + # dict( + # role='SYSTEM', + # prompt=system_prompt + # ), + # ], + round=[dict(role='HUMAN', prompt='{prompt}')])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +lcb_test_output_eval_cfg = dict( + evaluator=dict(type=LCBTestOutputEvaluator, ), + pred_role='BOT', +) + +LCBTestOutput_dataset = dict( + type=LCBTestOutputPredictionDataset, + abbr='lcb_test_output', + path='opencompass/test_generation', + reader_cfg=lcb_test_output_reader_cfg, + infer_cfg=lcb_test_output_infer_cfg, + eval_cfg=lcb_test_output_eval_cfg, +) + +LCB_datasets = [ + LCBCodeGeneration_dataset, + LCBCodeExecution_dataset, + LCBTestOutput_dataset, +] diff --git a/opencompass/datasets/bigcodebench/bigcodebench.py b/opencompass/datasets/bigcodebench/bigcodebench.py index f347e9e2..9ce3d196 100644 --- a/opencompass/datasets/bigcodebench/bigcodebench.py +++ b/opencompass/datasets/bigcodebench/bigcodebench.py @@ -197,11 +197,21 @@ class BigCodeBenchEvaluator(BaseEvaluator): break except (httpx.ReadTimeout, CancelledError): logger.info('Read timeout error. Retrying in 4s...') - time.sleep(4) + time.sleep(10) if 'pass@1' in pass_at_k.keys(): pass_at_k['pass@1'] *= 100 - dump_results = {'details': results} + dump_results = {'details': self._results_processor(results)} dump_results.update(pass_at_k) return dump_results + + def _results_processor(self, results): + details = [] + for key, value in results['eval'].items(): + if value[0]['status'] == 'pass': + value[0]['correct'] = True + else: + value[0]['correct'] = False + details.append(value[0]) + return details diff --git a/opencompass/datasets/livecodebench/evaluator.py b/opencompass/datasets/livecodebench/evaluator.py index e9fb70d7..65867d47 100644 --- a/opencompass/datasets/livecodebench/evaluator.py +++ b/opencompass/datasets/livecodebench/evaluator.py @@ -146,9 +146,12 @@ def evaluate_generations( with ProcessPoolExecutor( max_workers=1 if debug else num_process_evaluate) as executor: futures = { - executor.submit(evaluate_generations_by_problem, - problem_generations, sample, debug, timeout): - index + executor.submit( + evaluate_generations_by_problem, # noqa: E501 + problem_generations, + sample, + debug, + timeout): index for (problem_generations, sample, debug, timeout), index in inputs } @@ -233,15 +236,27 @@ class LCBCodeGenerationEvaluator(BaseEvaluator): num_process_evaluate, timeout=6, release_version='release_v1', - extractor_version='v1'): + extractor_version='v1', + start_date=None, + end_date=None): super().__init__() self.num_process_evaluate = num_process_evaluate self.timeout = timeout self.dataset = LCBCodeGenerationDataset.load( - release_version=release_version)['test'] + release_version=release_version, + start_date=start_date, + end_date=end_date)['test'] self.extractor_version = extractor_version def score(self, predictions, references): + if len(predictions) != len(references): + return { + 'error': + 'predictions and references have different ' + f'length. 
len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' + } + if self.extractor_version == 'v1': predictions = [[extract_code_generation(item)] for item in predictions] @@ -254,19 +269,28 @@ class LCBCodeGenerationEvaluator(BaseEvaluator): evaluation_samples[self.dataset[idx][ 'question_id']] = self.dataset[idx]['evaluation_sample'] - references = [evaluation_samples[item] for item in references] + filtered_predictions = [] + filtered_references = [] + for idx, item in enumerate(references): + if item in self.dataset['question_id']: + filtered_predictions.append(predictions[idx]) + filtered_references.append(item) - references = [{'input_output': item} for item in references] + filtered_references = [ + evaluation_samples[item] for item in filtered_references + ] # noqa: E501 - BaseEvaluator.is_num_equal(predictions, references) + filtered_references = [{ + 'input_output': item + } for item in filtered_references] # noqa: E501 extracted_predictions = {} - for idx, content in enumerate(predictions): + for idx, content in enumerate(filtered_predictions): extracted_predictions[idx] = content metrics, eval_results, final_metadata = codegen_metrics( - references, - predictions, + filtered_references, + filtered_predictions, k_list=[1], num_process_evaluate=self.num_process_evaluate, timeout=self.timeout, diff --git a/opencompass/datasets/livecodebench/livecodebench.py b/opencompass/datasets/livecodebench/livecodebench.py index dbd76d71..9ad3f84c 100644 --- a/opencompass/datasets/livecodebench/livecodebench.py +++ b/opencompass/datasets/livecodebench/livecodebench.py @@ -6,6 +6,7 @@ import json import pickle import zlib from dataclasses import dataclass +from datetime import datetime from enum import Enum from datasets import DatasetDict, load_dataset, load_from_disk @@ -53,7 +54,9 @@ class LCBCodeGenerationDataset(BaseDataset): @staticmethod def load(path: str = 'opencompass/code_generation_lite', local_mode: bool = False, - release_version: str = 'release_v1'): + release_version: str = 'release_v1', + start_date: str = None, + end_date: str = None): def transform(item): # Define the dataitem mapping logic @@ -61,7 +64,7 @@ class LCBCodeGenerationDataset(BaseDataset): # starter_code if item['starter_code']: format_prompt = f'### Format: {CodeGenerationPromptConstants.FORMATTING_MESSAGE_WITH_STARTER_CODE}\n' # noqa: E501 - format_prompt += f"```python\n{item['starter_code']}\n```\n\n" + format_prompt += f"```python\n{item['starter_code']}\n```\n\n" # noqa: Q000, E501 else: format_prompt = f'### Format: {CodeGenerationPromptConstants.FORMATTING_WITHOUT_STARTER_CODE}\n' # noqa: E501 format_prompt += '```python\n# YOUR CODE HERE\n```\n\n' @@ -107,6 +110,16 @@ class LCBCodeGenerationDataset(BaseDataset): dataset = dataset.map(transform) + if start_date is not None: + p_start_date = datetime.strptime(start_date, '%Y-%m-%d') + dataset = dataset.filter( + lambda e: p_start_date <= datetime.fromisoformat(e[ + 'contest_date'])) # noqa: E501 + if end_date is not None: + p_end_date = datetime.strptime(end_date, '%Y-%m-%d') + dataset = dataset.filter(lambda e: datetime.fromisoformat(e[ + 'contest_date']) <= p_end_date) # noqa: E501 + return DatasetDict({'test': dataset, 'train': dataset}) diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 050d5983..79be5736 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -376,7 +376,7 @@ DATASETS_MAPPING = { "opencompass/LiveReasonBench": { "ms_id": "", "hf_id": "", - 
"local": "./data/LiveReasonBench/", + "local": "./data/LiveReasonBench/", }, "opencompass/bigcodebench": { "ms_id": "", @@ -407,251 +407,313 @@ DATASETS_MAPPING = { DATASETS_URL = { "/OlympiadBench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/OlympiadBench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/OlympiadBench.zip", "md5": "97e8b1ae7f6170d94817288a8930ef00", }, - "/longbenchv2":{ - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/longbenchv2.zip", + "/longbenchv2": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/longbenchv2.zip", "md5": "09b7e06e6f98c5cca8ad597b3d7b42f0", }, "/livestembench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/livestembench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/livestembench.zip", "md5": "0ff59d031c3dcff56a2e00e8c1489f5d", }, "/musr": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/musr.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/musr.zip", "md5": "7447d2a5bec4586035196102135e2af9", }, "/mmlu/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu.zip", "md5": "761310671509a239e41c4b717f7fab9c", }, "/mmmlu_lite": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmmlu_lite.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmmlu_lite.zip", "md5": "a776af1220e1826fd0608eda1bc4425e", }, "/simpleqa": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/simpleqa.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/simpleqa.zip", "md5": "1d83fc2e15798d39cb265c9a3cb5195a", }, "/chinese_simpleqa": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/chinese_simpleqa.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/chinese_simpleqa.zip", "md5": "4bdf854b291fc0ee29da57dc47ac47b5", }, "/gpqa/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gpqa.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gpqa.zip", "md5": "2e9657959030a765916f1f2aca29140d", }, "/CHARM/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/CHARM.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/CHARM.zip", "md5": "fdf51e955d1b8e0bb35bc1997eaf37cb", }, "/ifeval/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ifeval.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ifeval.zip", "md5": "64d98b6f36b42e7390c9cef76cace75f", }, "/mbpp/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mbpp.zip", "md5": "777739c90f04bce44096a5bc96c8f9e5", }, "/cmmlu/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmmlu.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmmlu.zip", "md5": "a59f4003d6918509a719ce3bc2a5d5bc", }, "/math/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/math.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/math.zip", "md5": "cb5b4c8378085929e20345174e731fdf", }, "/hellaswag/": { - "url": 
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/hellaswag.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/hellaswag.zip", "md5": "2b700a02ffb58571c7df8d8d0619256f", }, "/BBH/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/BBH.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/BBH.zip", "md5": "60c49f9bef5148aa7e1941328e96a554", }, "/compass_arena/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/compass_arena.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/compass_arena.zip", "md5": "cd59b54a179d16f2a858b359b60588f6", }, "/TheoremQA/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/TheoremQA.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/TheoremQA.zip", "md5": "f2793b07bc26510d507aa710d9bd8622", }, "/mathbench_v1/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mathbench_v1.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mathbench_v1.zip", "md5": "50257a910ca43d1f61a610a79fdb16b5", }, "/gsm8k/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gsm8k.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/gsm8k.zip", "md5": "901e5dc93a2889789a469da9850cdca8", }, "/LCBench2023/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/LCBench2023.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/LCBench2023.zip", "md5": "e1a38c94a42ad1809e9e0650476a9306", }, "/humaneval/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humaneval.zip", "md5": "88b1b89dc47b7121c81da6bcd85a69c3", }, "/humanevalx": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humanevalx.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/humanevalx.zip", "md5": "22930355c03fb73fb5bae14b50f1deb9", }, "/ds1000_data": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ds1000_data.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ds1000_data.zip", "md5": "1a4990aec04a2fd73ccfad12e2d43b43", }, "/drop_simple_eval/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/drop_simple_eval.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/drop_simple_eval.zip", "md5": "c912afe5b4a63509851cf16e6b91830e", }, "subjective/alignment_bench/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/alignment_bench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/alignment_bench.zip", "md5": "d8ae9a0398526479dbbcdb80fafabceb", }, "subjective/alpaca_eval": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/alpaca_eval.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/alpaca_eval.zip", "md5": "d7399d63cb46c82f089447160ef49b6a", }, "subjective/arena_hard": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arena_hard.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arena_hard.zip", "md5": "02cd09a482cb0f0cd9d2c2afe7a1697f", }, "subjective/mtbench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench.zip", + "url": + 
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench.zip", "md5": "d1afc0787aeac7f1f24872742e161069", }, "subjective/fofo": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/fofo.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/fofo.zip", "md5": "8a302712e425e27e4292a9369df5b9d3", }, "subjective/followbench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/followbench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/followbench.zip", "md5": "da7a831817c969da15d1e78d4a245d8a", }, "subjective/mtbench101": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench101.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mtbench101.zip", "md5": "5d80257bc9929ebe5cfbf6d11184b04c", }, "subjective/WildBench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/wildbench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/wildbench.zip", "md5": "b06252857f1f8f44a17b1bfca4888ff4", }, "/ruler/": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ruler.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ruler.zip", "md5": "c60bdfff3d02358067104cc1dea7c0f7", }, "/scicode": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/scicode.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/scicode.zip", "md5": "9c6c64b8c70edc418f713419ea39989c", }, "/commonsenseqa": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/commonsenseqa.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/commonsenseqa.zip", "md5": "c4a82fc07c81ae1462605f5d7fd2bb2e", }, "FewCLUE": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/FewCLUE.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/FewCLUE.zip", "md5": "7976e2bb0e9d885ffd3c55f7c5d4021e", }, "/race": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/race.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/race.zip", "md5": "b758251764a264746cf45749c02363f9", }, "/ARC": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ARC.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/ARC.zip", "md5": "d720629b69f1a51cfe78bf65b00b44f6", }, "/SuperGLUE": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SuperGLUE.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SuperGLUE.zip", "md5": "b60904915b0b61d1a04ea52280169936", }, "SQuAD2.0": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SQuAD2.0.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/SQuAD2.0.zip", "md5": "1321cbf9349e1102a57d31d1b2bfdd7e", }, "mmlu_pro": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu_pro.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu_pro.zip", "md5": "e3200c7380f4cea5f13c768f2815fabb", }, "/Longbench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/Longbench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/Longbench.zip", "md5": "ab0cb9e520ae5cfb899bf38b564249bb", }, "/needlebench": { - "url": 
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/needlebench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/needlebench.zip", "md5": "dad5c903ebfea16eaf186b8997aeedad", }, "/teval": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/teval.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/teval.zip", "md5": "7628ab5891a26bf96ca17becfd044867", }, "/code_generation_lite": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/code_generation_lite.zip", - "md5": "60103a18ca63b05ea06e98d24170f23d", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/code_generation_lite.zip", + "md5": "ebcf8db56f5c817ca8202a542be30cb4", }, "/execution-v2": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/execution-v2.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/execution-v2.zip", "md5": "019ef1a0686ee6ca34f51c8af104fcd9", }, "/test_generation": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/test_generation.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/test_generation.zip", "md5": "918a6ea2b1eee6f2b1314db3c21cb4c7", }, "/aime": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime.zip", "md5": "fbe2d0577fc210962a549f8cea1a00c8", }, "/cmo": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmo.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmo.zip", "md5": "fad52c81290506a8ca74f46b5400d8fc", - }, + }, "/nq-open": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nq-open.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/nq-open.zip", "md5": "a340521e5c9ec591227dcb367f718b25", }, "/winogrande": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/winogrande.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/winogrande.zip", "md5": "9e949a75eacc26ed4fd2b9aa870b495b", }, "/triviaqa": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/triviaqa.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/triviaqa.zip", "md5": "e6a118d744236814926b2ec7ec66c034", }, "/GAOKAO-BENCH": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/GAOKAO-BENCH.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/GAOKAO-BENCH.zip", "md5": "ba3c71b8b9db96d2a0664b977c4f9784", }, "/WikiBench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/WikiBench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/WikiBench.zip", "md5": "6dac1d1a3133fe1effff185cbf71d928", }, "/babilong": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/babilong.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/babilong.zip", "md5": "e400864c31bc58d29eaa3e199751f99b", }, "/korbench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip", "md5": "9107597d137e7362eaf7d218ddef7a6d", }, "subjective/judgerbench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip", + "url": + 
"http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip", "md5": "60d605883aa8cac9755819140ab42c6b" }, "/arc_prize_public_evaluation": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arc_prize_public_evaluation.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/arc_prize_public_evaluation.zip", "md5": "367a33977651496efddba7670009807e" }, "P-MMEval": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/pmmeval.zip", + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/pmmeval.zip", "md5": "09e401e6229a50647b9e13c429e634d1", }, "LiveMathBench": { - 'url': "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/LiveMathBench.zip", + 'url': + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/LiveMathBench.zip", "md5": "d0781f9185c9bb50e81e6e3ca8c59013", }, "bigcodebench": { - "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bigcodebench.zip", - "md5": "2c1c7956ca49a1124617e8c037ec57d8" + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bigcodebench.zip", + "md5": "270f399f4142b74f47ecff116cc3b21d" } } From 54324657f097efbeb0d208c148f5ecf66579320a Mon Sep 17 00:00:00 2001 From: Myhs_phz Date: Wed, 5 Mar 2025 18:23:54 +0800 Subject: [PATCH 19/58] [Docs] Results persistance (#1908) * feat persistance.md * doc * doc * lint * doc * fix * doc --- docs/en/advanced_guides/persistence.md | 65 +++++++++++++++++++++++ docs/en/index.rst | 1 + docs/zh_cn/advanced_guides/persistence.md | 65 +++++++++++++++++++++++ docs/zh_cn/index.rst | 1 + 4 files changed, 132 insertions(+) create mode 100644 docs/en/advanced_guides/persistence.md create mode 100644 docs/zh_cn/advanced_guides/persistence.md diff --git a/docs/en/advanced_guides/persistence.md b/docs/en/advanced_guides/persistence.md new file mode 100644 index 00000000..e3a6bcd1 --- /dev/null +++ b/docs/en/advanced_guides/persistence.md @@ -0,0 +1,65 @@ +# Evaluation Results Persistence + +## Introduction + +Normally, the evaluation results of OpenCompass will be saved to your work directory. But in some cases, there may be a need for data sharing among users or quickly browsing existing public evaluation results. Therefore, we provide an interface that can quickly transfer evaluation results to external public data stations, and on this basis, provide functions such as uploading, overwriting, and reading. + +## Quick Start + +### Uploading + +By adding `args` to the evaluation command or adding configuration in the Eval script, the results of evaluation can be stored in the path you specify. Here are the examples: + +(Approach 1) Add an `args` option to the command and specify your public path address. + +```bash +opencompass ... -sp '/your_path' +``` + +(Approach 2) Add configuration in the Eval script. + +```pythonE +station_path = '/your_path' +``` + +### Overwriting + +The above storage method will first determine whether the same task result already exists in the data station based on the `abbr` attribute in the model and dataset configuration before uploading data. If results already exists, cancel this storage. If you need to update these results, please add the `station-overwrite` option to the command, here is an example: + +```bash +opencompass ... -sp '/your_path' --station-overwrite +``` + +### Reading + +You can directly read existing results from the data station to avoid duplicate evaluation tasks. 
The read results will directly participate in the 'summarize' step. When using this configuration, only tasks that do not store results in the data station will be initiated. Here is an example: + +```bash +opencompass ... -sp '/your_path' --read-from-station +``` + +### Command Combination + +1. Only upload the results under your latest working directory to the data station, without supplementing tasks that missing results: + +```bash +opencompass ... -sp '/your_path' -r latest -m viz +``` + +## Storage Format of the Data Station + +In the data station, the evaluation results are stored as `json` files for each `model-dataset` pair. The specific directory form is `/your_path/dataset_name/model_name.json `. Each `json` file stores a dictionary corresponding to the results, including `predictions`, `results`, and `cfg`, here is an example: + +```pythonE +Result = { + 'predictions': List[Dict], + 'results': Dict, + 'cfg': Dict = { + 'models': Dict, + 'datasets': Dict, + (Only subjective datasets)'judge_models': Dict + } +} +``` + +Among this three keys, `predictions` records the predictions of the model on each item of data in the dataset. `results` records the total score of the model on the dataset. `cfg` records detailed configurations of the model and the dataset in this evaluation task. diff --git a/docs/en/index.rst b/docs/en/index.rst index 5ae2e173..09c35c5b 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -67,6 +67,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass. advanced_guides/code_eval.md advanced_guides/code_eval_service.md advanced_guides/subjective_evaluation.md + advanced_guides/persistence.md .. _Tools: .. toctree:: diff --git a/docs/zh_cn/advanced_guides/persistence.md b/docs/zh_cn/advanced_guides/persistence.md new file mode 100644 index 00000000..85fef0a3 --- /dev/null +++ b/docs/zh_cn/advanced_guides/persistence.md @@ -0,0 +1,65 @@ +# 评测结果持久化 + +## 介绍 + +通常情况下,OpenCompass的评测结果将会保存到工作目录下。 但在某些情况下,可能会产生用户间的数据共享,以及快速查看已有的公共评测结果等需求。 因此,我们提供了一个能够将评测结果快速转存到外部公共数据站的接口,并且在此基础上提供了对数据站的上传、更新、读取等功能。 + +## 快速开始 + +### 向数据站存储数据 + +通过在CLI评测指令中添加`args`或在Eval脚本中添加配置,即可将本次评测结果存储到您所指定的路径,示例如下: + +(方式1)在指令中添加`args`选项并指定你的公共路径地址。 + +```bash +opencompass ... -sp '/your_path' +``` + +(方式2)在Eval脚本中添加配置。 + +```pythonE +station_path = '/your_path' +``` + +### 向数据站更新数据 + +上述存储方法在上传数据前会首先根据模型和数据集配置中的`abbr`属性来判断数据站中是否已有相同任务结果。若已有结果,则取消本次存储。如果您需要更新这部分结果,请在指令中添加`station-overwrite`选项,示例如下: + +```bash +opencompass ... -sp '/your_path' --station-overwrite +``` + +### 读取数据站中已有的结果 + +您可以直接从数据站中读取已有的结果,以避免重复进行评测任务。读取到的结果会直接参与到`summarize`步骤。采用该配置时,仅有数据站中未存储结果的任务会被启动。示例如下: + +```bash +opencompass ... -sp '/your_path' --read-from-station +``` + +### 指令组合 + +1. 仅向数据站上传最新工作目录下结果,不补充运行缺失结果的任务: + +```bash +opencompass ... 
-sp '/your_path' -r latest -m viz +``` + +## 数据站存储格式 + +在数据站中,评测结果按照每个`model-dataset`对的结果存储为`json`文件。具体的目录组织形式为`/your_path/dataset_name/model_name.json`。每个`json`文件都存储了对应结果的字典,包括`predictions`、`results`以及`cfg`三个子项,具体示例如下: + +```pythonE +Result = { + 'predictions': List[Dict], + 'results': Dict, + 'cfg': Dict = { + 'models': Dict, + 'datasets': Dict, + (Only subjective datasets)'judge_models': Dict + } +} +``` + +其中,`predictions`记录了模型对数据集中每一条数据的prediction的结果,`results`记录了模型在该数据集上的评分,`cfg`记录了该评测任务中模型和数据集的详细配置。 diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index c9b6e8d3..f3ce6b74 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -67,6 +67,7 @@ OpenCompass 上手路线 advanced_guides/code_eval.md advanced_guides/code_eval_service.md advanced_guides/subjective_evaluation.md + advanced_guides/persistence.md .. _工具: .. toctree:: From 1585c0adbed8e1361e28b70f78faf6dc24a8e614 Mon Sep 17 00:00:00 2001 From: Myhs_phz Date: Wed, 5 Mar 2025 18:33:34 +0800 Subject: [PATCH 20/58] [Feature] Evaluation Results Persistence (#1894) * feat results_station.py * lint * feat save_to_station * feat result_station.py and lint * feat * fix * fix and lint * fix * fix subjective processing * fix * fix * style function name * lint --- opencompass/cli/main.py | 35 ++- opencompass/partitioners/base.py | 20 +- opencompass/utils/__init__.py | 1 + opencompass/utils/result_station.py | 417 ++++++++++++++++++++++++++++ 4 files changed, 471 insertions(+), 2 deletions(-) create mode 100644 opencompass/utils/result_station.py diff --git a/opencompass/cli/main.py b/opencompass/cli/main.py index 21308e10..4eaa5b9b 100644 --- a/opencompass/cli/main.py +++ b/opencompass/cli/main.py @@ -12,7 +12,8 @@ from mmengine.config import Config, DictAction from opencompass.registry import PARTITIONERS, RUNNERS, build_from_cfg from opencompass.runners import SlurmRunner from opencompass.summarizers import DefaultSummarizer -from opencompass.utils import LarkReporter, get_logger +from opencompass.utils import (LarkReporter, get_logger, read_from_station, + save_to_station) from opencompass.utils.run import (fill_eval_cfg, fill_infer_cfg, get_config_from_arg) @@ -127,6 +128,27 @@ def parse_args(): 'correctness of each sample, bpb, etc.', action='store_true', ) + + parser.add_argument('-sp', + '--station-path', + help='Path to your results station.', + type=str, + default=None, + ) + + parser.add_argument('--station-overwrite', + help='Whether to overwrite the results at station.', + action='store_true', + ) + + parser.add_argument( + '--read-from-station', + help='Whether to save the evaluation results to the ' + 'data station.', + action='store_true', + ) + + # set srun args slurm_parser = parser.add_argument_group('slurm_args') parse_slurm_args(slurm_parser) @@ -260,6 +282,12 @@ def main(): # types cannot be serialized cfg = Config.fromfile(output_config_path, format_python_code=False) + # get existed results from station + if args.read_from_station: + existing_results_list = read_from_station(cfg, args) + rs_exist_results = [comb['combination'] for comb in existing_results_list] + cfg['rs_exist_results'] = rs_exist_results + # report to lark bot if specify --lark if not args.lark: cfg['lark_bot_url'] = None @@ -267,6 +295,7 @@ def main(): content = f'{getpass.getuser()}\'s task has been launched!' 
LarkReporter(cfg['lark_bot_url']).post(content) + # infer if args.mode in ['all', 'infer']: # When user have specified --slurm or --dlc, or have not set # "infer" in config, we will provide a default configuration @@ -348,6 +377,10 @@ def main(): else: runner(tasks) + # save to station + if args.station_path is not None or cfg.get('station_path') is not None: + save_to_station(cfg, args) + # visualize if args.mode in ['all', 'eval', 'viz']: summarizer_cfg = cfg.get('summarizer', {}) diff --git a/opencompass/partitioners/base.py b/opencompass/partitioners/base.py index 07baa8e4..17cedfc7 100644 --- a/opencompass/partitioners/base.py +++ b/opencompass/partitioners/base.py @@ -102,6 +102,7 @@ class BasePartitioner: return tasks def parse_model_dataset_args(self, cfg: ConfigDict): + models = cfg['models'] datasets = cfg['datasets'] @@ -109,7 +110,24 @@ class BasePartitioner: if 'model_dataset_combinations' in sig.parameters: combs = cfg.get('model_dataset_combinations', None) if combs is None: - combs = [{'models': models, 'datasets': datasets}] + if 'rs_exist_results' in cfg.keys(): + rs_exist_results = cfg['rs_exist_results'] + combs = [] + for model in models: + comb = {'models': [model], 'datasets': datasets} + combs.append(comb) + for i in range(len(combs)): + combs[i]['datasets'] = [ + dataset for dataset in combs[i]['datasets'] if [ + model_abbr_from_cfg(combs[i]['models'][0]), + dataset_abbr_from_cfg(dataset) + ] not in rs_exist_results + ] + combs = [ + comb for comb in combs if len(comb['datasets']) != 0 + ] + else: + combs = [{'models': models, 'datasets': datasets}] else: # sanity check model_abbrs = [model_abbr_from_cfg(model) for model in models] diff --git a/opencompass/utils/__init__.py b/opencompass/utils/__init__.py index 2e528663..ba4c80c3 100644 --- a/opencompass/utils/__init__.py +++ b/opencompass/utils/__init__.py @@ -14,4 +14,5 @@ from .model_postprocessors import * # noqa from .network import * # noqa from .postprocessors import * # noqa from .prompt import * # noqa +from .result_station import * # noqa from .text_postprocessors import * # noqa diff --git a/opencompass/utils/result_station.py b/opencompass/utils/result_station.py new file mode 100644 index 00000000..7ad6b9eb --- /dev/null +++ b/opencompass/utils/result_station.py @@ -0,0 +1,417 @@ +import json +import os +import os.path as osp +import re + +from opencompass.utils.abbr import (dataset_abbr_from_cfg, + deal_with_judge_model_abbr, + model_abbr_from_cfg) + + +def save_to_station(cfg, args): + + if args.station_path is not None: + station_path = args.station_path + else: + station_path = cfg.get('station_path') + + work_dict = cfg['work_dir'] + + # objective dataset processing + if 'judge_models' not in cfg.keys(): + model_list = [model_abbr_from_cfg(model) for model in cfg['models']] + dataset_list = [ + dataset_abbr_from_cfg(dataset) for dataset in cfg['datasets'] + ] + + rs_exist_results = [] + if 'rs_exist_results' in cfg.keys(): + rs_exist_results = cfg['rs_exist_results'] + + for dataset in dataset_list: + result_path = osp.join(station_path, dataset) + if not osp.exists(result_path): + os.makedirs(result_path) + + for model in model_list: + if ([model, dataset] in rs_exist_results + and not args.station_overwrite): + continue + result_file_name = model + '.json' + if osp.exists(osp.join( + result_path, + result_file_name)) and not args.station_overwrite: + print('result of {} with {} already exists'.format( + dataset, model)) + continue + else: + # get result dict + local_result_path = 
osp.join(work_dict, 'results', model) + local_result_json = osp.join(local_result_path, + dataset + '.json') + if not osp.exists(local_result_json): + if args.mode == 'viz': + continue + raise ValueError( + 'invalid file: {}'.format(local_result_json)) + with open(local_result_json, 'r') as f: + this_result = json.load(f) + f.close() + + # get prediction list + local_prediction_path = osp.join(work_dict, 'predictions', + model) + local_prediction_regex = \ + rf'^{re.escape(dataset)}(?:_\d+)?\.json$' + local_prediction_json = find_files_by_regex( + local_prediction_path, local_prediction_regex) + if not check_filenames( + dataset, + local_prediction_json) and args.mode != 'viz': + raise ValueError('invalid filelist: {}'.format( + local_prediction_json)) + + this_prediction = [] + for prediction_json in local_prediction_json: + with open( + osp.join(local_prediction_path, + prediction_json), 'r') as f: + this_prediction_load_json = json.load(f) + f.close() + for prekey in this_prediction_load_json.keys(): + this_prediction.append( + this_prediction_load_json[prekey]) + + # get config dict + model_cfg = [ + i for i in cfg['models'] + if model_abbr_from_cfg(i) == model + ][0] + dataset_cfg = [ + i for i in cfg['datasets'] + if dataset_abbr_from_cfg(i) == dataset + ][0] + this_cfg = {'models': model_cfg, 'datasets': dataset_cfg} + + # dict combine + data_model_results = { + 'predictions': this_prediction, + 'results': this_result, + 'cfg': this_cfg + } + with open(osp.join(result_path, result_file_name), + 'w') as f: + json.dump(data_model_results, + f, + ensure_ascii=False, + indent=4) + f.close() + print( + 'successfully save result of {} with {} to the station' + .format(dataset, model)) + return True + + # subjective processing + else: + model_list = [model for model in cfg['models']] + judge_list = [judge_model for judge_model in cfg['judge_models']] + model_pair_list = [[ + deal_with_judge_model_abbr(model, judge_model) + for judge_model in judge_list + ] for model in model_list] + + dataset_list = [[ + dataset_abbr_from_cfg(dataset), + [dataset_abbr_from_cfg(base) for base in dataset['base_models']] + ] if 'base_models' in dataset.keys() else + [dataset_abbr_from_cfg(dataset), ['']] + for dataset in cfg['datasets']] + + rs_exist_results = [] + if 'rs_exist_results' in cfg.keys(): + rs_exist_results = cfg['rs_exist_results'] + + for pair_of_dataset_and_base in dataset_list: + dataset, base_list = pair_of_dataset_and_base[ + 0], pair_of_dataset_and_base[1] + + result_path = osp.join(station_path, dataset) + if not osp.exists(result_path): + os.makedirs(result_path) + + for base_model in base_list: + base_model_name = base_model + if base_model_name != '': + base_model_name += '_' + for model_pair_sub_list in model_pair_list: + for model_pair in model_pair_sub_list: + model = model_abbr_from_cfg(model_pair[0]) + model_result = model_abbr_from_cfg(model_pair) + if ([model, dataset] in rs_exist_results + and not args.station_overwrite): + continue + result_file_name = (base_model_name + model_result + + '.json') + if osp.exists(osp.join(result_path, result_file_name) + ) and not args.station_overwrite: + print('{} at {} already exists'.format( + result_file_name, result_path)) + continue + else: + # get result dict + local_result_path = osp.join( + work_dict, 'results', + base_model_name + model_result) + local_result_json = osp.join( + local_result_path, dataset + '.json') + if not osp.exists(local_result_json): + if args.mode == 'viz': + continue + raise ValueError('invalid file: 
{}'.format( + local_result_json)) + with open(local_result_json, 'r') as f: + this_result = json.load(f) + f.close() + + # get prediction list + local_prediction_path = osp.join( + work_dict, 'predictions', model) + local_prediction_regex = \ + rf'^{re.escape(dataset)}(?:_\d+)?\.json$' + local_prediction_json = find_files_by_regex( + local_prediction_path, local_prediction_regex) + if not check_filenames(dataset, + local_prediction_json + ) and args.mode != 'viz': + raise ValueError('invalid filelist: {}'.format( + local_prediction_json)) + + this_prediction = [] + for prediction_json in local_prediction_json: + with open( + osp.join(local_prediction_path, + prediction_json), 'r') as f: + this_prediction_load_json = json.load(f) + f.close() + for prekey in this_prediction_load_json.keys(): + this_prediction.append( + this_prediction_load_json[prekey]) + + # get config dict + model_cfg = [ + i for i in cfg['models'] + if model_abbr_from_cfg(i) == model + ][0] + dataset_cfg = [ + i for i in cfg['datasets'] + if dataset_abbr_from_cfg(i) == dataset + ][0] + judge_model_cfg = [ + i for i in cfg['judge_models'] + if 'judged-by--' + model_abbr_from_cfg(i) == + model_abbr_from_cfg(model_pair[1]) + ] + + this_cfg = { + 'models': model_cfg, + 'datasets': dataset_cfg, + 'judge_models': judge_model_cfg + } + + # dict combine + data_model_results = { + 'predictions': this_prediction, + 'results': this_result, + 'cfg': this_cfg + } + + with open(osp.join(result_path, result_file_name), + 'w') as f: + json.dump(data_model_results, + f, + ensure_ascii=False, + indent=4) + f.close() + print('successfully save result: {} at {} to the' + 'station'.format(result_file_name, + result_path)) + return True + + +def read_from_station(cfg, args): + + assert args.station_path is not None or cfg.get('station_path') is not None + if args.station_path is not None: + station_path = args.station_path + else: + station_path = cfg.get('station_path') + + # objective check + if 'judge_models' not in cfg.keys(): + model_list = [model_abbr_from_cfg(model) for model in cfg['models']] + dataset_list = [ + dataset_abbr_from_cfg(dataset) for dataset in cfg['datasets'] + ] + + existing_results_list = [] + result_local_path = osp.join(cfg['work_dir'], 'results') + if not osp.exists(result_local_path): + os.makedirs(result_local_path) + + for dataset in dataset_list: + for model in model_list: + result_file_path = osp.join(station_path, dataset, + model + '.json') + if not osp.exists(result_file_path): + print('do not find result file: {} with {} at station'. 
+ format(model, dataset)) + continue + else: + print('find result file: {} with {} at station'.format( + model, dataset)) + with open(result_file_path, 'r') as f: + download_json = json.load(f) + f.close() + existing_results_list.append({ + 'combination': [model, dataset], + 'file': + download_json + }) + + # save results to local + for i in existing_results_list: + this_result = i['file']['results'] + this_result_local_path = osp.join(result_local_path, + i['combination'][0]) + if not osp.exists(this_result_local_path): + os.makedirs(this_result_local_path) + this_result_local_file_path = osp.join( + this_result_local_path, i['combination'][1] + '.json') + if osp.exists(this_result_local_file_path): + continue + with open(this_result_local_file_path, 'w') as f: + json.dump(this_result, f, ensure_ascii=False, indent=4) + f.close() + + return existing_results_list + + # subjective check + else: + model_list = [model for model in cfg['models']] + judge_list = [judge_model for judge_model in cfg['judge_models']] + model_pair_list = [[ + deal_with_judge_model_abbr(model, judge_model) + for judge_model in judge_list + ] for model in model_list] + + dataset_list = [[ + dataset_abbr_from_cfg(dataset), + [dataset_abbr_from_cfg(base) for base in dataset['base_models']] + ] if 'base_models' in dataset.keys() else + [dataset_abbr_from_cfg(dataset), ['']] + for dataset in cfg['datasets']] + + existing_results_list = [] + result_local_path = osp.join(cfg['work_dir'], 'results') + if not osp.exists(result_local_path): + os.makedirs(result_local_path) + + for pair_of_dataset_and_base in dataset_list: + dataset, base_list = pair_of_dataset_and_base[ + 0], pair_of_dataset_and_base[1] + + for model_pair_sub_list in model_pair_list: + result_file_path_list_origin = [] + for model_pair in model_pair_sub_list: + model_result = model_abbr_from_cfg(model_pair) + for base_model in base_list: + base_model_name = base_model + if base_model_name != '': + base_model_name += '_' + + result_file_path_list_origin.append( + osp.join(station_path, dataset, + base_model_name + model_result + '.json')) + + result_file_path_list = [ + result_file_path + for result_file_path in result_file_path_list_origin + if osp.exists(result_file_path) + ] + model = model_abbr_from_cfg(model_pair_sub_list[0][0]) + + # save all parts of results to local + for result_file_path in result_file_path_list: + with open(result_file_path, 'r') as f: + this_result = json.load(f)['results'] + f.close() + this_result_local_path = osp.join( + result_local_path, + osp.splitext(osp.basename(result_file_path))[0]) + if not osp.exists(this_result_local_path): + os.makedirs(this_result_local_path) + this_result_local_file_path = osp.join( + this_result_local_path, dataset + '.json') + if osp.exists(this_result_local_file_path): + continue + with open(this_result_local_file_path, 'w') as f: + json.dump(this_result, f, ensure_ascii=False, indent=4) + f.close() + + # check whether complete + if len(result_file_path_list) == len( + result_file_path_list_origin): + print('find complete results of {} with {} at station'. + format(model, dataset)) + existing_results_list.append({ + 'combination': [model, dataset], + 'file': + result_file_path_list + }) + else: + print('results of {} with {} at station is not complete'. 
+ format(model, dataset)) + + return existing_results_list + + +def find_files_by_regex(directory, pattern): + + regex = re.compile(pattern) + + matched_files = [] + for filename in os.listdir(directory): + if regex.match(filename): + matched_files.append(filename) + + return matched_files + + +def check_filenames(x, filenames): + + if not filenames: + return False + + single_pattern = re.compile(rf'^{re.escape(x)}\.json$') + numbered_pattern = re.compile(rf'^{re.escape(x)}_(\d+)\.json$') + + is_single = all(single_pattern.match(name) for name in filenames) + is_numbered = all(numbered_pattern.match(name) for name in filenames) + + if not (is_single or is_numbered): + return False + + if is_single: + return len(filenames) == 1 + + if is_numbered: + numbers = [] + for name in filenames: + match = numbered_pattern.match(name) + if match: + numbers.append(int(match.group(1))) + + if sorted(numbers) != list(range(len(numbers))): + return False + + return True From 277d7946f5ac314138b8c30e985ebde87552e474 Mon Sep 17 00:00:00 2001 From: Shudong Liu Date: Wed, 5 Mar 2025 19:37:22 +0800 Subject: [PATCH 21/58] [Fix] Fix typo in deepseed_r1.md (#1916) --- docs/en/user_guides/deepseek_r1.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/user_guides/deepseek_r1.md b/docs/en/user_guides/deepseek_r1.md index 923ea05b..95a9c9ed 100644 --- a/docs/en/user_guides/deepseek_r1.md +++ b/docs/en/user_guides/deepseek_r1.md @@ -12,7 +12,7 @@ Please follow OpenCompass's installation guide. ## Evaluation Configuration Setup -We provide example configurations in `example/eval_deepseek_r1.py`. Below is the configuration explanation: +We provide example configurations in `examples/eval_deepseek_r1.py`. Below is the configuration explanation: ### Configuration Interpretation @@ -116,7 +116,7 @@ work_dir = "outputs/deepseek_r1_reasoning" ### Scenario 1: Model loaded on 1 GPU, data evaluated by 1 worker, using a total of 1 GPU ```bash -opencompass example/eval_deepseek_r1.py --debug --dump-eval-details +opencompass examples/eval_deepseek_r1.py --debug --dump-eval-details ``` Evaluation logs will be output in the command line. @@ -135,7 +135,7 @@ infer = dict( At the same time, remove the `--debug` parameter from the evaluation command ```bash -opencompass example/eval_deepseek_r1.py --dump-eval-details +opencompass examples/eval_deepseek_r1.py --dump-eval-details ``` In this mode, OpenCompass will use multithreading to start `$num_worker` tasks. Specific logs will not be displayed in the command line, instead, detailed evaluation logs will be shown under `$work_dir`. 
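For readers following the multi-worker scenario in the guide above, the patch refers to the `infer` block of the example config without showing it in full. As a rough illustration only, a minimal sketch of such a block is given below; it assumes the standard OpenCompass `NumWorkerPartitioner`, `LocalRunner`, and `OpenICLInferTask` classes, and the exact classes and worker counts used in `examples/eval_deepseek_r1.py` may differ.

```python
# Minimal sketch (assumed values): shard the datasets across workers and run
# the resulting inference tasks locally in parallel.
from opencompass.partitioners import NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

infer = dict(
    # num_worker controls how many shards (and thus parallel tasks) are created
    partitioner=dict(type=NumWorkerPartitioner, num_worker=2),
    runner=dict(
        type=LocalRunner,
        max_num_workers=2,  # upper bound on concurrently running tasks
        task=dict(type=OpenICLInferTask),
    ),
)
```

With a configuration along these lines and the `--debug` flag removed, OpenCompass launches up to `num_worker` inference tasks in parallel and writes their detailed logs under the working directory instead of the console, as described in the guide.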
From 570c30cf1b38341694215daf812984b4a92961cc Mon Sep 17 00:00:00 2001 From: Myhs_phz Date: Fri, 7 Mar 2025 18:24:30 +0800 Subject: [PATCH 22/58] [Fix] Fix CLI option for results persistence (#1920) * fix * fix * fix --- opencompass/cli/main.py | 8 +++++--- opencompass/utils/result_station.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/opencompass/cli/main.py b/opencompass/cli/main.py index 4eaa5b9b..a5937033 100644 --- a/opencompass/cli/main.py +++ b/opencompass/cli/main.py @@ -263,9 +263,11 @@ def main(): else: dir_time_str = args.reuse logger.info(f'Reusing experiements from {dir_time_str}') - elif args.mode in ['eval', 'viz']: - raise ValueError('You must specify -r or --reuse when running in eval ' - 'or viz mode!') + elif args.mode in ['eval', 'viz'] and not args.read_from_station: + raise ValueError( + 'You must specify -r or --reuse, or you have to specify ' + '--read-from-station and --station-path when running in eval ' + 'or viz mode!') # update "actual" work_dir cfg['work_dir'] = osp.join(cfg.work_dir, dir_time_str) diff --git a/opencompass/utils/result_station.py b/opencompass/utils/result_station.py index 7ad6b9eb..f1cedd00 100644 --- a/opencompass/utils/result_station.py +++ b/opencompass/utils/result_station.py @@ -210,7 +210,7 @@ def save_to_station(cfg, args): i for i in cfg['judge_models'] if 'judged-by--' + model_abbr_from_cfg(i) == model_abbr_from_cfg(model_pair[1]) - ] + ][0] this_cfg = { 'models': model_cfg, From cbf84fb33c903cfc15e56322bb4d9801001b9409 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Fri, 7 Mar 2025 21:01:20 +0800 Subject: [PATCH 23/58] [Feature] Update LLM Evaluation for MMLU-Pro (#1923) --- ...o_0shot_nocot_genericllmeval_gen_08c1de.py | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 opencompass/configs/datasets/mmlu_pro/mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de.py diff --git a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de.py b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de.py new file mode 100644 index 00000000..e12f43fe --- /dev/null +++ b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de.py @@ -0,0 +1,106 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import MMLUProDataset, generic_llmjudge_postprocess + +with read_base(): + from .mmlu_pro_categories import categories + + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering. + +Question:\n +{question} + +Options:\n +{options_str} + +""".strip() + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. 
You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {question}\n {options_str} \n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +mmlu_pro_datasets = [] + +for category in categories: + mmlu_pro_reader_cfg = dict( + input_columns=['question', 'cot_content', 'options_str'], + output_column='answer', + train_split='validation', + test_split='test', + ) + mmlu_pro_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_pro_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + ) + + mmlu_pro_datasets.append( + dict( + abbr=f'mmlu_pro_{category.replace(" ", "_")}', + type=MMLUProDataset, + path='opencompass/mmlu_pro', + category=category, + reader_cfg=mmlu_pro_reader_cfg, + infer_cfg=mmlu_pro_infer_cfg, + eval_cfg=mmlu_pro_eval_cfg, + ) + ) From e403fd21be1782537b38fd335033827effba5000 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Tue, 11 Mar 2025 17:35:04 +0800 Subject: [PATCH 24/58] [Fix] Fix math-verify evaluator (#1917) * update * update * update --- opencompass/openicl/icl_evaluator/__init__.py | 1 + .../openicl/icl_evaluator/math_evaluator.py | 14 +++++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff 
--git a/opencompass/openicl/icl_evaluator/__init__.py b/opencompass/openicl/icl_evaluator/__init__.py index 1fd1683b..fa8f25ab 100644 --- a/opencompass/openicl/icl_evaluator/__init__.py +++ b/opencompass/openicl/icl_evaluator/__init__.py @@ -12,3 +12,4 @@ from .icl_misc_evaluator import AveragePPLEvaluator # noqa from .icl_plugin_evaluator import TEvalEvaluator # noqa from .icl_toxic_evaluator import ToxicEvaluator # noqa from .lm_evaluator import LMEvaluator # noqa +from .math_evaluator import MATHEvaluator # noqa diff --git a/opencompass/openicl/icl_evaluator/math_evaluator.py b/opencompass/openicl/icl_evaluator/math_evaluator.py index c790c17b..48764252 100644 --- a/opencompass/openicl/icl_evaluator/math_evaluator.py +++ b/opencompass/openicl/icl_evaluator/math_evaluator.py @@ -1,7 +1,3 @@ -from latex2sympy2_extended import NormalizationConfig -from math_verify import (ExprExtractionConfig, LatexExtractionConfig, parse, - verify) - from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS @@ -10,6 +6,14 @@ from opencompass.registry import ICL_EVALUATORS class MATHEvaluator(BaseEvaluator): def score(self, predictions, references): + try: + from latex2sympy2_extended import NormalizationConfig + from math_verify import (ExprExtractionConfig, + LatexExtractionConfig, parse, verify) + except ImportError: + raise ImportError('Failed to import required modules. Please ' + 'install the necessary packages: ' + 'pip install math_verify latex2sympy2_extended') self.is_num_equal(predictions, references) @@ -75,7 +79,7 @@ class MATHEvaluator(BaseEvaluator): if __name__ == '__main__': import sympy - + from math_verify import parse test_cases = [ # 1. Basic arithmetic operations r'Simple fraction: \boxed{\frac{1}{2}}', From 59e49aedf1f836bb455d8cdc0386fc7fe5ba668a Mon Sep 17 00:00:00 2001 From: Kangreen <69177317+kangreen0210@users.noreply.github.com> Date: Tue, 11 Mar 2025 19:32:08 +0800 Subject: [PATCH 25/58] [Feature] Support SuperGPQA (#1924) * support supergpqa * remove unnecessary code * remove unnecessary code * Add Readme * Add Readme * fix lint * fix lint * update * update --------- Co-authored-by: mkj3085003 Co-authored-by: MaiziXiao --- README.md | 1 + README_zh-CN.md | 1 + dataset-index.yml | 8 +- .../datasets/supergpqa/supergpqa_gen.py | 57 ++ opencompass/datasets/__init__.py | 1 + opencompass/datasets/supergpqa/__init__.py | 0 opencompass/datasets/supergpqa/supergpqa.py | 184 +++++ .../config_default.yaml | 17 + .../config_reasoning_models.yaml | 17 + .../config_wrapper.py | 88 +++ .../prompt/five-shot.yaml | 91 +++ .../prompt/robustness-exp.yaml | 23 + .../prompt/zero-shot-with-subfield.yaml | 5 + .../prompt/zero-shot.yaml | 5 + .../datasets/supergpqa/supergpqa_eval.py | 96 +++ .../datasets/supergpqa/supergpqa_utils.py | 693 ++++++++++++++++++ .../icl_evaluator/icl_base_evaluator.py | 38 +- 17 files changed, 1317 insertions(+), 8 deletions(-) create mode 100644 opencompass/configs/datasets/supergpqa/supergpqa_gen.py create mode 100644 opencompass/datasets/supergpqa/__init__.py create mode 100644 opencompass/datasets/supergpqa/supergpqa.py create mode 100644 opencompass/datasets/supergpqa/supergpqa_dataset_config/config_default.yaml create mode 100644 opencompass/datasets/supergpqa/supergpqa_dataset_config/config_reasoning_models.yaml create mode 100644 opencompass/datasets/supergpqa/supergpqa_dataset_config/config_wrapper.py create mode 100644 opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/five-shot.yaml create mode 
100644 opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/robustness-exp.yaml create mode 100644 opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/zero-shot-with-subfield.yaml create mode 100644 opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/zero-shot.yaml create mode 100644 opencompass/datasets/supergpqa/supergpqa_eval.py create mode 100644 opencompass/datasets/supergpqa/supergpqa_utils.py diff --git a/README.md b/README.md index 93c2a5fd..4a29f2b7 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through ## 🚀 What's New +- **\[2025.03.11\]** We have supported evaluation for `SuperGPQA` which is a great benchmark for measuring LLM knowledge ability 🔥🔥🔥 - **\[2025.02.28\]** We have added a tutorial for `DeepSeek-R1` series model, please check [Evaluating Reasoning Model](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥 - **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥 - **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks. diff --git a/README_zh-CN.md b/README_zh-CN.md index 55c2faf5..e1bc6f7f 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -57,6 +57,7 @@ ## 🚀 最新进展 +- **\[2025.03.11\]** 现已支持 `SuperGPQA` 覆盖285 个研究生学科的知识能力评测,欢迎尝试!🔥🔥🔥 - **\[2025.02.28\]** 我们为 `DeepSeek-R1` 系列模型添加了教程,请查看 [评估推理模型](docs/en/user_guides/deepseek_r1.md) 了解更多详情!🔥🔥🔥 - **\[2025.02.15\]** 我们新增了两个实用的评测工具:用于LLM作为评判器的`GenericLLMEvaluator`和用于数学推理评估的`MATHEvaluator`。查看[LLM评判器](docs/zh_cn/advanced_guides/llm_judge.md)和[数学能力评测](docs/zh_cn/advanced_guides/general_math.md)文档了解更多详情!🔥🔥🔥 - **\[2025.01.16\]** 我们现已支持 [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) 模型,该模型在推理、知识类任务上取得同量级最优性能,欢迎尝试。 diff --git a/dataset-index.yml b/dataset-index.yml index b8ec7041..f72e7362 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -734,6 +734,8 @@ category: Understanding paper: https://arxiv.org/pdf/1808.08745 configpath: opencompass/configs/datasets/Xsum - - - +- supergpqa: + name: SuperGPQA + category: Knowledge + paper: https://arxiv.org/pdf/2502.14739 + configpath: opencompass/configs/datasets/supergpqa diff --git a/opencompass/configs/datasets/supergpqa/supergpqa_gen.py b/opencompass/configs/datasets/supergpqa/supergpqa_gen.py new file mode 100644 index 00000000..edf2e57e --- /dev/null +++ b/opencompass/configs/datasets/supergpqa/supergpqa_gen.py @@ -0,0 +1,57 @@ +from opencompass.datasets.supergpqa.supergpqa import ( + SuperGPQADataset, + SuperGPQAEvaluator, +) +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + + +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'question', + 'options', + 'discipline', + 'field', + 'subfield', + 'difficulty', + 'infer_prompt', + 'prompt_mode', + ], + output_column='answer_letter', +) + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + 
prompt='{infer_prompt}', + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict(type=SuperGPQAEvaluator), + pred_role='BOT', +) +supergpqa_dataset = dict( + type=SuperGPQADataset, + abbr='supergpqa', + path='m-a-p/SuperGPQA', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, +) + +supergpqa_datasets = [supergpqa_dataset] diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 4052c630..ffcc217d 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -127,6 +127,7 @@ from .strategyqa import * # noqa: F401, F403 from .subjective import * # noqa: F401, F403 from .summedits import * # noqa: F401, F403 from .summscreen import * # noqa: F401, F403 +from .supergpqa import * # noqa: F401, F403 from .svamp import * # noqa: F401, F403 from .tabmwp import * # noqa: F401, F403 from .taco import * # noqa: F401, F403 diff --git a/opencompass/datasets/supergpqa/__init__.py b/opencompass/datasets/supergpqa/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/opencompass/datasets/supergpqa/supergpqa.py b/opencompass/datasets/supergpqa/supergpqa.py new file mode 100644 index 00000000..7193722d --- /dev/null +++ b/opencompass/datasets/supergpqa/supergpqa.py @@ -0,0 +1,184 @@ +import os + +from datasets import Dataset, load_dataset + +from opencompass.datasets.supergpqa.supergpqa_eval import ( + extract_option_content, extract_option_labels) +from opencompass.datasets.supergpqa.supergpqa_utils import load_yaml +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_data_path + +from ..base import BaseDataset + + +def _parse(item, template, prompt_mode): + prompt_format = [ + item['question'] + '\n' + '\n'.join([ + f'{chr(65+i)}) {option}' + for i, option in enumerate(item['options']) + ]) + ] + item['infer_prompt'] = template['prompt_format'][0].format(*prompt_format) + item['prompt_mode'] = prompt_mode + return item + + +@LOAD_DATASET.register_module() +class SuperGPQADataset(BaseDataset): + + @staticmethod + def load(path: str, prompt_mode: str, **kwargs): + path = get_data_path(path, local_mode=True) + dataset = load_dataset(path, split='train') + + # get prompt template + template_path = None + if prompt_mode == 'zero-shot': + template_path = os.path.join( + os.path.dirname(__file__), + 'supergpqa_dataset_config/prompt/zero-shot.yaml', + ) + elif prompt_mode == 'five-shot': + template_path = os.path.join( + os.path.dirname(__file__), + 'supergpqa_dataset_config/prompt/five-shot.yaml', + ) + try: + template = load_yaml(template_path) + except FileNotFoundError: + print(f'[ERROR] Missing prompt template: {template_path}') + return Dataset.from_list([]) + + dataset = dataset.map(lambda item: _parse(item, template, prompt_mode)) + return dataset + + +@ICL_EVALUATORS.register_module() +class SuperGPQAEvaluator(BaseEvaluator): + + def __init__(self): + super().__init__() + + def score(self, predictions, references, test_set): + mode = test_set[0]['prompt_mode'] + acc = 0 + count = 0 + err = 0 + miss = 0 + acc_difficulty = {'hard': 0, 'middle': 0, 'easy': 0} + count_difficulty = {'hard': 0, 'middle': 0, 'easy': 0} + stats = {'discipline': {}, 'field': {}, 'subfield': {}} + details = [] + for i, sample in enumerate(test_set): + sample['pred'] = prediction = predictions[i] + gold 
= references[i] + if mode == 'zero-shot': + predict = extract_option_labels(prediction, 'ABCDEFGHIJ') + if predict is None: + predict = extract_option_content(prediction, + sample['options']) + predict = (chr(sample['options'].index(predict) + + 65) if predict else None) + sample['extracted_answer'] = predict + elif mode == 'five-shot': + response = prediction.split('Question:')[0] + predict = extract_option_labels(response, 'ABCDEFGHIJ') + if predict is None: + predict = extract_option_content(response, + sample['options']) + predict = (chr(sample['options'].index(predict) + + 65) if predict else None) + if predict is None: + predict = extract_option_labels(prediction, 'ABCDEFGHIJ') + if predict is None: + predict = extract_option_content( + prediction, sample['options']) + predict = (chr(sample['options'].index(predict) + + 65) if predict else None) + sample['extracted_answer'] = predict + + discipline = sample.get('discipline', 'unknown') + field = sample.get('field', 'unknown') + subfield = sample.get('subfield', 'unknown') + difficulty = sample.get('difficulty', 'unknown') + + for level, key in [ + ('discipline', discipline), + # ('field', f"{discipline}/{field}"), + # ('subfield', f"{discipline}/{field}/{subfield}"), + ]: + if key not in stats[level]: + stats[level][key] = { + 'correct': 0, + 'total': 0, + 'miss': 0, + 'error': 0, + 'discipline': discipline, + 'field': field, + 'subfield': subfield, + 'difficulty': { + 'easy': { + 'correct': 0, + 'total': 0 + }, + 'middle': { + 'correct': 0, + 'total': 0 + }, + 'hard': { + 'correct': 0, + 'total': 0 + }, + }, + } + + stats[level][key]['total'] += 1 + stats[level][key]['difficulty'][difficulty]['total'] += 1 + + answer_letter = sample['answer_letter'] + assert answer_letter == gold + if predict and answer_letter == predict: + acc += 1 + acc_difficulty[difficulty] += 1 + sample['status'] = 'correct' + stats[level][key]['correct'] += 1 + stats[level][key]['difficulty'][difficulty]['correct'] += 1 + elif predict is None or predict == '': + miss += 1 + sample['status'] = 'miss' + stats[level][key]['miss'] += 1 + elif predict == 'error': + err += 1 + sample['status'] = 'error' + stats[level][key]['error'] += 1 + else: + sample['status'] = 'incorrect' + count += 1 + count_difficulty[difficulty] += 1 + details.append({ + 'pred': sample['pred'], + 'answer': sample['answer'], + 'parsed_answer': sample['extracted_answer'], + 'correct': True if sample['status'] else False, + }) + + return { + 'accuracy': + acc / count if count > 0 else 0, + 'error_rate': + err / count if count > 0 else 0, + 'miss_rate': + miss / count if count > 0 else 0, + 'hard_accuracy': + (acc_difficulty['hard'] / + count_difficulty['hard'] if count_difficulty['hard'] > 0 else 0), + 'middle_accuracy': + (acc_difficulty['middle'] / count_difficulty['middle'] + if count_difficulty['middle'] > 0 else 0), + 'easy_accuracy': + (acc_difficulty['easy'] / + count_difficulty['easy'] if count_difficulty['easy'] > 0 else 0), + 'details': + details, + } diff --git a/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_default.yaml b/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_default.yaml new file mode 100644 index 00000000..9cbb006b --- /dev/null +++ b/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_default.yaml @@ -0,0 +1,17 @@ +response_key: 'response' +error_key: 'error' +id_key: + - 'uuid' +prompt_key: 'prompt' + + + +history_key: 'history' +status_key: 'status' + +save_prompt: True +max_tokens: 4096 +temperatrue: 0.0 + +max_rounds: 30 
+BoN: 32 \ No newline at end of file diff --git a/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_reasoning_models.yaml b/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_reasoning_models.yaml new file mode 100644 index 00000000..c1fd105c --- /dev/null +++ b/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_reasoning_models.yaml @@ -0,0 +1,17 @@ +response_key: 'response' +error_key: 'error' +id_key: + - 'uuid' +prompt_key: 'prompt' + + + +history_key: 'history' +status_key: 'status' + +save_prompt: True +max_tokens: 32768 +temperatrue: 0.0 + +max_rounds: 30 +BoN: 32 \ No newline at end of file diff --git a/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_wrapper.py b/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_wrapper.py new file mode 100644 index 00000000..45c02e5a --- /dev/null +++ b/opencompass/datasets/supergpqa/supergpqa_dataset_config/config_wrapper.py @@ -0,0 +1,88 @@ +import yaml + + +class ConfigWrapper: + + def __init__(self, config_path): + self._config = {} + with open(config_path, 'r') as file: + self._config = yaml.safe_load(file) + for key, value in self._config.items(): + setattr(self, key, value) + + def __setattr__(self, key, value): + if key.startswith('_'): + super().__setattr__(key, value) + else: + self._config[key] = value + super().__setattr__(key, value) + + def __getattr__(self, key): + if key in self._config: + return self._config[key] + raise AttributeError( + f"'ConfigWrapper' object has no attribute '{key}'") + + def get_id(self, data): + if isinstance(self._config.get('id_key'), str): + return data.get(self._config.get('id_key'), None) + elif isinstance(self._config.get('id_key'), list): + return '_'.join([ + str(data[key]) for key in self._config.get('id_key') + if key in data + ]) + + def print_all_keys(self): + print('config keys:') + for key, value in self._config.items(): + print(f' - {key}: {value}') + + +config_wrapper = None + + +def initialize_config(config_path): + global config_wrapper + config_wrapper = ConfigWrapper(config_path) + + +def get_config_wrapper(): + global config_wrapper + if config_wrapper is None: + raise RuntimeError( + 'ConfigWrapper not initialized. Call initialize_config first.') + return config_wrapper + + +if __name__ == '__main__': + config_path = 'config/config.yaml' + initialize_config(config_path) + data = { + 'idx': + '50', + 'step': + 21, + 'question': + 'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"\n\n' + 'Please provide the decrypted answer, encapsulated in double square' + ' brackets. For example, the format should be: [[decrypted answer]].', + 'answer': + '[[P]]', + 'category': + 'Decryption', + 'rule_id': + '23', + 'input': + 'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"', + 'steps_num': + 23, + 'description': + 'For a number c=228 in the ciphertext:\n' + 'Calculate z = c^e mod n. Here ^ means multiplication.\nz is 80.' + '\nBased on the decimal number represented by z, use the ascii ' + 'code to find the corresponding letter as the plaintext letter p.' 
+ '\nPlease give the letter p in [[...]] format.\n', + 'atom': + 80, + } + print(config_wrapper.get_id(data)) diff --git a/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/five-shot.yaml b/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/five-shot.yaml new file mode 100644 index 00000000..73cdb985 --- /dev/null +++ b/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/five-shot.yaml @@ -0,0 +1,91 @@ +prompt_format: + - | + Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J. + + Question: + A refracting telescope consists of two converging lenses separated by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification of the telescope is + A) 10 + B) 40 + C) 6 + D) 25 + E) 15 + F) 50 + G) 30 + H) 4 + I) 5 + J) 20 + + Answer: Let's think step by step. In a refracting telescope, if both lenses are converging, the focus of both lenses must be between the two lenses, and thus the focal lengths of the two lenses must add up to their separation. Since the focal length of one lens is 20 cm, the focal length of the other must be 80 cm. The magnification is the ratio of these two focal lengths, or 4. + Answer: H. + + Question: + Say the pupil of your eye has a diameter of 5 mm and you have a telescope with an aperture of 50 cm. How much more light can the telescope gather than your eye? + A) 1000 times more + B) 50 times more + C) 5000 times more + D) 500 times more + E) 10000 times more + F) 20000 times more + G) 2000 times more + H) 100 times more + I) 10 times more + J) N/A + + Answer: Let's think step by step. The amount of light a telescope can gather compared to the human eye is proportional to the area of its apertures. The area of a circle is given by the formula $A = \pi \left(\frac{{D}}{{2}}\right)^2$, where $D$ is the diameter. Therefore, the relative light-gathering power is calculated as: + \[ + \frac{{\left(\frac{{50 \text{{ cm}}}}{{2}}\right)^2}}{{\left(\frac{{5 \text{{ mm}}}}{{2}}\right)^2}} = \frac{{\left(\frac{{50 \text{{ cm}}}}{{0.1 \text{{ cm}}}}\right)^2}}{{\left(\frac{{5 \text{{ mm}}}}{{0.1 \text{{ cm}}}}\right)^2}} = \frac{{500^2}}{{5^2}} = 10000. + \] + Answer: E. + + Question: + Where do most short-period comets come from and how do we know? + A) The Kuiper belt; short period comets tend to be in the plane of the solar system like the Kuiper belt. + B) The asteroid belt; short period comets tend to come from random directions indicating a spherical distribution of comets called the asteroid belt. + C) The asteroid belt; short period comets tend to be in the plane of the solar system just like the asteroid belt. + D) The Oort cloud; short period comets have orbital periods similar to asteroids like Vesta and are found in the plane of the solar system just like the Oort cloud. + E) The Oort Cloud; short period comets tend to come from random directions indicating a spherical distribution of comets called the Oort Cloud. + F) The Oort cloud; short period comets tend to be in the plane of the solar system just like the Oort cloud. + G) The asteroid belt; short period comets have orbital periods similar to asteroids like Vesta and are found in the plane of the solar system just like the asteroid belt. + Answer: Let's think step by step. Most short-period comets originate from the Kuiper belt. 
This is deduced from the observation that these comets tend to follow orbits that lie in the plane of the solar system, similar to the distribution of objects in the Kuiper belt itself. Thus, the alignment of these cometary orbits with the ecliptic plane points to their Kuiper belt origin. + Answer: A. + + Question: + Colors in a soap bubble result from light + A) dispersion + B) deflection + C) refraction + D) reflection + E) interference + F) converted to a different frequency + G) polarization + H) absorption + I) diffraction + J) transmission + + Answer: Let's think step by step. The colorful patterns observed in a soap bubble are caused by the phenomenon of light interference. This occurs when light waves bounce between the two surfaces of the soap film, combining constructively or destructively based on their phase differences and the varying thickness of the film. These interactions result in vibrant color patterns due to variations in the intensity of different wavelengths of light. + Answer: E. + + Question: + A microwave oven is connected to an outlet, 120 V, and draws a current of 2 amps. At what rate is energy being used by the microwave oven? + A) 240 W + B) 120 W + C) 10 W + D) 480 W + E) 360 W + F) 200 W + G) 30 W + H) 150 W + I) 60 W + J) 300 W + + Answer: Let's think step by step. The rate of energy usage, known as power, in an electrical circuit is calculated by the product of voltage and current. For a microwave oven connected to a 120 V outlet and drawing a current of 2 amps, the power consumption can be calculated as follows: + \[ + \text{{Power}} = \text{{Voltage}} \times \text{{Current}} = 120 \, \text{{V}} \times 2 \, \text{{A}} = 240 \, \text{{W}}. + \] + Therefore, the microwave oven uses energy at a rate of 240 watts. + Answer: A. + + Question: + {} + + Answer: Let's think step by step. diff --git a/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/robustness-exp.yaml b/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/robustness-exp.yaml new file mode 100644 index 00000000..6cffdd93 --- /dev/null +++ b/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/robustness-exp.yaml @@ -0,0 +1,23 @@ +initial_prompt_0: + - | + Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J. + + {} + +initial_prompt_1: + - | + You are a helpful assistant. Answer the given multiple-choice question. Only one option is correct. The last line of your response should be in the format 'The correct answer is: $LETTER', where LETTER is one of A, B, C, D, E, F, G, H, I, or J. + + {} + +initial_prompt_2: + - | + Select the correct answer for the following multiple-choice question. There is only one valid choice. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J. + + {} + +initial_prompt_3: + - | + Review the following multiple-choice question and choose the one correct answer. Ensure that your response concludes with a line exactly formatted as 'The correct answer is: $LETTER', where LETTER represents one of A, B, C, D, E, F, G, H, I, or J. 
+ + {} \ No newline at end of file diff --git a/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/zero-shot-with-subfield.yaml b/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/zero-shot-with-subfield.yaml new file mode 100644 index 00000000..991c6a4d --- /dev/null +++ b/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/zero-shot-with-subfield.yaml @@ -0,0 +1,5 @@ +prompt_format: + - | + Answer the following multiple choice question about {}. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J. + + {} \ No newline at end of file diff --git a/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/zero-shot.yaml b/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/zero-shot.yaml new file mode 100644 index 00000000..5f1ead34 --- /dev/null +++ b/opencompass/datasets/supergpqa/supergpqa_dataset_config/prompt/zero-shot.yaml @@ -0,0 +1,5 @@ +prompt_format: + - | + Answer the following multiple choice question. There is only one correct answer. The last line of your response should be in the format 'Answer: $LETTER' (without quotes), where LETTER is one of A, B, C, D, E, F, G, H, I, or J. + + {} diff --git a/opencompass/datasets/supergpqa/supergpqa_eval.py b/opencompass/datasets/supergpqa/supergpqa_eval.py new file mode 100644 index 00000000..e596a2f8 --- /dev/null +++ b/opencompass/datasets/supergpqa/supergpqa_eval.py @@ -0,0 +1,96 @@ +# flake8: noqa: W605 +import re + +import timeout_decorator + + +@timeout_decorator.timeout(5) # 5 seconds timeout +def safe_regex_search(pattern, text, flags=0): + try: + return re.search(pattern, text, flags) + except timeout_decorator.TimeoutError: + print(f'Regex match timeout: pattern={pattern}, text={text[:100]}...') + return None + except Exception as e: + print(f'Regex match error: {str(e)}') + return None + + +def extract_option_labels(text, options='ABCDEFGHIJ'): + if not isinstance(text, str) or not isinstance(options, str): + return 'error' + + text = text.rstrip() + last_line = text.split('\n')[-1] + + option_str = ''.join([chr(65 + i) for i in range(len(options)) + ]) if options else 'ABCDEFGHIJ' + + patterns = [ + # e.g. "The final answer to this question is: A." + # "The best option is $\boxed{B}:" + # "The correct answer is (C)." + f'[Tt]he\s+(?:\w+\s+)?(?:answer|option)(?:\w+\s+)?\s+is?:?\s*(?:[\*\$\\{{(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*([{option_str}])(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)', + + # e.g. "ANSWER: A" + # "Answer: $\boxed{B}." + # "ANSWER: (C):" + f'(?i:Answer)[\*\s]*:\s*(?:[\*\$\\{{(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*([{option_str}])(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)', + + # e.g. "A" + # "$\boxed{B}$" + # "(C)." 
+ # "[D]:" + f'^[^\w\r\n]*(?:[\*\$\\{{(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*([{option_str}])(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)', + ] + + for pattern in patterns: + match = safe_regex_search(pattern, last_line, re.IGNORECASE) + if match: + return match.group(1) + + for pattern in patterns: + match = safe_regex_search(pattern, text, re.IGNORECASE) + if match: + return match.group(1) + + return None + + +def extract_option_content(text, options_content=None): + if not isinstance(text, str) or not isinstance(options_content, list): + return 'error' + + escaped_options_content = [ + re.escape(option_content) for option_content in options_content + ] + escaped_options_content_str = '|'.join(escaped_options_content) + + text = text.rstrip() + last_line = text.split('\n')[-1] + + patterns = [ + f'[Tt]he\s+(?:\w+\s+)?(?:answer|option)(?:\w+\s+)?\s+is:?\s*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)', + f'(?i:Answer)\s*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)', + f'^[^\w\r\n]*(?:[\*\$\\{{\(\[\\\\(]*?(?:(?:\\\\boxed|\\\\mathbf|\\\\mathrm|\\\\text){{)?)*\s*({escaped_options_content_str})(?:\\\\?\}}?\$?\)?\]?\}}?)*(?:[\s:\.\*)]|$)', + ] + + for pattern in patterns: + match = safe_regex_search(pattern, last_line) + if match: + if match.group(1) in escaped_options_content: + return options_content[escaped_options_content.index( + match.group(1))] + else: + return match.group(1) + + for pattern in patterns: + match = safe_regex_search(pattern, text) + if match: + if match.group(1) in escaped_options_content: + return options_content[escaped_options_content.index( + match.group(1))] + else: + return match.group(1) + + return None diff --git a/opencompass/datasets/supergpqa/supergpqa_utils.py b/opencompass/datasets/supergpqa/supergpqa_utils.py new file mode 100644 index 00000000..c8913a9b --- /dev/null +++ b/opencompass/datasets/supergpqa/supergpqa_utils.py @@ -0,0 +1,693 @@ +import json +import os +import re + +import sympy as sp +import yaml +from sympy.parsing.latex import parse_latex + + +def load_yaml(yaml_path): + """Load a YAML file.""" + if not os.path.exists(yaml_path): + raise FileNotFoundError(f'YAML file not found: {yaml_path}') + with open(yaml_path, 'r', encoding='utf-8') as file: + return yaml.safe_load(file) + + +def load_json_or_jsonl(file_path): + """Load data from a JSON or JSONL file.""" + if not os.path.exists(file_path): + return None + with open(file_path, 'r', encoding='utf-8') as file: + if file_path.endswith('.json'): + return json.load(file) + elif file_path.endswith('.jsonl'): + return [json.loads(line) for line in file] + return None + + +def find_file(base_path, sub_path, extensions=('json', 'jsonl')): + """Find the first available file with given extensions.""" + for ext in extensions: + file_path = os.path.join(base_path, f'{sub_path}.{ext}') + if os.path.exists(file_path): + return file_path + return None + + +def load_json_or_jsonl_with_idx(data_path, split='', idx=None): + base_path = os.path.join(data_path, split) + if os.path.exists(f'{base_path}.json'): + file_path = f'{base_path}.json' + elif os.path.exists(f'{base_path}.jsonl'): + file_path = f'{base_path}.jsonl' + elif base_path.endswith('.json') or base_path.endswith('.jsonl'): + file_path = base_path + else: + raise FileNotFoundError('No JSON or JSONL file 
found.') + + with open(file_path, 'r', encoding='utf-8') as file: + if file_path.endswith('.json'): + data = json.load(file) + elif file_path.endswith('.jsonl'): + data = [json.loads(line) for line in file] + + if idx is not None: + try: + return next(item for item in data if item.get('idx') == idx) + except StopIteration: + raise ValueError(f'No entry found for idx {idx}') + else: + return data + + +def load_split_data(base_path, split_name): + """Load the rule and sample data for a specific split.""" + split_path = os.path.join(base_path, split_name) + rule_path = find_file(split_path, 'rule') + sample_path = find_file(split_path, 'sample') + + rules = load_json_or_jsonl(rule_path) if rule_path else [] + samples = load_json_or_jsonl(sample_path) if sample_path else [] + + return {'rules': rules, 'samples': samples} + + +def process_mixed_data(base_path, mode): + """Load and process data for the 'mixed' split and specific mode.""" + mixed_path = os.path.join(base_path, 'mixed') + file_path = find_file(mixed_path, mode) + if not file_path: + print(f'[WARNING] Missing file for mixed mode: {mode}') + return [] + + data = load_json_or_jsonl(file_path) + template_path = os.path.join(base_path, 'config/prompt/mixed.yaml') + template = load_yaml(template_path) + + processed = [] + for item in data: + rules = '\n'.join(item.get('rule_list', [])) + questions = '\n'.join(item.get('question_list', [])) + item['prompt'] = template['prompt_format'][0].format(rules, questions) + processed.append(item) + + return processed + + +class ConfigWrapper: + + def __init__(self, config_path): + self._config = {} + with open(config_path, 'r') as file: + self._config = yaml.safe_load(file) + for key, value in self._config.items(): + setattr(self, key, value) + + def __setattr__(self, key, value): + if key.startswith('_'): + super().__setattr__(key, value) + else: + self._config[key] = value + super().__setattr__(key, value) + + def __getattr__(self, key): + if key in self._config: + return self._config[key] + raise AttributeError( + f"'ConfigWrapper' object has no attribute '{key}'") + + def get_id(self, data): + if isinstance(self._config.get('id_key'), str): + return data.get(self._config.get('id_key'), None) + elif isinstance(self._config.get('id_key'), list): + return '_'.join([ + str(data[key]) for key in self._config.get('id_key') + if key in data + ]) + + def print_all_keys(self): + print('config keys:') + for key, value in self._config.items(): + print(f' - {key}: {value}') + + +config_wrapper = None + + +def initialize_config(config_path): + global config_wrapper + config_wrapper = ConfigWrapper(config_path) + + +def get_config_wrapper(): + global config_wrapper + if config_wrapper is None: + raise RuntimeError( + 'ConfigWrapper not initialized. Call initialize_config first.') + return config_wrapper + + +if __name__ == '__main__': + config_path = 'config/config.yaml' + initialize_config(config_path) + data = { + 'idx': + '50', + 'step': + 21, + 'question': + ('Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"\n\n' + 'Please provide the decrypted answer, encapsulated in double ' + 'square brackets. ' + 'For example, the format should be: [[decrypted answer]].'), + 'answer': + '[[P]]', + 'category': + 'Decryption', + 'rule_id': + '23', + 'input': + 'Ciphertext: "17,156,4,54,213,17,23,84,228,54,281"', + 'steps_num': + 23, + 'description': + ('For a number c=228 in the ciphertext:\n' + 'Calculate z = c^e mod n. 
Here ^ means multiplication.\n' + 'z is 80.\nBased on the decimal number represented by z, ' + 'use the ascii code to find the corresponding letter ' + 'as the plaintext letter p.\n' + 'Please give the letter p in [[...]] format.\n'), + 'atom': + 80 + } + print(config_wrapper.get_id(data)) + + +def read_yaml(config='default'): + if os.path.exists(f'config/prompt/{config}.yaml'): + yaml_file = f'config/prompt/{config}.yaml' + else: + yaml_file = config + with open(yaml_file, 'r') as yaml_file: + return yaml.safe_load(yaml_file) + + +def write_jsonl_lines(file, data): + config_wrapper = get_config_wrapper() + if config_wrapper.save_prompt: + json.dump(data, file, ensure_ascii=False) + else: + data.pop(config_wrapper.prompt_key) + json.dump(data, file, ensure_ascii=False) + file.write('\n') + file.flush() + + +def print_info(info): + print('-' * 100) + print('[INFO] model_name:', info['model_name']) + print('[INFO] splits:', info['splits']) + print('[INFO] modes:', info['modes']) + print('[INFO] output_dir:', info['output_dir']) + print('[INFO] Infer Limit:', + 'No limit' if info['infer_limit'] is None else info['infer_limit']) + print('[INFO] Number of Workers:', info['num_workers']) + print('[INFO] Batch Size:', info['batch_size']) + print('[INFO] Use Accel:', info['use_accel']) + print('-' * 100) + + +def read_json_or_jsonl(data_path, split='', mapping_key=None): + base_path = os.path.join(data_path, split) + if os.path.exists(f'{base_path}.json'): + file_path = f'{base_path}.json' + elif os.path.exists(f'{base_path}.jsonl'): + file_path = f'{base_path}.jsonl' + elif base_path.endswith('.json') or base_path.endswith('.jsonl'): + file_path = base_path + else: + raise FileNotFoundError('No JSON or JSONL file found.') + + with open(file_path, 'r') as file: + if file_path.endswith('.json'): + data = json.load(file) + elif file_path.endswith('.jsonl'): + data = [json.loads(line) for line in file] + + if mapping_key: + return { + item[mapping_key]: item + for item in data if mapping_key in item + } + else: + return data + + +def read_json_or_jsonl_with_idx(data_path, split='', idx=None): + base_path = os.path.join(data_path, split) + if os.path.exists(f'{base_path}.json'): + file_path = f'{base_path}.json' + elif os.path.exists(f'{base_path}.jsonl'): + file_path = f'{base_path}.jsonl' + elif base_path.endswith('.json') or base_path.endswith('.jsonl'): + file_path = base_path + else: + raise FileNotFoundError('No JSON or JSONL file found.') + + with open(file_path, 'r', encoding='utf-8') as file: + if file_path.endswith('.json'): + data = json.load(file) + elif file_path.endswith('.jsonl'): + data = [json.loads(line) for line in file] + + if idx is not None: + try: + return next(item for item in data if item.get('idx') == idx) + except StopIteration: + raise ValueError(f'No entry found for idx {idx}') + else: + return data + + +idx_ranges = [ + [18], + [73, 74, 77], + [94], + [115, 116, 117], + [121, 122, 123, 125], + [131, 132, 134, 135, 136], + [141, 143, 149], + list(range(145, 148)), + list(range(151, 157)), + [160, 161, 162], + [164, 165, 166], + [170], + [206, 209], + list(range(211, 216)), + [217, 218], +] + + +def clean_json_string(json_str): + json_str = re.sub(r'[\x00-\x1F\x7F]', '', json_str) + return json_str + + +def is_in_idx_ranges(idx, idx_ranges): + for range_list in idx_ranges: + if int(idx) in range_list: + return True + return False + + +def extract_json(text): + matches = re.findall(r'{.*}', text, re.DOTALL) + if matches: + json_str = matches[-1] + json_str = 
clean_json_string(json_str) + try: + data = json.loads(json_str) + return data + except json.JSONDecodeError as e: + print(f'Error decoding JSON: {e}') + return 'NULL' + return 'NULL' + + +def extract_all_responses_from_json(response_json): + results = [] + for key, value in response_json.items(): + results.append(str(value)) + return results + + +def clean_latex(latex_expr): + if '=' in latex_expr: + latex_expr = latex_expr.rsplit('=', 1)[1] + latex_expr = re.sub(r'\\[()\[\]]', '', latex_expr) + latex_expr = re.sub(r'\\text\{.*?\}', '', latex_expr) + latex_expr = re.sub(r'\\(left|right|displaystyle)', '', latex_expr) + latex_expr = latex_expr.replace('\\\\', '\\') + return latex_expr + + +def extract_text_from_brackets(text, clean_level='basic'): + matches = re.findall(r'\[\[\s*(.*?)\s*\]\]', text, re.DOTALL) + if not matches: + matches = re.findall(r'\$\\boxed\{(.*?)\}\$', text, re.DOTALL) + if not matches: + matches = re.findall(r'\[\s*(.*?)\s*\]', text, re.DOTALL) + if matches: + match_str = matches[0].strip() + if clean_level == 'clean': + match_str = match_str.replace('"', '').replace('\n', '').replace( + ' ', '').replace('[', '').replace(']', '') + elif clean_level == 'logic': + match_str = match_str.replace('"', '').replace('\n', '').replace( + ' ', '').replace('.', '') + elif clean_level == 'math': + match_str = match_str.replace('"', '').replace('\n', '').replace( + '[', '').replace(']', '').replace('$', '') + return f'{clean_latex(match_str)}' + return f'[[{match_str}]]' + return 'NULL' + + +def extract_inner_text_from_brackets(text): + if not isinstance(text, str): + print(f'text type: {type(text)}, text value: {text}') + return 'NULL' + match = re.search(r'\[\[(.*?)\]\]', text, re.DOTALL) + return match.group(1) if match else 'NULL' + + +def extract_numbers(str): + numbers = re.findall(r'\d+', str) + numbers = list(map(int, numbers)) + return numbers + + +def extract_and_sort_inequalities(latex_expr): + pattern = r'(≥|≤)\s*([-]?\d+\.?\d*)' + matches = re.findall(pattern, latex_expr) + extracted_inequalities = [''.join(match) for match in matches] + sorted_inequalities = sorted(extracted_inequalities) + return sorted_inequalities + + +def rule5_normalize_content(content): + parts = [part for part in content.split(';')] + sorted_parts = sorted(parts) + return sorted_parts + + +def normalize_string(s): + s = re.sub(r'[^0-9]', '', s) + pairs = s.split(',') + pairs.sort() + return pairs + + +def remove_commas_and_spaces(s): + return re.sub(r'[,\s\[\]]+', '', s) + + +def remove_non_alphanumeric(s): + return re.sub(r'\W+', '', s) + + +def contains_or(answer): + return 'or' in answer + + +def compare_multi_results(response, answer): + try: + response_text = extract_text_from_brackets(response, 'clean') + response_text = re.sub(r'\\text\{or\}', 'or', response_text) + if response_text == 'NULL': + return False + answer = extract_text_from_brackets(answer, 'clean') + response_split = response_text.strip('[[]]').split('or') + answer_split = answer.strip('[[]]').split('or') + response_sorted = sorted([x.strip() for x in response_split]) + answer_sorted = sorted([x.strip() for x in answer_split]) + return response_sorted == answer_sorted + except Exception as e: + print(f'Error during comparison: {e}') + return False + + +def split_or_expression(expression): + return [part.strip() for part in expression.split('or')] + + +def compare_math_expressions(response, answer): + response_text = extract_text_from_brackets(response, 'math') + answer_text = extract_text_from_brackets(answer, 'math') + 
if response_text == 'NULL': + return False + if contains_or(answer_text): + response_parts = split_or_expression(response_text) + answer_parts = split_or_expression(answer_text) + try: + response_exprs = { + sp.simplify(parse_latex(part)) + for part in response_parts + } + answer_exprs = { + sp.simplify(parse_latex(part)) + for part in answer_parts + } + return response_exprs == answer_exprs + except Exception as e: + print(f'Error during simplification or parsing: {e}') + return response_text == answer_text + else: + try: + response_expr = sp.simplify(parse_latex(response_text)) + answer_expr = sp.simplify(parse_latex(answer_text)) + return response_expr == answer_expr + except Exception as e: + print(f'Error during simplification or parsing: {e}') + return response_text == answer_text + + +def method_equal(response_text, answer): + return response_text == answer + + +def method_1(response_text, answer): + cleaned_string = re.sub(r'[^A-Za-z]', '', response_text) + cleaned_string = cleaned_string.lower() + answer = re.sub(r'[^A-Za-z]', '', answer) + answer = answer.lower() + return cleaned_string == answer + + +def method_2(response_text, answer): + cleaned_string = re.sub(r'[^A-Za-z]', '', response_text) + cleaned_string = cleaned_string.lower() + answer = answer.split(',') + return cleaned_string in answer + + +def method_3(response_text, answer): + response_text = response_text.lower() + pairs1 = re.split(r'\W+', response_text) + pairs2 = answer.split(' ') + pairs1 = [word for word in pairs1 if word] + pairs1.sort() + pairs2.sort() + return pairs1 == pairs2 + + +def method_4(response_text, answer): + cleaned_string = re.sub(r'[^A-Za-z]', '', response_text) + cleaned_string = cleaned_string.lower() + return cleaned_string in answer + + +def method_5(response_text, answer): + response_text = re.sub(r'\s+', '', response_text) + response_text = response_text.split(',') + answer = answer.split(',') + response_text.sort() + answer.sort() + return response_text == answer + + +def method_9(response_text, answer): + response_text = response_text.replace('×', '*').replace('−', '-') + answer = answer.replace('×', '*').replace('−', '-') + + def extract_operators(s): + return re.findall(r'[+\-*/]', s) + + response_ops = extract_operators(response_text.split('=')[0]) + answer_ops = extract_operators(answer.split('=')[0]) + if response_ops != answer_ops: + return False + match = re.search(r'=\s*(-?\d+)', answer) + expected_result = int(match.group(1)) + try: + left_side = response_text.split('=')[0] + result = eval(left_side) + except Exception as e: + print(f'Error during evaluation: {e}') + return False + return result == expected_result + + +def method_10(response_text, answer): + response_text = response_text.replace('×', '*').replace('−', '-') + response_text = response_text.split('=')[0] + answer = answer.split('\n')[0].split('=')[0] + response_ops = sorted(remove_non_alphanumeric(response_text)) + answer_ops = sorted(remove_non_alphanumeric(answer)) + if response_ops != answer_ops: + return False + try: + result = eval(response_text) + except Exception as e: + print(f'Error during evaluation: {e}') + return False + return result == 24 + + +def method_18(response_text, answer): + cleaned_s1 = remove_commas_and_spaces(response_text) + cleaned_s2 = remove_commas_and_spaces(answer) + return cleaned_s1 == cleaned_s2 + + +def method_general(response_text, answer): + cleaned_s1 = remove_non_alphanumeric(response_text) + cleaned_s2 = remove_non_alphanumeric(answer) + return cleaned_s1 == cleaned_s2 + + 
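+# Dispatch table used by evaluate_response_vs_answer() for 'puzzle' questions:
+# each key is a rule_id with a dedicated comparator defined above (e.g. '18'
+# compares answers after stripping separators); rule_ids without an entry
+# fall back to method_general().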
+question_methods = { + '1': method_1, + '2': method_2, + '3': method_3, + '4': method_4, + '5': method_5, + '9': method_9, + '10': method_10, + '18': method_18, +} + + +def evaluate_response_vs_answer(response, answer, question_type, rule_id, idx): + if question_type == 'logic' and rule_id == '5': + response_text = extract_text_from_brackets(response, 'logic') + answer_text = extract_text_from_brackets(answer, 'logic') + if response_text is None: + return False + normalized_response = rule5_normalize_content(response_text) + normalized_answer = rule5_normalize_content(answer) + return normalized_response == normalized_answer + elif question_type == 'logic': + response_text = extract_text_from_brackets(response, 'logic') + answer_text = extract_text_from_brackets(answer, 'logic') + return response_text == answer_text + elif question_type == 'operation' and (idx == '178' or idx == '179'): + response_text = extract_text_from_brackets(response, 'clean') + response_text = extract_and_sort_inequalities(response_text) + answer_text = extract_and_sort_inequalities(answer) + # print(response_text, answer_text) + return response_text == answer_text + elif question_type == 'operation' and rule_id == '18': + response_text = extract_text_from_brackets(response, 'clean') + answer = extract_inner_text_from_brackets(answer) + response_text = ''.join(sorted(re.sub(r'\W+', '', response_text))) + answer = ''.join(sorted(re.sub(r'\W+', '', answer))) + return response_text == answer + elif question_type == 'operation' and rule_id in {'23', '24', '25'}: + response_text = extract_text_from_brackets(response, 'clean') + if response_text is None: + return False + response_text = extract_numbers(response_text) + answer_text = extract_numbers(answer) + return response_text == answer_text + elif question_type == 'operation' and is_in_idx_ranges(idx, idx_ranges): + return compare_math_expressions(response, answer) + elif question_type == 'operation' and contains_or(answer): + return compare_multi_results(response, answer) + elif question_type == 'puzzle': + response_text = extract_inner_text_from_brackets(response) + answer = extract_inner_text_from_brackets(answer) + method = question_methods.get(rule_id) + if method: + return method(response_text, answer) + return method_general(response_text, answer) + else: + response_text = extract_text_from_brackets(response, 'clean') + return response_text == answer + + +def compute_one_mixed_question_pass_rate(idx, + question_list, + response_json, + base_path=None): + if response_json == 'NULL': + result_dict = { + 'idx': idx, + 'response': response_json, + 'details': None, + 'pass_rate': 0, + 'is_correct': False + } + return result_dict + response_list = extract_all_responses_from_json(response_json) + correct_num = 0 + results = [] + for q_idx, question in enumerate(question_list): + category, question_idx = question.rsplit('_', 1) + question_content = load_json_or_jsonl_with_idx(base_path, + os.path.join( + category, 'sample'), + idx=question_idx) + answer = question_content['answer'] + if q_idx >= len(response_list): + break + response = response_list[q_idx] + response_text = extract_text_from_brackets(response) + rule_id = question_content['rule_id'] + is_correct = evaluate_response_vs_answer(response, answer, category, + rule_id, q_idx) + if is_correct: + correct_num += 1 + results.append({ + 'question': question, + 'response_text': response_text, + 'answer': answer, + 'is_correct': is_correct + }) + + pass_rate = correct_num / len(question_list) + question_correct = 
pass_rate == 1.0 + result_dict = { + 'idx': idx, + 'response': response_json, + 'details': results, + 'pass_rate': pass_rate, + 'is_correct': question_correct + } + return result_dict + + +def evaluate_responses(data, mode, base_path=None): + results = [] + + # Iterate over the values of the dictionary (numerical keys) + for key, record in data.items(): + idx = key # Use the dictionary key as the "idx" + response = record.get('prediction', '') + question_type = record.get('category', '') + response_text = extract_text_from_brackets(response) + answer = record.get('gold', '') + rule_id = record.get('rule_id', '') + is_correct = evaluate_response_vs_answer(response, answer, + question_type, rule_id, idx) + result_dict = { + 'idx': idx, + 'response': response, + 'response_text': response_text, + 'answer': answer, + 'is_correct': is_correct + } + if question_type == 'counterfactual': + real_life_answer = record.get('real_life_answer', '') + is_real_life = evaluate_response_vs_answer(response, + real_life_answer, + question_type, rule_id, + idx) + result_dict['real_life_answer'] = real_life_answer + result_dict['is_real_life'] = is_real_life + if question_type == 'cipher' and mode == 'subquestions': + result_dict['type'] = record.get('type', '') + results.append(result_dict) + return results diff --git a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py index 0956f498..42da0799 100644 --- a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py @@ -1,4 +1,5 @@ """Base Evaluator.""" + from collections import OrderedDict from copy import deepcopy from typing import Any, Dict, Iterable, List, Union @@ -77,12 +78,17 @@ class BaseEvaluator: for metric in all_metrics: if metric in ['predictions', 'example_abbr']: continue - g_passk_details[metric] = 100. 
* np.mean( + g_passk_details[metric] = 100.0 * np.mean( [detail[metric] for detail in details]) return g_passk_details - def evaluate(self, k: Union[int, List[int]], n: int, - original_dataset: Dataset, **score_kwargs): + def evaluate( + self, + k: Union[int, List[int]], + n: int, + original_dataset: Dataset, + **score_kwargs, + ): real_size = len(original_dataset) // n all_details = [] all_results = [] @@ -146,7 +152,7 @@ class BaseEvaluator: if can_calculate and n > 1 and k > 1: thresholds = [0.0, 0.25, 0.5, 0.75, 1.0] - for _k in ([k] if isinstance(k, int) else k): + for _k in [k] if isinstance(k, int) else k: for threshold in thresholds: g_pass = compute_g_pass_at_k(n=n, c=c, @@ -161,9 +167,31 @@ class BaseEvaluator: if can_calculate and n > 1 and k > 1: eval_results.update(self.reduce(eval_details)) + + # Store eval_details in eval_results eval_results['details'] = eval_details - return eval_results + # Process details to flatten the predictions + for detail in eval_details: + # Extract all prediction fields and flatten them + flattened_predictions = {} + for pred in detail['predictions']: + for k, v in pred.items(): + if k not in flattened_predictions: + flattened_predictions[k] = [v] + else: + flattened_predictions[k].append(v) + + # Replace the predictions list with the flattened dictionary + for k, v in flattened_predictions.items(): + detail[k] = v + + # Remove the original predictions field + detail.pop('predictions') + return eval_results + + # If there are no details, return an empty dictionary + return {} def score(self): raise NotImplementedError("Method hasn't been implemented yet") From bc2969dba8b97e8caef54c5ae98d02a4af2f17b5 Mon Sep 17 00:00:00 2001 From: Yufeng Zhao <115388472+epsilondylan@users.noreply.github.com> Date: Wed, 12 Mar 2025 10:53:31 +0800 Subject: [PATCH 26/58] [Feature] Add support for BBEH dataset (#1925) * bbeh * bbeh * fix_smallbugs_bbeh * removeprint * results --------- Co-authored-by: yufeng zhao --- dataset-index.yml | 5 + opencompass/configs/datasets/bbeh/README.md | 26 +++ opencompass/configs/datasets/bbeh/bbeh_gen.py | 93 +++++++++++ .../configs/summarizers/groups/bbeh.py | 12 ++ opencompass/datasets/__init__.py | 1 + opencompass/datasets/bbeh.py | 149 ++++++++++++++++++ opencompass/utils/datasets_info.py | 10 ++ 7 files changed, 296 insertions(+) create mode 100644 opencompass/configs/datasets/bbeh/README.md create mode 100644 opencompass/configs/datasets/bbeh/bbeh_gen.py create mode 100644 opencompass/configs/summarizers/groups/bbeh.py create mode 100644 opencompass/datasets/bbeh.py diff --git a/dataset-index.yml b/dataset-index.yml index f72e7362..e998f65f 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -234,6 +234,11 @@ category: Reasoning paper: https://arxiv.org/pdf/2210.09261 configpath: opencompass/configs/datasets/bbh +- bbeh: + name: BIG-Bench Extra Hard + category: Reasoning + paper: https://arxiv.org/abs/2502.19187 + configpath: opencompass/configs/datasets/bbeh - BoolQ: name: SuperGLUE / BoolQ category: Knowledge diff --git a/opencompass/configs/datasets/bbeh/README.md b/opencompass/configs/datasets/bbeh/README.md new file mode 100644 index 00000000..1fd034ff --- /dev/null +++ b/opencompass/configs/datasets/bbeh/README.md @@ -0,0 +1,26 @@ +# BB#H + +```bash +python3 run.py --models hf_internlm2_7b --datasets bbeh_gen --debug +python3 run.py --models hf_meta_llama3_8b_instruct --datasets bbeh_gen --debug +``` + +## Models + +| model | score | +|:-----------------------------------------:|------:| +| 
Meta-Llama-3-8B-Instruct-LMDeploy-API | 10.93 | + +### Details + +| model | boolean_expressions | disambiguation_qa | geometric_shapes | hyperbaton | movie_recommendation | nycc | shuffled_objects | boardgame_qa | +|:-----------------------------------------:|--------------------:|------------------:|-----------------:|-----------:|---------------------:|-----:|-----------------:|-------------:| +| Meta-Llama-3-8B-Instruct-LMDeploy-API | 14.00 | 33.33 | 13.50 | 1.00 | 28.00 | 11.00 | 10.00 | 18.50 | + +| model | buggy_tables | causal_understanding | dyck_languages | linguini | multistep_arithmetic | object_counting | object_properties | sarc_triples | +|:-----------------------------------------:|-------------:|---------------------:|---------------:|---------:|---------------------:|----------------:|------------------:|-------------:| +| Meta-Llama-3-8B-Instruct-LMDeploy-API | 0.00 | 42.50 | 3.50 | 2.00 | 0.00 | 0.00 | 1.00 | 17.00 | + +| model | spatial_reasoning | sportqa | temporal_sequence | time_arithmetic | web_of_lies | word_sorting | zebra_puzzles | +|:-----------------------------------------:|------------------:|-------:|-----------------:|----------------:|------------:|-------------:|--------------:| +| Meta-Llama-3-8B-Instruct-LMDeploy-API | 4.00 | 5.00 | 2.00 | 3.00 | 7.50 | 2.00 | 3.50 | diff --git a/opencompass/configs/datasets/bbeh/bbeh_gen.py b/opencompass/configs/datasets/bbeh/bbeh_gen.py new file mode 100644 index 00000000..522ade24 --- /dev/null +++ b/opencompass/configs/datasets/bbeh/bbeh_gen.py @@ -0,0 +1,93 @@ +import os +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import BBEHDataset, BBEHEvaluator, bbeh_mcq_postprocess, BBEHEvaluator_mcq + +bbeh_reader_cfg = dict(input_columns=['input'], output_column='target') + + +bbeh_multiple_choice_sets = [ + 'bbeh_boolean_expressions', + 'bbeh_disambiguation_qa', + 'bbeh_geometric_shapes', + 'bbeh_hyperbaton', + 'bbeh_movie_recommendation', + 'bbeh_nycc', + 'bbeh_shuffled_objects', +] + +bbeh_free_form_sets = [ + 'bbeh_boardgame_qa', + 'bbeh_buggy_tables', + 'bbeh_causal_understanding', + 'bbeh_dyck_languages', + 'bbeh_linguini', + 'bbeh_multistep_arithmetic', + 'bbeh_object_counting', + 'bbeh_object_properties', + 'bbeh_sarc_triples', + 'bbeh_spatial_reasoning', + 'bbeh_sportqa', + 'bbeh_temporal_sequence', + 'bbeh_time_arithmetic', + 'bbeh_web_of_lies', + 'bbeh_word_sorting', + 'bbeh_zebra_puzzles', +] + +bbeh_datasets = [] +for _name in bbeh_multiple_choice_sets: + bbeh_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. 
For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: " + ) + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=8192)) + bbeh_eval_cfg = dict( + evaluator=dict(type=BBEHEvaluator_mcq), + pred_role='BOT', + pred_postprocessor=dict(type=bbeh_mcq_postprocess), + dataset_postprocessor=dict(type=bbeh_mcq_postprocess)) + + bbeh_datasets.append( + dict( + type=BBEHDataset, + path='opencompass/bbeh', + name=_name, + abbr=_name, + reader_cfg=bbeh_reader_cfg, + infer_cfg=bbeh_infer_cfg.copy(), + eval_cfg=bbeh_eval_cfg.copy())) + +for _name in bbeh_free_form_sets: + bbeh_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: " + ) + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=8192)) + bbeh_eval_cfg = dict(evaluator=dict(type=BBEHEvaluator), pred_role='BOT', pred_postprocessor=dict(type=bbeh_mcq_postprocess), dataset_postprocessor=dict(type=bbeh_mcq_postprocess)) + + bbeh_datasets.append( + dict( + type=BBEHDataset, + path='opencompass/bbeh', + name=_name, + abbr=_name, + reader_cfg=bbeh_reader_cfg, + infer_cfg=bbeh_infer_cfg.copy(), + eval_cfg=bbeh_eval_cfg.copy())) \ No newline at end of file diff --git a/opencompass/configs/summarizers/groups/bbeh.py b/opencompass/configs/summarizers/groups/bbeh.py new file mode 100644 index 00000000..5e5cc222 --- /dev/null +++ b/opencompass/configs/summarizers/groups/bbeh.py @@ -0,0 +1,12 @@ +bbeh_summary_groups = [] + +# bbeh +_bbeh = [ + 'bbeh_boolean_expressions', 'bbeh_disambiguation_qa', 'bbeh_geometric_shapes', 'bbeh_hyperbaton', + 'bbeh_movie_recommendation', 'bbeh_nycc', 'bbeh_shuffled_objects', 'bbeh_boardgame_qa', + 'bbeh_buggy_tables', 'bbeh_causal_understanding', 'bbeh_dyck_languages', 'bbeh_linguini', + 'bbeh_multistep_arithmetic', 'bbeh_object_counting', 'bbeh_object_properties', 'bbeh_sarc_triples', + 'bbeh_spatial_reasoning', 'bbeh_sportqa', 'bbeh_temporal_sequence', 'bbeh_time_arithmetic', + 'bbeh_web_of_lies', 'bbeh_word_sorting', 'bbeh_zebra_puzzles' +] +bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh}) diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index ffcc217d..6d135f61 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -9,6 +9,7 @@ from .arc import * # noqa: F401, F403 from .arc_prize_public_evaluation import * # noqa: F401, F403 from .ax import * # noqa: F401, F403 from .babilong import * # noqa: F401, F403 +from .bbeh import * # noqa: F401, F403 from .bbh import * # noqa: F401, F403 from .bigcodebench import * # noqa: F401, F403 from .boolq import * # noqa: F401, F403 diff --git a/opencompass/datasets/bbeh.py b/opencompass/datasets/bbeh.py new file mode 100644 index 00000000..0b3a49a7 --- /dev/null +++ b/opencompass/datasets/bbeh.py @@ -0,0 +1,149 @@ +import json +import os.path as osp +import re +from os import environ + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from 
opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET, + TEXT_POSTPROCESSORS) +from opencompass.utils import get_data_path + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class BBEHDataset(BaseDataset): + + @staticmethod + def load(path: str, name: str): + path = get_data_path(path) + if environ.get('DATASET_SOURCE') == 'ModelScope': + from modelscope import MsDataset + dataset = MsDataset.load(path, subset_name=name, split='test') + else: + with open(osp.join(path, f'{name}/task.json'), 'r') as f: + data = json.load(f)['examples'] + dataset = Dataset.from_list(data) + return dataset + + +@TEXT_POSTPROCESSORS.register_module('bbeh_freeform') +def bbeh_freeform_postprocess(text: str) -> str: + # Extract answer using specified prefixes + prefixes = [ + 'The answer is: ', 'The answer is ', 'The final answer is: ', + 'The final answer is ' + ] + answer = text + for prefix in prefixes: + if prefix in text: + answer = text.split(prefix)[-1] + break + + # Remove formatting markup + if '\\boxed' in answer: + answer = re.sub(r'\\boxed{(.*?)}', r'\1', answer) # latex box + if '\\text' in answer: + answer = re.sub(r'\\text(?:tt)?{(.*?)}', r'\1', answer) # text/texttt + if '**' in answer: + answer = re.sub(r'\*\*(.*?)\*\*', r'\1', answer) # bold + + # Take first line and clean + if '\n' in answer: + answer = answer.split('\n')[0].strip() + + return answer.strip().lower() + + +@TEXT_POSTPROCESSORS.register_module('bbeh_mcq') +def bbeh_mcq_postprocess(text: str) -> str: + # Extract answer using specified prefixes + prefixes = [ + 'The answer is: ', 'The answer is ', 'The final answer is: ', + 'The final answer is ' + ] + answer = text + for prefix in prefixes: + if prefix in text: + answer = text.split(prefix)[-1] + break + + # Remove parentheses if present + answer = answer.strip('()') + + # Take first line and clean + if '\n' in answer: + answer = answer.split('\n')[0].strip() + + return answer.strip().lower() + + +@ICL_EVALUATORS.register_module() +class BBEHEvaluator(BaseEvaluator): + + def score(self, predictions, references): + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different length' + } + + processed_preds = [bbeh_freeform_postprocess(p) for p in predictions] + # References are already in correct format + processed_refs = [r.lower() for r in references] + + details = [] + correct_count = 0 + + for pred, ref in zip(processed_preds, processed_refs): + correct = False + + # Rule 1: Exact match + if pred == ref: + correct = True + # Rule 2: Match after removing quotes/brackets + elif pred == ref.strip("'\"()[]"): + correct = True + # Rule 4: Comma - separated answers + elif ',' in ref: + norm_pred = re.sub(r'\s*,\s*', ',', pred) + norm_ref = re.sub(r'\s*,\s*', ',', ref) + if norm_pred == norm_ref: + correct = True + + details.append({'pred': pred, 'answer': ref, 'correct': correct}) + correct_count += int(correct) + + score = (correct_count / len(predictions)) * 100 + return {'score': score, 'details': details} + + +@ICL_EVALUATORS.register_module() +class BBEHEvaluator_mcq(BaseEvaluator): + + def score(self, predictions, references): + if len(predictions) != len(references): + return { + 'error': 'predictions and references have different length' + } + + processed_preds = [bbeh_mcq_postprocess(p) for p in predictions] + # References are already in correct format + processed_refs = [r.lower().strip('()') for r in references] + + details = [] + correct_count = 0 + + for pred, ref in zip(processed_preds, processed_refs): 
+ correct = False + + # Rule 1: Exact match + if pred == ref: + correct = True + + details.append({'pred': pred, 'answer': ref, 'correct': correct}) + correct_count += int(correct) + + score = (correct_count / len(predictions)) * 100 + return {'score': score, 'details': details} diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 79be5736..25c877c6 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -33,6 +33,12 @@ DATASETS_MAPPING = { "hf_id": "opencompass/bbh", "local": "./data/BBH/data", }, + # bbeh + "opencompass/bbeh": { + "ms_id": "", + "hf_id": "", + "local": "./data/bbeh/", + }, # C-Eval "opencompass/ceval-exam": { "ms_id": "opencompass/ceval-exam", @@ -691,6 +697,10 @@ DATASETS_URL = { "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/korbench.zip", "md5": "9107597d137e7362eaf7d218ddef7a6d", }, + "/bbeh": { + "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/bbeh.zip", + "md5": "43a3c2d73aee731ac68ac790bc9a358e", + }, "subjective/judgerbench": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/judgerbench.zip", From 709bc4af0e93fbcd56886e10a2acaef216122be4 Mon Sep 17 00:00:00 2001 From: liushz Date: Wed, 12 Mar 2025 18:41:16 +0800 Subject: [PATCH 27/58] [Update] Add AIME2025 oss info (#1936) * Support OlympiadBench Benchmark * Support OlympiadBench Benchmark * Support OlympiadBench Benchmark * update dataset path * Update olmpiadBench * Update olmpiadBench * Update olmpiadBench * Add HLE dataset * Add HLE dataset * Add HLE dataset * Add AIME2025 oss info --------- Co-authored-by: sudanl --- opencompass/utils/datasets_info.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 25c877c6..5f055cc0 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -309,6 +309,11 @@ DATASETS_MAPPING = { "hf_id": "", "local": "./data/aime.jsonl", }, + "opencompass/aime2025": { + "ms_id": "", + "hf_id": "", + "local": "./data/aime2025/aime2025.jsonl", + }, "opencompass/cmo_fib": { "ms_id": "", "hf_id": "", @@ -652,11 +657,16 @@ DATASETS_URL = { "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/test_generation.zip", "md5": "918a6ea2b1eee6f2b1314db3c21cb4c7", }, - "/aime": { + "/aime2024": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime.zip", "md5": "fbe2d0577fc210962a549f8cea1a00c8", }, + "/aime2025": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/aime2025.zip", + "md5": "aa18cd5d2e2de246c5397f5eb1e61004", + }, "/cmo": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/cmo.zip", From 1c60e3a0f6b9e47f635894d648262474970a9192 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Thu, 13 Mar 2025 17:30:04 +0800 Subject: [PATCH 28/58] [Update] Add configurations for llmjudge dataset (#1940) * Add configurations for llmjudge dataset * update --- .../aime2024/aime2024_llmjudge_gen_5e9f4f.py | 90 +++++++++ .../aime2025/aime2025_llmjudge_gen_5e9f4f.py | 90 +++++++++ .../datasets/bbeh/bbeh_llmjudge_gen_86c3a0.py | 126 ++++++++++++ .../cmmlu/cmmlu_llmjudge_gen_e1cd9a.py | 185 ++++++++++++++++++ .../datasets/drop/drop_llmjudge_gen_3857b0.py | 89 +++++++++ .../hellaswag_llmjudge_gen_809ef1.py | 97 +++++++++ .../datasets/mmlu/mmlu_llmjudge_gen_f4336b.py | 111 +++++++++++ .../datasets/musr/musr_llmjudge_gen_b47fd3.py | 131 +++++++++++++ 
.../supergpqa_llmjudge_gen_12b8bc.py | 103 ++++++++++ .../datasets/livecodebench/pass_k_utils.py | 2 +- opencompass/datasets/supergpqa/supergpqa.py | 2 - opencompass/tasks/openicl_eval.py | 48 +++-- 12 files changed, 1050 insertions(+), 24 deletions(-) create mode 100644 opencompass/configs/datasets/aime2024/aime2024_llmjudge_gen_5e9f4f.py create mode 100644 opencompass/configs/datasets/aime2025/aime2025_llmjudge_gen_5e9f4f.py create mode 100644 opencompass/configs/datasets/bbeh/bbeh_llmjudge_gen_86c3a0.py create mode 100644 opencompass/configs/datasets/cmmlu/cmmlu_llmjudge_gen_e1cd9a.py create mode 100644 opencompass/configs/datasets/drop/drop_llmjudge_gen_3857b0.py create mode 100644 opencompass/configs/datasets/hellaswag/hellaswag_llmjudge_gen_809ef1.py create mode 100644 opencompass/configs/datasets/mmlu/mmlu_llmjudge_gen_f4336b.py create mode 100644 opencompass/configs/datasets/musr/musr_llmjudge_gen_b47fd3.py create mode 100644 opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py diff --git a/opencompass/configs/datasets/aime2024/aime2024_llmjudge_gen_5e9f4f.py b/opencompass/configs/datasets/aime2024/aime2024_llmjudge_gen_5e9f4f.py new file mode 100644 index 00000000..c3b4eb07 --- /dev/null +++ b/opencompass/configs/datasets/aime2024/aime2024_llmjudge_gen_5e9f4f.py @@ -0,0 +1,90 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import CustomDataset +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +aime2024_reader_cfg = dict(input_columns=['question'], output_column='answer') + + +aime2024_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{question}\nRemember to put your final answer within \\boxed{}.', + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. 
Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{question}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +aime2024_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=CustomDataset, + path='opencompass/aime2025', + reader_cfg=aime2024_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ) +) + +aime2024_datasets = [ + dict( + abbr='aime2024', + type=CustomDataset, + path='opencompass/aime2025', + reader_cfg=aime2024_reader_cfg, + infer_cfg=aime2024_infer_cfg, + eval_cfg=aime2024_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/aime2025/aime2025_llmjudge_gen_5e9f4f.py b/opencompass/configs/datasets/aime2025/aime2025_llmjudge_gen_5e9f4f.py new file mode 100644 index 00000000..6af993b3 --- /dev/null +++ b/opencompass/configs/datasets/aime2025/aime2025_llmjudge_gen_5e9f4f.py @@ -0,0 +1,90 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import CustomDataset +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +aime2025_reader_cfg = dict(input_columns=['question'], output_column='answer') + + +aime2025_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{question}\nRemember to put your final answer within \\boxed{}.', + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. 
Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{question}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +aime2025_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=CustomDataset, + path='opencompass/aime2025', + reader_cfg=aime2025_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), +) + +aime2025_datasets = [ + dict( + type=CustomDataset, + abbr='aime2025', + path='opencompass/aime2025', + reader_cfg=aime2025_reader_cfg, + infer_cfg=aime2025_infer_cfg, + eval_cfg=aime2025_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/bbeh/bbeh_llmjudge_gen_86c3a0.py b/opencompass/configs/datasets/bbeh/bbeh_llmjudge_gen_86c3a0.py new file mode 100644 index 00000000..6fa3d563 --- /dev/null +++ b/opencompass/configs/datasets/bbeh/bbeh_llmjudge_gen_86c3a0.py @@ -0,0 +1,126 @@ +import os +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import ( + BBEHDataset, + generic_llmjudge_postprocess, +) +from opencompass.evaluator import GenericLLMEvaluator + +bbeh_reader_cfg = dict(input_columns=['input'], output_column='target') + + +bbeh_multiple_choice_sets = [ + 'bbeh_boolean_expressions', + 'bbeh_disambiguation_qa', + 'bbeh_geometric_shapes', + 'bbeh_hyperbaton', + 'bbeh_movie_recommendation', + 'bbeh_nycc', + 'bbeh_shuffled_objects', +] + +bbeh_free_form_sets = [ + 'bbeh_boardgame_qa', + 'bbeh_buggy_tables', + 'bbeh_causal_understanding', + 
'bbeh_dyck_languages', + 'bbeh_linguini', + 'bbeh_multistep_arithmetic', + 'bbeh_object_counting', + 'bbeh_object_properties', + 'bbeh_sarc_triples', + 'bbeh_spatial_reasoning', + 'bbeh_sportqa', + 'bbeh_temporal_sequence', + 'bbeh_time_arithmetic', + 'bbeh_web_of_lies', + 'bbeh_word_sorting', + 'bbeh_zebra_puzzles', +] + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{input}\n\n\n + : \n{target}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +bbeh_datasets = [] +for _name in bbeh_multiple_choice_sets + bbeh_free_form_sets: + bbeh_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt=f"Think step by step, and when you provide the final answer, please use the prefix \"The answer is:\"without any modification, and provide the answer directly, with no formatting, no bolding, and no markup. For instance: \"The answer is: 42\" or \"The answer is: yes\". If the question is multiple choice with a single correct answer, the final answer must only be the letter corresponding to the correct answer. 
For example, \"The answer is: (a)\"\n\nQ: {{input}}\nA: ", + ) + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + bbeh_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=BBEHDataset, + path='opencompass/bbeh', + name=_name, + abbr=_name, + reader_cfg=bbeh_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', + ) + + bbeh_datasets.append( + dict( + type=BBEHDataset, + path='opencompass/bbeh', + name=_name, + abbr=_name, + reader_cfg=bbeh_reader_cfg, + infer_cfg=bbeh_infer_cfg, + eval_cfg=bbeh_eval_cfg, + ) + ) \ No newline at end of file diff --git a/opencompass/configs/datasets/cmmlu/cmmlu_llmjudge_gen_e1cd9a.py b/opencompass/configs/datasets/cmmlu/cmmlu_llmjudge_gen_e1cd9a.py new file mode 100644 index 00000000..a242032b --- /dev/null +++ b/opencompass/configs/datasets/cmmlu/cmmlu_llmjudge_gen_e1cd9a.py @@ -0,0 +1,185 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import CMMLUDataset +from opencompass.utils.text_postprocessors import match_answer_pattern +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +cmmlu_subject_mapping = { + 'agronomy': '农学', + 'anatomy': '解剖学', + 'ancient_chinese': '古汉语', + 'arts': '艺术学', + 'astronomy': '天文学', + 'business_ethics': '商业伦理', + 'chinese_civil_service_exam': '中国公务员考试', + 'chinese_driving_rule': '中国驾驶规则', + 'chinese_food_culture': '中国饮食文化', + 'chinese_foreign_policy': '中国外交政策', + 'chinese_history': '中国历史', + 'chinese_literature': '中国文学', + 'chinese_teacher_qualification': '中国教师资格', + 'clinical_knowledge': '临床知识', + 'college_actuarial_science': '大学精算学', + 'college_education': '大学教育学', + 'college_engineering_hydrology': '大学工程水文学', + 'college_law': '大学法律', + 'college_mathematics': '大学数学', + 'college_medical_statistics': '大学医学统计', + 'college_medicine': '大学医学', + 'computer_science': '计算机科学', + 'computer_security': '计算机安全', + 'conceptual_physics': '概念物理学', + 'construction_project_management': '建设工程管理', + 'economics': '经济学', + 'education': '教育学', + 'electrical_engineering': '电气工程', + 'elementary_chinese': '小学语文', + 'elementary_commonsense': '小学常识', + 'elementary_information_and_technology': '小学信息技术', + 'elementary_mathematics': '初等数学', + 'ethnology': '民族学', + 'food_science': '食品科学', + 'genetics': '遗传学', + 'global_facts': '全球事实', + 'high_school_biology': '高中生物', + 'high_school_chemistry': '高中化学', + 'high_school_geography': '高中地理', + 'high_school_mathematics': '高中数学', + 'high_school_physics': '高中物理学', + 'high_school_politics': '高中政治', + 'human_sexuality': '人类性行为', + 'international_law': '国际法学', + 'journalism': '新闻学', + 'jurisprudence': '法理学', + 'legal_and_moral_basis': '法律与道德基础', + 'logical': '逻辑学', + 'machine_learning': '机器学习', + 'management': '管理学', + 'marketing': '市场营销', + 'marxist_theory': '马克思主义理论', + 'modern_chinese': '现代汉语', + 'nutrition': '营养学', + 'philosophy': '哲学', + 'professional_accounting': '专业会计', + 
'professional_law': '专业法学', + 'professional_medicine': '专业医学', + 'professional_psychology': '专业心理学', + 'public_relations': '公共关系', + 'security_study': '安全研究', + 'sociology': '社会学', + 'sports_science': '体育学', + 'traditional_chinese_medicine': '中医中药', + 'virology': '病毒学', + 'world_history': '世界历史', + 'world_religions': '世界宗教', +} + +QUERY_TEMPLATE = """ +你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一. + +{question} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + : \n {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +cmmlu_all_sets = list(cmmlu_subject_mapping.keys()) + +cmmlu_datasets = [] +for _name in cmmlu_all_sets: + _ch_name = cmmlu_subject_mapping[_name] + prompt_prefix = f'请回答以下关于{_ch_name}的单项选择题, ' + cmmlu_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=prompt_prefix + QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + cmmlu_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=CMMLUDataset, + path='opencompass/cmmlu', + name=_name, + reader_cfg=dict( + input_columns=['question', 'A', 'B', 'C', 'D'], + output_column='answer', + train_split='dev', + test_split='test', + ), + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', + ) + cmmlu_datasets.append( + dict( + type=CMMLUDataset, + path='opencompass/cmmlu', + name=_name, + abbr=f'cmmlu-{_name}', + reader_cfg=dict( + input_columns=['question', 'A', 'B', 'C', 'D'], + output_column='answer', + train_split='dev', + test_split='test', + ), + infer_cfg=cmmlu_infer_cfg, + eval_cfg=cmmlu_eval_cfg, + mode='singlescore', + ) + ) + +del _name, _ch_name diff --git a/opencompass/configs/datasets/drop/drop_llmjudge_gen_3857b0.py b/opencompass/configs/datasets/drop/drop_llmjudge_gen_3857b0.py new file mode 100644 index 00000000..57fe6bfa --- /dev/null +++ b/opencompass/configs/datasets/drop/drop_llmjudge_gen_3857b0.py @@ -0,0 +1,89 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import DropOpenAIDataset +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +with read_base(): + from .drop_examples import drop_examples # noqa: F401, F403 + +drop_reader_cfg = dict( + input_columns=['prompt'], + output_column='answers', + train_split='validation', + test_split='validation', +) + +template = f'You will be asked to read a passage and answer a question. Some examples of passages and Q&A are provided below.\n\n{drop_examples}\n\n# Your Task\n\n---\n{{prompt}}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.' + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. 
Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {prompt}\n \n\n\n + : \n{answers}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +drop_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[dict(role='HUMAN', prompt=template)]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +drop_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=DropOpenAIDataset, + path='data/drop_simple_eval/dev.jsonl', + reader_cfg=drop_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) +drop_datasets = [ + dict( + abbr='drop', + type=DropOpenAIDataset, + path='data/drop_simple_eval/dev.jsonl', + reader_cfg=drop_reader_cfg, + infer_cfg=drop_infer_cfg, + eval_cfg=drop_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/hellaswag/hellaswag_llmjudge_gen_809ef1.py b/opencompass/configs/datasets/hellaswag/hellaswag_llmjudge_gen_809ef1.py new file mode 100644 index 00000000..4772c0a8 --- /dev/null +++ b/opencompass/configs/datasets/hellaswag/hellaswag_llmjudge_gen_809ef1.py @@ -0,0 +1,97 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator +from opencompass.datasets import HellaswagDatasetwithICE +from opencompass.utils.text_postprocessors import first_option_postprocess +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +hellaswag_reader_cfg = dict( + input_columns=['ctx', 'A', 'B', 'C', 'D'], + output_column='label', + train_split='train', + 
test_split='val', +) + +align_prompt = """Continue the following text without adding any additional information or formatting: +{ctx} +A) {A} +B) {B} +C) {C} +D) {D} +What is the right option?'""" + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + : {ctx}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n\n\n + : \n{label}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +hellaswag_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=align_prompt), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +hellaswag_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=HellaswagDatasetwithICE, + path='opencompass/hellaswag_ice', + reader_cfg=hellaswag_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), +) + +hellaswag_datasets = [ + dict( + abbr='hellaswag', + type=HellaswagDatasetwithICE, + path='opencompass/hellaswag_ice', + reader_cfg=hellaswag_reader_cfg, + infer_cfg=hellaswag_infer_cfg, + eval_cfg=hellaswag_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/mmlu/mmlu_llmjudge_gen_f4336b.py b/opencompass/configs/datasets/mmlu/mmlu_llmjudge_gen_f4336b.py new file mode 100644 index 00000000..645fa9f0 --- /dev/null +++ b/opencompass/configs/datasets/mmlu/mmlu_llmjudge_gen_f4336b.py @@ -0,0 +1,111 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import MMLUDataset +from opencompass.utils.text_postprocessors import match_answer_pattern +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +with read_base(): + from .mmlu_all_sets import mmlu_all_sets +# None of the mmlu dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar + +QUERY_TEMPLATE = """ +Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. + +{input} + +A) {A} +B) {B} +C) {C} +D) {D} +""".strip() + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. 
Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {input}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n\n\n + : \n{target}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +mmlu_reader_cfg = dict( + input_columns=['input', 'A', 'B', 'C', 'D'], + output_column='target', + train_split='dev', +) + +mmlu_datasets = [] +for name in mmlu_all_sets: + mmlu_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt=QUERY_TEMPLATE), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + mmlu_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', + ) + mmlu_datasets.append( + dict( + abbr=f'lukaemon_mmlu_{name}', + type=MMLUDataset, + path='opencompass/mmlu', + name=name, + reader_cfg=mmlu_reader_cfg, + infer_cfg=mmlu_infer_cfg, + eval_cfg=mmlu_eval_cfg, + mode='singlescore', + ) + ) diff --git a/opencompass/configs/datasets/musr/musr_llmjudge_gen_b47fd3.py b/opencompass/configs/datasets/musr/musr_llmjudge_gen_b47fd3.py new file mode 100644 index 00000000..8f72fbd9 --- /dev/null +++ b/opencompass/configs/datasets/musr/musr_llmjudge_gen_b47fd3.py @@ -0,0 +1,131 @@ +from opencompass.datasets import MusrDataset, generic_llmjudge_postprocess +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. 
You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + + : {system_prompt}\n{prompt}\n\n\n + : \n{gold_answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Common configuration components +reader_cfg = dict( + input_columns=[ + 'context', + 'question_text', + 'question', + 'answer', + 'choices', + 'choices_str', + 'intermediate_trees', + 'intermediate_data', + 'prompt', + 'system_prompt', + 'gold_answer', + 'scidx', + 'self_consistency_n', + 'ablation_name', + ], + output_column='gold_answer', +) + +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt='{system_prompt}', + ) + ], + round=[ + dict(role='HUMAN', prompt='{prompt}'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +# Dataset configurations +DATASET_CONFIGS = { + 'murder_mysteries': { + 'abbr': 'musr_murder_mysteries', + 'name': 'murder_mysteries', + 'path': 'opencompass/musr', + }, + 'object_placements': { + 'abbr': 'musr_object_placements', + 'name': 'object_placements', + 'path': 'opencompass/musr', + }, + 'team_allocation': { + 'abbr': 'musr_team_allocation', + 'name': 'team_allocation', + 'path': 'opencompass/musr', + }, +} + +# Create dataset configurations +musr_datasets = [] + +for config in DATASET_CONFIGS.values(): + dataset = dict( + abbr=config['abbr'], + type=MusrDataset, + path=config['path'], + name=config['name'], + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MusrDataset, + path=config['path'], + name=config['name'], + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + ), + ) + musr_datasets.append(dataset) diff --git a/opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py b/opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py new file mode 100644 index 00000000..02e6f2da --- /dev/null +++ b/opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py @@ -0,0 +1,103 @@ +from opencompass.datasets.supergpqa.supergpqa import ( + SuperGPQADataset, +) +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. 
Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : {infer_prompt}\n\n\n + : \n{answer_letter}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'question', + 'options', + 'discipline', + 'field', + 'subfield', + 'difficulty', + 'infer_prompt', + 'prompt_mode', + ], + output_column='answer_letter', +) + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{infer_prompt}', + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=SuperGPQADataset, + path='m-a-p/SuperGPQA', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), +) +supergpqa_dataset = dict( + type=SuperGPQADataset, + abbr='supergpqa', + path='m-a-p/SuperGPQA', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, +) + +supergpqa_datasets = [supergpqa_dataset] diff --git a/opencompass/datasets/livecodebench/pass_k_utils.py b/opencompass/datasets/livecodebench/pass_k_utils.py index ec895bbb..5a5f7980 100644 --- a/opencompass/datasets/livecodebench/pass_k_utils.py +++ b/opencompass/datasets/livecodebench/pass_k_utils.py @@ -53,7 +53,7 @@ def compute_metrics_from_results(results, k_list=[1, 5]): k: dict(zip(task_ids, v)) for k, v in detail_pass_at_k.items() } - pass_at_k['detail'] = detail_metrics + pass_at_k['details'] = detail_metrics return pass_at_k diff --git a/opencompass/datasets/supergpqa/supergpqa.py b/opencompass/datasets/supergpqa/supergpqa.py index 7193722d..9dd96dd4 
100644 --- a/opencompass/datasets/supergpqa/supergpqa.py +++ b/opencompass/datasets/supergpqa/supergpqa.py @@ -7,7 +7,6 @@ from opencompass.datasets.supergpqa.supergpqa_eval import ( from opencompass.datasets.supergpqa.supergpqa_utils import load_yaml from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET -from opencompass.utils import get_data_path from ..base import BaseDataset @@ -29,7 +28,6 @@ class SuperGPQADataset(BaseDataset): @staticmethod def load(path: str, prompt_mode: str, **kwargs): - path = get_data_path(path, local_mode=True) dataset = load_dataset(path, split='train') # get prompt template diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py index 7c769060..252a120a 100644 --- a/opencompass/tasks/openicl_eval.py +++ b/opencompass/tasks/openicl_eval.py @@ -263,28 +263,34 @@ class OpenICLEvalTask(BaseTask): if self.dump_details: details = result.get('details', None) - try: - result['details'] = self.format_details( - pred_strs, - model_pred_strs, - test_set[self.output_column], - details, - model_details, - pred_dicts, - ) - self.logger.warning( - f"result['details'] : {result['details']}"), - result['type'] = result['details'].pop('type', None) - if self.cal_extract_rate: - # Calculate the extraction success rate for prediction - result['extract_rate'] = self.extract_rate(result) + # Try to format details is details is not provided by evaluator + if details is None: + self.logger.info( + 'Details is not give by evaluator, try to format it') + try: + result['details'] = self.format_details( + pred_strs, + model_pred_strs, + test_set[self.output_column], + details, + model_details, + pred_dicts, + ) + self.logger.warning( + f"result['details'] : {result['details']}"), + result['type'] = result['details'].pop('type', None) + if self.cal_extract_rate: + # Calculate the extraction success + # rate for prediction + result['extract_rate'] = self.extract_rate(result) - if 'PPL' in str( - self.dataset_cfg.infer_cfg.inferencer.type): - result['correct_bpb'], result['incorrect_bpb'] = ( - self.calculate_bpb(pred_dicts)) - except Exception as e: - self.logger.warning(f'Skip dumping details due to: {e}.') + if 'PPL' in str( + self.dataset_cfg.infer_cfg.inferencer.type): + result['correct_bpb'], result['incorrect_bpb'] = ( + self.calculate_bpb(pred_dicts)) + except Exception as e: + self.logger.warning( + f'Skip dumping details due to: {e}.') else: result.pop('details', None) From 854c6bf025ed53e332ae58a7ee66807eae48618d Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Thu, 13 Mar 2025 20:52:50 +0800 Subject: [PATCH 29/58] [Update] Update requirement and base evaluator --- opencompass/openicl/icl_evaluator/icl_base_evaluator.py | 4 ++-- requirements/extra.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py index 42da0799..e2aad9be 100644 --- a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py @@ -190,8 +190,8 @@ class BaseEvaluator: detail.pop('predictions') return eval_results - # If there are no details, return an empty dictionary - return {} + # If there are no details, return results + return results def score(self): raise NotImplementedError("Method hasn't been implemented yet") diff --git a/requirements/extra.txt b/requirements/extra.txt index a98b3bc8..fd3f7a2f 100644 --- 
a/requirements/extra.txt +++ b/requirements/extra.txt @@ -12,7 +12,7 @@ faiss_gpu==1.7.2 # IFEval langdetect # TheoremQA -latex2sympy2 +latex2sympy2==1.9.1 # Lawbench, leval ltp # Math From 15c825a51a840341c500e40bf51fac829773e99c Mon Sep 17 00:00:00 2001 From: Yufeng Zhao <115388472+epsilondylan@users.noreply.github.com> Date: Mon, 17 Mar 2025 17:19:56 +0800 Subject: [PATCH 30/58] [Update] Bbeh harmony summarizer added (#1951) * bbeh * bbeh * fix_smallbugs_bbeh * removeprint * harmonic * update_summerizer * harmonic-tested * harmonic-tested * clean * clean * cleaned_rebased --------- Co-authored-by: yufeng zhao --- opencompass/configs/summarizers/bbeh.py | 30 +++++++++++++++++++++++++ opencompass/summarizers/default.py | 13 +++++++++++ 2 files changed, 43 insertions(+) create mode 100644 opencompass/configs/summarizers/bbeh.py diff --git a/opencompass/configs/summarizers/bbeh.py b/opencompass/configs/summarizers/bbeh.py new file mode 100644 index 00000000..ba469f82 --- /dev/null +++ b/opencompass/configs/summarizers/bbeh.py @@ -0,0 +1,30 @@ +from mmengine.config import read_base + +with read_base(): + from .groups.bbeh import bbeh_summary_groups + +# Get all the BBEH subset names from the imported bbeh_summary_groups +bbeh_subsets = [] +for group in bbeh_summary_groups: + if group['name'] == 'bbeh': + bbeh_subsets = group['subsets'] + break + +summarizer = dict( + # Include both individual datasets and the summary metrics we want to see + dataset_abbrs=bbeh_subsets + ['bbeh_naive_average'] + ['bbeh_harmonic_mean'], + + # Define the summary group for bbeh + summary_groups=[ + { + 'name': 'bbeh_naive_average', + 'subsets': bbeh_subsets, + 'metric': 'naive_average' # Explicitly specify the metric to use + }, + { + 'name': 'bbeh_harmonic_mean', + 'subsets': bbeh_subsets, + 'metric': 'harmonic_mean' + } + ] +) \ No newline at end of file diff --git a/opencompass/summarizers/default.py b/opencompass/summarizers/default.py index 8a0da5b2..f1094f14 100644 --- a/opencompass/summarizers/default.py +++ b/opencompass/summarizers/default.py @@ -171,6 +171,8 @@ class DefaultSummarizer: default_metric = 'sum' elif sg.get('weights', []): default_metric = 'weighted_average' + elif sg.get('harmonic_mean', False): + default_metric = 'harmonic_mean' else: default_metric = 'naive_average' @@ -204,6 +206,17 @@ class DefaultSummarizer: avg = sum(scores[metric].values()) / len(scores[metric]) variance = sum((scores[metric][k] - avg) ** 2 for k in scores[metric]) / len(scores[metric]) scores[metric] = result[metric] = math.sqrt(variance) + elif default_metric == 'harmonic_mean': + # Check for non-positive values that would cause issues in harmonic mean + if any(scores[metric][k] <= 0 for k in scores[metric]): + self.logger.warning(f'Non-positive values found when calculating harmonic mean for {sg["name"]}') + # Handle non-positive values (either skip or use a small positive value) + numerator = len(scores[metric]) + denominator = sum(1 / max(scores[metric][k], 1) for k in scores[metric]) + else: + numerator = len(scores[metric]) + denominator = sum(1 / scores[metric][k] for k in scores[metric]) + scores[metric] = result[metric] = numerator / denominator else: if sg.get('weights', []): # check sg['weights'][k] != 0 in case of scores[metric][k] is NaN From 0b7f76e19345280e66490cfb34f023970b38eca6 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 17 Mar 2025 18:25:08 +0800 Subject: [PATCH 31/58] [Bug] Fix Summarizer logic (#1953) --- opencompass/configs/summarizers/bbeh.py | 30 ------------------- 
.../configs/summarizers/groups/bbeh.py | 3 +- opencompass/summarizers/default.py | 22 +++++++------- 3 files changed, 13 insertions(+), 42 deletions(-) delete mode 100644 opencompass/configs/summarizers/bbeh.py diff --git a/opencompass/configs/summarizers/bbeh.py b/opencompass/configs/summarizers/bbeh.py deleted file mode 100644 index ba469f82..00000000 --- a/opencompass/configs/summarizers/bbeh.py +++ /dev/null @@ -1,30 +0,0 @@ -from mmengine.config import read_base - -with read_base(): - from .groups.bbeh import bbeh_summary_groups - -# Get all the BBEH subset names from the imported bbeh_summary_groups -bbeh_subsets = [] -for group in bbeh_summary_groups: - if group['name'] == 'bbeh': - bbeh_subsets = group['subsets'] - break - -summarizer = dict( - # Include both individual datasets and the summary metrics we want to see - dataset_abbrs=bbeh_subsets + ['bbeh_naive_average'] + ['bbeh_harmonic_mean'], - - # Define the summary group for bbeh - summary_groups=[ - { - 'name': 'bbeh_naive_average', - 'subsets': bbeh_subsets, - 'metric': 'naive_average' # Explicitly specify the metric to use - }, - { - 'name': 'bbeh_harmonic_mean', - 'subsets': bbeh_subsets, - 'metric': 'harmonic_mean' - } - ] -) \ No newline at end of file diff --git a/opencompass/configs/summarizers/groups/bbeh.py b/opencompass/configs/summarizers/groups/bbeh.py index 5e5cc222..95697144 100644 --- a/opencompass/configs/summarizers/groups/bbeh.py +++ b/opencompass/configs/summarizers/groups/bbeh.py @@ -9,4 +9,5 @@ _bbeh = [ 'bbeh_spatial_reasoning', 'bbeh_sportqa', 'bbeh_temporal_sequence', 'bbeh_time_arithmetic', 'bbeh_web_of_lies', 'bbeh_word_sorting', 'bbeh_zebra_puzzles' ] -bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh}) +bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh, 'metric':'naive_average'}) +bbeh_summary_groups.append({'name': 'bbeh', 'subsets': _bbeh, 'metric':'harmonic_mean'}) \ No newline at end of file diff --git a/opencompass/summarizers/default.py b/opencompass/summarizers/default.py index f1094f14..88dd793b 100644 --- a/opencompass/summarizers/default.py +++ b/opencompass/summarizers/default.py @@ -188,18 +188,18 @@ class DefaultSummarizer: eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown')) else: group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']])) - if need_smart_metric and len(group_metrics) > 1: - for metric in group_metrics: - for dataset_abbr in sg['subsets']: - scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric] - eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown')) - else: - group_metrics = [default_metric] + group_metrics.append(default_metric) + for metric in group_metrics: for dataset_abbr in sg['subsets']: - metric = dataset_metrics[dataset_abbr][0] - scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric] - eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown')) - + if metric == default_metric: + metric_default = dataset_metrics[dataset_abbr][0] + scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric_default] = \ + parsed_results[model_abbr][dataset_abbr][metric_default] + eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown')) + else: + scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = \ + parsed_results[model_abbr][dataset_abbr][metric] + eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 
'unknown')) result = {} for metric in scores: if default_metric == 'standard_deviation': From 5d2d253d838f9aec844f46163d8c6184dcbaee63 Mon Sep 17 00:00:00 2001 From: Jason Cheung <89123721+Jiajun0425@users.noreply.github.com> Date: Tue, 18 Mar 2025 20:08:15 +0800 Subject: [PATCH 32/58] [BUG] Fix model_kwargs pass logic for vllm (#1958) --- opencompass/utils/run.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index accd3468..2443b829 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -276,13 +276,15 @@ def change_accelerator(models, accelerator): if model.get(item) is not None: acc_model[item] = model[item] elif accelerator == 'vllm': + model_kwargs = dict(tensor_parallel_size=model['run_cfg']['num_gpus'], max_model_len=model.get('max_seq_len', None)) + model_kwargs.update(model.get('model_kwargs')) logger.info(f'Transforming {model["abbr"]} to {accelerator}') acc_model = dict( type=f'{VLLM.__module__}.{VLLM.__name__}', abbr=model['abbr'].replace('hf', 'vllm') if '-hf' in model['abbr'] else model['abbr'] + '-vllm', path=model['path'], - model_kwargs=dict(tensor_parallel_size=model['run_cfg']['num_gpus'], max_model_len=model.get('max_seq_len', None)), + model_kwargs=model_kwargs, max_out_len=model['max_out_len'], max_seq_len=model.get('max_seq_len', None), batch_size=model['batch_size'], @@ -296,12 +298,14 @@ def change_accelerator(models, accelerator): raise ValueError(f'Unsupported accelerator {accelerator} for model type {model["type"]}') elif model['type'] in [HuggingFacewithChatTemplate, f'{HuggingFacewithChatTemplate.__module__}.{HuggingFacewithChatTemplate.__name__}']: if accelerator == 'vllm': + model_kwargs = dict(tensor_parallel_size=model['run_cfg']['num_gpus'], max_model_len=model.get('max_seq_len', None)) + model_kwargs.update(model.get('model_kwargs')) mod = VLLMwithChatTemplate acc_model = dict( type=f'{mod.__module__}.{mod.__name__}', abbr=model['abbr'].replace('hf', 'vllm') if '-hf' in model['abbr'] else model['abbr'] + '-vllm', path=model['path'], - model_kwargs=dict(tensor_parallel_size=model['run_cfg']['num_gpus'], max_model_len=model.get('max_seq_len', None)), + model_kwargs=model_kwargs, max_seq_len=model.get('max_seq_len', None), max_out_len=model['max_out_len'], batch_size=16, From c98599271be058e4c39f2e1a57b07e3f001caddd Mon Sep 17 00:00:00 2001 From: Songyang Zhang Date: Tue, 18 Mar 2025 20:15:20 +0800 Subject: [PATCH 33/58] [Update] Update OlympiadBench and Update LLM Judge (#1954) --- docs/en/advanced_guides/llm_judge.md | 17 ++++++++++ docs/zh_cn/advanced_guides/llm_judge.md | 19 ++++++++++- .../summarizers/groups/OlympiadBench.py | 10 ++++++ .../evaluator/generic_llm_evaluator.py | 34 ++++++++++++++++++- opencompass/utils/run.py | 10 +++++- 5 files changed, 87 insertions(+), 3 deletions(-) diff --git a/docs/en/advanced_guides/llm_judge.md b/docs/en/advanced_guides/llm_judge.md index 91a1a5bf..1d9e9760 100644 --- a/docs/en/advanced_guides/llm_judge.md +++ b/docs/en/advanced_guides/llm_judge.md @@ -34,6 +34,23 @@ problem,answer ## Configuration +### Using LLM for Evaluation via Command Line + +Some datasets in OpenCompass already include LLM judge configurations. +You need to use a model service (such as OpenAI or DeepSeek's official API) or start a model service locally using tools like LMDeploy, vLLM, or SGLang. 
+ +Then, you can set the environment variables for the evaluation service and evaluate models using the following commands: + +```bash +export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct +export OC_JUDGE_API_KEY=sk-1234 +export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1 +``` + +Note that by default, OpenCompass will use these three environment variables, but if you use configuration files to configure the evaluation service, these environment variables will not take effect. + +### ### Using LLM for Evaluation via Configuration Files + To set up an LLM judge evaluation, you'll need to configure three main components: 1. Dataset Reader Configuration diff --git a/docs/zh_cn/advanced_guides/llm_judge.md b/docs/zh_cn/advanced_guides/llm_judge.md index 66d288a8..bc49696e 100644 --- a/docs/zh_cn/advanced_guides/llm_judge.md +++ b/docs/zh_cn/advanced_guides/llm_judge.md @@ -34,7 +34,24 @@ problem,answer ## 配置说明 -要设置LLM评判评估,你需要配置三个主要组件: +### 基于命令行使用LLM进行评估 + +OpenCompass中部分数据集已经包含了LLM评判器的配置。 +你需要使用一个模型服务(如OpenAI或DeepSeek官方提供的API)或本地使用LMDeploy、vLLM、SGLang等工具启动一个模型服务。 + +然后,你可以通过以下命令设置相关评估服务的环境变量,并对模型进行评估: + +```bash +export OC_JUDGE_MODEL=Qwen/Qwen2.5-32B-Instruct +export OC_JUDGE_API_KEY=sk-1234 +export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1 +``` + +注意,默认情况下,OpenCompass会使用这三个环境变量,但如果你使用了基于配置文件的方式配置评估服务,这三个环境变量将不会生效。 + +### 基于配置文件使用LLM进行评估 + +对一个数据集设置LLM评判评估,你需要配置三个主要组件: 1. 数据集读取配置 diff --git a/opencompass/configs/summarizers/groups/OlympiadBench.py b/opencompass/configs/summarizers/groups/OlympiadBench.py index fc57f603..e30831ff 100644 --- a/opencompass/configs/summarizers/groups/OlympiadBench.py +++ b/opencompass/configs/summarizers/groups/OlympiadBench.py @@ -16,7 +16,17 @@ math_categories = [ 'OE_TO_maths_zh_CEE', # OpenEnded - TextOnly - maths - CEE ] +physics_categories = [ + 'OE_TO_physics_en_COMP', # OpenEnded - TextOnly - physics - COMP + 'OE_TO_physics_zh_CEE' # OpenEnded - TextOnly - physics - CEE +] + OlympiadBenchMath_summary_groups = [ {'name': 'OlympiadBenchMath', 'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in math_categories]}, ] + + +OlympiadBenchPhysics_summary_groups = [ + {'name': 'OlympiadBenchPhysics', 'subsets': ['OlympiadBench_' + c.replace(' ', '_') for c in physics_categories]}, +] diff --git a/opencompass/evaluator/generic_llm_evaluator.py b/opencompass/evaluator/generic_llm_evaluator.py index 131c2e75..c0b33a69 100644 --- a/opencompass/evaluator/generic_llm_evaluator.py +++ b/opencompass/evaluator/generic_llm_evaluator.py @@ -1,3 +1,4 @@ +import os import os.path as osp from typing import Dict, List, Optional @@ -36,7 +37,11 @@ class GenericLLMEvaluator(BaseEvaluator): ) -> None: self.logger = get_logger() - self.judge_cfg = judge_cfg + # If judge_cfg is not provided, fall back to the default configuration + if not judge_cfg: + self.judge_cfg = self.default_judge_cfg + else: + self.judge_cfg = judge_cfg self.output_path = '' self.prompt_template = ICL_PROMPT_TEMPLATES.build(prompt_template) @@ -141,3 +146,30 @@ class GenericLLMEvaluator(BaseEvaluator): kwargs = self.dict_postprocessor proc = DICT_POSTPROCESSORS.get(kwargs.pop('type')) return proc(output, self.output_path, **kwargs) + + @property + def default_judge_cfg(self): + from opencompass.models import OpenAISDK + + DEFAULT_JUDGE_CFG = dict( + type=OpenAISDK, + path=os.environ['OC_JUDGE_MODEL'], + key=os.environ['OC_JUDGE_API_KEY'], + openai_api_base=[ + os.environ.get('OC_JUDGE_API_BASE', + 'https://api.openai.com/v1/') + ], + meta_template=dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + 
dict(role='BOT', api_role='BOT', generate=True), + ], ), + query_per_second=16, + batch_size=1024, + temperature=0.001, + tokenizer_path='gpt-4o-2024-05-13', + verbose=True, + max_out_len=16384, + max_seq_len=49152, + ) + + return DEFAULT_JUDGE_CFG diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py index 2443b829..772c0a8a 100644 --- a/opencompass/utils/run.py +++ b/opencompass/utils/run.py @@ -313,6 +313,14 @@ def change_accelerator(models, accelerator): stop_words=model.get('stop_words', []), ) elif accelerator == 'lmdeploy': + + if model.get('generation_kwargs') is not None: + logger.warning(f'LMDeploy uses do_sample=False as default, and you need to set do_sample=True for sampling mode') + gen_config = model['generation_kwargs'].copy() + else: + logger.info('OpenCompass uses greedy decoding as default, you can set generation-kwargs for your purpose') + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9) + mod = TurboMindModelwithChatTemplate acc_model = dict( type=f'{mod.__module__}.{mod.__name__}', @@ -324,7 +332,7 @@ def change_accelerator(models, accelerator): session_len=model.get('max_seq_len', None), max_new_tokens=model['max_out_len'] ), - gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9), + gen_config=gen_config, max_seq_len=model.get('max_seq_len', None), max_out_len=model['max_out_len'], batch_size=16, From b9de8b0e2b47f9561395c6e5fd23bd4ca1e5e4f6 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Tue, 18 Mar 2025 20:24:07 +0800 Subject: [PATCH 34/58] [Update] Unset disallowed_special token for Openai model (#1960) --- opencompass/models/openai_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index d5ac02d8..f46de71c 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -399,7 +399,7 @@ class OpenAI(BaseAPIModel): self.logger.info( f'Successfully load default tiktoken tokenizer: ' f' {default_tokenizer}') - return len(enc.encode(prompt)) + return len(enc.encode(prompt, disallowed_special=())) def _bin_trim(self, prompt: str, num_token: int, mode: str) -> str: """Get a suffix of prompt which is no longer than num_token tokens. 
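The evaluator change in the patch above makes GenericLLMEvaluator fall back to an environment-based judge whenever `judge_cfg` is empty; the following minimal sketch (not part of these patches) shows how that fallback is intended to be exercised. The model name, key, and endpoint are the placeholder values from the updated llm_judge docs, the grading prompt is illustrative, and dataset-specific fields such as `dataset_cfg` and `dict_postprocessor` from the real configs are omitted for brevity.

# Hedged sketch: point the OC_JUDGE_* variables at an OpenAI-compatible judge
# service, then leave judge_cfg empty so GenericLLMEvaluator.default_judge_cfg
# builds an OpenAISDK judge from them. All concrete values are placeholders.
import os

from opencompass.evaluator import GenericLLMEvaluator
from opencompass.openicl.icl_prompt_template import PromptTemplate

os.environ['OC_JUDGE_MODEL'] = 'Qwen/Qwen2.5-32B-Instruct'
os.environ['OC_JUDGE_API_KEY'] = 'sk-1234'
os.environ['OC_JUDGE_API_BASE'] = 'http://172.30.56.1:4000/v1'

# Illustrative grading prompt; real dataset configs define fuller GRADER_TEMPLATEs.
GRADER_TEMPLATE = ('Judge whether the prediction matches the reference answer. '
                   'Reply with "A" if correct or "B" if incorrect.')

# With judge_cfg=dict(), __init__ now picks up default_judge_cfg instead of
# requiring an explicit judge model in every dataset config.
eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[dict(role='HUMAN', prompt=GRADER_TEMPLATE)]),
        ),
        judge_cfg=dict(),
    ),
)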
From 8a5029b1219ebdd3f3c5e4d30a25bc4e3851f058 Mon Sep 17 00:00:00 2001 From: Dongsheng Zhu <59612926+Zhudongsheng75@users.noreply.github.com> Date: Fri, 21 Mar 2025 20:09:25 +0800 Subject: [PATCH 35/58] [Feature] Add MultiPL-E & Code Evaluator (#1963) * multiple_code develop * multiple_code update * comments upadate * index upadate --- dataset-index.yml | 5 + .../multipl_e/multiple_top_ten_gen.py | 56 ++++ opencompass/configs/models/phi/hf_phi_4.py | 12 + opencompass/datasets/__init__.py | 1 + opencompass/datasets/custom.py | 27 ++ opencompass/datasets/multipl_e.py | 103 +++++++ .../openicl/icl_evaluator/code_evaluator.py | 267 ++++++++++++++++++ opencompass/utils/datasets_info.py | 11 + 8 files changed, 482 insertions(+) create mode 100644 opencompass/configs/datasets/multipl_e/multiple_top_ten_gen.py create mode 100644 opencompass/configs/models/phi/hf_phi_4.py create mode 100644 opencompass/datasets/multipl_e.py create mode 100644 opencompass/openicl/icl_evaluator/code_evaluator.py diff --git a/dataset-index.yml b/dataset-index.yml index e998f65f..dc50f396 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -529,6 +529,11 @@ category: Understanding paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf configpath: opencompass/configs/datasets/SuperGLUE_MultiRC +- multipl_e: + name: MultiPL-E + category: Code + paper: https://arxiv.org/pdf/2210.14868 + configpath: opencompass/configs/datasets/multipl_e - narrativeqa: name: NarrativeQA category: Understanding diff --git a/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen.py b/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen.py new file mode 100644 index 00000000..93ab2962 --- /dev/null +++ b/opencompass/configs/datasets/multipl_e/multiple_top_ten_gen.py @@ -0,0 +1,56 @@ +# Select the 10 most popular programming languages from MultiPL-E to compose the test set. + +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MultiplEDataset, MultiplEEvaluator + + +_TOP_TEN_LANGUAGE_ = ['cpp', 'cs', 'go', 'java', 'rb', 'js', 'php', 'r', 'rs', 'sh'] + +multiple_reader_cfg = dict(input_columns=['language', 'prompt'], output_column='tests') + +multiple_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template='Based on the provided {language} code snippet, complete the subsequent content. 
The initial part of the completed code must match the provided code snippet exactly:\n{prompt}'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +multiple_eval_cfg = { + lang: dict( + evaluator=dict( + type=MultiplEEvaluator, + language=lang, + ip_address='https://opencompass-multiple-evaluator.hf.space', + ), + pred_role='BOT', + ) for lang in _TOP_TEN_LANGUAGE_ +} + +multiple_datasets = [ + dict( + type=MultiplEDataset, + abbr=f'humaneval-multiple-{lang}', + language=lang, + num_repeats=1, + path='opencompass/multipl_e', + tag='humaneval', + reader_cfg=multiple_reader_cfg, + infer_cfg=multiple_infer_cfg, + eval_cfg=multiple_eval_cfg[lang], + ) for lang in _TOP_TEN_LANGUAGE_ +] + +multiple_datasets += [ + dict( + type=MultiplEDataset, + abbr=f'mbpp-multiple-{lang}', + language=lang, + num_repeats=1, + path='opencompass/multipl_e', + tag='mbpp', + reader_cfg=multiple_reader_cfg, + infer_cfg=multiple_infer_cfg, + eval_cfg=multiple_eval_cfg[lang], + ) for lang in _TOP_TEN_LANGUAGE_ +] diff --git a/opencompass/configs/models/phi/hf_phi_4.py b/opencompass/configs/models/phi/hf_phi_4.py new file mode 100644 index 00000000..1f4f6754 --- /dev/null +++ b/opencompass/configs/models/phi/hf_phi_4.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='phi-4', + path='microsoft/phi-4', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=2), + ) +] diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 6d135f61..49cd1522 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -98,6 +98,7 @@ from .mmlu_cf import * # noqa: F401, F403 from .mmlu_pro import * # noqa: F401, F403 from .MMLUArabic import * # noqa: F401, F403 from .mmmlu import * # noqa: F401, F403 +from .multipl_e import * # noqa: F401, F403 from .multirc import * # noqa: F401, F403 from .musr import * # noqa: F401, F403 from .narrativeqa import * # noqa: F401, F403 diff --git a/opencompass/datasets/custom.py b/opencompass/datasets/custom.py index 110cb72b..b5eb8dbb 100644 --- a/opencompass/datasets/custom.py +++ b/opencompass/datasets/custom.py @@ -183,6 +183,33 @@ class CustomDataset(BaseDataset): return Dataset.from_list(data) +@LOAD_DATASET.register_module() +class CodeCustomDataset(BaseDataset): + + @staticmethod + def load(path, file_name=None, local_mode=False, num_repeats=1, **kwargs): + path = get_data_path(path, local_mode=local_mode) + if file_name is not None: + path = os.path.join(path, file_name) + data = [] + if path.endswith('.jsonl'): + with open(path, 'r', encoding='utf-8') as f: + for line in f: + data.extend( + [json.loads(line.strip()) for _ in range(num_repeats)]) + elif path.endswith('.csv'): + with open(path, 'r', encoding='utf-8-sig') as f: + reader = csv.reader(f) + header = next(reader) + for row in reader: + data.extend( + [dict(zip(header, row)) for _ in range(num_repeats)]) + else: + raise ValueError(f'Unsupported file format: {path}') + + return Dataset.from_list(data) + + class CircularCustomDataset(CustomDataset, metaclass=CircularDatasetMeta): dataset_class = CustomDataset diff --git a/opencompass/datasets/multipl_e.py b/opencompass/datasets/multipl_e.py new file mode 100644 index 00000000..657b52de --- /dev/null +++ b/opencompass/datasets/multipl_e.py @@ -0,0 +1,103 @@ +import json +import os.path as osp + +from datasets import Dataset + +from opencompass.openicl.icl_evaluator.code_evaluator import CodeEvaluator 
+from opencompass.registry import LOAD_DATASET +from opencompass.utils import get_data_path + +from .base import BaseDataset + +# currently supporting languages +_HUMANEVAL_LANGUAGE_ = [ + 'adb', 'clj', 'cpp', 'cs', 'd', 'dart', 'elixir', 'go', 'hs', 'java', 'jl', + 'js', 'lua', 'ml', 'php', 'pl', 'py', 'r', 'rb', 'rkt', 'rs', 'scala', + 'sh', 'swift', 'ts' +] +_MBPP_LANGUAGE_ = [ + 'adb', 'clj', 'cpp', 'cs', 'd', 'elixir', 'go', 'hs', 'java', 'jl', 'js', + 'lua', 'ml', 'php', 'pl', 'py', 'r', 'rb', 'rkt', 'rs', 'scala', 'sh', + 'swift', 'ts' +] + + +@LOAD_DATASET.register_module() +class MultiplEDataset(BaseDataset): + + @staticmethod + def load(path: str, + language: str, + num_repeats: int = 1, + tag: str = 'humaneval', + local_mode: bool = False): + """Load dataset for pass k mode. + + Args: + path(str): The path to the dataset. + language(str): The language of the dataset. + num_repeats(int): Number of repetition for this dataset to get. + tag(str): The tag of the dataset. + local_mode(bool): Whether to load the dataset in local mode. + + Returns: + Dataset: A PyTorch dataset. + """ + path = get_data_path(path, local_mode=local_mode) + assert tag in ['humaneval', + 'mbpp'], 'tag must be in ["humaneval", "mbpp"]' + if tag == 'humaneval': + assert language in _HUMANEVAL_LANGUAGE_, ( + f'language must be in {_HUMANEVAL_LANGUAGE_}') + else: + assert language in _MBPP_LANGUAGE_, ( + f'language must be in {_MBPP_LANGUAGE_}') + file_path = osp.join(path, f'{tag}-{language}.jsonl') + dataset = [] + with open(file_path, 'r', encoding='utf-8') as f: + for line in f: + dataset.extend( + [json.loads(line.strip()) for _ in range(num_repeats)]) + return Dataset.from_list(dataset) + + +class MultiplEEvaluator(CodeEvaluator): + + def _stop_at_stop_token(self, decoded_string, stop_tokens): + """Produces the prefix of decoded_string that ends at the first + occurrence of a stop_token. + + WARNING: the decoded_string *must not* include the prompt, + which may have stop tokens itself. + + Args: + decoded_string: A string generated by the model. + stop_tokens: A list of strings, where each string is a stop token. + Returns: + The decoded_string, truncated at the first occurrence of a stop + token. + """ + min_stop_index = len(decoded_string) + for stop_token in stop_tokens: + stop_index = decoded_string.find(stop_token) + if stop_index != -1 and stop_index < min_stop_index: + min_stop_index = stop_index + return decoded_string[:min_stop_index] + + def _process_completions(self, test_case, completions): + """Process completions with a test case. + + Args: + test_case: A test case. + completions: A list of completions. + Returns: + A list of processed completions. 
+ """ + processed_completions = [] + for comp in completions: + comp = self._extract_code(comp) + post_comp = self._remove_prefix(test_case['prompt'], comp) + post_comp = self._stop_at_stop_token(post_comp, + test_case['stop_tokens']) + processed_completions.append(post_comp) + return processed_completions diff --git a/opencompass/openicl/icl_evaluator/code_evaluator.py b/opencompass/openicl/icl_evaluator/code_evaluator.py new file mode 100644 index 00000000..d586cd6e --- /dev/null +++ b/opencompass/openicl/icl_evaluator/code_evaluator.py @@ -0,0 +1,267 @@ +# flake8: noqa: E501 + +import difflib +import os +import re +import tempfile +import time +from typing import Any, Dict, List, Optional, Tuple, Union + +from datasets import Dataset +from gradio_client import Client + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS + + +@ICL_EVALUATORS.register_module() +class CodeEvaluator(BaseEvaluator): + """Evaluator for code generation tasks. + + This evaluator sends code to a remote evaluation service to test its + functionality against provided test cases. It handles code extraction, + processing, and result analysis. + """ + + def __init__(self, + language: str, + ip_address: str = 'localhost', + retry: int = 3) -> None: + """Initialize the CodeEvaluator. + + Args: + language (str): Programming language of the code to evaluate. + ip_address (str, optional): IP address of the evaluation service. Defaults to 'localhost'. + retry (int, optional): Number of retry attempts for failed connections. Defaults to 3. + """ + self.language = language + self.retry = retry + self.client = Client(ip_address) + super().__init__() + + def _extract_code(self, text: str) -> str: + """Extract code from markdown-formatted text. + + Args: + text (str): Text that may contain code blocks in markdown format. + + Returns: + str: Extracted code from the last code block, or the original text if no code blocks found. + """ + blocks = re.findall(r'```\w*\n(.*?)```', text, re.DOTALL) + if len(blocks) >= 1: + text = blocks[0] + return text + + def _code_eval_service( + self, input_data: Union[Dict, List, + str]) -> Tuple[bool, Union[Dict, List, Any]]: + """Send code to the remote evaluation service using gradio_client and + get the results. 
+ + Args: + input_data: Can be one of: + - dict: Dictionary containing code information for a single test case + - list: List of dictionaries for batch evaluation + - str: File path to code file + + Returns: + tuple: (succeed, output) + - succeed (bool): Whether the request was successful + - output (dict/list/str): Evaluation results or error message + """ + try: + temp_file_path = None + # Handle file path input + if isinstance(input_data, str): + with tempfile.NamedTemporaryFile(suffix=f'.{self.language}', + delete=False) as temp_file: + temp_file_path = temp_file.name + with open(input_data, 'r') as src_file: + content = src_file.read() + temp_file.write(content.encode()) + input_data = temp_file_path + + # Send to evaluation service + result = self.client.predict(input_data, api_name='/evaluate') + + # Process the result + if isinstance(result, (dict, list)): + return True, result + else: + # Try to parse the result as JSON if it's a string + try: + import json + parsed_result = json.loads(result) + return True, parsed_result + except: # noqa: E722 + return True, {'status': 'unknown', 'raw_result': result} + + except Exception as e: + return False, str(e) + finally: + # Clean up temporary file if it was created + if temp_file_path and os.path.exists(temp_file_path): + try: + os.unlink(temp_file_path) + except: # noqa: E722 + pass + + def _remove_prefix(self, + prompt: str, + completion: str, + threshold: float = 0.95) -> str: + """Determine the truncation point in the completion based on the last + line of the prompt, remove all content before that line in the + completion, and return the completion string after removing the prefix. + This is done to convert chatbot-style inference mode to completion + mode. + + Args: + prompt (str): The prompt text. + completion (str): The completion text. + threshold (float): Line similarity threshold. + + Returns: + str: The completion string after removing the prefix. + """ + prompt_lines = prompt.splitlines() + completion_lines = completion.splitlines() + + if not prompt_lines: + return completion + + last_prompt_line = prompt_lines[-1] + cut_index = -1 + + for i, completion_line in enumerate(completion_lines): + similarity = difflib.SequenceMatcher(None, last_prompt_line, + completion_line).ratio() + if similarity >= threshold: + cut_index = i + break + + if cut_index != -1: + return '\n'.join(completion_lines[cut_index + 1:]) + else: + return completion + + def _process_completions(self, test_case: dict, completions: list) -> list: + """Process code completion list, which typically involves extracting + code, removing repetitive prefixes caused by chatbot mode, and other + steps to ensure the model-generated code can be compiled successfully. + + Args: + test_case (dict): Dictionary containing test case information including: + completions (list): List of code completions generated by the model. + + Returns: + list: Processed code completion list. + """ + processed_completions = [] + for comp in completions: + comp = self._extract_code(comp) + post_comp = self._remove_prefix(test_case['prompt'], comp) + processed_completions.append(post_comp) + return processed_completions + + def _evaluate( + self, input_data: Union[Dict, List] + ) -> Tuple[bool, Optional[Union[Dict, List]], Optional[str]]: + """Evaluate code with retry mechanism. 
+ + Args: + input_data: Can be either: + - dict: Dictionary containing code and test information for a single test case + - list: List of dictionaries for batch evaluation + + Returns: + tuple: (success, output, error_message) + - success (bool): Whether the evaluation was successful + - output (dict or list): Evaluation output (if successful) + - error_message (str): Error message (if failed) + """ + num_retry = 0 + while num_retry < self.retry: + succeed, output = self._code_eval_service(input_data) + if not succeed: + num_retry += 1 + time.sleep(10) + else: + break + + if not succeed: + return False, None, f'code eval service connection failed: {output}' + + return True, output, None + + def score(self, predictions: List, references: List, + test_set: Dataset) -> Dict: + """Score code generation predictions against references. + + Args: + predictions (list): List of model-generated code completions. + references (list): List of reference solutions (not directly used in evaluation). + test_set (Dataset): Dataset containing test cases and other metadata. + + Returns: + dict: Evaluation results including: + - accuracy: Percentage of correctly solved problems + - details: Detailed results for each test case + - error: Error message if evaluation failed + """ + if len(predictions) != len(references): + return { + 'error': + 'predictions and references have different ' + f'length. len(predictions): {len(predictions)}, ' + f'len(references): {len(references)}' + } + + test_set = test_set.to_pandas() + # Use the first column as the unique identifier + test_set_origin = test_set.drop_duplicates(subset=test_set.columns[0]) + num_repeats = int(len(test_set) / len(test_set_origin)) + + # 1. Prepare data for all test cases + all_test_cases = [] + for i in range(len(test_set_origin)): + test_case = test_set_origin.iloc[i] + completions = predictions[i * num_repeats:(i + 1) * num_repeats] + + # Process code completions + processed_completions = self._process_completions( + test_case, completions) + + result_dict = { + 'name': test_case['name'], + 'language': test_case['language'], + 'prompt': test_case['prompt'], + 'tests': test_case['tests'], + 'processed_completions': processed_completions, + 'completions': completions + } + + all_test_cases.append(result_dict) + + # 2. Send all test cases to the evaluation service + success, outputs, error_message = self._evaluate(all_test_cases) + if not success: + return {'error': error_message} + + # 3. 
Process the returned results + details = [] + correct = 0 + for output in outputs: + if output.get('status') == 'OK': + output['correct'] = True + correct += 1 + else: + output['correct'] = False + + details.append(output) + + return { + f'pass@{num_repeats}': 100 * correct / len(test_set_origin), + 'details': details + } diff --git a/opencompass/utils/datasets_info.py b/opencompass/utils/datasets_info.py index 5f055cc0..00db25e8 100644 --- a/opencompass/utils/datasets_info.py +++ b/opencompass/utils/datasets_info.py @@ -193,6 +193,12 @@ DATASETS_MAPPING = { "hf_id": "", "local": "./data/mmlu_pro", }, + # MultiPL-E + "opencompass/multipl_e": { + "ms_id": "", + "hf_id": "", + "local": "./data/multipl_e", + }, # NQ "opencompass/natural_question": { "ms_id": "opencompass/natural_question", @@ -627,6 +633,11 @@ DATASETS_URL = { "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/mmlu_pro.zip", "md5": "e3200c7380f4cea5f13c768f2815fabb", }, + "multipl_e": { + "url": + "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/multipl_e.zip", + "md5": "24462aac7a38a4a62f5c5e89eb614e20", + }, "/Longbench": { "url": "http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/Longbench.zip", From 64128916d0a180b63b634cc9e894ae907a605759 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 24 Mar 2025 11:21:14 +0800 Subject: [PATCH 36/58] [Update] Increase memory size for CPU job of VOLC Runner (#1962) * [Update] Increase memory size for CPU job of VOLC Runner * [Update] Increase memory size for CPU job of VOLC Runner --- opencompass/runners/volc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencompass/runners/volc.py b/opencompass/runners/volc.py index a01f7cac..37cd441b 100644 --- a/opencompass/runners/volc.py +++ b/opencompass/runners/volc.py @@ -256,7 +256,7 @@ class VOLCRunner(BaseRunner): with open(config_path) as fp: volc_cfg = yaml.safe_load(fp) if num_gpus <= 0: - flavor = 'ml.c3i.2xlarge' + flavor = 'ml.r3i.2xlarge' elif num_gpus == 1: flavor = 'ml.pni2l.3xlarge' elif num_gpus == 2: From aa059939225d6490ea3aa650237f1920a51648b8 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 24 Mar 2025 14:24:12 +0800 Subject: [PATCH 37/58] [Update] Add dataset configurations of no max_out_len (#1967) * [Update] Add dataset configurations of no max_out_len * update test torch version * update test torch version * update test torch version * update test torch version --- .github/workflows/pr-stage-check.yml | 6 +- .../arc_prize_public_evaluation_gen_fedd04.py | 56 ++++++ .../GaokaoBench_no_subjective_gen_d16acb.py | 45 +++++ .../MathBench/mathbench_2024_gen_4b8f28.py | 81 ++++++++ .../datasets/bbh/bbh_llmjudge_gen_b5bdf1.py | 189 ++++++++++++++++++ .../bigcodebench_hard_complete_gen_2888d3.py | 45 +++++ .../bigcodebench_hard_instruct_gen_c3d5ad.py | 45 +++++ .../datasets/cmo_fib/cmo_fib_gen_2783e5.py | 39 ++++ .../gsm8k/gsm8k_0shot_v2_gen_17d799.py | 37 ++++ .../korbench/korbench_llmjudge_gen_17854d.py | 117 +++++++++++ .../math/math_500_llmjudge_gen_6ff468.py | 96 +++++++++ .../datasets/nq/nq_open_1shot_gen_2e45e5.py | 2 +- .../datasets/scicode/scicode_gen_62c139.py | 29 +++ .../triviaqa_wiki_1shot_gen_c87d61.py | 62 ++++++ 14 files changed, 845 insertions(+), 4 deletions(-) create mode 100644 opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_fedd04.py create mode 100644 opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d16acb.py create mode 100644 
opencompass/configs/datasets/MathBench/mathbench_2024_gen_4b8f28.py create mode 100644 opencompass/configs/datasets/bbh/bbh_llmjudge_gen_b5bdf1.py create mode 100644 opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_2888d3.py create mode 100644 opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_c3d5ad.py create mode 100644 opencompass/configs/datasets/cmo_fib/cmo_fib_gen_2783e5.py create mode 100644 opencompass/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_17d799.py create mode 100644 opencompass/configs/datasets/korbench/korbench_llmjudge_gen_17854d.py create mode 100644 opencompass/configs/datasets/math/math_500_llmjudge_gen_6ff468.py create mode 100644 opencompass/configs/datasets/scicode/scicode_gen_62c139.py create mode 100644 opencompass/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_c87d61.py diff --git a/.github/workflows/pr-stage-check.yml b/.github/workflows/pr-stage-check.yml index 15669a3f..a9871887 100644 --- a/.github/workflows/pr-stage-check.yml +++ b/.github/workflows/pr-stage-check.yml @@ -20,7 +20,7 @@ jobs: matrix: python-version: ['3.10'] include: - - torch: 2.0.0 + - torch: 2.5.1 steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} @@ -30,7 +30,7 @@ jobs: - name: Upgrade pip run: python -m pip install --upgrade pip - name: Install PyTorch - run: pip install torch==${{matrix.torch}}+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html + run: pip install torch==${{matrix.torch}} -f https://download.pytorch.org/whl/cpu/torch_stable.html - name: Install system dependencies run: | sudo sed -i '$ a deb http://th.archive.ubuntu.com/ubuntu jammy main' /etc/apt/sources.list @@ -106,7 +106,7 @@ jobs: - name: Upgrade pip run: python -m pip install pip --upgrade - name: Install PyTorch - run: pip install torch==2.0.0+${{matrix.platform}} -f https://download.pytorch.org/whl/${{matrix.platform}}/torch_stable.html + run: pip install torch==2.5.1 -f https://download.pytorch.org/whl/cpu/torch_stable.html - name: Install opencompass dependencies run: | pip install -r requirements.txt diff --git a/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_fedd04.py b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_fedd04.py new file mode 100644 index 00000000..536349e9 --- /dev/null +++ b/opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen_fedd04.py @@ -0,0 +1,56 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.arc_prize_public_evaluation import ARCPrizeDataset, ARCPrizeEvaluator + + +# The system_prompt defines the initial instructions for the model, +# setting the context for solving ARC tasks. +system_prompt = '''You are a puzzle solving wizard. You are given a puzzle from the abstraction and reasoning corpus developed by Francois Chollet.''' + +# User message template is a template for creating user prompts. It includes placeholders for training data and test input data, +# guiding the model to learn the rule and apply it to solve the given puzzle. 
+user_message_template = '''Here are the example input and output pairs from which you should learn the underlying rule to later predict the output for the given test input: +---------------------------------------- +{training_data} +---------------------------------------- +Now, solve the following puzzle based on its input grid by applying the rules you have learned from the training data.: +---------------------------------------- +[{{'input': {input_test_data}, 'output': [[]]}}] +---------------------------------------- +What is the output grid? Only provide the output grid in the form as in the example input and output pairs. Do not provide any additional information:''' + + +arc_prize_public_evaluation_reader_cfg = dict( + input_columns=['training_data', 'input_test_data'], + output_column='output_test_data' +) + +arc_prize_public_evaluation_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='SYSTEM',fallback_role='HUMAN', prompt=system_prompt), + dict(role='HUMAN', prompt=user_message_template), + ], + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +arc_prize_public_evaluation_eval_cfg = dict( + evaluator=dict(type=ARCPrizeEvaluator) +) + +arc_prize_public_evaluation_datasets = [ + dict( + abbr='ARC_Prize_Public_Evaluation', + type=ARCPrizeDataset, + path='opencompass/arc_prize_public_evaluation', + reader_cfg=arc_prize_public_evaluation_reader_cfg, + infer_cfg=arc_prize_public_evaluation_infer_cfg, + eval_cfg=arc_prize_public_evaluation_eval_cfg + ) +] \ No newline at end of file diff --git a/opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d16acb.py b/opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d16acb.py new file mode 100644 index 00000000..9ee5c917 --- /dev/null +++ b/opencompass/configs/datasets/GaokaoBench/GaokaoBench_no_subjective_gen_d16acb.py @@ -0,0 +1,45 @@ +import os +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GaokaoBenchDataset +from mmengine.config import read_base + +with read_base(): + from .GaokaoBench_prompts import MCQ_prompts, FBQ_prompts + +GaokaoBench_datasets = [] +for folder, prompts in [ + ('Multiple-choice_Questions', MCQ_prompts), + ('Fill-in-the-blank_Questions', FBQ_prompts), +]: + for p in prompts: + reader_cfg = { + 'input_columns': ['question'], + 'output_column': 'answer', + } + infer_cfg = { + 'ice_template': { + 'type': PromptTemplate, + 'template': {'round': [{'role': 'HUMAN', 'prompt': p['prefix_prompt'] + '{question}'}]}, + 'ice_token': '', + }, + 'retriever': {'type': ZeroRetriever}, + 'inferencer': {'type': GenInferencer}, + } + eval_cfg = { + 'evaluator': {'type': 'GaokaoBenchEvaluator' + '_' + p['type']}, + 'pred_role': 'BOT', + } + _base_path = 'opencompass/GAOKAO-BENCH' + dataset = { + 'type': GaokaoBenchDataset, + 'abbr': 'GaokaoBench_' + p['keyword'], + 'path': _base_path, + 'filename': '/' + folder + '/' + p['keyword'] + '.json', + 'name': p['keyword'], + 'reader_cfg': reader_cfg, + 'infer_cfg': infer_cfg, + 'eval_cfg': eval_cfg, + } + GaokaoBench_datasets.append(dataset) diff --git a/opencompass/configs/datasets/MathBench/mathbench_2024_gen_4b8f28.py b/opencompass/configs/datasets/MathBench/mathbench_2024_gen_4b8f28.py new file mode 100644 index 00000000..c3183716 --- /dev/null +++ 
b/opencompass/configs/datasets/MathBench/mathbench_2024_gen_4b8f28.py @@ -0,0 +1,81 @@ +from mmengine.config import read_base +from copy import deepcopy +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer +from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator +from opencompass.datasets import MathBenchDataset, math_postprocess_v2 +from opencompass.utils.text_postprocessors import first_option_postprocess + +with read_base(): + from .mathbench_prompt import zero_shot_prompts, few_shot_prompts, mathbench_sets + +# Max for this dataset is 4 +num_shot = 0 +# Generate reasoning path or not, only for single choice +with_reasoning = True +# Use circular evaluation or not +with_circular_eval = True +# Use PPL mode in single choice test or not +use_ppl_single_choice = False + +assert 0 <= num_shot <= 4 +if num_shot == 0: + prompts = zero_shot_prompts +else: + prompts = {name: p[- 2 * num_shot - 2:] for name, p in few_shot_prompts.items()} + +mathbench_datasets = [] +for _split in mathbench_sets: + for _name in mathbench_sets[_split]: + if 'single_choice' in _name: + if with_reasoning: + template_round = prompts[_name + '_with_reasoning'] + else: + template_round = prompts[_name] + else: + template_round = prompts[_name] + + if 'single_choice' in _name: + pred_postprocessor = dict(type=first_option_postprocess, options='ABCD') + else: + pred_postprocessor = dict(type=math_postprocess_v2) + + if 'single_choice' in _name and with_circular_eval: + evaluator = dict(type=CircularEvaluator) + else: + evaluator = dict(type=AccEvaluator) + + # assemble the final config + mathbench_reader_cfg = dict(input_columns=['question'], output_column='answer') + if use_ppl_single_choice and 'single_choice' in _name and not with_reasoning: + template = {} + for answer in ['A', 'B', 'C', 'D']: + one_template_round = deepcopy(template_round) + one_template_round['round'][-1]['prompt'] = one_template_round['round'][-1]['prompt'].format(answer=answer) + template[answer] = dict(round=one_template_round) + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=template), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=PPLInferencer), + ) + else: + mathbench_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=dict(round=template_round)), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + mathbench_eval_cfg = dict(evaluator=evaluator, pred_postprocessor=pred_postprocessor) + + mathbench_datasets.append( + dict( + abbr='mathbench-' + _split + '-' + _name, + type=MathBenchDataset, + path=f'data/mathbench_v1/{_split}', + name=_name, + with_circular=with_circular_eval, + reader_cfg=mathbench_reader_cfg, + infer_cfg=mathbench_infer_cfg, + eval_cfg=mathbench_eval_cfg, + ) + ) diff --git a/opencompass/configs/datasets/bbh/bbh_llmjudge_gen_b5bdf1.py b/opencompass/configs/datasets/bbh/bbh_llmjudge_gen_b5bdf1.py new file mode 100644 index 00000000..00426660 --- /dev/null +++ b/opencompass/configs/datasets/bbh/bbh_llmjudge_gen_b5bdf1.py @@ -0,0 +1,189 @@ +# flake8: noqa + +import os +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import BBHDataset +from 
opencompass.datasets.generic import generic_llmjudge_academic_postprocess + + +bbh_reader_cfg = dict(input_columns=['input'], output_column='target') + +bbh_multiple_choice_sets = [ + 'temporal_sequences', + 'disambiguation_qa', + 'date_understanding', + 'tracking_shuffled_objects_three_objects', + 'penguins_in_a_table', + 'geometric_shapes', + 'snarks', + 'ruin_names', + 'tracking_shuffled_objects_seven_objects', + 'tracking_shuffled_objects_five_objects', + 'logical_deduction_three_objects', + 'hyperbaton', + 'logical_deduction_five_objects', + 'logical_deduction_seven_objects', + 'movie_recommendation', + 'salient_translation_error_detection', + 'reasoning_about_colored_objects', +] +bbh_free_form_sets = [ + 'multistep_arithmetic_two', + 'navigate', + 'dyck_languages', + 'word_sorting', + 'sports_understanding', + 'boolean_expressions', + 'object_counting', + 'formal_fallacies', + 'causal_judgement', + 'web_of_lies', +] + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + + : \n{input}\n\n\n + : \n{target}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + + +bbh_sets = bbh_multiple_choice_sets + bbh_free_form_sets + +# For zero shot inference in bbh +bbh_datasets = [] +for _name in bbh_sets: + bbh_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' " + ) + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + + bbh_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt=GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=BBHDataset, + name=_name, + path='opencompass/bbh', + reader_cfg=bbh_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'), + ), + pred_role='BOT', + ) + + bbh_datasets.append( + dict( + type=BBHDataset, + path='opencompass/bbh', + name=_name, + abbr='bbh-' + _name, + reader_cfg=bbh_reader_cfg, + infer_cfg=bbh_infer_cfg.copy(), + eval_cfg=bbh_eval_cfg.copy()) + ) + + +# For original 3 shot inference in bbh +bbh_3_shot_datasets = [] +for _name in bbh_sets: + with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f: + _hint = f.read() + bbh_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step." 
+ ) + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + + bbh_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt=GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=BBHDataset, + name=_name, + path='opencompass/bbh', + reader_cfg=bbh_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'), + ), + pred_role='BOT', + ) + + bbh_3_shot_datasets.append( + dict( + type=BBHDataset, + path='opencompass/bbh', + name=_name, + abbr='bbh-' + _name, + reader_cfg=bbh_reader_cfg, + infer_cfg=bbh_infer_cfg.copy(), + eval_cfg=bbh_eval_cfg.copy())) diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_2888d3.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_2888d3.py new file mode 100644 index 00000000..e4c663fc --- /dev/null +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_complete_gen_2888d3.py @@ -0,0 +1,45 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator) + +bigcodebench_hard_reader_cfg = dict( + input_columns=['complete_prompt'], + output_column='test', +) + +bigcodebench_hard_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[dict(role='system', fallback_role='HUMAN', prompt='')], + round=[ + dict(role='HUMAN', prompt='{complete_prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +bigcodebench_hard_eval_cfg = dict( + evaluator=dict( + type=BigCodeBenchEvaluator, + release_version='v0.1.2', + eval_type='complete', + # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + remote_execute_api= + 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501 + dataset_version='hard', + ), + pred_role='BOT', +) + +bigcodebench_hard_complete_datasets = [ + dict( + abbr='bigcodebench_hard_complete', + type=BigCodeBenchDataset, + path='opencompass/bigcodebench', + reader_cfg=bigcodebench_hard_reader_cfg, + infer_cfg=bigcodebench_hard_infer_cfg, + eval_cfg=bigcodebench_hard_eval_cfg, + release_version='v0.1.2', + dataset_version='hard', + ) +] diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_c3d5ad.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_c3d5ad.py new file mode 100644 index 00000000..b8dcc8ed --- /dev/null +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_c3d5ad.py @@ -0,0 +1,45 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import (BigCodeBenchDataset, BigCodeBenchEvaluator) + +bigcodebench_hard_reader_cfg = dict( + input_columns=['instruct_prompt'], + output_column='test', +) + +bigcodebench_hard_infer_cfg = dict(prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[dict(role='system', fallback_role='HUMAN', 
prompt='')], + round=[ + dict(role='HUMAN', prompt='{instruct_prompt}'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +bigcodebench_hard_eval_cfg = dict( + evaluator=dict( + type=BigCodeBenchEvaluator, + release_version='v0.1.2', + eval_type='instruct', + # remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', + remote_execute_api= + 'https://opencompass-opencompass-bigcodebench-evaluator.hf.space', # noqa: E501 + dataset_version='hard', + ), + pred_role='BOT', +) + +bigcodebench_hard_instruct_datasets = [ + dict( + abbr='bigcodebench_hard_instruct', + type=BigCodeBenchDataset, + path='opencompass/bigcodebench', + reader_cfg=bigcodebench_hard_reader_cfg, + infer_cfg=bigcodebench_hard_infer_cfg, + eval_cfg=bigcodebench_hard_eval_cfg, + release_version='v0.1.2', + dataset_version='hard', + ) +] diff --git a/opencompass/configs/datasets/cmo_fib/cmo_fib_gen_2783e5.py b/opencompass/configs/datasets/cmo_fib/cmo_fib_gen_2783e5.py new file mode 100644 index 00000000..6fc1147c --- /dev/null +++ b/opencompass/configs/datasets/cmo_fib/cmo_fib_gen_2783e5.py @@ -0,0 +1,39 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import CMOFibDataset, MATHEvaluator, math_postprocess_v2 + + +cmo_fib_reader_cfg = dict( + input_columns=['question'], + output_column='answer' +) + + +cmo_fib_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\n请一步一步地推理,并将最终答案写入\\boxed{}.'), + ], + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +cmo_fib_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2) +) + +cmo_fib_datasets = [ + dict( + abbr='cmo_fib', + type=CMOFibDataset, + path='opencompass/cmo_fib', + reader_cfg=cmo_fib_reader_cfg, + infer_cfg=cmo_fib_infer_cfg, + eval_cfg=cmo_fib_eval_cfg + ) +] \ No newline at end of file diff --git a/opencompass/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_17d799.py b/opencompass/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_17d799.py new file mode 100644 index 00000000..43b38546 --- /dev/null +++ b/opencompass/configs/datasets/gsm8k/gsm8k_0shot_v2_gen_17d799.py @@ -0,0 +1,37 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator +from opencompass.datasets import MATHEvaluator, math_postprocess_v2 + +gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') + +gsm8k_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +gsm8k_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2), + dataset_postprocessor=dict(type=gsm8k_dataset_postprocess), +) + +gsm8k_datasets = [ + dict( + abbr='gsm8k', + type=GSM8KDataset, + path='opencompass/gsm8k', + reader_cfg=gsm8k_reader_cfg, + infer_cfg=gsm8k_infer_cfg, + 
eval_cfg=gsm8k_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/korbench/korbench_llmjudge_gen_17854d.py b/opencompass/configs/datasets/korbench/korbench_llmjudge_gen_17854d.py new file mode 100644 index 00000000..a9cb644b --- /dev/null +++ b/opencompass/configs/datasets/korbench/korbench_llmjudge_gen_17854d.py @@ -0,0 +1,117 @@ +from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle'] + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + + : \n{prompt}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +korbench_0shot_single_datasets = [] + +for category in categories: + # Prompt template + prompt_template = dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='HUMAN', + prompt='' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' # f-string + ) + ] + ) + ) + + # Reader configuration + reader_cfg = dict( + input_columns=['prompt'], + output_column='answer', + ) + + # Inference configuration + infer_cfg = dict( + prompt_template=prompt_template, + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024), + ) + + # Evaluation configuration + eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=korbenchDataset, + path='opencompass/korbench', + prompt_mode='0_shot', + category=category, + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', + ) + + # Dataset + korbench_dataset = dict( + type=korbenchDataset, + abbr=f'korbench_{category}', + path='opencompass/korbench', + prompt_mode='0_shot', + category=category, + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + mode='singlescore', + ) + + korbench_0shot_single_datasets.append(korbench_dataset) diff --git a/opencompass/configs/datasets/math/math_500_llmjudge_gen_6ff468.py b/opencompass/configs/datasets/math/math_500_llmjudge_gen_6ff468.py new file mode 100644 index 00000000..67d74266 --- /dev/null +++ b/opencompass/configs/datasets/math/math_500_llmjudge_gen_6ff468.py @@ -0,0 +1,96 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import MATHDataset + + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. 
Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500-llmjudge', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + mode='singlescore', + ) +] diff --git a/opencompass/configs/datasets/nq/nq_open_1shot_gen_2e45e5.py b/opencompass/configs/datasets/nq/nq_open_1shot_gen_2e45e5.py index e877b397..2155c404 100644 --- a/opencompass/configs/datasets/nq/nq_open_1shot_gen_2e45e5.py +++ b/opencompass/configs/datasets/nq/nq_open_1shot_gen_2e45e5.py @@ -20,7 +20,7 @@ for k in [1]: ) ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=50) + inferencer=dict(type=GenInferencer) ) else: nq_infer_cfg = dict( diff --git a/opencompass/configs/datasets/scicode/scicode_gen_62c139.py b/opencompass/configs/datasets/scicode/scicode_gen_62c139.py new file mode 100644 index 00000000..9e1842c3 --- /dev/null +++ b/opencompass/configs/datasets/scicode/scicode_gen_62c139.py @@ -0,0 +1,29 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from 
opencompass.openicl.icl_inferencer import ChatInferencer +from opencompass.datasets import SciCodeDataset, SciCodeEvaluator + + +SciCode_reader_cfg = dict(input_columns=['prompt'], output_column=None) + +SciCode_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template='', + ), + + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=ChatInferencer, infer_mode='every')) + +SciCode_eval_cfg = dict(evaluator=dict(type=SciCodeEvaluator, dataset_path='./data/scicode', with_bg=False)) + +SciCode_datasets = [ + dict( + abbr='SciCode', + type=SciCodeDataset, + path='./data/scicode', + with_bg=False, + reader_cfg=SciCode_reader_cfg, + infer_cfg=SciCode_infer_cfg, + eval_cfg=SciCode_eval_cfg) +] diff --git a/opencompass/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_c87d61.py b/opencompass/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_c87d61.py new file mode 100644 index 00000000..27853fa4 --- /dev/null +++ b/opencompass/configs/datasets/triviaqa/triviaqa_wiki_1shot_gen_c87d61.py @@ -0,0 +1,62 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import TriviaQADatasetV2, TriviaQAEvaluator + + +triviaqa_datasets = [] +for k in [1]: + triviaqa_reader_cfg = dict( + input_columns=['question'], output_column='answer', train_split='train', test_split='validation') + + if k == 0: + triviaqa_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Q: {question}'), + dict(role='BOT', prompt='A:'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) + ) + else: + triviaqa_infer_cfg = dict( + ice_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='Q: {question}'), + dict(role='BOT', prompt='A: {answer}.\n'), + ] + ), + ), + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin='', + round=[ + dict(role='HUMAN', prompt='Q: {question}'), + dict(role='BOT', prompt='A:'), + ] + ), + ice_token='', + ), + retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))), + inferencer=dict(type=GenInferencer), + ) + + triviaqa_eval_cfg = dict(evaluator=dict(type=TriviaQAEvaluator), pred_role='BOT') + + triviaqa_datasets.append( + dict( + type=TriviaQADatasetV2, + abbr=f'triviaqa_wiki_{k}shot', + path='opencompass/trivia_qa', + reader_cfg=triviaqa_reader_cfg, + infer_cfg=triviaqa_infer_cfg, + eval_cfg=triviaqa_eval_cfg) + ) From db96161a4eeb0fc5be9b174d05e443215b6a0879 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 24 Mar 2025 14:25:12 +0800 Subject: [PATCH 38/58] [Update] Add SuperGPQA subset metrics (#1966) --- .../supergpqa_llmjudge_gen_12b8bc.py | 4 +- opencompass/datasets/supergpqa/supergpqa.py | 132 ++++++++++++++++++ .../evaluator/generic_llm_evaluator.py | 17 ++- .../icl_evaluator/icl_base_evaluator.py | 8 ++ 4 files changed, 156 insertions(+), 5 deletions(-) diff --git a/opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py b/opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py index 02e6f2da..053eda07 100644 --- a/opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py +++ b/opencompass/configs/datasets/supergpqa/supergpqa_llmjudge_gen_12b8bc.py @@ -1,5 +1,5 @@ from opencompass.datasets.supergpqa.supergpqa import ( - SuperGPQADataset, + SuperGPQADataset, supergpqa_llmjudge_postprocess ) 
from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_prompt_template import PromptTemplate @@ -87,7 +87,7 @@ eval_cfg = dict( reader_cfg=reader_cfg, ), judge_cfg=dict(), - dict_postprocessor=dict(type=generic_llmjudge_postprocess), + dict_postprocessor=dict(type=supergpqa_llmjudge_postprocess), ), ) supergpqa_dataset = dict( diff --git a/opencompass/datasets/supergpqa/supergpqa.py b/opencompass/datasets/supergpqa/supergpqa.py index 9dd96dd4..401422e1 100644 --- a/opencompass/datasets/supergpqa/supergpqa.py +++ b/opencompass/datasets/supergpqa/supergpqa.py @@ -1,4 +1,5 @@ import os +import re from datasets import Dataset, load_dataset @@ -7,6 +8,7 @@ from opencompass.datasets.supergpqa.supergpqa_eval import ( from opencompass.datasets.supergpqa.supergpqa_utils import load_yaml from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET +from opencompass.utils import get_logger from ..base import BaseDataset @@ -180,3 +182,133 @@ class SuperGPQAEvaluator(BaseEvaluator): 'details': details, } + + +def _generic_llmjudge_postprocess(judgement: str): + match = re.search(r'(A|B)', judgement) + grade_letter = (match.group(0) if match else 'B' + ) # Default to "INCORRECT" if no match + return grade_letter + + +def supergpqa_llmjudge_postprocess( + output: dict, + output_path: str, + dataset: Dataset, +) -> dict: + # Get the original dataset + original_dataset = dataset.reader.dataset['test'] + + judged_answers = [] + original_responses = [] + references = [] + details = [] + + # Initialize statistics dictionaries + stats = {'discipline': {}, 'field': {}, 'subfield': {}} + + total_correct = 0 + total_count = 0 + + # Process each sample + for k, v in output.items(): + idx = int(k) # Convert key to integer for indexing + original_responses.append(v['prediction']) + processed_judge = _generic_llmjudge_postprocess(v['prediction']) + + # Get category information from the dataset + sample = original_dataset[idx] + discipline = sample.get('discipline', 'unknown') + field = sample.get('field', 'unknown') + subfield = sample.get('subfield', 'unknown') + + # Initialize category stats if not exists + for level, key in [ + ('discipline', discipline), + ('field', f'{discipline}/{field}'), + ('subfield', f'{discipline}/{field}/{subfield}'), + ]: + if key not in stats[level]: + stats[level][key] = {'correct': 0, 'total': 0} + + # Record the judgment + if processed_judge is not None: + judged_answers.append(processed_judge) + try: + gold = v['gold'] + references.append(gold) + except KeyError: + get_logger().warning( + f'No gold answer for {k}, use empty string as reference!') + gold = '' + references.append('') + + # Check if the answer is correct (A means correct) + is_correct = processed_judge == 'A' + total_count += 1 + + if is_correct: + total_correct += 1 + # Update category stats + for level, key in [ + ('discipline', discipline), + ('field', f'{discipline}/{field}'), + ('subfield', f'{discipline}/{field}/{subfield}'), + ]: + stats[level][key]['correct'] += 1 + + # Update category totals + for level, key in [ + ('discipline', discipline), + ('field', f'{discipline}/{field}'), + ('subfield', f'{discipline}/{field}/{subfield}'), + ]: + stats[level][key]['total'] += 1 + # Add to details + details.append({ + 'id': k, + 'question': sample['question'], + 'options': sample['options'], + 'origin_prompt': v['origin_prompt'], + 'llm_judge': processed_judge, + 'gold': gold, + 'is_correct': is_correct, + 'discipline': 
discipline, + 'field': field, + 'subfield': subfield, + }) + + # Calculate overall accuracy with two decimal places + overall_accuracy = (round( + (total_correct / total_count * 100), 2) if total_count > 0 else 0.00) + + # Initialize results dictionary + results = { + 'accuracy': overall_accuracy, + 'total_correct': total_correct, + 'total_count': total_count, + 'details': details, + } + + # Calculate accuracy for each category and flatten into results + for level in stats: + for key, value in stats[level].items(): + if value['total'] > 0: + # Calculate accuracy with two decimal places + accuracy = round((value['correct'] / value['total'] * 100), 2) + + # Create a flattened key for the category + flat_key = f'SuperGPQA-{level}' + if level == 'discipline': + flat_key = f'SuperGPQA-{key}' + elif level == 'field': + discipline, field = key.split('/') + flat_key = f'SuperGPQA-{discipline}-{field}' + elif level == 'subfield': + discipline, field, subfield = key.split('/') + flat_key = f'SuperGPQA-{discipline}-{field}-{subfield}' + + # Add to results + results[flat_key] = accuracy + + return results diff --git a/opencompass/evaluator/generic_llm_evaluator.py b/opencompass/evaluator/generic_llm_evaluator.py index c0b33a69..2b829ba1 100644 --- a/opencompass/evaluator/generic_llm_evaluator.py +++ b/opencompass/evaluator/generic_llm_evaluator.py @@ -84,6 +84,8 @@ class GenericLLMEvaluator(BaseEvaluator): references: Optional[List] = None, ) -> Dict: """Apply to single-model scoring.""" + assert len(predictions) == len( + references), 'predictions and references must have the same length' # -------------- Build Inferencer ---------------- self.build_inferencer() @@ -127,7 +129,7 @@ class GenericLLMEvaluator(BaseEvaluator): prompt_template=self.prompt_template) output = mmengine.load(self.output_path) - return self.output_postprocess(output) + return self.output_postprocess(output, dataset) def pred_postprocess(self, predictions: List) -> Dict: if self.pred_postprocessor is None: @@ -137,15 +139,24 @@ class GenericLLMEvaluator(BaseEvaluator): proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type')) return [proc(pred, **kwargs) for pred in predictions] - def output_postprocess(self, output: Dict) -> Dict: + def output_postprocess(self, output: Dict, dataset=None) -> Dict: """Postprocess output by adding necessary statistics or data into it.""" + import inspect + if self.dict_postprocessor is None: return output else: kwargs = self.dict_postprocessor proc = DICT_POSTPROCESSORS.get(kwargs.pop('type')) - return proc(output, self.output_path, **kwargs) + sig = inspect.signature(proc) + if 'dataset' in sig.parameters: + return proc(output, + self.output_path, + dataset=dataset, + **kwargs) + else: + return proc(output, self.output_path, **kwargs) @property def default_judge_cfg(self): diff --git a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py index e2aad9be..794c0ed6 100644 --- a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py @@ -89,6 +89,14 @@ class BaseEvaluator: original_dataset: Dataset, **score_kwargs, ): + # Check if predictions and references have the + # same length if both are provided + if 'predictions' in score_kwargs and 'references' in score_kwargs: + if len(score_kwargs['predictions']) != len( + score_kwargs['references']): + raise ValueError( + 'Predictions and references must have the same length') + real_size = len(original_dataset) // n all_details = [] 
all_results = [] From 37307fa99671260a4f8e52b27c48219dedcd74fd Mon Sep 17 00:00:00 2001 From: Myhs_phz Date: Mon, 24 Mar 2025 14:51:39 +0800 Subject: [PATCH 39/58] [Update] Add QWQ32b model config (#1959) * feat qwq-32b * fix * feat phi_4 --------- Co-authored-by: Linchen Xiao --- .../configs/models/qwq/lmdeploy_qwq_32b.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 opencompass/configs/models/qwq/lmdeploy_qwq_32b.py diff --git a/opencompass/configs/models/qwq/lmdeploy_qwq_32b.py b/opencompass/configs/models/qwq/lmdeploy_qwq_32b.py new file mode 100644 index 00000000..6c2bf078 --- /dev/null +++ b/opencompass/configs/models/qwq/lmdeploy_qwq_32b.py @@ -0,0 +1,17 @@ +from opencompass.models import TurboMindModelwithChatTemplate +from opencompass.utils.text_postprocessors import extract_non_reasoning_content + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='QwQ-32B', + path='Qwen/QwQ-32B', + engine_config=dict(session_len=32768, max_batch_size=16, tp=2), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=8192), + max_seq_len=32768, + max_out_len=8192, + batch_size=16, + run_cfg=dict(num_gpus=2), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) +] \ No newline at end of file From 07930b854a51908dfe0ea00c24b705d4a10662fd Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Mon, 24 Mar 2025 18:38:06 +0800 Subject: [PATCH 40/58] [Update] Add Korbench config with no max_out_len (#1968) * Add Korbench no max_out_len * Add Korbench no max_out_len --- .../korbench/korbench_llmjudge_gen_56cf43.py | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 opencompass/configs/datasets/korbench/korbench_llmjudge_gen_56cf43.py diff --git a/opencompass/configs/datasets/korbench/korbench_llmjudge_gen_56cf43.py b/opencompass/configs/datasets/korbench/korbench_llmjudge_gen_56cf43.py new file mode 100644 index 00000000..8c5c0cd2 --- /dev/null +++ b/opencompass/configs/datasets/korbench/korbench_llmjudge_gen_56cf43.py @@ -0,0 +1,117 @@ +from opencompass.datasets.korbench.korbench import korbenchDataset, korbenchEvaluator +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess + +categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle'] + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. 
Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{prompt}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +korbench_0shot_single_datasets = [] + +for category in categories: + # Prompt template + prompt_template = dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='HUMAN', + prompt='' + ) + ], + round=[ + dict( + role='HUMAN', + prompt='{prompt}' # f-string + ) + ] + ) + ) + + # Reader configuration + reader_cfg = dict( + input_columns=['prompt'], + output_column='answer', + ) + + # Inference configuration + infer_cfg = dict( + prompt_template=prompt_template, + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), + ) + + # Evaluation configuration + eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=korbenchDataset, + path='opencompass/korbench', + prompt_mode='0_shot', + category=category, + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', + ) + + # Dataset + korbench_dataset = dict( + type=korbenchDataset, + abbr=f'korbench_{category}', + path='opencompass/korbench', + prompt_mode='0_shot', + category=category, + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + mode='singlescore', + ) + + korbench_0shot_single_datasets.append(korbench_dataset) From 61185963627a6d4fcd2fc790454322442d8f35f5 Mon Sep 17 00:00:00 2001 From: Myhs_phz Date: Tue, 25 Mar 2025 14:54:13 +0800 Subject: [PATCH 41/58] [Feature] Add recommendation configs for datasets (#1937) * feat datasetrefine drop * fix datasets in fullbench_int3 * fix * fix * back * fix * fix and doc * feat * fix hook * fix * fix * fix * fix * fix * fix * fix * fix * fix * doc * fix * fix * Update dataset-index.yml --- README.md | 2 + README_zh-CN.md | 2 + dataset-index.yml | 524 +++++++++++++----- docs/en/statis.py | 20 +- docs/zh_cn/statis.py | 14 +- 
.../configs/datasets/IFEval/IFEval_gen.py | 2 +- .../configs/datasets/aime2024/aime2024_gen.py | 2 +- .../datasets/aime2024/aime2024_gen_17d799.py | 40 ++ .../aime2024/aime2024_llm_judge_gen.py | 4 + opencompass/configs/datasets/bbh/bbh_gen.py | 2 +- .../configs/datasets/bbh/bbh_gen_ee62e9.py | 99 ++++ .../configs/datasets/bbh/bbh_llm_judge_gen.py | 4 + .../datasets/bigcodebench/bigcodebench_gen.py | 7 + .../bigcodebench_hard_instruct_gen.py | 2 +- .../bigcodebench_hard_instruct_gen_c3d5ad.py | 5 +- .../configs/datasets/cmmlu/cmmlu_gen.py | 2 +- .../datasets/cmmlu/cmmlu_llm_judge_gen.py | 4 + opencompass/configs/datasets/drop/drop_gen.py | 2 +- .../datasets/drop/drop_llm_judge_gen.py | 4 + opencompass/configs/datasets/gpqa/gpqa_gen.py | 2 +- .../datasets/gpqa/gpqa_llm_judge_gen.py | 4 + .../datasets/hellaswag/hellaswag_gen.py | 2 +- .../hellaswag/hellaswag_llm_judge_gen.py | 4 + .../datasets/humaneval/humaneval_gen.py | 2 +- .../configs/datasets/korbench/korbench_gen.py | 4 + .../korbench/korbench_llm_judge_gen.py | 4 + .../korbench/korbench_llmjudge_gen_56cf43.py | 30 +- .../korbench/korbench_single_0_shot_gen.py | 2 +- ..._single_0shot_genericllmeval_gen_17854d.py | 31 +- .../livecodebench/livecodebench_gen.py | 2 +- .../math/math_prm800k_500_0shot_cot_gen.py | 5 +- ...0_0shot_nocot_genericllmeval_gen_6ff468.py | 96 ++++ .../datasets/math/math_prm800k_500_gen.py | 38 +- .../math/math_prm800k_500_gen_393424.py | 36 ++ .../math/math_prm800k_500_llm_judge_gen.py | 4 + opencompass/configs/datasets/mmlu/mmlu_gen.py | 2 +- .../datasets/mmlu/mmlu_llm_judge_gen.py | 4 + .../configs/datasets/mmlu_pro/mmlu_pro_gen.py | 4 + .../mmlu_pro/mmlu_pro_llm_judge_gen.py | 4 + .../datasets/musr/musr_llm_judge_gen.py | 4 + tools/update_dataset_suffix.py | 40 +- 41 files changed, 813 insertions(+), 252 deletions(-) create mode 100644 opencompass/configs/datasets/aime2024/aime2024_gen_17d799.py create mode 100644 opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py create mode 100644 opencompass/configs/datasets/bbh/bbh_gen_ee62e9.py create mode 100644 opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py create mode 100644 opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py create mode 100644 opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py create mode 100644 opencompass/configs/datasets/drop/drop_llm_judge_gen.py create mode 100644 opencompass/configs/datasets/gpqa/gpqa_llm_judge_gen.py create mode 100644 opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py create mode 100644 opencompass/configs/datasets/korbench/korbench_gen.py create mode 100644 opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py create mode 100644 opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_genericllmeval_gen_6ff468.py create mode 100644 opencompass/configs/datasets/math/math_prm800k_500_gen_393424.py create mode 100644 opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py create mode 100644 opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py create mode 100644 opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py create mode 100644 opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py create mode 100644 opencompass/configs/datasets/musr/musr_llm_judge_gen.py diff --git a/README.md b/README.md index 4a29f2b7..f3c6028a 100644 --- a/README.md +++ b/README.md @@ -286,6 +286,8 @@ We have supported a statistical list of all datasets that can be used on this pl You can quickly find the dataset you need from the list through sorting, filtering, 
and searching functions. +In addition, we provide a recommended configuration for each dataset, and some datasets also support LLM Judge-based configurations. + Please refer to the dataset statistics chapter of [official document](https://opencompass.org.cn/doc) for details.

🔝Back to top
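The README addition above refers to the recommended per-dataset configs indexed in `dataset-index.yml`. As a rough, non-authoritative sketch of how one of those recommended configs is usually pulled into an OpenCompass run config — assuming the standard `*_datasets` / `models` naming convention, and reusing the QwQ-32B model config added earlier in this patch series purely as an example:

```python
# Minimal sketch (assumptions noted above): combine a recommended dataset
# config with a model config via mmengine's read_base mechanism.
from mmengine.config import read_base

with read_base():
    # Recommended GSM8K config, as listed under `configpath` in dataset-index.yml
    from opencompass.configs.datasets.gsm8k.gsm8k_gen import gsm8k_datasets
    # Any model config works; this one is added earlier in this patch series
    from opencompass.configs.models.qwq.lmdeploy_qwq_32b import models

datasets = gsm8k_datasets
```

Such a config file is then passed to the usual entry point (for example `python run.py <config>.py`).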

diff --git a/README_zh-CN.md b/README_zh-CN.md index e1bc6f7f..a4ef743f 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -281,6 +281,8 @@ OpenCompass 是面向大模型评测的一站式平台。其主要特点如下 您可以通过排序、筛选和搜索等功能从列表中快速找到您需要的数据集。 +另外,我们为每个数据集都提供了一种推荐配置,部分数据集还支持了基于LLM Judge的配置。 + 详情请参阅 [官方文档](https://opencompass.org.cn/doc) 的数据集统计章节。

🔝返回顶部
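For the `configpath_llmjudge` entries introduced in `dataset-index.yml` below, the corresponding configs (like the korbench and math ones added above) build a `GenericLLMEvaluator` whose `judge_cfg` starts out as an empty dict. The sketch below shows one plausible way to supply a judge model; the imported dataset variable name and the judge wiring are assumptions based on that structure, not something verified against the new files:

```python
# Minimal sketch (assumptions noted above): run an LLM-Judge-based dataset
# config and fill in a judge model for each GenericLLMEvaluator.
from mmengine.config import read_base

with read_base():
    # LLM-Judge-based MMLU config listed under `configpath_llmjudge`
    from opencompass.configs.datasets.mmlu.mmlu_llm_judge_gen import mmlu_datasets
    # Reuse any chat model config as the judge; this choice is illustrative
    from opencompass.configs.models.qwq.lmdeploy_qwq_32b import models as judge_models

datasets = mmlu_datasets
for ds in datasets:
    # Each eval_cfg carries an evaluator with an (initially empty) judge_cfg
    ds['eval_cfg']['evaluator']['judge_cfg'] = judge_models[0]
```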

diff --git a/dataset-index.yml b/dataset-index.yml index dc50f396..59b6d4da 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -1,751 +1,987 @@ - - ifeval: name: IFEval category: Instruction Following paper: https://arxiv.org/pdf/2311.07911 - configpath: opencompass/configs/datasets/IFEval + configpath: opencompass/configs/datasets/IFEval/IFEval_gen.py + configpath_llmjudge: '' - nphard: name: NPHardEval category: Reasoning paper: https://arxiv.org/pdf/2312.14890v2 - configpath: opencompass/configs/datasets/NPHardEval + configpath: opencompass/configs/datasets/NPHardEval/NPHardEval_gen.py + configpath_llmjudge: '' - pmmeval: name: PMMEval category: Language paper: https://arxiv.org/pdf/2411.09116v1 - configpath: opencompass/configs/datasets/PMMEval + configpath: opencompass/configs/datasets/PMMEval/pmmeval_gen.py + configpath_llmjudge: '' - theoremqa: name: TheroremQA category: Reasoning paper: https://arxiv.org/pdf/2305.12524 - configpath: opencompass/configs/datasets/TheroremQA + configpath: opencompass/configs/datasets/TheroremQA/TheoremQA_gen.py + configpath_llmjudge: '' - agieval: name: AGIEval category: Examination paper: https://arxiv.org/pdf/2304.06364 - configpath: opencompass/configs/datasets/agieval + configpath: opencompass/configs/datasets/agieval/agieval_gen.py + configpath_llmjudge: '' - babilong: name: BABILong category: Long Context paper: https://arxiv.org/pdf/2406.10149 configpath: opencompass/configs/datasets/babilong + configpath_llmjudge: '' - bigcodebench: name: BigCodeBench category: Code paper: https://arxiv.org/pdf/2406.15877 - configpath: opencompass/configs/datasets/bigcodebench + configpath: opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py + configpath_llmjudge: '' - calm: name: CaLM category: Reasoning paper: https://arxiv.org/pdf/2405.00622 - configpath: opencompass/configs/datasets/calm + configpath: opencompass/configs/datasets/calm/calm.py + configpath_llmjudge: '' - infinitebench: name: InfiniteBench (∞Bench) category: Long Context paper: https://aclanthology.org/2024.acl-long.814.pdf - configpath: opencompass/configs/datasets/infinitebench + configpath: opencompass/configs/datasets/infinitebench/infinitebench.py + configpath_llmjudge: '' - korbench: name: KOR-Bench category: Reasoning paper: https://arxiv.org/pdf/2410.06526v1 - configpath: opencompass/configs/datasets/korbench + configpath: opencompass/configs/datasets/korbench/korbench_gen.py + configpath_llmjudge: opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py - lawbench: name: LawBench category: Knowledge / Law paper: https://arxiv.org/pdf/2309.16289 - configpath: opencompass/configs/datasets/lawbench + configpath: + - opencompass/configs/datasets/lawbench/lawbench_zero_shot_gen_002588.py + - opencompass/configs/datasets/lawbench/lawbench_one_shot_gen_002588.py + configpath_llmjudge: '' - leval: name: L-Eval category: Long Context paper: https://arxiv.org/pdf/2307.11088v1 - configpath: opencompass/configs/datasets/leval + configpath: opencompass/configs/datasets/leval/leval.py + configpath_llmjudge: '' - livecodebench: name: LiveCodeBench category: Code paper: https://arxiv.org/pdf/2403.07974 - configpath: opencompass/configs/datasets/livecodebench + configpath: opencompass/configs/datasets/livecodebench/livecodebench_gen.py + configpath_llmjudge: '' - livemathbench: name: LiveMathBench category: Math paper: https://arxiv.org/pdf/2412.13147 - configpath: opencompass/configs/datasets/livemathbench + configpath: 
opencompass/configs/datasets/livemathbench/livemathbench_gen.py + configpath_llmjudge: '' +- livereasonbench: + name: LiveReasonBench + category: Reasoning + paper: '' + configpath: opencompass/configs/datasets/livereasonbench/livereasonbench_gen.py + configpath_llmjudge: '' - longbench: name: LongBench category: Long Context paper: https://github.com/THUDM/LongBench - configpath: opencompass/configs/datasets/livemathbench + configpath: + - opencompass/configs/datasets/longbench/longbench.py + - opencompass/configs/datasets/longbenchv2/longbenchv2_gen.py + configpath_llmjudge: '' - lveval: name: LV-Eval category: Long Context paper: https://arxiv.org/pdf/2402.05136 - configpath: opencompass/configs/datasets/lveval + configpath: opencompass/configs/datasets/lveval/lveval.py + configpath_llmjudge: '' +- mastermath2024v1: + name: Mastermath2024v1 + category: Math + paper: '' + configpath: opencompass/configs/datasets/mastermath2024v1/mastermath2024v1_gen.py + configpath_llmjudge: '' - medbench: name: MedBench category: Knowledge / Medicine paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138 - configpath: opencompass/configs/datasets/MedBench + configpath: opencompass/configs/datasets/MedBench/medbench_gen.py + configpath_llmjudge: '' - musr: name: MuSR category: Reasoning paper: https://arxiv.org/pdf/2310.16049 - configpath: opencompass/configs/datasets/musr + configpath: opencompass/configs/datasets/musr/musr_gen.py + configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py - needlebench: name: NeedleBench category: Long Context paper: https://arxiv.org/pdf/2407.11963 configpath: opencompass/configs/datasets/needlebench + configpath_llmjudge: '' - ruler: name: RULER category: Long Context paper: https://arxiv.org/pdf/2404.06654 configpath: opencompass/configs/datasets/ruler + configpath_llmjudge: '' - alignment: name: AlignBench category: Subjective / Alignment paper: https://arxiv.org/pdf/2311.18743 configpath: opencompass/configs/datasets/subjective/alignbench + configpath_llmjudge: '' - alpaca: name: AlpacaEval category: Subjective / Instruction Following paper: https://github.com/tatsu-lab/alpaca_eval configpath: opencompass/configs/datasets/subjective/aplaca_eval + configpath_llmjudge: '' - arenahard: name: Arena-Hard category: Subjective / Chatbot paper: https://lmsys.org/blog/2024-04-19-arena-hard/ configpath: opencompass/configs/datasets/subjective/arena_hard + configpath_llmjudge: '' - flames: name: FLAMES category: Subjective / Alignment paper: https://arxiv.org/pdf/2311.06899 - configpath: opencompass/configs/datasets/subjective/flames + configpath: opencompass/configs/datasets/subjective/flames/flames_gen.py + configpath_llmjudge: '' - fofo: name: FOFO category: Subjective / Format Following paper: https://arxiv.org/pdf/2402.18667 configpath: opencompass/configs/datasets/subjective/fofo + configpath_llmjudge: '' - followbench: name: FollowBench category: Subjective / Instruction Following paper: https://arxiv.org/pdf/2310.20410 configpath: opencompass/configs/datasets/subjective/followbench + configpath_llmjudge: '' - hellobench: name: HelloBench category: Subjective / Long Context paper: https://arxiv.org/pdf/2409.16191 configpath: opencompass/configs/datasets/subjective/hellobench + configpath_llmjudge: '' - judgerbench: name: JudgerBench category: Subjective / Long Context paper: https://arxiv.org/pdf/2410.16256 configpath: opencompass/configs/datasets/subjective/judgerbench + configpath_llmjudge: '' - multiround: name: MT-Bench-101 
category: Subjective / Multi-Round paper: https://arxiv.org/pdf/2402.14762 configpath: opencompass/configs/datasets/subjective/multiround + configpath_llmjudge: '' - wildbench: name: WildBench category: Subjective / Real Task paper: https://arxiv.org/pdf/2406.04770 configpath: opencompass/configs/datasets/subjective/wildbench + configpath_llmjudge: '' - teval: name: T-Eval category: Tool Utilization paper: https://arxiv.org/pdf/2312.14033 - configpath: opencompass/configs/datasets/teval + configpath: + - opencompass/configs/datasets/teval/teval_en_gen.py + - opencompass/configs/datasets/teval/teval_zh_gen.py + configpath_llmjudge: '' - finalceiq: name: FinanceIQ category: Knowledge / Finance paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ - configpath: opencompass/configs/datasets/FinanceIQ + configpath: opencompass/configs/datasets/FinanceIQ/FinanceIQ_gen.py + configpath_llmjudge: '' - gaokaobench: name: GAOKAOBench category: Examination paper: https://arxiv.org/pdf/2305.12474 - configpath: opencompass/configs/datasets/GaokaoBench + configpath: opencompass/configs/datasets/GaokaoBench/GaokaoBench_gen.py + configpath_llmjudge: '' - lcbench: name: LCBench category: Code paper: https://github.com/open-compass/CodeBench/ - configpath: opencompass/configs/datasets/LCBench + configpath: opencompass/configs/datasets/LCBench/lcbench_gen.py + configpath_llmjudge: '' - MMLUArabic: name: ArabicMMLU category: Language paper: https://arxiv.org/pdf/2402.12840 - configpath: opencompass/configs/datasets/MMLUArabic + configpath: opencompass/configs/datasets/MMLUArabic/MMLUArabic_gen.py + configpath_llmjudge: '' - OpenFinData: name: OpenFinData category: Knowledge / Finance paper: https://github.com/open-compass/OpenFinData - configpath: opencompass/configs/datasets/OpenFinData + configpath: opencompass/configs/datasets/OpenFinData/OpenFinData_gen.py + configpath_llmjudge: '' - QuALITY: name: QuALITY category: Long Context paper: https://arxiv.org/pdf/2112.08608 - configpath: opencompass/configs/datasets/QuALITY + configpath: opencompass/configs/datasets/QuALITY/QuALITY_gen.py + configpath_llmjudge: '' - advglue: name: Adversarial GLUE category: Safety paper: https://openreview.net/pdf?id=GF9cSKI3A_q - configpath: opencompass/configs/datasets/adv_glue + configpath: + - opencompass/configs/datasets/adv_glue/adv_glue_mnli/adv_glue_mnli_gen.py + - opencompass/configs/datasets/adv_glue/adv_glue_mnli_mm/adv_glue_mnli_mm_gen.py + - opencompass/configs/datasets/adv_glue/adv_glue_mnli_qnli/adv_glue_qnli_gen.py + - opencompass/configs/datasets/adv_glue/adv_glue_mnli_qqp/adv_glue_qqp_gen.py + - opencompass/configs/datasets/adv_glue/adv_glue_mnli_rte/adv_glue_rte_gen.py + - opencompass/configs/datasets/adv_glue/adv_glue_mnli_sst2/adv_glue_sst2_gen.py + configpath_llmjudge: '' - afqmcd: name: CLUE / AFQMC category: Language paper: https://arxiv.org/pdf/2004.05986 - configpath: opencompass/configs/datasets/CLUE_afqmc + configpath: opencompass/configs/datasets/CLUE_afqmc/CLUE_afqmc_gen.py + configpath_llmjudge: '' - aime2024: name: AIME2024 category: Examination paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024 - configpath: opencompass/configs/datasets/aime2024 + configpath: opencompass/configs/datasets/aime2024/aime2024_gen.py + configpath_llmjudge: opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py - anli: name: Adversarial NLI category: Reasoning paper: https://arxiv.org/pdf/1910.14599v2 - configpath: opencompass/configs/datasets/anli + configpath: 
opencompass/configs/datasets/anli/anli_gen.py + configpath_llmjudge: '' - anthropics_evals: name: Anthropics Evals category: Safety paper: https://arxiv.org/pdf/2212.09251 - configpath: opencompass/configs/datasets/anthropics_evals + configpath: + - opencompass/configs/datasets/anthropics_evals/airisk_gen.py + - opencompass/configs/datasets/anthropics_evals/persona_gen.py + - opencompass/configs/datasets/anthropics_evals/sycophancy_gen.py + configpath_llmjudge: '' - apps: name: APPS category: Code paper: https://arxiv.org/pdf/2105.09938 - configpath: opencompass/configs/datasets/apps + configpath: + - opencompass/configs/datasets/apps/apps_gen.py + - opencompass/configs/datasets/apps/apps_mini_gen.py + configpath_llmjudge: '' - arc: name: ARC category: Reasoning paper: https://arxiv.org/pdf/1803.05457 - configpath: [opencompass/configs/datasets/ARC_c, opencompass/configs/datasets/ARC_e] + configpath: + - opencompass/configs/datasets/ARC_c/ARC_c_gen.py + - opencompass/configs/datasets/ARC_e/ARC_e_gen.py + configpath_llmjudge: '' - arc_prize_public_eval: name: ARC Prize category: ARC-AGI paper: https://arcprize.org/guide#private - configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation + configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation/arc_prize_public_evaluation_gen.py + configpath_llmjudge: '' - ax: name: SuperGLUE / AX category: Reasoning paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: [opencompass/configs/datasets/SuperGLUE_AX_b, opencompass/configs/datasets/SuperGLUE_AX_g] + configpath: + - opencompass/configs/datasets/SuperGLUE_AX_b/SuperGLUE_AX_b_gen.py + - opencompass/configs/datasets/SuperGLUE_AX_g/SuperGLUE_AX_g_gen.py + configpath_llmjudge: '' - bbh: name: BIG-Bench Hard category: Reasoning paper: https://arxiv.org/pdf/2210.09261 - configpath: opencompass/configs/datasets/bbh + configpath: opencompass/configs/datasets/bbh/bbh_gen.py + configpath_llmjudge: opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py - bbeh: name: BIG-Bench Extra Hard category: Reasoning paper: https://arxiv.org/abs/2502.19187 configpath: opencompass/configs/datasets/bbeh + configpath_llmjudge: '' - BoolQ: name: SuperGLUE / BoolQ category: Knowledge paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_BoolQ + configpath: opencompass/configs/datasets/SuperGLUE_BoolQ/SuperGLUE_BoolQ_gen.py + configpath_llmjudge: '' - c3: name: CLUE / C3 (C³) category: Understanding paper: https://arxiv.org/pdf/2004.05986 - configpath: opencompass/configs/datasets/CLUE_C3 + configpath: opencompass/configs/datasets/CLUE_C3/CLUE_C3_gen.py + configpath_llmjudge: '' - cb: name: SuperGLUE / CB category: Reasoning paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_CB + configpath: opencompass/configs/datasets/SuperGLUE_CB/SuperGLUE_CB_gen.py + configpath_llmjudge: '' - ceval: name: C-EVAL category: Examination paper: https://arxiv.org/pdf/2305.08322v1 - configpath: opencompass/configs/datasets/ceval + configpath: opencompass/configs/datasets/ceval/ceval_gen.py + configpath_llmjudge: '' - charm: name: CHARM category: Reasoning paper: https://arxiv.org/pdf/2403.14112 - configpath: opencompass/configs/datasets/CHARM + configpath: opencompass/configs/datasets/CHARM/charm_reason_gen.py + configpath_llmjudge: '' - 
chembench: name: ChemBench category: Knowledge / Chemistry paper: https://arxiv.org/pdf/2404.01475 - configpath: opencompass/configs/datasets/ChemBench + configpath: opencompass/configs/datasets/ChemBench/ChemBench_gen.py + configpath_llmjudge: '' - chid: name: FewCLUE / CHID category: Language paper: https://arxiv.org/pdf/2107.07498 - configpath: opencompass/configs/datasets/FewCLUE_chid + configpath: opencompass/configs/datasets/FewCLUE_chid/FewCLUE_chid_gen.py + configpath_llmjudge: '' - chinese_simpleqa: name: Chinese SimpleQA category: Knowledge paper: https://arxiv.org/pdf/2411.07140 - configpath: opencompass/configs/datasets/chinese_simpleqa + configpath: opencompass/configs/datasets/chinese_simpleqa/chinese_simpleqa_gen.py + configpath_llmjudge: '' - cibench: name: CIBench category: Code paper: https://www.arxiv.org/pdf/2407.10499 - configpath: opencompass/configs/datasets/CIBench + configpath: + - opencompass/configs/datasets/CIBench/CIBench_generation_gen_8ab0dc.py + - opencompass/configs/datasets/CIBench/CIBench_template_gen_e6b12a.py + - opencompass/configs/datasets/CIBench/CIBench_template_oracle_gen_fecda1.py + configpath_llmjudge: '' - civilcomments: name: CivilComments category: Safety paper: https://arxiv.org/pdf/1903.04561 - configpath: opencompass/configs/datasets/civilcomments + configpath: opencompass/configs/datasets/civilcomments/civilcomments_clp.py + configpath_llmjudge: '' - clozeTest_maxmin: name: Cloze Test-max/min category: Code paper: https://arxiv.org/pdf/2102.04664 - configpath: opencompass/configs/datasets/clozeTest_maxmin + configpath: opencompass/configs/datasets/clozeTest_maxmin/clozeTest_maxmin_gen.py + configpath_llmjudge: '' - cluewsc: name: FewCLUE / CLUEWSC category: Language / WSC paper: https://arxiv.org/pdf/2107.07498 - configpath: opencompass/configs/datasets/FewCLUE_cluewsc + configpath: opencompass/configs/datasets/FewCLUE_cluewsc/FewCLUE_cluewsc_gen.py + configpath_llmjudge: '' - cmb: name: CMB category: Knowledge / Medicine paper: https://arxiv.org/pdf/2308.08833 - configpath: opencompass/configs/datasets/cmb + configpath: opencompass/configs/datasets/cmb/cmb_gen.py + configpath_llmjudge: '' - cmmlu: name: CMMLU category: Understanding paper: https://arxiv.org/pdf/2306.09212 - configpath: opencompass/configs/datasets/cmmlu + configpath: opencompass/configs/datasets/cmmlu/cmmlu_gen.py + configpath_llmjudge: opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py - cmnli: name: CLUE / CMNLI category: Reasoning paper: https://arxiv.org/pdf/2004.05986 - configpath: opencompass/configs/datasets/CLUE_cmnli + configpath: opencompass/configs/datasets/CLUE_cmnli/CLUE_cmnli_gen.py + configpath_llmjudge: '' - cmo_fib: name: cmo_fib category: Examination - paper: "" - configpath: opencompass/configs/datasets/cmo_fib + paper: '' + configpath: opencompass/configs/datasets/cmo_fib/cmo_fib_gen.py + configpath_llmjudge: '' - cmrc: name: CLUE / CMRC category: Understanding paper: https://arxiv.org/pdf/2004.05986 - configpath: opencompass/configs/datasets/CLUE_CMRC + configpath: opencompass/configs/datasets/CLUE_CMRC/CLUE_CMRC_gen.py + configpath_llmjudge: '' - commonsenseqa: name: CommonSenseQA category: Knowledge paper: https://arxiv.org/pdf/1811.00937v2 - configpath: opencompass/configs/datasets/commonsenseqa + configpath: opencompass/configs/datasets/commonsenseqa/commonsenseqa_gen.py + configpath_llmjudge: '' - commonsenseqa_cn: name: CommonSenseQA-CN category: Knowledge - paper: "" - configpath: opencompass/configs/datasets/commonsenseqa_cn + paper: '' 
+ configpath: opencompass/configs/datasets/commonsenseqa_cn/commonsenseqacn_gen.py + configpath_llmjudge: '' - copa: name: SuperGLUE / COPA category: Reasoning paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_COPA + configpath: opencompass/configs/datasets/SuperGLUE_COPA/SuperGLUE_COPA_gen.py + configpath_llmjudge: '' - crowspairs: name: CrowsPairs category: Safety paper: https://arxiv.org/pdf/2010.00133 - configpath: opencompass/configs/datasets/crowspairs + configpath: opencompass/configs/datasets/crowspairs/crowspairs_gen.py + configpath_llmjudge: '' - crowspairs_cn: name: CrowsPairs-CN category: Safety - paper: "" - configpath: opencompass/configs/datasets/crowspairs_cn + paper: '' + configpath: opencompass/configs/datasets/crowspairs_cn/crowspairscn_gen.py + configpath_llmjudge: '' - cvalues: name: CVALUES category: Safety paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf - configpath: opencompass/configs/datasets/cvalues + configpath: opencompass/configs/datasets/cvalues/cvalues_responsibility_gen.py + configpath_llmjudge: '' - drcd: name: CLUE / DRCD category: Understanding paper: https://arxiv.org/pdf/2004.05986 - configpath: opencompass/configs/datasets/CLUE_DRCD + configpath: opencompass/configs/datasets/CLUE_DRCD/CLUE_DRCD_gen.py + configpath_llmjudge: '' - drop: name: DROP (DROP Simple Eval) category: Understanding paper: https://arxiv.org/pdf/1903.00161 - configpath: opencompass/configs/datasets/drop + configpath: opencompass/configs/datasets/drop/drop_gen.py + configpath_llmjudge: opencompass/configs/datasets/drop/drop_llm_judge_gen.py - ds1000: name: DS-1000 category: Code paper: https://arxiv.org/pdf/2211.11501 - configpath: opencompass/configs/datasets/ds1000 + configpath: + - opencompass/configs/datasets/ds1000/ds1000_gen_5c4bec.py + configpath_llmjudge: '' - eprstmt: name: FewCLUE / EPRSTMT category: Understanding paper: https://arxiv.org/pdf/2107.07498 - configpath: opencompass/configs/datasets/FewCLUE_eprstmt + configpath: opencompass/configs/datasets/FewCLUE_eprstmt/FewCLUE_eprstmt_gen.py + configpath_llmjudge: '' - flores: name: Flores category: Language paper: https://aclanthology.org/D19-1632.pdf - configpath: opencompass/configs/datasets/flores + configpath: opencompass/configs/datasets/flores/flores_gen.py + configpath_llmjudge: '' - game24: name: Game24 category: Math paper: https://huggingface.co/datasets/nlile/24-game - configpath: opencompass/configs/datasets/game24 + configpath: opencompass/configs/datasets/game24/game24_gen.py + configpath_llmjudge: '' - govrepcrs: name: Government Report Dataset category: Long Context paper: https://aclanthology.org/2021.naacl-main.112.pdf - configpath: opencompass/configs/datasets/govrepcrs + configpath: opencompass/configs/datasets/govrepcrs/govrepcrs_gen.py + configpath_llmjudge: '' - gpqa: name: GPQA category: Knowledge paper: https://arxiv.org/pdf/2311.12022v1 - configpath: opencompass/configs/datasets/gpqa + configpath: opencompass/configs/datasets/gpqa/gpqa_gen.py + configpath_llmjudge: opencompass/configs/datasets/gpqa/gpqa_llm_judge_gen.py - gsm8k: name: GSM8K category: Math paper: https://arxiv.org/pdf/2110.14168v2 - configpath: opencompass/configs/datasets/gsm8k + configpath: opencompass/configs/datasets/gsm8k/gsm8k_gen.py + configpath_llmjudge: '' - gsm_hard: name: GSM-Hard category: Math paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf - 
configpath: opencompass/configs/datasets/gsm_hard + configpath: opencompass/configs/datasets/gsm_hard/gsmhard_gen.py + configpath_llmjudge: '' - hle: name: HLE(Humanity's Last Exam) category: Reasoning paper: https://lastexam.ai/paper - configpath: opencompass/configs/datasets/HLE + configpath: opencompass/configs/datasets/HLE/hle_gen.py + configpath_llmjudge: '' - hellaswag: name: HellaSwag category: Reasoning paper: https://arxiv.org/pdf/1905.07830 - configpath: opencompass/configs/datasets/hellaswag + configpath: opencompass/configs/datasets/hellaswag/hellaswag_gen.py + configpath_llmjudge: opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py - humaneval: name: HumanEval category: Code paper: https://arxiv.org/pdf/2107.03374v2 - configpath: opencompass/configs/datasets/humaneval + configpath: opencompass/configs/datasets/humaneval/humaneval_gen.py + configpath_llmjudge: '' - humaneval_cn: name: HumanEval-CN category: Code - paper: "" - configpath: opencompass/configs/datasets/humaneval_cn + paper: '' + configpath: opencompass/configs/datasets/humaneval_cn/humaneval_cn_gen.py + configpath_llmjudge: '' - humaneval_multi: name: Multi-HumanEval category: Code paper: https://arxiv.org/pdf/2210.14868 - configpath: opencompass/configs/datasets/humaneval_multi + configpath: opencompass/configs/datasets/humaneval_multi/humaneval_multi_gen.py + configpath_llmjudge: '' +- humaneval_multi: + name: HumanEval+ + category: Code + paper: https://arxiv.org/pdf/2305.01210 + configpath: opencompass/configs/datasets/humaneval_plus/humaneval_plus_gen.py + configpath_llmjudge: '' - humanevalx: name: HumanEval-X category: Code paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790 - configpath: opencompass/configs/datasets/humanevalx + configpath: opencompass/configs/datasets/humanevalx/humanevalx_gen.py + configpath_llmjudge: '' - hungarian_math: name: Hungarian_Math category: Math paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam - configpath: opencompass/configs/datasets/hungarian_exam + configpath: opencompass/configs/datasets/hungarian_exam/hungarian_exam_gen.py + configpath_llmjudge: '' - iwslt2017: name: IWSLT2017 category: Language paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf - configpath: opencompass/configs/datasets/iwslt2017 + configpath: opencompass/configs/datasets/iwslt2017/iwslt2017_gen.py + configpath_llmjudge: '' - jigsawmultilingual: name: JigsawMultilingual category: Safety paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data - configpath: opencompass/configs/datasets/jigsawmultilingual + configpath: opencompass/configs/datasets/jigsawmultilingual/jigsawmultilingual_clp.py + configpath_llmjudge: '' - lambada: name: LAMBADA category: Understanding paper: https://arxiv.org/pdf/1606.06031 - configpath: opencompass/configs/datasets/lambada + configpath: opencompass/configs/datasets/lambada/lambada_gen.py + configpath_llmjudge: '' - lcsts: name: LCSTS category: Understanding paper: https://aclanthology.org/D15-1229.pdf - configpath: opencompass/configs/datasets/lcsts + configpath: opencompass/configs/datasets/lcsts/lcsts_gen.py + configpath_llmjudge: '' - livestembench: name: LiveStemBench - category: "" - paper: "" - configpath: opencompass/configs/datasets/livestembench + category: '' + paper: '' + configpath: opencompass/configs/datasets/livestembench/livestembench_gen.py + configpath_llmjudge: '' - llm_compression: name: LLM Compression category: Bits Per Character (BPC) 
paper: https://arxiv.org/pdf/2404.09937 - configpath: opencompass/configs/datasets/llm_compression + configpath: opencompass/configs/datasets/llm_compression/llm_compression.py + configpath_llmjudge: '' - math: name: MATH category: Math paper: https://arxiv.org/pdf/2103.03874 configpath: opencompass/configs/datasets/math + configpath_llmjudge: '' +- math500: + name: MATH500 + category: Math + paper: https://github.com/openai/prm800k + configpath: opencompass/configs/datasets/math/math_prm800k_500_gen.py + configpath_llmjudge: opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py - math401: name: MATH 401 category: Math paper: https://arxiv.org/pdf/2304.02015 - configpath: opencompass/configs/datasets/math401 + configpath: opencompass/configs/datasets/math401/math401_gen.py + configpath_llmjudge: '' - mathbench: name: MathBench category: Math paper: https://arxiv.org/pdf/2405.12209 - configpath: opencompass/configs/datasets/mathbench + configpath: opencompass/configs/datasets/mathbench/mathbench_gen.py + configpath_llmjudge: '' - mbpp: name: MBPP category: Code paper: https://arxiv.org/pdf/2108.07732 - configpath: opencompass/configs/datasets/mbpp + configpath: opencompass/configs/datasets/mbpp/mbpp_gen.py + configpath_llmjudge: '' - mbpp_cn: name: MBPP-CN category: Code - paper: "" - configpath: opencompass/configs/datasets/mbpp_cn + paper: '' + configpath: opencompass/configs/datasets/mbpp_cn/mbpp_cn_gen.py + configpath_llmjudge: '' - mbpp_plus: name: MBPP-PLUS category: Code - paper: "" - configpath: opencompass/configs/datasets/mbpp_plus + paper: '' + configpath: opencompass/configs/datasets/mbpp_plus/mbpp_plus_gen.py + configpath_llmjudge: '' - mgsm: name: MGSM category: Language / Math paper: https://arxiv.org/pdf/2210.03057 - configpath: opencompass/configs/datasets/mgsm + configpath: opencompass/configs/datasets/mgsm/mgsm_gen.py + configpath_llmjudge: '' - mmlu: name: MMLU category: Understanding paper: https://arxiv.org/pdf/2009.03300 - configpath: opencompass/configs/datasets/mmlu + configpath: opencompass/configs/datasets/mmlu/mmlu_gen.py + configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py - mmlu_cf: name: MMLU-CF category: Understanding paper: https://arxiv.org/pdf/2412.15194 - configpath: opencompass/configs/datasets/mmlu_cf + configpath: opencompass/configs/datasets/mmlu_cf/mmlu_cf_gen.py + configpath_llmjudge: '' - mmlu_pro: name: MMLU-Pro category: Understanding paper: https://arxiv.org/pdf/2406.01574 - configpath: opencompass/configs/datasets/mmlu_pro + configpath: opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py + configpath_llmjudge: opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py - mmmlu: name: MMMLU category: Language / Understanding paper: https://huggingface.co/datasets/openai/MMMLU - configpath: opencompass/configs/datasets/mmmlu + configpath: + - opencompass/configs/datasets/mmmlu/mmmlu_gen.py + - opencompass/configs/datasets/mmmlu_lite/mmmlu_lite_gen.py + configpath_llmjudge: '' - multirc: name: SuperGLUE / MultiRC category: Understanding paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_MultiRC + configpath: opencompass/configs/datasets/SuperGLUE_MultiRC/SuperGLUE_MultiRC_gen.py + configpath_llmjudge: '' - multipl_e: name: MultiPL-E category: Code paper: https://arxiv.org/pdf/2210.14868 configpath: opencompass/configs/datasets/multipl_e + configpath_llmjudge: '' - narrativeqa: name: 
NarrativeQA category: Understanding paper: https://github.com/google-deepmind/narrativeqa - configpath: opencompass/configs/datasets/narrativeqa + configpath: opencompass/configs/datasets/narrativeqa/narrativeqa_gen.py + configpath_llmjudge: '' - natural_question: name: NaturalQuestions category: Knowledge paper: https://github.com/google-research-datasets/natural-questions - configpath: opencompass/configs/datasets/nq + configpath: opencompass/configs/datasets/nq/nq_gen.py + configpath_llmjudge: '' - natural_question_cn: name: NaturalQuestions-CN category: Knowledge - paper: "" - configpath: opencompass/configs/datasets/nq_cn + paper: '' + configpath: opencompass/configs/datasets/nq_cn/nqcn_gen.py + configpath_llmjudge: '' - obqa: name: OpenBookQA category: Knowledge paper: https://arxiv.org/pdf/1809.02789v1 - configpath: opencompass/configs/datasets/obqa + configpath: opencompass/configs/datasets/obqa/obqa_gen.py + configpath_llmjudge: '' - piqa: name: OpenBookQA category: Knowledge / Physics paper: https://arxiv.org/pdf/1911.11641v1 - configpath: opencompass/configs/datasets/piqa + configpath: opencompass/configs/datasets/piqa/piqa_gen.py + configpath_llmjudge: '' - py150: name: py150 category: Code paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line - configpath: opencompass/configs/datasets/py150 + configpath: opencompass/configs/datasets/py150/py150_gen.py + configpath_llmjudge: '' - qasper: name: Qasper category: Long Context paper: https://arxiv.org/pdf/2105.03011 - configpath: opencompass/configs/datasets/qasper + configpath: opencompass/configs/datasets/qasper/qasper_gen.py + configpath_llmjudge: '' - qaspercut: name: Qasper-Cut category: Long Context - paper: "" - configpath: opencompass/configs/datasets/qaspercut + paper: '' + configpath: opencompass/configs/datasets/qaspercut/qaspercut_gen.py + configpath_llmjudge: '' - race: name: RACE category: Examination paper: https://arxiv.org/pdf/1704.04683 - configpath: opencompass/configs/datasets/race + configpath: opencompass/configs/datasets/race/race_gen.py + configpath_llmjudge: '' - realtoxicprompts: name: RealToxicPrompts category: Safety paper: https://arxiv.org/pdf/2009.11462 - configpath: opencompass/configs/datasets/realtoxicprompts + configpath: opencompass/configs/datasets/realtoxicprompts/realtoxicprompts_gen.py + configpath_llmjudge: '' - record: name: SuperGLUE / ReCoRD category: Understanding paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD + configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD/SuperGLUE_ReCoRD_gen.py + configpath_llmjudge: '' - rte: name: SuperGLUE / RTE category: Reasoning paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_RTE + configpath: opencompass/configs/datasets/SuperGLUE_RTE/SuperGLUE_RTE_gen.py + configpath_llmjudge: '' - ocnli: name: CLUE / OCNLI category: Reasoning paper: https://arxiv.org/pdf/2004.05986 - configpath: opencompass/configs/datasets/CLUE_ocnli + configpath: opencompass/configs/datasets/CLUE_ocnli/CLUE_ocnli_gen.py + configpath_llmjudge: '' +- ocnlifc: + name: FewCLUE / OCNLI-FC + category: Reasoning + paper: https://arxiv.org/pdf/2107.07498 + configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc/FewCLUE_ocnli_fc_gen.py + configpath_llmjudge: '' - rolebench: name: RoleBench category: Role Play paper: 
https://arxiv.org/pdf/2310.00746 configpath: opencompass/configs/datasets/rolebench + configpath_llmjudge: '' - s3eval: name: S3Eval category: Long Context paper: https://aclanthology.org/2024.naacl-long.69.pdf - configpath: opencompass/configs/datasets/s3eval + configpath: opencompass/configs/datasets/s3eval/s3eval_gen.py + configpath_llmjudge: '' - scibench: name: SciBench category: Reasoning paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf - configpath: opencompass/configs/datasets/scibench + configpath: opencompass/configs/datasets/scibench/scibench_gen.py + configpath_llmjudge: '' - scicode: name: SciCode category: Code paper: https://arxiv.org/pdf/2407.13168 - configpath: opencompass/configs/datasets/scicode + configpath: opencompass/configs/datasets/scicode/scicode_gen.py + configpath_llmjudge: '' - simpleqa: name: SimpleQA category: Knowledge paper: https://arxiv.org/pdf/2411.04368 - configpath: opencompass/configs/datasets/SimpleQA + configpath: opencompass/configs/datasets/SimpleQA/simpleqa_gen.py + configpath_llmjudge: '' - siqa: name: SocialIQA category: Reasoning paper: https://arxiv.org/pdf/1904.09728 - configpath: opencompass/configs/datasets/siqa + configpath: opencompass/configs/datasets/siqa/siqa_gen.py + configpath_llmjudge: '' - squad20: name: SQuAD2.0 category: Understanding paper: https://arxiv.org/pdf/1806.03822 - configpath: opencompass/configs/datasets/squad20 + configpath: opencompass/configs/datasets/squad20/squad20_gen.py + configpath_llmjudge: '' - storycloze: name: StoryCloze category: Reasoning paper: https://aclanthology.org/2022.emnlp-main.616.pdf - configpath: opencompass/configs/datasets/storycloze + configpath: opencompass/configs/datasets/storycloze/storycloze_gen.py + configpath_llmjudge: '' - strategyqa: name: StrategyQA category: Reasoning paper: https://arxiv.org/pdf/2101.02235 - configpath: opencompass/configs/datasets/strategyqa + configpath: opencompass/configs/datasets/strategyqa/strategyqa_gen.py + configpath_llmjudge: '' - summedits: name: SummEdits category: Language paper: https://aclanthology.org/2023.emnlp-main.600.pdf - configpath: opencompass/configs/datasets/summedits + configpath: opencompass/configs/datasets/summedits/summedits_gen.py + configpath_llmjudge: '' - summscreen: name: SummScreen category: Understanding paper: https://arxiv.org/pdf/2104.07091v1 - configpath: opencompass/configs/datasets/summscreen + configpath: opencompass/configs/datasets/summscreen/summscreen_gen.py + configpath_llmjudge: '' - svamp: name: SVAMP category: Math paper: https://aclanthology.org/2021.naacl-main.168.pdf - configpath: opencompass/configs/datasets/SVAMP + configpath: opencompass/configs/datasets/SVAMP/svamp_gen.py + configpath_llmjudge: '' - tabmwp: name: TabMWP category: Math / Table paper: https://arxiv.org/pdf/2209.14610 - configpath: opencompass/configs/datasets/TabMWP + configpath: opencompass/configs/datasets/TabMWP/TabMWP_gen.py + configpath_llmjudge: '' - taco: name: TACO category: Code paper: https://arxiv.org/pdf/2312.14852 - configpath: opencompass/configs/datasets/taco + configpath: opencompass/configs/datasets/taco/taco_gen.py + configpath_llmjudge: '' - tnews: name: FewCLUE / TNEWS category: Understanding paper: https://arxiv.org/pdf/2107.07498 - configpath: opencompass/configs/datasets/FewCLUE_tnews + configpath: opencompass/configs/datasets/FewCLUE_tnews/FewCLUE_tnews_gen.py + configpath_llmjudge: '' - bustm: name: FewCLUE / BUSTM category: Reasoning paper: https://arxiv.org/pdf/2107.07498 - 
configpath: opencompass/configs/datasets/FewCLUE_bustm + configpath: opencompass/configs/datasets/FewCLUE_bustm/FewCLUE_bustm_gen.py + configpath_llmjudge: '' - csl: name: FewCLUE / CSL category: Understanding paper: https://arxiv.org/pdf/2107.07498 - configpath: opencompass/configs/datasets/FewCLUE_csl + configpath: opencompass/configs/datasets/FewCLUE_csl/FewCLUE_csl_gen.py + configpath_llmjudge: '' - ocnli_fc: name: FewCLUE / OCNLI-FC category: Reasoning paper: https://arxiv.org/pdf/2107.07498 configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc + configpath_llmjudge: '' - triviaqa: name: TriviaQA category: Knowledge paper: https://arxiv.org/pdf/1705.03551v2 - configpath: opencompass/configs/datasets/triviaqa + configpath: opencompass/configs/datasets/triviaqa/triviaqa_gen.py + configpath_llmjudge: '' - triviaqarc: name: TriviaQA-RC category: Knowledge / Understanding - paper: "" - configpath: opencompass/configs/datasets/triviaqarc + paper: '' + configpath: opencompass/configs/datasets/triviaqarc/triviaqarc_gen.py + configpath_llmjudge: '' - truthfulqa: name: TruthfulQA category: Safety paper: https://arxiv.org/pdf/2109.07958v2 - configpath: opencompass/configs/datasets/truthfulqa + configpath: opencompass/configs/datasets/truthfulqa/truthfulqa_gen.py + configpath_llmjudge: '' - tydiqa: name: TyDi-QA category: Language paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf - configpath: opencompass/configs/datasets/tydiqa + configpath: opencompass/configs/datasets/tydiqa/tydiqa_gen.py + configpath_llmjudge: '' - wic: name: SuperGLUE / WiC category: Language paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_WiC + configpath: opencompass/configs/datasets/SuperGLUE_WiC/SuperGLUE_WiC_gen.py + configpath_llmjudge: '' - wsc: name: SuperGLUE / WSC category: Language / WSC paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf - configpath: opencompass/configs/datasets/SuperGLUE_WSC + configpath: opencompass/configs/datasets/SuperGLUE_WSC/SuperGLUE_WSC_gen.py + configpath_llmjudge: '' - winogrande: name: WinoGrande category: Language / WSC paper: https://arxiv.org/pdf/1907.10641v2 - configpath: opencompass/configs/datasets/winogrande + configpath: opencompass/configs/datasets/winogrande/winogrande_gen.py + configpath_llmjudge: '' - xcopa: name: XCOPA category: Language paper: https://arxiv.org/pdf/2005.00333 - configpath: opencompass/configs/datasets/XCOPA + configpath: opencompass/configs/datasets/XCOPA/XCOPA_ppl.py + configpath_llmjudge: '' - xiezhi: name: Xiezhi category: Knowledge paper: https://arxiv.org/pdf/2306.05783 - configpath: opencompass/configs/datasets/xiezhi + configpath: opencompass/configs/datasets/xiezhi/xiezhi_gen.py + configpath_llmjudge: '' - xlsum: name: XLSum category: Understanding paper: https://arxiv.org/pdf/2106.13822v1 - configpath: opencompass/configs/datasets/XLSum + configpath: opencompass/configs/datasets/XLSum/XLSum_gen.py + configpath_llmjudge: '' - xsum: name: Xsum category: Understanding paper: https://arxiv.org/pdf/1808.08745 - configpath: opencompass/configs/datasets/Xsum + configpath: opencompass/configs/datasets/Xsum/Xsum_gen.py + configpath_llmjudge: '' +- cola: + name: GLUE / CoLA + category: Understanding + paper: https://arxiv.org/pdf/1804.07461 + configpath: opencompass/configs/datasets/GLUE_CoLA/GLUE_CoLA_ppl.py + configpath_llmjudge: '' +- mprc: + name: GLUE / MPRC + category: 
Understanding + paper: https://arxiv.org/pdf/1804.07461 + configpath: opencompass/configs/datasets/GLUE_MRPC/GLUE_MRPC_ppl.py + configpath_llmjudge: '' +- qqp: + name: GLUE / QQP + category: Understanding + paper: https://arxiv.org/pdf/1804.07461 + configpath: opencompass/configs/datasets/GLUE_QQP/GLUE_QQP_ppl.py + configpath_llmjudge: '' +- omni_math: + name: Omni-MATH + category: Math + paper: https://omni-math.github.io/ + configpath: opencompass/configs/datasets/omni_math/omni_math_gen.py + configpath_llmjudge: '' +- wikibench: + name: WikiBench + category: Knowledge + paper: '' + configpath: opencompass/configs/datasets/wikibench/wikibench_gen.py + configpath_llmjudge: '' - supergpqa: name: SuperGPQA category: Knowledge paper: https://arxiv.org/pdf/2502.14739 configpath: opencompass/configs/datasets/supergpqa + configpath_llmjudge: '' diff --git a/docs/en/statis.py b/docs/en/statis.py index a110c631..483ebf78 100755 --- a/docs/en/statis.py +++ b/docs/en/statis.py @@ -14,6 +14,12 @@ On this page, we have listed all the datasets supported by OpenCompass. You can use sorting and search functions to find the dataset you need. +We provide recommended running configurations for each dataset, +and in some datasets also offer recommended configurations based on LLM Judge. + +You can quickly start evaluation tasks based on the recommended configurations. +However, please note that these configurations may be updated over time. + """ with open('dataset_statistics.md', 'w') as f: @@ -24,7 +30,7 @@ load_path = str(OC_ROOT / 'dataset-index.yml') with open(load_path, 'r') as f2: data_list = yaml.load(f2, Loader=yaml.FullLoader) -HEADER = ['name', 'category', 'paper', 'configpath'] +HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge'] def table_format(data_list): @@ -35,6 +41,13 @@ def table_format(data_list): for index in HEADER: if index == 'paper': table_format_list_sub.append('[link](' + i[j][index] + ')') + elif index == 'configpath_llmjudge': + if i[j][index] == '': + table_format_list_sub.append(i[j][index]) + else: + table_format_list_sub.append('[link](' + + GITHUB_PREFIX + + i[j][index] + ')') elif index == 'configpath': if isinstance(i[j][index], list): sub_list_text = '' @@ -61,7 +74,10 @@ def generate_table(data_list, title=None): if title is not None: f.write(f'\n{title}') f.write("""\n```{table}\n:class: dataset\n""") - header = ['Name', 'Category', 'Paper or Repository', 'Config File'] + header = [ + 'Name', 'Category', 'Paper or Repository', 'Recommended Config', + 'Recommended Config (LLM Judge)' + ] table_cfg = dict(tablefmt='pipe', floatfmt='.2f', numalign='right', diff --git a/docs/zh_cn/statis.py b/docs/zh_cn/statis.py index eb5dc7fe..19d03bfd 100755 --- a/docs/zh_cn/statis.py +++ b/docs/zh_cn/statis.py @@ -14,6 +14,10 @@ DATASETZOO_TEMPLATE = """\ 你可以使用排序和搜索功能找到需要的数据集。 +我们对每一个数据集都给出了推荐的运行配置,部分数据集中还提供了基于LLM Judge的推荐配置。 + +你可以基于推荐配置快速启动评测。但请注意,推荐配置可能随时间推移被更新。 + """ with open('dataset_statistics.md', 'w') as f: @@ -24,7 +28,7 @@ load_path = str(OC_ROOT / 'dataset-index.yml') with open(load_path, 'r') as f2: data_list = yaml.load(f2, Loader=yaml.FullLoader) -HEADER = ['name', 'category', 'paper', 'configpath'] +HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge'] def table_format(data_list): @@ -35,6 +39,12 @@ def table_format(data_list): for index in HEADER: if index == 'paper': table_format_list_sub.append('[链接](' + i[j][index] + ')') + elif index == 'configpath_llmjudge': + if i[j][index] == '': + 
table_format_list_sub.append(i[j][index]) + else: + table_format_list_sub.append('[链接](' + GITHUB_PREFIX + + i[j][index] + ')') elif index == 'configpath': if isinstance(i[j][index], list): sub_list_text = '' @@ -60,7 +70,7 @@ def generate_table(data_list, title=None): if title is not None: f.write(f'\n{title}') f.write("""\n```{table}\n:class: dataset\n""") - header = ['数据集名称', '数据集类型', '原文或资源地址', '配置文件链接'] + header = ['数据集名称', '数据集类型', '原文或资源地址', '推荐配置', '推荐配置(基于LLM评估)'] table_cfg = dict(tablefmt='pipe', floatfmt='.2f', numalign='right', diff --git a/opencompass/configs/datasets/IFEval/IFEval_gen.py b/opencompass/configs/datasets/IFEval/IFEval_gen.py index 9103c13b..56ed7e03 100644 --- a/opencompass/configs/datasets/IFEval/IFEval_gen.py +++ b/opencompass/configs/datasets/IFEval/IFEval_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .IFEval_gen_3321a3 import ifeval_datasets # noqa: F401, F403 + from .IFEval_gen_353ae7 import ifeval_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/aime2024/aime2024_gen.py b/opencompass/configs/datasets/aime2024/aime2024_gen.py index 84aef387..8c63ca7e 100644 --- a/opencompass/configs/datasets/aime2024/aime2024_gen.py +++ b/opencompass/configs/datasets/aime2024/aime2024_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .aime2024_gen_6e39a4 import aime2024_datasets # noqa: F401, F403 \ No newline at end of file + from .aime2024_gen_17d799 import aime2024_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/aime2024/aime2024_gen_17d799.py b/opencompass/configs/datasets/aime2024/aime2024_gen_17d799.py new file mode 100644 index 00000000..03f6ddec --- /dev/null +++ b/opencompass/configs/datasets/aime2024/aime2024_gen_17d799.py @@ -0,0 +1,40 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import MATHEvaluator +from opencompass.datasets import Aime2024Dataset + + +aime2024_reader_cfg = dict( + input_columns=['question'], + output_column='answer' +) + + +aime2024_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ], + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +aime2024_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator) +) + +aime2024_datasets = [ + dict( + abbr='aime2024', + type=Aime2024Dataset, + path='opencompass/aime2024', + reader_cfg=aime2024_reader_cfg, + infer_cfg=aime2024_infer_cfg, + eval_cfg=aime2024_eval_cfg, + ) +] \ No newline at end of file diff --git a/opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py b/opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py new file mode 100644 index 00000000..e1525f94 --- /dev/null +++ b/opencompass/configs/datasets/aime2024/aime2024_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .aime2024_llmjudge_gen_5e9f4f import aime2024_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/bbh/bbh_gen.py b/opencompass/configs/datasets/bbh/bbh_gen.py index cb9dff44..240d4457 100644 --- a/opencompass/configs/datasets/bbh/bbh_gen.py +++ 
b/opencompass/configs/datasets/bbh/bbh_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .bbh_gen_5b92b0 import bbh_datasets # noqa: F401, F403 + from .bbh_gen_ee62e9 import bbh_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/bbh/bbh_gen_ee62e9.py b/opencompass/configs/datasets/bbh/bbh_gen_ee62e9.py new file mode 100644 index 00000000..03519aa1 --- /dev/null +++ b/opencompass/configs/datasets/bbh/bbh_gen_ee62e9.py @@ -0,0 +1,99 @@ +import os +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq + +bbh_reader_cfg = dict(input_columns=['input'], output_column='target') + +bbh_multiple_choice_sets = [ + 'temporal_sequences', + 'disambiguation_qa', + 'date_understanding', + 'tracking_shuffled_objects_three_objects', + 'penguins_in_a_table', + 'geometric_shapes', + 'snarks', + 'ruin_names', + 'tracking_shuffled_objects_seven_objects', + 'tracking_shuffled_objects_five_objects', + 'logical_deduction_three_objects', + 'hyperbaton', + 'logical_deduction_five_objects', + 'logical_deduction_seven_objects', + 'movie_recommendation', + 'salient_translation_error_detection', + 'reasoning_about_colored_objects', +] +bbh_free_form_sets = [ + 'multistep_arithmetic_two', + 'navigate', + 'dyck_languages', + 'word_sorting', + 'sports_understanding', + 'boolean_expressions', + 'object_counting', + 'formal_fallacies', + 'causal_judgement', + 'web_of_lies', +] + +bbh_datasets = [] +for _name in bbh_multiple_choice_sets: + with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f: + _hint = f.read() + bbh_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step." + ) + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + bbh_eval_cfg = dict( + evaluator=dict(type=BBHEvaluator_mcq), + pred_role='BOT', + pred_postprocessor=dict(type=bbh_mcq_postprocess), + dataset_postprocessor=dict(type=bbh_mcq_postprocess)) + + bbh_datasets.append( + dict( + type=BBHDataset, + path='opencompass/bbh', + name=_name, + abbr='bbh-' + _name, + reader_cfg=bbh_reader_cfg, + infer_cfg=bbh_infer_cfg.copy(), + eval_cfg=bbh_eval_cfg.copy())) + +for _name in bbh_free_form_sets: + with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f: + _hint = f.read() + bbh_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt= + f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step." 
+ ) + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT') + + bbh_datasets.append( + dict( + type=BBHDataset, + path='opencompass/bbh', + name=_name, + abbr='bbh-' + _name, + reader_cfg=bbh_reader_cfg, + infer_cfg=bbh_infer_cfg.copy(), + eval_cfg=bbh_eval_cfg.copy())) diff --git a/opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py b/opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py new file mode 100644 index 00000000..1b1c143b --- /dev/null +++ b/opencompass/configs/datasets/bbh/bbh_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .bbh_llmjudge_gen_b5bdf1 import bbh_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py new file mode 100644 index 00000000..d211b2b6 --- /dev/null +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_gen.py @@ -0,0 +1,7 @@ +from mmengine.config import read_base + +with read_base(): + from .bigcodebench_hard_instruct_gen import bigcodebench_hard_instruct_datasets + from .bigcodebench_hard_complete_gen import bigcodebench_hard_complete_datasets + +bigcodebench_hard_datasets = sum((v for k, v in locals().items() if k.endswith('_ds')), []) \ No newline at end of file diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen.py index 2b762bb0..b5bb5b37 100644 --- a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen.py +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .bigcodebench_hard_instruct_gen_8815eb import bigcodebench_hard_instruct_datasets # noqa: F401, F403 + from .bigcodebench_hard_instruct_gen_c3d5ad import bigcodebench_hard_instruct_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_c3d5ad.py b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_c3d5ad.py index b8dcc8ed..4af844fd 100644 --- a/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_c3d5ad.py +++ b/opencompass/configs/datasets/bigcodebench/bigcodebench_hard_instruct_gen_c3d5ad.py @@ -15,8 +15,9 @@ bigcodebench_hard_infer_cfg = dict(prompt_template=dict( round=[ dict(role='HUMAN', prompt='{instruct_prompt}'), ])), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer)) + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) bigcodebench_hard_eval_cfg = dict( evaluator=dict( diff --git a/opencompass/configs/datasets/cmmlu/cmmlu_gen.py b/opencompass/configs/datasets/cmmlu/cmmlu_gen.py index 7f3baa9f..f8b559cd 100644 --- a/opencompass/configs/datasets/cmmlu/cmmlu_gen.py +++ b/opencompass/configs/datasets/cmmlu/cmmlu_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .cmmlu_gen_c13365 import cmmlu_datasets # noqa: F401, F403 + from .cmmlu_0shot_cot_gen_305931 import cmmlu_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py b/opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py new file mode 100644 index 00000000..d5ca44de --- /dev/null +++ b/opencompass/configs/datasets/cmmlu/cmmlu_llm_judge_gen.py 
@@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .cmmlu_llmjudge_gen_e1cd9a import cmmlu_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/drop/drop_gen.py b/opencompass/configs/datasets/drop/drop_gen.py index 69954a35..44592ff6 100644 --- a/opencompass/configs/datasets/drop/drop_gen.py +++ b/opencompass/configs/datasets/drop/drop_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .drop_openai_simple_evals_gen_3857b0 import drop_datasets + from .drop_openai_simple_evals_gen_3857b0 import drop_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/drop/drop_llm_judge_gen.py b/opencompass/configs/datasets/drop/drop_llm_judge_gen.py new file mode 100644 index 00000000..0694c276 --- /dev/null +++ b/opencompass/configs/datasets/drop/drop_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .drop_llmjudge_gen_3857b0 import drop_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/gpqa/gpqa_gen.py b/opencompass/configs/datasets/gpqa/gpqa_gen.py index f1e8784f..433ef9f5 100644 --- a/opencompass/configs/datasets/gpqa/gpqa_gen.py +++ b/opencompass/configs/datasets/gpqa/gpqa_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets + from .gpqa_openai_simple_evals_gen_5aeece import gpqa_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/gpqa/gpqa_llm_judge_gen.py b/opencompass/configs/datasets/gpqa/gpqa_llm_judge_gen.py new file mode 100644 index 00000000..43644b16 --- /dev/null +++ b/opencompass/configs/datasets/gpqa/gpqa_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .gpqa_0shot_nocot_genericllmeval_gen_772ea0 import gpqa_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/hellaswag/hellaswag_gen.py b/opencompass/configs/datasets/hellaswag/hellaswag_gen.py index 8d786700..7806d705 100644 --- a/opencompass/configs/datasets/hellaswag/hellaswag_gen.py +++ b/opencompass/configs/datasets/hellaswag/hellaswag_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .hellaswag_gen_6faab5 import hellaswag_datasets # noqa: F401, F403 + from .hellaswag_10shot_gen_e42710 import hellaswag_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py b/opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py new file mode 100644 index 00000000..ff641d26 --- /dev/null +++ b/opencompass/configs/datasets/hellaswag/hellaswag_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .hellaswag_llmjudge_gen_809ef1 import hellaswag_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/humaneval/humaneval_gen.py b/opencompass/configs/datasets/humaneval/humaneval_gen.py index 74019908..61c3f3b3 100644 --- a/opencompass/configs/datasets/humaneval/humaneval_gen.py +++ b/opencompass/configs/datasets/humaneval/humaneval_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .humaneval_gen_8e312c import humaneval_datasets # noqa: F401, F403 + from .humaneval_openai_sample_evals_gen_dcae0e import humaneval_datasets # noqa: F401, F403 \ No newline at end of 
file diff --git a/opencompass/configs/datasets/korbench/korbench_gen.py b/opencompass/configs/datasets/korbench/korbench_gen.py new file mode 100644 index 00000000..0492922a --- /dev/null +++ b/opencompass/configs/datasets/korbench/korbench_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .korbench_single_0_shot_gen import korbench_0shot_single_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py b/opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py new file mode 100644 index 00000000..e87b86f8 --- /dev/null +++ b/opencompass/configs/datasets/korbench/korbench_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .korbench_single_0shot_genericllmeval_gen_56cf43 import korbench_0shot_single_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/korbench/korbench_llmjudge_gen_56cf43.py b/opencompass/configs/datasets/korbench/korbench_llmjudge_gen_56cf43.py index 8c5c0cd2..cd4d947c 100644 --- a/opencompass/configs/datasets/korbench/korbench_llmjudge_gen_56cf43.py +++ b/opencompass/configs/datasets/korbench/korbench_llmjudge_gen_56cf43.py @@ -7,10 +7,9 @@ from opencompass.datasets import generic_llmjudge_postprocess categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle'] - GRADER_TEMPLATE = """ Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. - + Here are some evaluation criteria: 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. @@ -25,11 +24,10 @@ GRADER_TEMPLATE = """ Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
- : \n{prompt}\n\n\n : \n{answer}\n\n\n : \n{prediction}\n\n\n - + Judging the correctness of candidates' answers: """.strip() @@ -75,18 +73,18 @@ for category in categories: prompt_template=dict( type=PromptTemplate, template=dict( - begin=[ - dict( - role='SYSTEM', - fallback_role='HUMAN', - prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") - ], - round=[ - dict( - role='HUMAN', - prompt = GRADER_TEMPLATE - ), - ]), + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), ), dataset_cfg=dict( type=korbenchDataset, diff --git a/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py b/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py index a23bf290..c69cad5e 100644 --- a/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py +++ b/opencompass/configs/datasets/korbench/korbench_single_0_shot_gen.py @@ -37,7 +37,7 @@ for category in categories: infer_cfg = dict( prompt_template=prompt_template, retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=1024), + inferencer=dict(type=GenInferencer), ) # Evaluation configuration diff --git a/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py b/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py index a9cb644b..196a7978 100644 --- a/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py +++ b/opencompass/configs/datasets/korbench/korbench_single_0shot_genericllmeval_gen_17854d.py @@ -7,10 +7,9 @@ from opencompass.datasets import generic_llmjudge_postprocess categories = ['cipher', 'counterfactual', 'logic', 'operation', 'puzzle'] - GRADER_TEMPLATE = """ Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. - + Here are some evaluation criteria: 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. 
@@ -29,7 +28,7 @@ GRADER_TEMPLATE = """ : \n{prompt}\n\n\n : \n{answer}\n\n\n : \n{prediction}\n\n\n - + Judging the correctness of candidates' answers: """.strip() @@ -49,7 +48,7 @@ for category in categories: round=[ dict( role='HUMAN', - prompt='{prompt}' # f-string + prompt='{prompt}' # f-string ) ] ) @@ -75,18 +74,18 @@ for category in categories: prompt_template=dict( type=PromptTemplate, template=dict( - begin=[ - dict( - role='SYSTEM', - fallback_role='HUMAN', - prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") - ], + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], round=[ - dict( - role='HUMAN', - prompt = GRADER_TEMPLATE - ), - ]), + dict( + role='HUMAN', + prompt=GRADER_TEMPLATE + ), + ]), ), dataset_cfg=dict( type=korbenchDataset, @@ -114,4 +113,4 @@ for category in categories: mode='singlescore', ) - korbench_0shot_single_datasets.append(korbench_dataset) + korbench_0shot_single_datasets.append(korbench_dataset) \ No newline at end of file diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_gen.py b/opencompass/configs/datasets/livecodebench/livecodebench_gen.py index f663df06..b1966fe9 100644 --- a/opencompass/configs/datasets/livecodebench/livecodebench_gen.py +++ b/opencompass/configs/datasets/livecodebench/livecodebench_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .livecodebench_gen_6966bc import LCB_datasets # noqa: F401, F403 + from .livecodebench_gen_a4f90b import LCB_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py index 6c71a60f..d49a1ccc 100644 --- a/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py +++ b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py @@ -1,9 +1,9 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import MATHEvaluator from opencompass.datasets import ( MATHDataset, - MATHEvaluator, math_postprocess_v2, normalize_final_answer, ) @@ -28,8 +28,7 @@ math_infer_cfg = dict( # postprocess v2 math_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator, version='v2'), - pred_postprocessor=dict(type=math_postprocess_v2), + evaluator=dict(type=MATHEvaluator) ) math_datasets = [ diff --git a/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_genericllmeval_gen_6ff468.py b/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_genericllmeval_gen_6ff468.py new file mode 100644 index 00000000..67d74266 --- /dev/null +++ b/opencompass/configs/datasets/math/math_prm800k_500_0shot_nocot_genericllmeval_gen_6ff468.py @@ -0,0 +1,96 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import MATHDataset + + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = 
dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500-llmjudge', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + mode='singlescore', + ) +] diff --git a/opencompass/configs/datasets/math/math_prm800k_500_gen.py b/opencompass/configs/datasets/math/math_prm800k_500_gen.py index 1b3bba23..c74231fc 100644 --- a/opencompass/configs/datasets/math/math_prm800k_500_gen.py +++ b/opencompass/configs/datasets/math/math_prm800k_500_gen.py @@ -1,36 +1,4 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer +from mmengine.config import read_base -math_reader_cfg = dict(input_columns=['problem'], output_column='solution') - -math_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - round=[ - dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'), - ] - ), - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=1024), -) - -# postprocess v2 -math_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2), -) - -math_datasets = [ - dict( - type=MATHDataset, - abbr='math_prm800k_500', - path='opencompass/math', - file_name = 'test_prm800k_500.json', - reader_cfg=math_reader_cfg, - infer_cfg=math_infer_cfg, - eval_cfg=math_eval_cfg, - ) -] +with read_base(): + from .math_prm800k_500_0shot_cot_gen import math_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/math/math_prm800k_500_gen_393424.py b/opencompass/configs/datasets/math/math_prm800k_500_gen_393424.py new file mode 100644 index 00000000..1b3bba23 --- /dev/null +++ b/opencompass/configs/datasets/math/math_prm800k_500_gen_393424.py @@ -0,0 +1,36 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2, normalize_final_answer + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), 
+ inferencer=dict(type=GenInferencer, max_out_len=1024), +) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), pred_postprocessor=dict(type=math_postprocess_v2), +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500', + path='opencompass/math', + file_name = 'test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py b/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py new file mode 100644 index 00000000..461b3a9a --- /dev/null +++ b/opencompass/configs/datasets/math/math_prm800k_500_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .math_prm800k_500_0shot_nocot_genericllmeval_gen_6ff468 import math_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/mmlu/mmlu_gen.py b/opencompass/configs/datasets/mmlu/mmlu_gen.py index 157ee329..5c8303b8 100644 --- a/opencompass/configs/datasets/mmlu/mmlu_gen.py +++ b/opencompass/configs/datasets/mmlu/mmlu_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mmlu_gen_4d595a import mmlu_datasets # noqa: F401, F403 + from .mmlu_openai_simple_evals_gen_b618ea import mmlu_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py b/opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py new file mode 100644 index 00000000..b2389fb2 --- /dev/null +++ b/opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mmlu_llmjudge_gen_f4336b import mmlu_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py new file mode 100644 index 00000000..228dad99 --- /dev/null +++ b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mmlu_pro_0shot_cot_gen_08c1de import mmlu_pro_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py new file mode 100644 index 00000000..a895d5c2 --- /dev/null +++ b/opencompass/configs/datasets/mmlu_pro/mmlu_pro_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de import mmlu_pro_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/musr/musr_llm_judge_gen.py b/opencompass/configs/datasets/musr/musr_llm_judge_gen.py new file mode 100644 index 00000000..29bc39dc --- /dev/null +++ b/opencompass/configs/datasets/musr/musr_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .musr_llmjudge_gen_b47fd3 import musr_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/tools/update_dataset_suffix.py b/tools/update_dataset_suffix.py index b1634955..bcecf8ef 100755 --- a/tools/update_dataset_suffix.py +++ b/tools/update_dataset_suffix.py @@ -94,21 +94,21 @@ def check_and_rename(filepath): return None, None -def update_imports(data): - python_file, name_pairs = data - for filepath, new_file in name_pairs: - old_name = 
os.path.basename(filepath)[:-3] - new_name = os.path.basename(new_file)[:-3] - if not os.path.exists(python_file): - return - with open(python_file, 'r') as file: - filedata = file.read() - # Replace the old name with new name - new_data = filedata.replace(old_name, new_name) - if filedata != new_data: - with open(python_file, 'w') as file: - file.write(new_data) - # print(f"Updated imports in {python_file}") +# def update_imports(data): +# python_file, name_pairs = data +# for filepath, new_file in name_pairs: +# old_name = os.path.basename(filepath)[:-3] +# new_name = os.path.basename(new_file)[:-3] +# if not os.path.exists(python_file): +# return +# with open(python_file, 'r') as file: +# filedata = file.read() +# # Replace the old name with new name +# new_data = filedata.replace(old_name, new_name) +# if filedata != new_data: +# with open(python_file, 'w') as file: +# file.write(new_data) +# # print(f"Updated imports in {python_file}") def main(): @@ -134,11 +134,11 @@ def main(): return with Pool(16) as p: p.starmap(os.rename, name_pairs) - root_folder = 'configs' - python_files = glob.glob(f'{root_folder}/**/*.py', recursive=True) - update_data = [(python_file, name_pairs) for python_file in python_files] - with Pool(16) as p: - p.map(update_imports, update_data) + # root_folder = 'configs' + # python_files = glob.glob(f'{root_folder}/**/*.py', recursive=True) + # update_data = [(python_file, name_pairs) for python_file in python_files] + # with Pool(16) as p: + # p.map(update_imports, update_data) if __name__ == '__main__': From 0f46c352110b4a46730e28b7a0eb1f1d3d0c3a64 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Tue, 25 Mar 2025 17:57:11 +0800 Subject: [PATCH 42/58] [Bug] Aime2024 config fix (#1974) * [Bug] Aime2024 config fix * fix --- .../datasets/aime2024/aime2024_llmjudge_gen_5e9f4f.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/opencompass/configs/datasets/aime2024/aime2024_llmjudge_gen_5e9f4f.py b/opencompass/configs/datasets/aime2024/aime2024_llmjudge_gen_5e9f4f.py index c3b4eb07..a54e53f1 100644 --- a/opencompass/configs/datasets/aime2024/aime2024_llmjudge_gen_5e9f4f.py +++ b/opencompass/configs/datasets/aime2024/aime2024_llmjudge_gen_5e9f4f.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import CustomDataset +from opencompass.datasets import Aime2024Dataset from opencompass.evaluator import GenericLLMEvaluator from opencompass.datasets import generic_llmjudge_postprocess @@ -69,8 +69,8 @@ aime2024_eval_cfg = dict( ), ), dataset_cfg=dict( - type=CustomDataset, - path='opencompass/aime2025', + type=Aime2024Dataset, + path='opencompass/aime2024', reader_cfg=aime2024_reader_cfg, ), judge_cfg=dict(), @@ -81,8 +81,8 @@ aime2024_eval_cfg = dict( aime2024_datasets = [ dict( abbr='aime2024', - type=CustomDataset, - path='opencompass/aime2025', + type=Aime2024Dataset, + path='opencompass/aime2024', reader_cfg=aime2024_reader_cfg, infer_cfg=aime2024_infer_cfg, eval_cfg=aime2024_eval_cfg, From f71eb78c7217cc60cd6ec4ca24c08827085db0ea Mon Sep 17 00:00:00 2001 From: Myhs_phz Date: Mon, 31 Mar 2025 19:08:55 +0800 Subject: [PATCH 43/58] [Doc] Add TBD Token in Datasets Statistics (#1986) * feat * doc * doc * doc * doc --- README.md | 102 ++++++++++++++++++++++++------------------- README_zh-CN.md | 26 ++++++++--- dataset-index.yml | 2 +- docs/en/statis.py | 17 ++++++-- 
docs/zh_cn/statis.py | 19 ++++++-- 5 files changed, 108 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index f3c6028a..a17a1998 100644 --- a/README.md +++ b/README.md @@ -176,69 +176,83 @@ Some third-party features, like Humaneval and Llama, may require additional step After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared. Now you can start your first evaluation using OpenCompass! -- Your first evaluation with OpenCompass! +### Your first evaluation with OpenCompass! - OpenCompass support setting your configs via CLI or a python script. For simple evaluation settings we recommend using CLI, for more complex evaluation, it is suggested using the script way. You can find more example scripts under the configs folder. +OpenCompass support setting your configs via CLI or a python script. For simple evaluation settings we recommend using CLI, for more complex evaluation, it is suggested using the script way. You can find more example scripts under the configs folder. - ```bash - # CLI - opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen +```bash +# CLI +opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen - # Python scripts - opencompass examples/eval_chat_demo.py - ``` +# Python scripts +opencompass examples/eval_chat_demo.py +``` - You can find more script examples under [examples](./examples) folder. +You can find more script examples under [examples](./examples) folder. -- API evaluation +### API evaluation - OpenCompass, by its design, does not really discriminate between open-source models and API models. You can evaluate both model types in the same way or even in one settings. +OpenCompass, by its design, does not really discriminate between open-source models and API models. You can evaluate both model types in the same way or even in one settings. - ```bash - export OPENAI_API_KEY="YOUR_OPEN_API_KEY" - # CLI - opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen +```bash +export OPENAI_API_KEY="YOUR_OPEN_API_KEY" +# CLI +opencompass --models gpt_4o_2024_05_13 --datasets demo_gsm8k_chat_gen - # Python scripts - opencompass examples/eval_api_demo.py +# Python scripts +opencompass examples/eval_api_demo.py - # You can use o1_mini_2024_09_12/o1_preview_2024_09_12 for o1 models, we set max_completion_tokens=8192 as default. - ``` +# You can use o1_mini_2024_09_12/o1_preview_2024_09_12 for o1 models, we set max_completion_tokens=8192 as default. +``` -- Accelerated Evaluation +### Accelerated Evaluation - Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). Below is an example using LMDeploy: +Additionally, if you want to use an inference backend other than HuggingFace for accelerated evaluation, such as LMDeploy or vLLM, you can do so with the command below. Please ensure that you have installed the necessary packages for the chosen backend and that your model supports accelerated inference with it. For more information, see the documentation on inference acceleration backends [here](docs/en/advanced_guides/accelerator_intro.md). 
Below is an example using LMDeploy: - ```bash - # CLI - opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy +```bash +# CLI +opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy - # Python scripts - opencompass examples/eval_lmdeploy_demo.py - ``` +# Python scripts +opencompass examples/eval_lmdeploy_demo.py +``` -- Supported Models +### Supported Models and Datasets - OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs). +OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs). - ```bash - # List all configurations - python tools/list_configs.py - # List all configurations related to llama and mmlu - python tools/list_configs.py llama mmlu - ``` +```bash +# List all configurations +python tools/list_configs.py +# List all configurations related to llama and mmlu +python tools/list_configs.py llama mmlu +``` - If the model is not on the list but supported by Huggingface AutoModel class, you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists. +#### Supported Models - ```bash - opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat - ``` +If the model is not on the list but supported by Huggingface AutoModel class or encapsulation of inference engine based on OpenAI interface (see [docs](https://opencompass.readthedocs.io/en/latest/advanced_guides/new_model.html) for details), you can also evaluate it with OpenCompass. You are welcome to contribute to the maintenance of the OpenCompass supported model and dataset lists. - If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`. +```bash +opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat +``` - ```bash - CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2 - ``` +#### Supported Datasets + +Currently, OpenCompass have provided standard recommended configurations for datasets. Generally, config files ending with `_gen.py` or `_llm_judge_gen.py` will point to the recommended config we provide for this dataset. You can refer to [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for more details. + +```bash +# Recommended Evaluation Config based on Rules +opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat + +# Recommended Evaluation Config based on LLM Judge +opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat +``` + +If you want to use multiple GPUs to evaluate the model in data parallel, you can use `--max-num-worker`. + +```bash +CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2 +``` > \[!TIP\] > @@ -288,7 +302,7 @@ You can quickly find the dataset you need from the list through sorting, filteri In addition, we provide a recommended configuration for each dataset, and some datasets also support LLM Judge-based configurations. -Please refer to the dataset statistics chapter of [official document](https://opencompass.org.cn/doc) for details. 
+Please refer to the dataset statistics chapter of the [docs](https://opencompass.readthedocs.io/en/latest/dataset_statistics.html) for details.

🔝Back to top

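For the script-based route, the same recommended configs can be assembled with `read_base()`. The sketch below is a minimal example pairing the LLM-judge AIME2024 config from this series with an LMDeploy-backed chat model; the module names are taken from config files appearing in this series and should be treated as assumptions if your checkout differs, and LLM-judge configs additionally expect a judge model to be supplied via `judge_cfg`.

```python
# Minimal script-style config (sketch, not the canonical example): pairs the
# recommended LLM-judge AIME2024 dataset config with an LMDeploy chat model.
from mmengine.config import read_base

with read_base():
    # Dataset config added in this series (assumed path for your checkout)
    from opencompass.configs.datasets.aime2024.aime2024_llmjudge_gen_5e9f4f import \
        aime2024_datasets
    # LMDeploy-accelerated chat model config (assumed path for your checkout)
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as lmdeploy_internlm2_5_7b_chat_model

datasets = [*aime2024_datasets]
models = [*lmdeploy_internlm2_5_7b_chat_model]

# The LLM-judge evaluator reads its judge model from judge_cfg; fill this in
# with an OpenAI-compatible judge model before running (left empty here).
for item in datasets:
    item['eval_cfg']['evaluator']['judge_cfg'] = dict()

work_dir = './outputs/aime2024_llm_judge'
```

It runs like the other script examples: `opencompass path/to/this_config.py`.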
diff --git a/README_zh-CN.md b/README_zh-CN.md index a4ef743f..4406c7bc 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -208,9 +208,9 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce opencompass --models hf_internlm2_5_1_8b_chat --datasets demo_gsm8k_chat_gen -a lmdeploy ``` - OpenCompass 预定义了许多模型和数据集的配置,你可以通过 [工具](./docs/zh_cn/tools.md#ListConfigs) 列出所有可用的模型和数据集配置。 +- ### 支持的模型与数据集 -- ### 支持的模型 + OpenCompass 预定义了许多模型和数据集的配置,你可以通过 [工具](./docs/zh_cn/tools.md#ListConfigs) 列出所有可用的模型和数据集配置。 ```bash # 列出所有配置 @@ -219,13 +219,27 @@ humaneval, triviaqa, commonsenseqa, tydiqa, strategyqa, cmmlu, lambada, piqa, ce python tools/list_configs.py llama mmlu ``` - 如果模型不在列表中但支持 Huggingface AutoModel 类,您仍然可以使用 OpenCompass 对其进行评估。欢迎您贡献维护 OpenCompass 支持的模型和数据集列表。 + #### 支持的模型 + + 如果模型不在列表中,但支持 Huggingface AutoModel 类或支持针对 OpenAI 接口的推理引擎封装(详见[官方文档](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/new_model.html)),您仍然可以使用 OpenCompass 对其进行评估。欢迎您贡献维护 OpenCompass 支持的模型和数据集列表。 ```bash opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat ``` - 如果你想在多块 GPU 上使用模型进行推理,您可以使用 `--max-num-worker` 参数。 + #### 支持的数据集 + + 目前,OpenCompass针对数据集给出了标准的推荐配置。通常,`_gen.py`或`_llm_judge_gen.py`为结尾的配置文件将指向我们为该数据集提供的推荐配置。您可以参阅[官方文档](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) 的数据集统计章节来获取详细信息。 + + ```bash + # 基于规则的推荐配置 + opencompass --datasets aime2024_gen --models hf_internlm2_5_1_8b_chat + + # 基于LLM Judge的推荐配置 + opencompass --datasets aime2024_llm_judge_gen --models hf_internlm2_5_1_8b_chat + ``` + + 此外,如果你想在多块 GPU 上使用模型进行推理,您可以使用 `--max-num-worker` 参数。 ```bash CUDA_VISIBLE_DEVICES=0,1 opencompass --datasets demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-1_8b-chat --max-num-worker 2 @@ -281,9 +295,7 @@ OpenCompass 是面向大模型评测的一站式平台。其主要特点如下 您可以通过排序、筛选和搜索等功能从列表中快速找到您需要的数据集。 -另外,我们为每个数据集都提供了一种推荐配置,部分数据集还支持了基于LLM Judge的配置。 - -详情请参阅 [官方文档](https://opencompass.org.cn/doc) 的数据集统计章节。 +详情请参阅 [官方文档](https://opencompass.readthedocs.io/zh-cn/latest/dataset_statistics.html) 的数据集统计章节。

🔝返回顶部

diff --git a/dataset-index.yml b/dataset-index.yml index 59b6d4da..67162cca 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -121,7 +121,7 @@ category: Reasoning paper: https://arxiv.org/pdf/2310.16049 configpath: opencompass/configs/datasets/musr/musr_gen.py - configpath_llmjudge: opencompass/configs/datasets/mmlu/mmlu_llm_judge_gen.py + configpath_llmjudge: opencompass/configs/datasets/musr/musr_llm_judge_gen.py - needlebench: name: NeedleBench category: Long Context diff --git a/docs/en/statis.py b/docs/en/statis.py index 483ebf78..daabe818 100755 --- a/docs/en/statis.py +++ b/docs/en/statis.py @@ -32,12 +32,23 @@ with open(load_path, 'r') as f2: HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge'] +recommanded_dataset_list = [ + 'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa', + 'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu', + 'mmlu_pro', 'musr' +] + def table_format(data_list): table_format_list = [] for i in data_list: table_format_list_sub = [] for j in i: + if j in recommanded_dataset_list: + link_token = '[link](' + else: + link_token = '[link(TBD)](' + for index in HEADER: if index == 'paper': table_format_list_sub.append('[link](' + i[j][index] + ')') @@ -45,18 +56,18 @@ def table_format(data_list): if i[j][index] == '': table_format_list_sub.append(i[j][index]) else: - table_format_list_sub.append('[link](' + + table_format_list_sub.append(link_token + GITHUB_PREFIX + i[j][index] + ')') elif index == 'configpath': if isinstance(i[j][index], list): sub_list_text = '' for k in i[j][index]: - sub_list_text += ('[link](' + GITHUB_PREFIX + k + + sub_list_text += (link_token + GITHUB_PREFIX + k + ') / ') table_format_list_sub.append(sub_list_text[:-2]) else: - table_format_list_sub.append('[link](' + + table_format_list_sub.append(link_token + GITHUB_PREFIX + i[j][index] + ')') else: diff --git a/docs/zh_cn/statis.py b/docs/zh_cn/statis.py index 19d03bfd..04134cf6 100755 --- a/docs/zh_cn/statis.py +++ b/docs/zh_cn/statis.py @@ -30,12 +30,23 @@ with open(load_path, 'r') as f2: HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge'] +recommanded_dataset_list = [ + 'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa', + 'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu', + 'mmlu_pro', 'musr' +] + def table_format(data_list): table_format_list = [] for i in data_list: table_format_list_sub = [] for j in i: + if j in recommanded_dataset_list: + link_token = '[链接](' + else: + link_token = '[链接(TBD)](' + for index in HEADER: if index == 'paper': table_format_list_sub.append('[链接](' + i[j][index] + ')') @@ -43,17 +54,19 @@ def table_format(data_list): if i[j][index] == '': table_format_list_sub.append(i[j][index]) else: - table_format_list_sub.append('[链接](' + GITHUB_PREFIX + + table_format_list_sub.append(link_token + + GITHUB_PREFIX + i[j][index] + ')') elif index == 'configpath': if isinstance(i[j][index], list): sub_list_text = '' for k in i[j][index]: - sub_list_text += ('[链接](' + GITHUB_PREFIX + k + + sub_list_text += (link_token + GITHUB_PREFIX + k + ') / ') table_format_list_sub.append(sub_list_text[:-2]) else: - table_format_list_sub.append('[链接](' + GITHUB_PREFIX + + table_format_list_sub.append(link_token + + GITHUB_PREFIX + i[j][index] + ')') else: table_format_list_sub.append(i[j][index]) From 330a6e5ca731bcdcdf1876f94212905622bbdbfc Mon Sep 17 00:00:00 2001 From: Dongsheng Zhu <59612926+Zhudongsheng75@users.noreply.github.com> Date: Tue, 1 Apr 2025 
11:51:37 +0800 Subject: [PATCH 44/58] [Update] Add Intervl-8b&38b model configs (#1978) --- .../models/intervl/lmdeploy_intervl_2_5_38b.py | 15 +++++++++++++++ .../models/intervl/lmdeploy_intervl_2_5_8b.py | 15 +++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 opencompass/configs/models/intervl/lmdeploy_intervl_2_5_38b.py create mode 100644 opencompass/configs/models/intervl/lmdeploy_intervl_2_5_8b.py diff --git a/opencompass/configs/models/intervl/lmdeploy_intervl_2_5_38b.py b/opencompass/configs/models/intervl/lmdeploy_intervl_2_5_38b.py new file mode 100644 index 00000000..971ec947 --- /dev/null +++ b/opencompass/configs/models/intervl/lmdeploy_intervl_2_5_38b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='internvl2_5-38b-turbomind', + path='OpenGVLab/InternVL2_5-38B', + engine_config=dict(session_len=8192, max_batch_size=8, tp=4), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=1024), + max_seq_len=8192, + max_out_len=8192, + batch_size=8, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/opencompass/configs/models/intervl/lmdeploy_intervl_2_5_8b.py b/opencompass/configs/models/intervl/lmdeploy_intervl_2_5_8b.py new file mode 100644 index 00000000..68e3b5be --- /dev/null +++ b/opencompass/configs/models/intervl/lmdeploy_intervl_2_5_8b.py @@ -0,0 +1,15 @@ +from opencompass.models import TurboMindModel + +models = [ + dict( + type=TurboMindModel, + abbr='internvl2_5-8b-turbomind', + path='OpenGVLab/InternVL2_5-8B', + engine_config=dict(session_len=8192, max_batch_size=16, tp=1), + gen_config=dict(top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=8192), + max_seq_len=8192, + max_out_len=8192, + batch_size=16, + run_cfg=dict(num_gpus=1), + ) +] From f66b0b347a8993628e122c9e39af4369b75e1354 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Wed, 2 Apr 2025 12:03:45 +0800 Subject: [PATCH 45/58] [Update] Requirements update (#1993) --- .../datasets/TheoremQA/number_utils.py | 2 +- opencompass/datasets/TheoremQA/utils.py | 2 +- opencompass/models/claude_sdk_api.py | 27 +++++++++++++++---- opencompass/models/openai_api.py | 15 ++++++++--- requirements/extra.txt | 4 +-- 5 files changed, 36 insertions(+), 14 deletions(-) diff --git a/opencompass/datasets/TheoremQA/number_utils.py b/opencompass/datasets/TheoremQA/number_utils.py index 12f6e6dc..fd93fe66 100644 --- a/opencompass/datasets/TheoremQA/number_utils.py +++ b/opencompass/datasets/TheoremQA/number_utils.py @@ -48,7 +48,7 @@ def clean_units(pred_str: str): def number_it(num): - from latex2sympy2 import latex2sympy + from latex2sympy2_extended import latex2sympy if isinstance(num, (int, float)): return num diff --git a/opencompass/datasets/TheoremQA/utils.py b/opencompass/datasets/TheoremQA/utils.py index a4f32b2b..ca9c2661 100644 --- a/opencompass/datasets/TheoremQA/utils.py +++ b/opencompass/datasets/TheoremQA/utils.py @@ -17,7 +17,7 @@ def time_limit(seconds: float): def extract_theoremqa_answer(pred: str, answer_flag: bool = True): - from latex2sympy2 import latex2sympy + from latex2sympy2_extended import latex2sympy if any([option in pred.lower() for option in ['yes', 'true']]): pred = 'True' diff --git a/opencompass/models/claude_sdk_api.py b/opencompass/models/claude_sdk_api.py index 8cbf98ef..173047f1 100644 --- a/opencompass/models/claude_sdk_api.py +++ b/opencompass/models/claude_sdk_api.py @@ -33,6 +33,7 @@ class ClaudeSDK(BaseAPIModel): max_seq_len: int = 2048, meta_template: Optional[Dict] = None, temperature: 
Optional[float] = 0.0, + thinking: Optional[Dict] = None, retry: int = 2, ): super().__init__(path=path, @@ -49,6 +50,7 @@ class ClaudeSDK(BaseAPIModel): self.anthropic = Anthropic(api_key=key) self.model = path self.temperature = temperature + self.thinking = thinking def generate( self, @@ -108,11 +110,26 @@ class ClaudeSDK(BaseAPIModel): while num_retries < self.retry: self.wait() try: - responses = self.anthropic.messages.create( - model=self.model, - max_tokens=max_out_len, - temperature=self.temperature, - messages=messages) + api_params = { + 'model': self.model, + 'max_tokens': max_out_len, + 'temperature': self.temperature, + 'messages': messages, + } + + if self.thinking is not None: + api_params['thinking'] = self.thinking + api_params['stream'] = True + + responses = self.anthropic.messages.create(**api_params) + + # Handle new response format + for content in responses.content: + if content.type == 'text': + return content.text + + # If no text type content is found, return the first + # content (backward compatibility) return responses.content[0].text except Exception as e: self.logger.error(e) diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index f46de71c..7b2c2c53 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -652,7 +652,6 @@ class OpenAISDK(OpenAI): self.logger.info('Start calling OpenAI API') responses = self.openai_client.chat.completions.create( **query_data, timeout=timeout) # timeout in seconds - if self.verbose: self.logger.info( 'Successfully get response from OpenAI API') @@ -660,10 +659,18 @@ class OpenAISDK(OpenAI): self.logger.info(responses) except Exception: pass # noqa F841 - if not responses.choices: + + # Check if response is empty or content is empty + if not responses.choices or not responses.choices[ + 0].message.content: self.logger.error( - 'Response is empty, it is an internal server error \ - from the API provider.') + 'API response is empty, it might be due to excessive ' + 'input length or an internal server error ' + 'from your API provider.') + num_retries += 1 + # Continue to retry instead of returning empty response + continue + return responses.choices[0].message.content except (BadRequestError, APIStatusError) as e: diff --git a/requirements/extra.txt b/requirements/extra.txt index fd3f7a2f..fa90a34c 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -11,12 +11,10 @@ faiss_gpu==1.7.2 -e git+https://github.com/open-compass/human-eval.git#egg=human-eval # IFEval langdetect -# TheoremQA -latex2sympy2==1.9.1 # Lawbench, leval ltp # Math -math-verify +math-verify[antlr4_11_0] # Taco, apps Dataset pyext # Law Bench From 97236c8e971e09c9a33c1b7734ddc4b87b561056 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 2 Apr 2025 14:25:16 +0800 Subject: [PATCH 46/58] [CI] Fix baseline score (#1996) * update * update * update * update --- .../scripts/oc_score_baseline_fullbench.yaml | 68 +++++++++---------- .github/workflows/daily-run-test.yml | 4 +- .github/workflows/pr-run-test.yml | 2 +- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 9f171a02..c0e735fb 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -24,8 +24,8 @@ internlm2_5-7b-chat-hf_fullbench: lcb_test_output_pass@1: 18.75 bbh-logical_deduction_seven_objects_score: 
50 bbh-multistep_arithmetic_two_score: 68.75 - mmlu-other_naive_average: 72.6 - cmmlu-china-specific_naive_average: 76.25 + mmlu-other_accuracy: 72.6 + cmmlu-china-specific_accuracy: 76.25 mmlu_pro_math_accuracy: 25 ds1000_Pandas_accuracy: 12.5 ds1000_Numpy_accuracy: 0 @@ -101,8 +101,8 @@ internlm2_5-7b-chat-turbomind_fullbench: lcb_test_output_pass@1: 25.00 bbh-logical_deduction_seven_objects_score: 50.00 bbh-multistep_arithmetic_two_score: 68.75 - mmlu-other_naive_average: 69.71 - cmmlu-china-specific_naive_average: 75.83 + mmlu-other_accuracy: 69.71 + cmmlu-china-specific_accuracy: 75.83 mmlu_pro_math_accuracy: 31.25 ds1000_Pandas_accuracy: 0 ds1000_Numpy_accuracy: 0 @@ -234,15 +234,15 @@ internlm2_5-7b-turbomind: sanitized_mbpp_score: 55.25 dingo_en_192_score: 60.94 dingo_zh_170_score: 67.65 - mmlu-stem_naive_average: 63.72 - mmlu-social-science_naive_average: 80.15 - mmlu-humanities_naive_average: 74.27 - mmlu-other_naive_average: 71.85 - cmmlu-stem_naive_average: 67.07 - cmmlu-social-science_naive_average: 81.49 - cmmlu-humanities_naive_average: 85.84 - cmmlu-other_naive_average: 82.69 - cmmlu-china-specific_naive_average: 79.88 + mmlu-stem_accuracy: 63.72 + mmlu-social-science_accuracy: 80.15 + mmlu-humanities_accuracy: 74.27 + mmlu-other_accuracy: 71.85 + cmmlu-stem_accuracy: 67.07 + cmmlu-social-science_accuracy: 81.49 + cmmlu-humanities_accuracy: 85.84 + cmmlu-other_accuracy: 82.69 + cmmlu-china-specific_accuracy: 79.88 mmlu_pro_biology_accuracy: 58.58 mmlu_pro_business_accuracy: 28.01 mmlu_pro_chemistry_accuracy: 22.79 @@ -281,12 +281,12 @@ internlm2_5-7b-turbomind: longbench_naive_average: 46.19 longbench_zh_naive_average: 49.3 longbench_en_naive_average: 43.97 - longbench_single-document-qa_naive_average: 42.84 - longbench_multi-document-qa_naive_average: 37.29 - longbench_summarization_naive_average: 23.21 - longbench_few-shot-learning_naive_average: 61.67 - longbench_synthetic-tasks_naive_average: 60.05 - longbench_code-completion_naive_average: 52.09 + longbench_single-document-qa_score: 42.84 + longbench_multi-document-qa_score: 41.25 + longbench_summarization_score: 23.21 + longbench_few-shot-learning_score: 61.67 + longbench_synthetic-tasks_score: 60.05 + longbench_code-completion_score: 52.09 internlm2_5-7b-chat-turbomind: objective: @@ -327,15 +327,15 @@ internlm2_5-7b-chat-turbomind: teval_naive_average: 80 SciCode_sub_accuracy: 5.56 qa_dingo_cn_score: 99.01 - mmlu-stem_naive_average: 68.2 - mmlu-social-science_naive_average: 75.8 - mmlu-humanities_naive_average: 69.3 - mmlu-other_naive_average: 71.3 - cmmlu-stem_naive_average: 66.64 - cmmlu-social-science_naive_average: 76 - cmmlu-humanities_naive_average: 77.9 - cmmlu-other_naive_average: 77.25 - cmmlu-china-specific_naive_average: 73.6 + mmlu-stem_accuracy: 68.2 + mmlu-social-science_accuracy: 75.8 + mmlu-humanities_accuracy: 69.3 + mmlu-other_accuracy: 71.3 + cmmlu-stem_accuracy: 66.64 + cmmlu-social-science_accuracy: 76 + cmmlu-humanities_accuracy: 77.9 + cmmlu-other_accuracy: 77.25 + cmmlu-china-specific_accuracy: 73.6 mmlu_pro_biology_accuracy: 66.67 mmlu_pro_business_accuracy: 47.91 mmlu_pro_chemistry_accuracy: 35 @@ -448,9 +448,9 @@ internlm2_5-7b-chat-1m-turbomind: babilong_32k_naive_average: 48.9 babilong_128k_naive_average: 40.8 babilong_256k_naive_average: 23.5 - longbench_single-document-qa_naive_average: 43.56 - longbench_multi-document-qa_naive_average: 46.24 - longbench_summarization_naive_average: 24.32 - longbench_few-shot-learning_naive_average: 51.67 - longbench_synthetic-tasks_naive_average: 
66.83 - longbench_code-completion_naive_average: 45.99 + longbench_single-document-qa_score: 43.56 + longbench_multi-document-qa_score: 46.24 + longbench_summarization_score: 24.32 + longbench_few-shot-learning_score: 51.67 + longbench_synthetic-tasks_score: 66.83 + longbench_code-completion_score: 45.99 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 3cdb3a73..a5a930fa 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -157,7 +157,9 @@ jobs: pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}} pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}} pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}} - pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}} + pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}} + pip install opencompass[api] --cache-dir ${{env.PIP_CACHE_PATH}} + pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --cache-dir ${{env.PIP_CACHE_PATH}} FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}} cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml index 032c4bc0..45fbd634 100644 --- a/.github/workflows/pr-run-test.yml +++ b/.github/workflows/pr-run-test.yml @@ -45,7 +45,7 @@ jobs: . ${{env.CONDA_PATH}}/bin/activate conda activate ${{env.CONDA_ENV}} python3 -m pip uninstall opencompass -y - python3 -m pip install -e . 
--cache-dir ${{env.PIP_CACHE_PATH}} + python3 -m pip install -e ".[full]" --cache-dir ${{env.PIP_CACHE_PATH}} conda info --envs - name: conda env run: | From 32d6859679539ebbfe8316039f87d095aa8bb4ee Mon Sep 17 00:00:00 2001 From: liushz Date: Wed, 2 Apr 2025 17:34:07 +0800 Subject: [PATCH 47/58] [Feature] Add olymmath dataset (#1982) * Add olymmath dataset * Add olymmath dataset * Add olymmath dataset * Update olymmath dataset --- dataset-index.yml | 6 ++ .../configs/datasets/OlymMATH/README.md | 60 +++++++++++ .../OlymMATH/olymmath_llm_judeg_gen.py | 5 + .../OlymMATH/olymmath_llmverify_gen_97b203.py | 99 +++++++++++++++++++ opencompass/datasets/__init__.py | 1 + opencompass/datasets/olymmath.py | 14 +++ 6 files changed, 185 insertions(+) create mode 100644 opencompass/configs/datasets/OlymMATH/README.md create mode 100644 opencompass/configs/datasets/OlymMATH/olymmath_llm_judeg_gen.py create mode 100644 opencompass/configs/datasets/OlymMATH/olymmath_llmverify_gen_97b203.py create mode 100644 opencompass/datasets/olymmath.py diff --git a/dataset-index.yml b/dataset-index.yml index 67162cca..de5e316e 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -715,6 +715,12 @@ paper: https://arxiv.org/pdf/1809.02789v1 configpath: opencompass/configs/datasets/obqa/obqa_gen.py configpath_llmjudge: '' +- olymmath: + name: OlymMATH + category: Math + paper: https://arxiv.org/abs/2503.21380 + configpath: '' + configpath_llmjudge: opencompass/configs/datasets/OlymMATH/olymmath_llm_judeg_gen.py - piqa: name: OpenBookQA category: Knowledge / Physics diff --git a/opencompass/configs/datasets/OlymMATH/README.md b/opencompass/configs/datasets/OlymMATH/README.md new file mode 100644 index 00000000..53c9b7a0 --- /dev/null +++ b/opencompass/configs/datasets/OlymMATH/README.md @@ -0,0 +1,60 @@ +# OlymMATH +[GitHub Link](https://github.com/RUCAIBox/OlymMATH) + +Dataset OlymMATH, please refer to the paper: +Challenging the Boundaries of Reasoning: An Olympiad-Level Math Benchmark for Large Language Models by Haoxiang Sun, Yingqian Min, Zhipeng Chen, Wayne Xin Zhao, Zheng Liu, Zhongyuan Wang, Lei Fang, and Ji-Rong Wen. 
+ + +## How to eval OlymMATH with model judge +This is a simple example: +```python + +from opencompass.models import OpenAISDK, OpenAI +from mmengine.config import read_base + + +with read_base(): + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as qwen2_5_7b_instruct_model + from opencompass.configs.datasets.OlymMATH.olymmath_gen import olymmath_datasets + +################## Judge Config ################## +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +], ) + +judge_cfg = dict( + # An API model with OpenAI API format is required for Judge + abbr='qwen2-5-32B-Instruct', + type=OpenAISDK, + path='Qwen/Qwen2.5-32B-Instruct', + key='sk-1234', + openai_api_base=[ + 'http://172.30.56.1:4000/v1', + ], + meta_template=api_meta_template, + query_per_second=16, + batch_size=1024, + temperature=0.001, + max_completion_tokens=32768, + tokenizer_path='gpt-4o-2024-05-13', + verbose=True, + max_out_len=16384, + max_seq_len=32768, +) + +################## Model Config ################## +models = [*qwen2_5_7b_instruct_model] + +################## Dataset Config ################## +datasets = [*olymmath_datasets] + +# Set judge_cfg for evaluation +for item in datasets: + item['infer_cfg']['inferencer']['max_out_len'] = 32768 + if 'judge_cfg' in item['eval_cfg']['evaluator']: + item['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg + + +work_dir = './outputs/olymmath_llm_eval' +``` diff --git a/opencompass/configs/datasets/OlymMATH/olymmath_llm_judeg_gen.py b/opencompass/configs/datasets/OlymMATH/olymmath_llm_judeg_gen.py new file mode 100644 index 00000000..dfc80538 --- /dev/null +++ b/opencompass/configs/datasets/OlymMATH/olymmath_llm_judeg_gen.py @@ -0,0 +1,5 @@ +from mmengine.config import read_base + +with read_base(): + # Default use LLM as a judge + from .olymmath_llmverify_gen_97b203 import olymmath_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/OlymMATH/olymmath_llmverify_gen_97b203.py b/opencompass/configs/datasets/OlymMATH/olymmath_llmverify_gen_97b203.py new file mode 100644 index 00000000..0c517a13 --- /dev/null +++ b/opencompass/configs/datasets/OlymMATH/olymmath_llmverify_gen_97b203.py @@ -0,0 +1,99 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import OlymMATHDataset + + +# ----------------------------- Detailed Config ----------------------------- + +math_reader_cfg = dict(input_columns=['problem'], output_column='answer', train_split='test') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{problem}\nRemember to put your final answer within \\boxed{}.'), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +sub_sets = ['en-hard', 'zh-hard', 'en-easy', 'zh-easy'] + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. 
You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. 
+ + + : \n{problem}\n\n\n + : \n{answer}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration + +olymmath_datasets = [] + +for sub_set in sub_sets: + math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=OlymMATHDataset, + path='RUC-AIBOX/OlymMATH', + reader_cfg=math_reader_cfg, + subset=sub_set, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', + ) + + olymmath_datasets.append( + dict( + type=OlymMATHDataset, + abbr=f'olymmath_llmjudge_{sub_set}', + path='RUC-AIBOX/OlymMATH', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + subset=sub_set, + ) + ) diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 49cd1522..45209054 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -106,6 +106,7 @@ from .natural_question import * # noqa: F401, F403 from .natural_question_cn import * # noqa: F401, F403 from .NPHardEval import * # noqa: F401, F403 from .obqa import * # noqa: F401, F403 +from .olymmath import * # noqa: F401, F403 from .OlympiadBench import * # noqa: F401, F403 from .OpenFinData import * # noqa: F401, F403 from .piqa import * # noqa: F401, F403 diff --git a/opencompass/datasets/olymmath.py b/opencompass/datasets/olymmath.py new file mode 100644 index 00000000..e9f8af40 --- /dev/null +++ b/opencompass/datasets/olymmath.py @@ -0,0 +1,14 @@ +from datasets import load_dataset + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +@LOAD_DATASET.register_module() +class OlymMATHDataset(BaseDataset): + + @staticmethod + def load(path: str, subset: str): + dataset = load_dataset(path, subset) + return dataset From dc8deb6af0d452c3134ce72693b460c1e1774ed6 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Wed, 2 Apr 2025 17:47:15 +0800 Subject: [PATCH 48/58] [BUMP] Bump version to 0.4.2 (#1997) --- opencompass/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencompass/__init__.py b/opencompass/__init__.py index f0ede3d3..a9873473 100644 --- a/opencompass/__init__.py +++ b/opencompass/__init__.py @@ -1 +1 @@ -__version__ = '0.4.1' +__version__ = '0.4.2' From 9b489e9ea02b9f0932cebb754e93af02111ac5f0 Mon Sep 17 00:00:00 2001 From: Myhs_phz Date: Thu, 3 Apr 2025 15:11:02 +0800 Subject: [PATCH 49/58] [Update] Revert math500 dataset configs (#1998) --- .../math/math_prm800k_500_0shot_cot_gen.py | 8 ++-- .../math_prm800k_500_0shot_cot_gen_11c4b5.py | 44 +++++++++++++++++++ .../datasets/math/math_prm800k_500_gen.py | 2 +- 3 files changed, 49 insertions(+), 5 deletions(-) create mode 100644 opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen_11c4b5.py diff --git a/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py index d49a1ccc..0faf8630 100644 --- a/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py +++ b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen.py @@ -1,15 +1,14 @@ from opencompass.openicl.icl_prompt_template import 
PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import MATHEvaluator from opencompass.datasets import ( MATHDataset, + MATHEvaluator, math_postprocess_v2, normalize_final_answer, ) math_reader_cfg = dict(input_columns=['problem'], output_column='solution') - math_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, @@ -28,7 +27,8 @@ math_infer_cfg = dict( # postprocess v2 math_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator) + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2), ) math_datasets = [ @@ -41,4 +41,4 @@ math_datasets = [ infer_cfg=math_infer_cfg, eval_cfg=math_eval_cfg, ) -] +] \ No newline at end of file diff --git a/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen_11c4b5.py b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen_11c4b5.py new file mode 100644 index 00000000..d49a1ccc --- /dev/null +++ b/opencompass/configs/datasets/math/math_prm800k_500_0shot_cot_gen_11c4b5.py @@ -0,0 +1,44 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import MATHEvaluator +from opencompass.datasets import ( + MATHDataset, + math_postprocess_v2, + normalize_final_answer, +) + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator) +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math_prm800k_500', + path='opencompass/math', + file_name='test_prm800k_500.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg, + ) +] diff --git a/opencompass/configs/datasets/math/math_prm800k_500_gen.py b/opencompass/configs/datasets/math/math_prm800k_500_gen.py index c74231fc..759b1b63 100644 --- a/opencompass/configs/datasets/math/math_prm800k_500_gen.py +++ b/opencompass/configs/datasets/math/math_prm800k_500_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .math_prm800k_500_0shot_cot_gen import math_datasets # noqa: F401, F403 \ No newline at end of file + from .math_prm800k_500_0shot_cot_gen_11c4b5 import math_datasets # noqa: F401, F403 \ No newline at end of file From 3a9a38417343a31186722d4dd46852aefc50557f Mon Sep 17 00:00:00 2001 From: Myhs_phz Date: Thu, 3 Apr 2025 17:37:53 +0800 Subject: [PATCH 50/58] [Doc] Fix links between zh & en (#2001) * test * test * test --- docs/en/conf.py | 4 ++++ docs/zh_cn/conf.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/docs/en/conf.py b/docs/en/conf.py index 9101ba3f..150d5ca7 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -117,6 +117,10 @@ html_js_files = [ 'js/custom.js' ] +html_context = { + 'github_version': 'main', +} + # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. 
diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py index 8910ead0..2a5e3f59 100644 --- a/docs/zh_cn/conf.py +++ b/docs/zh_cn/conf.py @@ -117,6 +117,10 @@ html_js_files = [ 'js/custom.js' ] +html_context = { + 'github_version': 'main', +} + # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. From f982d6278ea281bb3dfef602aa21d296444f677a Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Thu, 3 Apr 2025 19:32:36 +0800 Subject: [PATCH 51/58] [CI] fix baseline score (#2000) * update * update * update * update * update * update * update * updaste * update * update * updaste * updaste * update * update * update * update * update * update * update * update --- .github/scripts/eval_regression_api.py | 6 +- .../scripts/eval_regression_base_models.py | 18 +- .../scripts/eval_regression_chat_models.py | 41 +- .github/scripts/oc_score_assert.py | 40 +- .github/scripts/oc_score_baseline.yaml | 29 +- .../scripts/oc_score_baseline_fullbench.yaml | 599 ++++++++++++++++-- .../scripts/oc_score_baseline_testrange.yaml | 236 ++++--- .github/workflows/daily-run-test.yml | 21 +- .../lmdeploy_mixtral_8x22b_instruct_v0_1.py | 22 + .../subjective/common_summarizer.py | 1 - 10 files changed, 780 insertions(+), 233 deletions(-) create mode 100644 opencompass/configs/models/mistral/lmdeploy_mixtral_8x22b_instruct_v0_1.py diff --git a/.github/scripts/eval_regression_api.py b/.github/scripts/eval_regression_api.py index ba1902a9..98f0fdf0 100644 --- a/.github/scripts/eval_regression_api.py +++ b/.github/scripts/eval_regression_api.py @@ -24,9 +24,9 @@ models = [ abbr='lmdeploy-api-test', type=OpenAISDK, key='EMPTY', - openai_api_base='http://0.0.0.0:23333/v1', - path='internlm2', - tokenizer_path='internlm/internlm2_5-7b-chat', + openai_api_base='http://localhost:23333/v1', + path='internlm3', + tokenizer_path='internlm/internlm3-8b-instruct', rpm_verbose=True, meta_template=api_meta_template, query_per_second=128, diff --git a/.github/scripts/eval_regression_base_models.py b/.github/scripts/eval_regression_base_models.py index a8dc7a60..4259cc36 100644 --- a/.github/scripts/eval_regression_base_models.py +++ b/.github/scripts/eval_regression_base_models.py @@ -11,18 +11,10 @@ with read_base(): from opencompass.configs.datasets.winogrande.winogrande_5shot_ll_252f01 import \ winogrande_datasets # noqa: F401, E501 # read hf models - chat models - from opencompass.configs.models.chatglm.hf_glm4_9b import \ - models as hf_glm4_9b_model # noqa: F401, E501 from opencompass.configs.models.chatglm.lmdeploy_glm4_9b import \ models as lmdeploy_glm4_9b_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_7b_base import \ models as hf_deepseek_7b_base_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_67b_base import \ - models as hf_deepseek_67b_base_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_base import \ - models as hf_deepseek_moe_16b_base_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_v2_lite import \ - models as hf_deepseek_v2_lite_model # noqa: F401, E501 from opencompass.configs.models.deepseek.lmdeploy_deepseek_7b_base import \ models as lmdeploy_deepseek_7b_base_model # noqa: F401, E501 from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_base import \ @@ -49,12 +41,6 @@ with read_base(): models as hf_internlm2_5_7b_model # noqa: F401, E501 from 
opencompass.configs.models.hf_internlm.hf_internlm2_7b import \ models as hf_internlm2_7b_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_20b import \ - models as hf_internlm2_20b_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_base_7b import \ - models as hf_internlm2_base_7b_model # noqa: F401, E501 - from opencompass.configs.models.hf_internlm.hf_internlm2_base_20b import \ - models as hf_internlm2_base_20b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_1_8b import \ models as lmdeploy_internlm2_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b import \ @@ -65,14 +51,14 @@ with read_base(): models as lmdeploy_internlm2_20b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_7b import \ models as lmdeploy_internlm2_base_7b_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_base_20b import \ + models as lmdeploy_internlm2_base_20b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama2_7b import \ models as hf_llama2_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_1_8b import \ models as hf_llama3_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_8b import \ models as hf_llama3_8b_model # noqa: F401, E501 - from opencompass.configs.models.hf_llama.hf_llama3_70b import \ - models as hf_llama3_70b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_1_8b import \ models as lmdeploy_llama3_1_8b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.lmdeploy_llama3_8b import \ diff --git a/.github/scripts/eval_regression_chat_models.py b/.github/scripts/eval_regression_chat_models.py index 40ec1bc5..bfe923f6 100644 --- a/.github/scripts/eval_regression_chat_models.py +++ b/.github/scripts/eval_regression_chat_models.py @@ -15,14 +15,24 @@ with read_base(): models as vllm_glm4_9b_chat_model # noqa: F401, E501 from opencompass.configs.models.deepseek.hf_deepseek_7b_chat import \ models as hf_deepseek_7b_chat_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_67b_chat import \ - models as hf_deepseek_67b_chat_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_moe_16b_chat import \ - models as hf_deepseek_moe_16b_chat_model # noqa: F401, E501 - from opencompass.configs.models.deepseek.hf_deepseek_v2_lite_chat import \ - models as hf_deepseek_v2_lite_chat_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_67b_chat import \ + models as lmdeploy_deepseek_67b_chat_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_8b import \ + models as \ + lmdeploy_deepseek_r1_distill_llama_8b_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_llama_70b import \ + models as \ + lmdeploy_deepseek_r1_distill_llama_70b_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_1_5b import \ + models as \ + lmdeploy_deepseek_r1_distill_qwen_1_5b_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_r1_distill_qwen_32b import \ + models as \ + lmdeploy_deepseek_r1_distill_qwen_32b_model # noqa: F401, E501 from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_5_1210 import \ models as 
lmdeploy_deepseek_v2_5_1210_model # noqa: F401, E501 + from opencompass.configs.models.deepseek.lmdeploy_deepseek_v2_lite import \ + models as lmdeploy_deepseek_v2_lite_model # noqa: F401, E501 from opencompass.configs.models.deepseek.vllm_deepseek_7b_chat import \ models as vllm_deepseek_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.gemma.hf_gemma2_2b_it import \ @@ -45,6 +55,8 @@ with read_base(): models as hf_internlm2_5_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.hf_internlm2_5_20b_chat import \ models as hf_internlm2_5_20b_chat_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \ + models as hf_internlm3_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \ models as lmdeploy_internlm2_5_7b_chat_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_20b_chat import \ @@ -57,6 +69,8 @@ with read_base(): models as lmdeploy_internlm2_chat_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_chat_7b_sft import \ models as lmdeploy_internlm2_chat_7b_sft_model # noqa: F401, E501 + from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \ + models as lmdeploy_internlm3_8b_instruct_model # noqa: F401, E501 from opencompass.configs.models.hf_internlm.vllm_internlm2_chat_7b import \ models as vllm_internlm2_chat_7b_model # noqa: F401, E501 from opencompass.configs.models.hf_llama.hf_llama3_1_8b_instruct import \ @@ -83,10 +97,6 @@ with read_base(): models as hf_mistral_nemo_instruct_2407_model # noqa: F401, E501 from opencompass.configs.models.mistral.hf_mistral_small_instruct_2409 import \ models as hf_mistral_small_instruct_2409_model # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mixtral_8x7b_instruct_v0_1 import \ - models as hf_mixtral_8x7b_instruct_v0_1_model # noqa: F401, E501 - from opencompass.configs.models.mistral.hf_mixtral_8x22b_instruct_v0_1 import \ - models as hf_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.lmdeploy_mistral_large_instruct_2411 import \ models as \ lmdeploy_mistral_large_instruct_2411_model # noqa: F401, E501 @@ -95,14 +105,19 @@ with read_base(): from opencompass.configs.models.mistral.lmdeploy_mistral_small_instruct_2409 import \ models as \ lmdeploy_mistral_small_instruct_2409_model # noqa: F401, E501 + from opencompass.configs.models.mistral.lmdeploy_mixtral_8x22b_instruct_v0_1 import \ + models as \ + lmdeploy_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_1 import \ models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \ models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501 + from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \ + models as vllm_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501 from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \ models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501 - from opencompass.configs.models.phi.hf_phi_3_mini_4k_instruct import \ - models as hf_phi_3_mini_4k_instruct_model # noqa: F401, E501 + from opencompass.configs.models.phi.hf_phi_4 import \ + models as hf_phi_4_model # noqa: F401, E501 from 
opencompass.configs.models.qwen2_5.hf_qwen2_5_0_5b_instruct import \ models as hf_qwen2_5_0_5b_instruct_model # noqa: F401, E501 from opencompass.configs.models.qwen2_5.hf_qwen2_5_3b_instruct import \ @@ -142,6 +157,8 @@ with read_base(): from ...volc import infer as volc_infer # noqa: F401, E501 +hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf' + race_datasets = [race_datasets[1]] datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], []) diff --git a/.github/scripts/oc_score_assert.py b/.github/scripts/oc_score_assert.py index 4ef414dc..1cbc5ad2 100644 --- a/.github/scripts/oc_score_assert.py +++ b/.github/scripts/oc_score_assert.py @@ -175,10 +175,11 @@ class TestApibench: class TestVolcFullbench: """Test cases for chat model.""" - @pytest.mark.parametrize( - 'model, dataset', - [(p1, p2) for p1 in ['internlm2_5-7b-chat-turbomind'] - for p2 in dataset_list('internlm2_5-7b-chat-turbomind', 'objective')]) + @pytest.mark.parametrize('model, dataset', [(p1, p2) for p1 in [ + 'internlm2_5-7b-chat-turbomind', 'qwen2.5-7b-instruct-turbomind', + 'internlm2_5-7b-chat-pytorch', 'qwen2.5-7b-instruct-pytorch', + 'internlm3-8b-instruct-turbomind', 'internlm3-8b-instruct-pytorch' + ] for p2 in dataset_list(p1, 'objective')]) @pytest.mark.chat_objective def test_chat_objective(self, baseline_scores_fullbench, result_scores, model, dataset): @@ -245,10 +246,7 @@ class TestCmdCase: @pytest.mark.parametrize('model, dataset', [('internlm2_5-7b-hf', 'race-middle_accuracy'), ('internlm2_5-7b-hf', 'race-high_accuracy'), - ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy'), - ('internlm2-1.8b-hf', 'race-middle_accuracy'), - ('internlm2-1.8b-hf', 'race-high_accuracy'), - ('internlm2-1.8b-hf', 'demo_gsm8k_accuracy')]) + ('internlm2_5-7b-hf', 'demo_gsm8k_accuracy')]) def test_cmd_case1(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) @@ -260,9 +258,9 @@ class TestCmdCase: [('internlm2_5-7b-chat-lmdeploy', 'race-middle_accuracy'), ('internlm2_5-7b-chat-lmdeploy', 'race-high_accuracy'), ('internlm2_5-7b-chat-lmdeploy', 'demo_gsm8k_accuracy'), - ('internlm2-chat-1.8b-lmdeploy', 'race-middle_accuracy'), - ('internlm2-chat-1.8b-lmdeploy', 'race-high_accuracy'), - ('internlm2-chat-1.8b-lmdeploy', 'demo_gsm8k_accuracy')]) + ('internlm3-8b-instruct-lmdeploy', 'race-middle_accuracy'), + ('internlm3-8b-instruct-lmdeploy', 'race-high_accuracy'), + ('internlm3-8b-instruct-lmdeploy', 'demo_gsm8k_accuracy')]) def test_cmd_case2(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) @@ -280,13 +278,25 @@ class TestCmdCase: @pytest.mark.case4 @pytest.mark.parametrize( - 'model, dataset', [('internlm2_5-7b-chat_hf', 'race-middle_accuracy'), - ('internlm2_5-7b-chat_hf', 'race-high_accuracy'), - ('internlm2_5-7b-chat_hf', 'demo_gsm8k_accuracy')]) + 'model, dataset', + [('internlm3-8b-instruct_hf-lmdeploy', 'race-middle_accuracy'), + ('internlm3-8b-instruct_hf-lmdeploy', 'race-high_accuracy'), + ('internlm3-8b-instruct_hf-lmdeploy', 'demo_gsm8k_accuracy')]) def test_cmd_case4(self, baseline_scores, result_scores, model, dataset): base_score = baseline_scores.get(model).get(dataset) result_score = result_scores.get(model).get(dataset) - assert_score(model, result_score, base_score, dataset) + assert_score(model + '_batch', result_score, base_score, dataset) + + @pytest.mark.case5 + 
@pytest.mark.parametrize( + 'model, dataset', + [('internlm3-8b-instruct_hf-vllm', 'race-middle_accuracy'), + ('internlm3-8b-instruct_hf-vllm', 'race-high_accuracy'), + ('internlm3-8b-instruct_hf-vllm', 'demo_gsm8k_accuracy')]) + def test_cmd_case5(self, baseline_scores, result_scores, model, dataset): + base_score = baseline_scores.get(model).get(dataset) + result_score = result_scores.get(model).get(dataset) + assert_score(model + '_batch', result_score, base_score, dataset) def assert_score(model_type, score, baseline, dataset: str = ''): diff --git a/.github/scripts/oc_score_baseline.yaml b/.github/scripts/oc_score_baseline.yaml index cd2e3328..e4567553 100644 --- a/.github/scripts/oc_score_baseline.yaml +++ b/.github/scripts/oc_score_baseline.yaml @@ -8,20 +8,25 @@ internlm2_5-7b_hf: race-middle_accuracy: 91.78 race-high_accuracy: 90.02 -internlm2-1.8b-hf: - demo_gsm8k_accuracy: 15.62 - race-middle_accuracy: 71.66 - race-high_accuracy: 66.38 - internlm2_5-7b-chat-lmdeploy: - demo_gsm8k_accuracy: 89.06 + demo_gsm8k_accuracy: 87.50 race-middle_accuracy: 92.76 race-high_accuracy: 90.54 -internlm2-chat-1.8b-lmdeploy: - demo_gsm8k_accuracy: 31 - race-middle_accuracy: 81.34 - race-high_accuracy: 73.96 +internlm3-8b-instruct-lmdeploy: + demo_gsm8k_accuracy: 73.44 + race-middle_accuracy: 93.38 + race-high_accuracy: 90.34 + +internlm3-8b-instruct_hf-lmdeploy: + demo_gsm8k_accuracy: 73.44 + race-middle_accuracy: 93.38 + race-high_accuracy: 90.34 + +internlm3-8b-instruct_hf-vllm: + demo_gsm8k_accuracy: 81.25 + race-middle_accuracy: 92.20 + race-high_accuracy: 89.88 internlm2_5-7b-chat_hf: demo_gsm8k_accuracy: 87.50 @@ -29,6 +34,6 @@ internlm2_5-7b-chat_hf: race-high_accuracy: 90.48 lmdeploy-api-test: - gsm8k_accuracy: 68.75 - race-middle_accuracy: 87.50 + gsm8k_accuracy: 56.25 + race-middle_accuracy: 93.75 race-high_accuracy: 93.75 diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index c0e735fb..3f5753d3 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -39,15 +39,15 @@ internlm2_5-7b-chat-hf_fullbench: college_knowledge_naive_average: 87.5 subjective: alignment_bench_v1_1_总分: 0.66 - alpaca_eval_total: 20 + alpaca_eval_total: 0 arenahard_score: 50 Followbench_naive_average: 1 CompassArena_naive_average: 43 mtbench101_avg: 7.8 - wildbench_average: -12.78 + wildbench_average: -15.56 simpleqa_accuracy_given_attempted: 0 chinese_simpleqa_given_attempted_accuracy: 1 - alignment_bench_v1_1_专业能力: 7.90 + alignment_bench_v1_1_专业能力: 8.00 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 @@ -55,7 +55,7 @@ internlm2_5-7b-chat-hf_fullbench: alignment_bench_v1_1_文本写作: 0 alignment_bench_v1_1_角色扮演: 0 alignment_bench_v1_1_综合问答: 0 - alpaca_eval_helpful_base: 20 + alpaca_eval_helpful_base: 0 compassarena_language_naive_average: 35 compassarena_knowledge_naive_average: 55 compassarena_reason_v2_naive_average: 40 @@ -78,53 +78,53 @@ internlm2_5-7b-chat-hf_fullbench: internlm2_5-7b-chat-turbomind_fullbench: objective: race-high_accuracy: 93.75 - ARC-c_accuracy: 93.75 + ARC-c_accuracy: 87.50 BoolQ_accuracy: 68.75 triviaqa_wiki_1shot_score: 50 nq_open_1shot_score: 25 IFEval_Prompt-level-strict-accuracy: 56.25 - drop_accuracy: 81.25 + drop_accuracy: 75 GPQA_diamond_accuracy: 31.25 - hellaswag_accuracy: 81.25 - TheoremQA_score: 6.25 + hellaswag_accuracy: 87.5 + TheoremQA_score: 12.5 musr_average_naive_average: 39.58 - korbench_single_naive_average: 
37.50 - gsm8k_accuracy: 68.75 - math_accuracy: 68.75 + korbench_single_naive_average: 40 + gsm8k_accuracy: 62.5 + math_accuracy: 75 cmo_fib_accuracy: 6.25 aime2024_accuracy: 6.25 - wikibench-wiki-single_choice_cncircular_perf_4: 50.00 + wikibench-wiki-single_choice_cncircular_perf_4: 25 sanitized_mbpp_score: 68.75 - ds1000_naive_average: 16.96 + ds1000_naive_average: 17.86 lcb_code_generation_pass@1: 12.5 lcb_code_execution_pass@1: 43.75 - lcb_test_output_pass@1: 25.00 - bbh-logical_deduction_seven_objects_score: 50.00 - bbh-multistep_arithmetic_two_score: 68.75 - mmlu-other_accuracy: 69.71 - cmmlu-china-specific_accuracy: 75.83 + lcb_test_output_pass@1: 18.75 + bbh-logical_deduction_seven_objects_score: 56.25 + bbh-multistep_arithmetic_two_score: 75 + mmlu-other_accuracy: 72.6 + cmmlu-china-specific_accuracy: 78.33 mmlu_pro_math_accuracy: 31.25 - ds1000_Pandas_accuracy: 0 + ds1000_Pandas_accuracy: 12.5 ds1000_Numpy_accuracy: 0 ds1000_Tensorflow_accuracy: 12.5 - ds1000_Scipy_accuracy: 18.75 + ds1000_Scipy_accuracy: 25 ds1000_Sklearn_accuracy: 18.75 - ds1000_Pytorch_accuracy: 18.75 + ds1000_Pytorch_accuracy: 6.25 ds1000_Matplotlib_accuracy: 50.00 openai_mmmlu_lite_AR-XY_accuracy: 37.5 college_naive_average: 12.50 college_knowledge_naive_average: 87.5 subjective: - alignment_bench_v1_1_总分: 0.70 + alignment_bench_v1_1_总分: 0.66 alpaca_eval_total: 0 arenahard_score: 50 Followbench_naive_average: 1 - CompassArena_naive_average: 38 - mtbench101_avg: 7.80 - wildbench_average: -4.86 + CompassArena_naive_average: 40 + mtbench101_avg: 8 + wildbench_average: -6.81 simpleqa_accuracy_given_attempted: 0 chinese_simpleqa_given_attempted_accuracy: 1 - alignment_bench_v1_1_专业能力: 8.4 + alignment_bench_v1_1_专业能力: 7.9 alignment_bench_v1_1_数学计算: 0 alignment_bench_v1_1_基本任务: 0 alignment_bench_v1_1_逻辑推理: 0 @@ -134,10 +134,10 @@ internlm2_5-7b-chat-turbomind_fullbench: alignment_bench_v1_1_综合问答: 0 alpaca_eval_helpful_base: 0 compassarena_language_naive_average: 35 - compassarena_knowledge_naive_average: 50 - compassarena_reason_v2_naive_average: 30 - compassarena_math_v2_naive_average: 50 - compassarena_creationv2_zh_naive_average: 25 + compassarena_knowledge_naive_average: 45 + compassarena_reason_v2_naive_average: 25 + compassarena_math_v2_naive_average: 60 + compassarena_creationv2_zh_naive_average: 35 followbench_llmeval_en_HSR_AVG: 1 followbench_llmeval_en_SSR_AVG: 1 followbench_llmeval_en_HSR_L1: 1 @@ -190,20 +190,20 @@ internlm2_5-7b-turbomind_fullbench: drop_accuracy: 62.5 GPQA_diamond_accuracy: 62.5 hellaswag_accuracy: 93.75 - TheoremQA_score: 25.00 + TheoremQA_score: 31.25 winogrande_accuracy: 87.5 - gsm8k_accuracy: 62.50 - GaokaoBench_2010-2022_Math_II_MCQs_score: 81.25 + gsm8k_accuracy: 56.25 + GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75 GaokaoBench_2010-2022_Math_II_Fill-in-the-Blank_score: 0 math_accuracy: 18.75 wikibench-wiki-single_choice_cncircular_perf_4: 25 sanitized_mbpp_score: 62.50 - dingo_en_192_score: 31.25 + dingo_en_192_score: 50.00 dingo_zh_170_score: 93.75 mmlu-other_accuracy: 76.92 cmmlu-china-specific_accuracy: 84.17 mmlu_pro_math_accuracy: 18.75 - bbh-logical_deduction_seven_objects_score: 50 + bbh-logical_deduction_seven_objects_score: 43.75 bbh-multistep_arithmetic_two_score: 56.25 college_naive_average: 12.5 college_knowledge_naive_average: 87.5 @@ -409,7 +409,7 @@ internlm2_5-7b-chat-turbomind: alpaca_eval_koala: 28.21 alpaca_eval_oasst: 23.4 alpaca_eval_selfinstruct: 30.95 - alpaca_eval_vicuna: 25 + alpaca_eval_vicuna: 33.75 compassarena_language_naive_average: 52.5 
compassarena_knowledge_naive_average: 36 compassarena_reason_v2_naive_average: 35 @@ -454,3 +454,530 @@ internlm2_5-7b-chat-1m-turbomind: longbench_few-shot-learning_score: 51.67 longbench_synthetic-tasks_score: 66.83 longbench_code-completion_score: 45.99 + + +qwen2.5-7b-instruct-turbomind: + objective: + race-high_accuracy: 84.99 + ARC-c_accuracy: 92.2 + BoolQ_accuracy: 86.7 + triviaqa_wiki_1shot_score: 53.06 + nq_open_1shot_score: 17.51 + mmmlu_lite_naive_average: 54.96 + IFEval_Prompt-level-strict-accuracy: 71.53 + drop_accuracy: 80.07 + bbh_naive_average: 68.81 + GPQA_diamond_accuracy: 34.34 + hellaswag_accuracy: 85.42 + TheoremQA_score: 18.38 + musr_average_naive_average: 43.44 + korbench_single_naive_average: 39.44 + ARC_Prize_Public_Evaluation_accuracy: 0 + gsm8k_accuracy: 92.57 + GaokaoBench_weighted_average: 80.14 + math_accuracy: 73.58 + cmo_fib_accuracy: 25 + aime2024_accuracy: 16.67 + Mathbench_naive_average: 77.33 + wikibench-wiki-single_choice_cncircular_perf_4: 34.9 + cmmlu_naive_average: 75.97 + mmlu_naive_average: 76.01 + mmlu_pro_naive_average: 56.12 + openai_humaneval_humaneval_pass@1: 83.54 + sanitized_mbpp_score: 74.71 + humanevalx_naive_average: 48.29 + ds1000_naive_average: 18.66 + lcb_code_generation_pass@1: 39.5 + lcb_code_execution_pass@1: 42.38 + lcb_test_output_pass@1: 50.68 + bigcodebench_hard_instruct_pass@1: 16.22 + bigcodebench_hard_complete_pass@1: 11.49 + teval_naive_average: 79.72 + SciCode_sub_accuracy: 100 + qa_dingo_cn_score: 99.01 + mmlu_accuracy: 76.01 + mmlu-stem_accuracy: 77.59 + mmlu-social-science_accuracy: 79.02 + mmlu-humanities_accuracy: 72.07 + mmlu-other_accuracy: 74.86 + cmmlu_accuracy: 75.97 + cmmlu-stem_accuracy: 73.09 + cmmlu-social-science_accuracy: 75.95 + cmmlu-humanities_accuracy: 76.53 + cmmlu-other_accuracy: 78.79 + cmmlu-china-specific_accuracy: 73.17 + mmlu_pro_accuracy: 56.12 + mmlu_pro_biology_accuracy: 71.41 + mmlu_pro_business_accuracy: 67.68 + mmlu_pro_chemistry_accuracy: 54.59 + mmlu_pro_computer_science_accuracy: 58.29 + mmlu_pro_economics_accuracy: 66.82 + mmlu_pro_engineering_accuracy: 42.41 + mmlu_pro_health_accuracy: 55.87 + mmlu_pro_history_accuracy: 46.46 + mmlu_pro_law_accuracy: 28.97 + mmlu_pro_math_accuracy: 73.13 + mmlu_pro_philosophy_accuracy: 44.89 + mmlu_pro_physics_accuracy: 58.43 + mmlu_pro_psychology_accuracy: 63.16 + mmlu_pro_other_accuracy: 53.57 + humanevalx-python_pass@1: 50 + humanevalx-cpp_pass@1: 42.07 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 74.39 + humanevalx-js_pass@1: 75 + ds1000_Pandas_accuracy: 14.09 + ds1000_Numpy_accuracy: 8.18 + ds1000_Tensorflow_accuracy: 17.78 + ds1000_Scipy_accuracy: 15.09 + ds1000_Sklearn_accuracy: 10.43 + ds1000_Pytorch_accuracy: 4.41 + ds1000_Matplotlib_accuracy: 60.65 + mmmlu_lite_accuracy: 54.96 + openai_mmmlu_lite_AR-XY_accuracy: 42.32 + openai_mmmlu_lite_BN-BD_accuracy: 42.25 + openai_mmmlu_lite_DE-DE_accuracy: 59.93 + openai_mmmlu_lite_ES-LA_accuracy: 66.53 + openai_mmmlu_lite_FR-FR_accuracy: 66.88 + openai_mmmlu_lite_HI-IN_accuracy: 49.26 + openai_mmmlu_lite_ID-ID_accuracy: 61.26 + openai_mmmlu_lite_IT-IT_accuracy: 65.47 + openai_mmmlu_lite_JA-JP_accuracy: 61.54 + openai_mmmlu_lite_KO-KR_accuracy: 60.28 + openai_mmmlu_lite_PT-BR_accuracy: 55.51 + openai_mmmlu_lite_SW-KE_accuracy: 36.42 + openai_mmmlu_lite_YO-NG_accuracy: 32.14 + openai_mmmlu_lite_ZH-CN_accuracy: 69.61 + college_naive_average: 48 + high_naive_average: 59 + middle_naive_average: 78 + primary_naive_average: 85.67 + arithmetic_naive_average: 75.67 + mathbench-a (average)_naive_average: 
69.27 + college_knowledge_naive_average: 83.86 + high_knowledge_naive_average: 80.29 + middle_knowledge_naive_average: 84.26 + primary_knowledge_naive_average: 93.16 + mathbench-t (average)_naive_average: 85.39 + + + + +internlm2_5-7b-chat-pytorch: + objective: + race-high_accuracy: 86.39 + ARC-c_accuracy: 90.51 + BoolQ_accuracy: 88.01 + triviaqa_wiki_1shot_score: 64.77 + nq_open_1shot_score: 22.71 + mmmlu_lite_naive_average: 45.02 + IFEval_Prompt-level-strict-accuracy: 56.56 + drop_accuracy: 75.46 + bbh_naive_average: 73.34 + GPQA_diamond_accuracy: 32.83 + hellaswag_accuracy: 94.81 + TheoremQA_score: 23.88 + musr_average_naive_average: 51.31 + korbench_single_naive_average: 32 + ARC_Prize_Public_Evaluation_accuracy: 0.01 + gsm8k_accuracy: 86.96 + GaokaoBench_weighted_average: 78.05 + math_accuracy: 60.34 + cmo_fib_accuracy: 12.98 + aime2024_accuracy: 3.33 + Mathbench_naive_average: 64.82 + wikibench-wiki-single_choice_cncircular_perf_4: 31.7 + cmmlu_naive_average: 74.24 + mmlu_naive_average: 70.2 + mmlu_pro_naive_average: 45.39 + openai_humaneval_humaneval_pass@1: 70.12 + sanitized_mbpp_score: 64.59 + humanevalx_naive_average: 38.78 + ds1000_naive_average: 14.19 + lcb_code_generation_pass@1: 16.5 + lcb_code_execution_pass@1: 33.82 + lcb_test_output_pass@1: 22.62 + bigcodebench_hard_instruct_pass@1: 6.08 + bigcodebench_hard_complete_pass@1: 6.76 + teval_naive_average: 79.73 + SciCode_sub_accuracy: 100 + qa_dingo_cn_score: 100 + mmlu_accuracy: 70.2 + mmlu-stem_accuracy: 67.73 + mmlu-social-science_accuracy: 75.49 + mmlu-humanities_accuracy: 68.56 + mmlu-other_accuracy: 70.58 + cmmlu_accuracy: 74.24 + cmmlu-stem_accuracy: 66.7 + cmmlu-social-science_accuracy: 75.88 + cmmlu-humanities_accuracy: 77.56 + cmmlu-other_accuracy: 77.52 + cmmlu-china-specific_accuracy: 73.46 + mmlu_pro_accuracy: 45.39 + mmlu_pro_biology_accuracy: 65.83 + mmlu_pro_business_accuracy: 51.96 + mmlu_pro_chemistry_accuracy: 36.84 + mmlu_pro_computer_science_accuracy: 48.29 + mmlu_pro_economics_accuracy: 56.16 + mmlu_pro_engineering_accuracy: 29.1 + mmlu_pro_health_accuracy: 44.5 + mmlu_pro_history_accuracy: 42.26 + mmlu_pro_law_accuracy: 24.98 + mmlu_pro_math_accuracy: 54.85 + mmlu_pro_philosophy_accuracy: 39.28 + mmlu_pro_physics_accuracy: 37.41 + mmlu_pro_psychology_accuracy: 58.27 + mmlu_pro_other_accuracy: 45.78 + humanevalx-python_pass@1: 56.1 + humanevalx-cpp_pass@1: 20.73 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 59.15 + humanevalx-js_pass@1: 57.93 + ds1000_Pandas_accuracy: 8.93 + ds1000_Numpy_accuracy: 4.09 + ds1000_Tensorflow_accuracy: 11.11 + ds1000_Scipy_accuracy: 7.55 + ds1000_Sklearn_accuracy: 7.83 + ds1000_Pytorch_accuracy: 8.82 + ds1000_Matplotlib_accuracy: 50.97 + mmmlu_lite_accuracy: 45.02 + openai_mmmlu_lite_AR-XY_accuracy: 18.6 + openai_mmmlu_lite_BN-BD_accuracy: 27.58 + openai_mmmlu_lite_DE-DE_accuracy: 51.23 + openai_mmmlu_lite_ES-LA_accuracy: 56.63 + openai_mmmlu_lite_FR-FR_accuracy: 58.11 + openai_mmmlu_lite_HI-IN_accuracy: 33.82 + openai_mmmlu_lite_ID-ID_accuracy: 50.39 + openai_mmmlu_lite_IT-IT_accuracy: 50.39 + openai_mmmlu_lite_JA-JP_accuracy: 50.95 + openai_mmmlu_lite_KO-KR_accuracy: 45.05 + openai_mmmlu_lite_PT-BR_accuracy: 57.89 + openai_mmmlu_lite_SW-KE_accuracy: 32.14 + openai_mmmlu_lite_YO-NG_accuracy: 32.14 + openai_mmmlu_lite_ZH-CN_accuracy: 65.33 + college_naive_average: 21 + high_naive_average: 47 + middle_naive_average: 59.67 + primary_naive_average: 76 + arithmetic_naive_average: 62 + mathbench-a (average)_naive_average: 53.13 + college_knowledge_naive_average: 68.99 + 
high_knowledge_naive_average: 70.06 + middle_knowledge_naive_average: 78.53 + primary_knowledge_naive_average: 88.49 + mathbench-t (average)_naive_average: 76.51 + + +qwen2.5-7b-instruct-pytorch: + objective: + race-high_accuracy: 85.16 + ARC-c_accuracy: 90.85 + BoolQ_accuracy: 86.61 + triviaqa_wiki_1shot_score: 52.96 + nq_open_1shot_score: 17.62 + mmmlu_lite_naive_average: 54.7 + IFEval_Prompt-level-strict-accuracy: 71.35 + drop_accuracy: 80.23 + bbh_naive_average: 68.88 + GPQA_diamond_accuracy: 36.36 + hellaswag_accuracy: 85.49 + TheoremQA_score: 18.38 + musr_average_naive_average: 43.3 + korbench_single_naive_average: 39.44 + ARC_Prize_Public_Evaluation_accuracy: 0 + gsm8k_accuracy: 91.66 + GaokaoBench_weighted_average: 80.02 + math_accuracy: 73.74 + cmo_fib_accuracy: 26.44 + aime2024_accuracy: 10 + Mathbench_naive_average: 77.08 + wikibench-wiki-single_choice_cncircular_perf_4: 34 + cmmlu_naive_average: 75.9 + mmlu_naive_average: 76.27 + mmlu_pro_naive_average: 56.14 + openai_humaneval_humaneval_pass@1: 84.76 + sanitized_mbpp_score: 74.71 + humanevalx_naive_average: 48.17 + ds1000_naive_average: 18.57 + lcb_code_generation_pass@1: 38.75 + lcb_code_execution_pass@1: 42.38 + lcb_test_output_pass@1: 50.45 + bigcodebench_hard_instruct_pass@1: 16.89 + bigcodebench_hard_complete_pass@1: 12.16 + teval_naive_average: 79.46 + SciCode_sub_accuracy: 100 + qa_dingo_cn_score: 100 + mmlu_accuracy: 76.27 + mmlu-stem_accuracy: 77.75 + mmlu-social-science_accuracy: 78.65 + mmlu-humanities_accuracy: 73.12 + mmlu-other_accuracy: 75.05 + cmmlu_accuracy: 75.9 + cmmlu-stem_accuracy: 73.41 + cmmlu-social-science_accuracy: 75.97 + cmmlu-humanities_accuracy: 76.42 + cmmlu-other_accuracy: 78.15 + cmmlu-china-specific_accuracy: 73.27 + mmlu_pro_accuracy: 56.14 + mmlu_pro_biology_accuracy: 72.25 + mmlu_pro_business_accuracy: 66.16 + mmlu_pro_chemistry_accuracy: 55.65 + mmlu_pro_computer_science_accuracy: 60.24 + mmlu_pro_economics_accuracy: 66.82 + mmlu_pro_engineering_accuracy: 41.38 + mmlu_pro_health_accuracy: 54.89 + mmlu_pro_history_accuracy: 46.46 + mmlu_pro_law_accuracy: 29.06 + mmlu_pro_math_accuracy: 73.58 + mmlu_pro_philosophy_accuracy: 44.89 + mmlu_pro_physics_accuracy: 60.05 + mmlu_pro_psychology_accuracy: 61.9 + mmlu_pro_other_accuracy: 52.6 + humanevalx-python_pass@1: 51.83 + humanevalx-cpp_pass@1: 42.68 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 73.78 + humanevalx-js_pass@1: 72.56 + ds1000_Pandas_accuracy: 14.09 + ds1000_Numpy_accuracy: 8.64 + ds1000_Tensorflow_accuracy: 17.78 + ds1000_Scipy_accuracy: 15.09 + ds1000_Sklearn_accuracy: 8.7 + ds1000_Pytorch_accuracy: 4.41 + ds1000_Matplotlib_accuracy: 61.29 + mmmlu_lite_accuracy: 54.7 + openai_mmmlu_lite_AR-XY_accuracy: 42.32 + openai_mmmlu_lite_BN-BD_accuracy: 42.18 + openai_mmmlu_lite_DE-DE_accuracy: 60 + openai_mmmlu_lite_ES-LA_accuracy: 66.18 + openai_mmmlu_lite_FR-FR_accuracy: 66.88 + openai_mmmlu_lite_HI-IN_accuracy: 48.63 + openai_mmmlu_lite_ID-ID_accuracy: 61.26 + openai_mmmlu_lite_IT-IT_accuracy: 65.26 + openai_mmmlu_lite_JA-JP_accuracy: 60.7 + openai_mmmlu_lite_KO-KR_accuracy: 60.63 + openai_mmmlu_lite_PT-BR_accuracy: 54.46 + openai_mmmlu_lite_SW-KE_accuracy: 36 + openai_mmmlu_lite_YO-NG_accuracy: 31.86 + openai_mmmlu_lite_ZH-CN_accuracy: 69.4 + college_naive_average: 48.33 + high_naive_average: 59.33 + middle_naive_average: 76.67 + primary_naive_average: 86.67 + arithmetic_naive_average: 74.33 + mathbench-a (average)_naive_average: 69.07 + college_knowledge_naive_average: 83.54 + high_knowledge_naive_average: 80.82 + 
middle_knowledge_naive_average: 83.79 + primary_knowledge_naive_average: 92.22 + mathbench-t (average)_naive_average: 85.1 + + +internlm3-8b-instruct-turbomind: + objective: + race-high_accuracy: 89.22 + ARC-c_accuracy: 92.54 + BoolQ_accuracy: 86.45 + triviaqa_wiki_1shot_score: 60.72 + nq_open_1shot_score: 20.25 + mmmlu_lite_naive_average: 41.82 + IFEval_Prompt-level-strict-accuracy: 77.45 + drop_accuracy: 83.27 + bbh_naive_average: 55.22 + GPQA_diamond_accuracy: 37.88 + hellaswag_accuracy: 91.28 + TheoremQA_score: 20.12 + musr_average_naive_average: 36.86 + korbench_single_naive_average: 41.2 + ARC_Prize_Public_Evaluation_accuracy: 0.06 + gsm8k_accuracy: 91.28 + GaokaoBench_weighted_average: 86.59 + math_accuracy: 76.96 + cmo_fib_accuracy: 35.1 + aime2024_accuracy: 16.67 + Mathbench_naive_average: 78.96 + wikibench-wiki-single_choice_cncircular_perf_4: 37.45 + cmmlu_naive_average: 83.33 + mmlu_naive_average: 76.21 + mmlu_pro_naive_average: 57.96 + openai_humaneval_humaneval_pass@1: 81.71 + sanitized_mbpp_score: 69.65 + humanevalx_naive_average: 40.73 + ds1000_naive_average: 27.23 + lcb_code_generation_pass@1: 34.75 + lcb_code_execution_pass@1: 49.9 + lcb_test_output_pass@1: 48.19 + bigcodebench_hard_instruct_pass@1: 13.51 + bigcodebench_hard_complete_pass@1: 15.54 + teval_naive_average: 82.86 + SciCode_sub_accuracy: 100 + qa_dingo_cn_score: 100 + mmlu_accuracy: 76.21 + mmlu-stem_accuracy: 77.7 + mmlu-social-science_accuracy: 80.98 + mmlu-humanities_accuracy: 70.83 + mmlu-other_accuracy: 75.01 + cmmlu_accuracy: 83.33 + cmmlu-stem_accuracy: 79.66 + cmmlu-social-science_accuracy: 83.39 + cmmlu-humanities_accuracy: 84.73 + cmmlu-other_accuracy: 86.2 + cmmlu-china-specific_accuracy: 81.77 + mmlu_pro_accuracy: 57.96 + mmlu_pro_biology_accuracy: 75.45 + mmlu_pro_business_accuracy: 64.64 + mmlu_pro_chemistry_accuracy: 59.81 + mmlu_pro_computer_science_accuracy: 60.24 + mmlu_pro_economics_accuracy: 68.6 + mmlu_pro_engineering_accuracy: 44.79 + mmlu_pro_health_accuracy: 58.31 + mmlu_pro_history_accuracy: 49.87 + mmlu_pro_law_accuracy: 32.43 + mmlu_pro_math_accuracy: 70.17 + mmlu_pro_philosophy_accuracy: 46.89 + mmlu_pro_physics_accuracy: 59.58 + mmlu_pro_psychology_accuracy: 66.29 + mmlu_pro_other_accuracy: 54.33 + humanevalx-python_pass@1: 43.9 + humanevalx-cpp_pass@1: 20.12 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 74.39 + humanevalx-js_pass@1: 65.24 + ds1000_Pandas_accuracy: 16.49 + ds1000_Numpy_accuracy: 34.09 + ds1000_Tensorflow_accuracy: 26.67 + ds1000_Scipy_accuracy: 17.92 + ds1000_Sklearn_accuracy: 20.87 + ds1000_Pytorch_accuracy: 19.12 + ds1000_Matplotlib_accuracy: 55.48 + mmmlu_lite_accuracy: 41.82 + openai_mmmlu_lite_AR-XY_accuracy: 32.56 + openai_mmmlu_lite_BN-BD_accuracy: 4.56 + openai_mmmlu_lite_DE-DE_accuracy: 24.91 + openai_mmmlu_lite_ES-LA_accuracy: 51.09 + openai_mmmlu_lite_FR-FR_accuracy: 61.68 + openai_mmmlu_lite_HI-IN_accuracy: 24.98 + openai_mmmlu_lite_ID-ID_accuracy: 44.56 + openai_mmmlu_lite_IT-IT_accuracy: 52.35 + openai_mmmlu_lite_JA-JP_accuracy: 51.02 + openai_mmmlu_lite_KO-KR_accuracy: 47.93 + openai_mmmlu_lite_PT-BR_accuracy: 53.89 + openai_mmmlu_lite_SW-KE_accuracy: 33.47 + openai_mmmlu_lite_YO-NG_accuracy: 33.47 + openai_mmmlu_lite_ZH-CN_accuracy: 69.05 + college_naive_average: 45.67 + high_naive_average: 64.67 + middle_naive_average: 82.33 + primary_naive_average: 90.33 + arithmetic_naive_average: 74 + mathbench-a (average)_naive_average: 71.4 + college_knowledge_naive_average: 85.28 + high_knowledge_naive_average: 79.43 + middle_knowledge_naive_average: 
87.9 + primary_knowledge_naive_average: 93.42 + mathbench-t (average)_naive_average: 86.51 + + +internlm3-8b-instruct-pytorch: + objective: + race-high_accuracy: 89.02 + ARC-c_accuracy: 93.56 + BoolQ_accuracy: 86.67 + triviaqa_wiki_1shot_score: 60.54 + nq_open_1shot_score: 20.3 + mmmlu_lite_naive_average: 42.6 + IFEval_Prompt-level-strict-accuracy: 79.11 + drop_accuracy: 83.32 + bbh_naive_average: 54.76 + GPQA_diamond_accuracy: 42.42 + hellaswag_accuracy: 91.31 + TheoremQA_score: 18 + musr_average_naive_average: 36.62 + korbench_single_naive_average: 41.84 + ARC_Prize_Public_Evaluation_accuracy: 0.06 + gsm8k_accuracy: 90.67 + GaokaoBench_weighted_average: 86.27 + math_accuracy: 76.68 + cmo_fib_accuracy: 33.65 + aime2024_accuracy: 10 + Mathbench_naive_average: 78.92 + wikibench-wiki-single_choice_cncircular_perf_4: 37.35 + cmmlu_naive_average: 83.11 + mmlu_naive_average: 76.23 + mmlu_pro_naive_average: 58.16 + openai_humaneval_humaneval_pass@1: 82.32 + sanitized_mbpp_score: 70.04 + humanevalx_naive_average: 39.76 + ds1000_naive_average: 27.84 + lcb_code_generation_pass@1: 34.5 + lcb_code_execution_pass@1: 48.02 + lcb_test_output_pass@1: 47.74 + bigcodebench_hard_instruct_pass@1: 12.84 + bigcodebench_hard_complete_pass@1: 15.54 + teval_naive_average: 82.86 + SciCode_sub_accuracy: 100 + qa_dingo_cn_score: 100 + mmlu_accuracy: 76.23 + mmlu-stem_accuracy: 78.08 + mmlu-social-science_accuracy: 80.31 + mmlu-humanities_accuracy: 71.38 + mmlu-other_accuracy: 74.63 + cmmlu_accuracy: 83.11 + cmmlu-stem_accuracy: 79.42 + cmmlu-social-science_accuracy: 83.34 + cmmlu-humanities_accuracy: 83.95 + cmmlu-other_accuracy: 86.22 + cmmlu-china-specific_accuracy: 81.5 + mmlu_pro_accuracy: 58.16 + mmlu_pro_biology_accuracy: 74.62 + mmlu_pro_business_accuracy: 65.02 + mmlu_pro_chemistry_accuracy: 60.69 + mmlu_pro_computer_science_accuracy: 61.46 + mmlu_pro_economics_accuracy: 68.25 + mmlu_pro_engineering_accuracy: 45.3 + mmlu_pro_health_accuracy: 60.15 + mmlu_pro_history_accuracy: 50.66 + mmlu_pro_law_accuracy: 31.7 + mmlu_pro_math_accuracy: 70.32 + mmlu_pro_philosophy_accuracy: 47.7 + mmlu_pro_physics_accuracy: 59.51 + mmlu_pro_psychology_accuracy: 65.41 + mmlu_pro_other_accuracy: 53.46 + humanevalx-python_pass@1: 42.68 + humanevalx-cpp_pass@1: 19.51 + humanevalx-go_pass@1: 0 + humanevalx-java_pass@1: 72.56 + humanevalx-js_pass@1: 64.02 + ds1000_Pandas_accuracy: 14.09 + ds1000_Numpy_accuracy: 35 + ds1000_Tensorflow_accuracy: 24.44 + ds1000_Scipy_accuracy: 20.75 + ds1000_Sklearn_accuracy: 21.74 + ds1000_Pytorch_accuracy: 22.06 + ds1000_Matplotlib_accuracy: 56.77 + mmmlu_lite_accuracy: 42.6 + openai_mmmlu_lite_AR-XY_accuracy: 32.84 + openai_mmmlu_lite_BN-BD_accuracy: 10.46 + openai_mmmlu_lite_DE-DE_accuracy: 24.56 + openai_mmmlu_lite_ES-LA_accuracy: 50.95 + openai_mmmlu_lite_FR-FR_accuracy: 61.05 + openai_mmmlu_lite_HI-IN_accuracy: 30.6 + openai_mmmlu_lite_ID-ID_accuracy: 45.89 + openai_mmmlu_lite_IT-IT_accuracy: 51.79 + openai_mmmlu_lite_JA-JP_accuracy: 51.65 + openai_mmmlu_lite_KO-KR_accuracy: 48.77 + openai_mmmlu_lite_PT-BR_accuracy: 52.7 + openai_mmmlu_lite_SW-KE_accuracy: 32.91 + openai_mmmlu_lite_YO-NG_accuracy: 32.84 + openai_mmmlu_lite_ZH-CN_accuracy: 69.33 + college_naive_average: 47 + high_naive_average: 66.67 + middle_naive_average: 81.67 + primary_naive_average: 89.33 + arithmetic_naive_average: 73.67 + mathbench-a (average)_naive_average: 71.67 + college_knowledge_naive_average: 82.91 + high_knowledge_naive_average: 79.86 + middle_knowledge_naive_average: 88.92 + primary_knowledge_naive_average: 92.96 
+ mathbench-t (average)_naive_average: 86.16 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index 45f74131..16a13209 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -1,21 +1,24 @@ chat: glm-4-9b-chat-hf: - gsm8k_accuracy: 68.75 - race-high_accuracy: 90.62 + gsm8k_accuracy: 56.25 + race-high_accuracy: 84.38 glm-4-9b-chat-turbomind: gsm8k_accuracy: 71.88 race-high_accuracy: 90.62 glm-4-9b-chat-vllm: - gsm8k_accuracy: 71.88 + gsm8k_accuracy: 68.75 race-high_accuracy: 90.62 deepseek-7b-chat-hf: gsm8k_accuracy: 46.88 race-high_accuracy: 81.25 - deepseek-moe-16b-chat-hf: - gsm8k_accuracy: 50 - race-high_accuracy: 68.75 + deepseek-r1-distill-llama-8b-turbomind: + gsm8k_accuracy: 31.25 + race-high_accuracy: 81.25 + deepseek-r1-distill-qwen-1_5b-turbomind: + gsm8k_accuracy: 37.5 + race-high_accuracy: 53.12 deepseek-7b-chat-vllm: - gsm8k_accuracy: 50 + gsm8k_accuracy: 43.75 race-high_accuracy: 78.12 gemma2-2b-it-hf: gsm8k_accuracy: 50 @@ -36,34 +39,40 @@ chat: gsm8k_accuracy: 78.12 race-high_accuracy: 93.75 gemma-7b-it-vllm: - gsm8k_accuracy: 46.88 + gsm8k_accuracy: 31.25 race-high_accuracy: 68.75 internlm2_5-7b-chat-hf: gsm8k_accuracy: 84.38 race-high_accuracy: 90.62 + internlm3-8b-instruct-hf: + gsm8k_accuracy: 65.62 + race-high_accuracy: 87.5 internlm2_5-7b-chat-turbomind: - gsm8k_accuracy: 87.50 + gsm8k_accuracy: 84.38 race-high_accuracy: 90.62 internlm2-chat-1.8b-turbomind: gsm8k_accuracy: 28.12 race-high_accuracy: 84.38 internlm2-chat-1.8b-sft-turbomind: - gsm8k_accuracy: 21.88 + gsm8k_accuracy: 31.25 race-high_accuracy: 84.38 internlm2-chat-7b-lmdeploy: - gsm8k_accuracy: 53.12 + gsm8k_accuracy: 59.38 race-high_accuracy: 84.38 internlm2-chat-7b-sft-turbomind: - gsm8k_accuracy: 53.12 + gsm8k_accuracy: 56.25 race-high_accuracy: 90.62 + internlm3-8b-instruct-turbomind: + gsm8k_accuracy: 68.75 + race-high_accuracy: 87.5 internlm2-chat-7b-vllm: - gsm8k_accuracy: 43.75 - race-high_accuracy: 84.38 + gsm8k_accuracy: 59.38 + race-high_accuracy: 87.50 llama-3_1-8b-instruct-hf: gsm8k_accuracy: 84.38 race-high_accuracy: 90.62 llama-3_2-3b-instruct-hf: - gsm8k_accuracy: 68.75 + gsm8k_accuracy: 71.88 race-high_accuracy: 81.25 llama-3-8b-instruct-hf: gsm8k_accuracy: 68.75 @@ -72,14 +81,14 @@ chat: gsm8k_accuracy: 18.75 race-high_accuracy: 46.88 llama-3_1-8b-instruct-turbomind: - gsm8k_accuracy: 78.12 + gsm8k_accuracy: 81.25 race-high_accuracy: 90.62 llama-3_2-3b-instruct-turbomind: - gsm8k_accuracy: 65.62 + gsm8k_accuracy: 75.00 race-high_accuracy: 81.25 llama-3-8b-instruct-turbomind: - gsm8k_accuracy: 71.88 - race-high_accuracy: 87.5 + gsm8k_accuracy: 68.75 + race-high_accuracy: 84.38 mistral-7b-instruct-v0.2-hf: gsm8k_accuracy: 40.62 race-high_accuracy: 75 @@ -94,13 +103,10 @@ chat: race-high_accuracy: 78.12 mistral-7b-instruct-v0.1-vllm: gsm8k_accuracy: 34.38 - race-high_accuracy: 68.75 + race-high_accuracy: 65.62 mistral-7b-instruct-v0.2-vllm: - gsm8k_accuracy: 31.25 - race-high_accuracy: 75 - phi-3-mini-4k-instruct-hf: - gsm8k_accuracy: 81.25 - race-high_accuracy: 87.50 + gsm8k_accuracy: 21.88 + race-high_accuracy: 78.12 qwen2.5-0.5b-instruct-hf: gsm8k_accuracy: 34.38 race-high_accuracy: 46.88 @@ -108,10 +114,10 @@ chat: gsm8k_accuracy: 53.12 race-high_accuracy: 90.62 qwen2.5-0.5b-instruct-turbomind: - gsm8k_accuracy: 28.12 - race-high_accuracy: 50 + gsm8k_accuracy: 31.25 + race-high_accuracy: 43.75 qwen2.5-3b-instruct-turbomind: - gsm8k_accuracy: 59.38 + 
gsm8k_accuracy: 56.25 race-high_accuracy: 90.62 qwen1.5-0.5b-chat-hf: gsm8k_accuracy: 0 @@ -123,11 +129,11 @@ chat: gsm8k_accuracy: 68.75 race-high_accuracy: 90.62 qwen2-1.5b-instruct-turbomind: - gsm8k_accuracy: 53.12 + gsm8k_accuracy: 56.25 race-high_accuracy: 84.38 qwen2-7b-instruct-turbomind: gsm8k_accuracy: 81.25 - race-high_accuracy: 90.62 + race-high_accuracy: 87.50 qwen1.5-0.5b-chat-vllm: gsm8k_accuracy: 3.12 race-high_accuracy: 53.12 @@ -143,11 +149,11 @@ chat: yi-1.5-9b-chat-turbomind: gsm8k_accuracy: 71.88 race-high_accuracy: 93.75 - deepseek-v2-lite-chat-hf: - gsm8k_accuracy: 46.88 + deepseek-v2_lite-chat-turbomind: + gsm8k_accuracy: 37.5 race-high_accuracy: 71.88 gemma2-27b-it-hf: - gsm8k_accuracy: 75 + gsm8k_accuracy: 71.88 race-high_accuracy: 93.75 internlm2_5-20b-chat-hf: gsm8k_accuracy: 84.38 @@ -161,6 +167,9 @@ chat: mistral-small-instruct-2409-turbomind: gsm8k_accuracy: 81.25 race-high_accuracy: 87.50 + phi-4: + gsm8k_accuracy: 81.25 + race-high_accuracy: 87.50 qwen2.5-14b-instruct-hf: gsm8k_accuracy: 71.88 race-high_accuracy: 96.88 @@ -168,40 +177,41 @@ chat: gsm8k_accuracy: 68.75 race-high_accuracy: 93.75 yi-1.5-34b-chat-turbomind: - gsm8k_accuracy: 78.12 + gsm8k_accuracy: 75.00 race-high_accuracy: 93.75 - deepseek-67b-chat-hf: - gsm8k_accuracy: 71.88 + deepseek-67b-chat-turbomind: + gsm8k_accuracy: 75.00 race-high_accuracy: 78.12 + deepseek-r1-distill-qwen-32b-turbomind: + gsm8k_accuracy: 25 + race-high_accuracy: 90.62 llama-3_3-70b-instruct-turbomind: gsm8k_accuracy: 93.75 race-high_accuracy: 87.5 - mixtral-8x7b-instruct-v0.1-hf: - gsm8k_accuracy: 59.38 - race-high_accuracy: 81.25 mixtral-large-instruct-2411-turbomind: - gsm8k_accuracy: 90.62 + gsm8k_accuracy: 87.50 race-high_accuracy: 93.75 nvidia-3_1-Nemotron-70b-instruct-HF-turbomind: - gsm8k_accuracy: 87.5 - race-high_accuracy: 46.88 + gsm8k_accuracy: 93.75 + race-high_accuracy: 50.00 qwen2.5-72b-instruct-turbomind: - gsm8k_accuracy: 75 - race-high_accuracy: 93.75 + gsm8k_accuracy: 81.25 + race-high_accuracy: 90.62 + deepseek-r1-distill-llama-70b-turbomind: + gsm8k_accuracy: 40.62 + race-high_accuracy: 90.62 deepseek-v2_5-1210-turbomind: gsm8k_accuracy: 90.62 race-high_accuracy: 84.38 - mixtral-8x22b-instruct-v0.1-hf: - gsm8k_accuracy: 81.25 - race-high_accuracy: 81.25 + mixtral-8x22b-instruct-v0.1-turbomind: + gsm8k_accuracy: 75 + race-high_accuracy: 78.12 + mixtral-8x22b-instruct-v0.1-vllm: + gsm8k_accuracy: 78.12 + race-high_accuracy: 78.12 base: - glm-4-9b-hf: - gsm8k_accuracy: 68.75 - GPQA_diamond_accuracy: 31.25 - race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 glm-4-9b-turbomind: - gsm8k_accuracy: 62.5 + gsm8k_accuracy: 56.25 GPQA_diamond_accuracy: 28.12 race-high_accuracy: 93.75 winogrande_accuracy: 84.38 @@ -210,15 +220,10 @@ base: GPQA_diamond_accuracy: 0 race-high_accuracy: 46.88 winogrande_accuracy: 71.88 - deepseek-moe-16b-base-hf: - gsm8k_accuracy: 21.88 - GPQA_diamond_accuracy: 0 - race-high_accuracy: 21.88 - winogrande_accuracy: 65.62 deepseek-7b-base-turbomind: - gsm8k_accuracy: 21.88 + gsm8k_accuracy: 18.75 GPQA_diamond_accuracy: 0 - race-high_accuracy: 46.88 + race-high_accuracy: 43.75 winogrande_accuracy: 84.38 deepseek-moe-16b-base-vllm: gsm8k_accuracy: 21.88 @@ -245,16 +250,21 @@ base: GPQA_diamond_accuracy: 3.12 race-high_accuracy: 65.62 winogrande_accuracy: 71.88 + gemma-2-9b-turbomind: + gsm8k_accuracy: 68.75 + GPQA_diamond_accuracy: 0 + race-high_accuracy: 78.12 + winogrande_accuracy: 50 gemma-2b-vllm: gsm8k_accuracy: 15.62 GPQA_diamond_accuracy: 3.12 - race-high_accuracy: - 
winogrande_accuracy: + race-high_accuracy: 28.12 + winogrande_accuracy: 68.75 gemma-7b-vllm: - gsm8k_accuracy: 53.12 - GPQA_diamond_accuracy: 9.38 - race-high_accuracy: - winogrande_accuracy: + gsm8k_accuracy: 43.75 + GPQA_diamond_accuracy: 6.25 + race-high_accuracy: 81.25 + winogrande_accuracy: 81.25 internlm2_5-7b-hf: gsm8k_accuracy: 37.5 GPQA_diamond_accuracy: 25 @@ -265,30 +275,25 @@ base: GPQA_diamond_accuracy: 18.75 race-high_accuracy: 62.5 winogrande_accuracy: 78.12 - internlm2-base-7b-hf: - gsm8k_accuracy: 3.12 - GPQA_diamond_accuracy: 21.88 - race-high_accuracy: 75 - winogrande_accuracy: 65.62 internlm2-1.8b-turbomind: - gsm8k_accuracy: 12.5 - GPQA_diamond_accuracy: 9.38 + gsm8k_accuracy: 6.25 + GPQA_diamond_accuracy: 12.5 race-high_accuracy: 71.88 - winogrande_accuracy: 78.12 + winogrande_accuracy: 75 internlm2_5-7b-turbomind: - gsm8k_accuracy: 62.50 + gsm8k_accuracy: 59.38 GPQA_diamond_accuracy: 34.38 race-high_accuracy: 93.75 - winogrande_accuracy: 87.50 + winogrande_accuracy: 84.38 internlm2-7b-turbomind: - gsm8k_accuracy: 53.12 - GPQA_diamond_accuracy: 21.88 + gsm8k_accuracy: 50 + GPQA_diamond_accuracy: 18.75 race-high_accuracy: 71.88 winogrande_accuracy: 84.38 internlm2-base-7b-turbomind: gsm8k_accuracy: 37.50 - GPQA_diamond_accuracy: 28.12 - race-high_accuracy: 81.25 + GPQA_diamond_accuracy: 21.88 + race-high_accuracy: 84.38 winogrande_accuracy: 75 llama-2-7b-hf: gsm8k_accuracy: 21.88 @@ -311,7 +316,7 @@ base: race-high_accuracy: 78.12 winogrande_accuracy: 78.12 llama-3-8b-turbomind: - gsm8k_accuracy: 50 + gsm8k_accuracy: 46.88 GPQA_diamond_accuracy: 12.50 race-high_accuracy: 65.62 winogrande_accuracy: 78.12 @@ -327,14 +332,14 @@ base: winogrande_accuracy: 71.88 qwen2.5-1.5b-turbomind: gsm8k_accuracy: 62.50 - GPQA_diamond_accuracy: 12.50 - race-high_accuracy: 78.12 - winogrande_accuracy: 68.75 - qwen2.5-7b-turbomind: - gsm8k_accuracy: 75.00 - GPQA_diamond_accuracy: 25 - race-high_accuracy: 87.5 + GPQA_diamond_accuracy: 15.62 + race-high_accuracy: 75 winogrande_accuracy: 71.88 + qwen2.5-7b-turbomind: + gsm8k_accuracy: 71.88 + GPQA_diamond_accuracy: 18.75 + race-high_accuracy: 87.5 + winogrande_accuracy: 75.00 qwen1.5-moe-a2.7b-hf: gsm8k_accuracy: 62.5 GPQA_diamond_accuracy: 18.75 @@ -356,17 +361,17 @@ base: race-high_accuracy: 87.5 winogrande_accuracy: 68.75 qwen2-1.5b-turbomind: - gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 9.38 + gsm8k_accuracy: 59.38 + GPQA_diamond_accuracy: 12.50 race-high_accuracy: 81.25 winogrande_accuracy: 75 qwen2-7b-turbomind: - gsm8k_accuracy: 75.00 + gsm8k_accuracy: 65.62 GPQA_diamond_accuracy: 12.5 race-high_accuracy: 87.5 winogrande_accuracy: 71.88 qwen1.5-0.5b-vllm: - gsm8k_accuracy: 9.38 + gsm8k_accuracy: 6.25 GPQA_diamond_accuracy: 0 race-high_accuracy: 56.25 winogrande_accuracy: 62.5 @@ -382,27 +387,12 @@ base: winogrande_accuracy: 59.38 yi-1.5-9b-turbomind: gsm8k_accuracy: 78.12 - GPQA_diamond_accuracy: 40.62 + GPQA_diamond_accuracy: 43.75 race-high_accuracy: 87.5 winogrande_accuracy: 71.88 - deepseek-v2-lite-hf: - gsm8k_accuracy: 31.25 - GPQA_diamond_accuracy: 28.12 - race-high_accuracy: 59.38 - winogrande_accuracy: 71.88 - internlm2-20b-hf: - gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 15.62 - race-high_accuracy: 68.75 - winogrande_accuracy: 75 - internlm2-base-20b-hf: - gsm8k_accuracy: 12.5 - GPQA_diamond_accuracy: 9.38 - race-high_accuracy: 84.38 - winogrande_accuracy: 65.62 internlm2-20b-turbomind: - gsm8k_accuracy: 71.88 - GPQA_diamond_accuracy: 15.62 + gsm8k_accuracy: 75 + GPQA_diamond_accuracy: 18.75 race-high_accuracy: 68.75 
winogrande_accuracy: 81.25 qwen2.5-14b-hf: @@ -416,37 +406,27 @@ base: race-high_accuracy: 93.75 winogrande_accuracy: 78.12 qwen2.5-32b-turbomind: - gsm8k_accuracy: 84.38 - GPQA_diamond_accuracy: 28.12 + gsm8k_accuracy: 87.5 + GPQA_diamond_accuracy: 18.75 race-high_accuracy: 93.75 winogrande_accuracy: 81.25 - deepseek-67b-base-hf: - gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 31.25 - race-high_accuracy: 81.25 - winogrande_accuracy: 90.62 deepseek-67b-base-turbomind: - gsm8k_accuracy: 56.25 + gsm8k_accuracy: 53.12 GPQA_diamond_accuracy: 28.12 race-high_accuracy: 81.25 winogrande_accuracy: 84.38 llama-3-70b-turbomind: - gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 9.38 + gsm8k_accuracy: 56.25 + GPQA_diamond_accuracy: 12.50 race-high_accuracy: 93.75 winogrande_accuracy: 84.38 qwen2.5-72b-turbomind: gsm8k_accuracy: 84.38 - GPQA_diamond_accuracy: 34.38 + GPQA_diamond_accuracy: 31.25 race-high_accuracy: 93.75 winogrande_accuracy: 87.5 deepseek-v2-turbomind: - gsm8k_accuracy: 65.62 - GPQA_diamond_accuracy: 15.62 - race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 - llama-3-70b-hf: - gsm8k_accuracy: 62.5 + gsm8k_accuracy: 59.38 GPQA_diamond_accuracy: 3.12 race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 + winogrande_accuracy: 81.25 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index a5a930fa..6a1c2ebc 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -61,6 +61,7 @@ env: HUGGINGFACE_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub HF_HUB_CACHE: /fs-computility/llm/shared/llmeval/models/opencompass_hf_hub CONDA_ENV: regression_test + export VLLM_WORKER_MULTIPROC_METHOD: spawn jobs: build-pypi: @@ -92,7 +93,6 @@ jobs: matrix: pyver: [py310] runs-on: ubuntu-latest - environment: 'prod' env: PYTHON_VERSION: ${{ matrix.pyver }} PLAT_NAME: manylinux2014_x86_64 @@ -126,7 +126,6 @@ jobs: if: ${{!cancelled()}} needs: ['build-pypi', 'build-pypi-lmdeploy'] runs-on: volc_cu12 - environment: 'prod' timeout-minutes: 120 #2hours steps: - name: Clone repository @@ -190,7 +189,6 @@ jobs: matrix: regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}} runs-on: volc_cu12_daily - environment: 'prod' timeout-minutes: 180 #3hours steps: - name: Clone repository @@ -231,7 +229,6 @@ jobs: matrix: regression_func: ${{fromJSON(github.event.inputs.regression_func_local || '["cmd","api","chat_sub_fullbench"]')}} runs-on: volc_cu12_local - environment: 'prod' timeout-minutes: 480 #6hours steps: - name: Clone repository @@ -258,27 +255,33 @@ jobs: conda info --envs export from_tf=TRUE python tools/list_configs.py internlm2_5 mmlu - opencompass --models hf_internlm2_5_7b hf_internlm2_1_8b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details + opencompass --models hf_internlm2_5_7b --datasets race_ppl demo_gsm8k_chat_gen --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd1/*/summary regression_result_daily python -m pytest -m case1 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --models hf_internlm2_5_7b_chat hf_internlm2_chat_1_8b --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 
--dump-eval-details + opencompass --models hf_internlm2_5_7b_chat hf_internlm3_8b_instruct --datasets race_gen demo_gsm8k_chat_gen -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd2/*/summary regression_result_daily python -m pytest -m case2 -s -v --color=yes .github/scripts/oc_score_assert.py opencompass --datasets race_ppl demo_gsm8k_chat_gen --hf-type base --hf-path internlm/internlm2_5-7b --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd3/*/summary regression_result_daily python -m pytest -m case3 -s -v --color=yes .github/scripts/oc_score_assert.py - opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm2_5-7b-chat --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details + opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a lmdeploy --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4 --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd4/*/summary regression_result_daily python -m pytest -m case4 -s -v --color=yes .github/scripts/oc_score_assert.py + opencompass --datasets race_gen demo_gsm8k_chat_gen --hf-type chat --hf-path internlm/internlm3-8b-instruct -a vllm --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5 --reuse --max-num-workers 2 --dump-eval-details + rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/cmd5/*/summary regression_result_daily + python -m pytest -m case5 -s -v --color=yes .github/scripts/oc_score_assert.py - name: Run model test - api if: matrix.regression_func == 'api' run: | . ${{env.CONDA_PATH}}/bin/activate conda activate ${{env.CONDA_ENV}} conda info --envs - lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 & + lmdeploy serve api_server internlm/internlm3-8b-instruct --max-batch-size 256 --model-name internlm3 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 & echo "restful_pid=$!" 
>> "$GITHUB_ENV" sleep 180s + env | grep PROXY + env | grep proxy + unset HTTP_PROXY;unset HTTPS_PROXY;unset http_proxy;unset https_proxy; opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py @@ -307,7 +310,6 @@ jobs: matrix: function_type: ${{fromJSON(github.event.inputs.fullbench_eval || '["base_objective","chat_objective","chat_subjective","base_long_context","chat_long_context"]')}} runs-on: volc_cu12 - environment: 'prod' timeout-minutes: 480 #6hours steps: - name: Clone repository @@ -341,7 +343,6 @@ jobs: needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test] timeout-minutes: 5 runs-on: self-hosted - environment: 'prod' steps: - name: notify run: | diff --git a/opencompass/configs/models/mistral/lmdeploy_mixtral_8x22b_instruct_v0_1.py b/opencompass/configs/models/mistral/lmdeploy_mixtral_8x22b_instruct_v0_1.py new file mode 100644 index 00000000..1ffef256 --- /dev/null +++ b/opencompass/configs/models/mistral/lmdeploy_mixtral_8x22b_instruct_v0_1.py @@ -0,0 +1,22 @@ +from opencompass.models import TurboMindModelwithChatTemplate + +models = [ + dict( + type=TurboMindModelwithChatTemplate, + abbr='mixtral-8x22b-instruct-v0.1-turbomind', + path='mistralai/Mixtral-8x22B-Instruct-v0.1', + engine_config=dict( + session_len=32768, + max_batch_size=16, + tp=8, + cache_max_entry_count=0.7, + ), + gen_config=dict( + top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096 + ), + max_seq_len=32768, + max_out_len=4096, + batch_size=8, + run_cfg=dict(num_gpus=8), + ) +] diff --git a/opencompass/summarizers/subjective/common_summarizer.py b/opencompass/summarizers/subjective/common_summarizer.py index ccb8d139..de917f44 100644 --- a/opencompass/summarizers/subjective/common_summarizer.py +++ b/opencompass/summarizers/subjective/common_summarizer.py @@ -147,6 +147,5 @@ class CommonSummarizer(CompassArenaSummarizer): f.write(','.join(new_header) + '\n') for line in new_table: f.write(','.join(map(str, line)) + '\n') - print(t) print(output_file) return {'qa_bench_' + show_dataset_abbr:json_result} From 828fb745c911675dfe5bd30865ccfcf316086d22 Mon Sep 17 00:00:00 2001 From: shijinpjlab Date: Mon, 7 Apr 2025 17:21:15 +0800 Subject: [PATCH 52/58] [Dataset] Update dingo 1.5.0 (#2008) Co-authored-by: shiin --- examples/eval_dingo.py | 4 ++-- opencompass/datasets/dingo.py | 2 +- requirements/extra.txt | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/eval_dingo.py b/examples/eval_dingo.py index bc3ae82c..899eaa29 100644 --- a/examples/eval_dingo.py +++ b/examples/eval_dingo.py @@ -1,7 +1,7 @@ from mmengine.config import read_base with read_base(): - from .datasets.dingo.dingo_gen import datasets - from .models.hf_internlm.hf_internlm_7b import models + from opencompass.configs.datasets.dingo.dingo_gen import datasets + from opencompass.configs.models.hf_internlm.hf_internlm_7b import models work_dir = './outputs/eval_dingo' diff --git a/opencompass/datasets/dingo.py b/opencompass/datasets/dingo.py index ea23b221..42483e65 100644 --- a/opencompass/datasets/dingo.py +++ b/opencompass/datasets/dingo.py @@ -68,7 +68,7 @@ class DingoEvaluator(BaseEvaluator): json.dump(d, f, ensure_ascii=False) f.write('\n') input_data = { - 'eval_model': 'llm_base', + 'eval_group': 
'llm_base', 'input_path': file_name, 'output_path': './outputs/dingo/', 'save_data': True, diff --git a/requirements/extra.txt b/requirements/extra.txt index fa90a34c..f81d410f 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -4,7 +4,7 @@ alpaca-eval==0.6 antlr4-python3-runtime==4.11 cn2an # Dingo -dingo-python==1.1.2 +dingo-python==1.5.0 # Icl topk retriever faiss_gpu==1.7.2 # Humaneval, Humaneval X From b564e608b11c6e57ee6829ef781afce8e315b556 Mon Sep 17 00:00:00 2001 From: Jin Ye Date: Tue, 8 Apr 2025 12:44:48 +1000 Subject: [PATCH 53/58] [Dataset] Add MedXpertQA (#2002) * Add MedXpertQA * Add MedXpertQA * Add MedXpertQA * Fix lint --------- Co-authored-by: MaiziXiao --- .pre-commit-config-zh-cn.yaml | 2 +- .pre-commit-config.yaml | 2 +- dataset-index.yml | 6 + .../datasets/MedXpertQA/MedXpertQA_gen.py | 57 +++++ .../MedXpertQA/MedXpertQA_llmjudge_gen.py | 104 ++++++++ opencompass/datasets/MedXpertQA.py | 225 ++++++++++++++++++ opencompass/datasets/__init__.py | 1 + 7 files changed, 395 insertions(+), 2 deletions(-) create mode 100644 opencompass/configs/datasets/MedXpertQA/MedXpertQA_gen.py create mode 100644 opencompass/configs/datasets/MedXpertQA/MedXpertQA_llmjudge_gen.py create mode 100644 opencompass/datasets/MedXpertQA.py diff --git a/.pre-commit-config-zh-cn.yaml b/.pre-commit-config-zh-cn.yaml index 2e21c85d..20a7d30c 100644 --- a/.pre-commit-config-zh-cn.yaml +++ b/.pre-commit-config-zh-cn.yaml @@ -120,4 +120,4 @@ repos: # hooks: # - id: check-algo-readme # - id: check-copyright - # args: ["mmocr", "tests", "tools"] # these directories will be checked + # args: ["mmocr", "tests", "tools"] # these directories will be checked \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9f72ae42..b464115e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -120,4 +120,4 @@ repos: # hooks: # - id: check-algo-readme # - id: check-copyright - # args: ["mmocr", "tests", "tools"] # these directories will be checked + # args: ["mmocr", "tests", "tools"] # these directories will be checked \ No newline at end of file diff --git a/dataset-index.yml b/dataset-index.yml index de5e316e..5358a5de 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -116,6 +116,12 @@ paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138 configpath: opencompass/configs/datasets/MedBench/medbench_gen.py configpath_llmjudge: '' +- MedXpertQA: + name: MedXpertQA + category: Knowledge / Medicine + paper: https://arxiv.org/abs/2501.18362 + configpath: opencompass/configs/datasets/MedXpertQA/MedXpertQA_gen.py + configpath_llmjudge: opencompass/configs/datasets/MedXpertQA/MedXpertQA_llmjudge_gen.py - musr: name: MuSR category: Reasoning diff --git a/opencompass/configs/datasets/MedXpertQA/MedXpertQA_gen.py b/opencompass/configs/datasets/MedXpertQA/MedXpertQA_gen.py new file mode 100644 index 00000000..f7cf50d6 --- /dev/null +++ b/opencompass/configs/datasets/MedXpertQA/MedXpertQA_gen.py @@ -0,0 +1,57 @@ +from opencompass.datasets import MedXpertQADataset, MedXpertQAEvaluator +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever + +SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this? 
+ZERO_SHOT_PROMPT = 'Q: {question}\nA: Among {start} through {end}, the answer is' + +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'question', + 'options', + 'medical_task', + 'body_system', + 'question_type', + 'prompt_mode', + ], + output_column='label', +) + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT), + ], + round=[ + dict( + role='HUMAN', + prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict(type=MedXpertQAEvaluator), + pred_role='BOT', +) +medxpertqa_dataset = dict( + type=MedXpertQADataset, + abbr='medxpertqa', + path='TsinghuaC3I/MedXpertQA', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, +) + +medxpertqa_datasets = [medxpertqa_dataset] diff --git a/opencompass/configs/datasets/MedXpertQA/MedXpertQA_llmjudge_gen.py b/opencompass/configs/datasets/MedXpertQA/MedXpertQA_llmjudge_gen.py new file mode 100644 index 00000000..8858f16b --- /dev/null +++ b/opencompass/configs/datasets/MedXpertQA/MedXpertQA_llmjudge_gen.py @@ -0,0 +1,104 @@ +from opencompass.datasets import MedXpertQADataset, MedXpertQA_llmjudge_postprocess +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.evaluator import GenericLLMEvaluator + +SYSTEM_PROMPT = 'You are a helpful medical assistant.\n\n' # Where to put this? +ZERO_SHOT_PROMPT = 'Q: {question}\nA: Among {start} through {end}, the answer is' +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. 
+ + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + : Q: {question}\nA: Among {start} through {end}, the answer is\n\n\n + : \n{label}\n\n\n + : \n{prediction}\n\n\n + Judging the correctness of candidates' answers: +""".strip() + + +# Reader configuration +reader_cfg = dict( + input_columns=[ + 'question', + 'options', + 'medical_task', + 'body_system', + 'question_type', + 'prompt_mode', + ], + output_column='label', +) + +# Inference configuration +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict(role='SYSTEM', fallback_role='HUMAN', prompt=SYSTEM_PROMPT), + ], + round=[ + dict( + role='HUMAN', + prompt=ZERO_SHOT_PROMPT, # prompt mode: zero-shot + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +# Evaluation configuration +eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MedXpertQADataset, + path='TsinghuaC3I/MedXpertQA', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=MedXpertQA_llmjudge_postprocess), + ), +) +medxpertqa_dataset = dict( + type=MedXpertQADataset, + abbr='medxpertqa', + path='TsinghuaC3I/MedXpertQA', + prompt_mode='zero-shot', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, +) + +medxpertqa_datasets = [medxpertqa_dataset] diff --git a/opencompass/datasets/MedXpertQA.py b/opencompass/datasets/MedXpertQA.py new file mode 100644 index 00000000..f016297a --- /dev/null +++ b/opencompass/datasets/MedXpertQA.py @@ -0,0 +1,225 @@ +import re + +from datasets import Dataset, load_dataset + +from opencompass.openicl import BaseEvaluator +from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS +from opencompass.utils import get_logger + +from .base import BaseDataset + + +def _parse(item, prompt_mode): + item['start'] = chr(65) + item['end'] = chr(65 + len(item.get('options', [])) - 1) + item['prompt_mode'] = prompt_mode + return item + + +@LOAD_DATASET.register_module() +class MedXpertQADataset(BaseDataset): + + @staticmethod + def load(path: str, prompt_mode: str, **kwargs): + dataset = load_dataset(path, 'Text', split='test') + # dataset = load_dataset(path, 'Text', split='dev') + + if prompt_mode == 'zero-shot': + dataset = dataset.map(lambda item: _parse(item, prompt_mode)) + elif prompt_mode == 'few-shot': + pass # TODO: Implement few-shot prompt + + return dataset + + +class MedXpertQAEvaluator(BaseEvaluator): + + def score(self, predictions, references, test_set): + method = test_set['prompt_mode'][0] + + if len(predictions) != len(references): + return {'error': 'preds and refrs have different length'} + correct = 0 + count = 0 + details = [] + for idx, (i, j) in enumerate(zip(predictions, references)): + i = answer_cleansing(method, i, test_set['options'][idx], + 
test_set['label'][idx]) + detail = {'pred': i, 'answer': j, 'correct': False} + count += 1 + if i == j: + correct += 1 + detail['correct'] = True + details.append(detail) + result = {'accuracy': 100 * correct / count, 'details': details} + return result + + +@TEXT_POSTPROCESSORS.register_module() +def answer_cleansing( + method: str, + prediction: str, + options: list, + label: str, +) -> str: + + # Clean up unwanted phrases in the prediction + for unwanted_phrase in [ + 'I understand', + 'A through J', + 'A through E', + 'A through D', + ]: + prediction = prediction.replace(unwanted_phrase, '') + + options_num = len(options) + options = [chr(65 + i) for i in range(options_num)] + options_str = r'\b(' + '|'.join(options) + r')\b' + prediction = re.findall(options_str, prediction) + + if len(prediction) == 0: + prediction = [] + else: + # If there is a "label" and its length is 1, + # process prediction accordingly + if len(label) == 1: + if method == 'few-shot': + answer_flag = True if len(prediction) > 1 else False + # choose the first or last element based on the answer_flag + if answer_flag: + prediction = [prediction[0]] + else: + prediction = [prediction[-1]] + elif method == 'zero-shot': + # choose the first element in list + prediction = [prediction[0]] + else: + raise ValueError('Method is not properly defined ...') + + # Remove trailing period if it exists + if prediction[0] and prediction[0].endswith('.'): + prediction[0] = prediction[0][:-1] + + return prediction[0] + + +def _generic_llmjudge_postprocess(judgement: str): + match = re.search(r'(A|B)', judgement) + grade_letter = (match.group(0) if match else 'B' + ) # Default to "INCORRECT" if no match + return grade_letter + + +def MedXpertQA_llmjudge_postprocess( + output: dict, + output_path: str, + dataset: Dataset, +) -> dict: + # Get the original dataset + original_dataset = dataset.reader.dataset['test'] + + judged_answers = [] + original_responses = [] + references = [] + details = [] + + # Initialize statistics dictionaries + stats = {'medical_task': {}, 'body_system': {}, 'question_type': {}} + + total_correct = 0 + total_count = 0 + + # Process each sample + for k, v in output.items(): + idx = int(k) # Convert key to integer for indexing + original_responses.append(v['prediction']) + processed_judge = _generic_llmjudge_postprocess(v['prediction']) + + # Get category information from the dataset + sample = original_dataset[idx] + medical_task = sample.get('medical_task', 'unknown') + body_system = sample.get('body_system', 'unknown') + question_type = sample.get('question_type', 'unknown') + + # Initialize category stats if not exists + for level, key in [ + ('medical_task', medical_task), + ('body_system', body_system), + ('question_type', question_type), + ]: + if key not in stats[level]: + stats[level][key] = {'correct': 0, 'total': 0} + + # Record the judgment + if processed_judge is not None: + judged_answers.append(processed_judge) + try: + gold = v['gold'] + references.append(gold) + except KeyError: + get_logger().warning( + f'No gold answer for {k}, use empty string as reference!') + gold = '' + references.append('') + + # Check if the answer is correct (A means correct) + is_correct = processed_judge == 'A' + total_count += 1 + + if is_correct: + total_correct += 1 + # Update category stats + for level, key in [ + ('medical_task', medical_task), + ('body_system', body_system), + ('question_type', question_type), + ]: + stats[level][key]['correct'] += 1 + + # Update category totals + for level, key in [ + 
('medical_task', medical_task), + ('body_system', body_system), + ('question_type', question_type), + ]: + stats[level][key]['total'] += 1 + # Add to details + details.append({ + 'id': k, + 'question': sample['question'], + 'options': sample['options'], + 'origin_prompt': v['origin_prompt'], + 'llm_judge': processed_judge, + 'gold': gold, + 'is_correct': is_correct, + 'medical_task': medical_task, + 'body_system': body_system, + 'question_type': question_type, + }) + + # Calculate overall accuracy with two decimal places + overall_accuracy = (round( + (total_correct / total_count * 100), 2) if total_count > 0 else 0.00) + + # Initialize results dictionary + results = { + 'accuracy': overall_accuracy, + 'total_correct': total_correct, + 'total_count': total_count, + 'details': details, + } + + # Calculate accuracy for each category and flatten into results + for level in stats: + for key, value in stats[level].items(): + if value['total'] > 0: + # Calculate accuracy with two decimal places + accuracy = round((value['correct'] / value['total'] * 100), 2) + + # Create a flattened key for the category + flat_key = f'MedXpertQA-{key}' + + # Add to results + results[flat_key] = accuracy + + return results diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 45209054..3e2d0eef 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -92,6 +92,7 @@ from .math_intern import * # noqa: F401, F403 from .mathbench import * # noqa: F401, F403 from .mbpp import * # noqa: F401, F403 from .medbench import * # noqa: F401, F403 +from .MedXpertQA import * # noqa: F401, F403 from .mgsm import * # noqa: F401, F403 from .mmlu import * # noqa: F401, F403 from .mmlu_cf import * # noqa: F401, F403 From bb58cfc85dc481423d64e79ab52b7a85e8d2aa07 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Tue, 8 Apr 2025 11:58:14 +0800 Subject: [PATCH 54/58] [Feature] Add CascadeEvaluator (#1992) * [Feature] Add CascadeEvaluator * update * updat --- README.md | 1 + README_zh-CN.md | 3 +- docs/en/advanced_guides/llm_judge.md | 105 +++++- docs/zh_cn/advanced_guides/llm_judge.md | 102 +++++- examples/eval_cascade_evaluator.py | 127 ++++++++ opencompass/evaluator/__init__.py | 1 + opencompass/evaluator/cascade_evaluator.py | 302 ++++++++++++++++++ .../evaluator/generic_llm_evaluator.py | 57 +++- 8 files changed, 681 insertions(+), 17 deletions(-) create mode 100644 examples/eval_cascade_evaluator.py create mode 100644 opencompass/evaluator/cascade_evaluator.py diff --git a/README.md b/README.md index a17a1998..28073c8f 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through ## 🚀 What's New +- **\[2025.04.01\]** OpenCompass now supports `CascadeEvaluator`, a flexible evaluation mechanism that allows multiple evaluators to work in sequence. This enables creating customized evaluation pipelines for complex assessment scenarios. Check out the [documentation](docs/en/advanced_guides/llm_judge.md) for more details! 🔥🔥🔥 - **\[2025.03.11\]** We have supported evaluation for `SuperGPQA` which is a great benchmark for measuring LLM knowledge ability 🔥🔥🔥 - **\[2025.02.28\]** We have added a tutorial for `DeepSeek-R1` series model, please check [Evaluating Reasoning Model](docs/en/user_guides/deepseek_r1.md) for more details! 
🔥🔥🔥 - **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥 diff --git a/README_zh-CN.md b/README_zh-CN.md index 4406c7bc..f70eb41e 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -57,8 +57,9 @@ ## 🚀 最新进展 +- **\[2025.04.01\]** OpenCompass 现已支持 `CascadeEvaluator`,允许多个评估器按顺序工作,可以为更复杂的评估场景创建自定义评估流程,查看[文档](docs/zh_cn/advanced_guides/llm_judge.md)了解具体用法!🔥🔥🔥 - **\[2025.03.11\]** 现已支持 `SuperGPQA` 覆盖285 个研究生学科的知识能力评测,欢迎尝试!🔥🔥🔥 -- **\[2025.02.28\]** 我们为 `DeepSeek-R1` 系列模型添加了教程,请查看 [评估推理模型](docs/en/user_guides/deepseek_r1.md) 了解更多详情!🔥🔥🔥 +- **\[2025.02.28\]** 我们为 `DeepSeek-R1` 系列模型添加了教程,请查看 [评估推理模型](docs/zh_cn/user_guides/deepseek_r1.md) 了解更多详情!🔥🔥🔥 - **\[2025.02.15\]** 我们新增了两个实用的评测工具:用于LLM作为评判器的`GenericLLMEvaluator`和用于数学推理评估的`MATHEvaluator`。查看[LLM评判器](docs/zh_cn/advanced_guides/llm_judge.md)和[数学能力评测](docs/zh_cn/advanced_guides/general_math.md)文档了解更多详情!🔥🔥🔥 - **\[2025.01.16\]** 我们现已支持 [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) 模型,该模型在推理、知识类任务上取得同量级最优性能,欢迎尝试。 - **\[2024.12.17\]** 我们提供了12月CompassAcademic学术榜单评估脚本 [CompassAcademic](configs/eval_academic_leaderboard_202412.py),你可以通过简单地配置复现官方评测结果。 diff --git a/docs/en/advanced_guides/llm_judge.md b/docs/en/advanced_guides/llm_judge.md index 1d9e9760..f7e09d78 100644 --- a/docs/en/advanced_guides/llm_judge.md +++ b/docs/en/advanced_guides/llm_judge.md @@ -49,7 +49,7 @@ export OC_JUDGE_API_BASE=http://172.30.56.1:4000/v1 Note that by default, OpenCompass will use these three environment variables, but if you use configuration files to configure the evaluation service, these environment variables will not take effect. -### ### Using LLM for Evaluation via Configuration Files +### Using LLM for Evaluation via Configuration Files To set up an LLM judge evaluation, you'll need to configure three main components: @@ -264,6 +264,107 @@ Example evaluation output: } ``` +## CascadeEvaluator + +OpenCompass also provides a CascadeEvaluator that combines the strengths of rule-based evaluation and LLM-based evaluation. The cascade evaluator has two modes: + +1. **Cascade Mode (parallel=False)**: First evaluates all samples with a rule-based evaluator, then only sends samples that were deemed incorrect by the rule-based evaluation to an LLM judge for re-evaluation. This approach reduces reliance on LLM judgments while maintaining accuracy, thus lowering evaluation costs and time. + +2. **Parallel Mode (parallel=True)**: Evaluates all samples with both the rule-based evaluator and LLM judge, then considers a sample correct if either method marks it as correct. This approach can increase the leniency of evaluation but may result in higher costs since all samples require LLM evaluation. 
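As a rough illustration of the two modes described above, the following sketch models which samples reach the LLM judge and how the final per-sample verdict is formed. It is a minimal, self-contained example: `combine_verdicts`, `rule_verdicts`, and `llm_judge` are illustrative placeholder names and are not part of the OpenCompass `CascadeEvaluator` API.

```python
from typing import Callable, List


def combine_verdicts(
    rule_verdicts: List[bool],
    llm_judge: Callable[[int], bool],
    parallel: bool = False,
) -> List[bool]:
    """Combine rule-based and LLM judgments per sample (illustrative only)."""
    final = []
    for idx, rule_ok in enumerate(rule_verdicts):
        if parallel:
            # Parallel mode: every sample is also judged by the LLM, and it
            # counts as correct if either evaluator accepts it.
            llm_ok = llm_judge(idx)
            final.append(rule_ok or llm_ok)
        elif rule_ok:
            # Cascade mode: a rule-based pass is final, no LLM call is made.
            final.append(True)
        else:
            # Cascade mode: only rule-based failures reach the LLM judge.
            final.append(llm_judge(idx))
    return final


if __name__ == '__main__':
    rule_verdicts = [True, False, False]

    def llm_judge(idx: int) -> bool:
        # Toy judge that accepts sample 1 only.
        return idx == 1

    print(combine_verdicts(rule_verdicts, llm_judge, parallel=False))
    print(combine_verdicts(rule_verdicts, llm_judge, parallel=True))
    # Both print [True, True, False] under these toy inputs; the modes differ
    # in how many samples are sent to the LLM judge, not in how a verdict
    # is combined.
```

Under this sketch the practical difference between the modes is cost: cascade mode skips the LLM call for every sample the rule-based evaluator already accepted, while parallel mode sends all samples to the LLM judge.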
+ +### Configuring CascadeEvaluator + +Here's an example of how to configure the CascadeEvaluator: + +```python +# Define a rule-based evaluator +rule_evaluator = dict(type=MATHEvaluator) + +# Define an LLM judge evaluator +llm_judge_evaluator = dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=YourDataset, + path='path/to/your/dataset', + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), # Can use environment variables to configure the judge model +) + +# Configure cascade evaluator (cascade mode) +cascade_evaluator = dict( + type=CascadeEvaluator, + llm_evaluator=llm_judge_evaluator, + rule_evaluator=rule_evaluator, + parallel=False # Cascade mode +) + +# For parallel mode, set parallel=True +parallel_evaluator = dict( + type=CascadeEvaluator, + llm_evaluator=llm_judge_evaluator, + rule_evaluator=rule_evaluator, + parallel=True # Parallel mode +) + +# Use the cascade evaluator in your dataset evaluation config +eval_cfg = dict(evaluator=cascade_evaluator) +``` + +### Evaluation Results + +The cascade evaluator outputs detailed evaluation statistics including: + +- Accuracy of the rule-based evaluation +- Accuracy of the LLM evaluation (for samples that failed rule-based evaluation in cascade mode) +- Final combined accuracy + +Example output: + +```python +{ + 'accuracy': 85.0, # Final accuracy + 'cascade_stats': { + 'total_samples': 100, + 'rule_correct': 70, # Number of samples correct by rule evaluation + 'rule_accuracy': 70.0, # Accuracy of rule evaluation + 'llm_evaluated': 30, # Number of samples evaluated by LLM (failed samples in cascade mode) + 'llm_correct': 15, # Number of samples correct by LLM evaluation + 'llm_accuracy': 50.0, # Accuracy of LLM evaluation + 'final_correct': 85, # Total correct samples + 'final_accuracy': 85.0, # Final accuracy + 'parallel_mode': False, # Whether parallel mode was used + }, + 'details': [ + # Detailed evaluation results for each sample + ] +} +``` + +The cascade evaluator is particularly useful for: + +1. Scenarios that require balancing evaluation cost and accuracy +2. Cases where rule-based evaluators are available but might not be comprehensive +3. Evaluation tasks that need more nuanced judgment for edge cases + ## Complete Example -For a complete working example, refer to the `eval_llm_judge.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving using an LLM judge. +For a complete working example using GenericLLMEvaluator +, refer to the `eval_llm_judge.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving . + +For a complete working example using CascadeEvaluator, refer to the `eval_cascade_evaluator.py` file in the examples directory, which demonstrates how to evaluate mathematical problem-solving . diff --git a/docs/zh_cn/advanced_guides/llm_judge.md b/docs/zh_cn/advanced_guides/llm_judge.md index bc49696e..3cf9619b 100644 --- a/docs/zh_cn/advanced_guides/llm_judge.md +++ b/docs/zh_cn/advanced_guides/llm_judge.md @@ -263,6 +263,106 @@ GenericLLMEvaluator专为使用LLM作为评判器评估模型输出而设计。 } ``` +## 级联评估器 (CascadeEvaluator) + +OpenCompass还提供了级联评估器`CascadeEvaluator`,它结合了规则式评估和LLM评估的优势。级联评估器有两种模式: + +1. 
**级联模式(Cascade Mode, parallel=False)**:首先使用规则式评估器评估所有样本,然后只将规则式评估认为不正确的样本发送给LLM评判器进行重新评估。这种方式可以在保持准确性的同时减少对LLM评判的依赖,从而降低评估成本和时间。 + +2. **并行模式(Parallel Mode, parallel=True)**:使用规则式评估器和LLM评判器同时评估所有样本,如果任何一个评估器认为样本是正确的,则将该样本视为正确。这种方式可以提高评估的宽容度,但可能会导致更高的成本,因为所有样本都需要LLM评估。 + +### 配置CascadeEvaluator + +以下是配置`CascadeEvaluator`的示例: + +```python +# 定义规则式评估器 +rule_evaluator = dict(type=MATHEvaluator) + +# 定义LLM评判器 +llm_judge_evaluator = dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="你是一个负责评估模型输出正确性和质量的助手。", + ) + ], + round=[ + dict(role='HUMAN', prompt=YOUR_JUDGE_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=YourDataset, + path='path/to/your/dataset', + reader_cfg=reader_cfg, + ), + judge_cfg=dict(), # 可以使用环境变量配置评判模型 +) + +# 配置级联评估器(级联模式) +cascade_evaluator = dict( + type=CascadeEvaluator, + llm_evaluator=llm_judge_evaluator, + rule_evaluator=rule_evaluator, + parallel=False # 级联模式 +) + +# 如果需要并行模式,可以设置parallel=True +parallel_evaluator = dict( + type=CascadeEvaluator, + llm_evaluator=llm_judge_evaluator, + rule_evaluator=rule_evaluator, + parallel=True # 并行模式 +) + +# 在数据集评估配置中使用级联评估器 +eval_cfg = dict(evaluator=cascade_evaluator) +``` + +### 评估结果 + +级联评估器会输出详细的评估统计信息,包括: + +- 规则评估的准确率 +- LLM评估的准确率(针对规则评估失败的样本) +- 最终的综合准确率 + +输出示例: + +```python +{ + 'accuracy': 85.0, # 最终准确率 + 'cascade_stats': { + 'total_samples': 100, + 'rule_correct': 70, # 规则评估认为正确的样本数 + 'rule_accuracy': 70.0, # 规则评估的准确率 + 'llm_evaluated': 30, # LLM评估的样本数(级联模式下为规则评估失败的样本数) + 'llm_correct': 15, # LLM评估认为正确的样本数 + 'llm_accuracy': 50.0, # LLM评估的准确率 + 'final_correct': 85, # 最终正确的样本数 + 'final_accuracy': 85.0, # 最终准确率 + 'parallel_mode': False, # 是否是并行模式 + }, + 'details': [ + # 每个样本的详细评估结果 + ] +} +``` + +级联评估器特别适用于: + +1. 需要平衡评估成本和准确性的场景 +2. 有可用的规则式评估器但可能不够完善的情况 +3. 
需要对边界情况进行更精确判断的评估任务 + ## 完整示例 -有关完整的工作示例,请参考examples目录中的`eval_llm_judge.py`文件,该文件演示了如何使用LLM评判器评估数学问题解决能力。 +如果希望了解通用LLM评判器,请参考examples目录中的`eval_llm_judge.py`文件,该示例展示了如何使用LLM评判器评估数学问题。 + +如果希望了解级联评估器请参考examples目录中的`eval_cascade_evaluator.py`文件,该示例展示了如何使用级联评估器评估数学问题。 diff --git a/examples/eval_cascade_evaluator.py b/examples/eval_cascade_evaluator.py new file mode 100644 index 00000000..1c1b0980 --- /dev/null +++ b/examples/eval_cascade_evaluator.py @@ -0,0 +1,127 @@ + +from mmengine.config import read_base + +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator, CascadeEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.openicl.icl_evaluator import MATHEvaluator +from opencompass.datasets import ( + MATHDataset, + math_postprocess_v2, + normalize_final_answer, +) +####################################################################### +# PART 0 Essential Configs # +####################################################################### + +with read_base(): + # Datasets, Summarizer + from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import ( + models as lmdeploy_qwen2_5_7b_instruct_model, + ) + +reader_cfg = dict(input_columns=['problem'], output_column='solution') + +infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role='HUMAN', + prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.', + ), + ] + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +########################## Evaluator ################################# +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. 
And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +llm_judge_evaluator = dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.", + ) + ], + round=[ + dict(role='HUMAN', prompt=GRADER_TEMPLATE), + ], + ), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + file_name='test_prm800k_500.json', + ), + judge_cfg=dict(), + ) + +rule_evaluator =dict(type=MATHEvaluator) +cascade_evaluator = dict(type=CascadeEvaluator, + llm_evaluator=llm_judge_evaluator, + rule_evaluator=rule_evaluator, + parallel=False + ) +########################## ################################# +eval_cfg = dict() + +# eval_cfg['evaluator'] = rule_evaluator +# eval_cfg['evaluator'] = llm_judge_evaluator +eval_cfg['evaluator'] = cascade_evaluator + +math_datasets = [ + dict( + abbr='math_prm800k_500', + type=MATHDataset, + path='opencompass/math', + file_name='test_prm800k_500.json', + reader_cfg=reader_cfg, + infer_cfg=infer_cfg, + eval_cfg=eval_cfg, + ) +] + + +datasets = math_datasets +models = lmdeploy_qwen2_5_7b_instruct_model + + +work_dir = 'math_prm800k_500_cascade_evaluator' \ No newline at end of file diff --git a/opencompass/evaluator/__init__.py b/opencompass/evaluator/__init__.py index a2653010..77b89f29 100644 --- a/opencompass/evaluator/__init__.py +++ b/opencompass/evaluator/__init__.py @@ -1 +1,2 @@ +from .cascade_evaluator import CascadeEvaluator # noqa from .generic_llm_evaluator import GenericLLMEvaluator # noqa diff --git a/opencompass/evaluator/cascade_evaluator.py b/opencompass/evaluator/cascade_evaluator.py new file mode 100644 index 00000000..e26b3d86 --- /dev/null +++ b/opencompass/evaluator/cascade_evaluator.py @@ -0,0 +1,302 @@ +import os +from typing import Any, Callable, Dict, List, Optional + +import mmengine +from datasets import Dataset + +from opencompass.openicl.icl_evaluator import BaseEvaluator +from opencompass.registry import ICL_EVALUATORS +from opencompass.utils.logging import get_logger + + +@ICL_EVALUATORS.register_module() +class CascadeEvaluator(BaseEvaluator): + """Cascade Evaluator. + + First uses a rule-based method to judge predictions. + If a sample is marked as incorrect by the rule-based method, + then it uses an LLM judge to re-evaluate it. + + Arguments: + llm_evaluator (dict): Configuration for the LLM evaluator. + rule_evaluator (Optional[dict]): Configuration for the + rule-based evaluator. + sample_score_fn (Optional[Callable]): A function to + score individual samples. If provided without rule_evaluator, + this function will be used directly. 
+ parallel (bool): Whether to run in parallel mode. + """ + + def __init__( + self, + llm_evaluator: Dict, + rule_evaluator: Optional[Dict] = None, + sample_score_fn: Optional[Callable] = None, + parallel: bool = True, + ) -> None: + self.logger = get_logger() + + # Initialize the LLM evaluator + llm_evaluator_type = llm_evaluator.pop('type') + if isinstance(llm_evaluator_type, str): + llm_evaluator_type = ICL_EVALUATORS.get(llm_evaluator_type) + self.llm_evaluator = llm_evaluator_type(**llm_evaluator) + + # Initialize the rule evaluator if provided + self.rule_evaluator = None + if rule_evaluator: + rule_evaluator_type = rule_evaluator.pop('type') + if isinstance(rule_evaluator_type, str): + rule_evaluator_type = ICL_EVALUATORS.get(rule_evaluator_type) + self.rule_evaluator = rule_evaluator_type(**rule_evaluator) + + self.sample_score_fn = sample_score_fn + self.parallel = parallel + + # At least one of rule_evaluator or sample_score_fn must be provided + if not self.rule_evaluator and not self.sample_score_fn: + raise ValueError( + 'Either rule_evaluator or sample_score_fn must be provided') + + def sample_score(self, prediction: str, reference: str) -> Dict[str, Any]: + """Score a single sample using sample_score_fn or rule_evaluator. + + Args: + prediction: The model's prediction. + reference: The ground truth. + + Returns: + Dict: A dictionary containing the score and other details. + """ + if self.sample_score_fn: + # Use user-provided function to evaluate a single sample + result = self.sample_score_fn(prediction, reference) + if not isinstance(result, dict): + # Ensure result is a dictionary with at least 'correct' field + result = { + 'correct': bool(result), + 'pred': prediction, + 'answer': reference, + } + return result + else: + # Use rule_evaluator to evaluate a single sample by calling + # the score method with single-element lists + result = self.rule_evaluator.score([prediction], [reference]) + if 'details' in result and len(result['details']) > 0: + return result['details'][0] + else: + # Fallback if rule_evaluator doesn't provide detailed results + return { + 'correct': result.get('accuracy', 0) > 0, + 'pred': prediction, + 'answer': reference, + } + + def _get_llm_correctness(self, llm_detail): + """Determine if the LLM judge considers the answer correct. + + Args: + llm_detail: The evaluation details from the LLM judge. + + Returns: + bool: Whether the answer is correct according to the LLM judge. + """ + if 'prediction' in llm_detail: + response = llm_detail['prediction'].strip().upper() + return response == 'A' or response.startswith('CORRECT') + elif 'correct' in llm_detail: + return llm_detail['correct'] + elif 'score' in llm_detail: + return llm_detail['score'] > 0.5 + return False + + def score( + self, + predictions: List[str], + references: List[str], + test_set: Optional[Dataset] = None, + ) -> Dict[str, Any]: + """Score predictions using cascade or parallel evaluation. + + Args: + predictions: List of model predictions. + references: List of ground truths. + test_set: Huggingface Dataset containing original test samples. + + Returns: + Dict: A dictionary containing the scores and details. 
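+
+        Example:
+            A minimal illustrative call; ``llm_judge_cfg`` and ``rule_cfg``
+            stand for evaluator config dicts like those documented in
+            docs/en/advanced_guides/llm_judge.md:
+
+            >>> evaluator = CascadeEvaluator(llm_evaluator=llm_judge_cfg,
+            ...                              rule_evaluator=rule_cfg,
+            ...                              parallel=False)
+            >>> result = evaluator.score(predictions, references, test_set)
+            >>> result['cascade_stats']['final_accuracy']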
+ """ + self.logger.info( + f"Running {'parallel' if self.parallel else 'cascade'} evaluation") + + # Step 1: Evaluate each sample individually using rule-based evaluation + details = [] + failed_predictions = [] + failed_references = [] + failed_indices = [] + + for i, (pred, ref) in enumerate(zip(predictions, references)): + result = self.sample_score(pred, ref) + result['evaluation_method'] = 'rule' + details.append({'rule_evaluation': result}) + + # If the sample failed rule-based evaluation or in parallel + # mode, mark it for LLM evaluation + if not result.get('correct', False) or self.parallel: + failed_predictions.append(pred) + failed_references.append(ref) + failed_indices.append(i) + + # Calculate initial accuracy based on rule evaluation + initial_correct = sum( + 1 for detail in details + if detail['rule_evaluation'].get('correct', False)) + initial_accuracy = (100 * initial_correct / + len(predictions) if predictions else 0) + + self.logger.info( + f'Rule-based evaluation: {initial_correct}/{len(predictions)} ' + f'correct ({initial_accuracy:.2f}%)') + + eval_mode = ('parallel (all samples)' + if self.parallel else 'cascade (only failed samples)') + self.logger.info(f'Samples requiring LLM evaluation ({eval_mode}): ' + f'{len(failed_indices)}') + + # Step 2: If there are samples for LLM evaluation + if failed_predictions and test_set is not None: + self.logger.info(f'Running LLM evaluation in {eval_mode} mode...') + + # Create a subset of the test_set for LLM evaluation + failed_subset = test_set.select(failed_indices) + + # Add prediction and reference columns to the dataset + failed_subset = failed_subset.add_column('prediction', + failed_predictions) + failed_subset = failed_subset.add_column('reference', + failed_references) + + # Set a custom output path for LLM evaluation + original_out_dir = getattr(self.llm_evaluator, '_out_dir', None) + self.llm_evaluator._out_dir = f'{self._out_dir}_llm_judge' + + # Check if results already exist to avoid re-evaluation + llm_results_path = f'{self.llm_evaluator._out_dir}.json' + if os.path.exists(llm_results_path): + self.logger.info( + f'Loading existing LLM evaluation results from ' + f'{llm_results_path}') + llm_results = mmengine.load(llm_results_path) + + # Extract details from loaded results + if llm_results.get('details', []): + loaded_details = llm_results['details'] + else: + loaded_details = llm_results + + # Strictly verify that the loaded results match + # the current evaluation needs + if len(loaded_details) != len(failed_indices): + error_msg = ( + f'Error: Loaded LLM results contain ' + f'{len(loaded_details)} samples, but current ' + f'evaluation requires {len(failed_indices)} samples. ' + f"The cached results at {llm_results_path} don't match" + f'the current evaluation needs. 
' + f'Please remove the cache file or fix the mismatch.') + self.logger.error(error_msg) + raise ValueError(error_msg) + + else: + # Use GenericLLMEvaluator to evaluate samples + # unset dataset_cfg for GenericLLMEvaluator to + # directly use test_set + self.llm_evaluator.dataset_cfg = None + llm_results = self.llm_evaluator.score( + predictions=failed_predictions, + references=failed_references, + test_set=failed_subset, + ) + + # Restore original output directory + if original_out_dir: + self.llm_evaluator._out_dir = original_out_dir + + if llm_results.get('details', []): + llm_details = llm_results['details'] + else: + llm_details = llm_results + + # Initialize counters for accuracy calculation + final_correct = initial_correct if not self.parallel else 0 + llm_correct = 0 + llm_evaluated = 0 + + # Update the details for samples that were evaluated by LLM + for i, llm_detail in enumerate(llm_details.values()): + original_index = failed_indices[i] + # Store original rule-based evaluation result + rule_result = details[original_index].copy() + rule_correct = rule_result['rule_evaluation'].get( + 'correct', False) + + # Add LLM evaluation details + details[original_index]['llm_evaluation'] = llm_detail + + # Determine LLM correctness judgment and store it + is_correct = self._get_llm_correctness(llm_detail) + details[original_index]['llm_evaluation'][ + 'llm_correct'] = is_correct + + # Count LLM evaluation statistics + llm_evaluated += 1 + if is_correct: + llm_correct += 1 + + # Update final_correct counter based on evaluation mode + if self.parallel: + # In parallel mode, either rule-based or LLM evaluations + # should be correct + if rule_correct or is_correct: + final_correct += 1 + else: + # In cascade mode, if rule was incorrect but LLM + # correct, increment + # (rule correct samples are already counted + # in initial_correct) + if not rule_correct and is_correct: + final_correct += 1 + + # Calculate final accuracy + final_accuracy = (100 * final_correct / + len(predictions) if predictions else 0) + llm_accuracy = (100 * llm_correct / + llm_evaluated if llm_evaluated else 0) + + self.logger.info( + f'Final evaluation: {final_correct}/{len(predictions)}' + f'correct ({final_accuracy:.2f}%)') + + if llm_evaluated > 0: + self.logger.info( + f'LLM evaluation: {llm_correct}/{llm_evaluated} ' + f'correct ({llm_accuracy:.2f}%)') + + result = { + 'accuracy': final_accuracy, + 'cascade_stats': { + 'total_samples': len(predictions), + 'rule_correct': initial_correct, + 'rule_accuracy': initial_accuracy, + 'llm_evaluated': llm_evaluated, + 'llm_correct': llm_correct, + 'llm_accuracy': llm_accuracy, + 'final_correct': final_correct, + 'final_accuracy': final_accuracy, + 'parallel_mode': self.parallel, + }, + 'details': details, + } + + return result diff --git a/opencompass/evaluator/generic_llm_evaluator.py b/opencompass/evaluator/generic_llm_evaluator.py index 2b829ba1..c205ec4b 100644 --- a/opencompass/evaluator/generic_llm_evaluator.py +++ b/opencompass/evaluator/generic_llm_evaluator.py @@ -3,6 +3,7 @@ import os.path as osp from typing import Dict, List, Optional import mmengine +from datasets import Dataset from mmengine.config import ConfigDict from opencompass.openicl.icl_evaluator import BaseEvaluator @@ -82,10 +83,19 @@ class GenericLLMEvaluator(BaseEvaluator): self, predictions, references: Optional[List] = None, + test_set: Optional[Dataset] = None, ) -> Dict: - """Apply to single-model scoring.""" + """Apply to single-model scoring. 
+ + Args: + predictions: List of model predictions + references: List of reference answers + test_set: Optional Dataset containing additional + context for evaluation + """ assert len(predictions) == len( references), 'predictions and references must have the same length' + # -------------- Build Inferencer ---------------- self.build_inferencer() @@ -93,9 +103,7 @@ class GenericLLMEvaluator(BaseEvaluator): predictions = self.pred_postprocess(predictions) # For Single Round Dialogue - prediction_dict = {} - prediction_dict['prediction'] = predictions - prediction_dict['obj_gold'] = references + prediction_dict = {'prediction': predictions, 'obj_gold': references} # ---------------- Build Dataset for LLM Judge ----------------- if self.dataset_cfg: @@ -109,19 +117,42 @@ class GenericLLMEvaluator(BaseEvaluator): dataset.reader.dataset['test'] = dataset.test.add_column( 'reference', references) else: - # build a default dataset just for comparison + # Handle test_set in the else branch from opencompass.datasets.lmeval import LMEvalDataset - input_columns = list(prediction_dict.keys()) - if references: - input_columns.append('reference') + if test_set is not None: + # If test_set is provided, use it as the base + # Ensure necessary columns exist + if 'prediction' not in test_set.column_names: + test_set = test_set.add_column('prediction', predictions) + if 'reference' not in test_set.column_names: + test_set = test_set.add_column('reference', references) + + # Prepare input_columns and data dictionary + input_columns = test_set.column_names + data_dict = { + column: test_set[column] + for column in test_set.column_names + } + else: + # Original default dataset building logic + input_columns = list(prediction_dict.keys()) + if references: + input_columns.append('reference') + data_dict = prediction_dict.copy() + if references: + data_dict['reference'] = references + + # Create LMEvalDataset dataset = LMEvalDataset( - reader_cfg=dict(input_columns=input_columns, - output_column=None, - train_split='test'), - reference=references, - **prediction_dict, + reader_cfg=dict( + input_columns=input_columns, + output_column=None, + train_split='test', + ), + **data_dict, ) + dataset.reader.output_column = 'reference' retriever = ZeroRetriever(dataset) # ----------------- LLM Judge ---------------- From fd82bea7471e9b6ec921dd6cda8965c43090bc55 Mon Sep 17 00:00:00 2001 From: Myhs_phz Date: Tue, 8 Apr 2025 14:38:35 +0800 Subject: [PATCH 55/58] [Fix] OpenICL Math Evaluator Config (#2007) * fix * fix recommended * fix * fix * fix * fix --- dataset-index.yml | 4 +- docs/en/statis.py | 2 +- docs/zh_cn/statis.py | 2 +- opencompass/configs/datasets/math/math_gen.py | 2 +- .../configs/datasets/math/math_gen_a58d9d.py | 38 +++++++++ .../configs/datasets/math/math_llm_judge.py | 35 -------- .../datasets/math/math_llm_judge_gen.py | 4 + .../math/math_llm_judge_gen_56606f.py | 85 +++++++++++++++++++ .../openicl/icl_evaluator/math_evaluator.py | 16 +--- 9 files changed, 135 insertions(+), 53 deletions(-) create mode 100644 opencompass/configs/datasets/math/math_gen_a58d9d.py delete mode 100644 opencompass/configs/datasets/math/math_llm_judge.py create mode 100644 opencompass/configs/datasets/math/math_llm_judge_gen.py create mode 100644 opencompass/configs/datasets/math/math_llm_judge_gen_56606f.py diff --git a/dataset-index.yml b/dataset-index.yml index 5358a5de..f1581c21 100644 --- a/dataset-index.yml +++ b/dataset-index.yml @@ -615,8 +615,8 @@ name: MATH category: Math paper: https://arxiv.org/pdf/2103.03874 - 
configpath: opencompass/configs/datasets/math - configpath_llmjudge: '' + configpath: opencompass/configs/datasets/math/math_gen.py + configpath_llmjudge: opencompass/configs/datasets/math/math_llm_judge_gen.py - math500: name: MATH500 category: Math diff --git a/docs/en/statis.py b/docs/en/statis.py index daabe818..d751fdae 100755 --- a/docs/en/statis.py +++ b/docs/en/statis.py @@ -35,7 +35,7 @@ HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge'] recommanded_dataset_list = [ 'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa', 'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu', - 'mmlu_pro', 'musr' + 'mmlu_pro', 'musr', 'math500' ] diff --git a/docs/zh_cn/statis.py b/docs/zh_cn/statis.py index 04134cf6..25dc24f1 100755 --- a/docs/zh_cn/statis.py +++ b/docs/zh_cn/statis.py @@ -33,7 +33,7 @@ HEADER = ['name', 'category', 'paper', 'configpath', 'configpath_llmjudge'] recommanded_dataset_list = [ 'ifeval', 'aime2024', 'bbh', 'bigcodebench', 'cmmlu', 'drop', 'gpqa', 'hellaswag', 'humaneval', 'korbench', 'livecodebench', 'math', 'mmlu', - 'mmlu_pro', 'musr' + 'mmlu_pro', 'musr', 'math500' ] diff --git a/opencompass/configs/datasets/math/math_gen.py b/opencompass/configs/datasets/math/math_gen.py index f3e23068..b97669c2 100644 --- a/opencompass/configs/datasets/math/math_gen.py +++ b/opencompass/configs/datasets/math/math_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .math_gen_265cce import math_datasets # noqa: F401, F403 + from .math_gen_a58d9d import math_datasets # noqa: F401, F403 diff --git a/opencompass/configs/datasets/math/math_gen_a58d9d.py b/opencompass/configs/datasets/math/math_gen_a58d9d.py new file mode 100644 index 00000000..bf01e9bc --- /dev/null +++ b/opencompass/configs/datasets/math/math_gen_a58d9d.py @@ -0,0 +1,38 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset +from opencompass.openicl.icl_evaluator import MATHEvaluator + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt='Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:'), + dict(role='BOT', prompt='The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:'), + dict(role='BOT', prompt='We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nTerrell usually lifts two 20-pound weights 12 times. 
If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:'), + dict(role='BOT', prompt='If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:'), + dict(role='BOT', prompt='If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n'), + dict(role='HUMAN', prompt='Problem:\n{problem}\nSolution:\n'), + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator) +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/opencompass/configs/datasets/math/math_llm_judge.py b/opencompass/configs/datasets/math/math_llm_judge.py deleted file mode 100644 index 6a81bea2..00000000 --- a/opencompass/configs/datasets/math/math_llm_judge.py +++ /dev/null @@ -1,35 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess - -QUERY_TEMPLATE = """ -Solve the following math problem step by step. The last line of your response should be of the form ANSWER: $ANSWER (without quotes) where $ANSWER is the answer to the problem. -{problem} -Remember to put your answer on its own line after "ANSWER:", and you do not need to use a \\boxed command. 
-""".strip() - -math_reader_cfg = dict(input_columns=['problem'], output_column='solution') - -math_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - - template=dict(round=[ - dict(role='HUMAN', prompt=QUERY_TEMPLATE), - ])), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=1024)) - -math_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator), pred_postprocessor=dict(type=math_postprocess)) - -math_datasets = [ - dict( - type=MATHDataset, - abbr='math', - path='opencompass/math', - reader_cfg=math_reader_cfg, - infer_cfg=math_infer_cfg, - eval_cfg=math_eval_cfg) -] diff --git a/opencompass/configs/datasets/math/math_llm_judge_gen.py b/opencompass/configs/datasets/math/math_llm_judge_gen.py new file mode 100644 index 00000000..fff23ef6 --- /dev/null +++ b/opencompass/configs/datasets/math/math_llm_judge_gen.py @@ -0,0 +1,4 @@ +from mmengine.config import read_base + +with read_base(): + from .math_llm_judge_gen_56606f import math_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/opencompass/configs/datasets/math/math_llm_judge_gen_56606f.py b/opencompass/configs/datasets/math/math_llm_judge_gen_56606f.py new file mode 100644 index 00000000..f655fa49 --- /dev/null +++ b/opencompass/configs/datasets/math/math_llm_judge_gen_56606f.py @@ -0,0 +1,85 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.evaluator import GenericLLMEvaluator +from opencompass.datasets import generic_llmjudge_postprocess +from opencompass.datasets import MATHDataset + + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict(role='HUMAN', prompt="Question: {problem}\nLet's think step by step\nAnswer:") + ])), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer) +) + +GRADER_TEMPLATE = """ + Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly. + + Here are some evaluation criteria: + 1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct. + 2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question. + 3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct. + 4. 
Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct. + 5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer. + + Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of: + A: CORRECT + B: INCORRECT + Just return the letters "A" or "B", with no text around it. + + Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer. + + + : \n{problem}\n\n\n + : \n{solution}\n\n\n + : \n{prediction}\n\n\n + + Judging the correctness of candidates' answers: +""".strip() + +# Evaluation configuration +math_eval_cfg = dict( + evaluator=dict( + type=GenericLLMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict( + begin=[ + dict( + role='SYSTEM', + fallback_role='HUMAN', + prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.") + ], + round=[ + dict( + role='HUMAN', + prompt = GRADER_TEMPLATE + ), + ]), + ), + dataset_cfg=dict( + type=MATHDataset, + path='opencompass/math', + reader_cfg=math_reader_cfg, + ), + judge_cfg=dict(), + dict_postprocessor=dict(type=generic_llmjudge_postprocess), + ), + pred_role='BOT', +) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='opencompass/math', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/opencompass/openicl/icl_evaluator/math_evaluator.py b/opencompass/openicl/icl_evaluator/math_evaluator.py index 48764252..16db89f8 100644 --- a/opencompass/openicl/icl_evaluator/math_evaluator.py +++ b/opencompass/openicl/icl_evaluator/math_evaluator.py @@ -22,26 +22,16 @@ class MATHEvaluator(BaseEvaluator): details = [] for i, j in zip(predictions, references): count += 1 + j_with_env = f'${j}$' gold_parsed = parse( - j, + j_with_env, extraction_mode='first_match', extraction_config=[ LatexExtractionConfig(), ExprExtractionConfig(), ], ) - # If parsing result is empty, try adding LaTeX - # environment and parse again - if len(gold_parsed) == 0: - j_with_env = f'${j}$' - gold_parsed = parse( - j_with_env, - extraction_mode='first_match', - extraction_config=[ - LatexExtractionConfig(), - ExprExtractionConfig(), - ], - ) + if len(gold_parsed) != 0: # We require the answer to be provided in correct # latex (no malformed operators) From a05f9da134e9c0e3a10255d7c884767c5405f520 Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Tue, 8 Apr 2025 14:42:26 +0800 Subject: [PATCH 56/58] [Feature] Make dump-eval-details default behavior (#1999) * Update * update * update --- docs/en/user_guides/experimentation.md | 2 +- docs/zh_cn/user_guides/experimentation.md | 2 +- opencompass/cli/main.py | 11 ++++++++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/en/user_guides/experimentation.md b/docs/en/user_guides/experimentation.md index 0f5575a0..a5e2df43 100644 --- a/docs/en/user_guides/experimentation.md +++ b/docs/en/user_guides/experimentation.md @@ -57,7 +57,7 @@ The parameter explanation is as follows: - `-w`: Specify the working path, default is `./outputs/default`. 
 - `-l`: Enable status reporting via Lark bot.
 - `--dry-run`: When enabled, inference and evaluation tasks will be dispatched but won't actually run for debugging.
-- `--dump-eval-details`: When enabled,evaluation under the `results` folder will include more details, such as the correctness of each sample.
+- `--dump-eval-details`: Enabled by default; the evaluation results under the `results` folder will include more details, such as the correctness of each sample. Set `--dump-eval-details False` to disable it.
 
 Using run mode `-m all` as an example, the overall execution flow is as follows:
 
diff --git a/docs/zh_cn/user_guides/experimentation.md b/docs/zh_cn/user_guides/experimentation.md
index c960e0f8..53e5d6de 100644
--- a/docs/zh_cn/user_guides/experimentation.md
+++ b/docs/zh_cn/user_guides/experimentation.md
@@ -57,7 +57,7 @@ python run.py $EXP {--slurm | --dlc | None} [-p PARTITION] [-q QUOTATYPE] [--deb
 - `-w`: 指定工作路径,默认为 `./outputs/default`
 - `-l`: 打开飞书机器人状态上报。
 - `--dry-run`: 开启时,推理和评测任务仅会分发但不会真正运行,便于调试;
-- `--dump-eval-details`: 开启时,`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。
+- `--dump-eval-details`: 默认开启,`results` 下的评测结果中将会包含更加详细的评测结果信息,例如每条样本是否正确等。如不需要保存详细结果,可设置 `--dump-eval-details False` 关闭。
 
 以运行模式 `-m all` 为例,整体运行流如下:
 
diff --git a/opencompass/cli/main.py b/opencompass/cli/main.py
index a5937033..d1f4b1dd 100644
--- a/opencompass/cli/main.py
+++ b/opencompass/cli/main.py
@@ -119,8 +119,11 @@ def parse_args():
     parser.add_argument(
         '--dump-eval-details',
         help='Whether to dump the evaluation details, including the '
-        'correctness of each sample, bpb, etc.',
-        action='store_true',
+        'correctness of each sample, bpb, etc. Defaults to True.',
+        nargs='?',
+        const=True,
+        default=True,
+        type=lambda x: False if x and x.lower() == 'false' else True
     )
     parser.add_argument(
         '--dump-extract-rate',
@@ -233,7 +236,6 @@ def parse_custom_dataset_args(custom_dataset_parser):
 
 def main():
     args = parse_args()
-
     if args.num_gpus is not None:
         raise ValueError('The `--num-gpus` argument is deprecated, please use '
                          '`--hf-num-gpus` to describe number of gpus used for '
@@ -350,6 +352,9 @@ def main():
     if args.dlc or args.slurm or cfg.get('eval', None) is None:
         fill_eval_cfg(cfg, args)
         if args.dump_eval_details:
+            logger.warning('Default to dump eval details, it might take extra'
+                           ' space to save all the evaluation details. 
' + 'Set --dump-eval-details False to skip the details dump') cfg.eval.runner.task.dump_details = True if args.dump_extract_rate: cfg.eval.runner.task.cal_extract_rate = True From 6ac9b06bc2209d82333987671d20175d7cd0d2f7 Mon Sep 17 00:00:00 2001 From: zhulinJulia24 <145004780+zhulinJulia24@users.noreply.github.com> Date: Wed, 9 Apr 2025 14:09:35 +0800 Subject: [PATCH 57/58] [ci] update baseline for kernal change of vllm and lmdeploy (#2011) * update * update * update * update * update * update * update --- .../scripts/oc_score_baseline_fullbench.yaml | 26 +++---- .../scripts/oc_score_baseline_testrange.yaml | 78 +++++++++---------- .github/workflows/daily-run-test.yml | 2 +- opencompass/datasets/subjective/__init__.py | 1 + .../datasets/subjective/commonbench.py | 56 +++++++++++++ 5 files changed, 110 insertions(+), 53 deletions(-) create mode 100644 opencompass/datasets/subjective/commonbench.py diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml index 3f5753d3..fd355c0e 100644 --- a/.github/scripts/oc_score_baseline_fullbench.yaml +++ b/.github/scripts/oc_score_baseline_fullbench.yaml @@ -9,7 +9,7 @@ internlm2_5-7b-chat-hf_fullbench: drop_accuracy: 81.25 GPQA_diamond_accuracy: 25 hellaswag_accuracy: 87.5 - TheoremQA_score: 18.75 + TheoremQA_score: 12.50 musr_average_naive_average: 39.58 korbench_single_naive_average: 40 gsm8k_accuracy: 62.50 @@ -162,7 +162,7 @@ internlm2_5-7b-hf_fullbench: drop_accuracy: 62.5 GPQA_diamond_accuracy: 62.5 hellaswag_accuracy: 93.75 - TheoremQA_score: 25 + TheoremQA_score: 12.50 winogrande_accuracy: 75 gsm8k_accuracy: 37.5 GaokaoBench_2010-2022_Math_II_MCQs_score: 62.5 @@ -190,7 +190,7 @@ internlm2_5-7b-turbomind_fullbench: drop_accuracy: 62.5 GPQA_diamond_accuracy: 62.5 hellaswag_accuracy: 93.75 - TheoremQA_score: 31.25 + TheoremQA_score: 12.50 winogrande_accuracy: 87.5 gsm8k_accuracy: 56.25 GaokaoBench_2010-2022_Math_II_MCQs_score: 68.75 @@ -391,7 +391,7 @@ internlm2_5-7b-chat-turbomind: alpaca_eval_total: 25.96 arenahard_score: 17.15 Followbench_naive_average: 0.81 - CompassArena_naive_average: 34.61 + CompassArena_naive_average: 39.49 FoFo_naive_average: 0.38 mtbench101_avg: 8.01 wildbench_average: -10.49 @@ -410,10 +410,10 @@ internlm2_5-7b-chat-turbomind: alpaca_eval_oasst: 23.4 alpaca_eval_selfinstruct: 30.95 alpaca_eval_vicuna: 33.75 - compassarena_language_naive_average: 52.5 + compassarena_language_naive_average: 58.50 compassarena_knowledge_naive_average: 36 compassarena_reason_v2_naive_average: 35 - compassarena_math_v2_naive_average: 19.91 + compassarena_math_v2_naive_average: 25.95 compassarena_creationv2_zh_naive_average: 43.64 fofo_test_prompts_overall: 0.35 fofo_test_prompts_cn_overall: 0.41 @@ -493,7 +493,7 @@ qwen2.5-7b-instruct-turbomind: bigcodebench_hard_instruct_pass@1: 16.22 bigcodebench_hard_complete_pass@1: 11.49 teval_naive_average: 79.72 - SciCode_sub_accuracy: 100 + SciCode_sub_accuracy: 10.76 qa_dingo_cn_score: 99.01 mmlu_accuracy: 76.01 mmlu-stem_accuracy: 77.59 @@ -600,7 +600,7 @@ internlm2_5-7b-chat-pytorch: bigcodebench_hard_instruct_pass@1: 6.08 bigcodebench_hard_complete_pass@1: 6.76 teval_naive_average: 79.73 - SciCode_sub_accuracy: 100 + SciCode_sub_accuracy: 3.47 qa_dingo_cn_score: 100 mmlu_accuracy: 70.2 mmlu-stem_accuracy: 67.73 @@ -689,7 +689,7 @@ qwen2.5-7b-instruct-pytorch: GaokaoBench_weighted_average: 80.02 math_accuracy: 73.74 cmo_fib_accuracy: 26.44 - aime2024_accuracy: 10 + aime2024_accuracy: 13.33 Mathbench_naive_average: 77.08 
wikibench-wiki-single_choice_cncircular_perf_4: 34 cmmlu_naive_average: 75.9 @@ -705,7 +705,7 @@ qwen2.5-7b-instruct-pytorch: bigcodebench_hard_instruct_pass@1: 16.89 bigcodebench_hard_complete_pass@1: 12.16 teval_naive_average: 79.46 - SciCode_sub_accuracy: 100 + SciCode_sub_accuracy: 10.42 qa_dingo_cn_score: 100 mmlu_accuracy: 76.27 mmlu-stem_accuracy: 77.75 @@ -810,7 +810,7 @@ internlm3-8b-instruct-turbomind: bigcodebench_hard_instruct_pass@1: 13.51 bigcodebench_hard_complete_pass@1: 15.54 teval_naive_average: 82.86 - SciCode_sub_accuracy: 100 + SciCode_sub_accuracy: 11.11 qa_dingo_cn_score: 100 mmlu_accuracy: 76.21 mmlu-stem_accuracy: 77.7 @@ -889,7 +889,7 @@ internlm3-8b-instruct-pytorch: IFEval_Prompt-level-strict-accuracy: 79.11 drop_accuracy: 83.32 bbh_naive_average: 54.76 - GPQA_diamond_accuracy: 42.42 + GPQA_diamond_accuracy: 33.84 hellaswag_accuracy: 91.31 TheoremQA_score: 18 musr_average_naive_average: 36.62 @@ -915,7 +915,7 @@ internlm3-8b-instruct-pytorch: bigcodebench_hard_instruct_pass@1: 12.84 bigcodebench_hard_complete_pass@1: 15.54 teval_naive_average: 82.86 - SciCode_sub_accuracy: 100 + SciCode_sub_accuracy: 9.38 qa_dingo_cn_score: 100 mmlu_accuracy: 76.23 mmlu-stem_accuracy: 78.08 diff --git a/.github/scripts/oc_score_baseline_testrange.yaml b/.github/scripts/oc_score_baseline_testrange.yaml index 16a13209..94a28d36 100644 --- a/.github/scripts/oc_score_baseline_testrange.yaml +++ b/.github/scripts/oc_score_baseline_testrange.yaml @@ -6,7 +6,7 @@ chat: gsm8k_accuracy: 71.88 race-high_accuracy: 90.62 glm-4-9b-chat-vllm: - gsm8k_accuracy: 68.75 + gsm8k_accuracy: 71.88 race-high_accuracy: 90.62 deepseek-7b-chat-hf: gsm8k_accuracy: 46.88 @@ -84,7 +84,7 @@ chat: gsm8k_accuracy: 81.25 race-high_accuracy: 90.62 llama-3_2-3b-instruct-turbomind: - gsm8k_accuracy: 75.00 + gsm8k_accuracy: 68.75 race-high_accuracy: 81.25 llama-3-8b-instruct-turbomind: gsm8k_accuracy: 68.75 @@ -204,14 +204,14 @@ chat: gsm8k_accuracy: 90.62 race-high_accuracy: 84.38 mixtral-8x22b-instruct-v0.1-turbomind: - gsm8k_accuracy: 75 + gsm8k_accuracy: 78.12 race-high_accuracy: 78.12 mixtral-8x22b-instruct-v0.1-vllm: gsm8k_accuracy: 78.12 race-high_accuracy: 78.12 base: glm-4-9b-turbomind: - gsm8k_accuracy: 56.25 + gsm8k_accuracy: 59.38 GPQA_diamond_accuracy: 28.12 race-high_accuracy: 93.75 winogrande_accuracy: 84.38 @@ -253,8 +253,8 @@ base: gemma-2-9b-turbomind: gsm8k_accuracy: 68.75 GPQA_diamond_accuracy: 0 - race-high_accuracy: 78.12 - winogrande_accuracy: 50 + race-high_accuracy: 18.75 + winogrande_accuracy: 46.88 gemma-2b-vllm: gsm8k_accuracy: 15.62 GPQA_diamond_accuracy: 3.12 @@ -281,20 +281,20 @@ base: race-high_accuracy: 71.88 winogrande_accuracy: 75 internlm2_5-7b-turbomind: + gsm8k_accuracy: 62.5 + GPQA_diamond_accuracy: 31.25 + race-high_accuracy: 93.75 + winogrande_accuracy: 87.5 + internlm2-7b-turbomind: gsm8k_accuracy: 59.38 GPQA_diamond_accuracy: 34.38 - race-high_accuracy: 93.75 - winogrande_accuracy: 84.38 - internlm2-7b-turbomind: - gsm8k_accuracy: 50 - GPQA_diamond_accuracy: 18.75 - race-high_accuracy: 71.88 - winogrande_accuracy: 84.38 + race-high_accuracy: 78.12 + winogrande_accuracy: 71.88 internlm2-base-7b-turbomind: - gsm8k_accuracy: 37.50 - GPQA_diamond_accuracy: 21.88 - race-high_accuracy: 84.38 - winogrande_accuracy: 75 + gsm8k_accuracy: 28.12 + GPQA_diamond_accuracy: 31.25 + race-high_accuracy: 71.88 + winogrande_accuracy: 62.50 llama-2-7b-hf: gsm8k_accuracy: 21.88 GPQA_diamond_accuracy: 21.88 @@ -311,15 +311,15 @@ base: race-high_accuracy: 65.62 winogrande_accuracy: 65.62 
llama-3.1-8b-turbomind: - gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 9.38 + gsm8k_accuracy: 59.38 + GPQA_diamond_accuracy: 15.62 race-high_accuracy: 78.12 winogrande_accuracy: 78.12 llama-3-8b-turbomind: gsm8k_accuracy: 46.88 GPQA_diamond_accuracy: 12.50 race-high_accuracy: 65.62 - winogrande_accuracy: 78.12 + winogrande_accuracy: 81.25 mistral-7b-v0.3-hf: gsm8k_accuracy: 31.25 GPQA_diamond_accuracy: 6.25 @@ -331,8 +331,8 @@ base: race-high_accuracy: 87.5 winogrande_accuracy: 71.88 qwen2.5-1.5b-turbomind: - gsm8k_accuracy: 62.50 - GPQA_diamond_accuracy: 15.62 + gsm8k_accuracy: 59.38 + GPQA_diamond_accuracy: 18.75 race-high_accuracy: 75 winogrande_accuracy: 71.88 qwen2.5-7b-turbomind: @@ -362,19 +362,19 @@ base: winogrande_accuracy: 68.75 qwen2-1.5b-turbomind: gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 12.50 + GPQA_diamond_accuracy: 6.25 race-high_accuracy: 81.25 winogrande_accuracy: 75 qwen2-7b-turbomind: - gsm8k_accuracy: 65.62 + gsm8k_accuracy: 62.5 GPQA_diamond_accuracy: 12.5 race-high_accuracy: 87.5 - winogrande_accuracy: 71.88 + winogrande_accuracy: 75 qwen1.5-0.5b-vllm: - gsm8k_accuracy: 6.25 + gsm8k_accuracy: 9.38 GPQA_diamond_accuracy: 0 race-high_accuracy: 56.25 - winogrande_accuracy: 62.5 + winogrande_accuracy: 59.38 yi-1.5-6b-hf: gsm8k_accuracy: 62.5 GPQA_diamond_accuracy: 3.12 @@ -387,11 +387,11 @@ base: winogrande_accuracy: 59.38 yi-1.5-9b-turbomind: gsm8k_accuracy: 78.12 - GPQA_diamond_accuracy: 43.75 + GPQA_diamond_accuracy: 40.62 race-high_accuracy: 87.5 - winogrande_accuracy: 71.88 + winogrande_accuracy: 65.62 internlm2-20b-turbomind: - gsm8k_accuracy: 75 + gsm8k_accuracy: 71.88 GPQA_diamond_accuracy: 18.75 race-high_accuracy: 68.75 winogrande_accuracy: 81.25 @@ -406,18 +406,18 @@ base: race-high_accuracy: 93.75 winogrande_accuracy: 78.12 qwen2.5-32b-turbomind: - gsm8k_accuracy: 87.5 - GPQA_diamond_accuracy: 18.75 + gsm8k_accuracy: 84.38 + GPQA_diamond_accuracy: 28.12 race-high_accuracy: 93.75 winogrande_accuracy: 81.25 deepseek-67b-base-turbomind: - gsm8k_accuracy: 53.12 - GPQA_diamond_accuracy: 28.12 - race-high_accuracy: 81.25 - winogrande_accuracy: 84.38 + gsm8k_accuracy: 59.38 + GPQA_diamond_accuracy: 34.38 + race-high_accuracy: 78.12 + winogrande_accuracy: 81.25 llama-3-70b-turbomind: gsm8k_accuracy: 56.25 - GPQA_diamond_accuracy: 12.50 + GPQA_diamond_accuracy: 15.62 race-high_accuracy: 93.75 winogrande_accuracy: 84.38 qwen2.5-72b-turbomind: @@ -426,7 +426,7 @@ base: race-high_accuracy: 93.75 winogrande_accuracy: 87.5 deepseek-v2-turbomind: - gsm8k_accuracy: 59.38 - GPQA_diamond_accuracy: 3.12 + gsm8k_accuracy: 65.62 + GPQA_diamond_accuracy: 9.38 race-high_accuracy: 93.75 winogrande_accuracy: 81.25 diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 6a1c2ebc..e6000c09 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -44,7 +44,7 @@ on: type: string default: "['base_objective','chat_objective','chat_subjective','base_long_context','chat_long_context']" schedule: - - cron: '15 14 * * 0,2' + - cron: '15 14 * * 0,3' env: HF_DATASETS_OFFLINE: 1 diff --git a/opencompass/datasets/subjective/__init__.py b/opencompass/datasets/subjective/__init__.py index 09a0a8ed..8d663528 100644 --- a/opencompass/datasets/subjective/__init__.py +++ b/opencompass/datasets/subjective/__init__.py @@ -7,6 +7,7 @@ from .alpacaeval import alpacaeval_postprocess # noqa: F401, F403 from .arena_hard import ArenaHardDataset # noqa: F401, F403 from .arena_hard import arenahard_bradleyterry_postprocess # 
noqa: F401, F403 from .arena_hard import arenahard_postprocess # noqa: F401, F403 +from .commonbench import commonbench_postprocess from .compass_arena import CompassArenaDataset # noqa: F401, F403 from .compass_arena import \ compassarena_bradleyterry_postprocess # noqa: F401, F403 diff --git a/opencompass/datasets/subjective/commonbench.py b/opencompass/datasets/subjective/commonbench.py new file mode 100644 index 00000000..1b634111 --- /dev/null +++ b/opencompass/datasets/subjective/commonbench.py @@ -0,0 +1,56 @@ +# flake8: noqa: E501 +import re +from collections import defaultdict +from typing import Optional + +from opencompass.registry import DICT_POSTPROCESSORS + +from .utils import get_judgeanswer_and_reference + + +def post_process(judgement: str): + """Input a string like below: + + xxx[[5]]xxx, and extract the score + """ + judgement = judgement['prediction'] + pattern = r'\[\[([\d.]+)\]\]' + matched_result = re.findall(pattern, judgement) + if matched_result: + score = float(matched_result[0]) + else: + return None + return {'score': score} + + +def get_capability_results(judged_answers, references): + capability_ratings = defaultdict(int) + capability_counts = defaultdict(int) + for ans, ref in zip(judged_answers, references): + capability_ratings['total'] += ans['score'] + capability_counts['total'] += 1 + capability_ratings[ref['capability']] += ans['score'] + capability_counts[ref['capability']] += 1 + + capability_avg_ratings = defaultdict(float) + + for capability, total_score in capability_ratings.items(): + s = total_score / capability_counts[capability] + s = round(s, 2) + capability_avg_ratings[capability] = s + + return capability_avg_ratings + + +@DICT_POSTPROCESSORS.register_module('commenbench') +def commonbench_postprocess( + output: dict, + output_path: str, + post_process: Optional[callable] = post_process, +) -> dict: + judged_answers, references = get_judgeanswer_and_reference( + output, output_path, post_process) + + results = get_capability_results(judged_answers, references) + results['details'] = output + return results From 12213207b6f954f00a2826e042795a90c253746c Mon Sep 17 00:00:00 2001 From: Linchen Xiao Date: Wed, 9 Apr 2025 15:52:23 +0800 Subject: [PATCH 58/58] [Refactor] Refactorize openicl eval task (#1990) * [Refactor] Refactorize openicl eval task * update --- .../icl_evaluator/icl_base_evaluator.py | 3 +- opencompass/tasks/openicl_eval.py | 415 +++++++++--------- opencompass/utils/__init__.py | 2 - opencompass/utils/model_postprocessors.py | 135 ------ opencompass/utils/postprocessors/__init__.py | 0 .../postprocessors/naive/PROMPT_TEMPLATE.py | 11 - .../utils/postprocessors/naive/README.md | 71 --- .../utils/postprocessors/naive/__init__.py | 2 - .../utils/postprocessors/naive/extractor.py | 121 ----- .../utils/postprocessors/xfinder/README.md | 194 -------- .../utils/postprocessors/xfinder/__init__.py | 0 .../utils/postprocessors/xfinder/extractor.py | 175 -------- .../xfinder/xfinder_utils/PROMPT_TEMPLATE.py | 14 - .../xfinder/xfinder_utils/__init__.py | 3 - .../xfinder/xfinder_utils/convert_data.py | 123 ------ .../xfinder/xfinder_utils/data_process.py | 24 - 16 files changed, 198 insertions(+), 1095 deletions(-) delete mode 100644 opencompass/utils/model_postprocessors.py delete mode 100644 opencompass/utils/postprocessors/__init__.py delete mode 100644 opencompass/utils/postprocessors/naive/PROMPT_TEMPLATE.py delete mode 100644 opencompass/utils/postprocessors/naive/README.md delete mode 100644 
opencompass/utils/postprocessors/naive/__init__.py delete mode 100644 opencompass/utils/postprocessors/naive/extractor.py delete mode 100644 opencompass/utils/postprocessors/xfinder/README.md delete mode 100644 opencompass/utils/postprocessors/xfinder/__init__.py delete mode 100644 opencompass/utils/postprocessors/xfinder/extractor.py delete mode 100644 opencompass/utils/postprocessors/xfinder/xfinder_utils/PROMPT_TEMPLATE.py delete mode 100644 opencompass/utils/postprocessors/xfinder/xfinder_utils/__init__.py delete mode 100644 opencompass/utils/postprocessors/xfinder/xfinder_utils/convert_data.py delete mode 100644 opencompass/utils/postprocessors/xfinder/xfinder_utils/data_process.py diff --git a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py index 794c0ed6..f7ff0277 100644 --- a/opencompass/openicl/icl_evaluator/icl_base_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_base_evaluator.py @@ -91,7 +91,8 @@ class BaseEvaluator: ): # Check if predictions and references have the # same length if both are provided - if 'predictions' in score_kwargs and 'references' in score_kwargs: + if ('predictions' in score_kwargs and 'references' in score_kwargs + and score_kwargs['references'] is not None): if len(score_kwargs['predictions']) != len( score_kwargs['references']): raise ValueError( diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py index 252a120a..fd0a773c 100644 --- a/opencompass/tasks/openicl_eval.py +++ b/opencompass/tasks/openicl_eval.py @@ -7,7 +7,6 @@ import random import statistics import sys import time -from collections import Counter from inspect import signature from typing import List @@ -19,7 +18,7 @@ from opencompass.registry import (ICL_EVALUATORS, MODELS, TASKS, TEXT_POSTPROCESSORS) from opencompass.tasks.base import BaseTask, extract_role_pred from opencompass.utils import (build_dataset_from_cfg, get_infer_output_path, - get_logger, task_abbr_from_cfg) + get_logger) @TASKS.register_module() @@ -86,6 +85,26 @@ class OpenICLEvalTask(BaseTask): self._score() def _score(self): + # Load and preprocess test data + test_set = self._load_and_preprocess_test_data() + # Load predictions + pred_dicts, pred_strs = self._load_predictions() + + # Process predictions + pred_strs = self._process_predictions(pred_strs) + + # Evaluate predictions + result = self._evaluate_predictions( + pred_strs, + test_set, + pred_dicts, + ) + + # Save results + self._save_results(result) + + def _load_and_preprocess_test_data(self): + """Load test dataset and apply postprocessing if needed.""" test_set = build_dataset_from_cfg(self.dataset_cfg).test # Postprocess dataset if necessary if 'dataset_postprocessor' in self.eval_cfg: @@ -100,7 +119,10 @@ class OpenICLEvalTask(BaseTask): test_set = test_set.map(postprocess) - # Load predictions + return test_set + + def _load_predictions(self): + """Load model predictions from files.""" filename = get_infer_output_path( self.model_cfg, self.dataset_cfg, @@ -110,217 +132,188 @@ class OpenICLEvalTask(BaseTask): root, ext = osp.splitext(filename) partial_filename = root + '_0' + ext - # Get sc_size if use Self-Consistency - sc_size = self.eval_cfg.get('sc_size') - if not osp.exists(osp.realpath(filename)) and not osp.exists( osp.realpath(partial_filename)): - result = {'error': 'No predictions found.'} + raise FileNotFoundError( + f'Prediction files not found: neither {filename} ' + f'nor {partial_filename} exists') + + if 
osp.exists(osp.realpath(filename)): + preds = mmengine.load(filename) + preds = [preds[str(i)] for i in range(len(preds))] else: - if osp.exists(osp.realpath(filename)): - preds = mmengine.load(filename) - preds = [preds[str(i)] for i in range(len(preds))] + filename = partial_filename + preds = [] + i = 1 + while osp.exists(osp.realpath(filename)): + sub_preds = mmengine.load(filename) + preds.extend( + [sub_preds[str(i)] for i in range(len(sub_preds))]) + filename = root + f'_{i}' + ext + i += 1 + + pred_dicts = copy.deepcopy(preds) + preds = {k: [pred.get(k) for pred in preds] for k in preds[0]} + + pred_strs = preds.pop('prediction', None) + + return pred_dicts, pred_strs + + def _process_predictions(self, pred_strs): + """Apply various processing steps to predictions.""" + # Check if we're dealing with a list of lists (pred_list_flag) + pred_list_flag = pred_strs is not None and isinstance( + pred_strs[0], list) + + # Extract role predictions if needed + if ('pred_role' in self.eval_cfg and 'meta_template' in self.model_cfg + and not MODELS.get(self.model_cfg['type']).is_api): + # Create a prompt template for role config parsing + from opencompass.models.base import LMTemplateParser + + parser = LMTemplateParser(self.model_cfg['meta_template']) + role = parser.roles[self.eval_cfg['pred_role']] + if pred_list_flag: + pred_strs = [[ + extract_role_pred( + _pred, + role.get('begin', None), + role.get('end', None), + ) for _pred in pred + ] for pred in pred_strs] else: - filename = partial_filename - preds = [] - i = 1 - while osp.exists(osp.realpath(filename)): - sub_preds = mmengine.load(filename) - preds.extend( - [sub_preds[str(i)] for i in range(len(sub_preds))]) - filename = root + f'_{i}' + ext - i += 1 - pred_dicts = copy.deepcopy(preds) - preds = {k: [pred.get(k) for pred in preds] for k in preds[0]} - - pred_strs = preds.pop('prediction', None) - pred_list_flag = pred_strs is not None and isinstance( - pred_strs[0], list) - if ('pred_role' in self.eval_cfg - and 'meta_template' in self.model_cfg - and not MODELS.get(self.model_cfg['type']).is_api): - # Create a prompt template for role config parsing - from opencompass.models.base import LMTemplateParser - - parser = LMTemplateParser(self.model_cfg['meta_template']) - role = parser.roles[self.eval_cfg['pred_role']] - if sc_size is not None: - assert pred_list_flag, ( - 'The prediction for Self-Consistency' - 'must be list.') - if pred_list_flag: - pred_strs = [[ - extract_role_pred( - _pred, - role.get('begin', None), - role.get('end', None), - ) for _pred in pred - ] for pred in pred_strs] - else: - pred_strs = [ - extract_role_pred( - pred, - role.get('begin', None), - role.get('end', None), - ) for pred in pred_strs - ] - - # Postprocess predictions if necessary - # Model Specified Postprocessor - if 'pred_postprocessor' in self.model_cfg: - kwargs = copy.deepcopy(self.model_cfg['pred_postprocessor']) - proc = kwargs.pop('type') - if isinstance(proc, str): - proc = TEXT_POSTPROCESSORS.get(proc) - if pred_list_flag: - pred_strs = [[proc(s, **kwargs) for s in preds] - for preds in pred_strs] - else: - pred_strs = [proc(s, **kwargs) for s in pred_strs] - # Dataset Specified Postprocessor - if 'pred_postprocessor' in self.eval_cfg: - kwargs = copy.deepcopy(self.eval_cfg['pred_postprocessor']) - proc = kwargs.pop('type') - if isinstance(proc, str): - proc = TEXT_POSTPROCESSORS.get(proc) - if pred_list_flag: - pred_strs = [[proc(s, **kwargs) for s in preds] - for preds in pred_strs] - else: - pred_strs = [proc(s, **kwargs) for s in 
pred_strs] - - model_pred_strs = [] - if 'model_postprocessor' in self.eval_cfg: - references = (test_set[self.output_column] - if self.output_column else None) - model_pred_dicts = copy.deepcopy(pred_dicts) - for i, pred_dict in enumerate(model_pred_dicts): - pred_dict['reference'] = [references[i]] - self.logger.info('Postprocessing model predictions...') - kwargs = self.eval_cfg['model_postprocessor'] - proc = kwargs.pop('type') - if isinstance(proc, str): - proc = TEXT_POSTPROCESSORS.get(proc) - if pred_list_flag: - model_pred_strs = [[ - proc(model_pred_dict, **kwargs) - for model_pred_dict in model_pred_dicts - ]] - else: - model_pred_strs = proc(model_pred_dicts, **kwargs) - - # Get majority voting predictions if use self-consistency - if sc_size is not None: pred_strs = [ - Counter(s).most_common(1)[0][0] for s in pred_strs + extract_role_pred( + pred, + role.get('begin', None), + role.get('end', None), + ) for pred in pred_strs ] - icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator']) - # need results dir to save other files - out_path = get_infer_output_path( - self.model_cfg, - self.dataset_cfg, - osp.join(self.work_dir, 'results'), - ) - icl_evaluator._out_dir = osp.splitext(out_path)[ - 0] # strip extension - - preds['predictions'] = pred_strs - preds['references'] = (test_set[self.output_column] - if self.output_column else None) - preds['test_set'] = test_set - if 'origin_prompt' not in preds: - try: - preds['origin_prompt'] = [ - None for _ in range(len(pred_strs)) - ] - except TypeError: - preds['origin_prompt'] = None - preds = { - k: preds[k] - for k in signature(icl_evaluator.score).parameters - } - k = self.dataset_cfg.get('k', 1) - n = self.dataset_cfg.get('n', 1) - result = icl_evaluator.evaluate(k, n, copy.deepcopy(test_set), - **preds) - - # Get model postprocess result - model_details = None - model_result = None - if 'model_postprocessor' in self.eval_cfg: - model_preds = copy.deepcopy(preds) - model_preds['predictions'] = model_pred_strs - model_result = icl_evaluator.evaluate(k, n, - copy.deepcopy(test_set), - **model_preds) - for key in model_result: - if key == 'details': - model_details = model_result[key] - continue - new_key = 'model_postprocess_' + key - result[new_key] = model_result[key] - - if self.dump_details: - details = result.get('details', None) - # Try to format details is details is not provided by evaluator - if details is None: - self.logger.info( - 'Details is not give by evaluator, try to format it') - try: - result['details'] = self.format_details( - pred_strs, - model_pred_strs, - test_set[self.output_column], - details, - model_details, - pred_dicts, - ) - self.logger.warning( - f"result['details'] : {result['details']}"), - result['type'] = result['details'].pop('type', None) - if self.cal_extract_rate: - # Calculate the extraction success - # rate for prediction - result['extract_rate'] = self.extract_rate(result) - - if 'PPL' in str( - self.dataset_cfg.infer_cfg.inferencer.type): - result['correct_bpb'], result['incorrect_bpb'] = ( - self.calculate_bpb(pred_dicts)) - except Exception as e: - self.logger.warning( - f'Skip dumping details due to: {e}.') + # Apply postprocessors if configured + # Postprocess predictions if necessary + # Model Specified Postprocessor + if 'pred_postprocessor' in self.model_cfg: + kwargs = copy.deepcopy(self.model_cfg['pred_postprocessor']) + proc = kwargs.pop('type') + if isinstance(proc, str): + proc = TEXT_POSTPROCESSORS.get(proc) + if pred_list_flag: + pred_strs = [[proc(s, **kwargs) for s 
in preds] + for preds in pred_strs] else: - result.pop('details', None) + pred_strs = [proc(s, **kwargs) for s in pred_strs] - if 'error' in result: - self.logger.error( - f'Task {task_abbr_from_cfg(self.cfg)}: {result["error"]}') - return - elif model_result is None: - result_wo_details = { - i: result[i] - for i in result if i != 'details' - } - self.logger.info( - f'Task {task_abbr_from_cfg(self.cfg)}: {result_wo_details}') + # Dataset Specified Postprocessor + if 'pred_postprocessor' in self.eval_cfg: + kwargs = copy.deepcopy(self.eval_cfg['pred_postprocessor']) + proc = kwargs.pop('type') + if isinstance(proc, str): + proc = TEXT_POSTPROCESSORS.get(proc) + if pred_list_flag: + pred_strs = [[proc(s, **kwargs) for s in preds] + for preds in pred_strs] + else: + pred_strs = [proc(s, **kwargs) for s in pred_strs] + + return pred_strs + + def _evaluate_predictions( + self, + pred_strs, + test_set, + pred_dicts, + ): + """Evaluate predictions using the configured evaluator.""" + # Get references from test set + references = (None if self.output_column is None else + [sample[self.output_column] for sample in test_set]) + # Build evaluator from config + evaluator_cfg = self.eval_cfg.get('evaluator', {}) + evaluator_type = evaluator_cfg.get('type') + if isinstance(evaluator_type, str): + evaluator_type = ICL_EVALUATORS.get(evaluator_type) + + # Prepare evaluator inputs + evaluator_cfg_copy = copy.deepcopy(evaluator_cfg) + evaluator_cfg_copy.pop('type', None) + # Initialize evaluator with appropriate parameters + sig = signature(evaluator_type) + if 'predictions' in sig.parameters and 'references' in sig.parameters: + evaluator = evaluator_type( + predictions=pred_strs, + references=references, + **evaluator_cfg_copy, + ) else: - result_wo_details = { - i: result[i] - for i in result if i != 'details' - } - model_result_wo_details = { - i: model_result[i] - for i in model_result if i != 'details' - } - self.logger.info( - f'Task {task_abbr_from_cfg(self.cfg)}: {result_wo_details}') - self.logger.info( - 'Model Postprocess Task: ' + - f'{task_abbr_from_cfg(self.cfg)}:{model_result_wo_details}') + evaluator = evaluator_type(**evaluator_cfg_copy) - # Save result + # Set output directory for the evaluator + out_path = get_infer_output_path( + self.model_cfg, + self.dataset_cfg, + osp.join(self.work_dir, 'results'), + ) + evaluator._out_dir = osp.splitext(out_path)[0] # strip extension + + # If preds contains keys that match the score method + # parameters, include them + if pred_dicts: + preds = { + k: [pred.get(k) for pred in pred_dicts] + for k in pred_dicts[0] + } + # Add predictions and references if they're expected + # by the score method + preds['predictions'] = pred_strs + preds['references'] = (test_set[self.output_column] + if self.output_column else None) + preds['test_set'] = test_set + if 'origin_prompt' not in preds: + try: + preds['origin_prompt'] = [None for _ in range(len(pred_strs))] + except TypeError: + preds['origin_prompt'] = None + preds = {k: preds[k] for k in signature(evaluator.score).parameters} + # Call evaluate with the appropriate parameters + k = self.dataset_cfg.get('k', 1) + n = self.dataset_cfg.get('n', 1) + result = evaluator.evaluate(k, n, copy.deepcopy(test_set), **preds) + + # Format details if needed + if self.dump_details: + # Get detailed results if available + details = result.get('details', None) + if details is None: + self.logger.info( + 'Details is not give by evaluator, try to format it') + try: + result['details'] = self.format_details( + pred_strs, + 
references, + details, + pred_dicts, + ) + + # Calculate extraction rate if needed + if self.cal_extract_rate and details is not None: + result['extract_rate'] = self.extract_rate(result) + + # Calculate BPB if applicable + if pred_dicts and 'BPB' in pred_dicts[0].get( + list(pred_dicts[0].keys())[0], {}): + correct_bpb, incorrect_bpb = self.calculate_bpb( + pred_dicts) + result['correct_bpb'] = correct_bpb + result['incorrect_bpb'] = incorrect_bpb + except Exception as e: + self.logger.warning(f'Skip dumping details due to: {e}.') + else: + result.pop('details', None) + return result + + def _save_results(self, result): + """Save evaluation results to file.""" out_path = get_infer_output_path( self.model_cfg, self.dataset_cfg, @@ -351,10 +344,8 @@ class OpenICLEvalTask(BaseTask): def format_details( self, predictions, - model_pred_strs, references, details, - model_details, pred_dicts, ): """This function is responsible for formatting prediction details. @@ -393,20 +384,6 @@ class OpenICLEvalTask(BaseTask): result['predictions'] = str(predictions[i]) result['references'] = str(references[i]) result['correct'] = str(predictions[i]) == str(references[i]) - elif details is not None and model_details is not None: - assert ( - model_pred_strs != [] - ), 'Model details is not None, but model_pred_strs is empty' - self.logger.info( - f"model_details[i]['pred']: {model_details[i]['pred']}") - results['type'] = 'GEN' - result['prompt'] = origin_prediction['origin_prompt'] - result['origin_prediction'] = pred_dicts[i]['prediction'] - result['predictions'] = details[i]['pred'] - result['model_extract_predictions'] = model_details[i]['pred'] - result['references'] = details[i]['answer'] - result['correct'] = details[i]['correct'] - result['model_extract_correct'] = model_details[i]['correct'] elif details is not None: results['type'] = 'GEN' result['prompt'] = origin_prediction['origin_prompt'] diff --git a/opencompass/utils/__init__.py b/opencompass/utils/__init__.py index ba4c80c3..21f003f9 100644 --- a/opencompass/utils/__init__.py +++ b/opencompass/utils/__init__.py @@ -10,9 +10,7 @@ from .fileio import * # noqa from .lark import * # noqa from .logging import * # noqa from .menu import * # noqa -from .model_postprocessors import * # noqa from .network import * # noqa -from .postprocessors import * # noqa from .prompt import * # noqa from .result_station import * # noqa from .text_postprocessors import * # noqa diff --git a/opencompass/utils/model_postprocessors.py b/opencompass/utils/model_postprocessors.py deleted file mode 100644 index fa0336ee..00000000 --- a/opencompass/utils/model_postprocessors.py +++ /dev/null @@ -1,135 +0,0 @@ -from functools import partial -from multiprocessing import Pool -from typing import Union - -from tqdm import tqdm - -from opencompass.registry import TEXT_POSTPROCESSORS - -from .postprocessors.naive import NaiveExtractor, format_input_naive -from .postprocessors.xfinder.extractor import Extractor -from .postprocessors.xfinder.xfinder_utils import (DataProcessor, - convert_to_xfinder_format) - - -def gen_output_naive(ori_data, extractor): - extracted_answers = [] - for item in tqdm(ori_data): - user_input = extractor.prepare_input(item) - extracted_answer = extractor.gen_output(user_input) - item['extracted_answer'] = extracted_answer - extracted_answers.append(extracted_answer) - - return extracted_answers - - -@TEXT_POSTPROCESSORS.register_module('naive') -def naive_model_postprocess(preds: list, - model_name: str, - custom_instruction: str, - api_url: 
Union[str, list], - num_processes: int = 8, - **kwargs) -> list: - """Postprocess the text extracted by custom model. - Args: - preds (list): The question, reference answer and model prediction. - model_name (str): The name of the model. - custom_instruction (str): Custom instruction for the dataset. - url (Union[str, list]): The api url of the model. - - Returns: - list: The postprocessed answers. - """ - - def _eval_pred(texts, extractor, num_processes): - ori_data = texts - extracted_answers = [] - batched_ori_data = [] - # Split data into batches - num_processes = min(num_processes, len(ori_data)) - batch_size = len(ori_data) // num_processes - for i in range(0, len(ori_data), batch_size): - batched_ori_data.append(ori_data[i:i + batch_size]) - with Pool(num_processes) as p: - results = p.map(partial(gen_output_naive, extractor=extractor), - batched_ori_data) - for result in results: - extracted_answers.extend(result) - return extracted_answers - - format_data = format_input_naive(preds) - assert api_url is not None, 'Please provide the api url.' - extractor = NaiveExtractor( - model_name=model_name, - custom_instruction=custom_instruction, - url=api_url.split(',') if ',' in api_url else api_url) - calc_acc_func = partial(_eval_pred, - extractor=extractor, - num_processes=num_processes) - extracted_answers = calc_acc_func(format_data) - return extracted_answers - - -def gen_output_xfinder(ori_data, extractor): - ext_cor_pairs = [] - extracted_data = [] - extracted_answers = [] - for item in tqdm(ori_data): - user_input = extractor.prepare_input(item) - extracted_answer = extractor.gen_output(user_input) - ext_cor_pairs.append([ - item['key_answer_type'], item['standard_answer_range'], - extracted_answer, item['correct_answer'] - ]) - item['xfinder_extracted_answer'] = extracted_answer - extracted_answers.append(extracted_answer) - extracted_data.append(item) - - return extracted_answers, ext_cor_pairs, extracted_data - - -@TEXT_POSTPROCESSORS.register_module('xfinder') -def xfinder_postprocess(preds: list, question_type: str, model_name: str, - api_url: Union[str, list], **kwargs) -> list: - """Postprocess the text extracted by xFinder model. - Args: - preds (list): The question, reference answer and model prediction. - question_type (str): The type of the question. - url (Union[str, list]): The api url of the xFinder model. - - - Returns: - list: The postprocessed texts. - """ - - def _eval_pred(texts, data_processor, extractor, num_processes=8): - ori_data = data_processor.read_data(texts) - extracted_correct_pairs = [] - extracted_data = [] - extracted_answers = [] - batched_ori_data = [] - # Split data into batches - num_processes = min(num_processes, len(ori_data)) - batch_size = len(ori_data) // num_processes - for i in range(0, len(ori_data), batch_size): - batched_ori_data.append(ori_data[i:i + batch_size]) - with Pool(num_processes) as p: - results = p.map(partial(gen_output_xfinder, extractor=extractor), - batched_ori_data) - for result in results: - extracted_answers += result[0] - extracted_correct_pairs += result[1] - extracted_data += result[2] - return extracted_answers - - format_data = convert_to_xfinder_format(question_type, preds) - assert api_url is not None, 'Please provide the api url.' 
- data_processor = DataProcessor() - extractor = Extractor( - model_name=model_name, - url=api_url.split(',') if ',' in api_url else api_url) - calc_acc_func = partial(_eval_pred, - data_processor=data_processor, - extractor=extractor) - extracted_answers = calc_acc_func(format_data) - return extracted_answers diff --git a/opencompass/utils/postprocessors/__init__.py b/opencompass/utils/postprocessors/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/opencompass/utils/postprocessors/naive/PROMPT_TEMPLATE.py b/opencompass/utils/postprocessors/naive/PROMPT_TEMPLATE.py deleted file mode 100644 index b8b9abbb..00000000 --- a/opencompass/utils/postprocessors/naive/PROMPT_TEMPLATE.py +++ /dev/null @@ -1,11 +0,0 @@ -OPTION_NAVIE_PROMPT_TEMPLATE = """ -There is a detailed explanation of the final answer you should extract: -1. You should extract the final answer option like 'A', 'B', 'C', 'D' ... from the given output sentences. -2. The question is a single choice question, so the final answer option should be one of the options, not a combination of options. -""" # noqa - -MATH_NAVIE_PROMPT_TEMPLATE = """ -This is a detailed explanation of the final answer you should extract: -1. The question type is a math question, so the final answer should be a number, set, vector, matrix, interval, expression, function, equation, or inequality and any combination of them. -2. If the final answer includes additional symbols, such as units, you should exclude them and only extract the pure final answer. -""" # noqa diff --git a/opencompass/utils/postprocessors/naive/README.md b/opencompass/utils/postprocessors/naive/README.md deleted file mode 100644 index dcc14a4b..00000000 --- a/opencompass/utils/postprocessors/naive/README.md +++ /dev/null @@ -1,71 +0,0 @@ -## Short Usage Introduction for Naive Model Postprocessor with Custom Model - - - -### Step 1: Deploy an API server using vLLM or LMDeploy - -```bash -lmdeploy serve api_server meta-llama/Meta-Llama-3-8B-Instruct --model-name llama3-8b-instruct --server-port 23333 --backend turbomind --tp 1 -``` - -### Step 2: Add Naive Model Postprocessor to the configuration file - -Take GSM8K as an example, you can add the following lines to the configuration file and replace the `api_url` with the correct address of the API server. - -```python -... -from opencompass.utils.model_postprocessors import navie_model_postprocess -from opencompass.utils.postprocessors.naive import MATH_NAVIE_PROMPT_TEMPLATE - -... - -gsm8k_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator, version='v2'), - pred_postprocessor=dict(type=math_postprocess_v2), - dataset_postprocessor=dict(type=gsm8k_dataset_postprocess), - # Add the following line to use the naive model postprocessor - model_postprocessor=dict( - type=navie_model_postprocess, - custom_instruction=MATH_NAVIE_PROMPT_TEMPLATE, - model_name='llama3-8b-instruct', - api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') - ) -... - -``` - -The prompt for extraction can also be customized by changing the `custom_instruction` parameter. Now support two default templates: `MATH_NAVIE_PROMPT_TEMPLATE` for math problems extraction like GSM8K and MATH, and `OPTION_NAVIE_PROMPT_TEMPLATE` for option problems extraction like MMLU. You can also write your own prompt template, like: - -```python -OPTION_NAVIE_PROMPT_TEMPLATE = """ -There is a detailed explanation of the final answer you should extract: -1. You should extract the final answer option like 'A', 'B', 'C', 'D' ... from the given output sentences. -2. 
The question is a single choice question, so the final answer option should be one of the options, not a combination of options. -""" -``` - -Your prompt should start with `There is a detailed explanation of the final answer you should extract:` and following with your customized instructions. - -### Step 3: Run the Evaluation as Usual - -Now you can run the evaluation as usual with the configuration file you modified. The evaluation will use the custom model as the post-process model to get the final result. The final result will be the `model_postprocess_accuracy` in the evaluation result, like: - -```Markdown -dataset version metric mode llama-3-8b-instruct-turbomind -------------------------------------------------- --------- -------------------------- ------ ------------------------------- -gsm8k a58960 accuracy gen 73.46 -gsm8k a58960 model_postprocess_accuracy gen 78.77 -``` - -## Experiment Results - -We have tested the model postprocess method with different models (Qwen2-72B-Chat, Llama3-8b-Chat) as post-process model on the GSM8K, MMLU datasets for `Meta-Llama-3-8B-Instruct` with above settings, and the results are as follows: - -```Markdown -| Dataset | Type | Config ID | Regex Postprocess Score | Model Postprocess Score (Llama3-8b-Instruct) | Model Postprocess Score (Qwen2-72B-Chat) | -| ------- | --------------- | ------------------------ | ----------------------- | ----------------------- |----------------------- | -| gsm8k | math | a58960 | 73.46 | 79.08 | 78.77 | -| mmlu | option | 4d595a | 67.89 | 65.26 | 67.94 | -``` - -The `metric` column with `model_postprocess_accuracy` is the final result after the `Naive Model Postprocessor` is applied. diff --git a/opencompass/utils/postprocessors/naive/__init__.py b/opencompass/utils/postprocessors/naive/__init__.py deleted file mode 100644 index 70a914d5..00000000 --- a/opencompass/utils/postprocessors/naive/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .extractor import * # noqa -from .PROMPT_TEMPLATE import * # noqa diff --git a/opencompass/utils/postprocessors/naive/extractor.py b/opencompass/utils/postprocessors/naive/extractor.py deleted file mode 100644 index c759094c..00000000 --- a/opencompass/utils/postprocessors/naive/extractor.py +++ /dev/null @@ -1,121 +0,0 @@ -# Naive model extractor for OpenCompass, modified from xFinder: https://github.com/IAAR-Shanghai/xFinder # noqa -import json -import time -from logging import getLogger - -from openai import OpenAI - -Meta_Instruction = """I will provide you with a question, output sentences along with an answer range. The output sentences are the response of the question provided. The answer range could either describe the type of answer expected or list all possible valid answers. Using the information provided, you must accurately and precisely determine and extract the intended key answer from the output sentences. Please don't have your subjective thoughts about the question. -First, you need to determine whether the content of the output sentences is relevant to the given question. If the entire output sentences are unrelated to the question (meaning the output sentences are not addressing the question), then output [No valid answer]. -Otherwise, ignore the parts of the output sentences that have no relevance to the question and then extract the key answer that matches the answer range. 
-Below are some special cases you need to be aware of: - (1) If the output sentences present multiple different answers, carefully determine if the later provided answer is a correction or modification of a previous one. If so, extract this corrected or modified answer as the final response. Conversely, if the output sentences fluctuate between multiple answers without a clear final answer, you should output [No valid answer]. - (2) If the answer range is a list and the key answer in the output sentences is not explicitly listed among the candidate options in the answer range, also output [No valid answer]. - (3) You should only return the precise answer you extract, without processing the answer. Please return only the answer and do not add any additional content. - -""" # noqa - - -def format_input_naive(data): - format_data = [] - for item in data: - template = {} - question = item['origin_prompt'][-1]['prompt'] - llm_output = item['prediction'] - correct_answer = item['reference'] if item['reference'] else item[ - 'gold'] - template['correct_answer'] = correct_answer - template['question'] = question - template['llm_output'] = llm_output - - format_data.append(template) - return format_data - - -class NaiveExtractor: - - def __init__( - self, - model_name, - model_path=None, - url=None, - temperature=0, - max_tokens=3000, - api_key='EMPTY', - SYSTEM='You are a help assistant tasked with extracting the precise key answer from given output sentences. You must only provide the extracted key answer without including any additional text.', # noqa - custom_instruction=''): - self.model_name = model_name - self.SYSTEM = SYSTEM - self.model_path = model_path - self.url = url - self.api_key = api_key - self.temperature = temperature - self.max_tokens = max_tokens - self.custom_instruction = custom_instruction - self.logger = getLogger(__name__) - - def prepare_input(self, item): - user_input = Meta_Instruction + self.custom_instruction + \ - "Question: \"\"\"" + item['question'] + "\"\"\"\n\n" + \ - "Output sentences: \"\"\"" + item['llm_output'] + "\"\"\"\n\n" + \ - 'Key extracted answer: ' - - return user_input - - def gen_output(self, query): - return self.openai_infer(query) - - def openai_infer(self, query: str, retry=9) -> str: - """Perform inference on the OpenAI model. - - Args: - query (str): The input query. - - Returns: - str: The extracted answer (xFinder's output). - """ - if isinstance(self.url, list): - # Randomly api for better load balancing - import random - self.url = random.choice(self.url) - self.client = OpenAI( - api_key=self.api_key, - base_url=self.url, - ) - self.retry = retry - - t = time.time() - retry = self.retry - response = '' - while retry > 0: - try: - chat_response = self.client.chat.completions.create( - model=self.client.models.list().data[0].id - if self.model_name == '' else self.model_name, - messages=[ - { - 'role': 'system', - 'content': self.SYSTEM - }, - { - 'role': 'user', - 'content': query - }, - ], - temperature=self.temperature, - max_tokens=self.max_tokens, - ) - js_response = json.loads(chat_response.model_dump_json()) - response = js_response['choices'][0]['message']['content'] - break - except Exception as e: - self.logger.info(f'Error: {e}') - self.logger.info(f'{self.url} is down. Retrying...') - self.logger.info(f'Time elapsed: {time.time() - t} seconds') - time.sleep(6) - retry -= 1 - if retry == 0: - response = 'Error: Failed to get response.' 
- self.logger.info(f'{response} after {self.retry} tries.') - raise ValueError('The api is down') - return response.strip() diff --git a/opencompass/utils/postprocessors/xfinder/README.md b/opencompass/utils/postprocessors/xfinder/README.md deleted file mode 100644 index 4f7b4ace..00000000 --- a/opencompass/utils/postprocessors/xfinder/README.md +++ /dev/null @@ -1,194 +0,0 @@ -## Extract Final Answers with Postprocess Models - -OpenCompass now support postprocess (extract) prediction answers with postprocess models, to get the true ability level of models. Now, we use [XFinder](https://github.com/IAAR-Shanghai/xFinder) as our first postprocess model to extract the final answers from the model outputs. - -We support four types of task types now: - -1. **math**: for math questions with numerical pr formula answers, like GSM8k, Math, etc. -2. **alphabet_option**: for alphabet option questions with alphabet answers, like CommonsenseQA, MMLU, etc. -3. **short_text**: for questions answer type is a short text with selected short text answers. - -Here are some examples of the question type supported from the official xFinder repo: - -```json -[ - { - "model_name": "Llama3_8B_Instruct", - "dataset": "MetaMathQA", - "key_answer_type": "math", - "question": "Find the smallest, positive four-digit multiple of $15.$", - "llm_output": "The smallest four-digit multiple of $15$ is $\\boxed{1005}.$", - "correct_answer": "1005", - "standard_answer_range": "a(n) number / set / vector / matrix / interval / expression / function / equation / inequality" - }, - { - "model_name": "LLaMA2_7B_Chat", - "dataset": "SIQA_text", - "key_answer_type": "short_text", - "question": "Jordan kept Carson up all night talking because Jordan's girlfriend left them and they were quite upset. How would Carson feel as a result? Answer Choices: drowsy / upset their girlfriend left / energetic", - "llm_output": "Based on the information provided in the question, Carson would likely feel upset because Jordan's girlfriend left them and they were quite upset. Therefore, the correct answer is (B) upset their girlfriend left.", - "correct_answer": "drowsy", - "standard_answer_range": [ - "drowsy", - "upset their girlfriend left", - "energetic" - ] - }, - { - "model_name": "PHI2", - "dataset": "CommonsenseQA_alpha", - "key_answer_type": "alphabet_option", - "question": "Why would you go to an office? Answer Choices: (A) work (B) school building (C) paper (D) city (E) habit", - "llm_output": "A) work (B) school building (C) paper (D) city (E) habit\n\nExplanation:\n We are looking for a reason why someone would go to an office. The only answer choice that matches this is option A) work. Therefore, the correct answer is A) work.", - "correct_answer": "A", - "standard_answer_range": [ - [ - "A", - "work" - ], - [ - "B", - "school building" - ], - [ - "C", - "paper" - ], - [ - "D", - "city" - ], - [ - "E", - "habit" - ] - ] - } -] -``` - -## How to Use Model Postprocess in OpenCompass - -### Step 1: Deploy the Postprocess Model Server - -For now, there are two xFinder models can use, you can download them from Huggingface model hub: - -1. **IAAR-Shanghai/xFinder-qwen1505** -2. 
**IAAR-Shanghai/xFinder-llama38it** - -You can use LMDeploy or vLLM to deploy the xFinder model server, for example, you can use the following command to deploy the xFinder model server with LMDeploy: - -```bash -lmdeploy serve api_server IAAR-Shanghai/xFinder-qwen1505 --model-name xFinder-qwen1505 --server-port 23333 --backend turbomind --tp 1 -``` - -### Step 2: Set the Postprocess Model Config in the Dataset Configuration - -We make the postprocess as a common postprocess function in OpenCompass, so you can use it by setting the `postprocess` parameter in the `predict` function of OpenCompass. It can be used with the default postprocess regularization extract function at the same time. The only thing you need to do is to deploy the postprocess model server and set the `model_postprocessor` to the original `eval_cfg` in the dataset configuration, like the following example: - -```python -from opencompass.utils.model_postprocessors import xfinder_postprocess - -... - - model_postprocessor=dict( - type=xfinder_postprocess, - question_type='math', - xfinder_model_name='xFinder-qwen1505', - xfiner_api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') -``` - -Explanation of the parameters: - -- `question_type`: the type of the question, which can be one of the three types mentioned above. -- `xfinder_model_name`: the name of the model you deploying the model server. -- `xfiner_api_url`: the URL of the model server, you can set multiple URLs with `,` to use multiple model servers, which can accelerate the postprocess speed. - -📢:**Please attention following points**: - -1. Now only support extract questions with Zero-shot setting. -2. For alphabet_option problems, the option should be like '\\nA. xxx\\nB. xxx\\nC. xxx\\nD. xxx\\nE. xxx\\n ...' or '\\n(A) xxx\\n(B) xxx\\n(C) xxx\\n(D) xxx\\n(E) xxx\\n ...' format, and the correct answer should be the alphabet of the correct answer, like 'A', 'B', 'C', 'D', 'E'. 
- -For more details about the xFinder model, you can refer to the [xFinder](https://github.com/IAAR-Shanghai/xFinder), and for a complete example, you can refer to the following example, which is the configuration of the GSM8K dataset with the xFinder postprocess model: - -```python -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import GSM8KDataset, gsm8k_dataset_postprocess, Gsm8kEvaluator -from opencompass.datasets import MATHEvaluator, math_postprocess_v2 -from opencompass.utils.model_postprocessors import xfinder_postprocess - -gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') - -gsm8k_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - round=[ - dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'), - ], - ), - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512), -) - -gsm8k_eval_cfg = dict( - evaluator=dict(type=MATHEvaluator, version='v2'), - pred_postprocessor=dict(type=math_postprocess_v2), - dataset_postprocessor=dict(type=gsm8k_dataset_postprocess), - model_postprocessor=dict( - type=xfinder_postprocess, - question_type='math', - xfinder_model_name='xFinder-qwen1505', - xfiner_api_url='http://0.0.0.0:23333/v1,http://0.0.0.0:23334/v1') - ) - -gsm8k_datasets = [ - dict( - abbr='gsm8k', - type=GSM8KDataset, - path='opencompass/gsm8k', - reader_cfg=gsm8k_reader_cfg, - infer_cfg=gsm8k_infer_cfg, - eval_cfg=gsm8k_eval_cfg, - ) -] -``` - -For evaluation results, `accuracy` is the result using default postprocess, and `model_postprocess_accuracy` is the result using xFinder postprocess, the gap can be wider when the model is not good answering the questions properly. - -You can also use the `--dump-eval-details` command to dump the detailed evaluation details to see the model postprocess results from the `results` folder. 
- -## Results Comparison with Different Question Types - -We have tested the model postprocess method with XFinder model on the GSM8K, MMLU, Natural Questions (NQ) datasets for `Meta-Llama-3-8B-Instruct` with above settings, and the results are as follows: - -| Dataset | Type | Config Name | Regex Postprocess Score | Model Postprocess Score | -| ------- | --------------- | ------------------------ | ----------------------- | ----------------------- | -| gsm8k | math | gsm8k_xfinder_gen_a58960 | 73.46 | 78.09 | -| nq | short_text | nq_xfinder_gen_3dcea1 | 22.33 | 37.53 | -| mmlu | alphabet_option | mmlu_xfinder_gen_4d595a | 67.89 | 67.93 | - -## Citation - -```bibtex -@misc{2023opencompass, - title={OpenCompass: A Universal Evaluation Platform for Foundation Models}, - author={OpenCompass Contributors}, - howpublished = {\url{https://github.com/open-compass/opencompass}}, - year={2023} -} - -@misc{yu2024xfinderrobustpinpointanswer, - title={xFinder: Robust and Pinpoint Answer Extraction for Large Language Models}, - author={Qingchen Yu and Zifan Zheng and Shichao Song and Zhiyu Li and Feiyu Xiong and Bo Tang and Ding Chen}, - year={2024}, - eprint={2405.11874}, - archivePrefix={arXiv}, - primaryClass={cs.CL}, - url={https://arxiv.org/abs/2405.11874}, -} - -``` diff --git a/opencompass/utils/postprocessors/xfinder/__init__.py b/opencompass/utils/postprocessors/xfinder/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/opencompass/utils/postprocessors/xfinder/extractor.py b/opencompass/utils/postprocessors/xfinder/extractor.py deleted file mode 100644 index de2abb08..00000000 --- a/opencompass/utils/postprocessors/xfinder/extractor.py +++ /dev/null @@ -1,175 +0,0 @@ -import json -import time -from logging import getLogger - -import requests -from openai import OpenAI - -from .xfinder_utils import PROMPT_TEMPLATE - -Instruction = """I will provide you with a question, output sentences along with an answer range. The output sentences are the response of the question provided. The answer range could either describe the type of answer expected or list all possible valid answers. Using the information provided, you must accurately and precisely determine and extract the intended key answer from the output sentences. Please don't have your subjective thoughts about the question. -First, you need to determine whether the content of the output sentences is relevant to the given question. If the entire output sentences are unrelated to the question (meaning the output sentences are not addressing the question), then output [No valid answer]. -Otherwise, ignore the parts of the output sentences that have no relevance to the question and then extract the key answer that matches the answer range. -Below are some special cases you need to be aware of: - (1) If the output sentences present multiple different answers, carefully determine if the later provided answer is a correction or modification of a previous one. If so, extract this corrected or modified answer as the final response. Conversely, if the output sentences fluctuate between multiple answers without a clear final answer, you should output [No valid answer]. - (2) If the answer range is a list and the key answer in the output sentences is not explicitly listed among the candidate options in the answer range, also output [No valid answer]. 
- -""" # noqa - - -class Extractor: - - def __init__( - self, - model_name, - model_path=None, - url=None, - temperature=0, - max_tokens=3000, - api_key='EMPTY', - SYSTEM='You are a help assistant tasked with extracting the precise key answer from given output sentences. You must only provide the extracted key answer without including any additional text.' # noqa - ): - self.model_name = model_name - self.PROMPT_TEMPLATE = PROMPT_TEMPLATE[model_name] - self.SYSTEM = SYSTEM - self.model_path = model_path - self.url = url - self.api_key = api_key - self.temperature = temperature - self.max_tokens = max_tokens - self.mode = 'API' if self.url is not None else 'Local' - self.logger = getLogger(__name__) - - if self.mode == 'Local': - from vllm import LLM, SamplingParams - self.sampling_params = SamplingParams(temperature=self.temperature, - max_tokens=self.max_tokens, - stop=[ - '<|endoftext|>', - '<|im_end|>', '', - '<||>', '', - '<|eot_id|>' - ]) - self.llm = LLM(model=self.model_path, gpu_memory_utilization=0.5) - - @staticmethod - def prepare_input(item): - user_input = Instruction + \ - "Question: \"\"\"" + item['question'] + "\"\"\"\n\n" + \ - "Output sentences: \"\"\"" + item['llm_output'] + "\"\"\"\n\n" + \ - 'Answer range: ' + item['standard_answer_range'] + '\n\n' + \ - 'Key extracted answer: ' - - return user_input - - def gen_output(self, query): - if self.mode == 'API': - # return self.send_request(query) - return self.openai_infer(query) - else: - return self.offline_infer(query) - - def send_request(self, query: str) -> str: - """Send a request to the model's API and return the response. - - Args: - query (str): The input query. - - Returns: - str: The extracted answer (xFinder's output). - """ - prompt = self.PROMPT_TEMPLATE.format(system=self.SYSTEM, input=query) - payload = json.dumps({ - 'prompt': - prompt, - 'temperature': - self.temperature, - 'max_tokens': - self.max_tokens, - 'stop': [ - '<|endoftext|>', '<|im_end|>', '', '<||>', - '', '<|eot_id|>' - ], - }) - headers = {'Content-Type': 'application/json'} - res = requests.request('POST', self.url, headers=headers, data=payload) - res = res.json()['text'][0] - res = res.replace(prompt, '') - # res = requests.post(self.url, json=payload) - # res = res.json()['text'] - res = res.strip() - return res - - def openai_infer(self, query: str, retry=9) -> str: - """Perform inference on the OpenAI model. - - Args: - query (str): The input query. - - Returns: - str: The extracted answer (xFinder's output). - """ - if isinstance(self.url, list): - # Randomly api for better load balancing - import random - self.url = random.choice(self.url) - self.client = OpenAI( - api_key=self.api_key, - base_url=self.url, - ) - self.retry = retry - - t = time.time() - retry = self.retry - response = '' - while retry > 0: - try: - chat_response = self.client.chat.completions.create( - model=self.client.models.list().data[0].id - if self.model_name == '' else self.model_name, - messages=[ - { - 'role': 'system', - 'content': self.SYSTEM - }, - { - 'role': 'user', - 'content': query - }, - ], - stop=[ - '<|endoftext|>', '<|im_end|>', '', '<||>', - '', '<|eot_id|>' - ], - temperature=self.temperature, - max_tokens=self.max_tokens, - ) - js_response = json.loads(chat_response.model_dump_json()) - response = js_response['choices'][0]['message']['content'] - break - except Exception as e: - self.logger.info(f'Error: {e}') - self.logger.info(f'{self.url} is down. 
Retrying...') - self.logger.info(f'Time elapsed: {time.time() - t} seconds') - time.sleep(6) - retry -= 1 - if retry == 0: - response = 'Error: Failed to get response.' - self.logger.info(f'{response} after {self.retry} tries.') - raise ValueError('The api is down') - return response.strip() - - def offline_infer(self, query: str) -> str: - """Perform inference on the local xFinder model. - - Args: - query (str): The input query. - - Returns: - str: The extracted answer (xFinder's output). - """ - prompt = self.PROMPT_TEMPLATE.format(system=self.SYSTEM, input=query) - res = self.llm.generate(prompt, self.sampling_params) - res = res[0] - res = res.outputs[0].text.strip() - return res diff --git a/opencompass/utils/postprocessors/xfinder/xfinder_utils/PROMPT_TEMPLATE.py b/opencompass/utils/postprocessors/xfinder/xfinder_utils/PROMPT_TEMPLATE.py deleted file mode 100644 index 3fb19fd2..00000000 --- a/opencompass/utils/postprocessors/xfinder/xfinder_utils/PROMPT_TEMPLATE.py +++ /dev/null @@ -1,14 +0,0 @@ -PROMPT_TEMPLATE = { - 'xFinder-qwen1505': - """<|System|>:{system} -<|User|>:{input} -<|Bot|>:""", - 'xFinder-llama38it': - """<|start_header_id|>system<|end_header_id|> - -{system}<|eot_id|><|start_header_id|>user<|end_header_id|> - -{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|> - -""", -} diff --git a/opencompass/utils/postprocessors/xfinder/xfinder_utils/__init__.py b/opencompass/utils/postprocessors/xfinder/xfinder_utils/__init__.py deleted file mode 100644 index b875d98d..00000000 --- a/opencompass/utils/postprocessors/xfinder/xfinder_utils/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .convert_data import * # noqa -from .data_process import * # noqa -from .PROMPT_TEMPLATE import * # noqa diff --git a/opencompass/utils/postprocessors/xfinder/xfinder_utils/convert_data.py b/opencompass/utils/postprocessors/xfinder/xfinder_utils/convert_data.py deleted file mode 100644 index ace88222..00000000 --- a/opencompass/utils/postprocessors/xfinder/xfinder_utils/convert_data.py +++ /dev/null @@ -1,123 +0,0 @@ -# Convert OpenCompass prediction data to XFinder format -import copy -import json -import re - -xfinder_template = { - 'math': { - 'model_name': - '', - 'dataset': - '', - 'key_answer_type': - 'math', - 'question': - '', - 'llm_output': - '', - 'correct_answer': - '', - 'standard_answer_range': - 'a(n) number / set / vector / matrix / interval / expression / function / equation / inequality' # noqa - }, - 'alphabet_option': { - 'model_name': '', - 'dataset': '', - 'key_answer_type': 'alphabet_option', - 'question': '', - 'llm_output': '.', - 'correct_answer': '', - 'standard_answer_range': [] - }, - 'categorical_label': { - 'model_name': '', - 'dataset': '', - 'key_answer_type': '', - 'question': '', - 'llm_output': '', - 'correct_answer': '', - 'standard_answer_range': [] - }, - 'short_text': { - 'model_name': '', - 'dataset': '', - 'key_answer_type': 'short_text', - 'question': '', - 'llm_output': '', - 'correct_answer': '', - 'standard_answer_range': [] - } -} - - -def parse_options(text: str): - lines = text.split('\n') - parsed_options = [] - option_pattern = r'^[A-Z]\)|[A-Z]\.|[A-Z]\)|[A-Z]:|\([A-Z]\)' - for line in lines: - line = line.strip() - match = re.match(option_pattern, line) - if match: - option = '' - # 等于第一个属于选项的字符 - for c in line: - if c.isalpha(): - option = c - break - content_start = match.end() + 1 - content = line[content_start:].strip() - parsed_options.append([option, content]) - - return parsed_options - - -def convert_to_xfinder_format(typ, 
data, model_name='', dataset_name=''): - assert typ in xfinder_template.keys(), f'Invalid type {typ}' - format_data = [] - for item in data: - template = copy.deepcopy(xfinder_template[typ]) - question = item['origin_prompt'][-1]['prompt'] - llm_output = item['prediction'] - correct_answer = item['reference'] if item['reference'] else item[ - 'gold'] - template['correct_answer'] = correct_answer - template['model_name'] = model_name - template['dataset'] = dataset_name - template['question'] = question - template['llm_output'] = llm_output - try: - assert typ in list(xfinder_template.keys()) - if typ == 'alphabet_option': - options = parse_options(question) - template['standard_answer_range'] = options - elif typ == 'short_text': - template['standard_answer_range'] = item['gold'] - elif typ == 'categorical_label': - pass - except Exception as e: - print(f'Error when parsing question options: {e}, skipping...') - continue - - format_data.append(template) - return format_data - - -if __name__ == '__main__': - # Test - example_data = { - 'origin_prompt': [{ - 'role': - 'HUMAN', - 'prompt': - 'Alice, Bob, Claire, Dave, and Eve are dancers at a square dance. At the start of a song, they each have a partner: Alice is dancing with Ophelia, Bob is dancing with Jamie, Claire is dancing with Melissa, Dave is dancing with Rodrigo, and Eve is dancing with Patrick.\nThroughout the song, the dancers often trade partners. First, Claire and Bob switch partners. Then, Claire and Eve switch partners. Then, Claire and Bob switch partners. Then, Eve and Dave switch partners. Finally, Claire and Alice switch partners. At the end of the dance, Alice is dancing with\nOptions:\n(A) Ophelia\n(B) Jamie\n(C) Melissa\n(D) Rodrigo\n(E) Patrick' # noqa - }], - 'origin_prediction': - '\n 答案: B) 前者小于后者', - 'prediction': - 'B', - 'reference': - 'A' - } - example_data = convert_to_xfinder_format('alphabet_option', [example_data], - 'GPT-3', 'OpenAI') - print(json.dumps(example_data, indent=4, ensure_ascii=False)) diff --git a/opencompass/utils/postprocessors/xfinder/xfinder_utils/data_process.py b/opencompass/utils/postprocessors/xfinder/xfinder_utils/data_process.py deleted file mode 100644 index 0cacd08b..00000000 --- a/opencompass/utils/postprocessors/xfinder/xfinder_utils/data_process.py +++ /dev/null @@ -1,24 +0,0 @@ -import ast - - -class DataProcessor: - - def __init__(self): - pass - - def read_data(self, data): - for item in data: - if isinstance(item['standard_answer_range'], - str) and item['key_answer_type'] != 'math': - try: - item['standard_answer_range'] = ast.literal_eval( - item['standard_answer_range']) - except Exception as e: - print(f'Error: {e}') - print('Please check the form of standard_answer_range') - exit(0) - - item['standard_answer_range'] = str(item['standard_answer_range']) - item['key_answer_type'] = str(item['key_answer_type']) - - return data
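The new `commonbench_postprocess` above expects each judge reply to embed its rating as `[[x]]` and then averages the extracted scores per capability. A minimal, self-contained sketch of that flow, with invented judge outputs and capability labels (unlike the patch code, unparsable replies are simply skipped here):

```python
# Sketch of the commonbench scoring flow: pull a "[[x]]" score out of each
# judge reply, then average the scores per capability and overall.
# Judge outputs and capability labels below are invented for illustration.
import re
from collections import defaultdict

def extract_score(judgement: str):
    """Return {'score': float} for the first [[x]] marker, else None."""
    matched = re.findall(r'\[\[([\d.]+)\]\]', judgement)
    return {'score': float(matched[0])} if matched else None

judged = [
    {'prediction': 'Fluent and correct. Rating: [[8.5]]'},
    {'prediction': 'Partially answers the question [[6]]'},
    {'prediction': 'No score given here'},  # skipped: no [[x]] marker
]
refs = [{'capability': 'reasoning'},
        {'capability': 'writing'},
        {'capability': 'reasoning'}]

ratings, counts = defaultdict(float), defaultdict(int)
for ans, ref in zip(judged, refs):
    parsed = extract_score(ans['prediction'])
    if parsed is None:
        continue
    for key in ('total', ref['capability']):
        ratings[key] += parsed['score']
        counts[key] += 1

averages = {k: round(v / counts[k], 2) for k, v in ratings.items()}
print(averages)  # e.g. {'total': 7.25, 'reasoning': 8.5, 'writing': 6.0}
```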
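The `BaseEvaluator` change adds a `references is not None` guard before the length comparison, since the refactored eval task passes `references=None` when the dataset has no output column; without the guard, `len(None)` would raise a `TypeError`. A small illustration of the guarded check, using hypothetical inputs:

```python
# Illustration of the guarded predictions/references length check.
def check_lengths(score_kwargs: dict):
    if ('predictions' in score_kwargs and 'references' in score_kwargs
            and score_kwargs['references'] is not None):
        if len(score_kwargs['predictions']) != len(score_kwargs['references']):
            raise ValueError(
                'predictions and references must have the same length')

check_lengths({'predictions': ['a', 'b'], 'references': ['a', 'b']})  # ok
check_lengths({'predictions': ['a', 'b'], 'references': None})        # ok with the guard
try:
    check_lengths({'predictions': ['a'], 'references': ['a', 'b']})
except ValueError as err:
    print(err)  # length mismatch is still reported
```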
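The refactor splits the old monolithic `OpenICLEvalTask._score` into explicit stages: load and preprocess the test set, load predictions, process them, evaluate, and save. The stand-in sketch below only mirrors that control flow; the classes, file formats, and metric are placeholders, not the real OpenCompass implementations.

```python
# Stand-in sketch of the staged shape the refactor gives _score().
import json
from pathlib import Path

class TinyEvalTask:
    def __init__(self, pred_file: str, out_file: str):
        self.pred_file, self.out_file = pred_file, out_file

    def score(self):
        test_set = self._load_and_preprocess_test_data()
        pred_strs = self._load_predictions()
        pred_strs = self._process_predictions(pred_strs)
        result = self._evaluate_predictions(pred_strs, test_set)
        self._save_results(result)

    def _load_and_preprocess_test_data(self):
        # Placeholder dataset: (question, reference) pairs.
        return [{'question': '1+1?', 'answer': '2'},
                {'question': 'Capital of France?', 'answer': 'Paris'}]

    def _load_predictions(self):
        # Mirrors the FileNotFoundError behaviour of the refactored loader.
        path = Path(self.pred_file)
        if not path.exists():
            raise FileNotFoundError(f'Prediction file not found: {path}')
        preds = json.loads(path.read_text())
        return [preds[str(i)]['prediction'] for i in range(len(preds))]

    def _process_predictions(self, pred_strs):
        # Stand-in for role extraction and text postprocessors.
        return [p.strip() for p in pred_strs]

    def _evaluate_predictions(self, pred_strs, test_set):
        refs = [item['answer'] for item in test_set]
        correct = sum(p == r for p, r in zip(pred_strs, refs))
        return {'accuracy': 100 * correct / len(refs)}

    def _save_results(self, result):
        Path(self.out_file).write_text(json.dumps(result, indent=2))
```

The real task swaps in the configured dataset, postprocessors, and evaluator at each stage; the point of the sketch is only the staged control flow that replaces the previous single method.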
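Before calling `evaluator.evaluate`, the task keeps only the keyword arguments that the evaluator's `score` method actually declares, by filtering against `inspect.signature`. A generic sketch of that pattern follows; the `score` function here is a made-up example, not an OpenCompass evaluator:

```python
# Generic sketch of the signature-filtering trick: pass a function only the
# kwargs it declares, silently dropping everything else.
from inspect import signature

def call_with_supported_kwargs(fn, **available):
    accepted = signature(fn).parameters
    return fn(**{k: v for k, v in available.items() if k in accepted})

def score(predictions, references):
    return sum(p == r for p, r in zip(predictions, references)) / len(references)

print(call_with_supported_kwargs(
    score,
    predictions=['a', 'b'], references=['a', 'c'],
    test_set=None, origin_prompt=None,  # dropped: score() does not take these
))  # 0.5
```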