Merge branch 'main' of github.com:open-compass/opencompass into tmp_olmpbench

This commit is contained in:
liushz 2025-02-25 09:16:11 +00:00
commit 4c9366d6fc
71 changed files with 2303 additions and 709 deletions

View File

@ -366,7 +366,7 @@ internlm2_5-7b-chat-turbomind:
openai_mmmlu_lite_DE-DE_accuracy: 51.27
openai_mmmlu_lite_ES-LA_accuracy: 56.94
openai_mmmlu_lite_FR-FR_accuracy: 58.22
openai_mmmlu_lite_HI-IN_accuracy: 33.75
openai_mmmlu_lite_HI-IN_accuracy: 30.75
openai_mmmlu_lite_ID-ID_accuracy: 50.6
openai_mmmlu_lite_IT-IT_accuracy: 50.6
openai_mmmlu_lite_JA-JP_accuracy: 51.13
@ -394,7 +394,7 @@ internlm2_5-7b-chat-turbomind:
CompassArena_naive_average: 34.61
FoFo_naive_average: 0.38
mtbench101_avg: 8.01
wildbench_average: -15.69
wildbench_average: -10.49
simpleqa_accuracy_given_attempted: 0.04
chinese_simpleqa_given_attempted_accuracy: 0.34
alignment_bench_v1_1_专业能力: 6.05
@ -414,7 +414,7 @@ internlm2_5-7b-chat-turbomind:
compassarena_knowledge_naive_average: 36
compassarena_reason_v2_naive_average: 35
compassarena_math_v2_naive_average: 19.91
compassarena_creationv2_zh_naive_average: 35.81
compassarena_creationv2_zh_naive_average: 43.64
fofo_test_prompts_overall: 0.35
fofo_test_prompts_cn_overall: 0.41
followbench_llmeval_en_HSR_AVG: 0.73

View File

@ -15,13 +15,13 @@ chat:
gsm8k_accuracy: 50
race-high_accuracy: 68.75
deepseek-7b-chat-vllm:
gsm8k_accuracy: 43.75
race-high_accuracy: 75
gsm8k_accuracy: 50
race-high_accuracy: 78.12
gemma2-2b-it-hf:
gsm8k_accuracy: 50
race-high_accuracy: 71.88
race-high_accuracy: 75
gemma2-9b-it-hf:
gsm8k_accuracy: 71.88
gsm8k_accuracy: 68.75
race-high_accuracy: 84.38
gemma-2b-it-hf:
gsm8k_accuracy: 3.12
@ -36,7 +36,7 @@ chat:
gsm8k_accuracy: 78.12
race-high_accuracy: 93.75
gemma-7b-it-vllm:
gsm8k_accuracy: 34.38
gsm8k_accuracy: 46.88
race-high_accuracy: 68.75
internlm2_5-7b-chat-hf:
gsm8k_accuracy: 84.38
@ -57,7 +57,7 @@ chat:
gsm8k_accuracy: 53.12
race-high_accuracy: 90.62
internlm2-chat-7b-vllm:
gsm8k_accuracy: 56.25
gsm8k_accuracy: 43.75
race-high_accuracy: 84.38
llama-3_1-8b-instruct-hf:
gsm8k_accuracy: 84.38
@ -90,13 +90,13 @@ chat:
gsm8k_accuracy: 75
race-high_accuracy: 81.25
mistral-nemo-instruct-2407-turbomind:
gsm8k_accuracy: 65.62
race-high_accuracy: 87.50
gsm8k_accuracy: 71.88
race-high_accuracy: 78.12
mistral-7b-instruct-v0.1-vllm:
gsm8k_accuracy: 34.38
race-high_accuracy: 68.75
mistral-7b-instruct-v0.2-vllm:
gsm8k_accuracy: 43.75
gsm8k_accuracy: 31.25
race-high_accuracy: 75
phi-3-mini-4k-instruct-hf:
gsm8k_accuracy: 81.25
@ -177,7 +177,7 @@ chat:
gsm8k_accuracy: 93.75
race-high_accuracy: 87.5
mixtral-8x7b-instruct-v0.1-hf:
gsm8k_accuracy: 56.25
gsm8k_accuracy: 59.38
race-high_accuracy: 81.25
mixtral-large-instruct-2411-turbomind:
gsm8k_accuracy: 90.62

View File

@ -17,7 +17,7 @@ on:
required: false
description: 'whether to build lmdeploy'
type: boolean
default: false
default: true
repo_org_lmdeploy:
required: false
description: 'Tested repository organization name. Default is internlm/lmdeploy'
@ -162,15 +162,16 @@ jobs:
pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
- name: Prepare - reinstall lmdeploy - cu12
if: ${{inputs.build_lmdeploy}}
if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
uses: actions/download-artifact@v4
with:
name: my-artifact-${{ github.run_id }}-py310
- name: Prepare - reinstall lmdeploy - cu12
if: ${{inputs.build_lmdeploy}}
if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
run: |
. ${{env.CONDA_PATH}}/bin/activate
conda activate ${{env.CONDA_ENV}}
pip uninstall -y lmdeploy
pip install lmdeploy-*.whl --no-deps
- name: conda env
run: |
@ -188,7 +189,7 @@ jobs:
regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
runs-on: volc_cu12_daily
environment: 'prod'
timeout-minutes: 120 #2hours
timeout-minutes: 180 #3hours
steps:
- name: Clone repository
uses: actions/checkout@v2
@ -275,7 +276,7 @@ jobs:
conda info --envs
lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
echo "restful_pid=$!" >> "$GITHUB_ENV"
sleep 120s
sleep 180s
opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
@ -334,7 +335,7 @@ jobs:
notify_to_feishu:
if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
if: ${{ always() && github.event_name == 'schedule' && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
timeout-minutes: 5
runs-on: self-hosted

264
README.md
View File

@ -58,9 +58,9 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks.
- **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](configs/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it.
- **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](configs/eval_musr.py) and give it a spin! 🔥🔥🔥
- **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](configs/eval_babilong.py) and give it a try! 🔥🔥🔥
- **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](examples/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it.
- **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](examples/eval_musr.py) and give it a spin! 🔥🔥🔥
- **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](examples/eval_babilong.py) and give it a try! 🔥🔥🔥
- **\[2024.10.14\]** We now support the OpenAI multilingual QA dataset [MMMLU](https://huggingface.co/datasets/openai/MMMLU). Feel free to give it a try! 🔥🔥🔥
- **\[2024.09.19\]** We now support [Qwen2.5](https://huggingface.co/Qwen) (0.5B to 72B) with multiple backends (huggingface/vllm/lmdeploy). Feel free to give them a try! 🔥🔥🔥
- **\[2024.09.17\]** We now support OpenAI o1 (`o1-mini-2024-09-12` and `o1-preview-2024-09-12`). Feel free to give them a try! 🔥🔥🔥
@ -279,263 +279,13 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide
## 📖 Dataset Support
<table align="center">
<tbody>
<tr align="center" valign="bottom">
<td>
<b>Language</b>
</td>
<td>
<b>Knowledge</b>
</td>
<td>
<b>Reasoning</b>
</td>
<td>
<b>Examination</b>
</td>
</tr>
<tr valign="top">
<td>
<details open>
<summary><b>Word Definition</b></summary>
We now provide a statistics list of all datasets supported on this platform in the documentation on the OpenCompass website.
- WiC
- SummEdits
You can quickly find the dataset you need by sorting, filtering, and searching the list.
</details>
Please refer to the dataset statistics chapter of the [official documentation](https://opencompass.org.cn/doc) for details.
<details open>
<summary><b>Idiom Learning</b></summary>
- CHID
</details>
<details open>
<summary><b>Semantic Similarity</b></summary>
- AFQMC
- BUSTM
</details>
<details open>
<summary><b>Coreference Resolution</b></summary>
- CLUEWSC
- WSC
- WinoGrande
</details>
<details open>
<summary><b>Translation</b></summary>
- Flores
- IWSLT2017
</details>
<details open>
<summary><b>Multi-language Question Answering</b></summary>
- TyDi-QA
- XCOPA
</details>
<details open>
<summary><b>Multi-language Summary</b></summary>
- XLSum
</details>
</td>
<td>
<details open>
<summary><b>Knowledge Question Answering</b></summary>
- BoolQ
- CommonSenseQA
- NaturalQuestions
- TriviaQA
</details>
</td>
<td>
<details open>
<summary><b>Textual Entailment</b></summary>
- CMNLI
- OCNLI
- OCNLI_FC
- AX-b
- AX-g
- CB
- RTE
- ANLI
</details>
<details open>
<summary><b>Commonsense Reasoning</b></summary>
- StoryCloze
- COPA
- ReCoRD
- HellaSwag
- PIQA
- SIQA
</details>
<details open>
<summary><b>Mathematical Reasoning</b></summary>
- MATH
- GSM8K
</details>
<details open>
<summary><b>Theorem Application</b></summary>
- TheoremQA
- StrategyQA
- SciBench
</details>
<details open>
<summary><b>Comprehensive Reasoning</b></summary>
- BBH
</details>
</td>
<td>
<details open>
<summary><b>Junior High, High School, University, Professional Examinations</b></summary>
- C-Eval
- AGIEval
- MMLU
- GAOKAO-Bench
- CMMLU
- ARC
- Xiezhi
</details>
<details open>
<summary><b>Medical Examinations</b></summary>
- CMB
</details>
</td>
</tr>
</td>
</tr>
</tbody>
<tbody>
<tr align="center" valign="bottom">
<td>
<b>Understanding</b>
</td>
<td>
<b>Long Context</b>
</td>
<td>
<b>Safety</b>
</td>
<td>
<b>Code</b>
</td>
</tr>
<tr valign="top">
<td>
<details open>
<summary><b>Reading Comprehension</b></summary>
- C3
- CMRC
- DRCD
- MultiRC
- RACE
- DROP
- OpenBookQA
- SQuAD2.0
</details>
<details open>
<summary><b>Content Summary</b></summary>
- CSL
- LCSTS
- XSum
- SummScreen
</details>
<details open>
<summary><b>Content Analysis</b></summary>
- EPRSTMT
- LAMBADA
- TNEWS
</details>
</td>
<td>
<details open>
<summary><b>Long Context Understanding</b></summary>
- LEval
- LongBench
- GovReports
- NarrativeQA
- Qasper
</details>
</td>
<td>
<details open>
<summary><b>Safety</b></summary>
- CivilComments
- CrowsPairs
- CValues
- JigsawMultilingual
- TruthfulQA
</details>
<details open>
<summary><b>Robustness</b></summary>
- AdvGLUE
</details>
</td>
<td>
<details open>
<summary><b>Code</b></summary>
- HumanEval
- HumanEvalX
- MBPP
- APPs
- DS1000
</details>
</td>
</tr>
</td>
</tr>
</tbody>
</table>
<p align="right"><a href="#top">🔝Back to top</a></p>
## 📖 Model Support

View File

@ -274,263 +274,11 @@ OpenCompass 是面向大模型评测的一站式平台。其主要特点如下
## 📖 数据集支持
<table align="center">
<tbody>
<tr align="center" valign="bottom">
<td>
<b>语言</b>
</td>
<td>
<b>知识</b>
</td>
<td>
<b>推理</b>
</td>
<td>
<b>考试</b>
</td>
</tr>
<tr valign="top">
<td>
<details open>
<summary><b>字词释义</b></summary>
我们已经在OpenCompass官网的文档中支持了所有可在本平台上使用的数据集的统计列表。
- WiC
- SummEdits
您可以通过排序、筛选和搜索等功能从列表中快速找到您需要的数据集。
</details>
<details open>
<summary><b>成语习语</b></summary>
- CHID
</details>
<details open>
<summary><b>语义相似度</b></summary>
- AFQMC
- BUSTM
</details>
<details open>
<summary><b>指代消解</b></summary>
- CLUEWSC
- WSC
- WinoGrande
</details>
<details open>
<summary><b>翻译</b></summary>
- Flores
- IWSLT2017
</details>
<details open>
<summary><b>多语种问答</b></summary>
- TyDi-QA
- XCOPA
</details>
<details open>
<summary><b>多语种总结</b></summary>
- XLSum
</details>
</td>
<td>
<details open>
<summary><b>知识问答</b></summary>
- BoolQ
- CommonSenseQA
- NaturalQuestions
- TriviaQA
</details>
</td>
<td>
<details open>
<summary><b>文本蕴含</b></summary>
- CMNLI
- OCNLI
- OCNLI_FC
- AX-b
- AX-g
- CB
- RTE
- ANLI
</details>
<details open>
<summary><b>常识推理</b></summary>
- StoryCloze
- COPA
- ReCoRD
- HellaSwag
- PIQA
- SIQA
</details>
<details open>
<summary><b>数学推理</b></summary>
- MATH
- GSM8K
</details>
<details open>
<summary><b>定理应用</b></summary>
- TheoremQA
- StrategyQA
- SciBench
</details>
<details open>
<summary><b>综合推理</b></summary>
- BBH
</details>
</td>
<td>
<details open>
<summary><b>初中/高中/大学/职业考试</b></summary>
- C-Eval
- AGIEval
- MMLU
- GAOKAO-Bench
- CMMLU
- ARC
- Xiezhi
</details>
<details open>
<summary><b>医学考试</b></summary>
- CMB
</details>
</td>
</tr>
</td>
</tr>
</tbody>
<tbody>
<tr align="center" valign="bottom">
<td>
<b>理解</b>
</td>
<td>
<b>长文本</b>
</td>
<td>
<b>安全</b>
</td>
<td>
<b>代码</b>
</td>
</tr>
<tr valign="top">
<td>
<details open>
<summary><b>阅读理解</b></summary>
- C3
- CMRC
- DRCD
- MultiRC
- RACE
- DROP
- OpenBookQA
- SQuAD2.0
</details>
<details open>
<summary><b>内容总结</b></summary>
- CSL
- LCSTS
- XSum
- SummScreen
</details>
<details open>
<summary><b>内容分析</b></summary>
- EPRSTMT
- LAMBADA
- TNEWS
</details>
</td>
<td>
<details open>
<summary><b>长文本理解</b></summary>
- LEval
- LongBench
- GovReports
- NarrativeQA
- Qasper
</details>
</td>
<td>
<details open>
<summary><b>安全</b></summary>
- CivilComments
- CrowsPairs
- CValues
- JigsawMultilingual
- TruthfulQA
</details>
<details open>
<summary><b>健壮性</b></summary>
- AdvGLUE
</details>
</td>
<td>
<details open>
<summary><b>代码</b></summary>
- HumanEval
- HumanEvalX
- MBPP
- APPs
- DS1000
</details>
</td>
</tr>
</td>
</tr>
</tbody>
</table>
详情请参阅 [官方文档](https://opencompass.org.cn/doc) 的数据集统计章节。
<p align="right"><a href="#top">🔝返回顶部</a></p>

734
dataset-index.yml Normal file
View File

@ -0,0 +1,734 @@
- ifeval:
name: IFEval
category: Instruction Following
paper: https://arxiv.org/pdf/2311.07911
configpath: opencompass/configs/datasets/IFEval
- nphard:
name: NPHardEval
category: Reasoning
paper: https://arxiv.org/pdf/2312.14890v2
configpath: opencompass/configs/datasets/NPHardEval
- pmmeval:
name: PMMEval
category: Language
paper: https://arxiv.org/pdf/2411.09116v1
configpath: opencompass/configs/datasets/PMMEval
- theoremqa:
    name: TheoremQA
category: Reasoning
paper: https://arxiv.org/pdf/2305.12524
    configpath: opencompass/configs/datasets/TheoremQA
- agieval:
name: AGIEval
category: Examination
paper: https://arxiv.org/pdf/2304.06364
configpath: opencompass/configs/datasets/agieval
- babilong:
name: BABILong
category: Long Context
paper: https://arxiv.org/pdf/2406.10149
configpath: opencompass/configs/datasets/babilong
- bigcodebench:
name: BigCodeBench
category: Code
paper: https://arxiv.org/pdf/2406.15877
configpath: opencompass/configs/datasets/bigcodebench
- calm:
name: CaLM
category: Reasoning
paper: https://arxiv.org/pdf/2405.00622
configpath: opencompass/configs/datasets/calm
- infinitebench:
name: InfiniteBench (∞Bench)
category: Long Context
paper: https://aclanthology.org/2024.acl-long.814.pdf
configpath: opencompass/configs/datasets/infinitebench
- korbench:
name: KOR-Bench
category: Reasoning
paper: https://arxiv.org/pdf/2410.06526v1
configpath: opencompass/configs/datasets/korbench
- lawbench:
name: LawBench
category: Knowledge / Law
paper: https://arxiv.org/pdf/2309.16289
configpath: opencompass/configs/datasets/lawbench
- leval:
name: L-Eval
category: Long Context
paper: https://arxiv.org/pdf/2307.11088v1
configpath: opencompass/configs/datasets/leval
- livecodebench:
name: LiveCodeBench
category: Code
paper: https://arxiv.org/pdf/2403.07974
configpath: opencompass/configs/datasets/livecodebench
- livemathbench:
name: LiveMathBench
category: Math
paper: https://arxiv.org/pdf/2412.13147
configpath: opencompass/configs/datasets/livemathbench
- longbench:
name: LongBench
category: Long Context
paper: https://github.com/THUDM/LongBench
    configpath: opencompass/configs/datasets/longbench
- lveval:
name: LV-Eval
category: Long Context
paper: https://arxiv.org/pdf/2402.05136
configpath: opencompass/configs/datasets/lveval
- medbench:
name: MedBench
category: Knowledge / Medicine
paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
configpath: opencompass/configs/datasets/MedBench
- musr:
name: MuSR
category: Reasoning
paper: https://arxiv.org/pdf/2310.16049
configpath: opencompass/configs/datasets/musr
- needlebench:
name: NeedleBench
category: Long Context
paper: https://arxiv.org/pdf/2407.11963
configpath: opencompass/configs/datasets/needlebench
- ruler:
name: RULER
category: Long Context
paper: https://arxiv.org/pdf/2404.06654
configpath: opencompass/configs/datasets/ruler
- alignment:
name: AlignBench
category: Subjective / Alignment
paper: https://arxiv.org/pdf/2311.18743
configpath: opencompass/configs/datasets/subjective/alignbench
- alpaca:
name: AlpacaEval
category: Subjective / Instruction Following
paper: https://github.com/tatsu-lab/alpaca_eval
    configpath: opencompass/configs/datasets/subjective/alpaca_eval
- arenahard:
name: Arena-Hard
category: Subjective / Chatbot
paper: https://lmsys.org/blog/2024-04-19-arena-hard/
configpath: opencompass/configs/datasets/subjective/arena_hard
- flames:
name: FLAMES
category: Subjective / Alignment
paper: https://arxiv.org/pdf/2311.06899
configpath: opencompass/configs/datasets/subjective/flames
- fofo:
name: FOFO
category: Subjective / Format Following
paper: https://arxiv.org/pdf/2402.18667
configpath: opencompass/configs/datasets/subjective/fofo
- followbench:
name: FollowBench
category: Subjective / Instruction Following
paper: https://arxiv.org/pdf/2310.20410
configpath: opencompass/configs/datasets/subjective/followbench
- hellobench:
name: HelloBench
category: Subjective / Long Context
paper: https://arxiv.org/pdf/2409.16191
configpath: opencompass/configs/datasets/subjective/hellobench
- judgerbench:
name: JudgerBench
category: Subjective / Long Context
paper: https://arxiv.org/pdf/2410.16256
configpath: opencompass/configs/datasets/subjective/judgerbench
- multiround:
name: MT-Bench-101
category: Subjective / Multi-Round
paper: https://arxiv.org/pdf/2402.14762
configpath: opencompass/configs/datasets/subjective/multiround
- wildbench:
name: WildBench
category: Subjective / Real Task
paper: https://arxiv.org/pdf/2406.04770
configpath: opencompass/configs/datasets/subjective/wildbench
- teval:
name: T-Eval
category: Tool Utilization
paper: https://arxiv.org/pdf/2312.14033
configpath: opencompass/configs/datasets/teval
- financeiq:
name: FinanceIQ
category: Knowledge / Finance
paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ
configpath: opencompass/configs/datasets/FinanceIQ
- gaokaobench:
name: GAOKAOBench
category: Examination
paper: https://arxiv.org/pdf/2305.12474
configpath: opencompass/configs/datasets/GaokaoBench
- lcbench:
name: LCBench
category: Code
paper: https://github.com/open-compass/CodeBench/
configpath: opencompass/configs/datasets/LCBench
- MMLUArabic:
name: ArabicMMLU
category: Language
paper: https://arxiv.org/pdf/2402.12840
configpath: opencompass/configs/datasets/MMLUArabic
- OpenFinData:
name: OpenFinData
category: Knowledge / Finance
paper: https://github.com/open-compass/OpenFinData
configpath: opencompass/configs/datasets/OpenFinData
- QuALITY:
name: QuALITY
category: Long Context
paper: https://arxiv.org/pdf/2112.08608
configpath: opencompass/configs/datasets/QuALITY
- advglue:
name: Adversarial GLUE
category: Safety
paper: https://openreview.net/pdf?id=GF9cSKI3A_q
configpath: opencompass/configs/datasets/adv_glue
- afqmcd:
name: CLUE / AFQMC
category: Language
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_afqmc
- aime2024:
name: AIME2024
category: Examination
paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024
configpath: opencompass/configs/datasets/aime2024
- anli:
name: Adversarial NLI
category: Reasoning
paper: https://arxiv.org/pdf/1910.14599v2
configpath: opencompass/configs/datasets/anli
- anthropics_evals:
name: Anthropics Evals
category: Safety
paper: https://arxiv.org/pdf/2212.09251
configpath: opencompass/configs/datasets/anthropics_evals
- apps:
name: APPS
category: Code
paper: https://arxiv.org/pdf/2105.09938
configpath: opencompass/configs/datasets/apps
- arc:
name: ARC
category: Reasoning
paper: https://arxiv.org/pdf/1803.05457
configpath: [opencompass/configs/datasets/ARC_c, opencompass/configs/datasets/ARC_e]
- arc_prize_public_eval:
name: ARC Prize
category: ARC-AGI
paper: https://arcprize.org/guide#private
configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation
- ax:
name: SuperGLUE / AX
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: [opencompass/configs/datasets/SuperGLUE_AX_b, opencompass/configs/datasets/SuperGLUE_AX_g]
- bbh:
name: BIG-Bench Hard
category: Reasoning
paper: https://arxiv.org/pdf/2210.09261
configpath: opencompass/configs/datasets/bbh
- BoolQ:
name: SuperGLUE / BoolQ
category: Knowledge
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_BoolQ
- c3:
name: CLUE / C3 (C³)
category: Understanding
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_C3
- cb:
name: SuperGLUE / CB
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_CB
- ceval:
name: C-EVAL
category: Examination
paper: https://arxiv.org/pdf/2305.08322v1
configpath: opencompass/configs/datasets/ceval
- charm:
name: CHARM
category: Reasoning
paper: https://arxiv.org/pdf/2403.14112
configpath: opencompass/configs/datasets/CHARM
- chembench:
name: ChemBench
category: Knowledge / Chemistry
paper: https://arxiv.org/pdf/2404.01475
configpath: opencompass/configs/datasets/ChemBench
- chid:
name: FewCLUE / CHID
category: Language
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_chid
- chinese_simpleqa:
name: Chinese SimpleQA
category: Knowledge
paper: https://arxiv.org/pdf/2411.07140
configpath: opencompass/configs/datasets/chinese_simpleqa
- cibench:
name: CIBench
category: Code
paper: https://www.arxiv.org/pdf/2407.10499
configpath: opencompass/configs/datasets/CIBench
- civilcomments:
name: CivilComments
category: Safety
paper: https://arxiv.org/pdf/1903.04561
configpath: opencompass/configs/datasets/civilcomments
- clozeTest_maxmin:
name: Cloze Test-max/min
category: Code
paper: https://arxiv.org/pdf/2102.04664
configpath: opencompass/configs/datasets/clozeTest_maxmin
- cluewsc:
name: FewCLUE / CLUEWSC
category: Language / WSC
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_cluewsc
- cmb:
name: CMB
category: Knowledge / Medicine
paper: https://arxiv.org/pdf/2308.08833
configpath: opencompass/configs/datasets/cmb
- cmmlu:
name: CMMLU
category: Understanding
paper: https://arxiv.org/pdf/2306.09212
configpath: opencompass/configs/datasets/cmmlu
- cmnli:
name: CLUE / CMNLI
category: Reasoning
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_cmnli
- cmo_fib:
name: cmo_fib
category: Examination
paper: ""
configpath: opencompass/configs/datasets/cmo_fib
- cmrc:
name: CLUE / CMRC
category: Understanding
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_CMRC
- commonsenseqa:
name: CommonSenseQA
category: Knowledge
paper: https://arxiv.org/pdf/1811.00937v2
configpath: opencompass/configs/datasets/commonsenseqa
- commonsenseqa_cn:
name: CommonSenseQA-CN
category: Knowledge
paper: ""
configpath: opencompass/configs/datasets/commonsenseqa_cn
- copa:
name: SuperGLUE / COPA
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_COPA
- crowspairs:
name: CrowsPairs
category: Safety
paper: https://arxiv.org/pdf/2010.00133
configpath: opencompass/configs/datasets/crowspairs
- crowspairs_cn:
name: CrowsPairs-CN
category: Safety
paper: ""
configpath: opencompass/configs/datasets/crowspairs_cn
- cvalues:
name: CVALUES
category: Safety
paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf
configpath: opencompass/configs/datasets/cvalues
- drcd:
name: CLUE / DRCD
category: Understanding
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_DRCD
- drop:
name: DROP (DROP Simple Eval)
category: Understanding
paper: https://arxiv.org/pdf/1903.00161
configpath: opencompass/configs/datasets/drop
- ds1000:
name: DS-1000
category: Code
paper: https://arxiv.org/pdf/2211.11501
configpath: opencompass/configs/datasets/ds1000
- eprstmt:
name: FewCLUE / EPRSTMT
category: Understanding
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_eprstmt
- flores:
name: Flores
category: Language
paper: https://aclanthology.org/D19-1632.pdf
configpath: opencompass/configs/datasets/flores
- game24:
name: Game24
category: Math
paper: https://huggingface.co/datasets/nlile/24-game
configpath: opencompass/configs/datasets/game24
- govrepcrs:
name: Government Report Dataset
category: Long Context
paper: https://aclanthology.org/2021.naacl-main.112.pdf
configpath: opencompass/configs/datasets/govrepcrs
- gpqa:
name: GPQA
category: Knowledge
paper: https://arxiv.org/pdf/2311.12022v1
configpath: opencompass/configs/datasets/gpqa
- gsm8k:
name: GSM8K
category: Math
paper: https://arxiv.org/pdf/2110.14168v2
configpath: opencompass/configs/datasets/gsm8k
- gsm_hard:
name: GSM-Hard
category: Math
paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf
configpath: opencompass/configs/datasets/gsm_hard
- hellaswag:
name: HellaSwag
category: Reasoning
paper: https://arxiv.org/pdf/1905.07830
configpath: opencompass/configs/datasets/hellaswag
- humaneval:
name: HumanEval
category: Code
paper: https://arxiv.org/pdf/2107.03374v2
configpath: opencompass/configs/datasets/humaneval
- humaneval_cn:
name: HumanEval-CN
category: Code
paper: ""
configpath: opencompass/configs/datasets/humaneval_cn
- humaneval_multi:
name: Multi-HumanEval
category: Code
paper: https://arxiv.org/pdf/2210.14868
configpath: opencompass/configs/datasets/humaneval_multi
- humanevalx:
name: HumanEval-X
category: Code
paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
configpath: opencompass/configs/datasets/humanevalx
- hungarian_math:
name: Hungarian_Math
category: Math
paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam
configpath: opencompass/configs/datasets/hungarian_exam
- iwslt2017:
name: IWSLT2017
category: Language
paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf
configpath: opencompass/configs/datasets/iwslt2017
- jigsawmultilingual:
name: JigsawMultilingual
category: Safety
paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data
configpath: opencompass/configs/datasets/jigsawmultilingual
- lambada:
name: LAMBADA
category: Understanding
paper: https://arxiv.org/pdf/1606.06031
configpath: opencompass/configs/datasets/lambada
- lcsts:
name: LCSTS
category: Understanding
paper: https://aclanthology.org/D15-1229.pdf
configpath: opencompass/configs/datasets/lcsts
- livestembench:
name: LiveStemBench
category: ""
paper: ""
configpath: opencompass/configs/datasets/livestembench
- llm_compression:
name: LLM Compression
category: Bits Per Character (BPC)
paper: https://arxiv.org/pdf/2404.09937
configpath: opencompass/configs/datasets/llm_compression
- math:
name: MATH
category: Math
paper: https://arxiv.org/pdf/2103.03874
configpath: opencompass/configs/datasets/math
- math401:
name: MATH 401
category: Math
paper: https://arxiv.org/pdf/2304.02015
configpath: opencompass/configs/datasets/math401
- mathbench:
name: MathBench
category: Math
paper: https://arxiv.org/pdf/2405.12209
configpath: opencompass/configs/datasets/mathbench
- mbpp:
name: MBPP
category: Code
paper: https://arxiv.org/pdf/2108.07732
configpath: opencompass/configs/datasets/mbpp
- mbpp_cn:
name: MBPP-CN
category: Code
paper: ""
configpath: opencompass/configs/datasets/mbpp_cn
- mbpp_plus:
name: MBPP-PLUS
category: Code
paper: ""
configpath: opencompass/configs/datasets/mbpp_plus
- mgsm:
name: MGSM
category: Language / Math
paper: https://arxiv.org/pdf/2210.03057
configpath: opencompass/configs/datasets/mgsm
- mmlu:
name: MMLU
category: Understanding
paper: https://arxiv.org/pdf/2009.03300
configpath: opencompass/configs/datasets/mmlu
- mmlu_cf:
name: MMLU-CF
category: Understanding
paper: https://arxiv.org/pdf/2412.15194
configpath: opencompass/configs/datasets/mmlu_cf
- mmlu_pro:
name: MMLU-Pro
category: Understanding
paper: https://arxiv.org/pdf/2406.01574
configpath: opencompass/configs/datasets/mmlu_pro
- mmmlu:
name: MMMLU
category: Language / Understanding
paper: https://huggingface.co/datasets/openai/MMMLU
configpath: opencompass/configs/datasets/mmmlu
- multirc:
name: SuperGLUE / MultiRC
category: Understanding
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_MultiRC
- narrativeqa:
name: NarrativeQA
category: Understanding
paper: https://github.com/google-deepmind/narrativeqa
configpath: opencompass/configs/datasets/narrativeqa
- natural_question:
name: NaturalQuestions
category: Knowledge
paper: https://github.com/google-research-datasets/natural-questions
configpath: opencompass/configs/datasets/nq
- natural_question_cn:
name: NaturalQuestions-CN
category: Knowledge
paper: ""
configpath: opencompass/configs/datasets/nq_cn
- obqa:
name: OpenBookQA
category: Knowledge
paper: https://arxiv.org/pdf/1809.02789v1
configpath: opencompass/configs/datasets/obqa
- piqa:
    name: PIQA
category: Knowledge / Physics
paper: https://arxiv.org/pdf/1911.11641v1
configpath: opencompass/configs/datasets/piqa
- py150:
name: py150
category: Code
paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line
configpath: opencompass/configs/datasets/py150
- qasper:
name: Qasper
category: Long Context
paper: https://arxiv.org/pdf/2105.03011
configpath: opencompass/configs/datasets/qasper
- qaspercut:
name: Qasper-Cut
category: Long Context
paper: ""
configpath: opencompass/configs/datasets/qaspercut
- race:
name: RACE
category: Examination
paper: https://arxiv.org/pdf/1704.04683
configpath: opencompass/configs/datasets/race
- realtoxicprompts:
    name: RealToxicityPrompts
category: Safety
paper: https://arxiv.org/pdf/2009.11462
configpath: opencompass/configs/datasets/realtoxicprompts
- record:
name: SuperGLUE / ReCoRD
category: Understanding
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD
- rte:
name: SuperGLUE / RTE
category: Reasoning
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_RTE
- ocnli:
name: CLUE / OCNLI
category: Reasoning
paper: https://arxiv.org/pdf/2004.05986
configpath: opencompass/configs/datasets/CLUE_ocnli
- rolebench:
name: RoleBench
category: Role Play
paper: https://arxiv.org/pdf/2310.00746
configpath: opencompass/configs/datasets/rolebench
- s3eval:
name: S3Eval
category: Long Context
paper: https://aclanthology.org/2024.naacl-long.69.pdf
configpath: opencompass/configs/datasets/s3eval
- scibench:
name: SciBench
category: Reasoning
paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf
configpath: opencompass/configs/datasets/scibench
- scicode:
name: SciCode
category: Code
paper: https://arxiv.org/pdf/2407.13168
configpath: opencompass/configs/datasets/scicode
- simpleqa:
name: SimpleQA
category: Knowledge
paper: https://arxiv.org/pdf/2411.04368
configpath: opencompass/configs/datasets/SimpleQA
- siqa:
name: SocialIQA
category: Reasoning
paper: https://arxiv.org/pdf/1904.09728
configpath: opencompass/configs/datasets/siqa
- squad20:
name: SQuAD2.0
category: Understanding
paper: https://arxiv.org/pdf/1806.03822
configpath: opencompass/configs/datasets/squad20
- storycloze:
name: StoryCloze
category: Reasoning
paper: https://aclanthology.org/2022.emnlp-main.616.pdf
configpath: opencompass/configs/datasets/storycloze
- strategyqa:
name: StrategyQA
category: Reasoning
paper: https://arxiv.org/pdf/2101.02235
configpath: opencompass/configs/datasets/strategyqa
- summedits:
name: SummEdits
category: Language
paper: https://aclanthology.org/2023.emnlp-main.600.pdf
configpath: opencompass/configs/datasets/summedits
- summscreen:
name: SummScreen
category: Understanding
paper: https://arxiv.org/pdf/2104.07091v1
configpath: opencompass/configs/datasets/summscreen
- svamp:
name: SVAMP
category: Math
paper: https://aclanthology.org/2021.naacl-main.168.pdf
configpath: opencompass/configs/datasets/SVAMP
- tabmwp:
name: TabMWP
category: Math / Table
paper: https://arxiv.org/pdf/2209.14610
configpath: opencompass/configs/datasets/TabMWP
- taco:
name: TACO
category: Code
paper: https://arxiv.org/pdf/2312.14852
configpath: opencompass/configs/datasets/taco
- tnews:
name: FewCLUE / TNEWS
category: Understanding
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_tnews
- bustm:
name: FewCLUE / BUSTM
category: Reasoning
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_bustm
- csl:
name: FewCLUE / CSL
category: Understanding
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_csl
- ocnli_fc:
name: FewCLUE / OCNLI-FC
category: Reasoning
paper: https://arxiv.org/pdf/2107.07498
configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc
- triviaqa:
name: TriviaQA
category: Knowledge
paper: https://arxiv.org/pdf/1705.03551v2
configpath: opencompass/configs/datasets/triviaqa
- triviaqarc:
name: TriviaQA-RC
category: Knowledge / Understanding
paper: ""
configpath: opencompass/configs/datasets/triviaqarc
- truthfulqa:
name: TruthfulQA
category: Safety
paper: https://arxiv.org/pdf/2109.07958v2
configpath: opencompass/configs/datasets/truthfulqa
- tydiqa:
name: TyDi-QA
category: Language
paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf
configpath: opencompass/configs/datasets/tydiqa
- wic:
name: SuperGLUE / WiC
category: Language
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_WiC
- wsc:
name: SuperGLUE / WSC
category: Language / WSC
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
configpath: opencompass/configs/datasets/SuperGLUE_WSC
- winogrande:
name: WinoGrande
category: Language / WSC
paper: https://arxiv.org/pdf/1907.10641v2
configpath: opencompass/configs/datasets/winogrande
- xcopa:
name: XCOPA
category: Language
paper: https://arxiv.org/pdf/2005.00333
configpath: opencompass/configs/datasets/XCOPA
- xiezhi:
name: Xiezhi
category: Knowledge
paper: https://arxiv.org/pdf/2306.05783
configpath: opencompass/configs/datasets/xiezhi
- xlsum:
name: XLSum
category: Understanding
paper: https://arxiv.org/pdf/2106.13822v1
configpath: opencompass/configs/datasets/XLSum
- xsum:
    name: XSum
category: Understanding
paper: https://arxiv.org/pdf/1808.08745
configpath: opencompass/configs/datasets/Xsum

View File

@ -1,10 +1,20 @@
var collapsedSections = [];
var collapsedSections = ['Dataset Statistics'];
$(document).ready(function () {
$('.model-summary').DataTable({
$('.dataset').DataTable({
"stateSave": false,
"lengthChange": false,
"pageLength": 20,
"order": []
"order": [],
"language": {
"info": "Show _START_ to _END_ ItemsTotally _TOTAL_ ",
"infoFiltered": "Filtered from _MAX_ Items",
"search": "Search",
"zeroRecords": "Item Not Found",
"paginate": {
"next": "Next",
"previous": "Previous"
},
}
});
});

View File

@ -90,4 +90,16 @@ Although OpenCompass has already included most commonly used datasets, users nee
return dataset
```
3. After completing the dataset script and config file, you need to register your new dataset in the `dataset-index.yml` file in the repository root directory, so that it can be added to the dataset statistics list on the OpenCompass website.
- The keys that need to be filled in include `name`: the name of your dataset, `category`: the category of your dataset, `paper`: the URL of the paper or project, and `configpath`: the path to the dataset config file. Here's an example:
```
- mydataset:
name: MyDataset
category: Understanding
paper: https://arxiv.org/pdf/xxxxxxx
configpath: opencompass/configs/datasets/MyDataset
```
Detailed dataset configuration files and other required configuration files can be referred to in the [Configuration Files](../user_guides/config.md) tutorial. For guides on launching tasks, please refer to the [Quick Start](../get_started/quick_start.md) tutorial.
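For contributors who want to catch mistakes before opening a PR, the sketch below (hypothetical, not part of the repository; it only assumes `pyyaml` is installed and is run from the repository root) checks that every entry in `dataset-index.yml` carries the four keys described above.
```python
# Hypothetical helper: verify that each dataset-index.yml entry defines the
# four documented keys (name, category, paper, configpath).
import yaml

REQUIRED_KEYS = {'name', 'category', 'paper', 'configpath'}

with open('dataset-index.yml', 'r') as f:
    entries = yaml.safe_load(f)

for entry in entries:                 # each entry looks like {key: {fields...}}
    for key, fields in entry.items():
        missing = REQUIRED_KEYS - set(fields)
        if missing:
            print(f'{key}: missing {sorted(missing)}')
```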

View File

@ -220,3 +220,11 @@ autodoc_typehints = 'none'
# The not found page
notfound_template = '404.html'
def builder_inited_handler(app):
subprocess.run(['./statis.py'])
def setup(app):
app.connect('builder-inited', builder_inited_handler)

View File

@ -80,6 +80,13 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
tools.md
.. _Dataset List:
.. toctree::
:maxdepth: 1
:caption: Dataset List
dataset_statistics.md
.. _Notes:
.. toctree::
:maxdepth: 1

76
docs/en/statis.py Executable file
View File

@ -0,0 +1,76 @@
#! /usr/bin/env python
from pathlib import Path
import yaml
from tabulate import tabulate
OC_ROOT = Path(__file__).absolute().parents[2]
GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/'
DATASETZOO_TEMPLATE = """\
# Dataset Statistics
On this page, we have listed all the datasets supported by OpenCompass.
You can use sorting and search functions to find the dataset you need.
"""
with open('dataset_statistics.md', 'w') as f:
f.write(DATASETZOO_TEMPLATE)
load_path = str(OC_ROOT / 'dataset-index.yml')
with open(load_path, 'r') as f2:
data_list = yaml.load(f2, Loader=yaml.FullLoader)
HEADER = ['name', 'category', 'paper', 'configpath']
def table_format(data_list):
table_format_list = []
for i in data_list:
table_format_list_sub = []
for j in i:
for index in HEADER:
if index == 'paper':
table_format_list_sub.append('[link](' + i[j][index] + ')')
elif index == 'configpath':
if isinstance(i[j][index], list):
sub_list_text = ''
for k in i[j][index]:
sub_list_text += ('[link](' + GITHUB_PREFIX + k +
') / ')
table_format_list_sub.append(sub_list_text[:-2])
else:
table_format_list_sub.append('[link](' +
GITHUB_PREFIX +
i[j][index] + ')')
else:
table_format_list_sub.append(i[j][index])
table_format_list.append(table_format_list_sub)
return table_format_list
data_format_list = table_format(data_list)
def generate_table(data_list, title=None):
with open('dataset_statistics.md', 'a') as f:
if title is not None:
f.write(f'\n{title}')
f.write("""\n```{table}\n:class: dataset\n""")
header = ['Name', 'Category', 'Paper or Repository', 'Config File']
table_cfg = dict(tablefmt='pipe',
floatfmt='.2f',
numalign='right',
stralign='center')
f.write(tabulate(data_list, header, **table_cfg))
f.write('\n```\n')
generate_table(
data_list=data_format_list,
title='## Supported Dataset List',
)
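For reference, a quick sanity check of the row format produced by `table_format` (a hypothetical snippet, assuming it runs in the same module or a REPL where `table_format` and `GITHUB_PREFIX` are already defined; the sample entry mirrors the first record of `dataset-index.yml`):
```python
# Hypothetical check of table_format(); not part of the script above.
sample = [{'ifeval': {'name': 'IFEval',
                      'category': 'Instruction Following',
                      'paper': 'https://arxiv.org/pdf/2311.07911',
                      'configpath': 'opencompass/configs/datasets/IFEval'}}]
print(table_format(sample))
# [['IFEval', 'Instruction Following',
#   '[link](https://arxiv.org/pdf/2311.07911)',
#   '[link](https://github.com/open-compass/opencompass/tree/main/opencompass/configs/datasets/IFEval)']]
```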

View File

@ -1,10 +1,20 @@
var collapsedSections = [];
var collapsedSections = ['数据集统计'];
$(document).ready(function () {
$('.model-summary').DataTable({
$('.dataset').DataTable({
"stateSave": false,
"lengthChange": false,
"pageLength": 20,
"order": []
"order": [],
"language": {
"info": "显示 _START_ 至 _END_ 条目(总计 _TOTAL_ ",
"infoFiltered": "(筛选自 _MAX_ 条目)",
"search": "搜索:",
"zeroRecords": "没有找到任何条目",
"paginate": {
"next": "下一页",
"previous": "上一页"
},
}
});
});

View File

@ -91,4 +91,16 @@
return dataset
```
3. 在完成数据集脚本和配置文件的构建后需要在OpenCompass主目录下的`dataset-index.yml`配置文件中登记新数据集的相关信息以使其加入OpenCompass官网Doc的数据集统计列表中。
- 需要填写的字段包括数据集名称`name`、数据集类型`category`、原文或项目地址`paper`、以及数据集配置文件的路径`configpath`。具体示例如下:
```
- mydataset:
name: MyDataset
category: Understanding
paper: https://arxiv.org/pdf/xxxxxxx
configpath: opencompass/configs/datasets/MyDataset
```
详细的数据集配置文件以及其他需要的配置文件可以参考[配置文件](../user_guides/config.md)教程,启动任务相关的教程可以参考[快速开始](../get_started/quick_start.md)教程。

View File

@ -224,6 +224,7 @@ notfound_template = '404.html'
def builder_inited_handler(app):
subprocess.run(['./cp_origin_docs.sh'])
subprocess.run(['./statis.py'])
def setup(app):

View File

@ -81,6 +81,13 @@ OpenCompass 上手路线
tools.md
.. _数据集列表:
.. toctree::
:maxdepth: 1
:caption: 数据集列表
dataset_statistics.md
.. _其他说明:
.. toctree::
:maxdepth: 1

75
docs/zh_cn/statis.py Executable file
View File

@ -0,0 +1,75 @@
#! /usr/bin/env python
from pathlib import Path
import yaml
from tabulate import tabulate
OC_ROOT = Path(__file__).absolute().parents[2]
GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/'
DATASETZOO_TEMPLATE = """\
# 数据集统计
在本页面中我们列举了OpenCompass所支持的所有数据集
你可以使用排序和搜索功能找到需要的数据集
"""
with open('dataset_statistics.md', 'w') as f:
f.write(DATASETZOO_TEMPLATE)
load_path = str(OC_ROOT / 'dataset-index.yml')
with open(load_path, 'r') as f2:
data_list = yaml.load(f2, Loader=yaml.FullLoader)
HEADER = ['name', 'category', 'paper', 'configpath']
def table_format(data_list):
table_format_list = []
for i in data_list:
table_format_list_sub = []
for j in i:
for index in HEADER:
if index == 'paper':
table_format_list_sub.append('[链接](' + i[j][index] + ')')
elif index == 'configpath':
if isinstance(i[j][index], list):
sub_list_text = ''
for k in i[j][index]:
sub_list_text += ('[链接](' + GITHUB_PREFIX + k +
') / ')
table_format_list_sub.append(sub_list_text[:-2])
else:
table_format_list_sub.append('[链接](' + GITHUB_PREFIX +
i[j][index] + ')')
else:
table_format_list_sub.append(i[j][index])
table_format_list.append(table_format_list_sub)
return table_format_list
data_format_list = table_format(data_list)
def generate_table(data_list, title=None):
with open('dataset_statistics.md', 'a') as f:
if title is not None:
f.write(f'\n{title}')
f.write("""\n```{table}\n:class: dataset\n""")
header = ['数据集名称', '数据集类型', '原文或资源地址', '配置文件链接']
table_cfg = dict(tablefmt='pipe',
floatfmt='.2f',
numalign='right',
stralign='center')
f.write(tabulate(data_list, header, **table_cfg))
f.write('\n```\n')
generate_table(
data_list=data_format_list,
title='## 支持数据集列表',
)

View File

@ -0,0 +1,137 @@
# flake8: noqa
from mmengine.config import read_base
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
#######################################################################
# PART 0 Essential Configs #
#######################################################################
with read_base():
# Datasets Part
# Knowledge
# Math
from opencompass.configs.datasets.aime2024.aime2024_0shot_nocot_genericllmeval_academic_gen import \
aime2024_datasets
from opencompass.configs.datasets.bbh.bbh_0shot_nocot_academic_gen import \
bbh_datasets
# General Reasoning
from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
gpqa_datasets
from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
humaneval_datasets
# Instruction Following
from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
ifeval_datasets
from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import \
LCBCodeGeneration_dataset
from opencompass.configs.datasets.math.math_prm800k_500_0shot_cot_gen import \
math_datasets
from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
mmlu_pro_datasets
# Model List
from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
models as hf_internlm2_5_7b_chat_model
# Summary Groups
from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups
#######################################################################
# PART 1 Datasets List #
#######################################################################
# datasets list for evaluation
# Only take LCB generation for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')),
[]) + [LCBCodeGeneration_dataset]
# LLM judge config: using LLM to evaluate predictions
judge_cfg = dict()
for dataset in datasets:
dataset['infer_cfg']['inferencer']['max_out_len'] = 32768
if 'judge_cfg' in dataset['eval_cfg']['evaluator']:
dataset['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg
#######################################################################
# PART 2 Dataset Summarizer #
#######################################################################
core_summary_groups = [
{
'name':
'core_average',
'subsets': [
['IFEval', 'Prompt-level-strict-accuracy'],
['bbh', 'naive_average'],
['math_prm800k_500', 'accuracy'],
['aime2024', 'accuracy'],
['GPQA_diamond', 'accuracy'],
['mmlu_pro', 'naive_average'],
['openai_humaneval', 'humaneval_pass@1'],
['lcb_code_generation', 'pass@1'],
],
},
]
summarizer = dict(
dataset_abbrs=[
['core_average', 'naive_average'],
'',
'Instruction Following',
['IFEval', 'Prompt-level-strict-accuracy'],
'',
'General Reasoning',
['bbh', 'naive_average'],
['GPQA_diamond', 'accuracy'],
'',
'Math Calculation',
['math_prm800k_500', 'accuracy'],
['aime2024', 'accuracy'],
'',
'Knowledge',
['mmlu_pro', 'naive_average'],
'',
'Code',
['openai_humaneval', 'humaneval_pass@1'],
['lcb_code_generation', 'pass@1'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
#######################################################################
# PART 3 Models List #
#######################################################################
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
#######################################################################
# PART 4 Inference/Evaluation Configuration #
#######################################################################
# Local Runner
infer = dict(
partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
runner=dict(
type=LocalRunner,
max_num_workers=16,
retry=0, # Modify if needed
task=dict(type=OpenICLInferTask),
),
)
# eval with local runner
eval = dict(
partitioner=dict(type=NaivePartitioner, n=10),
runner=dict(type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLEvalTask)),
)
#######################################################################
# PART 5 Utils Configuration #
#######################################################################
work_dir = './outputs/oc_academic_202502'
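The config above leaves `judge_cfg` empty, so every LLM-judged dataset inherits whichever judge model is plugged in at run time. A minimal sketch is shown below, assuming an OpenAI-compatible judge endpoint; the model name, API key, and base URL are placeholders, and the fields mirror the usual OpenCompass API-model entries rather than a prescribed interface.
```python
# Hypothetical judge configuration; adapt the placeholders to your deployment.
from opencompass.models import OpenAISDK

judge_cfg = dict(
    abbr='judge-model',                          # placeholder label
    type=OpenAISDK,
    path='Qwen/Qwen2.5-72B-Instruct',            # assumed judge model name
    key='YOUR_API_KEY',                          # placeholder
    openai_api_base='http://localhost:8000/v1',  # assumed OpenAI-compatible endpoint
    query_per_second=1,
    max_out_len=2048,
    max_seq_len=8192,
    batch_size=8,
    retry=2,
)
```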

View File

@ -0,0 +1,77 @@
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
with read_base():
from opencompass.configs.datasets.math.math_500_gen import math_datasets
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-llama-8b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
gen_config=dict(
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
),
max_seq_len=32768,
max_out_len=32768,
batch_size=32,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-7b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
gen_config=dict(
temperature=0.6,
top_p=0.95,
max_new_tokens=32768,
do_sample=True,
),
max_seq_len=32768,
max_out_len=32768,
batch_size=32,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-1_5b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
gen_config=dict(
top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
),
max_seq_len=32768,
max_out_len=32768,
batch_size=32,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-14b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
gen_config=dict(
top_k=1,
temperature=0.6,
top_p=0.95,
max_new_tokens=32768,
do_sample=True,
),
max_seq_len=32768,
max_out_len=32768,
batch_size=16,
run_cfg=dict(num_gpus=2),
pred_postprocessor=dict(type=extract_non_reasoning_content),
),
]
datasets = [*math_datasets]
work_dir = './outputs/math_500'
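Each model above sets `pred_postprocessor=dict(type=extract_non_reasoning_content)` so that the chain-of-thought these R1-style models emit is stripped before answer extraction. Conceptually, the postprocessor behaves roughly like the simplified stand-in below (a sketch, not the library implementation).
```python
# Simplified stand-in for extract_non_reasoning_content: keep only the text
# that follows the closing reasoning tag, if one is present.
def strip_reasoning(text, end_tag='</think>'):
    if end_tag in text:
        return text.split(end_tag, 1)[1].strip()
    return text

print(strip_reasoning('<think>long derivation ...</think>The answer is 42.'))
# -> The answer is 42.
```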

View File

@ -0,0 +1,98 @@
# flake8: noqa
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets.generic import generic_llmjudge_academic_postprocess
aime2024_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
)
aime2024_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN',
prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048)
)
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
aime2024_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE),
]),
),
dataset_cfg=dict(
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess,
metric_name='accuracy'),
),
pred_role='BOT',
)
aime2024_datasets = [
dict(
abbr='aime2024',
type=Aime2024Dataset,
path='opencompass/aime2024',
reader_cfg=aime2024_reader_cfg,
infer_cfg=aime2024_infer_cfg,
eval_cfg=aime2024_eval_cfg,
mode='singlescore',
)
]
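The grader prompt asks the judge to reply with a single letter, and `generic_llmjudge_academic_postprocess` then reduces those verdicts to the reported `accuracy`. A conceptual sketch of that reduction (not the library code) is:
```python
# Conceptual sketch: the judge answers "A" (CORRECT) or "B" (INCORRECT),
# and accuracy is the share of "A" verdicts.
def verdicts_to_accuracy(verdicts):
    correct = sum(1 for v in verdicts if v.strip().upper().startswith('A'))
    return {'accuracy': 100.0 * correct / len(verdicts) if verdicts else 0.0}

print(verdicts_to_accuracy(['A', 'B', 'A', 'A']))  # {'accuracy': 75.0}
```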

View File

@ -0,0 +1,189 @@
# flake8: noqa
import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import BBHDataset
from opencompass.datasets.generic import generic_llmjudge_academic_postprocess
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
bbh_multiple_choice_sets = [
'temporal_sequences',
'disambiguation_qa',
'date_understanding',
'tracking_shuffled_objects_three_objects',
'penguins_in_a_table',
'geometric_shapes',
'snarks',
'ruin_names',
'tracking_shuffled_objects_seven_objects',
'tracking_shuffled_objects_five_objects',
'logical_deduction_three_objects',
'hyperbaton',
'logical_deduction_five_objects',
'logical_deduction_seven_objects',
'movie_recommendation',
'salient_translation_error_detection',
'reasoning_about_colored_objects',
]
bbh_free_form_sets = [
'multistep_arithmetic_two',
'navigate',
'dyck_languages',
'word_sorting',
'sports_understanding',
'boolean_expressions',
'object_counting',
'formal_fallacies',
'causal_judgement',
'web_of_lies',
]
GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{input}\n<Original Question End>\n\n
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
bbh_sets = bbh_multiple_choice_sets + bbh_free_form_sets
# For zero shot inference in bbh
bbh_datasets = []
for _name in bbh_sets:
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=BBHDataset,
name=_name,
path='opencompass/bbh',
reader_cfg=bbh_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
),
pred_role='BOT',
)
bbh_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy())
)
# For original 3 shot inference in bbh
bbh_3_shot_datasets = []
for _name in bbh_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
bbh_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
)
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
bbh_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=BBHDataset,
name=_name,
path='opencompass/bbh',
reader_cfg=bbh_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
),
pred_role='BOT',
)
bbh_3_shot_datasets.append(
dict(
type=BBHDataset,
path='opencompass/bbh',
name=_name,
abbr='bbh-' + _name,
reader_cfg=bbh_reader_cfg,
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
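For context, a minimal sketch of how these LLM-judged BBH datasets could be pulled into a run config via `read_base`, mirroring the pattern the LiveMathBench README below uses; the import path and the note about `judge_cfg` are assumptions, not part of this diff:

```python
# Hypothetical top-level eval config; the module path below is an assumption.
from mmengine.config import read_base

with read_base():
    from .datasets.bbh.bbh_llm_judge_gen import bbh_datasets  # this config file

datasets = bbh_datasets
# Note: judge_cfg is left as an empty dict in the config above, so a judge
# model config still has to be supplied before GenericLLMEvaluator can run.
```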

View File

@ -1,36 +1,30 @@
# LiveMathBench
## Details of Datasets
## v202412
### Details of Datasets
| dataset | language | #single-choice | #multiple-choice | #fill-in-the-blank | #problem-solving |
| -- | -- | -- | -- | -- | -- |
| AIMC | cn | 0 | 0 | 0 | 46 |
| AIMC | en | 0 | 0 | 0 | 46 |
| CEE | cn | 0 | 0 | 13 | 40 |
| CEE | en | 0 | 0 | 13 | 40 |
| CMO | cn | 0 | 0 | 0 | 18 |
| CMO | en | 0 | 0 | 0 | 18 |
| MATH500 | en | 0 | 0 | 0 | 500 |
| AIME2024 | en | 0 | 0 | 0 | 44 |
| AMC | cn | 0 | 0 | 0 | 46 |
| AMC | en | 0 | 0 | 0 | 46 |
| CCEE | cn | 0 | 0 | 13 | 31 |
| CCEE | en | 0 | 0 | 13 | 31 |
| CNMO | cn | 0 | 0 | 0 | 18 |
| CNMO | en | 0 | 0 | 0 | 18 |
| WLPMC | cn | 0 | 0 | 0 | 11 |
| WLPMC | en | 0 | 0 | 0 | 11 |
## How to use
### How to use
#### G-Pass@k
```python
from mmengine.config import read_base
with read_base():
from opencompass.datasets.livemathbench import livemathbench_datasets
from opencompass.datasets.livemathbench_gen import livemathbench_datasets
livemathbench_datasets[0].update(
{
'abbr': 'livemathbench_${k}x${n}',
'path': '/path/to/data/dir',
'k': 'k@pass', # the max value of k in k@pass
'n': 'number of runs', # number of runs
}
)
livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
{
'model_name': 'Qwen/Qwen2.5-72B-Instruct',
@ -40,38 +34,41 @@ livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
] # set url of evaluation models
}
)
livemathbench_datasets[0]['infer_cfg']['inferencer'].update(dict(
max_out_len=32768 # for o1-like models you need to update max_out_len
))
```
> ❗️ At present, `extract_from_boxed` is used to extract answers from model responses; an LLM can also be used for extraction via the following parameters, but this part of the code has not been tested.
#### Greedy
```python
from mmengine.config import read_base
with read_base():
from opencompass.datasets.livemathbench_greedy_gen import livemathbench_datasets
livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
{
'model_name': 'Qwen/Qwen2.5-72B-Instruct',
'url': [
'http://0.0.0.0:23333/v1',
'...'
], # set url of evaluation models
# for LLM-based extraction
'use_extract_model': True,
'post_model_name': 'oc-extractor',
'post_url': [
'http://0.0.0.0:21006/v1',
'...'
]
}
)
livemathbench_datasets[0]['infer_cfg']['inferencer'].update(dict(
max_out_len=32768 # for o1-like models you need to update max_out_len
))
```
## Output Samples
### Output Samples
| dataset | version | metric | mode | Qwen2.5-72B-Instruct |
|----- | ----- | ----- | ----- | -----|
| LiveMathBench | caed8f | 1@pass | gen | 26.07 |
| LiveMathBench | caed8f | 1@pass/std | gen | xx.xx |
| LiveMathBench | caed8f | 2@pass | gen | xx.xx |
| LiveMathBench | caed8f | 2@pass/std | gen | xx.xx |
| LiveMathBench | caed8f | pass-rate | gen | xx.xx |
| LiveMathBench | 9befbf | G-Pass@16_0.0 | gen | xx.xx |
| LiveMathBench | caed8f | G-Pass@16_0.25 | gen | xx.xx |
| LiveMathBench | caed8f | G-Pass@16_0.5 | gen | xx.xx |
| LiveMathBench | caed8f | G-Pass@16_0.75 | gen | xx.xx |
| LiveMathBench | caed8f | G-Pass@16_1.0 | gen | xx.xx |

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .livemathbench_greedy_gen_efb20d import livemathbench_datasets # noqa: F401, F403

View File

@ -6,15 +6,15 @@ from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBen
livemathbench_dataset = dict(
abbr='LiveMathBench-v202412-greedy', # If you change the K and replication, you need to change the dataset name.
type=LiveMathBenchDataset,
path='opencompass/LiveMathBench',
path='',
k=1,
replication=1,
dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'],
dataset_languages=['cn', 'en'],
cot=False,
cot=True,
version='202412',
abbr='LiveMathBench-v202412',
reader_cfg=dict(
input_columns=['prompt'],
output_column='answer'
@ -31,7 +31,7 @@ livemathbench_dataset = dict(
retriever=dict(type=ZeroRetriever),
inferencer=dict(
type=GenInferencer,
max_out_len=16384,
max_out_len=8192
),
),
eval_cfg=dict(
@ -44,7 +44,7 @@ livemathbench_dataset = dict(
extract_model_name='',
k=[1],
replication=1,
thresholds=[0.0, 0.25, 0.5, 0.75, 1.0]
thresholds=[0.0]
)
)
)

View File

@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CustomDataset
from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator),
)
math_datasets = [
dict(
type=CustomDataset,
abbr='math-500',
path='opencompass/math',
file_name='test_prm800k_500.jsonl',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]

View File

@ -0,0 +1,100 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (
MATHDataset,
MATHEvaluator,
math_postprocess_v2,
normalize_final_answer,
)
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_academic_postprocess
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
GRADER_TEMPLATE = """
As a grading expert, please judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either "A" (CORRECT) or "B" (INCORRECT). Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
# Evaluation configuration
math_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=MATHDataset,
path='opencompass/math',
file_name='test_prm800k_500.json',
reader_cfg=math_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess),
),
pred_role='BOT',
)
math_datasets = [
dict(
type=MATHDataset,
abbr='math_prm800k_500',
path='opencompass/math',
file_name='test_prm800k_500.json',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
mode='singlescore',
)
]

View File

@ -32,7 +32,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -31,7 +31,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -32,7 +32,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -31,7 +31,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -73,12 +73,13 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(

View File

@ -74,7 +74,7 @@ for _name in subjective_all_sets:
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -72,7 +72,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -38,7 +38,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -39,7 +39,7 @@ for _name in subjective_all_sets:
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -37,7 +37,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items():
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -1,6 +1,6 @@
from opencompass.datasets import (
CompassArenaDataset,
compassarena_bradleyterry_postprocess,
compassarena_bradleyterry_postprocess
)
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer
@ -127,7 +127,7 @@ for _name, _prompt in sub_map.items():
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -105,7 +105,7 @@ for _name, _prompt in sub_map.items():
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -91,7 +91,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -90,7 +90,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -59,7 +59,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -58,7 +58,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -29,7 +29,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -28,7 +28,7 @@ for _name in subjective_all_sets:
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
inferencer=dict(type=GenInferencer),
)
subjective_eval_cfg = dict(

View File

@ -24,7 +24,7 @@ for _name in subjective_all_sets:
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'),
inferencer=dict(type=ChatInferencer, infer_mode='last'),
)
subjective_eval_cfg = dict(

View File

@ -23,7 +23,7 @@ for _name in subjective_all_sets:
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'),
inferencer=dict(type=ChatInferencer, infer_mode='last'),
)
subjective_eval_cfg = dict(

View File

@ -0,0 +1,14 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='deepseek-r1-distill-llama-70b-hf',
path='deepseek-ai/DeepSeek-R1-Distill-Llama-70B',
max_out_len=16384,
batch_size=8,
run_cfg=dict(num_gpus=8),
pred_postprocessor=dict(type=extract_non_reasoning_content)
)
]

View File

@ -0,0 +1,14 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='deepseek-r1-distill-llama-8b-hf',
path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
max_out_len=16384,
batch_size=8,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content)
)
]

View File

@ -0,0 +1,14 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='deepseek-r1-distill-qwen-14b-hf',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
max_out_len=16384,
batch_size=16,
run_cfg=dict(num_gpus=4),
pred_postprocessor=dict(type=extract_non_reasoning_content)
)
]

View File

@ -0,0 +1,14 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='deepseek-r1-distill-qwen-1.5b-hf',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
max_out_len=16384,
batch_size=8,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content)
)
]

View File

@ -0,0 +1,14 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='deepseek-r1-distill-qwen-32b-hf',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
max_out_len=16384,
batch_size=8,
run_cfg=dict(num_gpus=4),
pred_postprocessor=dict(type=extract_non_reasoning_content)
)
]

View File

@ -0,0 +1,14 @@
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='deepseek-r1-distill-qwen-7b-hf',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
max_out_len=16384,
batch_size=8,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content)
)
]

View File

@ -0,0 +1,20 @@
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-llama-70b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Llama-70B',
engine_config=dict(session_len=32768, max_batch_size=8, tp=8),
gen_config=dict(top_k=1,
temperature=1e-6,
top_p=0.9,
max_new_tokens=32768),
max_seq_len=32768,
max_out_len=32768,
batch_size=8,
run_cfg=dict(num_gpus=8),
pred_postprocessor=dict(type=extract_non_reasoning_content)
)
]

View File

@ -0,0 +1,20 @@
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-llama-8b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
gen_config=dict(top_k=1,
temperature=1e-6,
top_p=0.9,
max_new_tokens=32768),
max_seq_len=32768,
max_out_len=32768,
batch_size=8,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content)
)
]

View File

@ -0,0 +1,20 @@
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-14b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
gen_config=dict(top_k=1,
temperature=1e-6,
top_p=0.9,
max_new_tokens=32768),
max_seq_len=32768,
max_out_len=32768,
batch_size=16,
run_cfg=dict(num_gpus=2),
pred_postprocessor=dict(type=extract_non_reasoning_content)
)
]

View File

@ -0,0 +1,20 @@
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-1_5b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
gen_config=dict(top_k=1,
temperature=1e-6,
top_p=0.9,
max_new_tokens=32768),
max_seq_len=32768,
max_out_len=32768,
batch_size=16,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content)
)
]

View File

@ -0,0 +1,20 @@
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-32b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
engine_config=dict(session_len=32768, max_batch_size=16, tp=4),
gen_config=dict(top_k=1,
temperature=1e-6,
top_p=0.9,
max_new_tokens=32768),
max_seq_len=32768,
max_out_len=32768,
batch_size=16,
run_cfg=dict(num_gpus=4),
pred_postprocessor=dict(type=extract_non_reasoning_content)
)
]

View File

@ -0,0 +1,20 @@
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
models = [
dict(
type=TurboMindModelwithChatTemplate,
abbr='deepseek-r1-distill-qwen-7b-turbomind',
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
gen_config=dict(top_k=1,
temperature=1e-6,
top_p=0.9,
max_new_tokens=32768),
max_seq_len=32768,
max_out_len=32768,
batch_size=8,
run_cfg=dict(num_gpus=1),
pred_postprocessor=dict(type=extract_non_reasoning_content)
)
]

View File

@ -22,7 +22,10 @@ from .base import BaseDataset
class LCDataset(BaseDataset):
@staticmethod
def load(path: str, num_repeats: int = 1, difficulty='ALL'):
def load(path: str,
num_repeats: int = 1,
difficulty='ALL',
local_mode=False):
"""Load LC dataset for pass k mode.
Note that you can use num_repeats > 1 when your model does not support
@ -38,7 +41,7 @@ class LCDataset(BaseDataset):
num_repeats(int): Number of repetition for this dataset to get
multiple responses in special cases.
"""
path = get_data_path(path, local_mode=True)
path = get_data_path(path, local_mode=local_mode)
def processing_test(example):
example['test_case'] = example['test_list']

View File

@ -73,6 +73,8 @@ class BigCodeBenchEvaluator(BaseEvaluator):
eval_type='instruct',
remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/', # noqa
dataset_version: str = 'full',
local_mode: bool = False,
path: str = 'opencompass/bigcodebench',
pass_k: str = '1,5,10',
parallel: int = -1,
min_time_limit: float = 1,
@ -84,7 +86,9 @@ class BigCodeBenchEvaluator(BaseEvaluator):
super().__init__()
self.dataset = BigCodeBenchDataset.load(
release_version=release_version,
dataset_version=dataset_version)['test']
dataset_version=dataset_version,
local_mode=local_mode,
path=path)['test']
self.eval_type = eval_type
self.remote_execute_api = remote_execute_api
@ -117,8 +121,40 @@ class BigCodeBenchEvaluator(BaseEvaluator):
logger.info('Start to extract code from predictions')
sanitized_predictions = []
for prediction, entrypoint in zip(predictions, entrypoints):
sanitized_prediction = extract_code_generation(
prediction, entrypoint=entrypoint)
try:
import signal
from contextlib import contextmanager
@contextmanager
def timeout_handler(seconds):
def _handle_timeout(signum, frame):
raise TimeoutError(f'Code extraction timed out '
f'after {seconds} seconds')
original_handler = signal.signal(signal.SIGALRM,
_handle_timeout)
signal.alarm(seconds)
try:
yield
finally:
signal.alarm(0)
signal.signal(signal.SIGALRM, original_handler)
with timeout_handler(10):
sanitized_prediction = extract_code_generation(
prediction, entrypoint=entrypoint)
except TimeoutError as e:
logger.warning(
f'Code extraction timeout for entrypoint {entrypoint}: '
f'{str(e)}')
sanitized_prediction = ''
except Exception as e:
logger.warning(
f'Code extraction failed for entrypoint {entrypoint}: '
f'{str(e)}')
sanitized_prediction = ''
sanitized_predictions.append(sanitized_prediction)
# Prepare for submission

View File

@ -13,6 +13,7 @@ from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path
from .base import BaseDataset
@ -114,7 +115,7 @@ class CircularOptionSimAccEvaluator(OptionSimAccEvaluator):
circular_pattern = origin_item['circular_pattern']
for k in circular_patterns:
if tuple(circular_pattern) in circular_patterns[k]:
tmp_metrics[f'correct_{k}'] += (1 if parsed == refr else 0)
tmp_metrics[f'correct_{k}'] += 1 if parsed == refr else 0
tmp_metrics[f'count_{k}'] += 1
for k in circular_patterns:
@ -164,7 +165,10 @@ class CircularOptionSimAccEvaluator(OptionSimAccEvaluator):
class CustomDataset(BaseDataset):
@staticmethod
def load(path):
def load(path, file_name=None, local_mode=False):
path = get_data_path(path, local_mode=local_mode)
if file_name is not None:
path = os.path.join(path, file_name)
if path.endswith('.jsonl'):
with open(path, 'r', encoding='utf-8-sig') as f:
data = [json.loads(line) for line in f]
@ -222,9 +226,10 @@ def make_mcq_gen_config(meta):
)
eval_cfg = dict(
evaluator=dict(type=meta.get('evaluator', OptionSimAccEvaluator),
**meta.get('evaluator_kwargs',
{'options': meta['options']})),
evaluator=dict(
type=meta.get('evaluator', OptionSimAccEvaluator),
**meta.get('evaluator_kwargs', {'options': meta['options']}),
),
pred_role='BOT',
)
@ -269,10 +274,10 @@ def make_circular_mcq_gen_config(meta):
)
eval_cfg = dict(
evaluator=dict(type=meta.get('evaluator',
CircularOptionSimAccEvaluator),
**meta.get('evaluator_kwargs',
{'options': meta['options']})),
evaluator=dict(
type=meta.get('evaluator', CircularOptionSimAccEvaluator),
**meta.get('evaluator_kwargs', {'options': meta['options']}),
),
pred_role='BOT',
)
@ -320,8 +325,10 @@ def make_qa_gen_config(meta):
)
eval_cfg = dict(
evaluator=dict(type=meta.get('evaluator', AccEvaluator),
**meta.get('evaluator_kwargs', {})),
evaluator=dict(
type=meta.get('evaluator', AccEvaluator),
**meta.get('evaluator_kwargs', {}),
),
pred_role='BOT',
)
@ -346,9 +353,11 @@ def make_mcq_ppl_config(meta):
template = {
answer: dict(round=[
dict(role='HUMAN', prompt=human_prompt),
dict(role='BOT',
prompt=bot_prompt.format(
**{meta['output_column']: answer})),
dict(
role='BOT',
prompt=bot_prompt.format(
**{meta['output_column']: answer}),
),
], )
for answer in meta['options']
}
@ -370,8 +379,10 @@ def make_mcq_ppl_config(meta):
inferencer=dict(type=PPLInferencer),
)
eval_cfg = dict(evaluator=dict(type=meta.get('evaluator', AccEvaluator),
**meta.get('evaluator_kwargs', {})))
eval_cfg = dict(evaluator=dict(
type=meta.get('evaluator', AccEvaluator),
**meta.get('evaluator_kwargs', {}),
))
dataset = dict(
abbr=meta['abbr'],
@ -394,9 +405,11 @@ def make_circular_mcq_ppl_config(meta):
template = {
answer: dict(round=[
dict(role='HUMAN', prompt=human_prompt),
dict(role='BOT',
prompt=bot_prompt.format(
**{meta['output_column']: answer})),
dict(
role='BOT',
prompt=bot_prompt.format(
**{meta['output_column']: answer}),
),
], )
for answer in meta['options']
}
@ -418,9 +431,10 @@ def make_circular_mcq_ppl_config(meta):
inferencer=dict(type=PPLInferencer),
)
eval_cfg = dict(
evaluator=dict(type=meta.get('evaluator', CircularEvaluator),
**meta.get('evaluator_kwargs', {})))
eval_cfg = dict(evaluator=dict(
type=meta.get('evaluator', CircularEvaluator),
**meta.get('evaluator_kwargs', {}),
))
dataset = dict(
abbr=meta['abbr'],

View File

@ -1,7 +1,10 @@
import re
def get_final_results(judged_answers, references, origial_responses):
def get_final_results(judged_answers,
references,
origial_responses,
metric_name='accuracy'):
count = 0
is_correct_count = 0
is_incorrect_count = 0
@ -39,7 +42,7 @@ def get_final_results(judged_answers, references, origial_responses):
is_correct) > 0 else 0
result = {
# 'accuracy_given_attempted': accuracy_given_attempted,
'accuracy': accuracy_given_attempted * 100,
metric_name: accuracy_given_attempted * 100,
'f1': f1,
'details': details
}
@ -69,3 +72,25 @@ def generic_llmjudge_postprocess(
results = get_final_results(judged_answers, references, origial_responses)
results['details'] = output
return results
def generic_llmjudge_academic_postprocess(
output: dict,
output_path: str,
metric_name: str = 'accuracy',
) -> dict:
judged_answers = []
origial_responses = []
references = []
for k, v in output.items():
origial_responses.append(v['prediction'])
processed_judge = _generic_llmjudge_postprocess(v['prediction'])
if processed_judge is not None:
judged_answers.append(processed_judge)
references.append(v['gold'])
results = get_final_results(judged_answers, references, origial_responses,
metric_name)
results['details'] = output
# For academic summarizer
results.pop('f1', None)
return results
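A rough usage sketch for the new academic postprocessor; the example ids and judge replies below are made up, and it assumes the judge answered with the bare letters requested by GRADER_TEMPLATE:

```python
from opencompass.datasets.generic import generic_llmjudge_academic_postprocess

# Hypothetical judge outputs keyed by example id, mirroring the loop above:
# 'prediction' holds the judge's raw reply, 'gold' the reference answer.
judge_output = {
    '0': {'prediction': 'A', 'gold': '42'},
    '1': {'prediction': 'B', 'gold': '7'},
}
results = generic_llmjudge_academic_postprocess(
    judge_output, output_path='judged.json', metric_name='score')
print(results['score'])  # percentage of answers the judge marked correct
```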

View File

@ -25,12 +25,7 @@ OPENAI_API_BASE = os.path.join(
OPENAISDK_API_BASE = os.environ.get('OPENAI_BASE_URL',
'https://api.openai.com/v1/')
O1_MODEL_LIST = [
'o1-preview-2024-09-12',
'o1-mini-2024-09-12',
'o1-preview',
'o1-mini',
]
O1_MODEL_LIST = ['o1', 'o3']
@MODELS.register_module()
@ -96,7 +91,6 @@ class OpenAI(BaseAPIModel):
temperature: Optional[float] = None,
tokenizer_path: Optional[str] = None,
extra_body: Optional[Dict] = None,
max_completion_tokens: int = 16384,
verbose: bool = False,
):
@ -151,9 +145,6 @@ class OpenAI(BaseAPIModel):
self.proxy_url = openai_proxy_url
self.path = path
self.max_completion_tokens = max_completion_tokens
self.logger.warning(
f'Max Completion tokens for {path} is {max_completion_tokens}')
def generate(
self,
@ -250,16 +241,15 @@ class OpenAI(BaseAPIModel):
header['OpenAI-Organization'] = self.orgs[self.org_ctr]
try:
if self.path in O1_MODEL_LIST:
if any(model in self.path for model in O1_MODEL_LIST):
self.logger.warning(
f"'max_token' is unsupported for model {self.path}")
self.logger.warning(
f'We use max_completion_tokens: '
f'{self.max_completion_tokens}for this query')
f'We use max_out_len: {max_out_len} for this query')
data = dict(
model=self.path,
messages=messages,
max_completion_tokens=self.max_completion_tokens,
max_completion_tokens=max_out_len,
n=1,
logprobs=self.logprobs,
top_logprobs=self.top_logprobs,
@ -440,7 +430,7 @@ class OpenAI(BaseAPIModel):
if mode == 'front':
cur_prompt = sep.join(words[-mid:])
elif mode == 'mid':
cur_prompt = (sep.join(words[:mid]) + sep.join(words[-mid:]))
cur_prompt = sep.join(words[:mid]) + sep.join(words[-mid:])
elif mode == 'rear':
cur_prompt = sep.join(words[:mid])
@ -480,7 +470,9 @@ class OpenAI(BaseAPIModel):
"""
# Check input length when mode is 'none'
if mode == 'none':
input_len = get_token_len_func(str(input))
input_len = (get_token_len_func(input) if isinstance(
input, str) else sum(
get_token_len_func(item['prompt']) for item in input))
if input_len > max_seq_len:
raise ValueError(
f'Input length ({input_len}) exceeds max_seq_len '
@ -499,12 +491,15 @@ class OpenAI(BaseAPIModel):
# Convert input to messages format
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
input_len = get_token_len_func(input)
else:
messages = []
processed_prompts = []
for item in input:
input_content = item['prompt']
if mode != 'none':
input_content = bin_trim_wrapper(input_content)
processed_prompts.append(input_content)
msg = {'content': input_content}
if item['role'] == 'HUMAN':
msg['role'] = 'user'
@ -513,19 +508,18 @@ class OpenAI(BaseAPIModel):
elif item['role'] == 'SYSTEM':
msg['role'] = 'system'
messages.append(msg)
input_len = sum(
get_token_len_func(prompt) for prompt in processed_prompts)
# Adjust max_out_len
if max_out_len is not None:
original_max_out_len = max_out_len
max_out_len = min(
max_out_len,
max_seq_len - get_token_len_func(str(input)) - 100)
max_out_len = min(max_out_len, max_seq_len - input_len - 100)
if max_out_len <= 0:
raise ValueError(
f'max_out_len ({max_out_len}) is less than or equal to 0. '
f'This may be due to input length '
f'({get_token_len_func(str(input))}) being too close to '
f'max_seq_len ({max_seq_len}). Please either increase '
f'This may be due to input length ({input_len}) being too '
f'close to max_seq_len ({max_seq_len}). Please increase '
f'max_seq_len or use a truncation mode other than "none".')
if max_out_len < original_max_out_len:
self.logger.warning(
@ -555,7 +549,6 @@ class OpenAISDK(OpenAI):
temperature: float | None = None,
tokenizer_path: str | None = None,
extra_body: Dict | None = None,
max_completion_tokens: int = 16384,
verbose: bool = False,
status_code_mappings: dict = {},
):
@ -577,7 +570,6 @@ class OpenAISDK(OpenAI):
tokenizer_path,
extra_body,
verbose=verbose,
max_completion_tokens=max_completion_tokens,
)
from openai import OpenAI
@ -605,8 +597,23 @@ class OpenAISDK(OpenAI):
self.logger.info(f'Used openai_client: {self.openai_client}')
self.status_code_mappings = status_code_mappings
def _generate(self, input: PromptList | str, max_out_len: int,
temperature: float) -> str:
def _generate(self,
input: PromptList | str,
max_out_len: int,
temperature: float,
timeout: int = 3600) -> str:
"""Generate results given a list of inputs.
Args:
input (PromptType): A string or PromptDict.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use.
timeout (int, optional): Timeout in seconds for the API call.
Defaults to 3600 (60 minutes).
Returns:
str: The generated string.
"""
from openai import APIStatusError, BadRequestError
assert isinstance(input, (str, PromptList))
@ -618,16 +625,14 @@ class OpenAISDK(OpenAI):
num_retries = 0
while num_retries < self.retry:
self.wait()
if self.path in O1_MODEL_LIST:
if any(model in self.path for model in O1_MODEL_LIST):
self.logger.warning(
f"'max_token' is unsupported for model {self.path}")
self.logger.warning(
f'We use max_completion_tokens: '
f'{self.max_completion_tokens}for this query')
f'We use max_out_len: {max_out_len} for this query')
query_data = dict(
model=self.path,
max_completion_tokens=self.max_completion_tokens,
max_completion_tokens=max_out_len,
n=1,
messages=messages,
extra_body=self.extra_body,
@ -646,7 +651,8 @@ class OpenAISDK(OpenAI):
if self.verbose:
self.logger.info('Start calling OpenAI API')
responses = self.openai_client.chat.completions.create(
**query_data)
**query_data, timeout=timeout) # timeout in seconds
if self.verbose:
self.logger.info(
'Successfully get response from OpenAI API')

View File

@ -329,4 +329,4 @@ class LMEvaluator:
else:
kwargs = self.dict_postprocessor
proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
return proc(output, self.output_path, **kwargs)
return proc(output, self.output_path, **kwargs)

View File

@ -0,0 +1,154 @@
from latex2sympy2_extended import NormalizationConfig
from math_verify import (ExprExtractionConfig, LatexExtractionConfig, parse,
verify)
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS
@ICL_EVALUATORS.register_module()
class MATHEvaluator(BaseEvaluator):
def score(self, predictions, references):
self.is_num_equal(predictions, references)
correct = 0
count = 0
details = []
for i, j in zip(predictions, references):
count += 1
gold_parsed = parse(
j,
extraction_mode='first_match',
extraction_config=[
LatexExtractionConfig(),
ExprExtractionConfig(),
],
)
# If parsing result is empty, try adding LaTeX
# environment and parse again
if len(gold_parsed) == 0:
j_with_env = f'${j}$'
gold_parsed = parse(
j_with_env,
extraction_mode='first_match',
extraction_config=[
LatexExtractionConfig(),
ExprExtractionConfig(),
],
)
if len(gold_parsed) != 0:
# We require the answer to be provided in correct
# latex (no malformed operators)
answer_parsed = parse(
i,
extraction_config=[
LatexExtractionConfig(
normalization_config=NormalizationConfig(
nits=False,
malformed_operators=False,
basic_latex=True,
equations=True,
boxed='all',
units=True,
),
# Ensures that boxed is tried first
boxed_match_priority=0,
try_extract_without_anchor=False,
)
],
extraction_mode='first_match',
)
answer_correct = float(verify(answer_parsed, gold_parsed))
correct += answer_correct
detail = {
'pred': str(answer_parsed),
'answer': str(gold_parsed),
'correct': True if answer_correct else False,
}
details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details}
return result
if __name__ == '__main__':
import sympy
test_cases = [
# 1. Basic arithmetic operations
r'Simple fraction: \boxed{\frac{1}{2}}',
r'Addition: \boxed{2 + 3}',
r'Multiplication: \boxed{2 \times 3}',
# 2. Algebraic expressions
r'Quadratic: \boxed{x^2 + 2x + 1}',
r'Polynomial: \boxed{3x^3 - 2x^2 + 4x - 5}',
# 3. Trigonometric functions
r'Trigonometry: \boxed{\sin(x) + \cos(x)}',
r'Complex trig: \boxed{\tan^2(x) + \sec^2(x)}',
# 4. Roots and exponents
r'Square root: \boxed{\sqrt{16}}',
r'Complex root: \boxed{\sqrt[3]{x^2 + 1}}',
# 5. Logarithms
r'Natural log: \boxed{\ln(e^2)}',
r'Log base: \boxed{\log_2(8)}',
# 6. Limits and summations
r'Limit: \boxed{\lim_{x \to 0} \frac{\sin(x)}{x}}',
r'Sum: \boxed{\sum_{i=1}^{n} i}',
# 7. Integrals
r'Integral: \boxed{\int_{0}^{1} x^2 dx}',
r'Double integral: \boxed{\int_{0}^{1}\int_{0}^{1} xy \,dx\,dy}',
# 8. Matrices
r'Matrix: \boxed{\begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix}}',
# 9. Complex combinations
r'Complex expr: \boxed{\frac{\sqrt{x^2 + 1}}{\ln(x)} + '
r'\int_{0}^{x} t^2 dt}',
# 10. Error cases
r'Empty: \boxed{}',
r'Invalid: \boxed{\frac{1}}', # Missing denominator
r'Nested: \boxed{\boxed{1}}', # Nested boxed
]
def print_result(expr: str, result: list):
print('\n' + '=' * 50)
print(f'Input: {expr}')
print(f'Output type: {type(result)}')
print(f'Output: {result}')
# If result is sympy expression, show more information
if result:
for item in result:
if isinstance(item, sympy.Basic):
print(f'Sympy repr: {repr(item)}')
try:
print(f'Evaluated: {item.evalf()}')
except Exception as e:
print(f'Cannot evaluate: {e}')
# Test all cases
for test_expr in test_cases:
try:
result = parse(test_expr)
print_result(test_expr, result)
except Exception as e:
print(f'\nError processing {test_expr}: {e}')
# Special test: verify numerical calculations
numerical_tests = [
r'\boxed{2 + 2}', # Should equal 4
r'\boxed{\frac{1}{2} + \frac{1}{3}}', # Should equal 5/6
r'\boxed{\sqrt{16} + \sqrt{9}}', # Should equal 7
]
print('\n' + '=' * 50 + '\nNumerical Verification Tests:')
for test_expr in numerical_tests:
try:
result = parse(test_expr)
if result and isinstance(result[0], sympy.Basic):
expr = result[0]
print(f'\nExpression: {test_expr}')
print(f'Symbolic: {expr}')
print(f'Numerical value: {float(expr.evalf())}')
except Exception as e:
print(f'\nError in numerical test {test_expr}: {e}')

View File

@ -286,7 +286,7 @@ class DLCRunner(BaseRunner):
f'Failed to get job info for {job_id}')
status = job_info['Status']
if status == 'Failed':
if status == 'Failed' or status == 'Stopped':
return -1
elif status == 'Succeeded':
return 0

View File

@ -34,39 +34,29 @@ MAP = {
'总分',
'中文总分',
'英文总分',
'instruct/compassbenchv1_4_IF_en_fofo_sub',
'instruct/compassbenchv1_4_IF_zh_fofo_sub',
'instruct/compassbench_2501_IF_en_chatIF_sub',
'instruct/compassbench_2501_IF_en_functionalIF_sub',
'instruct/compassbench_2501_IF_cn_chatIF_sub',
'instruct/compassbench_2501_IF_cn_functionalIF_sub',
],
'language': [
'总分',
'中文总分',
'英文总分',
'language/compassbenchv1_4_language_zh_chat_sub',
'language/compassbenchv1_4_language_zh_creation_sub',
'language/compassbenchv1_4_language_zh_NLP_sub',
'language/compassbenchv1_4_language_en_chat_sub',
'language/compassbenchv1_4_language_en_creation_sub',
'language/compassbenchv1_4_language_en_NLP_sub',
'language/compassbench_v2501_language_zh_chat_sub',
'language/compassbench_v2501_language_zh_nlp_sub',
'language/compassbench_v2501_language_zh_creation_sub',
'language/compassbench_v2501_language_en_chat_sub',
'language/compassbench_v2501_language_en_nlp_sub',
'language/compassbench_v2501_language_en_creation_sub',
],
'reasoning': [
'code': [
'总分',
'中文总分',
'英文总分',
'reasoning/compassbenchv1_4_reasoning_en_CommonSenseSense_sub',
'reasoning/compassbenchv1_4_reasoning_en_Humanities_sub',
'reasoning/compassbenchv1_4_reasoning_en_ScienceEngineering_sub',
'reasoning/compassbenchv1_4_reasoning_en_Social_sub',
'reasoning/compassbenchv1_4_reasoning_zh_CommonSenseSense_sub',
'reasoning/compassbenchv1_4_reasoning_zh_Humanities_sub',
'reasoning/compassbenchv1_4_reasoning_zh_ScienceEngineering_sub',
'reasoning/compassbenchv1_4_reasoning_zh_Social_sub',
],
'coding': [
'总分',
'中文总分',
'英文总分',
'coding/compassbenchv1_4_coding_en_sub',
'coding/compassbenchv1_4_coding_zh_sub',
'code/compassbench_2501_code_arena_en_sub',
'code/compassbench_2501_code_arena_zh_sub',
],
}

View File

@ -1,6 +1,5 @@
import argparse
import copy
import fnmatch
import math
import os
import os.path as osp
@ -18,9 +17,8 @@ from mmengine.utils import mkdir_or_exist
from opencompass.registry import (ICL_EVALUATORS, MODELS, TASKS,
TEXT_POSTPROCESSORS)
from opencompass.tasks.base import BaseTask, extract_role_pred
from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
get_infer_output_path, get_logger,
task_abbr_from_cfg)
from opencompass.utils import (build_dataset_from_cfg, get_infer_output_path,
get_logger, task_abbr_from_cfg)
@TASKS.register_module()
@ -60,19 +58,9 @@ class OpenICLEvalTask(BaseTask):
self.dataset_cfg = dataset_cfg
# Load Dataset
self.eval_cfg = self.dataset_cfg.get('eval_cfg')
self.output_column = dataset_cfg['reader_cfg']['output_column']
# overwrite postprocessor if the model has specified one
ds_abbr = dataset_abbr_from_cfg(self.dataset_cfg)
model_postprocessors = self.model_cfg.get(
'pred_postprocessor', {})
for pattern in model_postprocessors.keys():
if fnmatch.fnmatch(ds_abbr, pattern):
self.eval_cfg[
'pred_postprocessor'] = model_postprocessors[
pattern] # noqa
break
self.eval_cfg = copy.deepcopy(dataset_cfg.get('eval_cfg'))
self.output_column = copy.deepcopy(
dataset_cfg['reader_cfg']['output_column'])
out_path = get_infer_output_path(
self.model_cfg, self.dataset_cfg,
@ -155,8 +143,20 @@ class OpenICLEvalTask(BaseTask):
]
# Postprocess predictions if necessary
# Model Specified Postprocessor
if 'pred_postprocessor' in self.model_cfg:
kwargs = copy.deepcopy(self.model_cfg['pred_postprocessor'])
proc = kwargs.pop('type')
if isinstance(proc, str):
proc = TEXT_POSTPROCESSORS.get(proc)
if pred_list_flag:
pred_strs = [[proc(s, **kwargs) for s in preds]
for preds in pred_strs]
else:
pred_strs = [proc(s, **kwargs) for s in pred_strs]
# Dataset Specified Postprocessor
if 'pred_postprocessor' in self.eval_cfg:
kwargs = self.eval_cfg['pred_postprocessor']
kwargs = copy.deepcopy(self.eval_cfg['pred_postprocessor'])
proc = kwargs.pop('type')
if isinstance(proc, str):
proc = TEXT_POSTPROCESSORS.get(proc)

View File

@ -198,14 +198,24 @@ class SubjectiveEvalTask(BaseTask):
if fnmatch.fnmatch(ds_abbr, pattern):
pred_postprocessor = model_postprocessors[pattern]
break
if 'pred_postprocessor' in eval_cfg['evaluator'] or pred_postprocessor:
kwargs = pred_postprocessor or eval_cfg['evaluator'][
'pred_postprocessor']
proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
temp_kwargs = copy.deepcopy(kwargs)
proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type'))
self.logger.info(f'Get postprocessor {proc}.')
pred_strs = [proc(s, **kwargs) for s in pred_strs]
pred_strs = [proc(s, **temp_kwargs) for s in pred_strs]
else:
self.logger.info('No postprocessor found.')
self.logger.info('No dataset postprocessor found.')
if 'pred_postprocessor' in model_cfg or pred_postprocessor:
kwargs = pred_postprocessor or model_cfg['pred_postprocessor']
temp_kwargs = copy.deepcopy(kwargs)
proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type'))
pred_strs = [proc(s, **temp_kwargs) for s in pred_strs]
else:
self.logger.info('No model postprocessor found.')
return {
'model_name': model_abbr_from_cfg(model_cfg),
@ -329,7 +339,9 @@ class SubjectiveEvalTask(BaseTask):
if fnmatch.fnmatch(ds_abbr, pattern):
pred_postprocessor = model_postprocessors[pattern]
break
if 'pred_postprocessor' in eval_cfg or pred_postprocessor:
kwargs = pred_postprocessor or eval_cfg['pred_postprocessor']
proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
pred_strs = [proc(s, **kwargs) for s in pred_strs]

View File

@ -37,6 +37,7 @@ def general_cn_postprocess(text: str) -> str:
cleaned_text = re.sub(r'\s+', ' ', no_articles).strip()
import jieba
cleaned_text = ' '.join(jieba.cut(text))
return cleaned_text
@ -57,6 +58,18 @@ def last_capital_postprocess(text: str) -> str:
return ''
@TEXT_POSTPROCESSORS.register_module('think_pred')
def think_pred_postprocess(
prediction: str,
re_pattern: str,
) -> str:
match = re.search(re_pattern, prediction)
if match:
return match.group(1).strip()
else:
return prediction
def first_option_postprocess(text: str, options: str, cushion=True) -> str:
"""Find first valid option for text."""
@ -229,3 +242,44 @@ def match_answer_pattern(response_text: str, answer_pattern: str):
match = re.search(answer_pattern, response_text)
extracted_answer = match.group(1) if match else ''
return extracted_answer
@TEXT_POSTPROCESSORS.register_module('extract-non-reasoning-content')
def extract_non_reasoning_content(
text: str,
think_start_token: str = '<think>',
think_end_token: str = '</think>',
) -> str:
"""Extract content after the last reasoning tag from text.
When only end token is present, returns content after the end token.
When both tokens are present, removes all content between start and end tokens.
Args:
text (str): Input text containing reasoning tags.
think_start_token (str, optional): Start token for reasoning section. Defaults to '<think>'.
think_end_token (str, optional): End token for reasoning section. Defaults to '</think>'.
Returns:
str: Processed text after removing reasoning sections.
Examples:
>>> # When only end token exists
>>> text = "This is a test.</think> How are you?"
>>> extract_non_reasoning_content(text)
'How are you?'
>>> # When both tokens exist
>>> text = "Start<think>reasoning here</think> End"
>>> extract_non_reasoning_content(text)
'Start End'
"""
# If text contains only end token, split by end token and take the last part
if think_start_token not in text and think_end_token in text:
return text.split(think_end_token)[-1].strip()
# Original behavior for complete tag pairs
reasoning_regex = re.compile(rf'{think_start_token}(.*?){think_end_token}',
re.DOTALL)
non_reasoning_content = reasoning_regex.sub('', text).strip()
return non_reasoning_content
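A quick sketch of the new `think_pred` postprocessor in use; the sample text and regex are illustrative assumptions, not taken from this diff:

```python
from opencompass.utils.text_postprocessors import think_pred_postprocess

sample = 'Let me think...\nFinal Answer: 42'
# Extracts the first capture group of the given pattern, or returns the
# prediction unchanged when the pattern does not match.
print(think_pred_postprocess(sample, re_pattern=r'Final Answer: (.*)'))  # -> '42'
```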

View File

@ -15,6 +15,8 @@ langdetect
latex2sympy2
# Lawbench, leval
ltp
# Math
math-verify
# Taco, apps Dataset
pyext
# Law Bench