Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

Commit 4c9366d6fc: Merge branch 'main' of github.com:open-compass/opencompass into tmp_olmpbench
@@ -366,7 +366,7 @@ internlm2_5-7b-chat-turbomind:
      openai_mmmlu_lite_DE-DE_accuracy: 51.27
      openai_mmmlu_lite_ES-LA_accuracy: 56.94
      openai_mmmlu_lite_FR-FR_accuracy: 58.22
-     openai_mmmlu_lite_HI-IN_accuracy: 33.75
+     openai_mmmlu_lite_HI-IN_accuracy: 30.75
      openai_mmmlu_lite_ID-ID_accuracy: 50.6
      openai_mmmlu_lite_IT-IT_accuracy: 50.6
      openai_mmmlu_lite_JA-JP_accuracy: 51.13
@@ -394,7 +394,7 @@ internlm2_5-7b-chat-turbomind:
      CompassArena_naive_average: 34.61
      FoFo_naive_average: 0.38
      mtbench101_avg: 8.01
-     wildbench_average: -15.69
+     wildbench_average: -10.49
      simpleqa_accuracy_given_attempted: 0.04
      chinese_simpleqa_given_attempted_accuracy: 0.34
      alignment_bench_v1_1_专业能力: 6.05
@@ -414,7 +414,7 @@ internlm2_5-7b-chat-turbomind:
      compassarena_knowledge_naive_average: 36
      compassarena_reason_v2_naive_average: 35
      compassarena_math_v2_naive_average: 19.91
-     compassarena_creationv2_zh_naive_average: 35.81
+     compassarena_creationv2_zh_naive_average: 43.64
      fofo_test_prompts_overall: 0.35
      fofo_test_prompts_cn_overall: 0.41
      followbench_llmeval_en_HSR_AVG: 0.73

.github/scripts/oc_score_baseline_testrange.yaml (vendored, 20 lines changed)
@@ -15,13 +15,13 @@ chat:
      gsm8k_accuracy: 50
      race-high_accuracy: 68.75
  deepseek-7b-chat-vllm:
-     gsm8k_accuracy: 43.75
-     race-high_accuracy: 75
+     gsm8k_accuracy: 50
+     race-high_accuracy: 78.12
  gemma2-2b-it-hf:
      gsm8k_accuracy: 50
-     race-high_accuracy: 71.88
+     race-high_accuracy: 75
  gemma2-9b-it-hf:
-     gsm8k_accuracy: 71.88
+     gsm8k_accuracy: 68.75
      race-high_accuracy: 84.38
  gemma-2b-it-hf:
      gsm8k_accuracy: 3.12
@@ -36,7 +36,7 @@ chat:
      gsm8k_accuracy: 78.12
      race-high_accuracy: 93.75
  gemma-7b-it-vllm:
-     gsm8k_accuracy: 34.38
+     gsm8k_accuracy: 46.88
      race-high_accuracy: 68.75
  internlm2_5-7b-chat-hf:
      gsm8k_accuracy: 84.38
@@ -57,7 +57,7 @@ chat:
      gsm8k_accuracy: 53.12
      race-high_accuracy: 90.62
  internlm2-chat-7b-vllm:
-     gsm8k_accuracy: 56.25
+     gsm8k_accuracy: 43.75
      race-high_accuracy: 84.38
  llama-3_1-8b-instruct-hf:
      gsm8k_accuracy: 84.38
@@ -90,13 +90,13 @@ chat:
      gsm8k_accuracy: 75
      race-high_accuracy: 81.25
  mistral-nemo-instruct-2407-turbomind:
-     gsm8k_accuracy: 65.62
-     race-high_accuracy: 87.50
+     gsm8k_accuracy: 71.88
+     race-high_accuracy: 78.12
  mistral-7b-instruct-v0.1-vllm:
      gsm8k_accuracy: 34.38
      race-high_accuracy: 68.75
  mistral-7b-instruct-v0.2-vllm:
-     gsm8k_accuracy: 43.75
+     gsm8k_accuracy: 31.25
      race-high_accuracy: 75
  phi-3-mini-4k-instruct-hf:
      gsm8k_accuracy: 81.25
@@ -177,7 +177,7 @@ chat:
      gsm8k_accuracy: 93.75
      race-high_accuracy: 87.5
  mixtral-8x7b-instruct-v0.1-hf:
-     gsm8k_accuracy: 56.25
+     gsm8k_accuracy: 59.38
      race-high_accuracy: 81.25
  mixtral-large-instruct-2411-turbomind:
      gsm8k_accuracy: 90.62

.github/workflows/daily-run-test.yml (vendored, 13 lines changed)
@@ -17,7 +17,7 @@ on:
        required: false
        description: 'whether to build lmdeploy'
        type: boolean
-       default: false
+       default: true
      repo_org_lmdeploy:
        required: false
        description: 'Tested repository organization name. Default is internlm/lmdeploy'
@@ -162,15 +162,16 @@ jobs:
          pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
          cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
      - name: Prepare - reinstall lmdeploy - cu12
-       if: ${{inputs.build_lmdeploy}}
+       if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
        uses: actions/download-artifact@v4
        with:
          name: my-artifact-${{ github.run_id }}-py310
      - name: Prepare - reinstall lmdeploy - cu12
-       if: ${{inputs.build_lmdeploy}}
+       if: ${{github.event_name == 'schedule' || inputs.build_lmdeploy}}
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          pip uninstall -y lmdeploy
          pip install lmdeploy-*.whl --no-deps
      - name: conda env
        run: |
@@ -188,7 +189,7 @@ jobs:
        regression_func: ${{fromJSON(github.event.inputs.regression_func_volc || '["chat_models","base_models","chat_obj_fullbench","base_fullbench"]')}}
    runs-on: volc_cu12_daily
    environment: 'prod'
-   timeout-minutes: 120 #2hours
+   timeout-minutes: 180 #3hours
    steps:
    - name: Clone repository
      uses: actions/checkout@v2
@@ -275,7 +276,7 @@ jobs:
          conda info --envs
          lmdeploy serve api_server internlm/internlm2_5-7b-chat --max-batch-size 256 --model-name internlm2 > ${{env.REPORT_ROOT}}/${{ github.run_id }}/restful.log 2>&1 &
          echo "restful_pid=$!" >> "$GITHUB_ENV"
-         sleep 120s
+         sleep 180s
          opencompass .github/scripts/eval_regression_api.py --work-dir ${{env.REPORT_ROOT}}/${{ github.run_id }}/api --reuse --max-num-workers 2 --dump-eval-details
          rm regression_result_daily -f && ln -s ${{env.REPORT_ROOT}}/${{ github.run_id }}/api/*/summary regression_result_daily
          python -m pytest -m api -s -v --color=yes .github/scripts/oc_score_assert.py
@@ -334,7 +335,7 @@ jobs:

  notify_to_feishu:
-   if: ${{ always() && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
+   if: ${{ always() && github.event_name == 'schedule' && !cancelled() && contains(needs.*.result, 'failure') && (github.ref_name == 'develop' || github.ref_name == 'main') }}
    needs: [daily_run_test_volc, daily_run_test_local, fullbench_run_test]
    timeout-minutes: 5
    runs-on: self-hosted

README.md (264 lines changed)
@@ -58,9 +58,9 @@ Just like a compass guides us on our journey, OpenCompass will guide you through

## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>

+ - **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model which has enhanced performance on reasoning and knowledge-intensive tasks.
- - **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](configs/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it.
- - **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](configs/eval_musr.py) and give it a spin! 🔥🔥🔥
- - **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](configs/eval_babilong.py) and give it a try! 🔥🔥🔥
+ - **\[2024.12.17\]** We have provided the evaluation script for the December [CompassAcademic](examples/eval_academic_leaderboard_202412.py), which allows users to easily reproduce the official evaluation results by configuring it.
+ - **\[2024.11.14\]** OpenCompass now offers support for a sophisticated benchmark designed to evaluate complex reasoning skills — [MuSR](https://arxiv.org/pdf/2310.16049). Check out the [demo](examples/eval_musr.py) and give it a spin! 🔥🔥🔥
+ - **\[2024.11.14\]** OpenCompass now supports the brand new long-context language model evaluation benchmark — [BABILong](https://arxiv.org/pdf/2406.10149). Have a look at the [demo](examples/eval_babilong.py) and give it a try! 🔥🔥🔥
  - **\[2024.10.14\]** We now support the OpenAI multilingual QA dataset [MMMLU](https://huggingface.co/datasets/openai/MMMLU). Feel free to give it a try! 🔥🔥🔥
  - **\[2024.09.19\]** We now support [Qwen2.5](https://huggingface.co/Qwen)(0.5B to 72B) with multiple backend(huggingface/vllm/lmdeploy). Feel free to give them a try! 🔥🔥🔥
  - **\[2024.09.17\]** We now support OpenAI o1(`o1-mini-2024-09-12` and `o1-preview-2024-09-12`). Feel free to give them a try! 🔥🔥🔥
@@ -279,263 +279,13 @@ OpenCompass is a one-stop platform for large model evaluation, aiming to provide

## 📖 Dataset Support

+ We have supported a statistical list of all datasets that can be used on this platform in the documentation on the OpenCompass website.
+
+ You can quickly find the dataset you need from the list through sorting, filtering, and searching functions.
+
+ Please refer to the dataset statistics chapter of [official document](https://opencompass.org.cn/doc) for details.

- Language: Word Definition (WiC, SummEdits); Idiom Learning (CHID); Semantic Similarity (AFQMC, BUSTM); Coreference Resolution (CLUEWSC, WSC, WinoGrande); Translation (Flores, IWSLT2017); Multi-language Question Answering (TyDi-QA, XCOPA); Multi-language Summary (XLSum)
- Knowledge: Knowledge Question Answering (BoolQ, CommonSenseQA, NaturalQuestions, TriviaQA)
- Reasoning: Textual Entailment (CMNLI, OCNLI, OCNLI_FC, AX-b, AX-g, CB, RTE, ANLI); Commonsense Reasoning (StoryCloze, COPA, ReCoRD, HellaSwag, PIQA, SIQA); Mathematical Reasoning (MATH, GSM8K); Theorem Application (TheoremQA, StrategyQA, SciBench); Comprehensive Reasoning (BBH)
- Examination: Junior High, High School, University, Professional Examinations (C-Eval, AGIEval, MMLU, GAOKAO-Bench, CMMLU, ARC, Xiezhi); Medical Examinations (CMB)
- Understanding: Reading Comprehension (C3, CMRC, DRCD, MultiRC, RACE, DROP, OpenBookQA, SQuAD2.0); Content Summary (CSL, LCSTS, XSum, SummScreen); Content Analysis (EPRSTMT, LAMBADA, TNEWS)
- Long Context: Long Context Understanding (LEval, LongBench, GovReports, NarrativeQA, Qasper)
- Safety: Safety (CivilComments, CrowsPairs, CValues, JigsawMultilingual, TruthfulQA); Robustness (AdvGLUE)
- Code: Code (HumanEval, HumanEvalX, MBPP, APPs, DS1000)

<p align="right"><a href="#top">🔝Back to top</a></p>

## 📖 Model Support


README_zh-CN.md (258 lines changed)
@@ -274,263 +274,11 @@ OpenCompass is a one-stop platform for large model evaluation. Its main features are as follows

## 📖 Dataset Support

+ We have supported a statistical list of all datasets that can be used on this platform in the documentation on the OpenCompass website.
+
+ You can quickly find the dataset you need from the list through sorting, filtering, and searching functions.
+
+ For details, please refer to the dataset statistics chapter of the [official documentation](https://opencompass.org.cn/doc).

- Language: Word Definition (WiC, SummEdits); Idiom Learning (CHID); Semantic Similarity (AFQMC, BUSTM); Coreference Resolution (CLUEWSC, WSC, WinoGrande); Translation (Flores, IWSLT2017); Multi-language Question Answering (TyDi-QA, XCOPA); Multi-language Summary (XLSum)
- Knowledge: Knowledge Question Answering (BoolQ, CommonSenseQA, NaturalQuestions, TriviaQA)
- Reasoning: Textual Entailment (CMNLI, OCNLI, OCNLI_FC, AX-b, AX-g, CB, RTE, ANLI); Commonsense Reasoning (StoryCloze, COPA, ReCoRD, HellaSwag, PIQA, SIQA); Mathematical Reasoning (MATH, GSM8K); Theorem Application (TheoremQA, StrategyQA, SciBench); Comprehensive Reasoning (BBH)
- Examination: Junior High, High School, University, Professional Examinations (C-Eval, AGIEval, MMLU, GAOKAO-Bench, CMMLU, ARC, Xiezhi); Medical Examinations (CMB)
- Understanding: Reading Comprehension (C3, CMRC, DRCD, MultiRC, RACE, DROP, OpenBookQA, SQuAD2.0); Content Summary (CSL, LCSTS, XSum, SummScreen); Content Analysis (EPRSTMT, LAMBADA, TNEWS)
- Long Context: Long Context Understanding (LEval, LongBench, GovReports, NarrativeQA, Qasper)
- Safety: Safety (CivilComments, CrowsPairs, CValues, JigsawMultilingual, TruthfulQA); Robustness (AdvGLUE)
- Code: Code (HumanEval, HumanEvalX, MBPP, APPs, DS1000)

<p align="right"><a href="#top">🔝Back to top</a></p>


dataset-index.yml (new file, 734 lines)
@ -0,0 +1,734 @@
|
||||
|
||||
- ifeval:
|
||||
name: IFEval
|
||||
category: Instruction Following
|
||||
paper: https://arxiv.org/pdf/2311.07911
|
||||
configpath: opencompass/configs/datasets/IFEval
|
||||
- nphard:
|
||||
name: NPHardEval
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/2312.14890v2
|
||||
configpath: opencompass/configs/datasets/NPHardEval
|
||||
- pmmeval:
|
||||
name: PMMEval
|
||||
category: Language
|
||||
paper: https://arxiv.org/pdf/2411.09116v1
|
||||
configpath: opencompass/configs/datasets/PMMEval
|
||||
- theoremqa:
|
||||
name: TheroremQA
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/2305.12524
|
||||
configpath: opencompass/configs/datasets/TheroremQA
|
||||
- agieval:
|
||||
name: AGIEval
|
||||
category: Examination
|
||||
paper: https://arxiv.org/pdf/2304.06364
|
||||
configpath: opencompass/configs/datasets/agieval
|
||||
- babilong:
|
||||
name: BABILong
|
||||
category: Long Context
|
||||
paper: https://arxiv.org/pdf/2406.10149
|
||||
configpath: opencompass/configs/datasets/babilong
|
||||
- bigcodebench:
|
||||
name: BigCodeBench
|
||||
category: Code
|
||||
paper: https://arxiv.org/pdf/2406.15877
|
||||
configpath: opencompass/configs/datasets/bigcodebench
|
||||
- calm:
|
||||
name: CaLM
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/2405.00622
|
||||
configpath: opencompass/configs/datasets/calm
|
||||
- infinitebench:
|
||||
name: InfiniteBench (∞Bench)
|
||||
category: Long Context
|
||||
paper: https://aclanthology.org/2024.acl-long.814.pdf
|
||||
configpath: opencompass/configs/datasets/infinitebench
|
||||
- korbench:
|
||||
name: KOR-Bench
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/2410.06526v1
|
||||
configpath: opencompass/configs/datasets/korbench
|
||||
- lawbench:
|
||||
name: LawBench
|
||||
category: Knowledge / Law
|
||||
paper: https://arxiv.org/pdf/2309.16289
|
||||
configpath: opencompass/configs/datasets/lawbench
|
||||
- leval:
|
||||
name: L-Eval
|
||||
category: Long Context
|
||||
paper: https://arxiv.org/pdf/2307.11088v1
|
||||
configpath: opencompass/configs/datasets/leval
|
||||
- livecodebench:
|
||||
name: LiveCodeBench
|
||||
category: Code
|
||||
paper: https://arxiv.org/pdf/2403.07974
|
||||
configpath: opencompass/configs/datasets/livecodebench
|
||||
- livemathbench:
|
||||
name: LiveMathBench
|
||||
category: Math
|
||||
paper: https://arxiv.org/pdf/2412.13147
|
||||
configpath: opencompass/configs/datasets/livemathbench
|
||||
- longbench:
|
||||
name: LongBench
|
||||
category: Long Context
|
||||
paper: https://github.com/THUDM/LongBench
|
||||
configpath: opencompass/configs/datasets/livemathbench
|
||||
- lveval:
|
||||
name: LV-Eval
|
||||
category: Long Context
|
||||
paper: https://arxiv.org/pdf/2402.05136
|
||||
configpath: opencompass/configs/datasets/lveval
|
||||
- medbench:
|
||||
name: MedBench
|
||||
category: Knowledge / Medicine
|
||||
paper: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=10778138
|
||||
configpath: opencompass/configs/datasets/MedBench
|
||||
- musr:
|
||||
name: MuSR
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/2310.16049
|
||||
configpath: opencompass/configs/datasets/musr
|
||||
- needlebench:
|
||||
name: NeedleBench
|
||||
category: Long Context
|
||||
paper: https://arxiv.org/pdf/2407.11963
|
||||
configpath: opencompass/configs/datasets/needlebench
|
||||
- ruler:
|
||||
name: RULER
|
||||
category: Long Context
|
||||
paper: https://arxiv.org/pdf/2404.06654
|
||||
configpath: opencompass/configs/datasets/ruler
|
||||
- alignment:
|
||||
name: AlignBench
|
||||
category: Subjective / Alignment
|
||||
paper: https://arxiv.org/pdf/2311.18743
|
||||
configpath: opencompass/configs/datasets/subjective/alignbench
|
||||
- alpaca:
|
||||
name: AlpacaEval
|
||||
category: Subjective / Instruction Following
|
||||
paper: https://github.com/tatsu-lab/alpaca_eval
|
||||
configpath: opencompass/configs/datasets/subjective/aplaca_eval
|
||||
- arenahard:
|
||||
name: Arena-Hard
|
||||
category: Subjective / Chatbot
|
||||
paper: https://lmsys.org/blog/2024-04-19-arena-hard/
|
||||
configpath: opencompass/configs/datasets/subjective/arena_hard
|
||||
- flames:
|
||||
name: FLAMES
|
||||
category: Subjective / Alignment
|
||||
paper: https://arxiv.org/pdf/2311.06899
|
||||
configpath: opencompass/configs/datasets/subjective/flames
|
||||
- fofo:
|
||||
name: FOFO
|
||||
category: Subjective / Format Following
|
||||
paper: https://arxiv.org/pdf/2402.18667
|
||||
configpath: opencompass/configs/datasets/subjective/fofo
|
||||
- followbench:
|
||||
name: FollowBench
|
||||
category: Subjective / Instruction Following
|
||||
paper: https://arxiv.org/pdf/2310.20410
|
||||
configpath: opencompass/configs/datasets/subjective/followbench
|
||||
- hellobench:
|
||||
name: HelloBench
|
||||
category: Subjective / Long Context
|
||||
paper: https://arxiv.org/pdf/2409.16191
|
||||
configpath: opencompass/configs/datasets/subjective/hellobench
|
||||
- judgerbench:
|
||||
name: JudgerBench
|
||||
category: Subjective / Long Context
|
||||
paper: https://arxiv.org/pdf/2410.16256
|
||||
configpath: opencompass/configs/datasets/subjective/judgerbench
|
||||
- multiround:
|
||||
name: MT-Bench-101
|
||||
category: Subjective / Multi-Round
|
||||
paper: https://arxiv.org/pdf/2402.14762
|
||||
configpath: opencompass/configs/datasets/subjective/multiround
|
||||
- wildbench:
|
||||
name: WildBench
|
||||
category: Subjective / Real Task
|
||||
paper: https://arxiv.org/pdf/2406.04770
|
||||
configpath: opencompass/configs/datasets/subjective/wildbench
|
||||
- teval:
|
||||
name: T-Eval
|
||||
category: Tool Utilization
|
||||
paper: https://arxiv.org/pdf/2312.14033
|
||||
configpath: opencompass/configs/datasets/teval
|
||||
- finalceiq:
|
||||
name: FinanceIQ
|
||||
category: Knowledge / Finance
|
||||
paper: https://github.com/Duxiaoman-DI/XuanYuan/tree/main/FinanceIQ
|
||||
configpath: opencompass/configs/datasets/FinanceIQ
|
||||
- gaokaobench:
|
||||
name: GAOKAOBench
|
||||
category: Examination
|
||||
paper: https://arxiv.org/pdf/2305.12474
|
||||
configpath: opencompass/configs/datasets/GaokaoBench
|
||||
- lcbench:
|
||||
name: LCBench
|
||||
category: Code
|
||||
paper: https://github.com/open-compass/CodeBench/
|
||||
configpath: opencompass/configs/datasets/LCBench
|
||||
- MMLUArabic:
|
||||
name: ArabicMMLU
|
||||
category: Language
|
||||
paper: https://arxiv.org/pdf/2402.12840
|
||||
configpath: opencompass/configs/datasets/MMLUArabic
|
||||
- OpenFinData:
|
||||
name: OpenFinData
|
||||
category: Knowledge / Finance
|
||||
paper: https://github.com/open-compass/OpenFinData
|
||||
configpath: opencompass/configs/datasets/OpenFinData
|
||||
- QuALITY:
|
||||
name: QuALITY
|
||||
category: Long Context
|
||||
paper: https://arxiv.org/pdf/2112.08608
|
||||
configpath: opencompass/configs/datasets/QuALITY
|
||||
- advglue:
|
||||
name: Adversarial GLUE
|
||||
category: Safety
|
||||
paper: https://openreview.net/pdf?id=GF9cSKI3A_q
|
||||
configpath: opencompass/configs/datasets/adv_glue
|
||||
- afqmcd:
|
||||
name: CLUE / AFQMC
|
||||
category: Language
|
||||
paper: https://arxiv.org/pdf/2004.05986
|
||||
configpath: opencompass/configs/datasets/CLUE_afqmc
|
||||
- aime2024:
|
||||
name: AIME2024
|
||||
category: Examination
|
||||
paper: https://huggingface.co/datasets/Maxwell-Jia/AIME_2024
|
||||
configpath: opencompass/configs/datasets/aime2024
|
||||
- anli:
|
||||
name: Adversarial NLI
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/1910.14599v2
|
||||
configpath: opencompass/configs/datasets/anli
|
||||
- anthropics_evals:
|
||||
name: Anthropics Evals
|
||||
category: Safety
|
||||
paper: https://arxiv.org/pdf/2212.09251
|
||||
configpath: opencompass/configs/datasets/anthropics_evals
|
||||
- apps:
|
||||
name: APPS
|
||||
category: Code
|
||||
paper: https://arxiv.org/pdf/2105.09938
|
||||
configpath: opencompass/configs/datasets/apps
|
||||
- arc:
|
||||
name: ARC
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/1803.05457
|
||||
configpath: [opencompass/configs/datasets/ARC_c, opencompass/configs/datasets/ARC_e]
|
||||
- arc_prize_public_eval:
|
||||
name: ARC Prize
|
||||
category: ARC-AGI
|
||||
paper: https://arcprize.org/guide#private
|
||||
configpath: opencompass/configs/datasets/ARC_Prize_Public_Evaluation
|
||||
- ax:
|
||||
name: SuperGLUE / AX
|
||||
category: Reasoning
|
||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||
configpath: [opencompass/configs/datasets/SuperGLUE_AX_b, opencompass/configs/datasets/SuperGLUE_AX_g]
|
||||
- bbh:
|
||||
name: BIG-Bench Hard
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/2210.09261
|
||||
configpath: opencompass/configs/datasets/bbh
|
||||
- BoolQ:
|
||||
name: SuperGLUE / BoolQ
|
||||
category: Knowledge
|
||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||
configpath: opencompass/configs/datasets/SuperGLUE_BoolQ
|
||||
- c3:
|
||||
name: CLUE / C3 (C³)
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/2004.05986
|
||||
configpath: opencompass/configs/datasets/CLUE_C3
|
||||
- cb:
|
||||
name: SuperGLUE / CB
|
||||
category: Reasoning
|
||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||
configpath: opencompass/configs/datasets/SuperGLUE_CB
|
||||
- ceval:
|
||||
name: C-EVAL
|
||||
category: Examination
|
||||
paper: https://arxiv.org/pdf/2305.08322v1
|
||||
configpath: opencompass/configs/datasets/ceval
|
||||
- charm:
|
||||
name: CHARM
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/2403.14112
|
||||
configpath: opencompass/configs/datasets/CHARM
|
||||
- chembench:
|
||||
name: ChemBench
|
||||
category: Knowledge / Chemistry
|
||||
paper: https://arxiv.org/pdf/2404.01475
|
||||
configpath: opencompass/configs/datasets/ChemBench
|
||||
- chid:
|
||||
name: FewCLUE / CHID
|
||||
category: Language
|
||||
paper: https://arxiv.org/pdf/2107.07498
|
||||
configpath: opencompass/configs/datasets/FewCLUE_chid
|
||||
- chinese_simpleqa:
|
||||
name: Chinese SimpleQA
|
||||
category: Knowledge
|
||||
paper: https://arxiv.org/pdf/2411.07140
|
||||
configpath: opencompass/configs/datasets/chinese_simpleqa
|
||||
- cibench:
|
||||
name: CIBench
|
||||
category: Code
|
||||
paper: https://www.arxiv.org/pdf/2407.10499
|
||||
configpath: opencompass/configs/datasets/CIBench
|
||||
- civilcomments:
|
||||
name: CivilComments
|
||||
category: Safety
|
||||
paper: https://arxiv.org/pdf/1903.04561
|
||||
configpath: opencompass/configs/datasets/civilcomments
|
||||
- clozeTest_maxmin:
|
||||
name: Cloze Test-max/min
|
||||
category: Code
|
||||
paper: https://arxiv.org/pdf/2102.04664
|
||||
configpath: opencompass/configs/datasets/clozeTest_maxmin
|
||||
- cluewsc:
|
||||
name: FewCLUE / CLUEWSC
|
||||
category: Language / WSC
|
||||
paper: https://arxiv.org/pdf/2107.07498
|
||||
configpath: opencompass/configs/datasets/FewCLUE_cluewsc
|
||||
- cmb:
|
||||
name: CMB
|
||||
category: Knowledge / Medicine
|
||||
paper: https://arxiv.org/pdf/2308.08833
|
||||
configpath: opencompass/configs/datasets/cmb
|
||||
- cmmlu:
|
||||
name: CMMLU
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/2306.09212
|
||||
configpath: opencompass/configs/datasets/cmmlu
|
||||
- cmnli:
|
||||
name: CLUE / CMNLI
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/2004.05986
|
||||
configpath: opencompass/configs/datasets/CLUE_cmnli
|
||||
- cmo_fib:
|
||||
name: cmo_fib
|
||||
category: Examination
|
||||
paper: ""
|
||||
configpath: opencompass/configs/datasets/cmo_fib
|
||||
- cmrc:
|
||||
name: CLUE / CMRC
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/2004.05986
|
||||
configpath: opencompass/configs/datasets/CLUE_CMRC
|
||||
- commonsenseqa:
|
||||
name: CommonSenseQA
|
||||
category: Knowledge
|
||||
paper: https://arxiv.org/pdf/1811.00937v2
|
||||
configpath: opencompass/configs/datasets/commonsenseqa
|
||||
- commonsenseqa_cn:
|
||||
name: CommonSenseQA-CN
|
||||
category: Knowledge
|
||||
paper: ""
|
||||
configpath: opencompass/configs/datasets/commonsenseqa_cn
|
||||
- copa:
|
||||
name: SuperGLUE / COPA
|
||||
category: Reasoning
|
||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||
configpath: opencompass/configs/datasets/SuperGLUE_COPA
|
||||
- crowspairs:
|
||||
name: CrowsPairs
|
||||
category: Safety
|
||||
paper: https://arxiv.org/pdf/2010.00133
|
||||
configpath: opencompass/configs/datasets/crowspairs
|
||||
- crowspairs_cn:
|
||||
name: CrowsPairs-CN
|
||||
category: Safety
|
||||
paper: ""
|
||||
configpath: opencompass/configs/datasets/crowspairs_cn
|
||||
- cvalues:
|
||||
name: CVALUES
|
||||
category: Safety
|
||||
paper: http://xdp-expriment.oss-cn-zhangjiakou.aliyuncs.com/shanqi.xgh/release_github/CValues.pdf
|
||||
configpath: opencompass/configs/datasets/cvalues
|
||||
- drcd:
|
||||
name: CLUE / DRCD
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/2004.05986
|
||||
configpath: opencompass/configs/datasets/CLUE_DRCD
|
||||
- drop:
|
||||
name: DROP (DROP Simple Eval)
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/1903.00161
|
||||
configpath: opencompass/configs/datasets/drop
|
||||
- ds1000:
|
||||
name: DS-1000
|
||||
category: Code
|
||||
paper: https://arxiv.org/pdf/2211.11501
|
||||
configpath: opencompass/configs/datasets/ds1000
|
||||
- eprstmt:
|
||||
name: FewCLUE / EPRSTMT
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/2107.07498
|
||||
configpath: opencompass/configs/datasets/FewCLUE_eprstmt
|
||||
- flores:
|
||||
name: Flores
|
||||
category: Language
|
||||
paper: https://aclanthology.org/D19-1632.pdf
|
||||
configpath: opencompass/configs/datasets/flores
|
||||
- game24:
|
||||
name: Game24
|
||||
category: Math
|
||||
paper: https://huggingface.co/datasets/nlile/24-game
|
||||
configpath: opencompass/configs/datasets/game24
|
||||
- govrepcrs:
|
||||
name: Government Report Dataset
|
||||
category: Long Context
|
||||
paper: https://aclanthology.org/2021.naacl-main.112.pdf
|
||||
configpath: opencompass/configs/datasets/govrepcrs
|
||||
- gpqa:
|
||||
name: GPQA
|
||||
category: Knowledge
|
||||
paper: https://arxiv.org/pdf/2311.12022v1
|
||||
configpath: opencompass/configs/datasets/gpqa
|
||||
- gsm8k:
|
||||
name: GSM8K
|
||||
category: Math
|
||||
paper: https://arxiv.org/pdf/2110.14168v2
|
||||
configpath: opencompass/configs/datasets/gsm8k
|
||||
- gsm_hard:
|
||||
name: GSM-Hard
|
||||
category: Math
|
||||
paper: https://proceedings.mlr.press/v202/gao23f/gao23f.pdf
|
||||
configpath: opencompass/configs/datasets/gsm_hard
|
||||
- hellaswag:
|
||||
name: HellaSwag
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/1905.07830
|
||||
configpath: opencompass/configs/datasets/hellaswag
|
||||
- humaneval:
|
||||
name: HumanEval
|
||||
category: Code
|
||||
paper: https://arxiv.org/pdf/2107.03374v2
|
||||
configpath: opencompass/configs/datasets/humaneval
|
||||
- humaneval_cn:
|
||||
name: HumanEval-CN
|
||||
category: Code
|
||||
paper: ""
|
||||
configpath: opencompass/configs/datasets/humaneval_cn
|
||||
- humaneval_multi:
|
||||
name: Multi-HumanEval
|
||||
category: Code
|
||||
paper: https://arxiv.org/pdf/2210.14868
|
||||
configpath: opencompass/configs/datasets/humaneval_multi
|
||||
- humanevalx:
|
||||
name: HumanEval-X
|
||||
category: Code
|
||||
paper: https://dl.acm.org/doi/pdf/10.1145/3580305.3599790
|
||||
configpath: opencompass/configs/datasets/humanevalx
|
||||
- hungarian_math:
|
||||
name: Hungarian_Math
|
||||
category: Math
|
||||
paper: https://huggingface.co/datasets/keirp/hungarian_national_hs_finals_exam
|
||||
configpath: opencompass/configs/datasets/hungarian_exam
|
||||
- iwslt2017:
|
||||
name: IWSLT2017
|
||||
category: Language
|
||||
paper: https://cris.fbk.eu/bitstream/11582/312796/1/iwslt17-overview.pdf
|
||||
configpath: opencompass/configs/datasets/iwslt2017
|
||||
- jigsawmultilingual:
|
||||
name: JigsawMultilingual
|
||||
category: Safety
|
||||
paper: https://www.kaggle.com/competitions/jigsaw-multilingual-toxic-comment-classification/data
|
||||
configpath: opencompass/configs/datasets/jigsawmultilingual
|
||||
- lambada:
|
||||
name: LAMBADA
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/1606.06031
|
||||
configpath: opencompass/configs/datasets/lambada
|
||||
- lcsts:
|
||||
name: LCSTS
|
||||
category: Understanding
|
||||
paper: https://aclanthology.org/D15-1229.pdf
|
||||
configpath: opencompass/configs/datasets/lcsts
|
||||
- livestembench:
|
||||
name: LiveStemBench
|
||||
category: ""
|
||||
paper: ""
|
||||
configpath: opencompass/configs/datasets/livestembench
|
||||
- llm_compression:
|
||||
name: LLM Compression
|
||||
category: Bits Per Character (BPC)
|
||||
paper: https://arxiv.org/pdf/2404.09937
|
||||
configpath: opencompass/configs/datasets/llm_compression
|
||||
- math:
|
||||
name: MATH
|
||||
category: Math
|
||||
paper: https://arxiv.org/pdf/2103.03874
|
||||
configpath: opencompass/configs/datasets/math
|
||||
- math401:
|
||||
name: MATH 401
|
||||
category: Math
|
||||
paper: https://arxiv.org/pdf/2304.02015
|
||||
configpath: opencompass/configs/datasets/math401
|
||||
- mathbench:
|
||||
name: MathBench
|
||||
category: Math
|
||||
paper: https://arxiv.org/pdf/2405.12209
|
||||
configpath: opencompass/configs/datasets/mathbench
|
||||
- mbpp:
|
||||
name: MBPP
|
||||
category: Code
|
||||
paper: https://arxiv.org/pdf/2108.07732
|
||||
configpath: opencompass/configs/datasets/mbpp
|
||||
- mbpp_cn:
|
||||
name: MBPP-CN
|
||||
category: Code
|
||||
paper: ""
|
||||
configpath: opencompass/configs/datasets/mbpp_cn
|
||||
- mbpp_plus:
|
||||
name: MBPP-PLUS
|
||||
category: Code
|
||||
paper: ""
|
||||
configpath: opencompass/configs/datasets/mbpp_plus
|
||||
- mgsm:
|
||||
name: MGSM
|
||||
category: Language / Math
|
||||
paper: https://arxiv.org/pdf/2210.03057
|
||||
configpath: opencompass/configs/datasets/mgsm
|
||||
- mmlu:
|
||||
name: MMLU
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/2009.03300
|
||||
configpath: opencompass/configs/datasets/mmlu
|
||||
- mmlu_cf:
|
||||
name: MMLU-CF
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/2412.15194
|
||||
configpath: opencompass/configs/datasets/mmlu_cf
|
||||
- mmlu_pro:
|
||||
name: MMLU-Pro
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/2406.01574
|
||||
configpath: opencompass/configs/datasets/mmlu_pro
|
||||
- mmmlu:
|
||||
name: MMMLU
|
||||
category: Language / Understanding
|
||||
paper: https://huggingface.co/datasets/openai/MMMLU
|
||||
configpath: opencompass/configs/datasets/mmmlu
|
||||
- multirc:
|
||||
name: SuperGLUE / MultiRC
|
||||
category: Understanding
|
||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||
configpath: opencompass/configs/datasets/SuperGLUE_MultiRC
|
||||
- narrativeqa:
|
||||
name: NarrativeQA
|
||||
category: Understanding
|
||||
paper: https://github.com/google-deepmind/narrativeqa
|
||||
configpath: opencompass/configs/datasets/narrativeqa
|
||||
- natural_question:
|
||||
name: NaturalQuestions
|
||||
category: Knowledge
|
||||
paper: https://github.com/google-research-datasets/natural-questions
|
||||
configpath: opencompass/configs/datasets/nq
|
||||
- natural_question_cn:
|
||||
name: NaturalQuestions-CN
|
||||
category: Knowledge
|
||||
paper: ""
|
||||
configpath: opencompass/configs/datasets/nq_cn
|
||||
- obqa:
|
||||
name: OpenBookQA
|
||||
category: Knowledge
|
||||
paper: https://arxiv.org/pdf/1809.02789v1
|
||||
configpath: opencompass/configs/datasets/obqa
|
||||
- piqa:
|
||||
name: OpenBookQA
|
||||
category: Knowledge / Physics
|
||||
paper: https://arxiv.org/pdf/1911.11641v1
|
||||
configpath: opencompass/configs/datasets/piqa
|
||||
- py150:
|
||||
name: py150
|
||||
category: Code
|
||||
paper: https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/CodeCompletion-line
|
||||
configpath: opencompass/configs/datasets/py150
|
||||
- qasper:
|
||||
name: Qasper
|
||||
category: Long Context
|
||||
paper: https://arxiv.org/pdf/2105.03011
|
||||
configpath: opencompass/configs/datasets/qasper
|
||||
- qaspercut:
|
||||
name: Qasper-Cut
|
||||
category: Long Context
|
||||
paper: ""
|
||||
configpath: opencompass/configs/datasets/qaspercut
|
||||
- race:
|
||||
name: RACE
|
||||
category: Examination
|
||||
paper: https://arxiv.org/pdf/1704.04683
|
||||
configpath: opencompass/configs/datasets/race
|
||||
- realtoxicprompts:
|
||||
name: RealToxicPrompts
|
||||
category: Safety
|
||||
paper: https://arxiv.org/pdf/2009.11462
|
||||
configpath: opencompass/configs/datasets/realtoxicprompts
|
||||
- record:
|
||||
name: SuperGLUE / ReCoRD
|
||||
category: Understanding
|
||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||
configpath: opencompass/configs/datasets/SuperGLUE_ReCoRD
|
||||
- rte:
|
||||
name: SuperGLUE / RTE
|
||||
category: Reasoning
|
||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||
configpath: opencompass/configs/datasets/SuperGLUE_RTE
|
||||
- ocnli:
|
||||
name: CLUE / OCNLI
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/2004.05986
|
||||
configpath: opencompass/configs/datasets/CLUE_ocnli
|
||||
- rolebench:
|
||||
name: RoleBench
|
||||
category: Role Play
|
||||
paper: https://arxiv.org/pdf/2310.00746
|
||||
configpath: opencompass/configs/datasets/rolebench
|
||||
- s3eval:
|
||||
name: S3Eval
|
||||
category: Long Context
|
||||
paper: https://aclanthology.org/2024.naacl-long.69.pdf
|
||||
configpath: opencompass/configs/datasets/s3eval
|
||||
- scibench:
|
||||
name: SciBench
|
||||
category: Reasoning
|
||||
paper: https://sxkdz.github.io/files/publications/ICML/SciBench/SciBench.pdf
|
||||
configpath: opencompass/configs/datasets/scibench
|
||||
- scicode:
|
||||
name: SciCode
|
||||
category: Code
|
||||
paper: https://arxiv.org/pdf/2407.13168
|
||||
configpath: opencompass/configs/datasets/scicode
|
||||
- simpleqa:
|
||||
name: SimpleQA
|
||||
category: Knowledge
|
||||
paper: https://arxiv.org/pdf/2411.04368
|
||||
configpath: opencompass/configs/datasets/SimpleQA
|
||||
- siqa:
|
||||
name: SocialIQA
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/1904.09728
|
||||
configpath: opencompass/configs/datasets/siqa
|
||||
- squad20:
|
||||
name: SQuAD2.0
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/1806.03822
|
||||
configpath: opencompass/configs/datasets/squad20
|
||||
- storycloze:
|
||||
name: StoryCloze
|
||||
category: Reasoning
|
||||
paper: https://aclanthology.org/2022.emnlp-main.616.pdf
|
||||
configpath: opencompass/configs/datasets/storycloze
|
||||
- strategyqa:
|
||||
name: StrategyQA
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/2101.02235
|
||||
configpath: opencompass/configs/datasets/strategyqa
|
||||
- summedits:
|
||||
name: SummEdits
|
||||
category: Language
|
||||
paper: https://aclanthology.org/2023.emnlp-main.600.pdf
|
||||
configpath: opencompass/configs/datasets/summedits
|
||||
- summscreen:
|
||||
name: SummScreen
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/2104.07091v1
|
||||
configpath: opencompass/configs/datasets/summscreen
|
||||
- svamp:
|
||||
name: SVAMP
|
||||
category: Math
|
||||
paper: https://aclanthology.org/2021.naacl-main.168.pdf
|
||||
configpath: opencompass/configs/datasets/SVAMP
|
||||
- tabmwp:
|
||||
name: TabMWP
|
||||
category: Math / Table
|
||||
paper: https://arxiv.org/pdf/2209.14610
|
||||
configpath: opencompass/configs/datasets/TabMWP
|
||||
- taco:
|
||||
name: TACO
|
||||
category: Code
|
||||
paper: https://arxiv.org/pdf/2312.14852
|
||||
configpath: opencompass/configs/datasets/taco
|
||||
- tnews:
|
||||
name: FewCLUE / TNEWS
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/2107.07498
|
||||
configpath: opencompass/configs/datasets/FewCLUE_tnews
|
||||
- bustm:
|
||||
name: FewCLUE / BUSTM
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/2107.07498
|
||||
configpath: opencompass/configs/datasets/FewCLUE_bustm
|
||||
- csl:
|
||||
name: FewCLUE / CSL
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/2107.07498
|
||||
configpath: opencompass/configs/datasets/FewCLUE_csl
|
||||
- ocnli_fc:
|
||||
name: FewCLUE / OCNLI-FC
|
||||
category: Reasoning
|
||||
paper: https://arxiv.org/pdf/2107.07498
|
||||
configpath: opencompass/configs/datasets/FewCLUE_ocnli_fc
|
||||
- triviaqa:
|
||||
name: TriviaQA
|
||||
category: Knowledge
|
||||
paper: https://arxiv.org/pdf/1705.03551v2
|
||||
configpath: opencompass/configs/datasets/triviaqa
|
||||
- triviaqarc:
|
||||
name: TriviaQA-RC
|
||||
category: Knowledge / Understanding
|
||||
paper: ""
|
||||
configpath: opencompass/configs/datasets/triviaqarc
|
||||
- truthfulqa:
|
||||
name: TruthfulQA
|
||||
category: Safety
|
||||
paper: https://arxiv.org/pdf/2109.07958v2
|
||||
configpath: opencompass/configs/datasets/truthfulqa
|
||||
- tydiqa:
|
||||
name: TyDi-QA
|
||||
category: Language
|
||||
paper: https://storage.googleapis.com/tydiqa/tydiqa.pdf
|
||||
configpath: opencompass/configs/datasets/tydiqa
|
||||
- wic:
|
||||
name: SuperGLUE / WiC
|
||||
category: Language
|
||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||
configpath: opencompass/configs/datasets/SuperGLUE_WiC
|
||||
- wsc:
|
||||
name: SuperGLUE / WSC
|
||||
category: Language / WSC
|
||||
paper: https://proceedings.neurips.cc/paper_files/paper/2019/file/4496bf24afe7fab6f046bf4923da8de6-Paper.pdf
|
||||
configpath: opencompass/configs/datasets/SuperGLUE_WSC
|
||||
- winogrande:
|
||||
name: WinoGrande
|
||||
category: Language / WSC
|
||||
paper: https://arxiv.org/pdf/1907.10641v2
|
||||
configpath: opencompass/configs/datasets/winogrande
|
||||
- xcopa:
|
||||
name: XCOPA
|
||||
category: Language
|
||||
paper: https://arxiv.org/pdf/2005.00333
|
||||
configpath: opencompass/configs/datasets/XCOPA
|
||||
- xiezhi:
|
||||
name: Xiezhi
|
||||
category: Knowledge
|
||||
paper: https://arxiv.org/pdf/2306.05783
|
||||
configpath: opencompass/configs/datasets/xiezhi
|
||||
- xlsum:
|
||||
name: XLSum
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/2106.13822v1
|
||||
configpath: opencompass/configs/datasets/XLSum
|
||||
- xsum:
|
||||
name: Xsum
|
||||
category: Understanding
|
||||
paper: https://arxiv.org/pdf/1808.08745
|
||||
configpath: opencompass/configs/datasets/Xsum
@@ -1,10 +1,20 @@
- var collapsedSections = [];
+ var collapsedSections = ['Dataset Statistics'];

  $(document).ready(function () {
-     $('.model-summary').DataTable({
+     $('.dataset').DataTable({
          "stateSave": false,
          "lengthChange": false,
          "pageLength": 20,
-         "order": []
+         "order": [],
+         "language": {
+             "info": "Show _START_ to _END_ Items(Totally _TOTAL_ )",
+             "infoFiltered": "(Filtered from _MAX_ Items)",
+             "search": "Search:",
+             "zeroRecords": "Item Not Found",
+             "paginate": {
+                 "next": "Next",
+                 "previous": "Previous"
+             },
+         }
      });
  });
@@ -90,4 +90,16 @@ Although OpenCompass has already included most commonly used datasets, users need
       return dataset
   ```

3. After completing the dataset script and config file, you need to register the information of your new dataset in the file `dataset-index.yml` in the main directory, so that it can be added to the dataset statistics list on the OpenCompass website.

   - The keys that need to be filled in include `name`: the name of your dataset, `category`: the category of your dataset, `paper`: the URL of the paper or project, and `configpath`: the path to the dataset config file. Here's an example:

   ```
   - mydataset:
       name: MyDataset
       category: Understanding
       paper: https://arxiv.org/pdf/xxxxxxx
       configpath: opencompass/configs/datasets/MyDataset
   ```

Detailed dataset configuration files and other required configuration files can be referred to in the [Configuration Files](../user_guides/config.md) tutorial. For guides on launching tasks, please refer to the [Quick Start](../get_started/quick_start.md) tutorial.
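As a rough sanity check before opening a PR, the new entry can be loaded back from `dataset-index.yml` and inspected for the four required keys. The snippet below is a minimal, hypothetical helper and is not part of the repository; it only assumes PyYAML is available and that it is run from the repository root.

```python
# check_dataset_index.py -- hypothetical helper, not shipped with OpenCompass
from pathlib import Path

import yaml

REQUIRED_KEYS = {'name', 'category', 'paper', 'configpath'}


def check_index(path='dataset-index.yml'):
    """Report entries in dataset-index.yml that are missing required keys."""
    entries = yaml.safe_load(Path(path).read_text())
    for entry in entries:
        # Each list item is a one-key mapping: {dataset_key: {name, category, paper, configpath}}
        for key, fields in entry.items():
            missing = REQUIRED_KEYS - set(fields)
            if missing:
                print(f'{key}: missing {sorted(missing)}')


if __name__ == '__main__':
    check_index()
```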
@@ -220,3 +220,11 @@ autodoc_typehints = 'none'

  # The not found page
  notfound_template = '404.html'
+
+
+ def builder_inited_handler(app):
+     subprocess.run(['./statis.py'])
+
+
+ def setup(app):
+     app.connect('builder-inited', builder_inited_handler)
@@ -80,6 +80,13 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.

     tools.md

+ .. _Dataset List:
+ .. toctree::
+    :maxdepth: 1
+    :caption: Dataset List
+
+    dataset_statistics.md
+
  .. _Notes:
  .. toctree::
     :maxdepth: 1

docs/en/statis.py (new executable file, 76 lines)
@@ -0,0 +1,76 @@
#! /usr/bin/env python

from pathlib import Path

import yaml
from tabulate import tabulate

OC_ROOT = Path(__file__).absolute().parents[2]
GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/'
DATASETZOO_TEMPLATE = """\
# Dataset Statistics

On this page, we have listed all the datasets supported by OpenCompass.

You can use sorting and search functions to find the dataset you need.

"""

with open('dataset_statistics.md', 'w') as f:
    f.write(DATASETZOO_TEMPLATE)

load_path = str(OC_ROOT / 'dataset-index.yml')

with open(load_path, 'r') as f2:
    data_list = yaml.load(f2, Loader=yaml.FullLoader)

HEADER = ['name', 'category', 'paper', 'configpath']


def table_format(data_list):
    table_format_list = []
    for i in data_list:
        table_format_list_sub = []
        for j in i:
            for index in HEADER:
                if index == 'paper':
                    table_format_list_sub.append('[link](' + i[j][index] + ')')
                elif index == 'configpath':
                    if isinstance(i[j][index], list):
                        sub_list_text = ''
                        for k in i[j][index]:
                            sub_list_text += ('[link](' + GITHUB_PREFIX + k +
                                              ') / ')
                        table_format_list_sub.append(sub_list_text[:-2])
                    else:
                        table_format_list_sub.append('[link](' +
                                                     GITHUB_PREFIX +
                                                     i[j][index] + ')')
                else:
                    table_format_list_sub.append(i[j][index])
        table_format_list.append(table_format_list_sub)
    return table_format_list


data_format_list = table_format(data_list)


def generate_table(data_list, title=None):

    with open('dataset_statistics.md', 'a') as f:
        if title is not None:
            f.write(f'\n{title}')
        f.write("""\n```{table}\n:class: dataset\n""")
        header = ['Name', 'Category', 'Paper or Repository', 'Config File']
        table_cfg = dict(tablefmt='pipe',
                         floatfmt='.2f',
                         numalign='right',
                         stralign='center')
        f.write(tabulate(data_list, header, **table_cfg))
        f.write('\n```\n')


generate_table(
    data_list=data_format_list,
    title='## Supported Dataset List',
)
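To illustrate what `table_format` and `tabulate` produce for a single registry entry, here is a small, self-contained sketch; the entry below is made up for the example and is not a real dataset.

```python
from tabulate import tabulate

GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/'

# A made-up entry in the same shape as one item of dataset-index.yml
fields = {'name': 'MyDataset',
          'category': 'Understanding',
          'paper': 'https://arxiv.org/pdf/xxxxxxx',
          'configpath': 'opencompass/configs/datasets/MyDataset'}

row = [fields['name'],
       fields['category'],
       '[link](' + fields['paper'] + ')',
       '[link](' + GITHUB_PREFIX + fields['configpath'] + ')']

# Renders one Markdown (pipe-format) table row, mirroring what statis.py
# appends to dataset_statistics.md for every registered dataset.
print(tabulate([row], ['Name', 'Category', 'Paper or Repository', 'Config File'],
               tablefmt='pipe', stralign='center'))
```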
@@ -1,10 +1,20 @@
- var collapsedSections = [];
+ var collapsedSections = ['数据集统计'];

  $(document).ready(function () {
-     $('.model-summary').DataTable({
+     $('.dataset').DataTable({
          "stateSave": false,
          "lengthChange": false,
          "pageLength": 20,
-         "order": []
+         "order": [],
+         "language": {
+             "info": "显示 _START_ 至 _END_ 条目(总计 _TOTAL_ )",
+             "infoFiltered": "(筛选自 _MAX_ 条目)",
+             "search": "搜索:",
+             "zeroRecords": "没有找到任何条目",
+             "paginate": {
+                 "next": "下一页",
+                 "previous": "上一页"
+             },
+         }
      });
  });
@@ -91,4 +91,16 @@
       return dataset
   ```

3. After building the dataset script and its config file, register the new dataset's information in the `dataset-index.yml` file in the OpenCompass main directory so that it is added to the dataset statistics list in the official OpenCompass documentation.

   - The fields to fill in are the dataset name `name`, the dataset category `category`, the paper or project URL `paper`, and the path to the dataset config file `configpath`. For example:

   ```
   - mydataset:
       name: MyDataset
       category: Understanding
       paper: https://arxiv.org/pdf/xxxxxxx
       configpath: opencompass/configs/datasets/MyDataset
   ```

For detailed dataset configuration files and other required configuration files, see the [Configuration Files](../user_guides/config.md) tutorial; for launching tasks, see the [Quick Start](../get_started/quick_start.md) tutorial.
|
@ -224,6 +224,7 @@ notfound_template = '404.html'
|
||||
|
||||
def builder_inited_handler(app):
|
||||
subprocess.run(['./cp_origin_docs.sh'])
|
||||
subprocess.run(['./statis.py'])
|
||||
|
||||
|
||||
def setup(app):
|
||||
|
@@ -81,6 +81,13 @@ OpenCompass 上手路线

     tools.md

+ .. _数据集列表:
+ .. toctree::
+    :maxdepth: 1
+    :caption: 数据集列表
+
+    dataset_statistics.md
+
  .. _其他说明:
  .. toctree::
     :maxdepth: 1

docs/zh_cn/statis.py (new executable file, 75 lines)
@@ -0,0 +1,75 @@
#! /usr/bin/env python

from pathlib import Path

import yaml
from tabulate import tabulate

OC_ROOT = Path(__file__).absolute().parents[2]
GITHUB_PREFIX = 'https://github.com/open-compass/opencompass/tree/main/'
DATASETZOO_TEMPLATE = """\
# 数据集统计

在本页面中,我们列举了OpenCompass所支持的所有数据集。

你可以使用排序和搜索功能找到需要的数据集。

"""

with open('dataset_statistics.md', 'w') as f:
    f.write(DATASETZOO_TEMPLATE)

load_path = str(OC_ROOT / 'dataset-index.yml')

with open(load_path, 'r') as f2:
    data_list = yaml.load(f2, Loader=yaml.FullLoader)

HEADER = ['name', 'category', 'paper', 'configpath']


def table_format(data_list):
    table_format_list = []
    for i in data_list:
        table_format_list_sub = []
        for j in i:
            for index in HEADER:
                if index == 'paper':
                    table_format_list_sub.append('[链接](' + i[j][index] + ')')
                elif index == 'configpath':
                    if isinstance(i[j][index], list):
                        sub_list_text = ''
                        for k in i[j][index]:
                            sub_list_text += ('[链接](' + GITHUB_PREFIX + k +
                                              ') / ')
                        table_format_list_sub.append(sub_list_text[:-2])
                    else:
                        table_format_list_sub.append('[链接](' + GITHUB_PREFIX +
                                                     i[j][index] + ')')
                else:
                    table_format_list_sub.append(i[j][index])
        table_format_list.append(table_format_list_sub)
    return table_format_list


data_format_list = table_format(data_list)


def generate_table(data_list, title=None):

    with open('dataset_statistics.md', 'a') as f:
        if title is not None:
            f.write(f'\n{title}')
        f.write("""\n```{table}\n:class: dataset\n""")
        header = ['数据集名称', '数据集类型', '原文或资源地址', '配置文件链接']
        table_cfg = dict(tablefmt='pipe',
                         floatfmt='.2f',
                         numalign='right',
                         stralign='center')
        f.write(tabulate(data_list, header, **table_cfg))
        f.write('\n```\n')


generate_table(
    data_list=data_format_list,
    title='## 支持数据集列表',
)

examples/eval_academic_leaderboard_202502.py (new file, 137 lines)
@ -0,0 +1,137 @@
|
||||
# flake8: noqa

from mmengine.config import read_base

from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask

#######################################################################
#                     PART 0  Essential Configs                       #
#######################################################################
with read_base():
    # Datasets Part
    # Knowledge
    # Math
    from opencompass.configs.datasets.aime2024.aime2024_0shot_nocot_genericllmeval_academic_gen import \
        aime2024_datasets
    from opencompass.configs.datasets.bbh.bbh_0shot_nocot_academic_gen import \
        bbh_datasets
    # General Reasoning
    from opencompass.configs.datasets.gpqa.gpqa_openai_simple_evals_gen_5aeece import \
        gpqa_datasets
    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import \
        humaneval_datasets
    # Instruction Following
    from opencompass.configs.datasets.IFEval.IFEval_gen_353ae7 import \
        ifeval_datasets
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import \
        LCBCodeGeneration_dataset
    from opencompass.configs.datasets.math.math_prm800k_500_0shot_cot_gen import \
        math_datasets
    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_cot_gen_08c1de import \
        mmlu_pro_datasets
    # Model List
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat import \
        models as hf_internlm2_5_7b_chat_model
    # Summary Groups
    from opencompass.configs.summarizers.groups.bbh import bbh_summary_groups
    from opencompass.configs.summarizers.groups.mmlu_pro import \
        mmlu_pro_summary_groups

#######################################################################
#                     PART 1  Datasets List                           #
#######################################################################
# datasets list for evaluation
# Only take LCB generation for evaluation
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')),
               []) + [LCBCodeGeneration_dataset]

# LLM judge config: using LLM to evaluate predictions
judge_cfg = dict()
for dataset in datasets:
    dataset['infer_cfg']['inferencer']['max_out_len'] = 32768
    if 'judge_cfg' in dataset['eval_cfg']['evaluator']:
        dataset['eval_cfg']['evaluator']['judge_cfg'] = judge_cfg


#######################################################################
#                     PART 2  Dataset Summarizer                      #
#######################################################################

core_summary_groups = [
    {
        'name':
        'core_average',
        'subsets': [
            ['IFEval', 'Prompt-level-strict-accuracy'],
            ['bbh', 'naive_average'],
            ['math_prm800k_500', 'accuracy'],
            ['aime2024', 'accuracy'],
            ['GPQA_diamond', 'accuracy'],
            ['mmlu_pro', 'naive_average'],
            ['openai_humaneval', 'humaneval_pass@1'],
            ['lcb_code_generation', 'pass@1'],
        ],
    },
]

summarizer = dict(
    dataset_abbrs=[
        ['core_average', 'naive_average'],
        '',
        'Instruction Following',
        ['IFEval', 'Prompt-level-strict-accuracy'],
        '',
        'General Reasoning',
        ['bbh', 'naive_average'],
        ['GPQA_diamond', 'accuracy'],
        '',
        'Math Calculation',
        ['math_prm800k_500', 'accuracy'],
        ['aime2024', 'accuracy'],
        '',
        'Knowledge',
        ['mmlu_pro', 'naive_average'],
        '',
        'Code',
        ['openai_humaneval', 'humaneval_pass@1'],
        ['lcb_code_generation', 'pass@1'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)

#######################################################################
#                     PART 3  Models List                             #
#######################################################################

models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

#######################################################################
#                     PART 4  Inference/Evaluation Configuration      #
#######################################################################

# Local Runner
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        retry=0,  # Modify if needed
        task=dict(type=OpenICLInferTask),
    ),
)

# eval with local runner
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(type=LocalRunner,
                max_num_workers=16,
                task=dict(type=OpenICLEvalTask)),
)

#######################################################################
#                     PART 5  Utils Configuration                     #
#######################################################################
work_dir = './outputs/oc_academic_202502'
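Note that `judge_cfg` is left empty in the config above; the LLM-as-judge datasets (AIME2024, BBH, MATH-500) can only be scored once it points at an actual judge model. A minimal sketch of what such a config might look like, assuming an OpenAI-compatible endpoint is available (the model name, key, and base URL below are placeholders, not part of this example):

```python
from opencompass.models import OpenAISDK

# Hypothetical judge model; replace path, key, and base URL with your own deployment.
judge_cfg = dict(
    type=OpenAISDK,
    abbr='judge-model',                        # placeholder abbreviation
    path='gpt-4o-mini',                        # placeholder model name
    key='YOUR_API_KEY',                        # placeholder credential
    openai_api_base='https://api.openai.com/v1/',
    query_per_second=1,
    max_out_len=2048,
    max_seq_len=8192,
    batch_size=8,
    retry=3,
)
```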
examples/eval_math_verify.py (new file, 77 lines)
@ -0,0 +1,77 @@
from mmengine.config import read_base
from opencompass.models import TurboMindModelwithChatTemplate
from opencompass.utils.text_postprocessors import extract_non_reasoning_content

with read_base():
    from opencompass.configs.datasets.math.math_500_gen import math_datasets

models = [
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-llama-8b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
        engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-7b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
        engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
        gen_config=dict(
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768,
            do_sample=True,
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-1_5b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
        engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
        gen_config=dict(
            top_k=1, temperature=1e-6, top_p=0.9, max_new_tokens=4096
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=32,
        run_cfg=dict(num_gpus=1),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
    dict(
        type=TurboMindModelwithChatTemplate,
        abbr='deepseek-r1-distill-qwen-14b-turbomind',
        path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
        engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
        gen_config=dict(
            top_k=1,
            temperature=0.6,
            top_p=0.95,
            max_new_tokens=32768,
            do_sample=True,
        ),
        max_seq_len=32768,
        max_out_len=32768,
        batch_size=16,
        run_cfg=dict(num_gpus=2),
        pred_postprocessor=dict(type=extract_non_reasoning_content),
    ),
]

datasets = [*math_datasets]


work_dir = './outputs/math_500'
@ -0,0 +1,98 @@
# flake8: noqa

from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets.generic import generic_llmjudge_academic_postprocess


aime2024_reader_cfg = dict(
    input_columns=['question'],
    output_column='answer'
)


aime2024_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN',
                     prompt='{question}\nRemember to put your final answer within \\boxed{}.'),
            ],
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=2048)
)


GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.

<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()


aime2024_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                ],
                round=[
                    dict(
                        role='HUMAN',
                        prompt=GRADER_TEMPLATE),
                ]),
        ),
        dataset_cfg=dict(
            type=Aime2024Dataset,
            path='opencompass/aime2024',
            reader_cfg=aime2024_reader_cfg,
        ),
        judge_cfg=dict(),
        dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess,
                                metric_name='accuracy'),
    ),
    pred_role='BOT',
)

aime2024_datasets = [
    dict(
        abbr='aime2024',
        type=Aime2024Dataset,
        path='opencompass/aime2024',
        reader_cfg=aime2024_reader_cfg,
        infer_cfg=aime2024_infer_cfg,
        eval_cfg=aime2024_eval_cfg,
        mode='singlescore',
    )
]
opencompass/configs/datasets/bbh/bbh_0shot_nocot_academic_gen.py (new file, 189 lines)
@ -0,0 +1,189 @@
|
||||
# flake8: noqa
|
||||
|
||||
import os
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
from opencompass.datasets import BBHDataset
|
||||
from opencompass.datasets.generic import generic_llmjudge_academic_postprocess
|
||||
|
||||
|
||||
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
|
||||
|
||||
bbh_multiple_choice_sets = [
|
||||
'temporal_sequences',
|
||||
'disambiguation_qa',
|
||||
'date_understanding',
|
||||
'tracking_shuffled_objects_three_objects',
|
||||
'penguins_in_a_table',
|
||||
'geometric_shapes',
|
||||
'snarks',
|
||||
'ruin_names',
|
||||
'tracking_shuffled_objects_seven_objects',
|
||||
'tracking_shuffled_objects_five_objects',
|
||||
'logical_deduction_three_objects',
|
||||
'hyperbaton',
|
||||
'logical_deduction_five_objects',
|
||||
'logical_deduction_seven_objects',
|
||||
'movie_recommendation',
|
||||
'salient_translation_error_detection',
|
||||
'reasoning_about_colored_objects',
|
||||
]
|
||||
bbh_free_form_sets = [
|
||||
'multistep_arithmetic_two',
|
||||
'navigate',
|
||||
'dyck_languages',
|
||||
'word_sorting',
|
||||
'sports_understanding',
|
||||
'boolean_expressions',
|
||||
'object_counting',
|
||||
'formal_fallacies',
|
||||
'causal_judgement',
|
||||
'web_of_lies',
|
||||
]
|
||||
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n{input}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
|
||||
bbh_sets = bbh_multiple_choice_sets + bbh_free_form_sets
|
||||
|
||||
# For zero shot inference in bbh
|
||||
bbh_datasets = []
|
||||
for _name in bbh_sets:
|
||||
bbh_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=f"Question: {{input}}\n You must give your final answer by starting with 'So the answer is' "
|
||||
)
|
||||
])),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=512))
|
||||
|
||||
bbh_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
||||
],
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=GRADER_TEMPLATE
|
||||
),
|
||||
]),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=BBHDataset,
|
||||
name=_name,
|
||||
path='opencompass/bbh',
|
||||
reader_cfg=bbh_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
|
||||
),
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
bbh_datasets.append(
|
||||
dict(
|
||||
type=BBHDataset,
|
||||
path='opencompass/bbh',
|
||||
name=_name,
|
||||
abbr='bbh-' + _name,
|
||||
reader_cfg=bbh_reader_cfg,
|
||||
infer_cfg=bbh_infer_cfg.copy(),
|
||||
eval_cfg=bbh_eval_cfg.copy())
|
||||
)
|
||||
|
||||
|
||||
# For original 3 shot inference in bbh
|
||||
bbh_3_shot_datasets = []
|
||||
for _name in bbh_sets:
|
||||
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
|
||||
_hint = f.read()
|
||||
bbh_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
|
||||
)
|
||||
])),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=512))
|
||||
|
||||
bbh_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
||||
],
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt=GRADER_TEMPLATE
|
||||
),
|
||||
]),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=BBHDataset,
|
||||
name=_name,
|
||||
path='opencompass/bbh',
|
||||
reader_cfg=bbh_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess, metric_name='score'),
|
||||
),
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
bbh_3_shot_datasets.append(
|
||||
dict(
|
||||
type=BBHDataset,
|
||||
path='opencompass/bbh',
|
||||
name=_name,
|
||||
abbr='bbh-' + _name,
|
||||
reader_cfg=bbh_reader_cfg,
|
||||
infer_cfg=bbh_infer_cfg.copy(),
|
||||
eval_cfg=bbh_eval_cfg.copy()))
|
@ -1,36 +1,30 @@
# LiveMathBench

## Details of Datasets
## v202412

### Details of Datasets

| dataset | language | #single-choice | #multiple-choice | #fill-in-the-blank | #problem-solving |
| -- | -- | -- | -- | -- | -- |
| AIMC | cn | 0 | 0 | 0 | 46 |
| AIMC | en | 0 | 0 | 0 | 46 |
| CEE | cn | 0 | 0 | 13 | 40 |
| CEE | en | 0 | 0 | 13 | 40 |
| CMO | cn | 0 | 0 | 0 | 18 |
| CMO | en | 0 | 0 | 0 | 18 |
| MATH500 | en | 0 | 0 | 0 | 500 |
| AIME2024 | en | 0 | 0 | 0 | 44 |
| AMC | cn | 0 | 0 | 0 | 46 |
| AMC | en | 0 | 0 | 0 | 46 |
| CCEE | cn | 0 | 0 | 13 | 31 |
| CCEE | en | 0 | 0 | 13 | 31 |
| CNMO | cn | 0 | 0 | 0 | 18 |
| CNMO | en | 0 | 0 | 0 | 18 |
| WLPMC | cn | 0 | 0 | 0 | 11 |
| WLPMC | en | 0 | 0 | 0 | 11 |

## How to use

### How to use

#### G-Pass@k
```python
from mmengine.config import read_base

with read_base():
    from opencompass.datasets.livemathbench import livemathbench_datasets
    from opencompass.datasets.livemathbench_gen import livemathbench_datasets

livemathbench_datasets[0].update(
    {
        'abbr': 'livemathbench_${k}x${n}',
        'path': '/path/to/data/dir',
        'k': 'k@pass',  # the max value of k in k@pass
        'n': 'number of runs',  # number of runs
    }
)
livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
    {
        'model_name': 'Qwen/Qwen2.5-72B-Instruct',
@ -40,38 +34,41 @@ livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
        ]  # set url of evaluation models
    }
)
livemathbench_dataset['infer_cfg']['inferencer'].update(dict(
    max_out_len=32768  # for o1-like models you need to update max_out_len
))

```

> ❗️ At present, `extract_from_boxed` is used to extract answers from model responses; one can also leverage an LLM for extraction through the following parameters, but this part of the code has not been tested.

#### Greedy
```python
from mmengine.config import read_base

with read_base():
    from opencompass.datasets.livemathbench_greedy_gen import livemathbench_datasets

livemathbench_datasets[0]['eval_cfg']['evaluator'].update(
    {
        'model_name': 'Qwen/Qwen2.5-72B-Instruct',
        'url': [
            'http://0.0.0.0:23333/v1',
            '...'
        ],  # set url of evaluation models

        # for LLM-based extraction
        'use_extract_model': True,
        'post_model_name': 'oc-extractor',
        'post_url': [
            'http://0.0.0.0:21006/v1',
            '...'
        ]
        ]  # set url of evaluation models
    }
)
livemathbench_dataset['infer_cfg']['inferencer'].update(dict(
    max_out_len=32768  # for o1-like models you need to update max_out_len
))

```

## Output Samples

### Output Samples

| dataset | version | metric | mode | Qwen2.5-72B-Instruct |
|----- | ----- | ----- | ----- | -----|
| LiveMathBench | caed8f | 1@pass | gen | 26.07 |
| LiveMathBench | caed8f | 1@pass/std | gen | xx.xx |
| LiveMathBench | caed8f | 2@pass | gen | xx.xx |
| LiveMathBench | caed8f | 2@pass/std | gen | xx.xx |
| LiveMathBench | caed8f | pass-rate | gen | xx.xx |
| LiveMathBench | 9befbf | G-Pass@16_0.0 | gen | xx.xx |
| LiveMathBench | caed8f | G-Pass@16_0.25 | gen | xx.xx |
| LiveMathBench | caed8f | G-Pass@16_0.5 | gen | xx.xx |
| LiveMathBench | caed8f | G-Pass@16_0.75 | gen | xx.xx |
| LiveMathBench | caed8f | G-Pass@16_1.0 | gen | xx.xx |
@ -0,0 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .livemathbench_greedy_gen_efb20d import livemathbench_datasets  # noqa: F401, F403
@ -6,15 +6,15 @@ from opencompass.datasets.livemathbench import LiveMathBenchDataset, LiveMathBen


livemathbench_dataset = dict(
    abbr='LiveMathBench-v202412-greedy',  # If you change the K and replication, you need to change the dataset name.
    type=LiveMathBenchDataset,
    path='opencompass/LiveMathBench',
    path='',
    k=1,
    replication=1,
    dataset_splits=['CNMO', 'CCEE', 'AMC', 'WLPMC'],
    dataset_languages=['cn', 'en'],
    cot=False,
    cot=True,
    version='202412',
    abbr='LiveMathBench-v202412',
    reader_cfg=dict(
        input_columns=['prompt'],
        output_column='answer'
@ -31,7 +31,7 @@ livemathbench_dataset = dict(
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(
            type=GenInferencer,
            max_out_len=16384,
            max_out_len=8192
        ),
    ),
    eval_cfg=dict(
@ -44,7 +44,7 @@ livemathbench_dataset = dict(
            extract_model_name='',
            k=[1],
            replication=1,
            thresholds=[0.0, 0.25, 0.5, 0.75, 1.0]
            thresholds=[0.0]
        )
    )
)
opencompass/configs/datasets/math/math_500_gen.py (new file, 40 lines)
@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import CustomDataset
from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
                ),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


math_eval_cfg = dict(
    evaluator=dict(type=MATHEvaluator),
)

math_datasets = [
    dict(
        type=CustomDataset,
        abbr='math-500',
        path='opencompass/math',
        file_name='test_prm800k_500.jsonl',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
@ -0,0 +1,100 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import (
|
||||
MATHDataset,
|
||||
MATHEvaluator,
|
||||
math_postprocess_v2,
|
||||
normalize_final_answer,
|
||||
)
|
||||
from opencompass.evaluator import GenericLLMEvaluator
|
||||
from opencompass.datasets import generic_llmjudge_academic_postprocess
|
||||
|
||||
|
||||
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
|
||||
|
||||
math_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt='{problem}\nPlease reason step by step, and put your final answer within \\boxed{}.',
|
||||
),
|
||||
]
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
GRADER_TEMPLATE = """
|
||||
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
|
||||
|
||||
Here are some evaluation criteria:
|
||||
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
|
||||
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
|
||||
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
|
||||
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
|
||||
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
|
||||
|
||||
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
|
||||
A: CORRECT
|
||||
B: INCORRECT
|
||||
Just return the letters "A" or "B", with no text around it.
|
||||
|
||||
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
|
||||
|
||||
|
||||
<Original Question Begin>: \n{problem}\n<Original Question End>\n\n
|
||||
<Gold Target Begin>: \n{solution}\n<Gold Target End>\n\n
|
||||
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
|
||||
|
||||
Judging the correctness of candidates' answers:
|
||||
""".strip()
|
||||
|
||||
# Evaluation configuration
|
||||
math_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=GenericLLMEvaluator,
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
begin=[
|
||||
dict(
|
||||
role='SYSTEM',
|
||||
fallback_role='HUMAN',
|
||||
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
|
||||
],
|
||||
round=[
|
||||
dict(
|
||||
role='HUMAN',
|
||||
prompt = GRADER_TEMPLATE
|
||||
),
|
||||
]),
|
||||
),
|
||||
dataset_cfg=dict(
|
||||
type=MATHDataset,
|
||||
path='opencompass/math',
|
||||
file_name = 'test_prm800k_500.json',
|
||||
reader_cfg=math_reader_cfg,
|
||||
),
|
||||
judge_cfg=dict(),
|
||||
dict_postprocessor=dict(type=generic_llmjudge_academic_postprocess),
|
||||
),
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
math_datasets = [
|
||||
dict(
|
||||
type=MATHDataset,
|
||||
abbr='math_prm800k_500',
|
||||
path='opencompass/math',
|
||||
file_name='test_prm800k_500.json',
|
||||
reader_cfg=math_reader_cfg,
|
||||
infer_cfg=math_infer_cfg,
|
||||
eval_cfg=math_eval_cfg,
|
||||
mode='singlescore',
|
||||
)
|
||||
]
|
@ -32,7 +32,7 @@ for _name in subjective_all_sets:
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=2048),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -31,7 +31,7 @@ for _name in subjective_all_sets:
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=2048),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -32,7 +32,7 @@ for _name in subjective_all_sets:
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=2048),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -31,7 +31,7 @@ for _name in subjective_all_sets:
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=2048),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -73,12 +73,13 @@ for _name in subjective_all_sets:
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=4096),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
evaluator=dict(
|
||||
type=LMEvaluator,
|
||||
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
|
@ -74,7 +74,7 @@ for _name in subjective_all_sets:
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=4096),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -72,7 +72,7 @@ for _name in subjective_all_sets:
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=4096),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -38,7 +38,7 @@ for _name in subjective_all_sets:
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=4096),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -39,7 +39,7 @@ for _name in subjective_all_sets:
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=4096),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -37,7 +37,7 @@ for _name in subjective_all_sets:
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=4096),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -118,7 +118,7 @@ for _name, _prompt in sub_map.items():
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -1,6 +1,6 @@
|
||||
from opencompass.datasets import (
|
||||
CompassArenaDataset,
|
||||
compassarena_bradleyterry_postprocess,
|
||||
compassarena_bradleyterry_postprocess
|
||||
)
|
||||
from opencompass.openicl.icl_evaluator import LMEvaluator
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
@ -127,7 +127,7 @@ for _name, _prompt in sub_map.items():
|
||||
),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -105,7 +105,7 @@ for _name, _prompt in sub_map.items():
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=4096),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -91,7 +91,7 @@ for _name in subjective_all_sets:
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=4096),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -90,7 +90,7 @@ for _name in subjective_all_sets:
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=4096),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -59,7 +59,7 @@ for _name in subjective_all_sets:
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=4096),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -58,7 +58,7 @@ for _name in subjective_all_sets:
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=4096),
|
||||
inferencer=dict(type=GenInferencer,),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -29,7 +29,7 @@ for _name in subjective_all_sets:
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=2048),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -28,7 +28,7 @@ for _name in subjective_all_sets:
|
||||
]),
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=2048),
|
||||
inferencer=dict(type=GenInferencer),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -24,7 +24,7 @@ for _name in subjective_all_sets:
|
||||
template="""{dialogue}""",
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'),
|
||||
inferencer=dict(type=ChatInferencer, infer_mode='last'),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -23,7 +23,7 @@ for _name in subjective_all_sets:
|
||||
template="""{dialogue}""",
|
||||
),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'),
|
||||
inferencer=dict(type=ChatInferencer, infer_mode='last'),
|
||||
)
|
||||
|
||||
subjective_eval_cfg = dict(
|
||||
|
@ -0,0 +1,14 @@
|
||||
from opencompass.models import HuggingFacewithChatTemplate
|
||||
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFacewithChatTemplate,
|
||||
abbr='deepseek-r1-distill-llama-70b-hf',
|
||||
path='deepseek-ai/DeepSeek-R1-Distill-Llama-70B',
|
||||
max_out_len=16384,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=8),
|
||||
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||
)
|
||||
]
|
@ -0,0 +1,14 @@
|
||||
from opencompass.models import HuggingFacewithChatTemplate
|
||||
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFacewithChatTemplate,
|
||||
abbr='deepseek-r1-distill-llama-8b-hf',
|
||||
path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
|
||||
max_out_len=16384,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||
)
|
||||
]
|
@ -0,0 +1,14 @@
|
||||
from opencompass.models import HuggingFacewithChatTemplate
|
||||
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFacewithChatTemplate,
|
||||
abbr='deepseek-r1-distill-qwen-14b-hf',
|
||||
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
|
||||
max_out_len=16384,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=4),
|
||||
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||
)
|
||||
]
|
@ -0,0 +1,14 @@
|
||||
from opencompass.models import HuggingFacewithChatTemplate
|
||||
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFacewithChatTemplate,
|
||||
abbr='deepseek-r1-distill-qwen-1.5b-hf',
|
||||
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
|
||||
max_out_len=16384,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||
)
|
||||
]
|
@ -0,0 +1,14 @@
|
||||
from opencompass.models import HuggingFacewithChatTemplate
|
||||
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFacewithChatTemplate,
|
||||
abbr='deepseek-r1-distill-qwen-32b-hf',
|
||||
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
|
||||
max_out_len=16384,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=4),
|
||||
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||
)
|
||||
]
|
@ -0,0 +1,14 @@
|
||||
from opencompass.models import HuggingFacewithChatTemplate
|
||||
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFacewithChatTemplate,
|
||||
abbr='deepseek-r1-distill-qwen-7b-hf',
|
||||
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
|
||||
max_out_len=16384,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||
)
|
||||
]
|
@ -0,0 +1,20 @@
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='deepseek-r1-distill-llama-70b-turbomind',
|
||||
path='deepseek-ai/DeepSeek-R1-Distill-Llama-70B',
|
||||
engine_config=dict(session_len=32768, max_batch_size=8, tp=8),
|
||||
gen_config=dict(top_k=1,
|
||||
temperature=1e-6,
|
||||
top_p=0.9,
|
||||
max_new_tokens=32768),
|
||||
max_seq_len=32768,
|
||||
max_out_len=32768,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=8),
|
||||
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||
)
|
||||
]
|
@ -0,0 +1,20 @@
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='deepseek-r1-distill-llama-8b-turbomind',
|
||||
path='deepseek-ai/DeepSeek-R1-Distill-Llama-8B',
|
||||
engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
|
||||
gen_config=dict(top_k=1,
|
||||
temperature=1e-6,
|
||||
top_p=0.9,
|
||||
max_new_tokens=32768),
|
||||
max_seq_len=32768,
|
||||
max_out_len=32768,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||
)
|
||||
]
|
@ -0,0 +1,20 @@
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='deepseek-r1-distill-qwen-14b-turbomind',
|
||||
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-14B',
|
||||
engine_config=dict(session_len=32768, max_batch_size=16, tp=2),
|
||||
gen_config=dict(top_k=1,
|
||||
temperature=1e-6,
|
||||
top_p=0.9,
|
||||
max_new_tokens=32768),
|
||||
max_seq_len=32768,
|
||||
max_out_len=32768,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=2),
|
||||
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||
)
|
||||
]
|
@ -0,0 +1,20 @@
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='deepseek-r1-distill-qwen-1_5b-turbomind',
|
||||
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B',
|
||||
engine_config=dict(session_len=32768, max_batch_size=16, tp=1),
|
||||
gen_config=dict(top_k=1,
|
||||
temperature=1e-6,
|
||||
top_p=0.9,
|
||||
max_new_tokens=32768),
|
||||
max_seq_len=32768,
|
||||
max_out_len=32768,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||
)
|
||||
]
|
@ -0,0 +1,20 @@
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='deepseek-r1-distill-qwen-32b-turbomind',
|
||||
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-32B',
|
||||
engine_config=dict(session_len=32768, max_batch_size=16, tp=4),
|
||||
gen_config=dict(top_k=1,
|
||||
temperature=1e-6,
|
||||
top_p=0.9,
|
||||
max_new_tokens=32768),
|
||||
max_seq_len=32768,
|
||||
max_out_len=32768,
|
||||
batch_size=16,
|
||||
run_cfg=dict(num_gpus=4),
|
||||
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||
)
|
||||
]
|
@ -0,0 +1,20 @@
|
||||
from opencompass.models import TurboMindModelwithChatTemplate
|
||||
from opencompass.utils.text_postprocessors import extract_non_reasoning_content
|
||||
|
||||
models = [
|
||||
dict(
|
||||
type=TurboMindModelwithChatTemplate,
|
||||
abbr='deepseek-r1-distill-qwen-7b-turbomind',
|
||||
path='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B',
|
||||
engine_config=dict(session_len=32768, max_batch_size=8, tp=1),
|
||||
gen_config=dict(top_k=1,
|
||||
temperature=1e-6,
|
||||
top_p=0.9,
|
||||
max_new_tokens=32768),
|
||||
max_seq_len=32768,
|
||||
max_out_len=32768,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1),
|
||||
pred_postprocessor=dict(type=extract_non_reasoning_content)
|
||||
)
|
||||
]
|
@ -22,7 +22,10 @@ from .base import BaseDataset
class LCDataset(BaseDataset):

    @staticmethod
    def load(path: str, num_repeats: int = 1, difficulty='ALL'):
    def load(path: str,
             num_repeats: int = 1,
             difficulty='ALL',
             local_mode=False):
        """Load LC dataset for pass k mode.

        Note that you can use num_repeats > 1 when your model does not support
@ -38,7 +41,7 @@ class LCDataset(BaseDataset):
            num_repeats(int): Number of repetition for this dataset to get
                multiple responses in special cases.
        """
        path = get_data_path(path, local_mode=True)
        path = get_data_path(path, local_mode=local_mode)

        def processing_test(example):
            example['test_case'] = example['test_list']
@ -73,6 +73,8 @@ class BigCodeBenchEvaluator(BaseEvaluator):
                 eval_type='instruct',
                 remote_execute_api='https://bigcode-bigcodebench-evaluator.hf.space/',  # noqa
                 dataset_version: str = 'full',
                 local_mode: bool = False,
                 path: str = 'opencompass/bigcodebench',
                 pass_k: str = '1,5,10',
                 parallel: int = -1,
                 min_time_limit: float = 1,
@ -84,7 +86,9 @@ class BigCodeBenchEvaluator(BaseEvaluator):
        super().__init__()
        self.dataset = BigCodeBenchDataset.load(
            release_version=release_version,
            dataset_version=dataset_version)['test']
            dataset_version=dataset_version,
            local_mode=local_mode,
            path=path)['test']
        self.eval_type = eval_type
        self.remote_execute_api = remote_execute_api

@ -117,8 +121,40 @@ class BigCodeBenchEvaluator(BaseEvaluator):
        logger.info('Start to extract code from predictions')
        sanitized_predictions = []
        for prediction, entrypoint in zip(predictions, entrypoints):
            sanitized_prediction = extract_code_generation(
                prediction, entrypoint=entrypoint)
            try:
                import signal
                from contextlib import contextmanager

                @contextmanager
                def timeout_handler(seconds):

                    def _handle_timeout(signum, frame):
                        raise TimeoutError(f'Code extraction timed out '
                                           f'after {seconds} seconds')

                    original_handler = signal.signal(signal.SIGALRM,
                                                     _handle_timeout)
                    signal.alarm(seconds)
                    try:
                        yield
                    finally:
                        signal.alarm(0)
                        signal.signal(signal.SIGALRM, original_handler)

                with timeout_handler(10):
                    sanitized_prediction = extract_code_generation(
                        prediction, entrypoint=entrypoint)

            except TimeoutError as e:
                logger.warning(
                    f'Code extraction timeout for entrypoint {entrypoint}: '
                    f'{str(e)}')
                sanitized_prediction = ''
            except Exception as e:
                logger.warning(
                    f'Code extraction failed for entrypoint {entrypoint}: '
                    f'{str(e)}')
                sanitized_prediction = ''
            sanitized_predictions.append(sanitized_prediction)

        # Prepare for submission
@ -13,6 +13,7 @@ from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.registry import LOAD_DATASET
|
||||
from opencompass.utils import get_data_path
|
||||
|
||||
from .base import BaseDataset
|
||||
|
||||
@ -114,7 +115,7 @@ class CircularOptionSimAccEvaluator(OptionSimAccEvaluator):
|
||||
circular_pattern = origin_item['circular_pattern']
|
||||
for k in circular_patterns:
|
||||
if tuple(circular_pattern) in circular_patterns[k]:
|
||||
tmp_metrics[f'correct_{k}'] += (1 if parsed == refr else 0)
|
||||
tmp_metrics[f'correct_{k}'] += 1 if parsed == refr else 0
|
||||
tmp_metrics[f'count_{k}'] += 1
|
||||
|
||||
for k in circular_patterns:
|
||||
@ -164,7 +165,10 @@ class CircularOptionSimAccEvaluator(OptionSimAccEvaluator):
|
||||
class CustomDataset(BaseDataset):
|
||||
|
||||
@staticmethod
|
||||
def load(path):
|
||||
def load(path, file_name=None, local_mode=False):
|
||||
path = get_data_path(path, local_mode=local_mode)
|
||||
if file_name is not None:
|
||||
path = os.path.join(path, file_name)
|
||||
if path.endswith('.jsonl'):
|
||||
with open(path, 'r', encoding='utf-8-sig') as f:
|
||||
data = [json.loads(line) for line in f]
|
||||
@ -222,9 +226,10 @@ def make_mcq_gen_config(meta):
|
||||
)
|
||||
|
||||
eval_cfg = dict(
|
||||
evaluator=dict(type=meta.get('evaluator', OptionSimAccEvaluator),
|
||||
**meta.get('evaluator_kwargs',
|
||||
{'options': meta['options']})),
|
||||
evaluator=dict(
|
||||
type=meta.get('evaluator', OptionSimAccEvaluator),
|
||||
**meta.get('evaluator_kwargs', {'options': meta['options']}),
|
||||
),
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
@ -269,10 +274,10 @@ def make_circular_mcq_gen_config(meta):
|
||||
)
|
||||
|
||||
eval_cfg = dict(
|
||||
evaluator=dict(type=meta.get('evaluator',
|
||||
CircularOptionSimAccEvaluator),
|
||||
**meta.get('evaluator_kwargs',
|
||||
{'options': meta['options']})),
|
||||
evaluator=dict(
|
||||
type=meta.get('evaluator', CircularOptionSimAccEvaluator),
|
||||
**meta.get('evaluator_kwargs', {'options': meta['options']}),
|
||||
),
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
@ -320,8 +325,10 @@ def make_qa_gen_config(meta):
|
||||
)
|
||||
|
||||
eval_cfg = dict(
|
||||
evaluator=dict(type=meta.get('evaluator', AccEvaluator),
|
||||
**meta.get('evaluator_kwargs', {})),
|
||||
evaluator=dict(
|
||||
type=meta.get('evaluator', AccEvaluator),
|
||||
**meta.get('evaluator_kwargs', {}),
|
||||
),
|
||||
pred_role='BOT',
|
||||
)
|
||||
|
||||
@ -346,9 +353,11 @@ def make_mcq_ppl_config(meta):
|
||||
template = {
|
||||
answer: dict(round=[
|
||||
dict(role='HUMAN', prompt=human_prompt),
|
||||
dict(role='BOT',
|
||||
prompt=bot_prompt.format(
|
||||
**{meta['output_column']: answer})),
|
||||
dict(
|
||||
role='BOT',
|
||||
prompt=bot_prompt.format(
|
||||
**{meta['output_column']: answer}),
|
||||
),
|
||||
], )
|
||||
for answer in meta['options']
|
||||
}
|
||||
@ -370,8 +379,10 @@ def make_mcq_ppl_config(meta):
|
||||
inferencer=dict(type=PPLInferencer),
|
||||
)
|
||||
|
||||
eval_cfg = dict(evaluator=dict(type=meta.get('evaluator', AccEvaluator),
|
||||
**meta.get('evaluator_kwargs', {})))
|
||||
eval_cfg = dict(evaluator=dict(
|
||||
type=meta.get('evaluator', AccEvaluator),
|
||||
**meta.get('evaluator_kwargs', {}),
|
||||
))
|
||||
|
||||
dataset = dict(
|
||||
abbr=meta['abbr'],
|
||||
@ -394,9 +405,11 @@ def make_circular_mcq_ppl_config(meta):
|
||||
template = {
|
||||
answer: dict(round=[
|
||||
dict(role='HUMAN', prompt=human_prompt),
|
||||
dict(role='BOT',
|
||||
prompt=bot_prompt.format(
|
||||
**{meta['output_column']: answer})),
|
||||
dict(
|
||||
role='BOT',
|
||||
prompt=bot_prompt.format(
|
||||
**{meta['output_column']: answer}),
|
||||
),
|
||||
], )
|
||||
for answer in meta['options']
|
||||
}
|
||||
@ -418,9 +431,10 @@ def make_circular_mcq_ppl_config(meta):
|
||||
inferencer=dict(type=PPLInferencer),
|
||||
)
|
||||
|
||||
eval_cfg = dict(
|
||||
evaluator=dict(type=meta.get('evaluator', CircularEvaluator),
|
||||
**meta.get('evaluator_kwargs', {})))
|
||||
eval_cfg = dict(evaluator=dict(
|
||||
type=meta.get('evaluator', CircularEvaluator),
|
||||
**meta.get('evaluator_kwargs', {}),
|
||||
))
|
||||
|
||||
dataset = dict(
|
||||
abbr=meta['abbr'],
|
||||
|
@ -1,7 +1,10 @@
import re


def get_final_results(judged_answers, references, origial_responses):
def get_final_results(judged_answers,
                      references,
                      origial_responses,
                      metric_name='accuracy'):
    count = 0
    is_correct_count = 0
    is_incorrect_count = 0
@ -39,7 +42,7 @@ def get_final_results(judged_answers, references, origial_responses):
        is_correct) > 0 else 0
    result = {
        # 'accuracy_given_attempted': accuracy_given_attempted,
        'accuracy': accuracy_given_attempted * 100,
        metric_name: accuracy_given_attempted * 100,
        'f1': f1,
        'details': details
    }
@ -69,3 +72,25 @@ def generic_llmjudge_postprocess(
    results = get_final_results(judged_answers, references, origial_responses)
    results['details'] = output
    return results


def generic_llmjudge_academic_postprocess(
    output: dict,
    output_path: str,
    metric_name: str = 'accuracy',
) -> dict:
    judged_answers = []
    origial_responses = []
    references = []
    for k, v in output.items():
        origial_responses.append(v['prediction'])
        processed_judge = _generic_llmjudge_postprocess(v['prediction'])
        if processed_judge is not None:
            judged_answers.append(processed_judge)
            references.append(v['gold'])
    results = get_final_results(judged_answers, references, origial_responses,
                                metric_name)
    results['details'] = output
    # For academic summarizer
    results.pop('f1', None)
    return results
@ -25,12 +25,7 @@ OPENAI_API_BASE = os.path.join(
|
||||
OPENAISDK_API_BASE = os.environ.get('OPENAI_BASE_URL',
|
||||
'https://api.openai.com/v1/')
|
||||
|
||||
O1_MODEL_LIST = [
|
||||
'o1-preview-2024-09-12',
|
||||
'o1-mini-2024-09-12',
|
||||
'o1-preview',
|
||||
'o1-mini',
|
||||
]
|
||||
O1_MODEL_LIST = ['o1', 'o3']
|
||||
|
||||
|
||||
@MODELS.register_module()
|
||||
@ -96,7 +91,6 @@ class OpenAI(BaseAPIModel):
|
||||
temperature: Optional[float] = None,
|
||||
tokenizer_path: Optional[str] = None,
|
||||
extra_body: Optional[Dict] = None,
|
||||
max_completion_tokens: int = 16384,
|
||||
verbose: bool = False,
|
||||
):
|
||||
|
||||
@ -151,9 +145,6 @@ class OpenAI(BaseAPIModel):
|
||||
self.proxy_url = openai_proxy_url
|
||||
|
||||
self.path = path
|
||||
self.max_completion_tokens = max_completion_tokens
|
||||
self.logger.warning(
|
||||
f'Max Completion tokens for {path} is {max_completion_tokens}')
|
||||
|
||||
def generate(
|
||||
self,
|
||||
@ -250,16 +241,15 @@ class OpenAI(BaseAPIModel):
|
||||
header['OpenAI-Organization'] = self.orgs[self.org_ctr]
|
||||
|
||||
try:
|
||||
if self.path in O1_MODEL_LIST:
|
||||
if any(model in self.path for model in O1_MODEL_LIST):
|
||||
self.logger.warning(
|
||||
f"'max_token' is unsupported for model {self.path}")
|
||||
self.logger.warning(
|
||||
f'We use max_completion_tokens: '
|
||||
f'{self.max_completion_tokens}for this query')
|
||||
f'We use max_out_len: {max_out_len} for this query')
|
||||
data = dict(
|
||||
model=self.path,
|
||||
messages=messages,
|
||||
max_completion_tokens=self.max_completion_tokens,
|
||||
max_completion_tokens=max_out_len,
|
||||
n=1,
|
||||
logprobs=self.logprobs,
|
||||
top_logprobs=self.top_logprobs,
|
||||
@ -440,7 +430,7 @@ class OpenAI(BaseAPIModel):
|
||||
if mode == 'front':
|
||||
cur_prompt = sep.join(words[-mid:])
|
||||
elif mode == 'mid':
|
||||
cur_prompt = (sep.join(words[:mid]) + sep.join(words[-mid:]))
|
||||
cur_prompt = sep.join(words[:mid]) + sep.join(words[-mid:])
|
||||
elif mode == 'rear':
|
||||
cur_prompt = sep.join(words[:mid])
|
||||
|
||||
@ -480,7 +470,9 @@ class OpenAI(BaseAPIModel):
"""
# Check input length when mode is 'none'
if mode == 'none':
input_len = get_token_len_func(str(input))
input_len = (get_token_len_func(input) if isinstance(
input, str) else sum(
get_token_len_func(item['prompt']) for item in input))
if input_len > max_seq_len:
raise ValueError(
f'Input length ({input_len}) exceeds max_seq_len '
@ -499,12 +491,15 @@ class OpenAI(BaseAPIModel):
# Convert input to messages format
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
input_len = get_token_len_func(input)
else:
messages = []
processed_prompts = []
for item in input:
input_content = item['prompt']
if mode != 'none':
input_content = bin_trim_wrapper(input_content)
processed_prompts.append(input_content)
msg = {'content': input_content}
if item['role'] == 'HUMAN':
msg['role'] = 'user'
@ -513,19 +508,18 @@ class OpenAI(BaseAPIModel):
elif item['role'] == 'SYSTEM':
msg['role'] = 'system'
messages.append(msg)
input_len = sum(
get_token_len_func(prompt) for prompt in processed_prompts)

# Adjust max_out_len
if max_out_len is not None:
original_max_out_len = max_out_len
max_out_len = min(
max_out_len,
max_seq_len - get_token_len_func(str(input)) - 100)
max_out_len = min(max_out_len, max_seq_len - input_len - 100)
if max_out_len <= 0:
raise ValueError(
f'max_out_len ({max_out_len}) is less than or equal to 0. '
f'This may be due to input length '
f'({get_token_len_func(str(input))}) being too close to '
f'max_seq_len ({max_seq_len}). Please either increase '
f'This may be due to input length ({input_len}) being too '
f'close to max_seq_len ({max_seq_len}). Please increase '
f'max_seq_len or use a truncation mode other than "none".')
if max_out_len < original_max_out_len:
self.logger.warning(
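The replacement reuses the `input_len` computed once for the whole message list instead of re-tokenizing `str(input)`, then clamps the output budget with a 100-token safety margin. A quick numeric illustration (all numbers invented):

# Invented numbers: max_out_len = min(max_out_len, max_seq_len - input_len - 100)
max_seq_len, input_len, requested = 4096, 3900, 2048
max_out_len = min(requested, max_seq_len - input_len - 100)
print(max_out_len)   # 96 -> a warning is logged because it falls below the requested 2048
# With input_len >= 3996 the result would be <= 0 and the code raises ValueError instead.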
@ -555,7 +549,6 @@ class OpenAISDK(OpenAI):
temperature: float | None = None,
tokenizer_path: str | None = None,
extra_body: Dict | None = None,
max_completion_tokens: int = 16384,
verbose: bool = False,
status_code_mappings: dict = {},
):
@ -577,7 +570,6 @@ class OpenAISDK(OpenAI):
tokenizer_path,
extra_body,
verbose=verbose,
max_completion_tokens=max_completion_tokens,
)
from openai import OpenAI

@ -605,8 +597,23 @@ class OpenAISDK(OpenAI):
self.logger.info(f'Used openai_client: {self.openai_client}')
self.status_code_mappings = status_code_mappings

def _generate(self, input: PromptList | str, max_out_len: int,
temperature: float) -> str:
def _generate(self,
input: PromptList | str,
max_out_len: int,
temperature: float,
timeout: int = 3600) -> str:
"""Generate results given a list of inputs.

Args:
input (PromptType): A string or PromptDict.
max_out_len (int): The maximum length of the output.
temperature (float): What sampling temperature to use.
timeout (int, optional): Timeout in seconds for the API call.
Defaults to 3600 (60 minutes).

Returns:
str: The generated string.
"""
from openai import APIStatusError, BadRequestError

assert isinstance(input, (str, PromptList))
@ -618,16 +625,14 @@ class OpenAISDK(OpenAI):
num_retries = 0
while num_retries < self.retry:
self.wait()

if self.path in O1_MODEL_LIST:
if any(model in self.path for model in O1_MODEL_LIST):
self.logger.warning(
f"'max_token' is unsupported for model {self.path}")
self.logger.warning(
f'We use max_completion_tokens: '
f'{self.max_completion_tokens}for this query')
f'We use max_out_len: {max_out_len} for this query')
query_data = dict(
model=self.path,
max_completion_tokens=self.max_completion_tokens,
max_completion_tokens=max_out_len,
n=1,
messages=messages,
extra_body=self.extra_body,
@ -646,7 +651,8 @@ class OpenAISDK(OpenAI):
if self.verbose:
self.logger.info('Start calling OpenAI API')
responses = self.openai_client.chat.completions.create(
**query_data)
**query_data, timeout=timeout) # timeout in seconds

if self.verbose:
self.logger.info(
'Successfully get response from OpenAI API')
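The added `timeout` argument is passed through to the SDK call, so a single stuck request no longer hangs the evaluation indefinitely. A minimal sketch of the call shape (API key, base URL and model name below are placeholders):

from openai import OpenAI

client = OpenAI(api_key='sk-placeholder', base_url='https://example.com/v1')  # placeholders
query_data = dict(model='some-model',
                  max_completion_tokens=512,
                  n=1,
                  messages=[{'role': 'user', 'content': 'Hello'}])
response = client.chat.completions.create(**query_data, timeout=3600)  # timeout in seconds
print(response.choices[0].message.content)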
@ -329,4 +329,4 @@ class LMEvaluator:
else:
kwargs = self.dict_postprocessor
proc = DICT_POSTPROCESSORS.get(kwargs.pop('type'))
return proc(output, self.output_path, **kwargs)
return proc(output, self.output_path, **kwargs)
154
opencompass/openicl/icl_evaluator/math_evaluator.py
Normal file
154
opencompass/openicl/icl_evaluator/math_evaluator.py
Normal file
@ -0,0 +1,154 @@
from latex2sympy2_extended import NormalizationConfig
from math_verify import (ExprExtractionConfig, LatexExtractionConfig, parse,
verify)

from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS


@ICL_EVALUATORS.register_module()
class MATHEvaluator(BaseEvaluator):

def score(self, predictions, references):

self.is_num_equal(predictions, references)

correct = 0
count = 0
details = []
for i, j in zip(predictions, references):
count += 1
gold_parsed = parse(
j,
extraction_mode='first_match',
extraction_config=[
LatexExtractionConfig(),
ExprExtractionConfig(),
],
)
# If parsing result is empty, try adding LaTeX
# environment and parse again
if len(gold_parsed) == 0:
j_with_env = f'${j}$'
gold_parsed = parse(
j_with_env,
extraction_mode='first_match',
extraction_config=[
LatexExtractionConfig(),
ExprExtractionConfig(),
],
)
if len(gold_parsed) != 0:
# We require the answer to be provided in correct
# latex (no malformed operators)
answer_parsed = parse(
i,
extraction_config=[
LatexExtractionConfig(
normalization_config=NormalizationConfig(
nits=False,
malformed_operators=False,
basic_latex=True,
equations=True,
boxed='all',
units=True,
),
# Ensures that boxed is tried first
boxed_match_priority=0,
try_extract_without_anchor=False,
)
],
extraction_mode='first_match',
)

answer_correct = float(verify(answer_parsed, gold_parsed))
correct += answer_correct
detail = {
'pred': str(answer_parsed),
'answer': str(gold_parsed),
'correct': True if answer_correct else False,
}
details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details}
return result


if __name__ == '__main__':
import sympy

test_cases = [
# 1. Basic arithmetic operations
r'Simple fraction: \boxed{\frac{1}{2}}',
r'Addition: \boxed{2 + 3}',
r'Multiplication: \boxed{2 \times 3}',
# 2. Algebraic expressions
r'Quadratic: \boxed{x^2 + 2x + 1}',
r'Polynomial: \boxed{3x^3 - 2x^2 + 4x - 5}',
# 3. Trigonometric functions
r'Trigonometry: \boxed{\sin(x) + \cos(x)}',
r'Complex trig: \boxed{\tan^2(x) + \sec^2(x)}',
# 4. Roots and exponents
r'Square root: \boxed{\sqrt{16}}',
r'Complex root: \boxed{\sqrt[3]{x^2 + 1}}',
# 5. Logarithms
r'Natural log: \boxed{\ln(e^2)}',
r'Log base: \boxed{\log_2(8)}',
# 6. Limits and summations
r'Limit: \boxed{\lim_{x \to 0} \frac{\sin(x)}{x}}',
r'Sum: \boxed{\sum_{i=1}^{n} i}',
# 7. Integrals
r'Integral: \boxed{\int_{0}^{1} x^2 dx}',
r'Double integral: \boxed{\int_{0}^{1}\int_{0}^{1} xy \,dx\,dy}',
# 8. Matrices
r'Matrix: \boxed{\begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix}}',
# 9. Complex combinations
r'Complex expr: \boxed{\frac{\sqrt{x^2 + 1}}{\ln(x)} + '
r'\int_{0}^{x} t^2 dt}',
# 10. Error cases
r'Empty: \boxed{}',
r'Invalid: \boxed{\frac{1}}', # Missing denominator
r'Nested: \boxed{\boxed{1}}', # Nested boxed
]

def print_result(expr: str, result: list):
print('\n' + '=' * 50)
print(f'Input: {expr}')
print(f'Output type: {type(result)}')
print(f'Output: {result}')

# If result is sympy expression, show more information
if result:
for item in result:
if isinstance(item, sympy.Basic):
print(f'Sympy repr: {repr(item)}')
try:
print(f'Evaluated: {item.evalf()}')
except Exception as e:
print(f'Cannot evaluate: {e}')

# Test all cases
for test_expr in test_cases:
try:
result = parse(test_expr)
print_result(test_expr, result)
except Exception as e:
print(f'\nError processing {test_expr}: {e}')

# Special test: verify numerical calculations
numerical_tests = [
r'\boxed{2 + 2}', # Should equal 4
r'\boxed{\frac{1}{2} + \frac{1}{3}}', # Should equal 5/6
r'\boxed{\sqrt{16} + \sqrt{9}}', # Should equal 7
]

print('\n' + '=' * 50 + '\nNumerical Verification Tests:')
for test_expr in numerical_tests:
try:
result = parse(test_expr)
if result and isinstance(result[0], sympy.Basic):
expr = result[0]
print(f'\nExpression: {test_expr}')
print(f'Symbolic: {expr}')
print(f'Numerical value: {float(expr.evalf())}')
except Exception as e:
print(f'\nError in numerical test {test_expr}: {e}')
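For orientation, a minimal usage sketch of the new evaluator; the prediction and reference strings are invented, and the import path simply mirrors the file location shown above:

from opencompass.openicl.icl_evaluator.math_evaluator import MATHEvaluator

evaluator = MATHEvaluator()
result = evaluator.score(
    predictions=[r'The answer is \boxed{\frac{1}{2}}'],
    references=[r'\frac{1}{2}'],
)
print(result['accuracy'])   # 100.0 when math_verify judges the pair equivalent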
@ -286,7 +286,7 @@ class DLCRunner(BaseRunner):
f'Failed to get job info for {job_id}')

status = job_info['Status']
if status == 'Failed':
if status == 'Failed' or status == 'Stopped':
return -1
elif status == 'Succeeded':
return 0
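The change maps a manually stopped DLC job to the same failure code as a failed one. A condensed sketch of the resulting mapping (any other status is assumed to mean the job is still running):

def exit_code_for(status):
    # 'Failed' and 'Stopped' both count as failure; 'Succeeded' as success.
    if status in ('Failed', 'Stopped'):
        return -1
    if status == 'Succeeded':
        return 0
    return None   # assumed: still pending, keep polling

print(exit_code_for('Stopped'))   # -1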
@ -34,39 +34,29 @@ MAP = {
'总分',
'中文总分',
'英文总分',
'instruct/compassbenchv1_4_IF_en_fofo_sub',
'instruct/compassbenchv1_4_IF_zh_fofo_sub',
'instruct/compassbench_2501_IF_en_chatIF_sub',
'instruct/compassbench_2501_IF_en_functionalIF_sub',
'instruct/compassbench_2501_IF_cn_chatIF_sub',
'instruct/compassbench_2501_IF_cn_functionalIF_sub',
],
'language': [
'总分',
'中文总分',
'英文总分',
'language/compassbenchv1_4_language_zh_chat_sub',
'language/compassbenchv1_4_language_zh_creation_sub',
'language/compassbenchv1_4_language_zh_NLP_sub',
'language/compassbenchv1_4_language_en_chat_sub',
'language/compassbenchv1_4_language_en_creation_sub',
'language/compassbenchv1_4_language_en_NLP_sub',
'language/compassbench_v2501_language_zh_chat_sub',
'language/compassbench_v2501_language_zh_nlp_sub',
'language/compassbench_v2501_language_zh_creation_sub',
'language/compassbench_v2501_language_en_chat_sub',
'language/compassbench_v2501_language_en_nlp_sub',
'language/compassbench_v2501_language_en_creation_sub',
],
'reasoning': [

'code': [
'总分',
'中文总分',
'英文总分',
'reasoning/compassbenchv1_4_reasoning_en_CommonSenseSense_sub',
'reasoning/compassbenchv1_4_reasoning_en_Humanities_sub',
'reasoning/compassbenchv1_4_reasoning_en_ScienceEngineering_sub',
'reasoning/compassbenchv1_4_reasoning_en_Social_sub',
'reasoning/compassbenchv1_4_reasoning_zh_CommonSenseSense_sub',
'reasoning/compassbenchv1_4_reasoning_zh_Humanities_sub',
'reasoning/compassbenchv1_4_reasoning_zh_ScienceEngineering_sub',
'reasoning/compassbenchv1_4_reasoning_zh_Social_sub',
],
'coding': [
'总分',
'中文总分',
'英文总分',
'coding/compassbenchv1_4_coding_en_sub',
'coding/compassbenchv1_4_coding_zh_sub',
'code/compassbench_2501_code_arena_en_sub',
'code/compassbench_2501_code_arena_zh_sub',
],
}

@ -1,6 +1,5 @@
import argparse
import copy
import fnmatch
import math
import os
import os.path as osp
@ -18,9 +17,8 @@ from mmengine.utils import mkdir_or_exist
from opencompass.registry import (ICL_EVALUATORS, MODELS, TASKS,
TEXT_POSTPROCESSORS)
from opencompass.tasks.base import BaseTask, extract_role_pred
from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
get_infer_output_path, get_logger,
task_abbr_from_cfg)
from opencompass.utils import (build_dataset_from_cfg, get_infer_output_path,
get_logger, task_abbr_from_cfg)


@TASKS.register_module()
@ -60,19 +58,9 @@ class OpenICLEvalTask(BaseTask):
self.dataset_cfg = dataset_cfg

# Load Dataset
self.eval_cfg = self.dataset_cfg.get('eval_cfg')
self.output_column = dataset_cfg['reader_cfg']['output_column']

# overwrite postprocessor if the model has specified one
ds_abbr = dataset_abbr_from_cfg(self.dataset_cfg)
model_postprocessors = self.model_cfg.get(
'pred_postprocessor', {})
for pattern in model_postprocessors.keys():
if fnmatch.fnmatch(ds_abbr, pattern):
self.eval_cfg[
'pred_postprocessor'] = model_postprocessors[
pattern] # noqa
break
self.eval_cfg = copy.deepcopy(dataset_cfg.get('eval_cfg'))
self.output_column = copy.deepcopy(
dataset_cfg['reader_cfg']['output_column'])

out_path = get_infer_output_path(
self.model_cfg, self.dataset_cfg,
@ -155,8 +143,20 @@ class OpenICLEvalTask(BaseTask):
]

# Postprocess predictions if necessary
# Model Specified Postprocessor
if 'pred_postprocessor' in self.model_cfg:
kwargs = copy.deepcopy(self.model_cfg['pred_postprocessor'])
proc = kwargs.pop('type')
if isinstance(proc, str):
proc = TEXT_POSTPROCESSORS.get(proc)
if pred_list_flag:
pred_strs = [[proc(s, **kwargs) for s in preds]
for preds in pred_strs]
else:
pred_strs = [proc(s, **kwargs) for s in pred_strs]
# Dataset Specified Postprocessor
if 'pred_postprocessor' in self.eval_cfg:
kwargs = self.eval_cfg['pred_postprocessor']
kwargs = copy.deepcopy(self.eval_cfg['pred_postprocessor'])
proc = kwargs.pop('type')
if isinstance(proc, str):
proc = TEXT_POSTPROCESSORS.get(proc)
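The rewrite deep-copies every postprocessor config before `kwargs.pop('type')`, so the shared dict is not mutated across datasets, and applies a model-specified postprocessor before the dataset-specified one. A rough sketch of the two hooks in a config (the names and pattern are illustrative, not taken from the repo):

# Illustrative config fragments only.
model_cfg = dict(
    abbr='my-chat-model',
    pred_postprocessor=dict(type='extract-non-reasoning-content'),  # model-level, applied first
)
dataset_eval_cfg = dict(
    pred_postprocessor=dict(type='think_pred', re_pattern=r'</think>(.*)$'),  # dataset-level, applied second
)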
@ -198,14 +198,24 @@ class SubjectiveEvalTask(BaseTask):
if fnmatch.fnmatch(ds_abbr, pattern):
pred_postprocessor = model_postprocessors[pattern]
break

if 'pred_postprocessor' in eval_cfg['evaluator'] or pred_postprocessor:
kwargs = pred_postprocessor or eval_cfg['evaluator'][
'pred_postprocessor']
proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
temp_kwargs = copy.deepcopy(kwargs)
proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type'))
self.logger.info('Get postprocessor {postprocessor}.')
pred_strs = [proc(s, **kwargs) for s in pred_strs]
pred_strs = [proc(s, **temp_kwargs) for s in pred_strs]
else:
self.logger.info('No postprocessor found.')
self.logger.info('No dataset postprocessor found.')

if 'pred_postprocessor' in model_cfg or pred_postprocessor:
kwargs = pred_postprocessor or model_cfg['pred_postprocessor']
temp_kwargs = copy.deepcopy(kwargs)
proc = TEXT_POSTPROCESSORS.get(temp_kwargs.pop('type'))
pred_strs = [proc(s, **temp_kwargs) for s in pred_strs]
else:
self.logger.info('No model postprocessor found.')

return {
'model_name': model_abbr_from_cfg(model_cfg),
@ -329,7 +339,9 @@ class SubjectiveEvalTask(BaseTask):
if fnmatch.fnmatch(ds_abbr, pattern):
pred_postprocessor = model_postprocessors[pattern]
break

if 'pred_postprocessor' in eval_cfg or pred_postprocessor:

kwargs = pred_postprocessor or eval_cfg['pred_postprocessor']
proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type'))
pred_strs = [proc(s, **kwargs) for s in pred_strs]
@ -37,6 +37,7 @@ def general_cn_postprocess(text: str) -> str:

cleaned_text = re.sub(r'\s+', ' ', no_articles).strip()
import jieba

cleaned_text = ' '.join(jieba.cut(text))
return cleaned_text

@ -57,6 +58,18 @@ def last_capital_postprocess(text: str) -> str:
return ''


@TEXT_POSTPROCESSORS.register_module('think_pred')
def think_pred_postprocess(
prediction: str,
re_pattern: str,
) -> str:
match = re.search(re_pattern, prediction)
if match:
return match.group(1).strip()
else:
return prediction

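A quick usage sketch of the new 'think_pred' hook; the regex below is only an example of the kind of `re_pattern` a config might pass in:

import re

def think_pred_postprocess(prediction, re_pattern):
    match = re.search(re_pattern, prediction)
    return match.group(1).strip() if match else prediction

raw = '<think>some chain of thought</think> The answer is 42.'
print(think_pred_postprocess(raw, r'</think>(.*)$'))   # -> 'The answer is 42.'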
def first_option_postprocess(text: str, options: str, cushion=True) -> str:
"""Find first valid option for text."""

@ -229,3 +242,44 @@ def match_answer_pattern(response_text: str, answer_pattern: str):
match = re.search(answer_pattern, response_text)
extracted_answer = match.group(1) if match else ''
return extracted_answer


@TEXT_POSTPROCESSORS.register_module('extract-non-reasoning-content')
def extract_non_reasoning_content(
text: str,
think_start_token: str = '<think>',
think_end_token: str = '</think>',
) -> str:
"""Extract content after the last reasoning tag from text.

When only end token is present, returns content after the end token.
When both tokens are present, removes all content between start and end tokens.

Args:
text (str): Input text containing reasoning tags.
think_start_token (str, optional): Start token for reasoning section. Defaults to '<think>'.
think_end_token (str, optional): End token for reasoning section. Defaults to '</think>'.

Returns:
str: Processed text after removing reasoning sections.

Examples:
>>> # When only end token exists
>>> text = "This is a test.</think> How are you?"
>>> extract_non_reasoning_content(text)
'How are you?'

>>> # When both tokens exist
>>> text = "Start<think>reasoning here</think> End"
>>> extract_non_reasoning_content(text)
'Start End'
"""
# If text contains only end token, split by end token and take the last part
if think_start_token not in text and think_end_token in text:
return text.split(think_end_token)[-1].strip()

# Original behavior for complete tag pairs
reasoning_regex = re.compile(rf'{think_start_token}(.*?){think_end_token}',
re.DOTALL)
non_reasoning_content = reasoning_regex.sub('', text).strip()
return non_reasoning_content
@ -15,6 +15,8 @@ langdetect
latex2sympy2
# Lawbench, leval
ltp
# Math
math-verify
# Taco, apps Dataset
pyext
# Law Bench