[Doc] Update running command in README (#1206)
commit d59189b87f, parent 0b50112dc1
@@ -1,5 +1,10 @@
# GaokaoBench

```bash
python3 run.py --models hf_internlm2_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets GaokaoBench_no_subjective_gen_4c31db --debug
```

## Base Models

| model | GaokaoBench |
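The same pattern extends beyond `--debug` runs. A minimal sketch, assuming `run.py` also accepts a `-w`/`--work-dir` flag for choosing the output directory (not shown in this diff):

```bash
# Minimal sketch, not taken from this diff: run the base-model config
# without --debug and write results to a custom work directory
# (assumes run.py supports -w/--work-dir).
python3 run.py --models hf_internlm2_7b \
    --datasets GaokaoBench_no_subjective_gen_d21e37 \
    -w outputs/gaokaobench
```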
configs/datasets/IFEval/README.md (new file, 31 lines)
@@ -0,0 +1,31 @@
# IFEval

```bash
python3 run.py --models hf_internlm2_chat_7b --datasets IFEval_gen_3321a3 --debug
```

## Chat Models

| model | Prompt-level-strict-accuracy | Inst-level-strict-accuracy | Prompt-level-loose-accuracy | Inst-level-loose-accuracy |
|:-----------------------------:|-------------------------------:|-----------------------------:|------------------------------:|----------------------------:|
| qwen1.5-0.5b-chat-hf | 13.12 | 23.26 | 15.71 | 26.38 |
| qwen1.5-1.8b-chat-hf | 16.08 | 26.26 | 18.30 | 29.02 |
| qwen1.5-4b-chat-hf | 25.51 | 35.97 | 28.84 | 39.81 |
| qwen1.5-7b-chat-hf | 38.82 | 50.00 | 42.70 | 53.48 |
| qwen1.5-14b-chat-hf | 42.51 | 54.20 | 49.17 | 59.95 |
| qwen1.5-32b-chat-hf | 49.54 | 60.43 | 53.97 | 64.39 |
| qwen1.5-72b-chat-hf | 51.02 | 61.99 | 57.12 | 67.27 |
| qwen1.5-110b-chat-hf | 55.08 | 65.59 | 61.18 | 70.86 |
| internlm2-chat-1.8b-hf | 18.30 | 28.78 | 21.44 | 32.01 |
| internlm2-chat-1.8b-sft-hf | 18.67 | 31.18 | 19.78 | 32.85 |
| internlm2-chat-7b-hf | 34.75 | 46.28 | 40.48 | 51.44 |
| internlm2-chat-7b-sft-hf | 39.19 | 50.12 | 42.33 | 52.76 |
| internlm2-chat-20b-hf | 36.41 | 48.68 | 40.67 | 53.24 |
| internlm2-chat-20b-sft-hf | 44.55 | 55.64 | 46.77 | 58.03 |
| llama-3-8b-instruct-hf | 68.02 | 76.74 | 75.42 | 82.85 |
| llama-3-70b-instruct-hf | 78.00 | 84.65 | 84.29 | 89.21 |
| llama-3-8b-instruct-lmdeploy | 69.13 | 77.46 | 77.26 | 83.93 |
| llama-3-70b-instruct-lmdeploy | 75.97 | 82.97 | 83.18 | 88.37 |
| mistral-7b-instruct-v0.1-hf | 40.30 | 50.96 | 41.96 | 53.48 |
| mistral-7b-instruct-v0.2-hf | 49.17 | 60.43 | 51.94 | 64.03 |
| mixtral-8x7b-instruct-v0.1-hf | 50.09 | 60.67 | 55.64 | 65.83 |
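Other chat models from the table can be evaluated with the same command by swapping the `--models` argument. The sketch below assumes `hf_internlm2_chat_20b` and `lmdeploy_llama3_8b_instruct` are valid model config names; they are not named anywhere in this diff.

```bash
# Hedged sketch: rerun IFEval against other chat models from the table.
# The model config names here are assumptions, not taken from this diff.
python3 run.py --models hf_internlm2_chat_20b --datasets IFEval_gen_3321a3 --debug
python3 run.py --models lmdeploy_llama3_8b_instruct --datasets IFEval_gen_3321a3 --debug
```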
@@ -1,5 +1,10 @@
# TheoremQA

```bash
python3 run.py --models hf_internlm2_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
```

## Base Models

| model | TheoremQA |
@@ -1,5 +1,10 @@
# BBH

```bash
python3 run.py --models hf_internlm2_7b --datasets bbh_gen_98fba6 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets bbh_gen_5b92b0 --debug
```

## Base Models

| model | bbh |
@@ -1,5 +1,10 @@
# C-Eval

```bash
python3 run.py --models hf_internlm2_7b --datasets ceval_internal_ppl_93e5ce --debug
python3 run.py --models hf_internlm2_chat_7b --datasets ceval_internal_gen_2daf24 --debug
```

## Base Models

| model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other |
@@ -1,5 +1,10 @@
# GPQA

```bash
python3 run.py --models hf_internlm2_7b --datasets gpqa_ppl_6bf57a --debug
python3 run.py --models hf_internlm2_chat_7b --datasets gpqa_gen_4baadb --debug
```

## Base Models

| model | GPQA_diamond |
@@ -1,5 +1,10 @@
# GSM8K

```bash
python3 run.py --models hf_internlm2_7b --datasets gsm8k_gen_17d0dc --debug
python3 run.py --models hf_internlm2_chat_7b --datasets gsm8k_gen_1d7fe4 --debug
```

## Base Models

| model | gsm8k |
@@ -1,5 +1,10 @@
# HellaSwag

```bash
python3 run.py --models hf_internlm2_7b --datasets hellaswag_10shot_ppl_59c85e --debug
python3 run.py --models hf_internlm2_chat_7b --datasets hellaswag_10shot_gen_e42710 --debug
```

## Base Models

| model | hellaswag |
@@ -1,5 +1,10 @@
# HumanEval

```bash
python3 run.py --models hf_internlm2_7b --datasets humaneval_gen_d2537e --debug
python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug
```

## Base Models

| model | pass@1 |
@@ -1,5 +1,10 @@
# MATH

```bash
python3 run.py --models hf_internlm2_7b --datasets math_4shot_base_gen_db136b --debug
python3 run.py --models hf_internlm2_chat_7b --datasets math_0shot_gen_393424 --debug
```

## Base Models

| model | math |
@@ -1,9 +1,14 @@
# MBPP

```bash
python3 run.py --models hf_internlm2_7b --datasets sanitized_mbpp_gen_742f0c --debug
python3 run.py --models hf_internlm2_chat_7b --datasets sanitized_mbpp_mdblock_gen_a447ff --debug
```

## Base Models

| model | pass@1 | pass | timeout | failed | wrong_answer |
|:------------------------:|---------:|-------:|----------:|---------:|---------------:|
| llama-7b-turbomind | 25.29 | 65 | 8 | 62 | 122 |
| llama-13b-turbomind | 29.96 | 77 | 4 | 74 | 102 |
| llama-30b-turbomind | 37.35 | 96 | 17 | 39 | 105 |

@@ -39,8 +44,8 @@

## Chat Models

| model | pass@1 | pass | timeout | failed | wrong_answer |
|:-----------------------------:|---------:|-------:|----------:|---------:|---------------:|
| qwen1.5-0.5b-chat-hf | 11.28 | 29 | 1 | 129 | 98 |
| qwen1.5-1.8b-chat-hf | 22.57 | 58 | 2 | 70 | 127 |
| qwen1.5-4b-chat-hf | 43.58 | 112 | 1 | 33 | 111 |
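The `pass@1` column appears to be the `pass` count expressed as a percentage of the sanitized MBPP problem total, i.e. pass@1 ≈ 100 × pass / (pass + timeout + failed + wrong_answer). For llama-7b-turbomind that gives 100 × 65 / (65 + 8 + 62 + 122) = 100 × 65 / 257 ≈ 25.29, matching the table; the 257-problem total is inferred from the rows rather than stated in this diff.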
@@ -1,5 +1,10 @@
# MMLU

```bash
python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug
python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug
```

## Base Models

| model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other |
@@ -1,5 +1,10 @@
# NQ

```bash
python3 run.py --models hf_internlm2_7b --datasets nq_open_1shot_gen_20a989 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets nq_open_1shot_gen_01cf41 --debug
```

## Base Models

| model | nq |
@@ -1,5 +1,10 @@
# RACE

```bash
python3 run.py --models hf_internlm2_7b --datasets race_ppl_abed12 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets race_gen_69ee4f --debug
```

## Base Models

| model | race-high | race-middle |
@@ -1,5 +1,10 @@
# TriviaQA

```bash
python3 run.py --models hf_internlm2_7b --datasets triviaqa_wiki_1shot_gen_20a989 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets triviaqa_wiki_1shot_gen_eaf81e --debug
```

## Base Models

| model | triviaqa |
@@ -1,5 +1,10 @@
# WinoGrande

```bash
python3 run.py --models hf_internlm2_7b --datasets winogrande_5shot_ll_252f01 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets winogrande_5shot_gen_b36770 --debug
```

## Base Models

| model | winogrande |
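All of the updated commands share the same shape, so several datasets can be evaluated in one pass. A minimal sketch, assuming `--datasets` accepts multiple space-separated config names (each command in this diff only shows one dataset at a time):

```bash
# Minimal sketch, not taken from this diff: evaluate the chat model on
# several of the datasets above in a single run (assumes --datasets
# accepts multiple config names).
python3 run.py \
    --models hf_internlm2_chat_7b \
    --datasets gsm8k_gen_1d7fe4 math_0shot_gen_393424 IFEval_gen_3321a3 \
    --debug
```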