From d59189b87fe66a9dace16930f193742969e9fb84 Mon Sep 17 00:00:00 2001 From: Fengzhe Zhou Date: Thu, 30 May 2024 00:06:39 +0800 Subject: [PATCH] [Doc] Update running command in README (#1206) --- configs/datasets/GaokaoBench/README.md | 5 ++ configs/datasets/IFEval/README.md | 31 +++++++ configs/datasets/TheoremQA/README.md | 5 ++ configs/datasets/bbh/README.md | 5 ++ configs/datasets/ceval/README.md | 5 ++ configs/datasets/gpqa/README.md | 5 ++ configs/datasets/gsm8k/README.md | 5 ++ configs/datasets/hellaswag/README.md | 5 ++ configs/datasets/humaneval/README.md | 5 ++ configs/datasets/math/README.md | 5 ++ configs/datasets/mbpp/README.md | 119 +++++++++++++------------ configs/datasets/mmlu/README.md | 5 ++ configs/datasets/nq/README.md | 5 ++ configs/datasets/race/README.md | 5 ++ configs/datasets/triviaqa/README.md | 5 ++ configs/datasets/winogrande/README.md | 5 ++ 16 files changed, 163 insertions(+), 57 deletions(-) create mode 100644 configs/datasets/IFEval/README.md diff --git a/configs/datasets/GaokaoBench/README.md b/configs/datasets/GaokaoBench/README.md index d05c091c..71b0ffac 100644 --- a/configs/datasets/GaokaoBench/README.md +++ b/configs/datasets/GaokaoBench/README.md @@ -1,5 +1,10 @@ # GaokaoBench +```bash +python3 run.py --models hf_internlm2_7b --datasets GaokaoBench_no_subjective_gen_d21e37 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets GaokaoBench_no_subjective_gen_4c31db --debug +``` + ## Base Models | model | GaokaoBench | diff --git a/configs/datasets/IFEval/README.md b/configs/datasets/IFEval/README.md new file mode 100644 index 00000000..3753262c --- /dev/null +++ b/configs/datasets/IFEval/README.md @@ -0,0 +1,31 @@ +# IFEval + +```bash +python3 run.py --models hf_internlm2_chat_7b --datasets IFEval_gen_3321a3 --debug +``` + +## Chat Models + +| model | Prompt-level-strict-accuracy | Inst-level-strict-accuracy | Prompt-level-loose-accuracy | Inst-level-loose-accuracy | +|:-----------------------------:|-------------------------------:|-----------------------------:|------------------------------:|----------------------------:| +| qwen1.5-0.5b-chat-hf | 13.12 | 23.26 | 15.71 | 26.38 | +| qwen1.5-1.8b-chat-hf | 16.08 | 26.26 | 18.30 | 29.02 | +| qwen1.5-4b-chat-hf | 25.51 | 35.97 | 28.84 | 39.81 | +| qwen1.5-7b-chat-hf | 38.82 | 50.00 | 42.70 | 53.48 | +| qwen1.5-14b-chat-hf | 42.51 | 54.20 | 49.17 | 59.95 | +| qwen1.5-32b-chat-hf | 49.54 | 60.43 | 53.97 | 64.39 | +| qwen1.5-72b-chat-hf | 51.02 | 61.99 | 57.12 | 67.27 | +| qwen1.5-110b-chat-hf | 55.08 | 65.59 | 61.18 | 70.86 | +| internlm2-chat-1.8b-hf | 18.30 | 28.78 | 21.44 | 32.01 | +| internlm2-chat-1.8b-sft-hf | 18.67 | 31.18 | 19.78 | 32.85 | +| internlm2-chat-7b-hf | 34.75 | 46.28 | 40.48 | 51.44 | +| internlm2-chat-7b-sft-hf | 39.19 | 50.12 | 42.33 | 52.76 | +| internlm2-chat-20b-hf | 36.41 | 48.68 | 40.67 | 53.24 | +| internlm2-chat-20b-sft-hf | 44.55 | 55.64 | 46.77 | 58.03 | +| llama-3-8b-instruct-hf | 68.02 | 76.74 | 75.42 | 82.85 | +| llama-3-70b-instruct-hf | 78.00 | 84.65 | 84.29 | 89.21 | +| llama-3-8b-instruct-lmdeploy | 69.13 | 77.46 | 77.26 | 83.93 | +| llama-3-70b-instruct-lmdeploy | 75.97 | 82.97 | 83.18 | 88.37 | +| mistral-7b-instruct-v0.1-hf | 40.30 | 50.96 | 41.96 | 53.48 | +| mistral-7b-instruct-v0.2-hf | 49.17 | 60.43 | 51.94 | 64.03 | +| mixtral-8x7b-instruct-v0.1-hf | 50.09 | 60.67 | 55.64 | 65.83 | diff --git a/configs/datasets/TheoremQA/README.md b/configs/datasets/TheoremQA/README.md index d62dddad..ab5e57f1 100644 --- a/configs/datasets/TheoremQA/README.md +++ b/configs/datasets/TheoremQA/README.md @@ -1,5 +1,10 @@ # TheoremQA +```bash +python3 run.py --models hf_internlm2_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug +``` + ## Base Models | model | TheoremQA | diff --git a/configs/datasets/bbh/README.md b/configs/datasets/bbh/README.md index e0e35850..34aee487 100644 --- a/configs/datasets/bbh/README.md +++ b/configs/datasets/bbh/README.md @@ -1,5 +1,10 @@ # BBH +```bash +python3 run.py --models hf_internlm2_7b --datasets bbh_gen_98fba6 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets bbh_gen_5b92b0 --debug +``` + ## Base Models | model | bbh | diff --git a/configs/datasets/ceval/README.md b/configs/datasets/ceval/README.md index 4b49f787..6932b73b 100644 --- a/configs/datasets/ceval/README.md +++ b/configs/datasets/ceval/README.md @@ -1,5 +1,10 @@ # C-Eval +```bash +python3 run.py --models hf_internlm2_7b --datasets ceval_internal_ppl_93e5ce --debug +python3 run.py --models hf_internlm2_chat_7b --datasets ceval_internal_gen_2daf24 --debug +``` + ## Base Models | model | ceval-test | ceval-test-hard | ceval-test-stem | ceval-test-social-science | ceval-test-humanities | ceval-test-other | ceval-dev | ceval-dev-hard | ceval-dev-stem | ceval-dev-social-science | ceval-dev-humanities | ceval-dev-other | diff --git a/configs/datasets/gpqa/README.md b/configs/datasets/gpqa/README.md index 05f4aeb9..02560df5 100644 --- a/configs/datasets/gpqa/README.md +++ b/configs/datasets/gpqa/README.md @@ -1,5 +1,10 @@ # GPQA +```bash +python3 run.py --models hf_internlm2_7b --datasets gpqa_ppl_6bf57a --debug +python3 run.py --models hf_internlm2_chat_7b --datasets gpqa_gen_4baadb --debug +``` + ## Base Models | model | GPQA_diamond | diff --git a/configs/datasets/gsm8k/README.md b/configs/datasets/gsm8k/README.md index c91910de..e4b9f9f6 100644 --- a/configs/datasets/gsm8k/README.md +++ b/configs/datasets/gsm8k/README.md @@ -1,5 +1,10 @@ # GSM8K +```bash +python3 run.py --models hf_internlm2_7b --datasets gsm8k_gen_17d0dc --debug +python3 run.py --models hf_internlm2_chat_7b --datasets gsm8k_gen_1d7fe4 --debug +``` + ## Base Models | model | gsm8k | diff --git a/configs/datasets/hellaswag/README.md b/configs/datasets/hellaswag/README.md index dd62abc7..91a5e226 100644 --- a/configs/datasets/hellaswag/README.md +++ b/configs/datasets/hellaswag/README.md @@ -1,5 +1,10 @@ # HellaSwag +```bash +python3 run.py --models hf_internlm2_7b --datasets hellaswag_10shot_ppl_59c85e --debug +python3 run.py --models hf_internlm2_chat_7b --datasets hellaswag_10shot_gen_e42710 --debug +``` + ## Base Models | model | hellaswag | diff --git a/configs/datasets/humaneval/README.md b/configs/datasets/humaneval/README.md index 4637e8a0..c0e1e157 100644 --- a/configs/datasets/humaneval/README.md +++ b/configs/datasets/humaneval/README.md @@ -1,5 +1,10 @@ # HumanEval +```bash +python3 run.py --models hf_internlm2_7b --datasets humaneval_gen_d2537e --debug +python3 run.py --models hf_internlm2_chat_7b --datasets humaneval_gen_8e312c --debug +``` + ## Base Models | model | pass@1 | diff --git a/configs/datasets/math/README.md b/configs/datasets/math/README.md index a11a3908..c498db34 100644 --- a/configs/datasets/math/README.md +++ b/configs/datasets/math/README.md @@ -1,5 +1,10 @@ # MATH +```bash +python3 run.py --models hf_internlm2_7b --datasets math_4shot_base_gen_db136b --debug +python3 run.py --models hf_internlm2_chat_7b --datasets math_0shot_gen_393424 --debug +``` + ## Base Models | model | math | diff --git a/configs/datasets/mbpp/README.md b/configs/datasets/mbpp/README.md index 3d0d8483..2f2b6111 100644 --- a/configs/datasets/mbpp/README.md +++ b/configs/datasets/mbpp/README.md @@ -1,64 +1,69 @@ # MBPP +```bash +python3 run.py --models hf_internlm2_7b --datasets sanitized_mbpp_gen_742f0c --debug +python3 run.py --models hf_internlm2_chat_7b --datasets sanitized_mbpp_mdblock_gen_a447ff --debug +``` + ## Base Models -| model | mbpp/pass@1 | mbpp/pass | mbpp/timeout | mbpp/failed | mbpp/wrong_answer | -|:------------------------:|--------------:|------------:|---------------:|--------------:|--------------------:| -| llama-7b-turbomind | 25.29 | 65 | 8 | 62 | 122 | -| llama-13b-turbomind | 29.96 | 77 | 4 | 74 | 102 | -| llama-30b-turbomind | 37.35 | 96 | 17 | 39 | 105 | -| llama-65b-turbomind | 45.53 | 117 | 10 | 35 | 95 | -| llama-2-7b-turbomind | 26.46 | 68 | 18 | 49 | 122 | -| llama-2-13b-turbomind | 36.58 | 94 | 17 | 45 | 101 | -| llama-2-70b-turbomind | 49.42 | 127 | 12 | 32 | 86 | -| llama-3-8b-turbomind | 54.86 | 141 | 11 | 22 | 83 | -| llama-3-70b-turbomind | 77.82 | 200 | 0 | 10 | 47 | -| internlm2-1.8b-turbomind | 30.74 | 79 | 10 | 61 | 107 | -| internlm2-7b-turbomind | 54.47 | 140 | 11 | 28 | 78 | -| internlm2-20b-turbomind | 59.92 | 154 | 6 | 31 | 66 | -| qwen-1.8b-turbomind | 2.72 | 7 | 16 | 222 | 12 | -| qwen-7b-turbomind | 46.69 | 120 | 10 | 37 | 90 | -| qwen-14b-turbomind | 55.64 | 143 | 0 | 31 | 83 | -| qwen-72b-turbomind | 65.76 | 169 | 0 | 26 | 62 | -| qwen1.5-0.5b-hf | 5.06 | 13 | 13 | 190 | 41 | -| qwen1.5-1.8b-hf | 15.95 | 41 | 19 | 124 | 73 | -| qwen1.5-4b-hf | 45.91 | 118 | 8 | 27 | 104 | -| qwen1.5-7b-hf | 52.14 | 134 | 11 | 24 | 88 | -| qwen1.5-14b-hf | 52.14 | 134 | 16 | 33 | 74 | -| qwen1.5-32b-hf | 59.14 | 152 | 7 | 25 | 73 | -| qwen1.5-72b-hf | 61.09 | 157 | 1 | 21 | 78 | -| qwen1.5-moe-a2-7b-hf | 47.08 | 121 | 0 | 52 | 84 | -| mistral-7b-v0.1-hf | 47.47 | 122 | 9 | 33 | 93 | -| mistral-7b-v0.2-hf | 49.81 | 128 | 9 | 27 | 93 | -| mixtral-8x7b-v0.1-hf | 62.65 | 161 | 10 | 13 | 73 | -| mixtral-8x22b-v0.1-hf | 73.15 | 188 | 1 | 10 | 58 | -| yi-6b-hf | 30.35 | 78 | 8 | 40 | 131 | -| yi-34b-hf | 48.64 | 125 | 0 | 43 | 89 | -| deepseek-7b-base-hf | 43.97 | 113 | 11 | 34 | 99 | -| deepseek-67b-base-hf | 64.98 | 167 | 0 | 24 | 66 | +| model | pass@1 | pass | timeout | failed | wrong_answer | +|:------------------------:|---------:|-------:|----------:|---------:|---------------:| +| llama-7b-turbomind | 25.29 | 65 | 8 | 62 | 122 | +| llama-13b-turbomind | 29.96 | 77 | 4 | 74 | 102 | +| llama-30b-turbomind | 37.35 | 96 | 17 | 39 | 105 | +| llama-65b-turbomind | 45.53 | 117 | 10 | 35 | 95 | +| llama-2-7b-turbomind | 26.46 | 68 | 18 | 49 | 122 | +| llama-2-13b-turbomind | 36.58 | 94 | 17 | 45 | 101 | +| llama-2-70b-turbomind | 49.42 | 127 | 12 | 32 | 86 | +| llama-3-8b-turbomind | 54.86 | 141 | 11 | 22 | 83 | +| llama-3-70b-turbomind | 77.82 | 200 | 0 | 10 | 47 | +| internlm2-1.8b-turbomind | 30.74 | 79 | 10 | 61 | 107 | +| internlm2-7b-turbomind | 54.47 | 140 | 11 | 28 | 78 | +| internlm2-20b-turbomind | 59.92 | 154 | 6 | 31 | 66 | +| qwen-1.8b-turbomind | 2.72 | 7 | 16 | 222 | 12 | +| qwen-7b-turbomind | 46.69 | 120 | 10 | 37 | 90 | +| qwen-14b-turbomind | 55.64 | 143 | 0 | 31 | 83 | +| qwen-72b-turbomind | 65.76 | 169 | 0 | 26 | 62 | +| qwen1.5-0.5b-hf | 5.06 | 13 | 13 | 190 | 41 | +| qwen1.5-1.8b-hf | 15.95 | 41 | 19 | 124 | 73 | +| qwen1.5-4b-hf | 45.91 | 118 | 8 | 27 | 104 | +| qwen1.5-7b-hf | 52.14 | 134 | 11 | 24 | 88 | +| qwen1.5-14b-hf | 52.14 | 134 | 16 | 33 | 74 | +| qwen1.5-32b-hf | 59.14 | 152 | 7 | 25 | 73 | +| qwen1.5-72b-hf | 61.09 | 157 | 1 | 21 | 78 | +| qwen1.5-moe-a2-7b-hf | 47.08 | 121 | 0 | 52 | 84 | +| mistral-7b-v0.1-hf | 47.47 | 122 | 9 | 33 | 93 | +| mistral-7b-v0.2-hf | 49.81 | 128 | 9 | 27 | 93 | +| mixtral-8x7b-v0.1-hf | 62.65 | 161 | 10 | 13 | 73 | +| mixtral-8x22b-v0.1-hf | 73.15 | 188 | 1 | 10 | 58 | +| yi-6b-hf | 30.35 | 78 | 8 | 40 | 131 | +| yi-34b-hf | 48.64 | 125 | 0 | 43 | 89 | +| deepseek-7b-base-hf | 43.97 | 113 | 11 | 34 | 99 | +| deepseek-67b-base-hf | 64.98 | 167 | 0 | 24 | 66 | ## Chat Models -| model | mbpp/pass@1 | mbpp/pass | mbpp/timeout | mbpp/failed | mbpp/wrong_answer | -|:-----------------------------:|--------------:|------------:|---------------:|--------------:|--------------------:| -| qwen1.5-0.5b-chat-hf | 11.28 | 29 | 1 | 129 | 98 | -| qwen1.5-1.8b-chat-hf | 22.57 | 58 | 2 | 70 | 127 | -| qwen1.5-4b-chat-hf | 43.58 | 112 | 1 | 33 | 111 | -| qwen1.5-7b-chat-hf | 50.58 | 130 | 0 | 35 | 92 | -| qwen1.5-14b-chat-hf | 56.03 | 144 | 0 | 24 | 89 | -| qwen1.5-32b-chat-hf | 65.37 | 168 | 2 | 13 | 74 | -| qwen1.5-72b-chat-hf | 66.93 | 172 | 0 | 17 | 68 | -| qwen1.5-110b-chat-hf | 68.48 | 176 | 0 | 16 | 65 | -| internlm2-chat-1.8b-hf | 39.69 | 102 | 0 | 48 | 107 | -| internlm2-chat-1.8b-sft-hf | 36.19 | 93 | 1 | 58 | 105 | -| internlm2-chat-7b-hf | 57.59 | 148 | 0 | 21 | 88 | -| internlm2-chat-7b-sft-hf | 55.64 | 143 | 2 | 22 | 90 | -| internlm2-chat-20b-hf | 68.87 | 177 | 0 | 16 | 64 | -| internlm2-chat-20b-sft-hf | 69.65 | 179 | 0 | 16 | 62 | -| llama-3-8b-instruct-hf | 68.87 | 177 | 0 | 8 | 72 | -| llama-3-70b-instruct-hf | 79.77 | 205 | 0 | 2 | 50 | -| llama-3-8b-instruct-lmdeploy | 66.93 | 172 | 0 | 7 | 78 | -| llama-3-70b-instruct-lmdeploy | 77.82 | 200 | 1 | 2 | 54 | -| mistral-7b-instruct-v0.1-hf | 47.86 | 123 | 0 | 29 | 105 | -| mistral-7b-instruct-v0.2-hf | 45.91 | 118 | 0 | 31 | 108 | -| mixtral-8x7b-instruct-v0.1-hf | 61.48 | 158 | 1 | 13 | 85 | +| model | pass@1 | pass | timeout | failed | wrong_answer | +|:-----------------------------:|---------:|-------:|----------:|---------:|---------------:| +| qwen1.5-0.5b-chat-hf | 11.28 | 29 | 1 | 129 | 98 | +| qwen1.5-1.8b-chat-hf | 22.57 | 58 | 2 | 70 | 127 | +| qwen1.5-4b-chat-hf | 43.58 | 112 | 1 | 33 | 111 | +| qwen1.5-7b-chat-hf | 50.58 | 130 | 0 | 35 | 92 | +| qwen1.5-14b-chat-hf | 56.03 | 144 | 0 | 24 | 89 | +| qwen1.5-32b-chat-hf | 65.37 | 168 | 2 | 13 | 74 | +| qwen1.5-72b-chat-hf | 66.93 | 172 | 0 | 17 | 68 | +| qwen1.5-110b-chat-hf | 68.48 | 176 | 0 | 16 | 65 | +| internlm2-chat-1.8b-hf | 39.69 | 102 | 0 | 48 | 107 | +| internlm2-chat-1.8b-sft-hf | 36.19 | 93 | 1 | 58 | 105 | +| internlm2-chat-7b-hf | 57.59 | 148 | 0 | 21 | 88 | +| internlm2-chat-7b-sft-hf | 55.64 | 143 | 2 | 22 | 90 | +| internlm2-chat-20b-hf | 68.87 | 177 | 0 | 16 | 64 | +| internlm2-chat-20b-sft-hf | 69.65 | 179 | 0 | 16 | 62 | +| llama-3-8b-instruct-hf | 68.87 | 177 | 0 | 8 | 72 | +| llama-3-70b-instruct-hf | 79.77 | 205 | 0 | 2 | 50 | +| llama-3-8b-instruct-lmdeploy | 66.93 | 172 | 0 | 7 | 78 | +| llama-3-70b-instruct-lmdeploy | 77.82 | 200 | 1 | 2 | 54 | +| mistral-7b-instruct-v0.1-hf | 47.86 | 123 | 0 | 29 | 105 | +| mistral-7b-instruct-v0.2-hf | 45.91 | 118 | 0 | 31 | 108 | +| mixtral-8x7b-instruct-v0.1-hf | 61.48 | 158 | 1 | 13 | 85 | diff --git a/configs/datasets/mmlu/README.md b/configs/datasets/mmlu/README.md index b8e4a9af..eaa2181e 100644 --- a/configs/datasets/mmlu/README.md +++ b/configs/datasets/mmlu/README.md @@ -1,5 +1,10 @@ # MMLU +```bash +python3 run.py --models hf_internlm2_7b --datasets mmlu_ppl_ac766d --debug +python3 run.py --models hf_internlm2_chat_7b --datasets mmlu_gen_4d595a --debug +``` + ## Base Models | model | mmlu | mmlu-stem | mmlu-social-science | mmlu-humanities | mmlu-other | diff --git a/configs/datasets/nq/README.md b/configs/datasets/nq/README.md index 3c795f2f..0cf8e714 100644 --- a/configs/datasets/nq/README.md +++ b/configs/datasets/nq/README.md @@ -1,5 +1,10 @@ # NQ +```bash +python3 run.py --models hf_internlm2_7b --datasets nq_open_1shot_gen_20a989 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets nq_open_1shot_gen_01cf41 --debug +``` + ## Base Models | model | nq | diff --git a/configs/datasets/race/README.md b/configs/datasets/race/README.md index f1c55ba7..916bdb80 100644 --- a/configs/datasets/race/README.md +++ b/configs/datasets/race/README.md @@ -1,5 +1,10 @@ # RACE +```bash +python3 run.py --models hf_internlm2_7b --datasets race_ppl_abed12 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets race_gen_69ee4f --debug +``` + ## Base Models | model | race-high | race-middle | diff --git a/configs/datasets/triviaqa/README.md b/configs/datasets/triviaqa/README.md index f5e155a9..c849ed12 100644 --- a/configs/datasets/triviaqa/README.md +++ b/configs/datasets/triviaqa/README.md @@ -1,5 +1,10 @@ # TriviaQA +```bash +python3 run.py --models hf_internlm2_7b --datasets triviaqa_wiki_1shot_gen_20a989 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets triviaqa_wiki_1shot_gen_eaf81e --debug +``` + ## Base Models | model | triviaqa | diff --git a/configs/datasets/winogrande/README.md b/configs/datasets/winogrande/README.md index 81673268..48a5ba70 100644 --- a/configs/datasets/winogrande/README.md +++ b/configs/datasets/winogrande/README.md @@ -1,5 +1,10 @@ # WinoGrande +```bash +python3 run.py --models hf_internlm2_7b --datasets winogrande_5shot_ll_252f01 --debug +python3 run.py --models hf_internlm2_chat_7b --datasets winogrande_5shot_gen_b36770 --debug +``` + ## Base Models | model | winogrande |