diff --git a/.github/scripts/oc_score_baseline_fullbench.yaml b/.github/scripts/oc_score_baseline_fullbench.yaml
index 9f171a02..c0e735fb 100644
--- a/.github/scripts/oc_score_baseline_fullbench.yaml
+++ b/.github/scripts/oc_score_baseline_fullbench.yaml
@@ -24,8 +24,8 @@ internlm2_5-7b-chat-hf_fullbench:
     lcb_test_output_pass@1: 18.75
     bbh-logical_deduction_seven_objects_score: 50
     bbh-multistep_arithmetic_two_score: 68.75
-    mmlu-other_naive_average: 72.6
-    cmmlu-china-specific_naive_average: 76.25
+    mmlu-other_accuracy: 72.6
+    cmmlu-china-specific_accuracy: 76.25
     mmlu_pro_math_accuracy: 25
     ds1000_Pandas_accuracy: 12.5
     ds1000_Numpy_accuracy: 0
@@ -101,8 +101,8 @@ internlm2_5-7b-chat-turbomind_fullbench:
     lcb_test_output_pass@1: 25.00
     bbh-logical_deduction_seven_objects_score: 50.00
     bbh-multistep_arithmetic_two_score: 68.75
-    mmlu-other_naive_average: 69.71
-    cmmlu-china-specific_naive_average: 75.83
+    mmlu-other_accuracy: 69.71
+    cmmlu-china-specific_accuracy: 75.83
     mmlu_pro_math_accuracy: 31.25
     ds1000_Pandas_accuracy: 0
     ds1000_Numpy_accuracy: 0
@@ -234,15 +234,15 @@ internlm2_5-7b-turbomind:
     sanitized_mbpp_score: 55.25
     dingo_en_192_score: 60.94
     dingo_zh_170_score: 67.65
-    mmlu-stem_naive_average: 63.72
-    mmlu-social-science_naive_average: 80.15
-    mmlu-humanities_naive_average: 74.27
-    mmlu-other_naive_average: 71.85
-    cmmlu-stem_naive_average: 67.07
-    cmmlu-social-science_naive_average: 81.49
-    cmmlu-humanities_naive_average: 85.84
-    cmmlu-other_naive_average: 82.69
-    cmmlu-china-specific_naive_average: 79.88
+    mmlu-stem_accuracy: 63.72
+    mmlu-social-science_accuracy: 80.15
+    mmlu-humanities_accuracy: 74.27
+    mmlu-other_accuracy: 71.85
+    cmmlu-stem_accuracy: 67.07
+    cmmlu-social-science_accuracy: 81.49
+    cmmlu-humanities_accuracy: 85.84
+    cmmlu-other_accuracy: 82.69
+    cmmlu-china-specific_accuracy: 79.88
     mmlu_pro_biology_accuracy: 58.58
     mmlu_pro_business_accuracy: 28.01
     mmlu_pro_chemistry_accuracy: 22.79
@@ -281,12 +281,12 @@ internlm2_5-7b-turbomind:
     longbench_naive_average: 46.19
     longbench_zh_naive_average: 49.3
     longbench_en_naive_average: 43.97
-    longbench_single-document-qa_naive_average: 42.84
-    longbench_multi-document-qa_naive_average: 37.29
-    longbench_summarization_naive_average: 23.21
-    longbench_few-shot-learning_naive_average: 61.67
-    longbench_synthetic-tasks_naive_average: 60.05
-    longbench_code-completion_naive_average: 52.09
+    longbench_single-document-qa_score: 42.84
+    longbench_multi-document-qa_score: 41.25
+    longbench_summarization_score: 23.21
+    longbench_few-shot-learning_score: 61.67
+    longbench_synthetic-tasks_score: 60.05
+    longbench_code-completion_score: 52.09
 
 internlm2_5-7b-chat-turbomind:
   objective:
@@ -327,15 +327,15 @@ internlm2_5-7b-chat-turbomind:
     teval_naive_average: 80
     SciCode_sub_accuracy: 5.56
     qa_dingo_cn_score: 99.01
-    mmlu-stem_naive_average: 68.2
-    mmlu-social-science_naive_average: 75.8
-    mmlu-humanities_naive_average: 69.3
-    mmlu-other_naive_average: 71.3
-    cmmlu-stem_naive_average: 66.64
-    cmmlu-social-science_naive_average: 76
-    cmmlu-humanities_naive_average: 77.9
-    cmmlu-other_naive_average: 77.25
-    cmmlu-china-specific_naive_average: 73.6
+    mmlu-stem_accuracy: 68.2
+    mmlu-social-science_accuracy: 75.8
+    mmlu-humanities_accuracy: 69.3
+    mmlu-other_accuracy: 71.3
+    cmmlu-stem_accuracy: 66.64
+    cmmlu-social-science_accuracy: 76
+    cmmlu-humanities_accuracy: 77.9
+    cmmlu-other_accuracy: 77.25
+    cmmlu-china-specific_accuracy: 73.6
     mmlu_pro_biology_accuracy: 66.67
     mmlu_pro_business_accuracy: 47.91
     mmlu_pro_chemistry_accuracy: 35
@@ -448,9 +448,9 @@ internlm2_5-7b-chat-1m-turbomind:
     babilong_32k_naive_average: 48.9
     babilong_128k_naive_average: 40.8
     babilong_256k_naive_average: 23.5
-    longbench_single-document-qa_naive_average: 43.56
-    longbench_multi-document-qa_naive_average: 46.24
-    longbench_summarization_naive_average: 24.32
-    longbench_few-shot-learning_naive_average: 51.67
-    longbench_synthetic-tasks_naive_average: 66.83
-    longbench_code-completion_naive_average: 45.99
+    longbench_single-document-qa_score: 43.56
+    longbench_multi-document-qa_score: 46.24
+    longbench_summarization_score: 24.32
+    longbench_few-shot-learning_score: 51.67
+    longbench_synthetic-tasks_score: 66.83
+    longbench_code-completion_score: 45.99
diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml
index 3cdb3a73..a5a930fa 100644
--- a/.github/workflows/daily-run-test.yml
+++ b/.github/workflows/daily-run-test.yml
@@ -157,7 +157,9 @@ jobs:
           pip install opencompass*.whl --cache-dir ${{env.PIP_CACHE_PATH}}
           pip install opencompass[lmdeploy] --cache-dir ${{env.PIP_CACHE_PATH}}
           pip install opencompass[vllm] --cache-dir ${{env.PIP_CACHE_PATH}}
-          pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install opencompass[full] --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install opencompass[api] --cache-dir ${{env.PIP_CACHE_PATH}}
+          pip install torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 --cache-dir ${{env.PIP_CACHE_PATH}}
           FLASH_ATTENTION_FORCE_BUILD=TRUE pip install /fs-computility/llm/qa-llm-cicd/packages/flash_attn-2.7.0.post2+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
           pip install xformers --index-url https://download.pytorch.org/whl/cu121 --cache-dir ${{env.PIP_CACHE_PATH}}
           cp -r /root/nltk_data ${{env.CONDA_PATH}}/envs/${{env.CONDA_ENV}}/nltk_data
diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml
index 032c4bc0..45fbd634 100644
--- a/.github/workflows/pr-run-test.yml
+++ b/.github/workflows/pr-run-test.yml
@@ -45,7 +45,7 @@ jobs:
           . ${{env.CONDA_PATH}}/bin/activate
           conda activate ${{env.CONDA_ENV}}
           python3 -m pip uninstall opencompass -y
-          python3 -m pip install -e . --cache-dir ${{env.PIP_CACHE_PATH}}
+          python3 -m pip install -e ".[full]" --cache-dir ${{env.PIP_CACHE_PATH}}
           conda info --envs
       - name: conda env
         run: |