From 62dbf047086fe9c9ee950a06168f88283ebadcee Mon Sep 17 00:00:00 2001
From: Fengzhe Zhou
Date: Tue, 14 May 2024 22:42:23 +0800
Subject: [PATCH] [Sync] update github workflow (#1156)

---
 .github/scripts/pr_oc_score_assert.py | 4 +-
 .github/workflows/daily-run-test.yml | 5 +-
 .github/workflows/pr-run-test.yml | 12 +-
 .../{bbh_gen_0a5495.py => bbh_gen_98fba6.py} | 5 +-
 configs/datasets/collections/base_core.py | 4 +-
 configs/datasets/gsm8k/gsm8k_gen_17d0dc.py | 39 ++++
 .../math/math_4shot_base_gen_db136b.py | 30 +++
 ...math_4shot_example_from_google_research.py | 40 ++++
 ...2049.py => math_evaluatorv2_gen_2f4a71.py} | 2 +-
 .../mbpp/sanitized_mbpp_gen_742f0c.py | 82 ++++++++
 .../mbpp/sanitized_mbpp_gen_a0fc46.py | 41 ++++
 configs/models/deepseek/hf_deepseek_v2.py | 18 ++
 .../models/deepseek/hf_deepseek_v2_chat.py | 18 ++
 .../deepseek/lmdeploy_deepseek_series.py | 23 +++
 .../hf_internlm/lmdeploy_internlm2_series.py | 24 +++
 .../models/hf_llama/lmdeploy_llama_series.py | 30 +++
 .../models/mistral/lmdeploy_mistral_series.py | 24 +++
 configs/models/qwen/hf_qwen1_5_110b.py | 12 ++
 configs/models/qwen/hf_qwen1_5_110b_chat.py | 12 ++
 .../models/qwen/lmdeploy_qwen1_5_series.py | 29 +++
 configs/models/qwen/lmdeploy_qwen_series.py | 25 +++
 configs/models/yi/lmdeploy_yi_series.py | 23 +++
 .../summarizers/compassbench_v1_objective.py | 2 +-
 configs/summarizers/groups/legacy/cibench.py | 109 +++++++++++
 opencompass/cli/main.py | 5 +-
 opencompass/models/__init__.py | 8 +-
 opencompass/models/ai360_api.py | 44 +++--
 opencompass/models/baichuan_api.py | 2 +-
 opencompass/models/baidu_api.py | 35 +++-
 opencompass/models/deepseek_api.py | 178 +++++++++++++++++
 opencompass/models/gemini_api.py | 63 ------
 opencompass/models/huggingface.py | 11 +-
 opencompass/models/minimax_api.py | 170 ++++++++++++++++
 opencompass/models/qwen_api.py | 3 +-
 opencompass/models/stepfun_api.py | 182 ++++++++++++++++++
 opencompass/models/turbomind.py | 7 +-
 opencompass/models/xunfei_api.py | 149 ++++++++++++++
 opencompass/runners/dlc.py | 13 +-
 opencompass/runners/local.py | 18 +-
 opencompass/utils/text_postprocessors.py | 12 +-
 40 files changed, 1377 insertions(+), 136 deletions(-)
 rename configs/datasets/bbh/{bbh_gen_0a5495.py => bbh_gen_98fba6.py} (94%)
 create mode 100644 configs/datasets/gsm8k/gsm8k_gen_17d0dc.py
 create mode 100644 configs/datasets/math/math_4shot_base_gen_db136b.py
 create mode 100644 configs/datasets/math/math_4shot_example_from_google_research.py
 rename configs/datasets/math/{math_evaluatorv2_gen_9d2049.py => math_evaluatorv2_gen_2f4a71.py} (96%)
 create mode 100644 configs/datasets/mbpp/sanitized_mbpp_gen_742f0c.py
 create mode 100644 configs/datasets/mbpp/sanitized_mbpp_gen_a0fc46.py
 create mode 100644 configs/models/deepseek/hf_deepseek_v2.py
 create mode 100644 configs/models/deepseek/hf_deepseek_v2_chat.py
 create mode 100644 configs/models/deepseek/lmdeploy_deepseek_series.py
 create mode 100644 configs/models/hf_internlm/lmdeploy_internlm2_series.py
 create mode 100644 configs/models/hf_llama/lmdeploy_llama_series.py
 create mode 100644 configs/models/mistral/lmdeploy_mistral_series.py
 create mode 100644 configs/models/qwen/hf_qwen1_5_110b.py
 create mode 100644 configs/models/qwen/hf_qwen1_5_110b_chat.py
 create mode 100644 configs/models/qwen/lmdeploy_qwen1_5_series.py
 create mode 100644 configs/models/qwen/lmdeploy_qwen_series.py
 create mode 100644 configs/models/yi/lmdeploy_yi_series.py
 create mode 100644 configs/summarizers/groups/legacy/cibench.py
 create mode 100644 opencompass/models/deepseek_api.py
 create
mode 100644 opencompass/models/stepfun_api.py diff --git a/.github/scripts/pr_oc_score_assert.py b/.github/scripts/pr_oc_score_assert.py index 5c4bb85b..6ac8750c 100644 --- a/.github/scripts/pr_oc_score_assert.py +++ b/.github/scripts/pr_oc_score_assert.py @@ -4,7 +4,7 @@ import os import pytest output_path = 'regression_result' -model = 'internlm-chat-7b-hf' +model = 'internlm2-chat-7b-hf' dataset = 'siqa' @@ -22,7 +22,7 @@ class TestChatScore: def test_model_dataset_score(self, result_scores): result_score = result_scores.get(model).get(dataset) - assert_score(result_score, 73.59) + assert_score(result_score, 79.53) def assert_score(score, baseline): diff --git a/.github/workflows/daily-run-test.yml b/.github/workflows/daily-run-test.yml index 922bf433..1b887b23 100644 --- a/.github/workflows/daily-run-test.yml +++ b/.github/workflows/daily-run-test.yml @@ -14,6 +14,9 @@ env: PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub + HF_DATASETS_OFFLINE: 1 + TRANSFORMERS_OFFLINE: 1 + HF_HUB_OFFLINE: 1 jobs: daily_run_test: @@ -42,7 +45,7 @@ jobs: cp -r ${{env.USERSPACE_PREFIX}}/data . rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub - export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; + export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; export HF_HUB_OFFLINE=1; - name: Run test run: | eval "$(conda shell.bash hook)" diff --git a/.github/workflows/pr-run-test.yml b/.github/workflows/pr-run-test.yml index 7ada6945..a754c4aa 100644 --- a/.github/workflows/pr-run-test.yml +++ b/.github/workflows/pr-run-test.yml @@ -21,6 +21,9 @@ env: CONDA_ENV: opencompass_base USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub + HF_DATASETS_OFFLINE: 1 + TRANSFORMERS_OFFLINE: 1 + HF_HUB_OFFLINE: 1 jobs: pr_run_test: @@ -42,21 +45,20 @@ jobs: cp -r ${{env.USERSPACE_PREFIX}}/data . 
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub - export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; - name: Run test run: | eval "$(conda shell.bash hook)" conda activate ${{env.CONDA_ENV}} conda info --envs rm -rf regression_result - python3 run.py --models hf_internlm_chat_7b --datasets siqa_gen --work-dir regression_result --debug + python3 run.py --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug - name: Get result run: | score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}') - if (( ${score%.*} >= 70 && ${score%.*} <= 75 )); then - echo "score is $score between 70 and 75" + if (( ${score%.*} >= 75 && ${score%.*} <= 85 )); then + echo "score is $score between 75 and 85" else - echo "score is $score not between 70 and 75" + echo "score is $score not between 75 and 85" exit 1 fi rm -rf regression_result diff --git a/configs/datasets/bbh/bbh_gen_0a5495.py b/configs/datasets/bbh/bbh_gen_98fba6.py similarity index 94% rename from configs/datasets/bbh/bbh_gen_0a5495.py rename to configs/datasets/bbh/bbh_gen_98fba6.py index 6aebc233..78edd95b 100644 --- a/configs/datasets/bbh/bbh_gen_0a5495.py +++ b/configs/datasets/bbh/bbh_gen_98fba6.py @@ -49,7 +49,7 @@ for _name in bbh_multiple_choice_sets: template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step." ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Q:'])) bbh_eval_cfg = dict( evaluator=dict(type=BBHEvaluator_mcq), pred_role='BOT', @@ -66,6 +66,7 @@ for _name in bbh_multiple_choice_sets: infer_cfg=bbh_infer_cfg.copy(), eval_cfg=bbh_eval_cfg.copy())) + for _name in bbh_free_form_sets: with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f: _hint = f.read() @@ -75,7 +76,7 @@ for _name in bbh_free_form_sets: template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step." 
), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Q:'])) bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT') bbh_datasets.append( diff --git a/configs/datasets/collections/base_core.py b/configs/datasets/collections/base_core.py index 6b677ee1..9df244f2 100644 --- a/configs/datasets/collections/base_core.py +++ b/configs/datasets/collections/base_core.py @@ -10,9 +10,9 @@ with read_base(): from ..race.race_ppl_abed12 import race_datasets from ..winogrande.winogrande_5shot_ll_252f01 import winogrande_datasets from ..hellaswag.hellaswag_10shot_ppl_59c85e import hellaswag_datasets - from ..bbh.bbh_gen_0a5495 import bbh_datasets + from ..bbh.bbh_gen_98fba6 import bbh_datasets from ..gsm8k.gsm8k_gen_ee684f import gsm8k_datasets - from ..math.math_evaluatorv2_gen_9d2049 import math_datasets + from ..math.math_evaluatorv2_gen_2f4a71 import math_datasets from ..TheoremQA.TheoremQA_post_v2_gen_2c2583 import TheoremQA_datasets from ..humaneval.humaneval_gen_d2537e import humaneval_datasets from ..mbpp.deprecated_sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets diff --git a/configs/datasets/gsm8k/gsm8k_gen_17d0dc.py b/configs/datasets/gsm8k/gsm8k_gen_17d0dc.py new file mode 100644 index 00000000..fd448cc8 --- /dev/null +++ b/configs/datasets/gsm8k/gsm8k_gen_17d0dc.py @@ -0,0 +1,39 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator + +gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer') + +gsm8k_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt="Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAnswer:"), + dict(role='BOT', prompt='Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. 
Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n'), + dict(role='HUMAN', prompt="Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\nAnswer:"), + dict(role='BOT', prompt="Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n"), + dict(role='HUMAN', prompt="Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\nAnswer:"), + dict(role='BOT', prompt="When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n"), + dict(role='HUMAN', prompt="Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. 
How many fruits are there?\nLet's think step by step\nAnswer:"), + dict(role='BOT', prompt='For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n'), + dict(role='HUMAN', prompt="Question: {question}\nLet's think step by step\nAnswer:"), + ], + )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Question'])) + +gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator), + pred_postprocessor=dict(type=gsm8k_postprocess), + dataset_postprocessor=dict(type=gsm8k_dataset_postprocess)) + +gsm8k_datasets = [ + dict( + abbr='gsm8k', + type=GSM8KDataset, + path='./data/gsm8k', + reader_cfg=gsm8k_reader_cfg, + infer_cfg=gsm8k_infer_cfg, + eval_cfg=gsm8k_eval_cfg) +] diff --git a/configs/datasets/math/math_4shot_base_gen_db136b.py b/configs/datasets/math/math_4shot_base_gen_db136b.py new file mode 100644 index 00000000..16883f37 --- /dev/null +++ b/configs/datasets/math/math_4shot_base_gen_db136b.py @@ -0,0 +1,30 @@ +from mmengine.config import read_base +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2 + +with read_base(): + from .math_4shot_example_from_google_research import prompt + +math_reader_cfg = dict(input_columns=['problem'], output_column='solution') + +math_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=prompt + '\n\nProblem:\n{problem}\nSolution:'), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=1024, stopping_criteria=['Problem'])) + +# postprocess v2 +math_eval_cfg = dict( + evaluator=dict(type=MATHEvaluator, version='v2'), + pred_postprocessor=dict(type=math_postprocess_v2)) + +math_datasets = [ + dict( + type=MATHDataset, + abbr='math', + path='./data/math/math.json', + reader_cfg=math_reader_cfg, + infer_cfg=math_infer_cfg, + eval_cfg=math_eval_cfg) +] diff --git a/configs/datasets/math/math_4shot_example_from_google_research.py b/configs/datasets/math/math_4shot_example_from_google_research.py new file mode 100644 index 00000000..80feee44 --- /dev/null +++ b/configs/datasets/math/math_4shot_example_from_google_research.py @@ -0,0 +1,40 @@ +# Solving Quantitative Reasoning Problems with Language Models + +prompt = ''' +Problem: +Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$. + +Solution: +The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$. +Final Answer: The final answer is $[2,5)$. I hope it is correct. 
+ +Problem: +If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$ + +Solution: +We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$ +Final Answer: The final answer is $24$. I hope it is correct. + +Problem: +Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight? + +Solution: +If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: +\\begin{align*} +30n&=480\\ +\\Rightarrow\\qquad n&=480/30=\\boxed{16} +\\end{align*} +Final Answer: The final answer is $16$. I hope it is correct. + +Problem: +If the system of equations +\\begin{align*} +6x-4y&=a,\\ +6y-9x &=b. +\\end{align*} +has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero. + +Solution: +If we multiply the first equation by $-\\frac{3}{2}$, we obtain $$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$ +Final Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct. +'''.strip() diff --git a/configs/datasets/math/math_evaluatorv2_gen_9d2049.py b/configs/datasets/math/math_evaluatorv2_gen_2f4a71.py similarity index 96% rename from configs/datasets/math/math_evaluatorv2_gen_9d2049.py rename to configs/datasets/math/math_evaluatorv2_gen_2f4a71.py index e777e1e3..ca9b9b90 100644 --- a/configs/datasets/math/math_evaluatorv2_gen_9d2049.py +++ b/configs/datasets/math/math_evaluatorv2_gen_2f4a71.py @@ -38,7 +38,7 @@ Problem: Solution:""" ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512)) + inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Problem'])) # postprocess v2 math_eval_cfg = dict( diff --git a/configs/datasets/mbpp/sanitized_mbpp_gen_742f0c.py b/configs/datasets/mbpp/sanitized_mbpp_gen_742f0c.py new file mode 100644 index 00000000..5ed9f457 --- /dev/null +++ b/configs/datasets/mbpp/sanitized_mbpp_gen_742f0c.py @@ -0,0 +1,82 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +prompt = ''' +You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests: + +assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5) +assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) +assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) + +[BEGIN] + '\ +def similar_elements(test_tup1, test_tup2): + res = tuple(set(test_tup1) & set(test_tup2)) + return (res)\ +' +[DONE] + + +You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. 
Your code should pass these tests: + +assert is_not_prime(2) == False +assert is_not_prime(10) == True +assert is_not_prime(35) == True + +[BEGIN] + '\ +import math +def is_not_prime(n): + result = False + for i in range(2,int(math.sqrt(n)) + 1): + if n % i == 0: + result = True + return result\ +' +[DONE] + + +You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests: + +assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] +assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] +assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] + +[BEGIN] + '\ +import heapq as hq +def heap_queue_largest(nums,n): + largest_nums = hq.nlargest(n, nums) + return largest_nums\ +' +[DONE] + + +You are an expert Python programmer, and here is your task: {text} Your code should pass these tests: + +{test_list} + +'''.strip() + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict(type=PromptTemplate, template=prompt), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp', + path='./data/mbpp/sanitized-mbpp.jsonl', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/configs/datasets/mbpp/sanitized_mbpp_gen_a0fc46.py b/configs/datasets/mbpp/sanitized_mbpp_gen_a0fc46.py new file mode 100644 index 00000000..48c12835 --- /dev/null +++ b/configs/datasets/mbpp/sanitized_mbpp_gen_a0fc46.py @@ -0,0 +1,41 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2') + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the similar elements from the given two tuple lists.\nYour code should pass these tests:\n\nassert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)\nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)\n',), + dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\n res = tuple(set(test_tup1) & set(test_tup2))\n return (res)' \n[DONE]\n\n",), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a python function to identify non-prime numbers.\nYour code should pass these tests:\n\nassert is_not_prime(2) == False\nassert is_not_prime(10) == True\nassert is_not_prime(35) == True\n',), + dict(role='BOT', prompt="[BEGIN]\n 'import math\ndef is_not_prime(n):\n result = False\n for i in range(2,int(math.sqrt(n)) + 1):\n if n %% i == 0:\n result = True\n return result' \n[DONE]\n\n",), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the largest integers from a given list of numbers using heap queue 
algorithm.\nYour code should pass these tests:\n\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]\n',), + dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\ndef heap_queue_largest(nums,n):\n largest_nums = hq.nlargest(n, nums)\n return largest_nums' \n[DONE]\n\n",), + + dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\n{text}\nYour code should pass these tests:\n\n{test_list}\n',), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT') + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr='sanitized_mbpp', + path='./data/mbpp/sanitized-mbpp.jsonl', + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/configs/models/deepseek/hf_deepseek_v2.py b/configs/models/deepseek/hf_deepseek_v2.py new file mode 100644 index 00000000..e05be313 --- /dev/null +++ b/configs/models/deepseek/hf_deepseek_v2.py @@ -0,0 +1,18 @@ +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='deepseek-v2-hf', + path='deepseek-ai/DeepSeek-V2', + max_out_len=1024, + batch_size=4, + model_kwargs=dict( + device_map='sequential', + torch_dtype='torch.bfloat16', + max_memory={i: '75GB' for i in range(8)}, + attn_implementation='eager' + ), + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/deepseek/hf_deepseek_v2_chat.py b/configs/models/deepseek/hf_deepseek_v2_chat.py new file mode 100644 index 00000000..67dfd0bd --- /dev/null +++ b/configs/models/deepseek/hf_deepseek_v2_chat.py @@ -0,0 +1,18 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='deepseek-v2-chat-hf', + path='deepseek-ai/DeepSeek-V2-Chat', + max_out_len=1024, + batch_size=4, + model_kwargs=dict( + device_map='sequential', + torch_dtype='torch.bfloat16', + max_memory={i: '75GB' for i in range(8)}, + attn_implementation='eager' + ), + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/deepseek/lmdeploy_deepseek_series.py b/configs/models/deepseek/lmdeploy_deepseek_series.py new file mode 100644 index 00000000..5060b0c0 --- /dev/null +++ b/configs/models/deepseek/lmdeploy_deepseek_series.py @@ -0,0 +1,23 @@ +from opencompass.models import LmdeployPytorchModel + +settings = [ + ('deepseek-7b-base-hf', 'deepseek-ai/deepseek-llm-7b-base', 1), + ('deepseek-67b-base-hf', 'deepseek-ai/deepseek-llm-67b-base', 4), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=LmdeployPytorchModel, + abbr=abbr, + path=path, + engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), + gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), + max_out_len=1024, + max_seq_len=2048, + batch_size=16, + concurrency=16, + run_cfg=dict(num_gpus=num_gpus), + ) + ) diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_series.py b/configs/models/hf_internlm/lmdeploy_internlm2_series.py new file mode 100644 index 00000000..f01ee714 --- /dev/null +++ b/configs/models/hf_internlm/lmdeploy_internlm2_series.py @@ -0,0 +1,24 @@ +from opencompass.models import TurboMindModel + +settings = [ + 
('internlm2-1.8b-turbomind', 'internlm/internlm2-1_8b', 1), + ('internlm2-7b-turbomind', 'internlm/internlm2-7b', 1), + ('internlm2-20b-turbomind', 'internlm/internlm2-20b', 2), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=TurboMindModel, + abbr=abbr, + path=path, + engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), + gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), + max_out_len=1024, + max_seq_len=2048, + batch_size=16, + concurrency=16, + run_cfg=dict(num_gpus=num_gpus), + ) + ) diff --git a/configs/models/hf_llama/lmdeploy_llama_series.py b/configs/models/hf_llama/lmdeploy_llama_series.py new file mode 100644 index 00000000..346d920d --- /dev/null +++ b/configs/models/hf_llama/lmdeploy_llama_series.py @@ -0,0 +1,30 @@ +from opencompass.models import TurboMindModel + +settings = [ + ('llama-7b-turbomind', 'huggyllama/llama-7b', 1), + ('llama-13b-turbomind', 'huggyllama/llama-13b', 1), + ('llama-30b-turbomind', 'huggyllama/llama-30b', 2), + ('llama-65b-turbomind', 'huggyllama/llama-65b', 4), + ('llama-2-7b-turbomind', 'meta-llama/Llama-2-7b-hf', 1), + ('llama-2-13b-turbomind', 'meta-llama/Llama-2-13b-hf', 1), + ('llama-2-70b-turbomind', 'meta-llama/Llama-2-70b-hf', 4), + ('llama-3-8b-turbomind', 'meta-llama/Meta-Llama-3-8B', 1), + ('llama-3-70b-turbomind', 'meta-llama/Meta-Llama-3-70B', 4), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=TurboMindModel, + abbr=abbr, + path=path, + engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), + gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), + max_out_len=1024, + max_seq_len=2048, + batch_size=16, + concurrency=16, + run_cfg=dict(num_gpus=num_gpus), + ) + ) diff --git a/configs/models/mistral/lmdeploy_mistral_series.py b/configs/models/mistral/lmdeploy_mistral_series.py new file mode 100644 index 00000000..0bb07c52 --- /dev/null +++ b/configs/models/mistral/lmdeploy_mistral_series.py @@ -0,0 +1,24 @@ +from opencompass.models import LmdeployPytorchModel + +settings = [ + ('mistral-7b-v0.1-pytorch', 'mistralai/Mistral-7B-v0.1', 1), + ('mixtral-8x7b-v0.1-pytorch', 'mistralai/Mixtral-8x7B-v0.1', 2), + ('mixtral-8x22b-v0.1-pytorch', 'mistralai/Mixtral-8x22B-v0.1', 4), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=LmdeployPytorchModel, + abbr=abbr, + path=path, + engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), + gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), + max_out_len=1024, + max_seq_len=2048, + batch_size=16, + concurrency=16, + run_cfg=dict(num_gpus=num_gpus), + ) + ) diff --git a/configs/models/qwen/hf_qwen1_5_110b.py b/configs/models/qwen/hf_qwen1_5_110b.py new file mode 100644 index 00000000..1ba10658 --- /dev/null +++ b/configs/models/qwen/hf_qwen1_5_110b.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFaceBaseModel + +models = [ + dict( + type=HuggingFaceBaseModel, + abbr='qwen1.5-110b-hf', + path='Qwen/Qwen1.5-110B', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen/hf_qwen1_5_110b_chat.py b/configs/models/qwen/hf_qwen1_5_110b_chat.py new file mode 100644 index 00000000..e77bad42 --- /dev/null +++ b/configs/models/qwen/hf_qwen1_5_110b_chat.py @@ -0,0 +1,12 @@ +from opencompass.models import HuggingFacewithChatTemplate + +models = [ + dict( + type=HuggingFacewithChatTemplate, + abbr='qwen1.5-110b-chat-hf', + 
path='Qwen/Qwen1.5-110B-Chat', + max_out_len=1024, + batch_size=8, + run_cfg=dict(num_gpus=4), + ) +] diff --git a/configs/models/qwen/lmdeploy_qwen1_5_series.py b/configs/models/qwen/lmdeploy_qwen1_5_series.py new file mode 100644 index 00000000..f3aef87f --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen1_5_series.py @@ -0,0 +1,29 @@ +from opencompass.models import LmdeployPytorchModel + +settings = [ + ('qwen1.5-0.5b-pytorch', 'Qwen/Qwen1.5-0.5B', 1), + ('qwen1.5-1.8b-pytorch', 'Qwen/Qwen1.5-1.8B', 1), + ('qwen1.5-4b-pytorch', 'Qwen/Qwen1.5-4B', 1), + ('qwen1.5-7b-pytorch', 'Qwen/Qwen1.5-7B', 1), + ('qwen1.5-14b-pytorch', 'Qwen/Qwen1.5-14B', 1), + ('qwen1.5-32b-pytorch', 'Qwen/Qwen1.5-32B', 2), + ('qwen1.5-72b-pytorch', 'Qwen/Qwen1.5-72B', 4), + ('qwen1.5-moe-a2.7b-pytorch', 'Qwen/Qwen1.5-MoE-A2.7B', 1), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=LmdeployPytorchModel, + abbr=abbr, + path=path, + engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), + gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), + max_out_len=1024, + max_seq_len=2048, + batch_size=16, + concurrency=16, + run_cfg=dict(num_gpus=num_gpus), + ) + ) diff --git a/configs/models/qwen/lmdeploy_qwen_series.py b/configs/models/qwen/lmdeploy_qwen_series.py new file mode 100644 index 00000000..12b16820 --- /dev/null +++ b/configs/models/qwen/lmdeploy_qwen_series.py @@ -0,0 +1,25 @@ +from opencompass.models import TurboMindModel + +settings = [ + ('qwen-1.8b-turbomind', 'Qwen/Qwen-1_8B', 1), + ('qwen-7b-turbomind', 'Qwen/Qwen-7B', 1), + ('qwen-14b-turbomind', 'Qwen/Qwen-14B', 1), + ('qwen-72b-turbomind', 'Qwen/Qwen-72B', 4), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=TurboMindModel, + abbr=abbr, + path=path, + engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), + gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), + max_out_len=1024, + max_seq_len=2048, + batch_size=16, + concurrency=16, + run_cfg=dict(num_gpus=num_gpus), + ) + ) diff --git a/configs/models/yi/lmdeploy_yi_series.py b/configs/models/yi/lmdeploy_yi_series.py new file mode 100644 index 00000000..f19476b7 --- /dev/null +++ b/configs/models/yi/lmdeploy_yi_series.py @@ -0,0 +1,23 @@ +from opencompass.models import LmdeployPytorchModel + +settings = [ + ('yi-6b-pytorch', '01-ai/Yi-6B', 1), + ('yi-34b-pytorch', '01-ai/Yi-34B', 2), +] + +models = [] +for abbr, path, num_gpus in settings: + models.append( + dict( + type=LmdeployPytorchModel, + abbr=abbr, + path=path, + engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus), + gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024), + max_out_len=1024, + max_seq_len=2048, + batch_size=16, + concurrency=16, + run_cfg=dict(num_gpus=num_gpus), + ) + ) diff --git a/configs/summarizers/compassbench_v1_objective.py b/configs/summarizers/compassbench_v1_objective.py index 79f8ec28..2d677d0a 100644 --- a/configs/summarizers/compassbench_v1_objective.py +++ b/configs/summarizers/compassbench_v1_objective.py @@ -2,7 +2,7 @@ from mmengine.config import read_base with read_base(): - from .groups.cibench import cibench_summary_groups + from .groups.legacy.cibench import cibench_summary_groups from .groups.plugineval import plugineval_summary_groups diff --git a/configs/summarizers/groups/legacy/cibench.py b/configs/summarizers/groups/legacy/cibench.py new file mode 100644 index 00000000..bc2ab94c --- /dev/null +++ 
b/configs/summarizers/groups/legacy/cibench.py @@ -0,0 +1,109 @@ + +_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch'] +_cibench = ['cibench_' + i for i in _cibench] +cibench_summary_groups = [{'name': 'cibench', 'subsets': _cibench}] + +_cibench_template = ['lightgbm', 'matplotlib', 'nltk', 'opencv', 'pandas', 'pytorch', + 'scipy', 'seaborn', 'sklearn', 'tensorflow'] +_cibench_template = ['cibench_template/' + i for i in _cibench_template] +# number of total exec questions in this module +_cibench_template_weight = { + 'lightgbm': [30, 15, 0, 0], + 'matplotlib': [42, 0, 0, 36], + 'nltk': [70, 30, 20, 10], + 'opencv': [60, 10, 0, 40], + 'pandas': [60, 40, 0, 10], + 'pytorch': [28, 0, 0, 0], + 'scipy': [60, 40, 0, 0], + 'seaborn': [42, 0, 0, 35], + 'sklearn': [42, 6, 0, 18], + 'tensorflow': [36, 6, 0, 12], +} +cibench_summary_groups.extend([ + { + 'name': 'cibench_template:executable', + 'subsets': [[i, 'executable'] for i in _cibench_template], + 'weights': {'cibench_template/' + k : v[0] for k,v in _cibench_template_weight.items()}, + }, + { + 'name': 'cibench_template:numeric_correct', + 'subsets': [[i, 'numeric_correct'] for i in _cibench_template], + 'weights': {'cibench_template/' + k : v[1] for k,v in _cibench_template_weight.items()}, + }, + { + 'name': 'cibench_template:text_score', + 'subsets': [[i, 'text_score'] for i in _cibench_template], + 'weights': {'cibench_template/' + k : v[2] for k,v in _cibench_template_weight.items()}, + }, + { + 'name': 'cibench_template:vis_sim', + 'subsets': [[i, 'vis_sim'] for i in _cibench_template], + 'weights': {'cibench_template/' + k : v[3] for k,v in _cibench_template_weight.items()}, + }, +]) + + +## chinese +_cibench_template_cn = ['lightgbm', 'matplotlib', 'nltk', 'opencv', 'pandas', 'pytorch', + 'scipy', 'seaborn', 'sklearn', 'tensorflow'] +_cibench_template_cn = ['cibench_template_chinese/' + i for i in _cibench_template_cn] +cibench_summary_groups.extend([ + { + 'name': 'cibench_template_cn:executable', + 'subsets': [[i, 'executable'] for i in _cibench_template_cn], + 'weights': {'cibench_template_chinese/' + k : v[0] for k,v in _cibench_template_weight.items()}, + }, + { + 'name': 'cibench_template_cn:numeric_correct', + 'subsets': [[i, 'numeric_correct'] for i in _cibench_template_cn], + 'weights': {'cibench_template_chinese/' + k : v[1] for k,v in _cibench_template_weight.items()}, + }, + { + 'name': 'cibench_template_cn:text_score', + 'subsets': [[i, 'text_score'] for i in _cibench_template_cn], + 'weights': {'cibench_template_chinese/' + k : v[2] for k,v in _cibench_template_weight.items()}, + }, + { + 'name': 'cibench_template_cn:vis_sim', + 'subsets': [[i, 'vis_sim'] for i in _cibench_template_cn], + 'weights': {'cibench_template_chinese/' + k : v[3] for k,v in _cibench_template_weight.items()}, + }, +]) + + +## add more without nltk +cibench_summary_groups.extend([ + { + 'name': 'cibench_template_wo_nltk:executable', + 'subsets': [[i, 'executable'] for i in _cibench_template if 'nltk' not in i], + 'weights': {'cibench_template/' + k : v[0] for k,v in _cibench_template_weight.items() if 'nltk' not in k}, + }, + { + 'name': 'cibench_template_wo_nltk:numeric_correct', + 'subsets': [[i, 'numeric_correct'] for i in _cibench_template if 'nltk' not in i], + 'weights': {'cibench_template/' + k : v[1] for k,v in _cibench_template_weight.items() if 'nltk' not in k}, + }, + { + 'name': 'cibench_template_wo_nltk:vis_sim', + 'subsets': [[i, 'vis_sim'] for i in _cibench_template if 'nltk' not in i], + 'weights': 
{'cibench_template/' + k : v[3] for k,v in _cibench_template_weight.items() if 'nltk' not in k}, + }, +]) + +cibench_summary_groups.extend([ + { + 'name': 'cibench_template_cn_wo_nltk:executable', + 'subsets': [[i, 'executable'] for i in _cibench_template_cn if 'nltk' not in i], + 'weights': {'cibench_template_chinese/' + k : v[0] for k,v in _cibench_template_weight.items() if 'nltk' not in k}, + }, + { + 'name': 'cibench_template_cn_wo_nltk:numeric_correct', + 'subsets': [[i, 'numeric_correct'] for i in _cibench_template_cn if 'nltk' not in i], + 'weights': {'cibench_template_chinese/' + k : v[1] for k,v in _cibench_template_weight.items() if 'nltk' not in k}, + }, + { + 'name': 'cibench_template_cn_wo_nltk:vis_sim', + 'subsets': [[i, 'vis_sim'] for i in _cibench_template_cn if 'nltk' not in i], + 'weights': {'cibench_template_chinese/' + k : v[3] for k,v in _cibench_template_weight.items() if 'nltk' not in k}, + }, +]) diff --git a/opencompass/cli/main.py b/opencompass/cli/main.py index e9b5abb3..c2682ab2 100644 --- a/opencompass/cli/main.py +++ b/opencompass/cli/main.py @@ -170,6 +170,8 @@ def parse_dlc_args(dlc_parser): type=str) + + def parse_hf_args(hf_parser): """These args are all for the quick construction of HuggingFace models.""" hf_parser.add_argument('--hf-type', type=str, choices=['base', 'chat'], default='chat', help='The type of the HuggingFace model, base or chat') @@ -212,7 +214,7 @@ def main(): if args.work_dir is not None: cfg['work_dir'] = args.work_dir else: - cfg.setdefault('work_dir', osp.join('outputs', 'default')) + cfg.setdefault('work_dir', os.path.join('outputs', 'default')) # cfg_time_str defaults to the current time cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S') @@ -340,5 +342,6 @@ def main(): summarizer.summarize(time_str=cfg_time_str) + if __name__ == '__main__': main() diff --git a/opencompass/models/__init__.py b/opencompass/models/__init__.py index fab0824b..d7f7c063 100644 --- a/opencompass/models/__init__.py +++ b/opencompass/models/__init__.py @@ -7,7 +7,8 @@ from .base import BaseModel, LMTemplateParser # noqa: F401 from .base_api import APITemplateParser, BaseAPIModel # noqa: F401 from .bytedance_api import ByteDance # noqa: F401 from .claude_api import Claude # noqa: F401 -from .gemini_api import Gemini, GeminiAllesAPIN # noqa: F401 +from .deepseek_api import DeepseekAPI # noqa: F401 +from .gemini_api import Gemini # noqa: F401 from .glm import GLM130B # noqa: F401 from .huggingface import HuggingFace # noqa: F401 from .huggingface import HuggingFaceCausalLM # noqa: F401 @@ -21,7 +22,7 @@ from .lightllm_api import LightllmAPI # noqa: F401 from .llama2 import Llama2, Llama2Chat # noqa: F401 from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401 from .lmdeploy_tis import LmdeployTisModel # noqa: F401 -from .minimax_api import MiniMax # noqa: F401 +from .minimax_api import MiniMax, MiniMaxChatCompletionV2 # noqa: F401 from .mistral_api import Mistral # noqa: F401 from .mixtral import Mixtral # noqa: F401 from .modelscope import ModelScope, ModelScopeCausalLM # noqa: F401 @@ -31,11 +32,12 @@ from .openai_api import OpenAI # noqa: F401 from .pangu_api import PanGu # noqa: F401 from .qwen_api import Qwen # noqa: F401 from .sensetime_api import SenseTime # noqa: F401 +from .stepfun_api import StepFun # noqa: F401 from .turbomind import TurboMindModel # noqa: F401 from .turbomind_tis import TurboMindTisModel # noqa: F401 from .unigpt_api import UniGPT # noqa: F401 from .vllm import VLLM # noqa: F401 -from .xunfei_api import 
XunFei # noqa: F401 +from .xunfei_api import XunFei, XunFeiSpark # noqa: F401 from .yayi_api import Yayi # noqa: F401 from .zhipuai_api import ZhiPuAI # noqa: F401 from .zhipuai_v2_api import ZhiPuV2AI # noqa: F401 diff --git a/opencompass/models/ai360_api.py b/opencompass/models/ai360_api.py index 87c80564..223a6d79 100644 --- a/opencompass/models/ai360_api.py +++ b/opencompass/models/ai360_api.py @@ -1,4 +1,3 @@ -import time from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Optional, Union @@ -141,29 +140,32 @@ class AI360GPT(BaseAPIModel): self.wait() continue if raw_response.status_code == 200: - try: - msg = response['choices'][0]['message']['content'].strip() - return msg - - except KeyError: - if 'error' in response: - # tpm(token per minitue) limit - if response['erro']['code'] == '1005': - time.sleep(1) - continue - - self.logger.error('Find error message in response: ', - str(response['error'])) + msg = response['choices'][0]['message']['content'].strip() + self.logger.debug(f'Generated: {msg}') + return msg # sensitive content, prompt overlength, network error # or illegal prompt - if (raw_response.status_code == 400 - or raw_response.status_code == 401 - or raw_response.status_code == 402 - or raw_response.status_code == 429 - or raw_response.status_code == 500): - print(raw_response.text) - continue + if raw_response.status_code in [400, 401, 402, 429, 500]: + if 'error' not in response: + print(raw_response.status_code) + print(raw_response.text) + continue + print(response) + # tpm(token per minitue) limit + if response['error']['code'] == '1005': + self.logger.debug('tpm limit, ignoring') + continue + elif response['error']['code'] == '1001': + msg = '参数错误:messages参数过长或max_tokens参数值过大' + self.logger.debug(f'Generated: {msg}') + return msg + else: + print(response) + + self.logger.error('Find error message in response: ', + str(response['error'])) + print(raw_response) max_num_retries += 1 diff --git a/opencompass/models/baichuan_api.py b/opencompass/models/baichuan_api.py index b4cc0dc2..9bca78e6 100644 --- a/opencompass/models/baichuan_api.py +++ b/opencompass/models/baichuan_api.py @@ -145,8 +145,8 @@ class BaiChuan(BaseAPIModel): self.wait() continue if raw_response.status_code == 200: - msg = response['choices'][0]['message']['content'] + self.logger.debug(f'Generated: {msg}') return msg if raw_response.status_code != 200: diff --git a/opencompass/models/baidu_api.py b/opencompass/models/baidu_api.py index ef20807e..51d81ff5 100644 --- a/opencompass/models/baidu_api.py +++ b/opencompass/models/baidu_api.py @@ -53,6 +53,8 @@ class ERNIEBot(BaseAPIModel): self.headers = {'Content_Type': 'application/json'} self.secretkey = secretkey self.key = key + if not url.endswith('?access_token='): + url += '?access_token=' self.url = url access_token, _ = self._generate_access_token() self.access_token = access_token @@ -143,14 +145,25 @@ class ERNIEBot(BaseAPIModel): messages = [{'role': 'user', 'content': input}] else: messages = [] + msg_buffer, last_role = [], None for item in input: - msg = {'content': item['prompt']} - if item['role'] == 'HUMAN': - msg['role'] = 'user' - elif item['role'] == 'BOT': - msg['role'] = 'assistant' + if item['role'] == 'BOT': + role = 'assistant' + else: # USER or SYSTEM + role = 'user' + if role != last_role and last_role is not None: + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = role + messages.append({ + 
'content': '\n'.join(msg_buffer), + 'role': last_role + }) - messages.append(msg) data = {'messages': messages} data.update(self.generation_kwargs) @@ -181,6 +194,7 @@ class ERNIEBot(BaseAPIModel): if raw_response.status_code == 200: try: msg = response['result'] + self.logger.debug(msg) return msg except KeyError: print(response) @@ -188,9 +202,12 @@ class ERNIEBot(BaseAPIModel): if response['error_code'] == 336007: # exceed max length return '' - - time.sleep(1) - continue + elif response['error_code'] == 336103: + # prompt tokens too long + return '' + else: + time.sleep(1) + continue if (response['error_code'] == 110 or response['error_code'] == 100 or response['error_code'] == 111 diff --git a/opencompass/models/deepseek_api.py b/opencompass/models/deepseek_api.py new file mode 100644 index 00000000..dba51937 --- /dev/null +++ b/opencompass/models/deepseek_api.py @@ -0,0 +1,178 @@ +import time +from concurrent.futures import ThreadPoolExecutor +from typing import Dict, List, Optional, Union + +import requests + +from opencompass.utils.prompt import PromptList + +from .base_api import BaseAPIModel + +PromptType = Union[PromptList, str] + + +class DeepseekAPI(BaseAPIModel): + """Model wrapper around DeepseekAPI. + + Documentation: + + Args: + path (str): The name of DeepseekAPI model. + e.g. `moonshot-v1-32k` + key (str): Authorization key. + query_per_second (int): The maximum queries allowed per second + between two consecutive calls of the API. Defaults to 1. + max_seq_len (int): Unused here. + meta_template (Dict, optional): The model's meta prompt + template if needed, in case the requirement of injecting or + wrapping of any meta instructions. + retry (int): Number of retires if the API call fails. Defaults to 2. + """ + + def __init__( + self, + path: str, + key: str, + url: str, + query_per_second: int = 2, + max_seq_len: int = 2048, + meta_template: Optional[Dict] = None, + retry: int = 2, + system_prompt: str = '', + ): + super().__init__(path=path, + max_seq_len=max_seq_len, + query_per_second=query_per_second, + meta_template=meta_template, + retry=retry) + self.headers = { + 'Content-Type': 'application/json', + 'Authorization': 'Bearer ' + key, + } + self.url = url + self.model = path + self.system_prompt = system_prompt + + def generate( + self, + inputs: List[PromptType], + max_out_len: int = 512, + ) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (List[PromptType]): A list of strings or PromptDicts. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + with ThreadPoolExecutor() as executor: + results = list( + executor.map(self._generate, inputs, + [max_out_len] * len(inputs))) + self.flush() + return results + + def _generate( + self, + input: PromptType, + max_out_len: int = 512, + ) -> str: + """Generate results given an input. + + Args: + inputs (PromptType): A string or PromptDict. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + str: The generated string. 
+ """ + assert isinstance(input, (str, PromptList)) + + if isinstance(input, str): + messages = [{'role': 'user', 'content': input}] + else: + messages = [] + msg_buffer, last_role = [], None + for item in input: + item['role'] = 'assistant' if item['role'] == 'BOT' else 'user' + if item['role'] != last_role and last_role is not None: + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = item['role'] + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + + if self.system_prompt: + system = {'role': 'system', 'content': self.system_prompt} + messages.insert(0, system) + + data = {'model': self.model, 'messages': messages} + + max_num_retries = 0 + while max_num_retries < self.retry: + self.acquire() + try: + raw_response = requests.request('POST', + url=self.url, + headers=self.headers, + json=data) + except Exception as err: + print('Request Error:{}'.format(err)) + time.sleep(2) + continue + + try: + response = raw_response.json() + except Exception as err: + print('Response Error:{}'.format(err)) + response = None + self.release() + + if response is None: + print('Connection error, reconnect.') + # if connect error, frequent requests will casuse + # continuous unstable network, therefore wait here + # to slow down the request + self.wait() + continue + + if raw_response.status_code == 200: + # msg = json.load(response.text) + # response + msg = response['choices'][0]['message']['content'] + self.logger.debug(f'Generated: {msg}') + return msg + + if raw_response.status_code == 401: + print('请求被拒绝 api_key错误') + continue + elif raw_response.status_code == 400: + print(messages, response) + print('请求失败,状态码:', raw_response) + msg = 'The request was rejected because high risk' + return msg + elif raw_response.status_code == 429: + print(messages, response) + print('请求失败,状态码:', raw_response) + time.sleep(5) + continue + else: + print(messages, response) + print('请求失败,状态码:', raw_response) + time.sleep(1) + + max_num_retries += 1 + + raise RuntimeError(raw_response) diff --git a/opencompass/models/gemini_api.py b/opencompass/models/gemini_api.py index 5779d3d9..0e9b089f 100644 --- a/opencompass/models/gemini_api.py +++ b/opencompass/models/gemini_api.py @@ -186,66 +186,3 @@ class Gemini(BaseAPIModel): time.sleep(1) raise RuntimeError('API call failed.') - - -class GeminiAllesAPIN(Gemini): - """Model wrapper around Gemini models. - - Documentation: - - Args: - path (str): The name of Gemini model. - e.g. `gemini-pro` - key (str): Authorization key. - query_per_second (int): The maximum queries allowed per second - between two consecutive calls of the API. Defaults to 1. - max_seq_len (int): Unused here. - meta_template (Dict, optional): The model's meta prompt - template if needed, in case the requirement of injecting or - wrapping of any meta instructions. - retry (int): Number of retires if the API call fails. Defaults to 2. 
- """ - - def __init__( - self, - path: str, - key: str, - url: str, - query_per_second: int = 2, - max_seq_len: int = 2048, - meta_template: Optional[Dict] = None, - retry: int = 2, - temperature: float = 1.0, - top_p: float = 0.8, - top_k: float = 10.0, - ): - super().__init__(key=key, - path=path, - max_seq_len=max_seq_len, - query_per_second=query_per_second, - meta_template=meta_template, - retry=retry) - # Replace the url and headers into AllesApin - self.url = url - self.headers = { - 'alles-apin-token': key, - 'content-type': 'application/json', - } - - def generate( - self, - inputs: List[PromptType], - max_out_len: int = 512, - ) -> List[str]: - """Generate results given a list of inputs. - - Args: - inputs (List[PromptType]): A list of strings or PromptDicts. - The PromptDict should be organized in OpenCompass' - API format. - max_out_len (int): The maximum length of the output. - - Returns: - List[str]: A list of generated strings. - """ - return super().generate(inputs, max_out_len) diff --git a/opencompass/models/huggingface.py b/opencompass/models/huggingface.py index 3974ad52..a91ffea0 100644 --- a/opencompass/models/huggingface.py +++ b/opencompass/models/huggingface.py @@ -289,13 +289,13 @@ class HuggingFace(BaseModel): tokens = self.tokenizer.batch_encode_plus(inputs, padding=True, truncation=True, - max_length=self.max_seq_len - - max_out_len) + max_length=self.max_seq_len) tokens = { k: torch.tensor(np.array(tokens[k]), device=self.model.device) for k in tokens if k in ['input_ids', 'attention_mask'] } + origin_stopping_criteria = stopping_criteria if stopping_criteria: # Construct huggingface stopping criteria if self.tokenizer.eos_token is not None: @@ -332,6 +332,9 @@ class HuggingFace(BaseModel): if self.end_str: decodeds = [token.split(self.end_str)[0] for token in decodeds] + if origin_stopping_criteria: + for t in origin_stopping_criteria: + decodeds = [token.split(t)[0] for token in decodeds] return decodeds def _single_generate(self, @@ -382,6 +385,7 @@ class HuggingFace(BaseModel): max_length=self.max_seq_len - max_out_len)['input_ids'] input_ids = torch.tensor(input_ids, device=self.model.device) + origin_stopping_criteria = stopping_criteria if stopping_criteria: # Construct huggingface stopping criteria if self.tokenizer.eos_token is not None: @@ -419,6 +423,9 @@ class HuggingFace(BaseModel): if self.end_str: decodeds = [token.split(self.end_str)[0] for token in decodeds] + if origin_stopping_criteria: + for t in origin_stopping_criteria: + decodeds = [token.split(t)[0] for token in decodeds] return decodeds def get_logits(self, inputs: List[str]): diff --git a/opencompass/models/minimax_api.py b/opencompass/models/minimax_api.py index c069dc97..44ea267a 100644 --- a/opencompass/models/minimax_api.py +++ b/opencompass/models/minimax_api.py @@ -180,3 +180,173 @@ class MiniMax(BaseAPIModel): max_num_retries += 1 raise RuntimeError(response.text) + + +class MiniMaxChatCompletionV2(BaseAPIModel): + """Model wrapper around MiniMax ChatCompletionV2. + + Documentation: + + Args: + path (str): The name of MiniMax model. + e.g. `moonshot-v1-32k` + key (str): Authorization key. + query_per_second (int): The maximum queries allowed per second + between two consecutive calls of the API. Defaults to 1. + max_seq_len (int): Unused here. + meta_template (Dict, optional): The model's meta prompt + template if needed, in case the requirement of injecting or + wrapping of any meta instructions. + retry (int): Number of retires if the API call fails. Defaults to 2. 
+ """ + + def __init__( + self, + path: str, + key: str, + url: str, + query_per_second: int = 2, + max_seq_len: int = 2048, + meta_template: Optional[Dict] = None, + retry: int = 2, + ): + super().__init__(path=path, + max_seq_len=max_seq_len, + query_per_second=query_per_second, + meta_template=meta_template, + retry=retry) + self.headers = { + 'Content-Type': 'application/json', + 'Authorization': 'Bearer ' + key, + } + self.url = url + self.model = path + + def generate( + self, + inputs: List[PromptType], + max_out_len: int = 512, + ) -> List[str]: + """Generate results given a list of inputs. + + Args: + inputs (List[PromptType]): A list of strings or PromptDicts. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + List[str]: A list of generated strings. + """ + with ThreadPoolExecutor() as executor: + results = list( + executor.map(self._generate, inputs, + [max_out_len] * len(inputs))) + self.flush() + return results + + def _generate( + self, + input: PromptType, + max_out_len: int = 512, + ) -> str: + """Generate results given an input. + + Args: + inputs (PromptType): A string or PromptDict. + The PromptDict should be organized in OpenCompass' + API format. + max_out_len (int): The maximum length of the output. + + Returns: + str: The generated string. + """ + assert isinstance(input, (str, PromptList)) + + if isinstance(input, str): + messages = [{'role': 'user', 'content': input}] + else: + messages = [] + msg_buffer, last_role = [], None + for item in input: + item['role'] = 'assistant' if item['role'] == 'BOT' else 'user' + if item['role'] != last_role and last_role is not None: + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = item['role'] + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + + data = { + 'model': self.model, + 'messages': messages, + 'max_tokens': max_out_len + } + + max_num_retries = 0 + while max_num_retries < self.retry: + self.acquire() + try: + raw_response = requests.request('POST', + url=self.url, + headers=self.headers, + json=data) + except Exception as err: + print('Request Error:{}'.format(err)) + time.sleep(2) + continue + + response = raw_response.json() + self.release() + + if response is None: + print('Connection error, reconnect.') + # if connect error, frequent requests will casuse + # continuous unstable network, therefore wait here + # to slow down the request + self.wait() + continue + + if raw_response.status_code == 200: + try: + msg = response['choices'][0]['message']['content'] + self.logger.debug(f'Generated: {msg}') + return msg + except Exception: + code = response.get('base_resp', {}).get('status_code') + if code == 1002: + # rate limit + time.sleep(1) + continue + elif code == 1027: + return 'The request was rejected because high risk' + print(messages, response) + pass + + elif raw_response.status_code == 401: + print('请求被拒绝 api_key错误') + continue + elif raw_response.status_code == 400: + print(messages, response) + print('请求失败,状态码:', raw_response) + msg = 'The request was rejected because high risk' + return msg + elif raw_response.status_code == 429: + print(messages, response) + print('请求失败,状态码:', raw_response) + time.sleep(5) + continue + else: + print(messages, response) + print('请求失败,状态码:', raw_response) + time.sleep(1) + + max_num_retries += 1 + + raise RuntimeError(raw_response) diff --git 
diff --git a/opencompass/models/qwen_api.py b/opencompass/models/qwen_api.py
index 1f34cd2c..d22c0785 100644
--- a/opencompass/models/qwen_api.py
+++ b/opencompass/models/qwen_api.py
@@ -152,8 +152,7 @@ class Qwen(BaseAPIModel):
             if response.status_code == 200:
                 try:
                     msg = response.output.text
-                    print('=' * 128)
-                    print(msg)
+                    self.logger.debug(msg)
                     return msg
                 except KeyError:
                     print(response)
diff --git a/opencompass/models/stepfun_api.py b/opencompass/models/stepfun_api.py
new file mode 100644
index 00000000..54d1a82b
--- /dev/null
+++ b/opencompass/models/stepfun_api.py
@@ -0,0 +1,182 @@
+import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, List, Optional, Union
+
+import requests
+
+from opencompass.utils.prompt import PromptList
+
+from .base_api import BaseAPIModel
+
+PromptType = Union[PromptList, str]
+
+
+class StepFun(BaseAPIModel):
+    """Model wrapper around StepFun.
+
+    Documentation:
+
+    Args:
+        path (str): The name of the StepFun model.
+        key (str): Authorization key.
+        query_per_second (int): The maximum queries allowed per second
+            between two consecutive calls of the API. Defaults to 1.
+        max_seq_len (int): Unused here.
+        meta_template (Dict, optional): The model's meta prompt
+            template if needed, in case the requirement of injecting or
+            wrapping of any meta instructions.
+        retry (int): Number of retries if the API call fails. Defaults to 2.
+    """
+
+    def __init__(
+        self,
+        path: str,
+        key: str,
+        url: str,
+        query_per_second: int = 2,
+        max_seq_len: int = 2048,
+        meta_template: Optional[Dict] = None,
+        retry: int = 2,
+        system_prompt: str = '',
+    ):
+        super().__init__(path=path,
+                         max_seq_len=max_seq_len,
+                         query_per_second=query_per_second,
+                         meta_template=meta_template,
+                         retry=retry)
+        self.headers = {
+            'Content-Type': 'application/json',
+            'Authorization': 'Bearer ' + key,
+        }
+        self.url = url
+        self.model = path
+        self.system_prompt = system_prompt
+
+    def generate(
+        self,
+        inputs: List[PromptType],
+        max_out_len: int = 512,
+    ) -> List[str]:
+        """Generate results given a list of inputs.
+
+        Args:
+            inputs (List[PromptType]): A list of strings or PromptDicts.
+                The PromptDict should be organized in OpenCompass'
+                API format.
+            max_out_len (int): The maximum length of the output.
+
+        Returns:
+            List[str]: A list of generated strings.
+        """
+        with ThreadPoolExecutor() as executor:
+            results = list(
+                executor.map(self._generate, inputs,
+                             [max_out_len] * len(inputs)))
+        self.flush()
+        return results
+
+    def _generate(
+        self,
+        input: PromptType,
+        max_out_len: int = 512,
+    ) -> str:
+        """Generate results given an input.
+
+        Args:
+            inputs (PromptType): A string or PromptDict.
+                The PromptDict should be organized in OpenCompass'
+                API format.
+            max_out_len (int): The maximum length of the output.
+
+        Returns:
+            str: The generated string.
+ """ + assert isinstance(input, (str, PromptList)) + + if isinstance(input, str): + messages = [{'role': 'user', 'content': input}] + else: + messages = [] + msg_buffer, last_role = [], None + for item in input: + item['role'] = 'assistant' if item['role'] == 'BOT' else 'user' + if item['role'] != last_role and last_role is not None: + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + msg_buffer = [] + msg_buffer.append(item['prompt']) + last_role = item['role'] + messages.append({ + 'content': '\n'.join(msg_buffer), + 'role': last_role + }) + + if self.system_prompt: + system = {'role': 'system', 'content': self.system_prompt} + messages.insert(0, system) + + data = {'model': self.model, 'messages': messages} + + max_num_retries = 0 + while max_num_retries < self.retry: + self.acquire() + try: + raw_response = requests.request('POST', + url=self.url, + headers=self.headers, + json=data) + except Exception as err: + print('Request Error:{}'.format(err)) + time.sleep(2) + continue + + try: + response = raw_response.json() + except Exception: + response = None + self.release() + + if response is None: + print('Connection error, reconnect.') + # if connect error, frequent requests will casuse + # continuous unstable network, therefore wait here + # to slow down the request + self.wait() + continue + + if raw_response.status_code == 200: + # msg = json.load(response.text) + # response + msg = response['choices'][0]['message']['content'] + self.logger.debug(f'Generated: {msg}') + return msg + + if raw_response.status_code == 400: + print(messages, response) + print('请求失败,状态码:', raw_response) + msg = 'The context length exceeded' + return msg + elif raw_response.status_code == 403: + print('请求被拒绝 api_key错误') + continue + elif raw_response.status_code == 429: + print(messages, response) + print('请求失败,状态码:', raw_response) + time.sleep(5) + continue + elif raw_response.status_code == 451: + print(messages, response) + print('请求失败,状态码:', raw_response) + msg = 'The request was rejected because high risk' + return msg + else: + print(messages, response) + print('请求失败,状态码:', raw_response) + time.sleep(1) + + max_num_retries += 1 + + raise RuntimeError(raw_response) diff --git a/opencompass/models/turbomind.py b/opencompass/models/turbomind.py index f64249bc..8e573135 100644 --- a/opencompass/models/turbomind.py +++ b/opencompass/models/turbomind.py @@ -55,9 +55,6 @@ class TurboMindModel(BaseModel): if engine_config is not None: from lmdeploy.messages import TurbomindEngineConfig engine_config = TurbomindEngineConfig(**engine_config) - if gen_config is not None: - from lmdeploy.messages import EngineGenerationConfig - gen_config = EngineGenerationConfig(**gen_config) self.logger = get_logger() tm_model = TurboMind.from_pretrained(path, engine_config=engine_config) self.tokenizer = tm_model.tokenizer @@ -106,6 +103,7 @@ class TurboMindModel(BaseModel): t = self.tokenizer.encode(t, add_bos=False) stop_words.append(t[0]) gen_config['stop_words'] = list(set(stop_words)) + gen_config.setdefault('min_new_tokens', 1) from lmdeploy.messages import EngineGenerationConfig gen_config = EngineGenerationConfig(**gen_config) @@ -123,6 +121,9 @@ class TurboMindModel(BaseModel): [gen_config] * len(batch_input), )) results += _results + if stopping_criteria: + for s in stopping_criteria: + results = [r.split(s)[0] for r in results] return results def get_token_len(self, prompt: str) -> int: diff --git a/opencompass/models/xunfei_api.py b/opencompass/models/xunfei_api.py index 
index ee75ca12..8de6f607 100644
--- a/opencompass/models/xunfei_api.py
+++ b/opencompass/models/xunfei_api.py
@@ -1,4 +1,6 @@
 import json
+import re
+import time
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, List, Optional, Union

@@ -221,3 +223,150 @@ class XunFei(BaseAPIModel):
             if err_code == 10013:
                 return err_data['header']['message']
             raise RuntimeError(f'Code: {err_code}, data: {err_data}')
+
+
+class XunFeiSpark(BaseAPIModel):
+    """Model wrapper around XunFeiSpark.
+
+    Documentation:
+
+    Args:
+        path (str): The Spark domain to request, passed to the SDK as
+            ``spark_llm_domain``.
+        url (str): The Spark API url.
+        app_id (str): The Spark app id.
+        api_key (str): The Spark api key.
+        api_secret (str): The Spark api secret.
+        query_per_second (int): The maximum queries allowed per second
+            between two consecutive calls of the API. Defaults to 1.
+        max_seq_len (int): Unused here.
+        meta_template (Dict, optional): The model's meta prompt
+            template if needed, in case the requirement of injecting or
+            wrapping of any meta instructions.
+        retry (int): Number of retries if the API call fails. Defaults to 2.
+    """
+
+    def __init__(
+        self,
+        path: str,
+        url: str,
+        app_id: str,
+        api_key: str,
+        api_secret: str,
+        query_per_second: int = 2,
+        max_seq_len: int = 2048,
+        meta_template: Optional[Dict] = None,
+        retry: int = 2,
+    ):
+        super().__init__(path=path,
+                         max_seq_len=max_seq_len,
+                         query_per_second=query_per_second,
+                         meta_template=meta_template,
+                         retry=retry)
+        try:
+            from sparkai.llm.llm import ChatSparkLLM  # noqa: F401
+        except ImportError:
+            raise ImportError('run `pip install --upgrade spark_ai_python`')
+
+        self.spark_domain = path
+        self.url = url
+        self.app_id = app_id
+        self.api_key = api_key
+        self.api_secret = api_secret
+
+    def generate(
+        self,
+        inputs: List[PromptType],
+        max_out_len: int = 512,
+    ) -> List[str]:
+        results = [self._generate(input, max_out_len) for input in inputs]
+        return results
+
+    def _generate(
+        self,
+        input: PromptType,
+        max_out_len: int = 512,
+    ) -> str:
+        assert isinstance(input, (str, PromptList))
+
+        from sparkai.core.messages import ChatMessage
+        from sparkai.llm.llm import ChatSparkLLM
+
+        if isinstance(input, str):
+            messages = [ChatMessage(role='user', content=input)]
+        else:
+            messages = []
+            msg_buffer, last_role = [], None
+            for index, item in enumerate(input):
+                if index == 0 and item['role'] == 'SYSTEM':
+                    role = 'system'
+                elif item['role'] == 'BOT':
+                    role = 'assistant'
+                else:
+                    role = 'user'
+
+                if role != last_role and last_role is not None:
+                    content = '\n'.join(msg_buffer)
+                    messages.append(
+                        ChatMessage(role=last_role, content=content))
+                    msg_buffer = []
+
+                msg_buffer.append(item['prompt'])
+                last_role = role
+
+            content = '\n'.join(msg_buffer)
+            messages.append(ChatMessage(role=last_role, content=content))
+
+        spark = ChatSparkLLM(
+            spark_api_url=self.url,
+            spark_app_id=self.app_id,
+            spark_api_key=self.api_key,
+            spark_api_secret=self.api_secret,
+            spark_llm_domain=self.spark_domain,
+            streaming=False,
+            max_tokens=max_out_len,
+        )
+
+        all_empty_response = True
+        for _ in range(self.retry + 1):
+            try:
+                outputs = spark.generate([messages]).generations[0]
+                if len(outputs) == 0:
+                    self.logger.error('Empty response, retrying...')
+                    continue
+                msg = outputs[0].text
+                self.logger.debug(f'Generated: {msg}')
+                return msg
+            except ConnectionError as e:
+                match = re.match(r'Error Code: (\d+), Error: (.*)',
+                                 e.args[0],
+                                 flags=re.DOTALL)
+                if match:
+                    error_code = int(match.group(1))
+                    msg = match.group(2)
+                    if error_code == 10003:  # query data exceed limit
+                        self.logger.error(f'Error {error_code}: {msg}')
+                        return msg
+                    elif error_code in [10013, 10014]:  # skip safety problem
+                        self.logger.debug(f'Generated: {msg}')
+                        return msg
+                    elif error_code == 10020:  # plugin result is empty
+                        self.logger.error(f'Error {error_code}: {msg}')
+                        return msg
+                    elif error_code == 11202:  # qps limit
+                        time.sleep(1)
+                    else:
+                        self.logger.error(f'Error {error_code}: {msg}')
+                        raise e
+                raise e
+            except TimeoutError:
+                self.logger.error('TimeoutError, sleep 60, retrying...')
+                time.sleep(60)
+            except Exception as e:
+                self.logger.error(str(e))
+                pass
+
+            all_empty_response = False
+
+        if all_empty_response:
+            self.logger.error('All empty response')
+            return 'all empty response'
+
+        raise RuntimeError('Failed to generate response')
diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py
index 55fcb62a..bc4ca0dd 100644
--- a/opencompass/runners/dlc.py
+++ b/opencompass/runners/dlc.py
@@ -141,7 +141,7 @@ class DLCRunner(BaseRunner):

            hf_offline = self.aliyun_cfg.get('hf_offline', True)
            if hf_offline:
-                shell_cmd += 'export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; export HF_EVALUATE_OFFLINE=1; '  # noqa: E501
+                shell_cmd += 'export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; export HF_EVALUATE_OFFLINE=1; export HF_HUB_OFFLINE=1; '  # noqa: E501

            http_proxy = self.aliyun_cfg.get('http_proxy')
            if http_proxy is not None:
@@ -158,6 +158,7 @@ class DLCRunner(BaseRunner):
                shell_cmd += f'export {extra_env}; '

            shell_cmd += f'cd {pwd}; '
+            shell_cmd += 'umask 0000; '
            shell_cmd += '{task_cmd}'

            tmpl = ('dlc create job'
@@ -195,7 +196,10 @@ class DLCRunner(BaseRunner):
            index_to_start = 0
            while index_to_start < num_retry_to_start:
                index_to_start += 1
-                output = subprocess.getoutput(cmd)
+                try:
+                    output = subprocess.getoutput(cmd)
+                except BlockingIOError:
+                    output = ''
                match = re.search(r'\|\s+(dlc[0-9a-z]+)\s+\|', output)
                if match is None:
                    stdout.write('Failed to get job id from output:')
@@ -264,7 +268,10 @@ class DLCRunner(BaseRunner):
                    f" -c {self.aliyun_cfg['dlc_config_path']}"
                    f' --start_time {pri_time}'
                    f' --end_time {cur_time}')
-                log_output = subprocess.getoutput(logs_cmd)
+                try:
+                    log_output = subprocess.getoutput(logs_cmd)
+                except BlockingIOError:
+                    log_output = '[WARN] No logs found for the pod'
                if '[WARN] No logs found for the pod' not in log_output:
                    pri_time = cur_time
diff --git a/opencompass/runners/local.py b/opencompass/runners/local.py
index c7d3632d..3be17e4e 100644
--- a/opencompass/runners/local.py
+++ b/opencompass/runners/local.py
@@ -46,17 +46,19 @@ class LocalRunner(BaseRunner):
        lark_bot_url (str): Lark bot url.
    """

-    def __init__(
-        self,
-        task: ConfigDict,
-        max_num_workers: int = 16,
-        debug: bool = False,
-        max_workers_per_gpu: int = 1,
-        lark_bot_url: str = None,
-    ):
+    def __init__(self,
+                 task: ConfigDict,
+                 max_num_workers: int = 16,
+                 debug: bool = False,
+                 max_workers_per_gpu: int = 1,
+                 lark_bot_url: str = None,
+                 **kwargs):
        super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
        self.max_num_workers = max_num_workers
        self.max_workers_per_gpu = max_workers_per_gpu
+        logger = get_logger()
+        for k, v in kwargs.items():
+            logger.warning(f'Ignored argument in {self.__module__}: {k}={v}')

    def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
        """Launch multiple tasks.
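To illustrate the LocalRunner change above: keyword arguments it does not recognize are now swallowed with a warning instead of raising a TypeError, so one runner config can be shared across runner types. A rough sketch under assumed config values (the worker count and the stray partition key are illustrative only):

    # Sketch: `partition` is not a LocalRunner argument; with **kwargs it is now
    # reported via the "Ignored argument ..." warning loop instead of crashing.
    from opencompass.runners import LocalRunner
    from opencompass.tasks import OpenICLInferTask

    infer = dict(
        runner=dict(
            type=LocalRunner,
            max_num_workers=8,
            partition='llm',  # meant for a cluster runner; LocalRunner ignores it
            task=dict(type=OpenICLInferTask),
        ),
    )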
diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py
index 23ce8b77..b1b0c173 100644
--- a/opencompass/utils/text_postprocessors.py
+++ b/opencompass/utils/text_postprocessors.py
@@ -94,11 +94,11 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
        f'答案是\s?(\S+)(?:。|$)',
        f'答案应该是\s?(\S+)(?:。|$)',
        f'答案为\s?(\S+)(?:。|$)',
-        f'[Tt]he answer is \(?([{options}])\)?',
-        f'[Tt]he answer is option \(?([{options}])\)?',
-        f'[Tt]he correct answer is \(?([{options}])\)?',
-        f'[Tt]he correct answer is option \(?([{options}])\)?',
-        f'[Tt]he answer to the question is \(?([{options}])\)?',
+        f'[Tt]he answer is:?\s+\(?([{options}])\)?',
+        f'[Tt]he answer is option:?\s+\(?([{options}])\)?',
+        f'[Tt]he correct answer is:?\s+\(?([{options}])\)?',
+        f'[Tt]he correct answer is option:?\s+\(?([{options}])\)?',
+        f'[Tt]he answer to the question is:?\s+\(?([{options}])\)?',
        f'^选项\s?([{options}])',
        f'^([{options}])\s?选?项',
        f'(\s|^)[{options}][\s。,,::\.$]',
@@ -116,7 +116,7 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
    if cushion:
        patterns.extend(cushion_patterns)
    for pattern in patterns:
-        match = re.search(pattern, text)
+        match = re.search(pattern, text, re.DOTALL)
        if match:
            outputs = match.group(0)
            for i in options:
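The relaxed patterns plus re.DOTALL above let first_option_postprocess pick up answers where a colon or a line break separates the verdict from the option letter. A rough check with made-up inputs; the exact result still depends on the full pattern list in the module, so treat the expected values as a sketch rather than a guarantee:

    # Sketch: behaviour enabled by the ':?\s+' patterns and the re.DOTALL flag.
    from opencompass.utils.text_postprocessors import first_option_postprocess

    print(first_option_postprocess('The answer is:\nB', options='ABCD'))  # expected 'B'
    print(first_option_postprocess('The correct answer is option (C).', options='ABCD'))  # expected 'C'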