[Sync] update github workflow (#1156)

Fengzhe Zhou 2024-05-14 22:42:23 +08:00 committed by GitHub
parent aa2dd2b58c
commit 62dbf04708
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
40 changed files with 1377 additions and 136 deletions

View File

@ -4,7 +4,7 @@ import os
import pytest
output_path = 'regression_result'
model = 'internlm-chat-7b-hf'
model = 'internlm2-chat-7b-hf'
dataset = 'siqa'
@ -22,7 +22,7 @@ class TestChatScore:
def test_model_dataset_score(self, result_scores):
result_score = result_scores.get(model).get(dataset)
assert_score(result_score, 73.59)
assert_score(result_score, 79.53)
def assert_score(score, baseline):

View File

@ -14,6 +14,9 @@ env:
PIP_CACHE_PATH: /cpfs01/user/qa-llm-cicd/.cache/pip
USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HF_DATASETS_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1
HF_HUB_OFFLINE: 1
jobs:
daily_run_test:
@ -42,7 +45,7 @@ jobs:
cp -r ${{env.USERSPACE_PREFIX}}/data .
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1;
export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; export HF_HUB_OFFLINE=1;
- name: Run test
run: |
eval "$(conda shell.bash hook)"

View File

@ -21,6 +21,9 @@ env:
CONDA_ENV: opencompass_base
USERSPACE_PREFIX: /cpfs01/user/qa-llm-cicd
HF_CACHE_PATH: /cpfs01/shared/public/public_hdd/llmeval/model_weights/hf_hub
HF_DATASETS_OFFLINE: 1
TRANSFORMERS_OFFLINE: 1
HF_HUB_OFFLINE: 1
jobs:
pr_run_test:
@ -42,21 +45,20 @@ jobs:
cp -r ${{env.USERSPACE_PREFIX}}/data .
rm -rf ~/.cache/huggingface/hub -f && mkdir ~/.cache -p && mkdir ~/.cache/huggingface -p
ln -s ${{env.HF_CACHE_PATH}} ~/.cache/huggingface/hub
export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1;
- name: Run test
run: |
eval "$(conda shell.bash hook)"
conda activate ${{env.CONDA_ENV}}
conda info --envs
rm -rf regression_result
python3 run.py --models hf_internlm_chat_7b --datasets siqa_gen --work-dir regression_result --debug
python3 run.py --models hf_internlm2_chat_7b --datasets siqa_gen --work-dir regression_result --debug
- name: Get result
run: |
score=$(sed -n '$p' regression_result/*/summary/*.csv | awk -F ',' '{print $NF}')
if (( ${score%.*} >= 70 && ${score%.*} <= 75 )); then
echo "score is $score between 70 and 75"
if (( ${score%.*} >= 75 && ${score%.*} <= 85 )); then
echo "score is $score between 75 and 85"
else
echo "score is $score not between 70 and 75"
echo "score is $score not between 75 and 85"
exit 1
fi
rm -rf regression_result
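
For reference, the same range check in Python: a minimal sketch, assuming the last line of the summary CSV carries the siqa score in its final column, which is exactly what the sed/awk pipeline above extracts.

import csv
import glob
import sys

# pick the newest summary file produced by the regression run
summary_file = sorted(glob.glob('regression_result/*/summary/*.csv'))[-1]
with open(summary_file) as f:
    last_row = list(csv.reader(f))[-1]
score = int(float(last_row[-1]))  # mirror ${score%.*}: keep the integer part
if 75 <= score <= 85:
    print(f'score is {score} between 75 and 85')
else:
    sys.exit(f'score is {score} not between 75 and 85')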

View File

@ -49,7 +49,7 @@ for _name in bbh_multiple_choice_sets:
template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Q:']))
bbh_eval_cfg = dict(
evaluator=dict(type=BBHEvaluator_mcq),
pred_role='BOT',
@ -66,6 +66,7 @@ for _name in bbh_multiple_choice_sets:
infer_cfg=bbh_infer_cfg.copy(),
eval_cfg=bbh_eval_cfg.copy()))
for _name in bbh_free_form_sets:
with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
_hint = f.read()
@ -75,7 +76,7 @@ for _name in bbh_free_form_sets:
template=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Q:']))
bbh_eval_cfg = dict(evaluator=dict(type=BBHEvaluator), pred_role='BOT')
bbh_datasets.append(

View File

@ -10,9 +10,9 @@ with read_base():
from ..race.race_ppl_abed12 import race_datasets
from ..winogrande.winogrande_5shot_ll_252f01 import winogrande_datasets
from ..hellaswag.hellaswag_10shot_ppl_59c85e import hellaswag_datasets
from ..bbh.bbh_gen_0a5495 import bbh_datasets
from ..bbh.bbh_gen_98fba6 import bbh_datasets
from ..gsm8k.gsm8k_gen_ee684f import gsm8k_datasets
from ..math.math_evaluatorv2_gen_9d2049 import math_datasets
from ..math.math_evaluatorv2_gen_2f4a71 import math_datasets
from ..TheoremQA.TheoremQA_post_v2_gen_2c2583 import TheoremQA_datasets
from ..humaneval.humaneval_gen_d2537e import humaneval_datasets
from ..mbpp.deprecated_sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets

View File

@ -0,0 +1,39 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
gsm8k_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt="Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt='Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n'),
dict(role='HUMAN', prompt="Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt="Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n"),
dict(role='HUMAN', prompt="Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt="When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n"),
dict(role='HUMAN', prompt="Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt='For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n'),
dict(role='HUMAN', prompt="Question: {question}\nLet's think step by step\nAnswer:"),
],
)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Question']))
gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
pred_postprocessor=dict(type=gsm8k_postprocess),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))
gsm8k_datasets = [
dict(
abbr='gsm8k',
type=GSM8KDataset,
path='./data/gsm8k',
reader_cfg=gsm8k_reader_cfg,
infer_cfg=gsm8k_infer_cfg,
eval_cfg=gsm8k_eval_cfg)
]
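
Every few-shot answer above ends with a line of the form "The answer is N", which is the signal the answer post-processing has to recover; a hypothetical extractor for illustration only, not the actual gsm8k_postprocess:

import re

def extract_final_answer(text: str) -> str:
    # hypothetical helper: take the number following the last "The answer is"
    matches = re.findall(r'The answer is\s*(-?[\d,\.]+)', text)
    return matches[-1].replace(',', '').rstrip('.') if matches else ''

print(extract_final_answer('... so it is 84+117=201 points\nThe answer is 201\n'))  # prints 201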

View File

@ -0,0 +1,30 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2
with read_base():
from .math_4shot_example_from_google_research import prompt
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=prompt + '\n\nProblem:\n{problem}\nSolution:'),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024, stopping_criteria=['Problem']))
# postprocess v2
math_eval_cfg = dict(
evaluator=dict(type=MATHEvaluator, version='v2'),
pred_postprocessor=dict(type=math_postprocess_v2))
math_datasets = [
dict(
type=MATHDataset,
abbr='math',
path='./data/math/math.json',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg)
]

View File

@ -0,0 +1,40 @@
# Solving Quantitative Reasoning Problems with Language Models
prompt = '''
Problem:
Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.
Solution:
The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.
Final Answer: The final answer is $[2,5)$. I hope it is correct.
Problem:
If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$
Solution:
We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$
Final Answer: The final answer is $24$. I hope it is correct.
Problem:
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?
Solution:
If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$:
\\begin{align*}
30n&=480\\
\\Rightarrow\\qquad n&=480/30=\\boxed{16}
\\end{align*}
Final Answer: The final answer is $16$. I hope it is correct.
Problem:
If the system of equations
\\begin{align*}
6x-4y&=a,\\
6y-9x &=b.
\\end{align*}
has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{a}{b},$ assuming $b$ is nonzero.
Solution:
If we multiply the first equation by $-\\frac{3}{2}$, we obtain $$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$
Final Answer: The final answer is $-\\frac{2}{3}$. I hope it is correct.
'''.strip()

View File

@ -38,7 +38,7 @@ Problem:
Solution:"""
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=['Problem']))
# postprocess v2
math_eval_cfg = dict(

View File

@ -0,0 +1,82 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator
sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2')
prompt = '''
You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:
assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)
assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)
assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)
[BEGIN]
'\
def similar_elements(test_tup1, test_tup2):
res = tuple(set(test_tup1) & set(test_tup2))
return (res)\
'
[DONE]
You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:
assert is_not_prime(2) == False
assert is_not_prime(10) == True
assert is_not_prime(35) == True
[BEGIN]
'\
import math
def is_not_prime(n):
result = False
for i in range(2,int(math.sqrt(n)) + 1):
if n % i == 0:
result = True
return result\
'
[DONE]
You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:
assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65]
assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75]
assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]
[BEGIN]
'\
import heapq as hq
def heap_queue_largest(nums,n):
largest_nums = hq.nlargest(n, nums)
return largest_nums\
'
[DONE]
You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:
{test_list}
'''.strip()
sanitized_mbpp_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=prompt),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT')
sanitized_mbpp_datasets = [
dict(
type=SanitizedMBPPDataset,
abbr='sanitized_mbpp',
path='./data/mbpp/sanitized-mbpp.jsonl',
reader_cfg=sanitized_mbpp_reader_cfg,
infer_cfg=sanitized_mbpp_infer_cfg,
eval_cfg=sanitized_mbpp_eval_cfg,
)
]
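
The completion-style prompt above asks the model to wrap its solution in [BEGIN] ... [DONE]; a hypothetical extractor showing what the downstream evaluation has to pull out of such a completion (not MBPPEvaluator's actual logic):

import re

def extract_solution(completion: str) -> str:
    # hypothetical illustration: grab the quoted code between [BEGIN] and [DONE]
    match = re.search(r"\[BEGIN\]\s*'?(.*?)'?\s*\[DONE\]", completion, re.DOTALL)
    return match.group(1) if match else completion

sample = "[BEGIN]\n 'def add(a, b):\n    return a + b' \n[DONE]"
print(extract_solution(sample))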

View File

@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator
sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2')
sanitized_mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the similar elements from the given two tuple lists.\nYour code should pass these tests:\n\nassert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)\nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)\n',),
dict(role='BOT', prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\n res = tuple(set(test_tup1) & set(test_tup2))\n return (res)' \n[DONE]\n\n",),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a python function to identify non-prime numbers.\nYour code should pass these tests:\n\nassert is_not_prime(2) == False\nassert is_not_prime(10) == True\nassert is_not_prime(35) == True\n',),
dict(role='BOT', prompt="[BEGIN]\n 'import math\ndef is_not_prime(n):\n result = False\n for i in range(2,int(math.sqrt(n)) + 1):\n if n %% i == 0:\n result = True\n return result' \n[DONE]\n\n",),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the largest integers from a given list of numbers using heap queue algorithm.\nYour code should pass these tests:\n\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]\n',),
dict(role='BOT', prompt="[BEGIN]\n 'import heapq as hq\ndef heap_queue_largest(nums,n):\n largest_nums = hq.nlargest(n, nums)\n return largest_nums' \n[DONE]\n\n",),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\n{text}\nYour code should pass these tests:\n\n{test_list}\n',),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT')
sanitized_mbpp_datasets = [
dict(
type=SanitizedMBPPDataset,
abbr='sanitized_mbpp',
path='./data/mbpp/sanitized-mbpp.jsonl',
reader_cfg=sanitized_mbpp_reader_cfg,
infer_cfg=sanitized_mbpp_infer_cfg,
eval_cfg=sanitized_mbpp_eval_cfg,
)
]

View File

@ -0,0 +1,18 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='deepseek-v2-hf',
path='deepseek-ai/DeepSeek-V2',
max_out_len=1024,
batch_size=4,
model_kwargs=dict(
device_map='sequential',
torch_dtype='torch.bfloat16',
max_memory={i: '75GB' for i in range(8)},
attn_implementation='eager'
),
run_cfg=dict(num_gpus=4),
)
]

View File

@ -0,0 +1,18 @@
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='deepseek-v2-chat-hf',
path='deepseek-ai/DeepSeek-V2-Chat',
max_out_len=1024,
batch_size=4,
model_kwargs=dict(
device_map='sequential',
torch_dtype='torch.bfloat16',
max_memory={i: '75GB' for i in range(8)},
attn_implementation='eager'
),
run_cfg=dict(num_gpus=4),
)
]

View File

@ -0,0 +1,23 @@
from opencompass.models import LmdeployPytorchModel
settings = [
('deepseek-7b-base-hf', 'deepseek-ai/deepseek-llm-7b-base', 1),
('deepseek-67b-base-hf', 'deepseek-ai/deepseek-llm-67b-base', 4),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=LmdeployPytorchModel,
abbr=abbr,
path=path,
engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus),
gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024),
max_out_len=1024,
max_seq_len=2048,
batch_size=16,
concurrency=16,
run_cfg=dict(num_gpus=num_gpus),
)
)

View File

@ -0,0 +1,24 @@
from opencompass.models import TurboMindModel
settings = [
('internlm2-1.8b-turbomind', 'internlm/internlm2-1_8b', 1),
('internlm2-7b-turbomind', 'internlm/internlm2-7b', 1),
('internlm2-20b-turbomind', 'internlm/internlm2-20b', 2),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=TurboMindModel,
abbr=abbr,
path=path,
engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus),
gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024),
max_out_len=1024,
max_seq_len=2048,
batch_size=16,
concurrency=16,
run_cfg=dict(num_gpus=num_gpus),
)
)

View File

@ -0,0 +1,30 @@
from opencompass.models import TurboMindModel
settings = [
('llama-7b-turbomind', 'huggyllama/llama-7b', 1),
('llama-13b-turbomind', 'huggyllama/llama-13b', 1),
('llama-30b-turbomind', 'huggyllama/llama-30b', 2),
('llama-65b-turbomind', 'huggyllama/llama-65b', 4),
('llama-2-7b-turbomind', 'meta-llama/Llama-2-7b-hf', 1),
('llama-2-13b-turbomind', 'meta-llama/Llama-2-13b-hf', 1),
('llama-2-70b-turbomind', 'meta-llama/Llama-2-70b-hf', 4),
('llama-3-8b-turbomind', 'meta-llama/Meta-Llama-3-8B', 1),
('llama-3-70b-turbomind', 'meta-llama/Meta-Llama-3-70B', 4),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=TurboMindModel,
abbr=abbr,
path=path,
engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus),
gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024),
max_out_len=1024,
max_seq_len=2048,
batch_size=16,
concurrency=16,
run_cfg=dict(num_gpus=num_gpus),
)
)

View File

@ -0,0 +1,24 @@
from opencompass.models import LmdeployPytorchModel
settings = [
('mistral-7b-v0.1-pytorch', 'mistralai/Mistral-7B-v0.1', 1),
('mixtral-8x7b-v0.1-pytorch', 'mistralai/Mixtral-8x7B-v0.1', 2),
('mixtral-8x22b-v0.1-pytorch', 'mistralai/Mixtral-8x22B-v0.1', 4),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=LmdeployPytorchModel,
abbr=abbr,
path=path,
engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus),
gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024),
max_out_len=1024,
max_seq_len=2048,
batch_size=16,
concurrency=16,
run_cfg=dict(num_gpus=num_gpus),
)
)

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFaceBaseModel
models = [
dict(
type=HuggingFaceBaseModel,
abbr='qwen1.5-110b-hf',
path='Qwen/Qwen1.5-110B',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
)
]

View File

@ -0,0 +1,12 @@
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='qwen1.5-110b-chat-hf',
path='Qwen/Qwen1.5-110B-Chat',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
)
]

View File

@ -0,0 +1,29 @@
from opencompass.models import LmdeployPytorchModel
settings = [
('qwen1.5-0.5b-pytorch', 'Qwen/Qwen1.5-0.5B', 1),
('qwen1.5-1.8b-pytorch', 'Qwen/Qwen1.5-1.8B', 1),
('qwen1.5-4b-pytorch', 'Qwen/Qwen1.5-4B', 1),
('qwen1.5-7b-pytorch', 'Qwen/Qwen1.5-7B', 1),
('qwen1.5-14b-pytorch', 'Qwen/Qwen1.5-14B', 1),
('qwen1.5-32b-pytorch', 'Qwen/Qwen1.5-32B', 2),
('qwen1.5-72b-pytorch', 'Qwen/Qwen1.5-72B', 4),
('qwen1.5-moe-a2.7b-pytorch', 'Qwen/Qwen1.5-MoE-A2.7B', 1),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=LmdeployPytorchModel,
abbr=abbr,
path=path,
engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus),
gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024),
max_out_len=1024,
max_seq_len=2048,
batch_size=16,
concurrency=16,
run_cfg=dict(num_gpus=num_gpus),
)
)

View File

@ -0,0 +1,25 @@
from opencompass.models import TurboMindModel
settings = [
('qwen-1.8b-turbomind', 'Qwen/Qwen-1_8B', 1),
('qwen-7b-turbomind', 'Qwen/Qwen-7B', 1),
('qwen-14b-turbomind', 'Qwen/Qwen-14B', 1),
('qwen-72b-turbomind', 'Qwen/Qwen-72B', 4),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=TurboMindModel,
abbr=abbr,
path=path,
engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus),
gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024),
max_out_len=1024,
max_seq_len=2048,
batch_size=16,
concurrency=16,
run_cfg=dict(num_gpus=num_gpus),
)
)

View File

@ -0,0 +1,23 @@
from opencompass.models import LmdeployPytorchModel
settings = [
('yi-6b-pytorch', '01-ai/Yi-6B', 1),
('yi-34b-pytorch', '01-ai/Yi-34B', 2),
]
models = []
for abbr, path, num_gpus in settings:
models.append(
dict(
type=LmdeployPytorchModel,
abbr=abbr,
path=path,
engine_config=dict(session_len=2048, max_batch_size=16, tp=num_gpus),
gen_config=dict(top_k=1, temperature=1, top_p=0.9, max_new_tokens=1024),
max_out_len=1024,
max_seq_len=2048,
batch_size=16,
concurrency=16,
run_cfg=dict(num_gpus=num_gpus),
)
)

View File

@ -2,7 +2,7 @@
from mmengine.config import read_base
with read_base():
from .groups.cibench import cibench_summary_groups
from .groups.legacy.cibench import cibench_summary_groups
from .groups.plugineval import plugineval_summary_groups

View File

@ -0,0 +1,109 @@
_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
_cibench = ['cibench_' + i for i in _cibench]
cibench_summary_groups = [{'name': 'cibench', 'subsets': _cibench}]
_cibench_template = ['lightgbm', 'matplotlib', 'nltk', 'opencv', 'pandas', 'pytorch',
'scipy', 'seaborn', 'sklearn', 'tensorflow']
_cibench_template = ['cibench_template/' + i for i in _cibench_template]
# number of total exec questions in this module
_cibench_template_weight = {
'lightgbm': [30, 15, 0, 0],
'matplotlib': [42, 0, 0, 36],
'nltk': [70, 30, 20, 10],
'opencv': [60, 10, 0, 40],
'pandas': [60, 40, 0, 10],
'pytorch': [28, 0, 0, 0],
'scipy': [60, 40, 0, 0],
'seaborn': [42, 0, 0, 35],
'sklearn': [42, 6, 0, 18],
'tensorflow': [36, 6, 0, 12],
}
cibench_summary_groups.extend([
{
'name': 'cibench_template:executable',
'subsets': [[i, 'executable'] for i in _cibench_template],
'weights': {'cibench_template/' + k : v[0] for k,v in _cibench_template_weight.items()},
},
{
'name': 'cibench_template:numeric_correct',
'subsets': [[i, 'numeric_correct'] for i in _cibench_template],
'weights': {'cibench_template/' + k : v[1] for k,v in _cibench_template_weight.items()},
},
{
'name': 'cibench_template:text_score',
'subsets': [[i, 'text_score'] for i in _cibench_template],
'weights': {'cibench_template/' + k : v[2] for k,v in _cibench_template_weight.items()},
},
{
'name': 'cibench_template:vis_sim',
'subsets': [[i, 'vis_sim'] for i in _cibench_template],
'weights': {'cibench_template/' + k : v[3] for k,v in _cibench_template_weight.items()},
},
])
## chinese
_cibench_template_cn = ['lightgbm', 'matplotlib', 'nltk', 'opencv', 'pandas', 'pytorch',
'scipy', 'seaborn', 'sklearn', 'tensorflow']
_cibench_template_cn = ['cibench_template_chinese/' + i for i in _cibench_template_cn]
cibench_summary_groups.extend([
{
'name': 'cibench_template_cn:executable',
'subsets': [[i, 'executable'] for i in _cibench_template_cn],
'weights': {'cibench_template_chinese/' + k : v[0] for k,v in _cibench_template_weight.items()},
},
{
'name': 'cibench_template_cn:numeric_correct',
'subsets': [[i, 'numeric_correct'] for i in _cibench_template_cn],
'weights': {'cibench_template_chinese/' + k : v[1] for k,v in _cibench_template_weight.items()},
},
{
'name': 'cibench_template_cn:text_score',
'subsets': [[i, 'text_score'] for i in _cibench_template_cn],
'weights': {'cibench_template_chinese/' + k : v[2] for k,v in _cibench_template_weight.items()},
},
{
'name': 'cibench_template_cn:vis_sim',
'subsets': [[i, 'vis_sim'] for i in _cibench_template_cn],
'weights': {'cibench_template_chinese/' + k : v[3] for k,v in _cibench_template_weight.items()},
},
])
## add more without nltk
cibench_summary_groups.extend([
{
'name': 'cibench_template_wo_nltk:executable',
'subsets': [[i, 'executable'] for i in _cibench_template if 'nltk' not in i],
'weights': {'cibench_template/' + k : v[0] for k,v in _cibench_template_weight.items() if 'nltk' not in k},
},
{
'name': 'cibench_template_wo_nltk:numeric_correct',
'subsets': [[i, 'numeric_correct'] for i in _cibench_template if 'nltk' not in i],
'weights': {'cibench_template/' + k : v[1] for k,v in _cibench_template_weight.items() if 'nltk' not in k},
},
{
'name': 'cibench_template_wo_nltk:vis_sim',
'subsets': [[i, 'vis_sim'] for i in _cibench_template if 'nltk' not in i],
'weights': {'cibench_template/' + k : v[3] for k,v in _cibench_template_weight.items() if 'nltk' not in k},
},
])
cibench_summary_groups.extend([
{
'name': 'cibench_template_cn_wo_nltk:executable',
'subsets': [[i, 'executable'] for i in _cibench_template_cn if 'nltk' not in i],
'weights': {'cibench_template_chinese/' + k : v[0] for k,v in _cibench_template_weight.items() if 'nltk' not in k},
},
{
'name': 'cibench_template_cn_wo_nltk:numeric_correct',
'subsets': [[i, 'numeric_correct'] for i in _cibench_template_cn if 'nltk' not in i],
'weights': {'cibench_template_chinese/' + k : v[1] for k,v in _cibench_template_weight.items() if 'nltk' not in k},
},
{
'name': 'cibench_template_cn_wo_nltk:vis_sim',
'subsets': [[i, 'vis_sim'] for i in _cibench_template_cn if 'nltk' not in i],
'weights': {'cibench_template_chinese/' + k : v[3] for k,v in _cibench_template_weight.items() if 'nltk' not in k},
},
])
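
The 'weights' mappings above presumably let the summarizer weight each subset by its number of questions instead of averaging subsets uniformly; a minimal sketch of that kind of aggregation, illustrative only:

def weighted_average(scores: dict, weights: dict) -> float:
    # weight each subset's score by its question count; zero-weight subsets drop out
    total = sum(weights.get(name, 0) for name in scores)
    if total == 0:
        return 0.0
    return sum(score * weights.get(name, 0) for name, score in scores.items()) / total

scores = {'cibench_template/lightgbm': 80.0, 'cibench_template/matplotlib': 90.0}
weights = {'cibench_template/lightgbm': 30, 'cibench_template/matplotlib': 42}
print(round(weighted_average(scores, weights), 2))  # 85.83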

View File

@ -170,6 +170,8 @@ def parse_dlc_args(dlc_parser):
type=str)
def parse_hf_args(hf_parser):
"""These args are all for the quick construction of HuggingFace models."""
hf_parser.add_argument('--hf-type', type=str, choices=['base', 'chat'], default='chat', help='The type of the HuggingFace model, base or chat')
@ -212,7 +214,7 @@ def main():
if args.work_dir is not None:
cfg['work_dir'] = args.work_dir
else:
cfg.setdefault('work_dir', osp.join('outputs', 'default'))
cfg.setdefault('work_dir', os.path.join('outputs', 'default'))
# cfg_time_str defaults to the current time
cfg_time_str = dir_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
@ -340,5 +342,6 @@ def main():
summarizer.summarize(time_str=cfg_time_str)
if __name__ == '__main__':
main()

View File

@ -7,7 +7,8 @@ from .base import BaseModel, LMTemplateParser # noqa: F401
from .base_api import APITemplateParser, BaseAPIModel # noqa: F401
from .bytedance_api import ByteDance # noqa: F401
from .claude_api import Claude # noqa: F401
from .gemini_api import Gemini, GeminiAllesAPIN # noqa: F401
from .deepseek_api import DeepseekAPI # noqa: F401
from .gemini_api import Gemini # noqa: F401
from .glm import GLM130B # noqa: F401
from .huggingface import HuggingFace # noqa: F401
from .huggingface import HuggingFaceCausalLM # noqa: F401
@ -21,7 +22,7 @@ from .lightllm_api import LightllmAPI # noqa: F401
from .llama2 import Llama2, Llama2Chat # noqa: F401
from .lmdeploy_pytorch import LmdeployPytorchModel # noqa: F401
from .lmdeploy_tis import LmdeployTisModel # noqa: F401
from .minimax_api import MiniMax # noqa: F401
from .minimax_api import MiniMax, MiniMaxChatCompletionV2 # noqa: F401
from .mistral_api import Mistral # noqa: F401
from .mixtral import Mixtral # noqa: F401
from .modelscope import ModelScope, ModelScopeCausalLM # noqa: F401
@ -31,11 +32,12 @@ from .openai_api import OpenAI # noqa: F401
from .pangu_api import PanGu # noqa: F401
from .qwen_api import Qwen # noqa: F401
from .sensetime_api import SenseTime # noqa: F401
from .stepfun_api import StepFun # noqa: F401
from .turbomind import TurboMindModel # noqa: F401
from .turbomind_tis import TurboMindTisModel # noqa: F401
from .unigpt_api import UniGPT # noqa: F401
from .vllm import VLLM # noqa: F401
from .xunfei_api import XunFei # noqa: F401
from .xunfei_api import XunFei, XunFeiSpark # noqa: F401
from .yayi_api import Yayi # noqa: F401
from .zhipuai_api import ZhiPuAI # noqa: F401
from .zhipuai_v2_api import ZhiPuV2AI # noqa: F401

View File

@ -1,4 +1,3 @@
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
@ -141,29 +140,32 @@ class AI360GPT(BaseAPIModel):
self.wait()
continue
if raw_response.status_code == 200:
try:
msg = response['choices'][0]['message']['content'].strip()
return msg
except KeyError:
if 'error' in response:
# tpm(token per minitue) limit
if response['erro']['code'] == '1005':
time.sleep(1)
continue
self.logger.error('Find error message in response: ',
str(response['error']))
msg = response['choices'][0]['message']['content'].strip()
self.logger.debug(f'Generated: {msg}')
return msg
# sensitive content, prompt overlength, network error
# or illegal prompt
if (raw_response.status_code == 400
or raw_response.status_code == 401
or raw_response.status_code == 402
or raw_response.status_code == 429
or raw_response.status_code == 500):
print(raw_response.text)
continue
if raw_response.status_code in [400, 401, 402, 429, 500]:
if 'error' not in response:
print(raw_response.status_code)
print(raw_response.text)
continue
print(response)
# tpm (tokens per minute) limit
if response['error']['code'] == '1005':
self.logger.debug('tpm limit, ignoring')
continue
elif response['error']['code'] == '1001':
msg = '参数错误:messages参数过长或max_tokens参数值过大'
self.logger.debug(f'Generated: {msg}')
return msg
else:
print(response)
self.logger.error('Find error message in response: ',
str(response['error']))
print(raw_response)
max_num_retries += 1

View File

@ -145,8 +145,8 @@ class BaiChuan(BaseAPIModel):
self.wait()
continue
if raw_response.status_code == 200:
msg = response['choices'][0]['message']['content']
self.logger.debug(f'Generated: {msg}')
return msg
if raw_response.status_code != 200:

View File

@ -53,6 +53,8 @@ class ERNIEBot(BaseAPIModel):
self.headers = {'Content_Type': 'application/json'}
self.secretkey = secretkey
self.key = key
if not url.endswith('?access_token='):
url += '?access_token='
self.url = url
access_token, _ = self._generate_access_token()
self.access_token = access_token
@ -143,14 +145,25 @@ class ERNIEBot(BaseAPIModel):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
msg_buffer, last_role = [], None
for item in input:
msg = {'content': item['prompt']}
if item['role'] == 'HUMAN':
msg['role'] = 'user'
elif item['role'] == 'BOT':
msg['role'] = 'assistant'
if item['role'] == 'BOT':
role = 'assistant'
else: # USER or SYSTEM
role = 'user'
if role != last_role and last_role is not None:
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
msg_buffer = []
msg_buffer.append(item['prompt'])
last_role = role
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
messages.append(msg)
data = {'messages': messages}
data.update(self.generation_kwargs)
@ -181,6 +194,7 @@ class ERNIEBot(BaseAPIModel):
if raw_response.status_code == 200:
try:
msg = response['result']
self.logger.debug(msg)
return msg
except KeyError:
print(response)
@ -188,9 +202,12 @@ class ERNIEBot(BaseAPIModel):
if response['error_code'] == 336007:
# exceed max length
return ''
time.sleep(1)
continue
elif response['error_code'] == 336103:
# prompt tokens too long
return ''
else:
time.sleep(1)
continue
if (response['error_code'] == 110 or response['error_code'] == 100
or response['error_code'] == 111
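
The ERNIEBot change above, and the new DeepseekAPI, MiniMaxChatCompletionV2 and StepFun wrappers later in this commit, all collapse consecutive turns with the same role into a single message before calling the API; the buffering logic as a standalone sketch:

def merge_by_role(items):
    # items: [{'role': 'HUMAN' | 'BOT' | 'SYSTEM', 'prompt': str}, ...]
    messages, msg_buffer, last_role = [], [], None
    for item in items:
        role = 'assistant' if item['role'] == 'BOT' else 'user'
        if role != last_role and last_role is not None:
            messages.append({'content': '\n'.join(msg_buffer), 'role': last_role})
            msg_buffer = []
        msg_buffer.append(item['prompt'])
        last_role = role
    messages.append({'content': '\n'.join(msg_buffer), 'role': last_role})
    return messages

print(merge_by_role([{'role': 'HUMAN', 'prompt': 'hi'}, {'role': 'HUMAN', 'prompt': 'there'}]))
# [{'content': 'hi\nthere', 'role': 'user'}]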

View File

@ -0,0 +1,178 @@
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
import requests
from opencompass.utils.prompt import PromptList
from .base_api import BaseAPIModel
PromptType = Union[PromptList, str]
class DeepseekAPI(BaseAPIModel):
"""Model wrapper around DeepseekAPI.
Documentation:
Args:
path (str): The name of the Deepseek model, e.g. `deepseek-chat`.
key (str): Authorization key.
query_per_second (int): The maximum queries allowed per second
between two consecutive calls of the API. Defaults to 1.
max_seq_len (int): Unused here.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
retry (int): Number of retries if the API call fails. Defaults to 2.
"""
def __init__(
self,
path: str,
key: str,
url: str,
query_per_second: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
retry: int = 2,
system_prompt: str = '',
):
super().__init__(path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
meta_template=meta_template,
retry=retry)
self.headers = {
'Content-Type': 'application/json',
'Authorization': 'Bearer ' + key,
}
self.url = url
self.model = path
self.system_prompt = system_prompt
def generate(
self,
inputs: List[PromptType],
max_out_len: int = 512,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[PromptType]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs)))
self.flush()
return results
def _generate(
self,
input: PromptType,
max_out_len: int = 512,
) -> str:
"""Generate results given an input.
Args:
inputs (PromptType): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
msg_buffer, last_role = [], None
for item in input:
item['role'] = 'assistant' if item['role'] == 'BOT' else 'user'
if item['role'] != last_role and last_role is not None:
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
msg_buffer = []
msg_buffer.append(item['prompt'])
last_role = item['role']
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
if self.system_prompt:
system = {'role': 'system', 'content': self.system_prompt}
messages.insert(0, system)
data = {'model': self.model, 'messages': messages}
max_num_retries = 0
while max_num_retries < self.retry:
self.acquire()
try:
raw_response = requests.request('POST',
url=self.url,
headers=self.headers,
json=data)
except Exception as err:
print('Request Error:{}'.format(err))
time.sleep(2)
continue
try:
response = raw_response.json()
except Exception as err:
print('Response Error:{}'.format(err))
response = None
self.release()
if response is None:
print('Connection error, reconnect.')
# if a connection error occurs, frequent requests can
# keep the network unstable, therefore wait here to
# slow down the requests
self.wait()
continue
if raw_response.status_code == 200:
# msg = json.load(response.text)
# response
msg = response['choices'][0]['message']['content']
self.logger.debug(f'Generated: {msg}')
return msg
if raw_response.status_code == 401:
print('请求被拒绝 api_key错误')
continue
elif raw_response.status_code == 400:
print(messages, response)
print('请求失败,状态码:', raw_response)
msg = 'The request was rejected because high risk'
return msg
elif raw_response.status_code == 429:
print(messages, response)
print('请求失败,状态码:', raw_response)
time.sleep(5)
continue
else:
print(messages, response)
print('请求失败,状态码:', raw_response)
time.sleep(1)
max_num_retries += 1
raise RuntimeError(raw_response)
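
A sketch of how the new DeepseekAPI wrapper might be wired into a model config; the abbr, key and url below are illustrative placeholders rather than values taken from this commit:

from opencompass.models import DeepseekAPI

models = [
    dict(
        type=DeepseekAPI,
        abbr='deepseek-api',  # illustrative abbreviation
        path='deepseek-chat',  # model name to request from the API
        key='YOUR_API_KEY',  # placeholder, supply a real key
        url='https://api.deepseek.com/chat/completions',  # assumed endpoint
        query_per_second=2,
        max_seq_len=2048,
        max_out_len=1024,
        batch_size=8,
    )
]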

View File

@ -186,66 +186,3 @@ class Gemini(BaseAPIModel):
time.sleep(1)
raise RuntimeError('API call failed.')
class GeminiAllesAPIN(Gemini):
"""Model wrapper around Gemini models.
Documentation:
Args:
path (str): The name of Gemini model.
e.g. `gemini-pro`
key (str): Authorization key.
query_per_second (int): The maximum queries allowed per second
between two consecutive calls of the API. Defaults to 1.
max_seq_len (int): Unused here.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
retry (int): Number of retires if the API call fails. Defaults to 2.
"""
def __init__(
self,
path: str,
key: str,
url: str,
query_per_second: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
retry: int = 2,
temperature: float = 1.0,
top_p: float = 0.8,
top_k: float = 10.0,
):
super().__init__(key=key,
path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
meta_template=meta_template,
retry=retry)
# Replace the url and headers into AllesApin
self.url = url
self.headers = {
'alles-apin-token': key,
'content-type': 'application/json',
}
def generate(
self,
inputs: List[PromptType],
max_out_len: int = 512,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[PromptType]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
return super().generate(inputs, max_out_len)

View File

@ -289,13 +289,13 @@ class HuggingFace(BaseModel):
tokens = self.tokenizer.batch_encode_plus(inputs,
padding=True,
truncation=True,
max_length=self.max_seq_len -
max_out_len)
max_length=self.max_seq_len)
tokens = {
k: torch.tensor(np.array(tokens[k]), device=self.model.device)
for k in tokens if k in ['input_ids', 'attention_mask']
}
origin_stopping_criteria = stopping_criteria
if stopping_criteria:
# Construct huggingface stopping criteria
if self.tokenizer.eos_token is not None:
@ -332,6 +332,9 @@ class HuggingFace(BaseModel):
if self.end_str:
decodeds = [token.split(self.end_str)[0] for token in decodeds]
if origin_stopping_criteria:
for t in origin_stopping_criteria:
decodeds = [token.split(t)[0] for token in decodeds]
return decodeds
def _single_generate(self,
@ -382,6 +385,7 @@ class HuggingFace(BaseModel):
max_length=self.max_seq_len -
max_out_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device)
origin_stopping_criteria = stopping_criteria
if stopping_criteria:
# Construct huggingface stopping criteria
if self.tokenizer.eos_token is not None:
@ -419,6 +423,9 @@ class HuggingFace(BaseModel):
if self.end_str:
decodeds = [token.split(self.end_str)[0] for token in decodeds]
if origin_stopping_criteria:
for t in origin_stopping_criteria:
decodeds = [token.split(t)[0] for token in decodeds]
return decodeds
def get_logits(self, inputs: List[str]):
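
Both hunks above keep the raw stopping_criteria strings and, after decoding, cut each output at the first occurrence of any stop string; the same post-hoc truncation as a standalone sketch:

def truncate_at_stop_strings(decodeds, stop_strings):
    # cut each decoded string at the first stop string it contains
    for stop in stop_strings:
        decodeds = [text.split(stop)[0] for text in decodeds]
    return decodeds

print(truncate_at_stop_strings(['A: 4\nQ: next question'], ['Q:']))  # ['A: 4\n']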

View File

@ -180,3 +180,173 @@ class MiniMax(BaseAPIModel):
max_num_retries += 1
raise RuntimeError(response.text)
class MiniMaxChatCompletionV2(BaseAPIModel):
"""Model wrapper around MiniMax ChatCompletionV2.
Documentation:
Args:
path (str): The name of the MiniMax model to call.
key (str): Authorization key.
query_per_second (int): The maximum queries allowed per second
between two consecutive calls of the API. Defaults to 1.
max_seq_len (int): Unused here.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
retry (int): Number of retries if the API call fails. Defaults to 2.
"""
def __init__(
self,
path: str,
key: str,
url: str,
query_per_second: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
retry: int = 2,
):
super().__init__(path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
meta_template=meta_template,
retry=retry)
self.headers = {
'Content-Type': 'application/json',
'Authorization': 'Bearer ' + key,
}
self.url = url
self.model = path
def generate(
self,
inputs: List[PromptType],
max_out_len: int = 512,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[PromptType]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs)))
self.flush()
return results
def _generate(
self,
input: PromptType,
max_out_len: int = 512,
) -> str:
"""Generate results given an input.
Args:
inputs (PromptType): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
msg_buffer, last_role = [], None
for item in input:
item['role'] = 'assistant' if item['role'] == 'BOT' else 'user'
if item['role'] != last_role and last_role is not None:
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
msg_buffer = []
msg_buffer.append(item['prompt'])
last_role = item['role']
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
data = {
'model': self.model,
'messages': messages,
'max_tokens': max_out_len
}
max_num_retries = 0
while max_num_retries < self.retry:
self.acquire()
try:
raw_response = requests.request('POST',
url=self.url,
headers=self.headers,
json=data)
except Exception as err:
print('Request Error:{}'.format(err))
time.sleep(2)
continue
response = raw_response.json()
self.release()
if response is None:
print('Connection error, reconnect.')
# if a connection error occurs, frequent requests can
# keep the network unstable, therefore wait here to
# slow down the requests
self.wait()
continue
if raw_response.status_code == 200:
try:
msg = response['choices'][0]['message']['content']
self.logger.debug(f'Generated: {msg}')
return msg
except Exception:
code = response.get('base_resp', {}).get('status_code')
if code == 1002:
# rate limit
time.sleep(1)
continue
elif code == 1027:
return 'The request was rejected because high risk'
print(messages, response)
pass
elif raw_response.status_code == 401:
print('请求被拒绝 api_key错误')
continue
elif raw_response.status_code == 400:
print(messages, response)
print('请求失败,状态码:', raw_response)
msg = 'The request was rejected because high risk'
return msg
elif raw_response.status_code == 429:
print(messages, response)
print('请求失败,状态码:', raw_response)
time.sleep(5)
continue
else:
print(messages, response)
print('请求失败,状态码:', raw_response)
time.sleep(1)
max_num_retries += 1
raise RuntimeError(raw_response)

View File

@ -152,8 +152,7 @@ class Qwen(BaseAPIModel):
if response.status_code == 200:
try:
msg = response.output.text
print('=' * 128)
print(msg)
self.logger.debug(msg)
return msg
except KeyError:
print(response)

View File

@ -0,0 +1,182 @@
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
import requests
from opencompass.utils.prompt import PromptList
from .base_api import BaseAPIModel
PromptType = Union[PromptList, str]
class StepFun(BaseAPIModel):
"""Model wrapper around StepFun.
Documentation:
Args:
path (str): The name of the StepFun model to call.
key (str): Authorization key.
query_per_second (int): The maximum queries allowed per second
between two consecutive calls of the API. Defaults to 1.
max_seq_len (int): Unused here.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
retry (int): Number of retries if the API call fails. Defaults to 2.
"""
def __init__(
self,
path: str,
key: str,
url: str,
query_per_second: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
retry: int = 2,
system_prompt: str = '',
):
super().__init__(path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
meta_template=meta_template,
retry=retry)
self.headers = {
'Content-Type': 'application/json',
'Authorization': 'Bearer ' + key,
}
self.url = url
self.model = path
self.system_prompt = system_prompt
def generate(
self,
inputs: List[PromptType],
max_out_len: int = 512,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[PromptType]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs)))
self.flush()
return results
def _generate(
self,
input: PromptType,
max_out_len: int = 512,
) -> str:
"""Generate results given an input.
Args:
inputs (PromptType): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
msg_buffer, last_role = [], None
for item in input:
item['role'] = 'assistant' if item['role'] == 'BOT' else 'user'
if item['role'] != last_role and last_role is not None:
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
msg_buffer = []
msg_buffer.append(item['prompt'])
last_role = item['role']
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
if self.system_prompt:
system = {'role': 'system', 'content': self.system_prompt}
messages.insert(0, system)
data = {'model': self.model, 'messages': messages}
max_num_retries = 0
while max_num_retries < self.retry:
self.acquire()
try:
raw_response = requests.request('POST',
url=self.url,
headers=self.headers,
json=data)
except Exception as err:
print('Request Error:{}'.format(err))
time.sleep(2)
continue
try:
response = raw_response.json()
except Exception:
response = None
self.release()
if response is None:
print('Connection error, reconnect.')
# if a connection error occurs, frequent requests can
# keep the network unstable, therefore wait here to
# slow down the requests
self.wait()
continue
if raw_response.status_code == 200:
# msg = json.load(response.text)
# response
msg = response['choices'][0]['message']['content']
self.logger.debug(f'Generated: {msg}')
return msg
if raw_response.status_code == 400:
print(messages, response)
print('请求失败,状态码:', raw_response)
msg = 'The context length exceeded'
return msg
elif raw_response.status_code == 403:
print('请求被拒绝 api_key错误')
continue
elif raw_response.status_code == 429:
print(messages, response)
print('请求失败,状态码:', raw_response)
time.sleep(5)
continue
elif raw_response.status_code == 451:
print(messages, response)
print('请求失败,状态码:', raw_response)
msg = 'The request was rejected because high risk'
return msg
else:
print(messages, response)
print('请求失败,状态码:', raw_response)
time.sleep(1)
max_num_retries += 1
raise RuntimeError(raw_response)

View File

@ -55,9 +55,6 @@ class TurboMindModel(BaseModel):
if engine_config is not None:
from lmdeploy.messages import TurbomindEngineConfig
engine_config = TurbomindEngineConfig(**engine_config)
if gen_config is not None:
from lmdeploy.messages import EngineGenerationConfig
gen_config = EngineGenerationConfig(**gen_config)
self.logger = get_logger()
tm_model = TurboMind.from_pretrained(path, engine_config=engine_config)
self.tokenizer = tm_model.tokenizer
@ -106,6 +103,7 @@ class TurboMindModel(BaseModel):
t = self.tokenizer.encode(t, add_bos=False)
stop_words.append(t[0])
gen_config['stop_words'] = list(set(stop_words))
gen_config.setdefault('min_new_tokens', 1)
from lmdeploy.messages import EngineGenerationConfig
gen_config = EngineGenerationConfig(**gen_config)
@ -123,6 +121,9 @@ class TurboMindModel(BaseModel):
[gen_config] * len(batch_input),
))
results += _results
if stopping_criteria:
for s in stopping_criteria:
results = [r.split(s)[0] for r in results]
return results
def get_token_len(self, prompt: str) -> int:

View File

@ -1,4 +1,6 @@
import json
import re
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
@ -221,3 +223,150 @@ class XunFei(BaseAPIModel):
if err_code == 10013:
return err_data['header']['message']
raise RuntimeError(f'Code: {err_code}, data: {err_data}')
class XunFeiSpark(BaseAPIModel):
"""Model wrapper around XunFeiSpark.
Documentation:
Args:
path (str): The Spark domain of the model, passed as `spark_llm_domain`.
key (str): Authorization key.
query_per_second (int): The maximum queries allowed per second
between two consecutive calls of the API. Defaults to 1.
max_seq_len (int): Unused here.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
retry (int): Number of retries if the API call fails. Defaults to 2.
"""
def __init__(
self,
path: str,
url: str,
app_id: str,
api_key: str,
api_secret: str,
query_per_second: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
retry: int = 2,
):
super().__init__(path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
meta_template=meta_template,
retry=retry)
try:
from sparkai.llm.llm import ChatSparkLLM # noqa: F401
except ImportError:
raise ImportError('run `pip install --upgrade spark_ai_python`')
self.spark_domain = path
self.url = url
self.app_id = app_id
self.api_key = api_key
self.api_secret = api_secret
def generate(
self,
inputs: List[PromptType],
max_out_len: int = 512,
) -> List[str]:
results = [self._generate(input, max_out_len) for input in inputs]
return results
def _generate(
self,
input: PromptType,
max_out_len: int = 512,
) -> str:
assert isinstance(input, (str, PromptList))
from sparkai.core.messages import ChatMessage
from sparkai.llm.llm import ChatSparkLLM
if isinstance(input, str):
messages = [ChatMessage(role='user', content=input)]
else:
messages = []
msg_buffer, last_role = [], None
for index, item in enumerate(input):
if index == 0 and item['role'] == 'SYSTEM':
role = 'system'
elif item['role'] == 'BOT':
role = 'assistant'
else:
role = 'user'
if role != last_role and last_role is not None:
content = '\n'.join(msg_buffer)
messages.append(
ChatMessage(role=last_role, content=content))
msg_buffer = []
msg_buffer.append(item['prompt'])
last_role = role
content = '\n'.join(msg_buffer)
messages.append(ChatMessage(role=last_role, content=content))
spark = ChatSparkLLM(
spark_api_url=self.url,
spark_app_id=self.app_id,
spark_api_key=self.api_key,
spark_api_secret=self.api_secret,
spark_llm_domain=self.spark_domain,
streaming=False,
max_tokens=max_out_len,
)
all_empty_response = True
for _ in range(self.retry + 1):
try:
outputs = spark.generate([messages]).generations[0]
if len(outputs) == 0:
self.logger.error('Empty response, retrying...')
continue
msg = outputs[0].text
self.logger.debug(f'Generated: {msg}')
return msg
except ConnectionError as e:
match = re.match(r'Error Code: (\d+), Error: (.*)',
e.args[0],
flags=re.DOTALL)
if match:
error_code = int(match.group(1))
msg = match.group(2)
if error_code == 10003: # query data exceed limit
self.logger.error(f'Error {error_code}: {msg}')
return msg
elif error_code in [10013, 10014]: # skip safety problem
self.logger.debug(f'Generated: {msg}')
return msg
elif error_code == 10020: # plugin result is empty
self.logger.error(f'Error {error_code}: {msg}')
return msg
elif error_code == 11202: # qps limit
time.sleep(1)
else:
self.logger.error(f'Error {error_code}: {msg}')
raise e
raise e
except TimeoutError:
self.logger.error('TimeoutError, sleep 60, retrying...')
time.sleep(60)
except Exception as e:
self.logger.error(str(e))
pass
all_empty_response = False
if all_empty_response:
self.logger.error('All empty response')
return 'all empty response'
raise RuntimeError('Failed to generate response')

View File

@ -141,7 +141,7 @@ class DLCRunner(BaseRunner):
hf_offline = self.aliyun_cfg.get('hf_offline', True)
if hf_offline:
shell_cmd += 'export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; export HF_EVALUATE_OFFLINE=1; ' # noqa: E501
shell_cmd += 'export HF_DATASETS_OFFLINE=1; export TRANSFORMERS_OFFLINE=1; export HF_EVALUATE_OFFLINE=1; export HF_HUB_OFFLINE=1; ' # noqa: E501
http_proxy = self.aliyun_cfg.get('http_proxy')
if http_proxy is not None:
@ -158,6 +158,7 @@ class DLCRunner(BaseRunner):
shell_cmd += f'export {extra_env}; '
shell_cmd += f'cd {pwd}; '
shell_cmd += 'umask 0000; '
shell_cmd += '{task_cmd}'
tmpl = ('dlc create job'
@ -195,7 +196,10 @@ class DLCRunner(BaseRunner):
index_to_start = 0
while index_to_start < num_retry_to_start:
index_to_start += 1
output = subprocess.getoutput(cmd)
try:
output = subprocess.getoutput(cmd)
except BlockingIOError:
output = ''
match = re.search(r'\|\s+(dlc[0-9a-z]+)\s+\|', output)
if match is None:
stdout.write('Failed to get job id from output:')
@ -264,7 +268,10 @@ class DLCRunner(BaseRunner):
f" -c {self.aliyun_cfg['dlc_config_path']}"
f' --start_time {pri_time}'
f' --end_time {cur_time}')
log_output = subprocess.getoutput(logs_cmd)
try:
log_output = subprocess.getoutput(logs_cmd)
except BlockingIOError:
log_output = '[WARN] No logs found for the pod'
if '[WARN] No logs found for the pod' not in log_output:
pri_time = cur_time

View File

@ -46,17 +46,19 @@ class LocalRunner(BaseRunner):
lark_bot_url (str): Lark bot url.
"""
def __init__(
self,
task: ConfigDict,
max_num_workers: int = 16,
debug: bool = False,
max_workers_per_gpu: int = 1,
lark_bot_url: str = None,
):
def __init__(self,
task: ConfigDict,
max_num_workers: int = 16,
debug: bool = False,
max_workers_per_gpu: int = 1,
lark_bot_url: str = None,
**kwargs):
super().__init__(task=task, debug=debug, lark_bot_url=lark_bot_url)
self.max_num_workers = max_num_workers
self.max_workers_per_gpu = max_workers_per_gpu
logger = get_logger()
for k, v in kwargs.items():
logger.warning(f'Ignored argument in {self.__module__}: {k}={v}')
def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
"""Launch multiple tasks.

View File

@ -94,11 +94,11 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
f'答案是\s?(\S+)(?:。|$)',
f'答案应该是\s?(\S+)(?:。|$)',
f'答案为\s?(\S+)(?:。|$)',
f'[Tt]he answer is \(?([{options}])\)?',
f'[Tt]he answer is option \(?([{options}])\)?',
f'[Tt]he correct answer is \(?([{options}])\)?',
f'[Tt]he correct answer is option \(?([{options}])\)?',
f'[Tt]he answer to the question is \(?([{options}])\)?',
f'[Tt]he answer is:?\s+\(?([{options}])\)?',
f'[Tt]he answer is option:?\s+\(?([{options}])\)?',
f'[Tt]he correct answer is:?\s+\(?([{options}])\)?',
f'[Tt]he correct answer is option:?\s+\(?([{options}])\)?',
f'[Tt]he answer to the question is:?\s+\(?([{options}])\)?',
f'^选项\s?([{options}])',
f'^([{options}])\s?选?项',
f'(\s|^)[{options}][\s。,:\.$]',
@ -116,7 +116,7 @@ def first_option_postprocess(text: str, options: str, cushion=True) -> str:
if cushion:
patterns.extend(cushion_patterns)
for pattern in patterns:
match = re.search(pattern, text)
match = re.search(pattern, text, re.DOTALL)
if match:
outputs = match.group(0)
for i in options:
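
The loosened patterns above accept an optional colon and arbitrary whitespace after "answer is", and re.search now runs with re.DOTALL so the cushion patterns containing '.' can span line breaks; a quick check of the new pattern shape against assumed model outputs:

import re

options = 'ABCD'
pattern = f'[Tt]he answer is:?\\s+\\(?([{options}])\\)?'
for text in ['The answer is: B', 'the answer is\n(C)']:
    match = re.search(pattern, text, re.DOTALL)
    print(match.group(1) if match else None)  # prints B, then C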