[Sync] add OC16 entry (#1171)

Fengzhe Zhou 2024-05-17 16:50:58 +08:00 committed by GitHub
parent 94eb90569f
commit 5de85406ce
15 changed files with 321 additions and 27 deletions

View File

@@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator
sanitized_mbpp_reader_cfg = dict(input_columns=['text', 'test_list'], output_column='test_list_2')
sanitized_mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the similar elements from the given two tuple lists.\nYour code should pass these tests:\n\nassert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4)\nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14)\n',),
dict(role='BOT', prompt='```python\ndef similar_elements(test_tup1, test_tup2):\n res = tuple(set(test_tup1) & set(test_tup2))\n return (res)```',),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a python function to identify non-prime numbers.\nYour code should pass these tests:\n\nassert is_not_prime(2) == False\nassert is_not_prime(10) == True\nassert is_not_prime(35) == True\n',),
dict(role='BOT', prompt='```python\nimport math\ndef is_not_prime(n):\n result = False\n for i in range(2,int(math.sqrt(n)) + 1):\n if n %% i == 0:\n result = True\n return result```',),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\nWrite a function to find the largest integers from a given list of numbers using heap queue algorithm.\nYour code should pass these tests:\n\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75]\nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35]\n',),
dict(role='BOT', prompt='```python\nimport heapq as hq\ndef heap_queue_largest(nums,n):\n largest_nums = hq.nlargest(n, nums)\n return largest_nums```',),
dict(role='HUMAN', prompt='You are an expert Python programmer, and here is your task:\n{text}\nYour code should pass these tests:\n\n{test_list}\n',),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role='BOT')
sanitized_mbpp_datasets = [
dict(
type=SanitizedMBPPDataset,
abbr='sanitized_mbpp',
path='./data/mbpp/sanitized-mbpp.jsonl',
reader_cfg=sanitized_mbpp_reader_cfg,
infer_cfg=sanitized_mbpp_infer_cfg,
eval_cfg=sanitized_mbpp_eval_cfg,
)
]
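
For context, a dataset config like the one above is usually consumed from a run config through OpenCompass's `read_base` mechanism. The sketch below is illustrative only; the import path is a placeholder, since the new file's name is not shown in this diff.

```python
# Sketch only: pull the sanitized MBPP dataset entries into a run config.
from mmengine.config import read_base

with read_base():
    # placeholder module path; substitute the actual config file added here
    from .datasets.mbpp.sanitized_mbpp_gen import sanitized_mbpp_datasets

datasets = [*sanitized_mbpp_datasets]
```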

View File

@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import TACODataset, TACOEvaluator
TACO_difficulties_list = ['EASY', 'MEDIUM', 'MEDIUM_HARD', 'HARD', 'VERY_HARD']
TACO_reader_cfg = dict(input_columns=['question', 'starter'], output_column='problem_id', train_split='test')
TACO_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024),
)
TACO_eval_cfg = dict(evaluator=dict(type=TACOEvaluator), pred_role='BOT')
TACO_datasets = []
for difficulty in TACO_difficulties_list:
TACO_datasets.append(
dict(
type=TACODataset,
abbr='TACO-' + difficulty,
path='./data/BAAI-TACO',
difficulty=difficulty,
reader_cfg=TACO_reader_cfg,
infer_cfg=TACO_infer_cfg,
eval_cfg=TACO_eval_cfg,
)
)
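
Spelled out, the loop above yields one benchmark entry per difficulty tier, so each tier is reported separately downstream:

```python
# Derived from the config above (illustration only): the abbreviations of
# the five generated TACO dataset entries.
print(['TACO-' + d for d in ['EASY', 'MEDIUM', 'MEDIUM_HARD', 'HARD', 'VERY_HARD']])
# ['TACO-EASY', 'TACO-MEDIUM', 'TACO-MEDIUM_HARD', 'TACO-HARD', 'TACO-VERY_HARD']
```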

View File

@@ -13,6 +13,6 @@ models = [
max_memory={i: '75GB' for i in range(8)},
attn_implementation='eager'
),
run_cfg=dict(num_gpus=4),
run_cfg=dict(num_gpus=8),
)
]

View File

@@ -13,6 +13,6 @@ models = [
max_memory={i: '75GB' for i in range(8)},
attn_implementation='eager'
),
run_cfg=dict(num_gpus=4),
run_cfg=dict(num_gpus=8),
)
]

View File

@@ -0,0 +1,12 @@
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='yi-1.5-34b-chat-hf',
path='01-ai/Yi-1.5-34B-Chat',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=2),
)
]

View File

@@ -0,0 +1,12 @@
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='yi-1.5-6b-chat-hf',
path='01-ai/Yi-1.5-6B-Chat',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@@ -0,0 +1,12 @@
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='yi-1.5-9b-chat-hf',
path='01-ai/Yi-1.5-9B-Chat',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
)
]

View File

@@ -288,6 +288,7 @@ class MBPPEvaluator(BaseEvaluator):
r'(.*)\s*```.*',
r"\[BEGIN\]\s*'(.*)",
r'\[BEGIN\](.*)',
r"'(.*)'\s*\[DONE\]",
]
for p in patterns:
match = re.search(p, text, re.DOTALL)
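
The added pattern presumably catches completions that close with the `[DONE]` marker of the MBPP few-shot format without repeating `[BEGIN]`; a quick illustration of what it extracts:

```python
import re

# Illustration only: a completion that wraps the code in quotes and ends
# with [DONE]; the new pattern captures just the quoted program text.
text = "'def first(lst):\n    return lst[0]' [DONE]"
match = re.search(r"'(.*)'\s*\[DONE\]", text, re.DOTALL)
print(match.group(1))  # def first(lst): ...
```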

View File

@@ -37,13 +37,16 @@ TIMEOUT = 10
class TACODataset(BaseDataset):
@staticmethod
def load(path: str, num_repeats: int = 1):
def load(path: str, num_repeats: int = 1, difficulty='ALL'):
dataset = load_from_disk(path)
new_dataset = DatasetDict()
# add new column "starter" in the prompt
for split in dataset.keys():
new_samples = []
for idx, sample in enumerate(dataset[split]):
if 'ALL' not in difficulty:
if not sample['difficulty'] == difficulty:
continue
starter_code = None if len(
sample['starter_code']) == 0 else sample['starter_code']
try:
@@ -71,7 +74,6 @@ class TACODataset(BaseDataset):
for key in new_samples[0].keys()
}
new_dataset[split] = Dataset.from_dict(new_data)
# num_repeats duplicate
# train_repeated = []
test_repeated = []
@@ -84,7 +86,6 @@
# train_repeated
# )
dataset_test_repeated = new_dataset['test'].from_list(test_repeated)
return DatasetDict({
# 'train': dataset_train_repeated,
'test': dataset_test_repeated
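
With the new `difficulty` argument, the loader can be called per tier, which is what the per-difficulty TACO configs earlier in this change rely on. A hedged usage sketch:

```python
# Sketch only: load just the MEDIUM problems of TACO from a local snapshot;
# passing difficulty='ALL' (the default) keeps every problem.
taco_medium = TACODataset.load(path='./data/BAAI-TACO', difficulty='MEDIUM')
```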

View File

@@ -42,5 +42,6 @@ from .vllm import VLLM # noqa: F401
from .vllm_with_tf_above_v4_33 import VLLMwithChatTemplate # noqa: F401
from .xunfei_api import XunFei, XunFeiSpark # noqa: F401
from .yayi_api import Yayi # noqa: F401
from .yi_api import YiAPI # noqa: F401
from .zhipuai_api import ZhiPuAI # noqa: F401
from .zhipuai_v2_api import ZhiPuV2AI # noqa: F401

View File

@@ -64,7 +64,7 @@ def _convert_chat_messages(inputs):
for _input in inputs:
messages = []
if isinstance(_input, str):
messages.append({'role': 'HUMAN', 'content': _input})
messages.append({'role': 'user', 'content': _input})
else:
for item in _input:
role = {
@@ -165,7 +165,7 @@ class HuggingFacewithChatTemplate(BaseModel):
def _load_tokenizer(self, path: Optional[str], kwargs: dict, pad_token_id: Optional[int] = None):
from transformers import AutoTokenizer, GenerationConfig
DEFAULT_TOKENIZER_KWARGS = dict(padding_side='left', truncation_side='left', use_fast=False, trust_remote_code=True)
DEFAULT_TOKENIZER_KWARGS = dict(padding_side='left', truncation_side='left', trust_remote_code=True)
tokenizer_kwargs = DEFAULT_TOKENIZER_KWARGS
tokenizer_kwargs.update(kwargs)
self.tokenizer = AutoTokenizer.from_pretrained(path, **tokenizer_kwargs)
@@ -199,6 +199,7 @@ class HuggingFacewithChatTemplate(BaseModel):
model_kwargs = DEFAULT_MODEL_KWARGS
model_kwargs.update(kwargs)
model_kwargs = _set_model_kwargs_torch_dtype(model_kwargs)
self.logger.debug(f'using model_kwargs: {model_kwargs}')
try:
self.model = AutoModelForCausalLM.from_pretrained(path, **model_kwargs)
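
A small illustration of the `_convert_chat_messages` fix above (a sketch, not the repo helper): a bare string prompt must be mapped to the lower-case `user` role that Hugging Face chat templates expect, rather than OpenCompass's internal `HUMAN` role name.

```python
# Minimal sketch: mapping a plain string prompt into the OpenAI-style
# message list that tokenizer.apply_chat_template expects.
def to_chat_messages(prompt: str) -> list:
    return [{'role': 'user', 'content': prompt}]

print(to_chat_messages('Write a haiku about GPUs.'))
# [{'role': 'user', 'content': 'Write a haiku about GPUs.'}]
```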

View File

@@ -68,6 +68,7 @@ class LightllmAPI(BaseModel):
self.wait()
header = {'content-type': 'application/json'}
try:
self.logger.debug(f'input: {input}')
data = dict(inputs=input, parameters=self.generation_kwargs)
raw_response = requests.post(self.url,
headers=header,
@@ -80,6 +81,7 @@
generated_text = response['generated_text']
if isinstance(generated_text, list):
generated_text = generated_text[0]
self.logger.debug(f'generated_text: {generated_text}')
return generated_text
except requests.JSONDecodeError:
self.logger.error('JsonDecode error, got',

View File

@@ -0,0 +1,178 @@
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional, Union
import requests
from opencompass.utils.prompt import PromptList
from .base_api import BaseAPIModel
PromptType = Union[PromptList, str]
class YiAPI(BaseAPIModel):
"""Model wrapper around YiAPI.
Documentation:
Args:
path (str): The name of YiAPI model.
e.g. `moonshot-v1-32k`
key (str): Authorization key.
query_per_second (int): The maximum queries allowed per second
between two consecutive calls of the API. Defaults to 1.
max_seq_len (int): Unused here.
meta_template (Dict, optional): The model's meta prompt
template if needed, in case the requirement of injecting or
wrapping of any meta instructions.
retry (int): Number of retires if the API call fails. Defaults to 2.
"""
def __init__(
self,
path: str,
key: str,
url: str,
query_per_second: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
retry: int = 2,
system_prompt: str = '',
):
super().__init__(path=path,
max_seq_len=max_seq_len,
query_per_second=query_per_second,
meta_template=meta_template,
retry=retry)
self.headers = {
'Content-Type': 'application/json',
'Authorization': 'Bearer ' + key,
}
self.url = url
self.model = path
self.system_prompt = system_prompt
def generate(
self,
inputs: List[PromptType],
max_out_len: int = 512,
) -> List[str]:
"""Generate results given a list of inputs.
Args:
inputs (List[PromptType]): A list of strings or PromptDicts.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
List[str]: A list of generated strings.
"""
with ThreadPoolExecutor() as executor:
results = list(
executor.map(self._generate, inputs,
[max_out_len] * len(inputs)))
self.flush()
return results
def _generate(
self,
input: PromptType,
max_out_len: int = 512,
) -> str:
"""Generate results given an input.
Args:
inputs (PromptType): A string or PromptDict.
The PromptDict should be organized in OpenCompass'
API format.
max_out_len (int): The maximum length of the output.
Returns:
str: The generated string.
"""
assert isinstance(input, (str, PromptList))
if isinstance(input, str):
messages = [{'role': 'user', 'content': input}]
else:
messages = []
msg_buffer, last_role = [], None
for item in input:
item['role'] = 'assistant' if item['role'] == 'BOT' else 'user'
if item['role'] != last_role and last_role is not None:
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
msg_buffer = []
msg_buffer.append(item['prompt'])
last_role = item['role']
messages.append({
'content': '\n'.join(msg_buffer),
'role': last_role
})
if self.system_prompt:
system = {'role': 'system', 'content': self.system_prompt}
messages.insert(0, system)
data = {'model': self.model, 'messages': messages}
max_num_retries = 0
while max_num_retries < self.retry:
self.acquire()
try:
raw_response = requests.request('POST',
url=self.url,
headers=self.headers,
json=data)
except Exception as err:
print('Request Error:{}'.format(err))
time.sleep(2)
continue
try:
response = raw_response.json()
except Exception as err:
print('Response Error:{}'.format(err))
response = None
self.release()
if response is None:
print('Connection error, reconnect.')
# If a connection error occurs, frequent retries will cause
# continued network instability, so wait here to slow down
# the requests.
self.wait()
continue
if raw_response.status_code == 200:
# msg = json.load(response.text)
# response
msg = response['choices'][0]['message']['content']
self.logger.debug(f'Generated: {msg}')
return msg
if raw_response.status_code == 401:
print('Request rejected: invalid api_key')
continue
elif raw_response.status_code == 400:
print(messages, response)
print('Request failed, status code:', raw_response)
msg = 'The request was rejected due to high-risk content'
return msg
elif raw_response.status_code == 429:
print(messages, response)
print('Request failed, status code:', raw_response)
time.sleep(5)
continue
else:
print(messages, response)
print('Request failed, status code:', raw_response)
time.sleep(1)
max_num_retries += 1
raise RuntimeError(raw_response)
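
A minimal sketch (not part of this commit) of wiring the new wrapper into a model config; the `abbr`, `key`, and `url` values are placeholders you would supply yourself, following the usual OpenCompass API-model config pattern.

```python
from opencompass.models import YiAPI

models = [
    dict(
        type=YiAPI,
        abbr='yi-api',                # placeholder abbreviation
        path='yi-large',              # name of the served Yi model
        key='YOUR_API_KEY',           # placeholder credential
        url='https://api.lingyiwanwu.com/v1/chat/completions',  # placeholder endpoint
        query_per_second=2,
        max_seq_len=2048,
        batch_size=8,
    )
]
```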

View File

@@ -161,17 +161,19 @@ class DLCRunner(BaseRunner):
shell_cmd += 'umask 0000; '
shell_cmd += '{task_cmd}'
tmpl = ('dlc create job'
f" --command '{shell_cmd}'"
f' --name {task_name[:512]}'
' --kind BatchJob'
f" -c {self.aliyun_cfg['dlc_config_path']}"
f" --workspace_id {self.aliyun_cfg['workspace_id']}"
' --worker_count 1'
f' --worker_cpu {max(num_gpus * 8, 12)}'
f' --worker_gpu {num_gpus}'
f' --worker_memory {max(num_gpus * 128, 192)}'
f" --worker_image {self.aliyun_cfg['worker_image']}")
tmpl = (
'dlc submit pytorchjob'
f" --command '{shell_cmd}'"
f' --name {task_name[:512]}'
f" --config {self.aliyun_cfg['dlc_config_path']}"
f" --workspace_id {self.aliyun_cfg['workspace_id']}"
f" --resource_id {self.aliyun_cfg['resource_id']}"
' --workers 1'
f' --worker_cpu {max(num_gpus * 8, 12)}'
f' --worker_gpu {num_gpus}'
f' --worker_memory {max(num_gpus * 128, 192)}Gi'
f" --worker_image {self.aliyun_cfg['worker_image']}"
f" --data_sources {','.join(self.aliyun_cfg['data_sources'])}")
get_cmd = partial(task.get_command,
cfg_path=param_file,
template=tmpl)
@@ -219,14 +221,9 @@
pri_time = None
initial_time = datetime.datetime.now()
url = 'http://pai-console.cb210e3f99cd7403f8de2a630dcc99fc3.cn-wulanchabu.alicontainer.com' # noqa: E501
url = f"https://pai.console.aliyun.com/?regionId=cn-wulanchabu&workspaceId={self.aliyun_cfg['workspace_id']}#/dlc/jobs/{job_id}" # noqa: E501
logger = get_logger()
logger.debug('')
logger.debug('*' * 168)
logger.debug(
f'{url}/index?workspaceId={self.aliyun_cfg["workspace_id"]}#/dlc2/job/{job_id}/detail' # noqa: E501
)
logger.debug('*' * 168)
logger.debug('\n' + '*' * 168 + '\n' + url + '\n' + '*' * 168)
while True:
# 1. Avoid to request dlc too frequently.
@@ -264,7 +261,7 @@
cur_time = (pod_create_time +
elasped_time).strftime('%Y-%m-%dT%H:%M:%SZ')
logs_cmd = ('dlc logs'
f' {job_id} {job_id}-worker-0'
f' {job_id} {job_id}-master-0'
f" -c {self.aliyun_cfg['dlc_config_path']}"
f' --start_time {pri_time}'
f' --end_time {cur_time}')
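
For a sense of the resources a task requests under the new `dlc submit pytorchjob` template, the `max()` expressions above resolve as follows for an 8-GPU job (illustration only):

```python
# Illustration of the worker resource flags computed in the template above.
num_gpus = 8
worker_cpu = max(num_gpus * 8, 12)        # 64 cores
worker_memory = max(num_gpus * 128, 192)  # 1024, rendered as '1024Gi'
print(worker_cpu, f'{worker_memory}Gi')   # 64 1024Gi
```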

View File

@@ -84,7 +84,7 @@ def get_config_from_arg(args) -> Config:
# set infer accelerator if needed
if args.accelerator in ['vllm', 'lmdeploy']:
config['models'] = change_accelerator(config['models'], args.accelerator)
if 'eval' in config and 'partitioner' in config['eval']:
if config.get('eval', {}).get('partitioner', {}).get('models') is not None:
config['eval']['partitioner']['models'] = change_accelerator(config['eval']['partitioner']['models'], args.accelerator)
if config.get('eval', {}).get('partitioner', {}).get('judge_models') is not None:
config['eval']['partitioner']['judge_models'] = change_accelerator(config['eval']['partitioner']['judge_models'], args.accelerator)
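
The guard change in `get_config_from_arg` replaces key-membership checks with chained `.get()` lookups, so an eval partitioner without an explicit `models` (or `judge_models`) list no longer triggers a KeyError; a small sketch of the behaviour:

```python
# Illustration only: an eval partitioner that defines judge_models but no
# models. The chained .get() returns None instead of raising KeyError, so
# the accelerator substitution for 'models' is simply skipped.
config = {'eval': {'partitioner': {'judge_models': [dict(abbr='judge')]}}}
print(config.get('eval', {}).get('partitioner', {}).get('models'))        # None
print(config.get('eval', {}).get('partitioner', {}).get('judge_models'))  # [{'abbr': 'judge'}]
```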