mirror of https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Fix] Update alignmentbench (#704)

* update alignmentbench
* update alignmentbench
* update alignmentbench

This commit is contained in:
parent 1fe152b3e8
commit bfe4aa2af5
@@ -1,5 +1,3 @@
-from os import getenv as gv
-
 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
@@ -15,10 +13,10 @@ subjective_reader_cfg = dict(
 subjective_all_sets = [
     "alignment_bench",
 ]
-data_path =gv('WORKDIR')+"data/subjective/alignment_bench"
+data_path ="data/subjective/alignment_bench"
 
-alignment_bench_config_path = gv('WORKDIR')+"data/subjective/alignment_bench/config"
-alignment_bench_config_name = 'multi-dimension'
+alignment_bench_config_path = "data/subjective/alignment_bench/"
+alignment_bench_config_name = 'config/multi-dimension'
 
 subjective_datasets = []
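Note: with `gv('WORKDIR')` gone, both paths above are plain relative paths, so the dataset and judge config must sit under the directory OpenCompass is launched from. A minimal sketch of the difference (illustration only, not OpenCompass code; the fallback directory is made up):

```python
# Illustration only: how the removed style and the new style resolve a path.
# 'WORKDIR' is assumed to have been an environment variable ending in '/'.
import os

old_style = os.getenv('WORKDIR', '/abs/workdir/') + 'data/subjective/alignment_bench'
new_style = os.path.abspath('data/subjective/alignment_bench')  # resolved against the CWD
print(old_style)
print(new_style)
```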
@@ -11,7 +11,7 @@ with read_base():
 
 datasets = [*subjective_datasets]
 
-from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI, HuggingFaceChatGLM3
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3
 from opencompass.partitioners import NaivePartitioner
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
 from opencompass.runners import LocalRunner
@@ -19,17 +19,11 @@ from opencompass.runners import SlurmSequentialRunner
 from opencompass.tasks import OpenICLInferTask
 from opencompass.tasks.subjective_eval import SubjectiveEvalTask
 from opencompass.summarizers import AlignmentBenchSummarizer
-models = [*hf_baichuan2_7b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat]
 
-api_meta_template = dict(
-    round=[
-        dict(role='HUMAN', api_role='HUMAN'),
-        dict(role='BOT', api_role='BOT', generate=True)
-    ],
-    reserved_roles=[
-        dict(role='SYSTEM', api_role='SYSTEM'),
-    ],
-)
+
+# -------------Inference Stage ----------------------------------------
+
+models = [*hf_baichuan2_7b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat]
 
 infer = dict(
     partitioner=dict(type=NaivePartitioner),
@@ -42,6 +36,10 @@ infer = dict(
 )
 
 
+# -------------Evaluation Stage ----------------------------------------
+
+
+## ------------- JudgeLLM Configuration
 api_meta_template = dict(
     round=[
         dict(role='HUMAN', api_role='HUMAN'),
@@ -50,26 +48,18 @@ api_meta_template = dict(
 )
 
 judge_model = dict(
-    type=HuggingFaceChatGLM3,
-    abbr='chatglm3-6b-hf',
-    path='THUDM/chatglm3-6b',
-    tokenizer_path='THUDM/chatglm3-6b',
-    model_kwargs=dict(
-        device_map='auto',
-        trust_remote_code=True,
-    ),
-    tokenizer_kwargs=dict(
-        padding_side='left',
-        truncation_side='left',
-        trust_remote_code=True,
-    ),
+    abbr='GPT4-Turbo',
+    type=OpenAIAllesAPIN, path='gpt-4-1106-preview',
+    key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
+    url='xxxx',
     meta_template=api_meta_template,
-    max_out_len=100,
-    max_seq_len=4096,
-    batch_size=1,
-    run_cfg=dict(num_gpus=1, num_procs=1)
-)
+    query_per_second=16,
+    max_out_len=2048,
+    max_seq_len=2048,
+    batch_size=8
+)
 
+## ------------- Evaluation Configuration
 eval = dict(
     partitioner=dict(
         type=SubjectiveNaivePartitioner,
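The inline comment above says the key falls back to `$OPENAI_API_KEY`. If you would rather never write the token into the config at all, a hedged variant reads it from the environment explicitly; `OpenAIAllesAPIN` and `api_meta_template` are assumed imported/defined as in this file, and the `url` placeholder stays elided exactly as in the commit:

```python
# Sketch under the assumption that OPENAI_API_KEY is exported in the shell;
# everything else mirrors the judge_model dict from the diff.
import os

judge_model = dict(
    abbr='GPT4-Turbo',
    type=OpenAIAllesAPIN, path='gpt-4-1106-preview',
    key=os.environ['OPENAI_API_KEY'],  # fail early with KeyError if unset
    url='xxxx',  # endpoint deliberately left as a placeholder
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=2048,
    max_seq_len=2048,
    batch_size=8,
)
```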
@@ -77,17 +67,16 @@ eval = dict(
         models = [*hf_baichuan2_7b]
     ),
     runner=dict(
-        type=SlurmSequentialRunner,
-        partition='llmeval',
-        quotatype='auto',
-        max_num_workers=256,
+        type=LocalRunner,
+        max_num_workers=2,
         task=dict(
             type=SubjectiveEvalTask,
             judge_cfg=judge_model
         )),
 )
-work_dir = gv('WORKDIR')+'alignment_bench/'
 
 summarizer = dict(
     type=AlignmentBenchSummarizer,
 )
+
+work_dir = 'outputs/alignment_bench/'
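Switching the evaluation runner from `SlurmSequentialRunner` (256 workers on the `llmeval` partition) to `LocalRunner` with `max_num_workers=2` trades cluster throughput for a setup that runs on a single machine. For reference, restoring the removed cluster configuration would look like this; the values are the old ones from the diff, not recommendations:

```python
# The Slurm variant removed above, reassembled for reference only.
runner = dict(
    type=SlurmSequentialRunner,
    partition='llmeval',
    quotatype='auto',
    max_num_workers=256,
    task=dict(
        type=SubjectiveEvalTask,
        judge_cfg=judge_model,
    ),
)
```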
@@ -3,7 +3,7 @@ from opencompass.models import HuggingFaceCausalLM
 
 _meta_template = dict(
     round=[
-        dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
+        dict(role='HUMAN', begin='<|User|>:', end='\n'),
         dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
     ],
 )
@@ -3,7 +3,7 @@ from opencompass.models import HuggingFaceCausalLM
 
 _meta_template = dict(
     round=[
-        dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
+        dict(role='HUMAN', begin='<|User|>:', end='\n'),
         dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
     ],
 )
@@ -3,7 +3,7 @@ from opencompass.models import HuggingFaceCausalLM
 
 _meta_template = dict(
     round=[
-        dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
+        dict(role='HUMAN', begin='<|User|>:', end='\n'),
         dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
     ],
 )
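The same one-line change is applied in three model config files: the `<eoh>` end-of-turn token is dropped from the HUMAN role. A toy sketch of the effect on a serialized prompt (not OpenCompass's actual templating code):

```python
# Toy renderer: shows only how the 'end' string changes the rendered turn.
def render_turn(text: str, begin: str, end: str) -> str:
    return f'{begin}{text}{end}'

before = render_turn('How are you?', '<|User|>:', '<eoh>\n')
after = render_turn('How are you?', '<|User|>:', '\n')
print(repr(before))  # '<|User|>:How are you?<eoh>\n'
print(repr(after))   # '<|User|>:How are you?\n'
```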
@@ -17,7 +17,7 @@ from .minimax_api import MiniMax  # noqa: F401
 from .mixtral import Mixtral  # noqa: F401
 from .modelscope import ModelScope, ModelScopeCausalLM  # noqa: F401, F403
 from .moonshot_api import MoonShot  # noqa: F401
-from .openai_api import OpenAI  # noqa: F401
+from .openai_api import OpenAI, OpenAIAllesAPIN  # noqa: F401
 from .pangu_api import PanGu  # noqa: F401
 from .sensetime_api import SenseTime  # noqa: F401
 from .turbomind import TurboMindModel  # noqa: F401
@@ -310,3 +310,121 @@ class OpenAI(BaseAPIModel):
         elif self.mode == 'rear':
             prompt = sep.join(words[:l])
         return prompt
+
+
+@MODELS.register_module(name=['OpenAIAllesAPIN'])
+class OpenAIAllesAPIN(OpenAI):
+    """Model wrapper around OpenAI-AllesAPIN.
+
+    Args:
+        path (str): The name of OpenAI's model.
+        url (str): URL to AllesAPIN.
+        key (str): AllesAPIN key.
+        query_per_second (int): The maximum queries allowed per second
+            between two consecutive calls of the API. Defaults to 1.
+        max_seq_len (int): Unused here.
+        meta_template (Dict, optional): The model's meta prompt
+            template if needed, in case there is a requirement to inject
+            or wrap any meta instructions.
+        retry (int): Number of retries if the API call fails. Defaults to 2.
+    """
+
+    is_api: bool = True
+
+    def __init__(self,
+                 path: str,
+                 url: str,
+                 key: str,
+                 query_per_second: int = 1,
+                 rpm_verbose: bool = False,
+                 max_seq_len: int = 2048,
+                 meta_template: Optional[Dict] = None,
+                 retry: int = 2):
+        super().__init__(path=path,
+                         max_seq_len=max_seq_len,
+                         query_per_second=query_per_second,
+                         rpm_verbose=rpm_verbose,
+                         meta_template=meta_template,
+                         retry=retry)
+        self.url = url
+        self.headers = {
+            'alles-apin-token': key,
+            'content-type': 'application/json',
+        }
+
+    def _generate(self, input: str or PromptList, max_out_len: int,
+                  temperature: float) -> str:
+        """Generate results given an input.
+
+        Args:
+            input (str or PromptList): A string or PromptDict.
+                The PromptDict should be organized in OpenCompass'
+                API format.
+            max_out_len (int): The maximum length of the output.
+            temperature (float): What sampling temperature to use,
+                between 0 and 2. Higher values like 0.8 will make the output
+                more random, while lower values like 0.2 will make it more
+                focused and deterministic.
+
+        Returns:
+            str: The generated string.
+        """
+        assert isinstance(input, (str, PromptList))
+
+        if isinstance(input, str):
+            messages = [{'role': 'user', 'content': input}]
+        else:
+            messages = []
+            for item in input:
+                msg = {'content': item['prompt']}
+                if item['role'] == 'HUMAN':
+                    msg['role'] = 'user'
+                elif item['role'] == 'BOT':
+                    msg['role'] = 'assistant'
+                elif item['role'] == 'SYSTEM':
+                    msg['role'] = 'system'
+                messages.append(msg)
+            # The last message must come from the user or the system
+            # when an agent is involved.
+            assert msg['role'] in ['user', 'system']
+        data = {
+            'model': self.path,
+            'messages': messages,
+        }
+
+        for _ in range(self.retry):
+            self.wait()
+            raw_response = requests.post(self.url,
+                                         headers=self.headers,
+                                         data=json.dumps(data))
+            try:
+                response = raw_response.json()
+            except requests.JSONDecodeError:
+                self.logger.error('JsonDecode error, got %s',
+                                  str(raw_response.content))
+                continue
+            if raw_response.status_code == 200 and response[
+                    'msgCode'] == '10000':
+                data = response['data']
+                choices = data['choices']
+                if choices is None:
+                    self.logger.error(data)
+                else:
+                    return choices[0]['message']['content'].strip()
+            self.logger.error(response['msg'])
+
+        raise RuntimeError('API call failed.')
+
+    def get_token_len(self, prompt: str) -> int:
+        """Get lengths of the tokenized string. Only English and Chinese
+        characters are counted for now. Users are encouraged to override this
+        method if a more accurate length is needed.
+
+        Args:
+            prompt (str): Input string.
+
+        Returns:
+            int: Length of the input tokens
+        """
+        enc = self.tiktoken.encoding_for_model(self.path)
+        return len(enc.encode(prompt))
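In practice the wrapper is constructed by OpenCompass from `judge_cfg` rather than by hand, but a direct instantiation shows the constructor surface. A hedged sketch with placeholder endpoint and token (both are elided in the commit itself):

```python
# Sketch only: 'xxxx' placeholders stand in for a real AllesAPIN endpoint/token.
from opencompass.models import OpenAIAllesAPIN

judge = OpenAIAllesAPIN(
    path='gpt-4-1106-preview',
    url='xxxx',  # POST target; a successful response carries msgCode == '10000'
    key='xxxx',  # sent as the 'alles-apin-token' header
    query_per_second=16,
    max_seq_len=2048,
)
```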
@@ -172,7 +172,8 @@ class LocalAPIRunner(BaseRunner):
         self.max_num_workers = max_num_workers
         self.concurrent_users = concurrent_users
         assert task['type'] in [
-            'OpenICLInferTask', 'opencompass.tasks.OpenICLInferTask'
+            'OpenICLInferTask',
+            'opencompass.tasks.OpenICLInferTask',
         ], 'Only supported for api infer task.'
 
     def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
@@ -15,7 +15,7 @@ try:
 except ImportError:
     from_csv = None
 
-from opencompass.utils import dataset_abbr_from_cfg
+from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
 
 CATEGORIES = {
     '中文推理': ['数学计算', '逻辑推理'],
@@ -91,6 +91,10 @@ class AlignmentBenchSummarizer:
     def __init__(self, config: ConfigDict) -> None:
         self.tasks = []
         self.cfg = config
+        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
+        self.eval_model_abbrs = [
+            model_abbr_from_cfg(model) for model in self.eval_model_cfgs
+        ]
 
     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
@@ -116,6 +120,8 @@ class AlignmentBenchSummarizer:
         fout2 = osp.join(output_dir, 'capability.csv')
         fout_flag, fout_flag2 = 0, 0
         for subdir in os.listdir(results_folder):
+            if subdir not in self.eval_model_abbrs:
+                continue
             subdir_path = os.path.join(results_folder, subdir)
             if os.path.isdir(subdir_path):
                 model = subdir
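With `eval_model_abbrs` built in `__init__` from `eval.partitioner.models`, the summarizer now ignores any results subdirectory that does not belong to an evaluated model (for example the judge's own output folder). A standalone sketch of the same filtering idea; the folder name and abbreviation below are hypothetical:

```python
# Standalone illustration; the results path and the abbr are made-up examples.
import os

eval_model_abbrs = ['baichuan2-7b-chat-hf']
results_folder = 'outputs/alignment_bench/results'

if os.path.isdir(results_folder):
    for subdir in os.listdir(results_folder):
        if subdir not in eval_model_abbrs:
            continue  # skip folders for models that were not evaluated
        subdir_path = os.path.join(results_folder, subdir)
        if os.path.isdir(subdir_path):
            print('summarizing', subdir_path)
```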