mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Fix] Update alignmentbench (#704)
* update alignmentbench * update alignmentbench * update alignmentbench
This commit is contained in:
parent
1fe152b3e8
commit
bfe4aa2af5
@ -1,5 +1,3 @@
|
|||||||
from os import getenv as gv
|
|
||||||
|
|
||||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
@ -15,10 +13,10 @@ subjective_reader_cfg = dict(
|
|||||||
subjective_all_sets = [
|
subjective_all_sets = [
|
||||||
"alignment_bench",
|
"alignment_bench",
|
||||||
]
|
]
|
||||||
data_path =gv('WORKDIR')+"data/subjective/alignment_bench"
|
data_path ="data/subjective/alignment_bench"
|
||||||
|
|
||||||
alignment_bench_config_path = gv('WORKDIR')+"data/subjective/alignment_bench/config"
|
alignment_bench_config_path = "data/subjective/alignment_bench/"
|
||||||
alignment_bench_config_name = 'multi-dimension'
|
alignment_bench_config_name = 'config/multi-dimension'
|
||||||
|
|
||||||
subjective_datasets = []
|
subjective_datasets = []
|
||||||
|
|
||||||
|
@ -11,7 +11,7 @@ with read_base():
|
|||||||
|
|
||||||
datasets = [*subjective_datasets]
|
datasets = [*subjective_datasets]
|
||||||
|
|
||||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI, HuggingFaceChatGLM3
|
from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAIAllesAPIN, HuggingFaceChatGLM3
|
||||||
from opencompass.partitioners import NaivePartitioner
|
from opencompass.partitioners import NaivePartitioner
|
||||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||||
from opencompass.runners import LocalRunner
|
from opencompass.runners import LocalRunner
|
||||||
@ -19,17 +19,11 @@ from opencompass.runners import SlurmSequentialRunner
|
|||||||
from opencompass.tasks import OpenICLInferTask
|
from opencompass.tasks import OpenICLInferTask
|
||||||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
||||||
from opencompass.summarizers import AlignmentBenchSummarizer
|
from opencompass.summarizers import AlignmentBenchSummarizer
|
||||||
models = [*hf_baichuan2_7b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat]
|
|
||||||
|
|
||||||
api_meta_template = dict(
|
|
||||||
round=[
|
# -------------Inference Stage ----------------------------------------
|
||||||
dict(role='HUMAN', api_role='HUMAN'),
|
|
||||||
dict(role='BOT', api_role='BOT', generate=True)
|
models = [*hf_baichuan2_7b]#, *hf_chatglm3_6b, *hf_internlm_chat_20b, *hf_qwen_7b_chat, *hf_qwen_14b_chat]
|
||||||
],
|
|
||||||
reserved_roles=[
|
|
||||||
dict(role='SYSTEM', api_role='SYSTEM'),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
infer = dict(
|
infer = dict(
|
||||||
partitioner=dict(type=NaivePartitioner),
|
partitioner=dict(type=NaivePartitioner),
|
||||||
@ -42,6 +36,10 @@ infer = dict(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# -------------Evaluation Stage ----------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
## ------------- JudgeLLM Configuration
|
||||||
api_meta_template = dict(
|
api_meta_template = dict(
|
||||||
round=[
|
round=[
|
||||||
dict(role='HUMAN', api_role='HUMAN'),
|
dict(role='HUMAN', api_role='HUMAN'),
|
||||||
@ -50,26 +48,18 @@ api_meta_template = dict(
|
|||||||
)
|
)
|
||||||
|
|
||||||
judge_model = dict(
|
judge_model = dict(
|
||||||
type=HuggingFaceChatGLM3,
|
abbr='GPT4-Turbo',
|
||||||
abbr='chatglm3-6b-hf',
|
type=OpenAIAllesAPIN, path='gpt-4-1106-preview',
|
||||||
path='THUDM/chatglm3-6b',
|
key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
|
||||||
tokenizer_path='THUDM/chatglm3-6b',
|
url='xxxx',
|
||||||
model_kwargs=dict(
|
|
||||||
device_map='auto',
|
|
||||||
trust_remote_code=True,
|
|
||||||
),
|
|
||||||
tokenizer_kwargs=dict(
|
|
||||||
padding_side='left',
|
|
||||||
truncation_side='left',
|
|
||||||
trust_remote_code=True,
|
|
||||||
),
|
|
||||||
meta_template=api_meta_template,
|
meta_template=api_meta_template,
|
||||||
max_out_len=100,
|
query_per_second=16,
|
||||||
max_seq_len=4096,
|
max_out_len=2048,
|
||||||
batch_size=1,
|
max_seq_len=2048,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1)
|
batch_size=8
|
||||||
)
|
)
|
||||||
|
|
||||||
|
## ------------- Evaluation Configuration
|
||||||
eval = dict(
|
eval = dict(
|
||||||
partitioner=dict(
|
partitioner=dict(
|
||||||
type=SubjectiveNaivePartitioner,
|
type=SubjectiveNaivePartitioner,
|
||||||
@ -77,17 +67,16 @@ eval = dict(
|
|||||||
models = [*hf_baichuan2_7b]
|
models = [*hf_baichuan2_7b]
|
||||||
),
|
),
|
||||||
runner=dict(
|
runner=dict(
|
||||||
type=SlurmSequentialRunner,
|
type=LocalRunner,
|
||||||
partition='llmeval',
|
max_num_workers=2,
|
||||||
quotatype='auto',
|
|
||||||
max_num_workers=256,
|
|
||||||
task=dict(
|
task=dict(
|
||||||
type=SubjectiveEvalTask,
|
type=SubjectiveEvalTask,
|
||||||
judge_cfg=judge_model
|
judge_cfg=judge_model
|
||||||
)),
|
)),
|
||||||
)
|
)
|
||||||
work_dir = gv('WORKDIR')+'alignment_bench/'
|
|
||||||
|
|
||||||
summarizer = dict(
|
summarizer = dict(
|
||||||
type=AlignmentBenchSummarizer,
|
type=AlignmentBenchSummarizer,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
work_dir = 'outputs/alignment_bench/'
|
@ -3,7 +3,7 @@ from opencompass.models import HuggingFaceCausalLM
|
|||||||
|
|
||||||
_meta_template = dict(
|
_meta_template = dict(
|
||||||
round=[
|
round=[
|
||||||
dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
|
dict(role='HUMAN', begin='<|User|>:', end='\n'),
|
||||||
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
|
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@ -3,7 +3,7 @@ from opencompass.models import HuggingFaceCausalLM
|
|||||||
|
|
||||||
_meta_template = dict(
|
_meta_template = dict(
|
||||||
round=[
|
round=[
|
||||||
dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
|
dict(role='HUMAN', begin='<|User|>:', end='\n'),
|
||||||
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
|
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@ -3,7 +3,7 @@ from opencompass.models import HuggingFaceCausalLM
|
|||||||
|
|
||||||
_meta_template = dict(
|
_meta_template = dict(
|
||||||
round=[
|
round=[
|
||||||
dict(role='HUMAN', begin='<|User|>:', end='<eoh>\n'),
|
dict(role='HUMAN', begin='<|User|>:', end='\n'),
|
||||||
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
|
dict(role='BOT', begin='<|Bot|>:', end='<eoa>\n', generate=True),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
@ -17,7 +17,7 @@ from .minimax_api import MiniMax # noqa: F401
|
|||||||
from .mixtral import Mixtral # noqa: F401
|
from .mixtral import Mixtral # noqa: F401
|
||||||
from .modelscope import ModelScope, ModelScopeCausalLM # noqa: F401, F403
|
from .modelscope import ModelScope, ModelScopeCausalLM # noqa: F401, F403
|
||||||
from .moonshot_api import MoonShot # noqa: F401
|
from .moonshot_api import MoonShot # noqa: F401
|
||||||
from .openai_api import OpenAI # noqa: F401
|
from .openai_api import OpenAI, OpenAIAllesAPIN # noqa: F401
|
||||||
from .pangu_api import PanGu # noqa: F401
|
from .pangu_api import PanGu # noqa: F401
|
||||||
from .sensetime_api import SenseTime # noqa: F401
|
from .sensetime_api import SenseTime # noqa: F401
|
||||||
from .turbomind import TurboMindModel # noqa: F401
|
from .turbomind import TurboMindModel # noqa: F401
|
||||||
|
@ -310,3 +310,121 @@ class OpenAI(BaseAPIModel):
|
|||||||
elif self.mode == 'rear':
|
elif self.mode == 'rear':
|
||||||
prompt = sep.join(words[:l])
|
prompt = sep.join(words[:l])
|
||||||
return prompt
|
return prompt
|
||||||
|
|
||||||
|
|
||||||
|
@MODELS.register_module(name=['OpenAIAllesAPIN'])
class OpenAIAllesAPIN(OpenAI):
    """Model wrapper around OpenAI-AllesAPIN.

    Requests are POSTed as JSON to ``url`` and authenticated through the
    ``alles-apin-token`` header.

    Args:
        path (str): The name of OpenAI's model.
        url (str): URL to AllesAPIN.
        key (str): AllesAPIN key.
        query_per_second (int): The maximum queries allowed per second
            between two consecutive calls of the API. Defaults to 1.
        rpm_verbose (bool): Forwarded to the ``OpenAI`` base class.
            Defaults to False.
        max_seq_len (int): Forwarded to the ``OpenAI`` base class; not used
            by this subclass directly. Defaults to 2048.
        meta_template (Dict, optional): The model's meta prompt
            template if needed, in case the requirement of injecting or
            wrapping of any meta instructions.
        retry (int): Number of retries if the API call fails. Defaults to 2.
    """

    is_api: bool = True

    def __init__(self,
                 path: str,
                 url: str,
                 key: str,
                 query_per_second: int = 1,
                 rpm_verbose: bool = False,
                 max_seq_len: int = 2048,
                 meta_template: Optional[Dict] = None,
                 retry: int = 2):
        super().__init__(path=path,
                         max_seq_len=max_seq_len,
                         query_per_second=query_per_second,
                         rpm_verbose=rpm_verbose,
                         meta_template=meta_template,
                         retry=retry)
        self.url = url
        self.headers = {
            'alles-apin-token': key,
            'content-type': 'application/json',
        }

    def _generate(self, input: str or PromptList, max_out_len: int,
                  temperature: float) -> str:
        """Generate results given an input.

        NOTE: ``max_out_len`` and ``temperature`` are accepted for interface
        compatibility but are not included in the request payload; only the
        model name and the messages are sent.

        Args:
            input (str or PromptList): A string or PromptDict.
                The PromptDict should be organized in OpenCompass'
                API format.
            max_out_len (int): The maximum length of the output.
            temperature (float): What sampling temperature to use,
                between 0 and 2. Higher values like 0.8 will make the output
                more random, while lower values like 0.2 will make it more
                focused and deterministic.

        Returns:
            str: The generated string.

        Raises:
            RuntimeError: If no attempt out of ``self.retry`` returned a
                usable completion.
        """
        assert isinstance(input, (str, PromptList))

        if isinstance(input, str):
            messages = [{'role': 'user', 'content': input}]
        else:
            messages = []
            for item in input:
                msg = {'content': item['prompt']}
                if item['role'] == 'HUMAN':
                    msg['role'] = 'user'
                elif item['role'] == 'BOT':
                    msg['role'] = 'assistant'
                elif item['role'] == 'SYSTEM':
                    msg['role'] = 'system'
                messages.append(msg)
            # model can be response with user and system
            # when it comes with agent involved.
            assert msg['role'] in ['user', 'system']

        # Serialize the request once. The payload must stay untouched across
        # retries, so the response body below gets its own name instead of
        # rebinding the request variable.
        payload = json.dumps({
            'model': self.path,
            'messages': messages,
        })

        for _ in range(self.retry):
            self.wait()
            raw_response = requests.post(self.url,
                                         headers=self.headers,
                                         data=payload)
            try:
                response = raw_response.json()
            except requests.JSONDecodeError:
                # The %s placeholder is required: logging formats the message
                # lazily with the extra positional argument.
                self.logger.error('JsonDecode error, got %s',
                                  str(raw_response.content))
                continue
            if (raw_response.status_code == 200
                    and response.get('msgCode') == '10000'):
                body = response['data']
                choices = body['choices']
                if choices is None:
                    self.logger.error(body)
                else:
                    return choices[0]['message']['content'].strip()
            # Fall back to the whole response when no 'msg' field is present
            # so a malformed error body does not abort the retry loop.
            self.logger.error(response.get('msg', response))

        raise RuntimeError('API call failed.')

    def get_token_len(self, prompt: str) -> int:
        """Get the length of the tokenized string.

        The prompt is encoded with the tiktoken encoding that
        ``self.tiktoken.encoding_for_model`` returns for ``self.path``
        (``self.tiktoken`` is presumably provided by the ``OpenAI`` base
        class -- confirm there).

        Args:
            prompt (str): Input string.

        Returns:
            int: Length of the input tokens
        """
        enc = self.tiktoken.encoding_for_model(self.path)
        return len(enc.encode(prompt))
|
||||||
|
@ -172,7 +172,8 @@ class LocalAPIRunner(BaseRunner):
|
|||||||
self.max_num_workers = max_num_workers
|
self.max_num_workers = max_num_workers
|
||||||
self.concurrent_users = concurrent_users
|
self.concurrent_users = concurrent_users
|
||||||
assert task['type'] in [
|
assert task['type'] in [
|
||||||
'OpenICLInferTask', 'opencompass.tasks.OpenICLInferTask'
|
'OpenICLInferTask',
|
||||||
|
'opencompass.tasks.OpenICLInferTask',
|
||||||
], 'Only supported for api infer task.'
|
], 'Only supported for api infer task.'
|
||||||
|
|
||||||
def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
|
def launch(self, tasks: List[Dict[str, Any]]) -> List[Tuple[str, int]]:
|
||||||
|
@ -15,7 +15,7 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
from_csv = None
|
from_csv = None
|
||||||
|
|
||||||
from opencompass.utils import dataset_abbr_from_cfg
|
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
|
||||||
|
|
||||||
CATEGORIES = {
|
CATEGORIES = {
|
||||||
'中文推理': ['数学计算', '逻辑推理'],
|
'中文推理': ['数学计算', '逻辑推理'],
|
||||||
@ -91,6 +91,10 @@ class AlignmentBenchSummarizer:
|
|||||||
def __init__(self, config: ConfigDict) -> None:
|
def __init__(self, config: ConfigDict) -> None:
|
||||||
self.tasks = []
|
self.tasks = []
|
||||||
self.cfg = config
|
self.cfg = config
|
||||||
|
self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
|
||||||
|
self.eval_model_abbrs = [
|
||||||
|
model_abbr_from_cfg(model) for model in self.eval_model_cfgs
|
||||||
|
]
|
||||||
|
|
||||||
def summarize(self,
|
def summarize(self,
|
||||||
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
|
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
|
||||||
@ -116,6 +120,8 @@ class AlignmentBenchSummarizer:
|
|||||||
fout2 = osp.join(output_dir, 'capability.csv')
|
fout2 = osp.join(output_dir, 'capability.csv')
|
||||||
fout_flag, fout_flag2 = 0, 0
|
fout_flag, fout_flag2 = 0, 0
|
||||||
for subdir in os.listdir(results_folder):
|
for subdir in os.listdir(results_folder):
|
||||||
|
if subdir not in self.eval_model_abbrs:
|
||||||
|
continue
|
||||||
subdir_path = os.path.join(results_folder, subdir)
|
subdir_path = os.path.join(results_folder, subdir)
|
||||||
if os.path.isdir(subdir_path):
|
if os.path.isdir(subdir_path):
|
||||||
model = subdir
|
model = subdir
|
||||||
|
Loading…
Reference in New Issue
Block a user