mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
parent
d34ba11106
commit
1c8e193de8
@ -2,7 +2,6 @@ from mmengine.config import read_base
|
|||||||
|
|
||||||
with read_base():
|
with read_base():
|
||||||
from .datasets.subjective.multiround.mtbench_single_judge import subjective_datasets
|
from .datasets.subjective.multiround.mtbench_single_judge import subjective_datasets
|
||||||
|
|
||||||
# from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets
|
# from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets
|
||||||
|
|
||||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
|
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
|
||||||
@ -18,6 +17,7 @@ from opencompass.summarizers import MTBenchSummarizer
|
|||||||
|
|
||||||
api_meta_template = dict(
|
api_meta_template = dict(
|
||||||
round=[
|
round=[
|
||||||
|
dict(role='SYSTEM', api_role='SYSTEM'),
|
||||||
dict(role='HUMAN', api_role='HUMAN'),
|
dict(role='HUMAN', api_role='HUMAN'),
|
||||||
dict(role='BOT', api_role='BOT', generate=True),
|
dict(role='BOT', api_role='BOT', generate=True),
|
||||||
]
|
]
|
||||||
@ -54,10 +54,10 @@ models = [
|
|||||||
datasets = [*subjective_datasets]
|
datasets = [*subjective_datasets]
|
||||||
|
|
||||||
infer = dict(
|
infer = dict(
|
||||||
partitioner=dict(type=SizePartitioner, max_task_size=100),
|
partitioner=dict(type=SizePartitioner, max_task_size=10000),
|
||||||
runner=dict(
|
runner=dict(
|
||||||
type=SlurmSequentialRunner,
|
type=SlurmSequentialRunner,
|
||||||
partition='llmeval',
|
partition='llm_dev2',
|
||||||
quotatype='auto',
|
quotatype='auto',
|
||||||
max_num_workers=256,
|
max_num_workers=256,
|
||||||
task=dict(type=OpenICLInferTask),
|
task=dict(type=OpenICLInferTask),
|
||||||
@ -70,12 +70,12 @@ infer = dict(
|
|||||||
judge_model = dict(
|
judge_model = dict(
|
||||||
abbr='GPT4-Turbo',
|
abbr='GPT4-Turbo',
|
||||||
type=OpenAIAllesAPIN,
|
type=OpenAIAllesAPIN,
|
||||||
path='gpt-4-0613',
|
path='gpt-4-0613', # To compare with the official leaderboard, please use gpt4-0613
|
||||||
key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
|
key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
|
||||||
url='xxxx',
|
url='xxxx',
|
||||||
meta_template=api_meta_template,
|
meta_template=api_meta_template,
|
||||||
query_per_second=16,
|
query_per_second=16,
|
||||||
max_out_len=1024,
|
max_out_len=2048,
|
||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
temperature=0,
|
temperature=0,
|
||||||
@ -95,7 +95,7 @@ judge_model = dict(
|
|||||||
|
|
||||||
## single evaluation
|
## single evaluation
|
||||||
eval = dict(
|
eval = dict(
|
||||||
partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=100, mode='singlescore', models=models),
|
partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=10000, mode='singlescore', models=models),
|
||||||
runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)),
|
runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -17,11 +17,10 @@ except ImportError:
|
|||||||
from opencompass.utils import model_abbr_from_cfg
|
from opencompass.utils import model_abbr_from_cfg
|
||||||
|
|
||||||
from .compass_arena import CompassArenaSummarizer
|
from .compass_arena import CompassArenaSummarizer
|
||||||
from .subjective_post_process import post_process_autoj
|
|
||||||
from .utils import get_judgeanswer_and_reference, get_outdir
|
from .utils import get_judgeanswer_and_reference, get_outdir
|
||||||
|
|
||||||
|
|
||||||
def post_process_mtbench(judgement: str):
|
def post_process_mtbench_pair(judgement: str):
|
||||||
"""Input a string like below:
|
"""Input a string like below:
|
||||||
|
|
||||||
xxx[[A]]xxx, and extract the judge
|
xxx[[A]]xxx, and extract the judge
|
||||||
@ -34,6 +33,20 @@ def post_process_mtbench(judgement: str):
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def post_process_mtbench_single(judgement: str):
    """Extract the numeric score from a single-answer judgement.

    The judgement must contain the verdict in the form
    ``Rating: [[<number>]]`` (optional whitespace after the colon), e.g.::

        xxx Rating: [[5]] xxx

    Only the first such occurrence is considered. A bare ``[[5]]`` without
    the ``Rating:`` prefix is not recognised.

    Args:
        judgement: Raw text produced by the judge model.

    Returns:
        ``{'score': <float>}`` when a rating is present and parses as a
        number, otherwise ``None``.
    """
    pattern = r'Rating:\s*\[\[([\d.]+)\]\]'
    matched = re.search(pattern, judgement)
    if matched is None:
        return None
    try:
        score = float(matched.group(1))
    except ValueError:
        # ``[\d.]+`` also matches non-numeric text such as '.' or '1.2.3';
        # treat that as "no usable rating" instead of raising.
        return None
    return {'score': score}
|
||||||
|
|
||||||
|
|
||||||
def get_capability_results(
|
def get_capability_results(
|
||||||
judged_answers,
|
judged_answers,
|
||||||
references,
|
references,
|
||||||
@ -87,8 +100,8 @@ class MTBenchSummarizer(CompassArenaSummarizer):
|
|||||||
'compare_models']
|
'compare_models']
|
||||||
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
|
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
|
||||||
self.judge_map = {
|
self.judge_map = {
|
||||||
'single': post_process_autoj,
|
'single': post_process_mtbench_single,
|
||||||
'pair': post_process_mtbench
|
'pair': post_process_mtbench_pair
|
||||||
}
|
}
|
||||||
self.judge_function = self.judge_map[self.judge_type]
|
self.judge_function = self.judge_map[self.judge_type]
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user