Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)

[Refactor] Reorganize subjective eval (#1284)

* fix pip version
* reorganize subjective eval
* reorg subeval
* update subjective doc

This commit is contained in:
parent aadcfa625f
commit 68ca48496b
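At a glance, the refactor renames the generic `subjective_datasets` list exported by each subjective dataset config to a benchmark-specific name (e.g. `alignbench_datasets`, `alpacav2_datasets`) and moves the judging `mode`, `summarizer`, `base_models` and `given_pred` settings into each dataset dict, so one entry config can mix benchmarks. A minimal sketch of the new usage (import paths are the ones added in `configs/eval_subjective.py`; combining only two benchmarks is an illustrative choice, not part of the commit):

from mmengine.config import read_base

with read_base():
    # Each config now exports a benchmark-specific list whose entries already
    # carry mode/summarizer/base_models instead of a generic subjective_datasets.
    from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
    from .datasets.subjective.arena_hard.arena_hard_compare import arenahard_datasets

# Mix single-score and pairwise benchmarks in one run.
datasets = [*alignbench_datasets, *arenahard_datasets]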
@@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import AlignmentBenchDataset
+from opencompass.summarizers import AlignmentBenchSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['question', 'capability', 'ref'],
@@ -14,7 +15,7 @@ subjective_all_sets = [
 ]
 data_path ='data/subjective/alignment_bench'
 
-subjective_datasets = []
+alignbench_datasets = []
 
 for _name in subjective_all_sets:
     subjective_infer_cfg = dict(
@@ -59,7 +60,7 @@ for _name in subjective_all_sets:
         pred_role='BOT',
     )
 
-    subjective_datasets.append(
+    alignbench_datasets.append(
         dict(
             abbr=f'{_name}',
             type=AlignmentBenchDataset,
@@ -67,5 +68,7 @@ for _name in subjective_all_sets:
             name=_name,
             reader_cfg=subjective_reader_cfg,
             infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
+            eval_cfg=subjective_eval_cfg,
+            mode='singlescore',
+            summarizer = dict(type=AlignmentBenchSummarizer, judge_type='autoj')
         ))
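The practical effect of the hunks above is that each AlignBench entry is now self-describing: it carries its own judging mode and summarizer instead of relying on a global summarizer in the run config. A hypothetical sanity check (not part of the diff, names taken from the config above):

# Hypothetical illustration, not in the repo: every entry produced by the loop
# above now records how it should be judged and summarized.
for d in alignbench_datasets:
    assert d['mode'] == 'singlescore'
    assert d['summarizer']['type'] is AlignmentBenchSummarizer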
@@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import AlignmentBenchDataset
+from opencompass.summarizers import AlignmentBenchSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['question', 'capability', 'critiquellm_prefix'],
@@ -17,7 +18,7 @@ data_path ='data/subjective/alignment_bench'
 alignment_bench_config_path = 'data/subjective/alignment_bench/config'
 alignment_bench_config_name = 'multi-dimension'
 
-subjective_datasets = []
+alignbench_datasets = []
 
 for _name in subjective_all_sets:
     subjective_infer_cfg = dict(
@@ -50,7 +51,7 @@ for _name in subjective_all_sets:
         pred_role='BOT',
     )
 
-    subjective_datasets.append(
+    alignbench_datasets.append(
         dict(
             abbr=f'{_name}',
             type=AlignmentBenchDataset,
@@ -60,5 +61,7 @@ for _name in subjective_all_sets:
             alignment_bench_config_name=alignment_bench_config_name,
             reader_cfg=subjective_reader_cfg,
             infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
+            eval_cfg=subjective_eval_cfg,
+            mode='singlescore',
+            summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
         ))
@@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import AlignmentBenchDataset
+from opencompass.summarizers import AlignmentBenchSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['question', 'capability', 'ref'],
@@ -14,7 +15,7 @@ subjective_all_sets = [
 ]
 data_path ='data/subjective/alignment_bench'
 
-subjective_datasets = []
+alignbench_datasets = []
 
 for _name in subjective_all_sets:
     subjective_infer_cfg = dict(
@@ -47,7 +48,7 @@ for _name in subjective_all_sets:
         pred_role='BOT',
     )
 
-    subjective_datasets.append(
+    alignbench_datasets.append(
         dict(
             abbr=f'{_name}',
             type=AlignmentBenchDataset,
@@ -55,5 +56,7 @@ for _name in subjective_all_sets:
             name=_name,
             reader_cfg=subjective_reader_cfg,
             infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
+            eval_cfg=subjective_eval_cfg,
+            mode='singlescore',
+            summarizer = dict(type=AlignmentBenchSummarizer, judge_type='judgelm')
         ))
@@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import AlignmentBenchDataset
+from opencompass.summarizers import AlignmentBenchSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['question', 'capability', 'critiquellm_prefix'],
@@ -10,14 +11,14 @@ subjective_reader_cfg = dict(
 )
 
 subjective_all_sets = [
-    'alignment_bench_v1_1',
+    'alignment_bench_v1_1', # Changed to Alignbench_v1_1 since 06/15/2024, refer to https://github.com/THUDM/AlignBench
 ]
 data_path ='data/subjective/alignment_bench'
 
 alignment_bench_config_path = 'data/subjective/alignment_bench/config'
 alignment_bench_config_name = 'multi-dimension'
 
-subjective_datasets = []
+alignbench_datasets = []
 
 for _name in subjective_all_sets:
     subjective_infer_cfg = dict(
@@ -50,7 +51,7 @@ for _name in subjective_all_sets:
         pred_role='BOT',
     )
 
-    subjective_datasets.append(
+    alignbench_datasets.append(
         dict(
             abbr=f'{_name}',
             type=AlignmentBenchDataset,
@@ -60,5 +61,7 @@ for _name in subjective_all_sets:
             alignment_bench_config_name=alignment_bench_config_name,
             reader_cfg=subjective_reader_cfg,
             infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
+            eval_cfg=subjective_eval_cfg,
+            mode='singlescore',
+            summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
         ))
@@ -15,7 +15,7 @@ subjective_all_sets = [
 ]
 
 
-subjective_datasets = []
+alpacav1_datasets = []
 
 gpt4_prompt = """
 I want you to create a leaderboard of different of large-language models. To do so, I will give you the instructions (prompts) given to the models, and the responses of two models. Please rank the models based on which responses would be preferred by humans. All inputs and outputs should be python dictionaries.
@@ -85,7 +85,7 @@ for _name in subjective_all_sets:
         pred_role='BOT',
     )
 
-    subjective_datasets.append(
+    alpacav1_datasets.append(
         dict(
             abbr=f'{_name}',
             type=SubjectiveCmpDataset,
@@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import SubjectiveCmpDataset
+from opencompass.summarizers import AlpacaSummarizer
 from mmengine.config import read_base
 
 subjective_reader_cfg = dict(
@@ -15,7 +16,7 @@ subjective_all_sets = [
 ]
 
 
-subjective_datasets = []
+alpacav2_datasets = []
 
 gpt4_prompt = """
 I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
@@ -48,6 +49,17 @@ Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output.
 ## Best Model Identifier
 """
 
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ],
+    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
+)
+
+gpt4 = [dict(
+    abbr='gpt4-turbo',
+)]
 
 for _name in subjective_all_sets:
     subjective_infer_cfg = dict(
@@ -87,7 +99,7 @@ for _name in subjective_all_sets:
         pred_role='BOT',
     )
 
-    subjective_datasets.append(
+    alpacav2_datasets.append(
         dict(
             abbr=f'{_name}',
             type=SubjectiveCmpDataset,
@@ -95,5 +107,10 @@ for _name in subjective_all_sets:
             name=_name,
             reader_cfg=subjective_reader_cfg,
             infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
+            eval_cfg=subjective_eval_cfg,
+            mode='m2n',
+            infer_order='random',
+            base_models=gpt4,
+            summarizer=dict(type=AlpacaSummarizer, judge_type='v2'),
+            given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
         ))
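AlpacaEval v2 is the first config in this commit to carry the pairwise settings inside the dataset dicts, so the new fields are worth spelling out. A hedged illustration of the shape of one appended entry (values copied from the hunk above; the name `example_entry` is only for exposition):

# Illustration only: the comparison-specific fields added to each alpacav2 entry.
example_entry = dict(
    mode='m2n',            # compare the candidate models against the base models
    infer_order='random',  # presumably shuffles answer order to reduce position bias
    base_models=gpt4,      # gpt4 = [dict(abbr='gpt4-turbo')] as defined above
    given_pred=[{'abbr': 'gpt4-turbo',
                 'path': './data/subjective/alpaca_eval/gpt4-turbo'}],  # reuse pre-computed baseline predictions
)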
@@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import ArenaHardDataset
+from opencompass.summarizers import ArenaHardSummarizer
 from mmengine.config import read_base
 
 subjective_reader_cfg = dict(
@@ -15,12 +16,15 @@ subjective_all_sets = [
 ]
 
 
-subjective_datasets = []
+arenahard_datasets = []
 
 system_prompt = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
 
 judge_prompt = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{prediction}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{prediction2}\n<|The End of Assistant B's Answer|>"
 
+gpt4 = [dict(
+    abbr='gpt4-0314',
+)]
 
 for _name in subjective_all_sets:
     subjective_infer_cfg = dict(
@@ -60,13 +64,18 @@ for _name in subjective_all_sets:
         pred_role='BOT',
     )
 
-    subjective_datasets.append(
+    arenahard_datasets.append(
         dict(
-            abbr=f'{_name}',
+            abbr='arenahard',
             type=ArenaHardDataset,
             path='./data/subjective/arena_hard',
             name=_name,
             reader_cfg=subjective_reader_cfg,
             infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
+            eval_cfg=subjective_eval_cfg,
+            mode='m2n',
+            infer_order='double',
+            base_models=gpt4,
+            summarizer = dict(type=ArenaHardSummarizer),
+            given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}]
         ))
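ArenaHard adopts the same pairwise pattern but with `infer_order='double'` and a fixed `abbr='arenahard'`, while its baseline is gpt4-0314 rather than gpt4-turbo. A small hedged sketch of the contrast (dictionaries are illustrative, and the random/double semantics stated in the comments are my reading of the option names, not taken from this diff):

# Assumed contrast between the two pairwise configs in this commit:
# 'random' -> each question judged once, with answer order shuffled
# 'double' -> each question judged in both A/B orders
alpaca_like    = dict(mode='m2n', infer_order='random', base_models=[dict(abbr='gpt4-turbo')])
arenahard_like = dict(mode='m2n', infer_order='double', base_models=[dict(abbr='gpt4-0314')])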
@@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import CompassArenaDataset
+from opencompass.summarizers import CompassArenaSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['question', 'ref'],
@@ -11,7 +12,7 @@ subjective_reader_cfg = dict(
 
 data_path ='data/subjective/compass_arena'
 
-subjective_datasets = []
+compassarena_datasets = []
 
 base_prompt = """
 
@@ -101,6 +102,10 @@ creation_prompt = """
 
 sub_map = {'language': language_prompt, 'knowledge': knowledge_prompt, 'reason_v2': reason_prompt, 'math_v2': math_prompt, 'creationv2_zh': creation_prompt}
 
+gpt4 = [dict(
+    abbr='gpt4-turbo',
+)]
+
 for _name, _prompt in sub_map.items():
     subjective_infer_cfg = dict(
         prompt_template=dict(
@@ -132,13 +137,18 @@ for _name, _prompt in sub_map.items():
         pred_role='BOT',
     )
 
-    subjective_datasets.append(
+    compassarena_datasets.append(
         dict(
-            abbr=f'{_name}',
+            abbr=f'compassarena_{_name}',
             type=CompassArenaDataset,
             path=data_path,
             name=_name,
             reader_cfg=subjective_reader_cfg,
             infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
+            eval_cfg=subjective_eval_cfg,
+            mode='m2n',
+            infer_order='double',
+            base_models=gpt4,
+            summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
+            given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
         ))
@@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import CompassBenchDataset
+from opencompass.summarizers import CompassBenchSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['question', 'judge_prompt'],
@@ -11,10 +12,14 @@ subjective_reader_cfg = dict(
 
 data_path ='data/subjective/compassbench'
 
-subjective_datasets = []
+compassbench_datasets = []
 
 versions = ['CompassBenchV1.1']
 
+gpt4 = [dict(
+    abbr='gpt4-turbo',
+)]
+
 for version_abbr in versions:
     subjective_infer_cfg = dict(
         prompt_template=dict(
@@ -46,7 +51,7 @@ for version_abbr in versions:
         pred_role='BOT',
     )
 
-    subjective_datasets.append(
+    compassbench_datasets.append(
         dict(
             abbr=version_abbr,
             type=CompassBenchDataset,
@@ -54,5 +59,10 @@ for version_abbr in versions:
             name=version_abbr,
             reader_cfg=subjective_reader_cfg,
             infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
+            eval_cfg=subjective_eval_cfg,
+            mode='m2n',
+            infer_order='double',
+            base_models=gpt4,
+            summarizer=dict(type=CompassBenchSummarizer, summary_type='half_add'),
+            given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
         ))
@@ -3,7 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import CompassBenchControlLengthBiasDataset
-
+from opencompass.summarizers import CompassBenchSummarizer
 subjective_reader_cfg = dict(
     input_columns=['question', 'judge_prompt'],
     output_column='judge',
@@ -11,10 +11,12 @@ subjective_reader_cfg = dict(
 
 data_path ='data/subjective/compassbench'
 
-subjective_datasets = []
+compassbench_datasets = []
 
 versions = ['CompassBenchV1.1']
 
+gpt4 = [dict(
+    abbr='gpt4-turbo',
+)]
 for version_abbr in versions:
     subjective_infer_cfg = dict(
         prompt_template=dict(
@@ -46,7 +48,7 @@ for version_abbr in versions:
         pred_role='BOT',
     )
 
-    subjective_datasets.append(
+    compassbench_datasets.append(
         dict(
             abbr=version_abbr,
             type=CompassBenchControlLengthBiasDataset,
@@ -54,5 +56,10 @@ for version_abbr in versions:
             name=version_abbr,
             reader_cfg=subjective_reader_cfg,
             infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
+            eval_cfg=subjective_eval_cfg,
+            mode='m2n',
+            infer_order='double',
+            base_models=gpt4,
+            summarizer=dict(type=CompassBenchSummarizer, summary_type='half_add'),
+            given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
         ))
@@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import CompassBenchControlLengthBiasDataset
+from opencompass.summarizers import CompassBenchSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['question', 'judge_prompt'],
@@ -11,10 +12,15 @@ subjective_reader_cfg = dict(
 
 data_path ='data/subjective/compassbench'
 
-subjective_datasets = []
+compassbench_datasets = []
 
 versions = ['CompassBenchV1.1.patch', 'CompassBenchV1.1.patch.en']
 
+gpt4 = [dict(
+    abbr='gpt4-turbo',
+)]
+
+
 for version_abbr in versions:
     subjective_infer_cfg = dict(
         prompt_template=dict(
@@ -46,7 +52,7 @@ for version_abbr in versions:
         pred_role='BOT',
     )
 
-    subjective_datasets.append(
+    compassbench_datasets.append(
         dict(
             abbr=version_abbr,
             type=CompassBenchControlLengthBiasDataset,
@@ -54,5 +60,10 @@ for version_abbr in versions:
             name=version_abbr,
             reader_cfg=subjective_reader_cfg,
             infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
+            eval_cfg=subjective_eval_cfg,
+            mode='m2n',
+            infer_order='double',
+            base_models=gpt4,
+            summarizer=dict(type=CompassBenchSummarizer, summary_type='half_add'),
+            given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
         ))
@@ -3,7 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import CompassBenchControlLengthBiasDataset
-
+from opencompass.summarizers import CompassBenchSummarizer
 subjective_reader_cfg = dict(
     input_columns=['question', 'judge_prompt'],
     output_column='judge',
@@ -11,10 +11,12 @@ subjective_reader_cfg = dict(
 
 data_path ='data/subjective/compassbench'
 
-subjective_datasets = []
+compassbench_datasets = []
 
 versions = ['CompassBenchV1.2']
 
+gpt4 = [dict(
+    abbr='gpt4-turbo',
+)]
 for version_abbr in versions:
     subjective_infer_cfg = dict(
         prompt_template=dict(
@@ -46,7 +48,7 @@ for version_abbr in versions:
         pred_role='BOT',
     )
 
-    subjective_datasets.append(
+    compassbench_datasets.append(
         dict(
             abbr=version_abbr,
             type=CompassBenchControlLengthBiasDataset,
@@ -54,5 +56,10 @@ for version_abbr in versions:
             name=version_abbr,
             reader_cfg=subjective_reader_cfg,
             infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
+            eval_cfg=subjective_eval_cfg,
+            mode='m2n',
+            infer_order='double',
+            base_models=gpt4,
+            summarizer=dict(type=CompassBenchSummarizer, summary_type='half_add'),
+            given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
         ))
@@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import FofoDataset
+from opencompass.summarizers import FofoSummarizer
 from mmengine.config import read_base
 
 subjective_reader_cfg = dict(
@@ -44,7 +45,7 @@ Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt.
 Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
 """
 
-subjective_datasets = []
+fofo_datasets = []
 
 for _name in subjective_all_sets:
     subjective_infer_cfg = dict(
@@ -84,7 +85,7 @@ for _name in subjective_all_sets:
         pred_role='BOT',
     )
 
-    subjective_datasets.append(
+    fofo_datasets.append(
         dict(
             abbr=f'{_name}',
             type=FofoDataset,
@@ -92,5 +93,7 @@ for _name in subjective_all_sets:
             name=_name,
             reader_cfg=subjective_reader_cfg,
             infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
+            eval_cfg=subjective_eval_cfg,
+            mode='singlescore',
+            summarizer = dict(type=FofoSummarizer, judge_type='general')
         ))
@@ -3,7 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import MTBench101Dataset
-
+from opencompass.summarizers import MTBench101Summarizer
 
 subjective_reader_cfg = dict(
     input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'],
@@ -15,7 +15,7 @@ subjective_all_sets = [
 ]
 data_path ='data/subjective/'
 
-subjective_datasets = []
+mtbench101_datasets = []
 
 for _name in subjective_all_sets:
     subjective_infer_cfg = dict(
@@ -50,7 +50,7 @@ for _name in subjective_all_sets:
         pred_role='BOT',
    )
 
-    subjective_datasets.append(
+    mtbench101_datasets.append(
         dict(
             abbr=f'{_name}',
             type=MTBench101Dataset,
@@ -58,5 +58,7 @@ for _name in subjective_all_sets:
             name=_name,
             reader_cfg=subjective_reader_cfg,
             infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
+            eval_cfg=subjective_eval_cfg,
+            mode='singlescore',
+            summarizer = dict(type=MTBench101Summarizer, judge_type='single')
         ))
@@ -3,7 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
 from opencompass.datasets import MTBenchDataset
-
+from opencompass.summarizers import MTBenchSummarizer
 
 subjective_reader_cfg = dict(
     input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'],
@@ -15,7 +15,7 @@ subjective_all_sets = [
 ]
 data_path ='data/subjective/mtbench'
 
-subjective_datasets = []
+mtbench_datasets = []
 
 for _name in subjective_all_sets:
     temperature = float(_name.split('_')[1])
@@ -52,7 +52,7 @@ for _name in subjective_all_sets:
         pred_role='BOT',
     )
 
-    subjective_datasets.append(
+    mtbench_datasets.append(
         dict(
             abbr=f'{_name}',
             type=MTBenchDataset,
@@ -60,5 +60,7 @@ for _name in subjective_all_sets:
             name=_name,
             reader_cfg=subjective_reader_cfg,
             infer_cfg=subjective_infer_cfg,
-            eval_cfg=subjective_eval_cfg
+            eval_cfg=subjective_eval_cfg,
+            mode='singlescore',
+            summarizer = dict(type=MTBenchSummarizer, judge_type='single')
         ))
configs/eval_subjective.py (new file, 136 lines)
@@ -0,0 +1,136 @@
from mmengine.config import read_base

with read_base():
    from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
    from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import alpacav2_datasets
    from .datasets.subjective.compassarena.compassarena_compare import compassarena_datasets
    from .datasets.subjective.arena_hard.arena_hard_compare import arenahard_datasets
    from .datasets.subjective.compassbench.compassbench_compare import compassbench_datasets
    from .datasets.subjective.fofo.fofo_judge import fofo_datasets
    from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import mtbench_datasets
    from .datasets.subjective.multiround.mtbench101_judge import mtbench101_datasets
    from .models.chatglm.hf_chatglm3_6b import models
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import SubjectiveSummarizer

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
for model in models:
    model['generation_kwargs'] = dict(do_sample=True)

models = [
    dict(
        type=HuggingFaceChatGLM3,
        abbr='chatglm3-6b-hf',
        path='THUDM/chatglm3-6b',
        tokenizer_path='THUDM/chatglm3-6b',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        generation_kwargs=dict(
            do_sample=True,
        ),
        meta_template=api_meta_template,
        max_out_len=2048,
        max_seq_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    ), dict(
        type=HuggingFaceChatGLM3,
        abbr='chatglm3-6b-hf2',
        path='THUDM/chatglm3-6b',
        tokenizer_path='THUDM/chatglm3-6b',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        generation_kwargs=dict(
            do_sample=True,
        ),
        meta_template=api_meta_template,
        max_out_len=2048,
        max_seq_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    ), dict(
        type=HuggingFaceChatGLM3,
        abbr='chatglm3-6b-hf3',
        path='THUDM/chatglm3-6b',
        tokenizer_path='THUDM/chatglm3-6b',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        generation_kwargs=dict(
            do_sample=True,
        ),
        meta_template=api_meta_template,
        max_out_len=2048,
        max_seq_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]

datasets = [*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets, *compassarena_datasets, *compassbench_datasets, *fofo_datasets, *mtbench_datasets, *mtbench101_datasets]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLInferTask)),
)

# -------------Evalation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=2048,
    max_seq_len=2048,
    batch_size=8,
    temperature=0,
)]
judge_models = [models[0]]

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(type=SubjectiveNaivePartitioner, models=models, judge_models=judge_models,),
    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
)

summarizer = dict(type=SubjectiveSummarizer, function='subjective')
work_dir = 'outputs/subjective/'
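One detail of the new entry config that is easy to miss: `judge_models` is first defined as a GPT-4-Turbo API judge and then immediately overwritten with `judge_models = [models[0]]`, so as committed the first candidate model also acts as the judge. A minimal sketch of the presumed adjustment if you want the API judge instead (this edit is not part of the commit):

# Sketch, not part of the commit: keep the OpenAI judge defined above by
# removing (or commenting out) the overriding assignment.
# judge_models = [models[0]]   # delete this line to judge with GPT4-Turbo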
@@ -1,79 +0,0 @@
from mmengine.config import read_base

with read_base():
    from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import AlignmentBenchSummarizer

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
models = [
    dict(
        type=HuggingFaceChatGLM3,
        abbr='chatglm3-6b-hf',
        path='THUDM/chatglm3-6b',
        tokenizer_path='THUDM/chatglm3-6b',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        generation_kwargs=dict(
            do_sample=True,
        ),
        meta_template=api_meta_template,
        max_out_len=2048,
        max_seq_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]

datasets = [*subjective_datasets]

# -------------Evalation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=2048,
    max_seq_len=2048,
    batch_size=8,
    temperature=0,
)]

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner, max_task_size=1000, mode='singlescore', models=models, judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
)

summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')

work_dir = 'outputs/alignment_bench/'
@@ -1,104 +0,0 @@
from mmengine.config import read_base

with read_base():
    from .datasets.subjective.alpaca_eval.alpacav1_judgeby_gpt4 import subjective_datasets as alpacav1
    from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import AlpacaSummarizer

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

# -------------Inference Stage ----------------------------------------

# For subjective evaluation, we often set do sample for models
models = [
    dict(
        type=HuggingFaceChatGLM3,
        abbr='chatglm3-6b-hf',
        path='THUDM/chatglm3-6b',
        tokenizer_path='THUDM/chatglm3-6b',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        generation_kwargs=dict(
            do_sample=True,
        ),
        meta_template=api_meta_template,
        max_out_len=2048,
        max_seq_len=4096,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]

datasets = [*alpacav2]

gpt4 = dict(
    abbr='gpt4-turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=2048,
    max_seq_len=4096,
    batch_size=4,
    retry=20,
    temperature=1,
)  # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions

# -------------Evalation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=1024,
    max_seq_len=4096,
    batch_size=2,
    retry=20,
    temperature=0,
)]

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner, max_task_size=1000, mode='m2n', base_models=[gpt4], compare_models=models,
        infer_order='random',
        judge_models=judge_models
    ),
    runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
    given_pred = [{'abbr':'gpt4-turbo', 'path':''}]
)
work_dir = 'outputs/alpaca/'

summarizer = dict(type=AlpacaSummarizer, judge_type='v2')
@@ -1,104 +0,0 @@
from opencompass.models import HuggingFaceCausalLM
from copy import deepcopy
from opencompass.models import TurboMindModel
from mmengine.config import read_base

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import ArenaHardSummarizer

with read_base():
    from .datasets.subjective.arena_hard.arena_hard_compare import subjective_datasets

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|begin_of_text|>user<|end_header_id|>\n\n', end='<|eot_id|>'),
        dict(role='BOT', begin='<|begin_of_text|>assistant<|end_header_id|>\n\n', end='<|eot_id|>', generate=True),
    ],
)

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='llama-3-8b-instruct-hf',
        path='meta-llama/Meta-Llama-3-8B-Instruct',
        model_kwargs=dict(device_map='auto'),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
        ),
        meta_template=_meta_template,
        max_out_len=4096,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        generation_kwargs={'eos_token_id': [128001, 128009]},
        batch_padding=True,
    )
]

datasets = [*subjective_datasets]

work_dir = 'outputs/arena_hard/'
# -------------Inferen Stage ----------------------------------------

infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=1000000),
    runner=dict(
        type=LocalRunner,
        max_num_workers=32,
        task=dict(type=OpenICLInferTask)),
)

judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=4096,
    max_seq_len=8192,
    batch_size=10,
    retry=10,
    temperature = 0,
)]

## ------------- Evaluation Configuration
gpt4_0314 = dict(
    abbr='gpt4-0314',
    type=OpenAI,
)

eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        max_task_size=1000000,
        mode='m2n',
        infer_order='double',
        base_models=[gpt4_0314],
        compare_models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
    given_pred = [{'abbr':'gpt4-0314', 'path':''}]
)

summarizer = dict(
    type=ArenaHardSummarizer
)
@@ -1,106 +0,0 @@
from os import getenv as gv
from opencompass.models import HuggingFaceCausalLM
from mmengine.config import read_base

with read_base():
    from .datasets.subjective.compassarena.compassarena_compare import subjective_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import CompassArenaSummarizer

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

# -------------Inference Stage ----------------------------------------

# For subjective evaluation, we often set do sample for models
models = [
    dict(
        type=HuggingFaceChatGLM3,
        abbr='chatglm3-6b-hf',
        path='THUDM/chatglm3-6b',
        tokenizer_path='THUDM/chatglm3-6b',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        generation_kwargs=dict(
            do_sample=True,
        ),
        meta_template=api_meta_template,
        max_out_len=2048,
        max_seq_len=4096,
        batch_size=1,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]

datasets = [*subjective_datasets]

gpt4 = dict(
    abbr='gpt4-turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=2048,
    max_seq_len=4096,
    batch_size=4,
    retry=20,
    temperature=1,
)  # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions

# -------------Evalation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=1024,
    max_seq_len=4096,
    batch_size=2,
    retry=20,
    temperature=0,
)]

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        strategy='split',
        max_task_size=10000,
        mode='m2n',
        infer_order='double',
        base_models=[gpt4],
        compare_models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
    given_pred = [{'abbr':'gpt4-turbo', 'path':''}]
)

work_dir = 'outputs/compass_arena_debug/'

summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add')
@@ -1,137 +0,0 @@
from os import getenv as gv
from opencompass.models import HuggingFaceCausalLM
from mmengine.config import read_base

with read_base():
    from .datasets.subjective.compassbench.compassbench_compare import subjective_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import CompassBenchSummarizer

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

# -------------Inference Stage ----------------------------------------

from opencompass.models import HuggingFacewithChatTemplate

models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm2-chat-7b-hf',
        path='internlm/internlm2-chat-7b',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
        stop_words=['</s>', '<|im_end|>'],
        generation_kwargs=dict(
            do_sample=True,
        ),
    )
]

datasets = [*subjective_datasets]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='reserved',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask),
    ),
)

gpt4 = dict(
    abbr='gpt4-turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=2048,
    max_seq_len=4096,
    batch_size=4,
    retry=20,
    temperature=1,
)  # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions

# -------------Evalation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=1024,
    max_seq_len=4096,
    batch_size=2,
    retry=20,
    temperature=0,
)]

judge_models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm102b',
        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=4),
        stop_words=['</s>', '<|im_end|>'],
    ),
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm102b2',
        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=4),
        stop_words=['</s>', '<|im_end|>'],
    ),
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm102b3',
        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=4),
        stop_words=['</s>', '<|im_end|>'],
    )
]

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        strategy='split',
        max_task_size=10000000,
        mode='m2n',
        infer_order='double',
        base_models=[gpt4],
        compare_models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
    #given_pred = [{'abbr':'gpt4-turbo', 'path':''}]
)

work_dir = 'outputs/compassbench/'

summarizer = dict(type=CompassBenchSummarizer, summary_type='half_add')
@@ -1,77 +0,0 @@
from mmengine.config import read_base

with read_base():
    from .datasets.subjective.creationbench.creationbench_judgeby_gpt4_withref import subjective_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import CreationBenchSummarizer

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
models = [
    dict(
        type=HuggingFaceChatGLM3,
        abbr='chatglm3-6b-hf',
        path='THUDM/chatglm3-6b',
        tokenizer_path='THUDM/chatglm3-6b',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        generation_kwargs=dict(
            do_sample=True,
        ),
        meta_template=api_meta_template,
        max_out_len=2048,
        max_seq_len=4096,
        batch_size=1,
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]

datasets = [*subjective_datasets]

# -------------Evalation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
judge_model = dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=2048,
    max_seq_len=2048,
    batch_size=8,
    temperature=0,
)

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(type=SubjectiveNaivePartitioner, mode='singlescore', models=models),
    runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)),
)

summarizer = dict(type=CreationBenchSummarizer, judge_type='general')

work_dir = 'outputs/creationbench/'
@@ -1,69 +0,0 @@
from mmengine.config import read_base

with read_base():
    from .datasets.subjective.fofo.fofo_judge import subjective_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import FofoSummarizer

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm2-chat-1.8b-hf',
        path='internlm/internlm2-chat-1_8b',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
        stop_words=['</s>', '<|im_end|>'],
        generation_kwargs=dict(
            do_sample=True,
        ),
    )
]

datasets = [*subjective_datasets]

# -------------Evalation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='xxxx',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=2048,
    max_seq_len=2048,
    batch_size=8,
    temperature=0,
)]

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner, max_task_size=10000, mode='singlescore', models=models, judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
)

summarizer = dict(type=FofoSummarizer, judge_type='general')

work_dir = 'outputs/fofo/'
@@ -1,111 +0,0 @@
from opencompass.models import HuggingFaceCausalLM
from copy import deepcopy
from opencompass.models import TurboMindModel
from mmengine.config import read_base

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import MultiroundSummarizer

with read_base():
    from .datasets.subjective.multiround.functionalmt_zh_judgeby_gpt4 import subjective_datasets

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

_meta_template = dict(
    round=[
        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
    ],
    eos_token_id=151645,
)

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen1.5-7b-chat-hf',
        path='Qwen/Qwen1.5-7B-Chat',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
            use_fast=False,
        ),
        generation_kwargs=dict(
            do_sample=True,
        ),
        meta_template=_meta_template,
        pad_token_id=151645,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<|im_end|>',
    )
]

datasets = [*subjective_datasets]

work_dir = 'outputs/multiround/'
# -------------Inferen Stage ----------------------------------------

infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=1000),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='your part',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
)

judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=1024,
    max_seq_len=4096,
    batch_size=10,
    retry=10,
    temperature = 0,
)]

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        max_task_size=1000,
        mode='singlescore',
        models = models,
        judge_models=judge_models
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='your part',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=SubjectiveEvalTask)),
)

summarizer = dict(
    type=MultiroundSummarizer
)
@ -1,92 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets
|
||||
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
|
||||
from opencompass.partitioners import NaivePartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.runners import LocalRunner
|
||||
from opencompass.runners import SlurmSequentialRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
||||
from opencompass.summarizers import AlignmentBenchSummarizer
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=True),
|
||||
]
|
||||
)
|
||||
|
||||
# -------------Inference Stage ----------------------------------------
|
||||
# For subjective evaluation, we often set do sample for models
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceChatGLM3,
|
||||
abbr='chatglm3-6b-hf',
|
||||
path='THUDM/chatglm3-6b',
|
||||
tokenizer_path='THUDM/chatglm3-6b',
|
||||
model_kwargs=dict(
|
||||
device_map='auto',
|
||||
trust_remote_code=True,
|
||||
),
|
||||
tokenizer_kwargs=dict(
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True,
|
||||
),
|
||||
generation_kwargs=dict(
|
||||
do_sample=True,
|
||||
),
|
||||
meta_template=api_meta_template,
|
||||
max_out_len=2048,
|
||||
max_seq_len=4096,
|
||||
batch_size=1,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
)
|
||||
]
|
||||
|
||||
datasets = [*subjective_datasets]
|
||||
|
||||
infer = dict(
|
||||
partitioner=dict(type=NaivePartitioner),
|
||||
runner=dict(
|
||||
type=SlurmSequentialRunner,
|
||||
partition='llmeval',
|
||||
quotatype='auto',
|
||||
max_num_workers=256,
|
||||
task=dict(type=OpenICLInferTask),
|
||||
),
|
||||
)
|
||||
|
||||
# -------------Evalation Stage ----------------------------------------
|
||||
|
||||
## ------------- JudgeLLM Configuration
|
||||
judge_models = [dict(
|
||||
type=HuggingFaceCausalLM,
|
||||
abbr='pandalm-7b-v1-hf',
|
||||
path='WeOpenML/PandaLM-7B-v1',
|
||||
tokenizer_path='WeOpenML/PandaLM-7B-v1',
|
||||
tokenizer_kwargs=dict(
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True,
|
||||
use_fast=False,
|
||||
),
|
||||
max_out_len=512,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
model_kwargs=dict(device_map='auto', trust_remote_code=True),
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
)]
|
||||
|
||||
## ------------- Evaluation Configuration
|
||||
eval = dict(
|
||||
partitioner=dict(type=SubjectiveNaivePartitioner, mode='singlescore', models=models, judge_models=judge_models),
|
||||
runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
|
||||
)
|
||||
|
||||
summarizer = dict(type=AlignmentBenchSummarizer)
|
||||
|
||||
work_dir = 'outputs/pandalm'
|
@ -1,84 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets
|
||||
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
|
||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
from opencompass.runners import LocalRunner
|
||||
from opencompass.runners import SlurmSequentialRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
||||
from opencompass.summarizers import MTBenchSummarizer
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='SYSTEM', api_role='SYSTEM'),
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=True),
|
||||
]
|
||||
)
|
||||
|
||||
_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
|
||||
dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
|
||||
],
|
||||
)
|
||||
# -------------Inference Stage ----------------------------------------
|
||||
# For subjective evaluation, we often set do sample for models
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceCausalLM,
|
||||
abbr='qwen-7b-chat-hf',
|
||||
path='Qwen/Qwen-7B-Chat',
|
||||
tokenizer_path='Qwen/Qwen-7B-Chat',
|
||||
model_kwargs=dict(
|
||||
device_map='auto',
|
||||
trust_remote_code=True
|
||||
),
|
||||
tokenizer_kwargs=dict(
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True,
|
||||
use_fast=False,
|
||||
),
|
||||
pad_token_id=151643,
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
meta_template=_meta_template,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
end_str='<|im_end|>',
|
||||
)
|
||||
]
|
||||
|
||||
datasets = [*subjective_datasets]
|
||||
|
||||
# -------------Evalation Stage ----------------------------------------
|
||||
|
||||
## ------------- JudgeLLM Configuration
|
||||
judge_models = [dict(
|
||||
abbr='GPT4-Turbo',
|
||||
type=OpenAI,
|
||||
path='gpt-4-0613', # To compare with the official leaderboard, please use gpt4-0613
|
||||
key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
|
||||
meta_template=api_meta_template,
|
||||
query_per_second=16,
|
||||
max_out_len=2048,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
temperature=0,
|
||||
)]
|
||||
|
||||
## single evaluation
|
||||
eval = dict(
|
||||
partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models, judge_models=judge_models),
|
||||
runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
|
||||
)
|
||||
|
||||
summarizer = dict(type=MTBenchSummarizer, judge_type='single')
|
||||
|
||||
work_dir = 'outputs/mtbench/'
|
@ -1,94 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .datasets.subjective.multiround.mtbench101_judge import subjective_datasets
|
||||
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
|
||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
from opencompass.runners import LocalRunner
|
||||
from opencompass.runners import SlurmSequentialRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
||||
from opencompass.summarizers import MTBench101Summarizer
|
||||
|
||||
# ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='SYSTEM', api_role='SYSTEM'),
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=True),
|
||||
]
|
||||
)
|
||||
|
||||
# -------------Inference Stage ----------------------------------------
|
||||
# For subjective evaluation, we often set do sample for models
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceChatGLM3,
|
||||
abbr='chatglm3-6b-hf',
|
||||
path='THUDM/chatglm3-6b',
|
||||
tokenizer_path='THUDM/chatglm3-6b',
|
||||
model_kwargs=dict(
|
||||
device_map='auto',
|
||||
trust_remote_code=True,
|
||||
),
|
||||
tokenizer_kwargs=dict(
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True,
|
||||
),
|
||||
generation_kwargs=dict(
|
||||
do_sample=True,
|
||||
),
|
||||
meta_template=api_meta_template,
|
||||
max_out_len=4096,
|
||||
max_seq_len=4096,
|
||||
batch_size=1,
|
||||
run_cfg=dict(num_gpus=2, num_procs=1),
|
||||
)
|
||||
]
|
||||
|
||||
datasets = [*subjective_datasets]
|
||||
|
||||
infer = dict(
|
||||
partitioner=dict(type=SizePartitioner, max_task_size=10000),
|
||||
runner=dict(
|
||||
type=SlurmSequentialRunner,
|
||||
partition='llm_dev2',
|
||||
quotatype='auto',
|
||||
max_num_workers=32,
|
||||
task=dict(type=OpenICLInferTask),
|
||||
),
|
||||
)
|
||||
|
||||
# -------------Evalation Stage ----------------------------------------
|
||||
|
||||
## ------------- JudgeLLM Configuration
|
||||
judge_models = [dict(
|
||||
abbr='GPT4-Turbo',
|
||||
type=OpenAI,
|
||||
path='gpt-4-1106-preview', # To compare with the official leaderboard, please use gpt-4-1106-preview
|
||||
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
|
||||
meta_template=api_meta_template,
|
||||
query_per_second=16,
|
||||
max_out_len=4096,
|
||||
max_seq_len=4096,
|
||||
batch_size=8,
|
||||
temperature=0.8,
|
||||
)]
|
||||
|
||||
## ------------- Evaluation Configuration
|
||||
|
||||
|
||||
|
||||
eval = dict(
|
||||
partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=100000, mode='singlescore', models=models, judge_models=judge_models),
|
||||
runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
|
||||
)
|
||||
|
||||
summarizer = dict(type=MTBench101Summarizer, judge_type='single')
|
||||
|
||||
work_dir = 'outputs/mtbench101/'
|
@ -1,21 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from ..datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets
|
||||
from .model_cfg import models, judge_model, given_pred, infer, gpt4, runner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
from opencompass.summarizers import AlignmentBenchSummarizer
|
||||
|
||||
# -------------Inference Stage ----------------------------------------
|
||||
# For subjective evaluation, we often set do sample for models
|
||||
datasets = [*subjective_datasets]
|
||||
eval = dict(
|
||||
partitioner=dict(
|
||||
type=SubjectiveNaivePartitioner, mode='singlescore', models=models
|
||||
),
|
||||
runner=runner,
|
||||
)
|
||||
|
||||
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
|
||||
work_dir = 'outputs/alignment_bench/'
|
@ -1,29 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from ..datasets.subjective.alpaca_eval.alpacav1_judgeby_gpt4 import subjective_datasets as alpacav1
|
||||
from ..datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
|
||||
from .model_cfg import models, judge_model, given_pred, infer, gpt4, runner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
from opencompass.summarizers import AlpacaSummarizer
|
||||
from opencompass.tasks.outer_eval.alpacaeval import AlpacaEvalTask
|
||||
datasets = [*alpacav2]
|
||||
gpt4_judge = dict(
|
||||
abbr='GPT4-Turbo',
|
||||
path='gpt-4-1106-preview',
|
||||
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
|
||||
config='weighted_alpaca_eval_gpt4_turbo'
|
||||
)
|
||||
## ------------- Evaluation Configuration
|
||||
eval = dict(
|
||||
partitioner=dict(
|
||||
type=NaivePartitioner
|
||||
),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=256,
|
||||
task=dict(type=AlpacaEvalTask, judge_cfg=gpt4_judge),
|
||||
)
|
||||
)
|
||||
work_dir = 'outputs/alpaca/'
|
@ -1,28 +0,0 @@
|
||||
from os import getenv as gv
|
||||
from opencompass.models import HuggingFaceCausalLM
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from ..datasets.subjective.compassarena.compassarena_compare import subjective_datasets
|
||||
from .model_cfg import models, judge_model, given_pred, infer, gpt4, runner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
from opencompass.summarizers import CompassArenaSummarizer
|
||||
datasets = [*subjective_datasets]
|
||||
|
||||
eval = dict(
|
||||
partitioner=dict(
|
||||
type=SubjectiveSizePartitioner,
|
||||
strategy='split',
|
||||
max_task_size=10000,
|
||||
mode='m2n',
|
||||
base_models=[gpt4],
|
||||
compare_models=models,
|
||||
),
|
||||
runner=runner,
|
||||
given_pred=given_pred
|
||||
)
|
||||
|
||||
work_dir = 'outputs/compass_arena/'
|
||||
|
||||
summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add')
|
@ -1,25 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from ..datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets
|
||||
# from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets
|
||||
from .model_cfg import models, judge_model, given_pred, infer, gpt4, runner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
from opencompass.summarizers import MTBenchSummarizer
|
||||
|
||||
datasets = [*subjective_datasets]
|
||||
|
||||
for model in models:
|
||||
if 'generation_kwargs' in model:
|
||||
if 'do_sample' in model['generation_kwargs']:
|
||||
del model['generation_kwargs']['do_sample']
|
||||
|
||||
eval = dict(
|
||||
partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models),
|
||||
runner=runner
|
||||
)
|
||||
|
||||
summarizer = dict(type=MTBenchSummarizer, judge_type='single')
|
||||
|
||||
work_dir = 'outputs/mtbench/'
|
@ -1,84 +0,0 @@
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
|
||||
from opencompass.models.openai_api import OpenAI
|
||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
|
||||
from opencompass.runners import LocalRunner
|
||||
from opencompass.runners import SlurmSequentialRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
||||
api_meta_template = dict(
|
||||
round=[
|
||||
dict(role='HUMAN', api_role='HUMAN'),
|
||||
dict(role='BOT', api_role='BOT', generate=True),
|
||||
]
|
||||
)
|
||||
# -------------Inference Stage ----------------------------------------
|
||||
# For subjective evaluation, we often set do sample for models
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceChatGLM3,
|
||||
abbr='chatglm3-6b-hf',
|
||||
path='THUDM/chatglm3-6b',
|
||||
tokenizer_path='THUDM/chatglm3-6b',
|
||||
model_kwargs=dict(
|
||||
device_map='auto',
|
||||
trust_remote_code=True,
|
||||
),
|
||||
tokenizer_kwargs=dict(
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True,
|
||||
),
|
||||
generation_kwargs=dict(
|
||||
do_sample=True,
|
||||
),
|
||||
meta_template=api_meta_template,
|
||||
max_out_len=2048,
|
||||
max_seq_len=4096,
|
||||
batch_size=1,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
judge_model = dict(
|
||||
abbr='GPT4-Turbo',
|
||||
type=OpenAI, path='gpt-4-1106-preview',
|
||||
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
|
||||
url='',
|
||||
meta_template=api_meta_template,
|
||||
query_per_second=1,
|
||||
max_out_len=1024,
|
||||
max_seq_len=4096,
|
||||
batch_size=1,
|
||||
retry=30,
|
||||
temperature = 0
|
||||
)
|
||||
|
||||
infer = dict(
|
||||
partitioner=dict(type=SizePartitioner, strategy='split', max_task_size=10000),
|
||||
runner=dict(
|
||||
type=SlurmSequentialRunner,
|
||||
partition='llmeval',
|
||||
quotatype='auto',
|
||||
max_num_workers=256,
|
||||
task=dict(type=OpenICLInferTask),
|
||||
),
|
||||
)
|
||||
runner=dict(type=LocalRunner, max_num_workers=12, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model))
|
||||
|
||||
gpt4 = dict(
|
||||
abbr='gpt4-turbo',
|
||||
type=OpenAI,
|
||||
path='gpt-4-1106-preview',
|
||||
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
|
||||
meta_template=api_meta_template,
|
||||
query_per_second=1,
|
||||
max_out_len=2048,
|
||||
max_seq_len=4096,
|
||||
batch_size=4,
|
||||
retry=20,
|
||||
temperature=1,
|
||||
)
|
||||
given_pred = [{'abbr':'gpt4-turbo', 'path':'your path'}]
|
@ -13,13 +13,65 @@ A popular evaluation method involves
|
||||
|
||||
We support the use of GPT-4 (or other JudgeLLMs) for the subjective evaluation of models based on the above methods.
|
||||
|
||||
## Current Supported Subjective Evaluation Datasets
|
||||
## Currently Supported Subjective Evaluation Datasets
|
||||
|
||||
1. AlignBench (https://github.com/THUDM/AlignBench)
|
||||
2. MTBench (https://github.com/lm-sys/FastChat)
|
||||
3. AlpacaEvalv2 (https://github.com/tatsu-lab/alpaca_eval)
|
||||
4. ArenaHard (https://github.com/lm-sys/arena-hard/tree/main)
|
||||
5. CompassArena (Internal dataset)
|
||||
1. AlignBench Chinese Scoring Dataset (https://github.com/THUDM/AlignBench)
|
||||
2. MTBench English Scoring Dataset, two-turn dialogue (https://github.com/lm-sys/FastChat)
|
||||
3. MTBench101 English Scoring Dataset, multi-turn dialogue (https://github.com/mtbench101/mt-bench-101)
|
||||
4. AlpacaEvalv2 English Compare Dataset (https://github.com/tatsu-lab/alpaca_eval)
|
||||
5. ArenaHard English Compare Dataset, mainly focused on coding (https://github.com/lm-sys/arena-hard/tree/main)
|
||||
6. Fofo English Scoring Dataset (https://github.com/SalesforceAIResearch/FoFo/)
|
||||
|
||||
## Initiating Subjective Evaluation
|
||||
|
||||
Similar to existing objective evaluation methods, you can configure related settings in `configs/eval_subjective.py`.
|
||||
|
||||
### Basic Parameters: Specifying models, datasets, and judgemodels
|
||||
|
||||
Similar to objective evaluation, import the models and datasets that need to be evaluated, for example:
|
||||
|
||||
```python
|
||||
with read_base():
|
||||
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
|
||||
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
|
||||
from .models.qwen.hf_qwen_7b import models
|
||||
```
|
||||
|
||||
It is worth noting that since the model setup parameters for subjective evaluation are often different from those for objective evaluation, it often requires setting up `do_sample` for inference instead of `greedy`. You can modify the relevant parameters in the configuration file as needed, for example:
|
||||
|
||||
```python
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceChatGLM3,
|
||||
abbr='chatglm3-6b-hf2',
|
||||
path='THUDM/chatglm3-6b',
|
||||
tokenizer_path='THUDM/chatglm3-6b',
|
||||
model_kwargs=dict(
|
||||
device_map='auto',
|
||||
trust_remote_code=True,
|
||||
),
|
||||
tokenizer_kwargs=dict(
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True,
|
||||
),
|
||||
generation_kwargs=dict(
|
||||
do_sample=True,
|
||||
),
|
||||
meta_template=api_meta_template,
|
||||
max_out_len=2048,
|
||||
max_seq_len=4096,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
)
|
||||
]
|
||||
```
|
||||
|
||||
The judge model is usually a powerful model such as GPT-4. You can enter your API key directly following the configuration in the config file, or use a custom model as the judge model.
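For reference, a judge model entry can look like the following sketch (the model path, key handling, and sampling parameters are illustrative and should be adapted to your setup; `api_meta_template` is assumed to be defined elsewhere in the config):

```python
from opencompass.models import OpenAI

# Illustrative judge model entry; adjust path, key handling and limits to your setup.
judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',  # read from $OPENAI_API_KEY if left empty
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=2048,
    max_seq_len=4096,
    batch_size=8,
    temperature=0,
)]
```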
|
||||
|
||||
### Specifying Other Parameters
|
||||
|
||||
In addition to the basic parameters, you can also modify the `infer` and `eval` fields in the config to choose a more appropriate partitioning method. Three partitioning strategies are currently supported: NaivePartitioner, SizePartitioner, and NumWorkerPartitioner. You can also specify your own `work_dir` to save the related files, as sketched below.
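The following is a minimal sketch of such a setup, assuming `models` and `judge_models` have been defined as above; the task sizes, worker counts, and `work_dir` are illustrative only:

```python
from opencompass.partitioners import SizePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

# Inference stage: split inference into tasks of at most `max_task_size` samples.
infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=10000),
    runner=dict(type=LocalRunner, max_num_workers=32,
                task=dict(type=OpenICLInferTask)),
)

# Evaluation stage: the subjective partitioner pairs the evaluated models
# with the judge models before dispatching SubjectiveEvalTask.
eval = dict(
    partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=10000,
                     models=models, judge_models=judge_models),
    runner=dict(type=LocalRunner, max_num_workers=32,
                task=dict(type=SubjectiveEvalTask)),
)

work_dir = 'outputs/subjective/'
```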
|
||||
|
||||
## Subjective Evaluation with Custom Dataset
|
||||
|
||||
@ -32,6 +84,9 @@ The specific process includes:
|
||||
|
||||
### Step-1: Data Preparation
|
||||
|
||||
This step requires preparing the dataset file and implementing your own dataset class under `opencompass/datasets/subjective/`, which should return the loaded data as a `list of dict`.
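A minimal sketch of such a class is shown below; the class name, file name, and field names are hypothetical and should match the data format you choose:

```python
# Hypothetical minimal subjective dataset class; adapt the fields to your own data.
import json
import os.path as osp

from datasets import Dataset

from opencompass.datasets.base import BaseDataset
from opencompass.registry import LOAD_DATASET


@LOAD_DATASET.register_module()
class MyCustomSubjectiveDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        filename = osp.join(path, f'{name}.json')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            for item in json.load(f):
                # Keep only the fields the reader_cfg will consume.
                raw_data.append({
                    'question': item['question'],
                    'capability': item['capability'],
                    'others': item['others'],
                })
        return Dataset.from_list(raw_data)
```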
|
||||
|
||||
You can prepare the data in any format you like (csv, json, jsonl, etc.); however, to make it easier to get started, we recommend following the format of the existing subjective datasets or the JSON format below.
|
||||
We provide a mini test set for **Compare Mode** and **Score Mode**, as shown below:
|
||||
|
||||
```python
|
||||
@ -66,80 +121,12 @@ If you want to modify prompt on each single question, you can full some other in
|
||||
|
||||
### Step-2: Evaluation Configuration(Compare Mode)
|
||||
|
||||
For `config/eval_subjective_compare.py`, we provide some annotations to help users understand the configuration file.
|
||||
Taking Alignbench as an example, `configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py`:
|
||||
|
||||
```python
|
||||
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
|
||||
|
||||
from opencompass.partitioners import NaivePartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.runners import LocalRunner
|
||||
from opencompass.runners import SlurmSequentialRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
||||
from opencompass.summarizers import Corev2Summarizer
|
||||
|
||||
with read_base():
|
||||
# Pre-defined models
|
||||
from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
|
||||
from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
|
||||
from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
|
||||
from .models.openai.gpt_4 import models as gpt4_model
|
||||
from .datasets.subjective_cmp.subjective_corev2 import subjective_datasets
|
||||
|
||||
# Evaluation datasets
|
||||
datasets = [*subjective_datasets]
|
||||
|
||||
# Model to be evaluated
|
||||
models = [*hf_qwen_7b_chat, *hf_chatglm3_6b]
|
||||
|
||||
# Inference configuration
|
||||
infer = dict(
|
||||
partitioner=dict(type=NaivePartitioner),
|
||||
runner=dict(
|
||||
type=SlurmSequentialRunner,
|
||||
partition='llmeval',
|
||||
quotatype='auto',
|
||||
max_num_workers=256,
|
||||
task=dict(type=OpenICLInferTask)),
|
||||
)
|
||||
# Evaluation configuration
|
||||
eval = dict(
|
||||
partitioner=dict(
|
||||
type=SubjectiveNaivePartitioner,
|
||||
mode='m2n', # m-model v.s n-model
|
||||
# Under m2n setting
|
||||
# must specify base_models and compare_models, program will generate pairs between base_models compare_models.
|
||||
base_models = [*hf_qwen_14b_chat], # Baseline model
|
||||
compare_models = [*hf_baichuan2_7b, *hf_chatglm3_6b] # model to be evaluated
|
||||
),
|
||||
runner=dict(
|
||||
type=SlurmSequentialRunner,
|
||||
partition='llmeval',
|
||||
quotatype='auto',
|
||||
max_num_workers=256,
|
||||
task=dict(
|
||||
type=SubjectiveEvalTask,
|
||||
judge_cfg=gpt4_model # Judge model
|
||||
)),
|
||||
)
|
||||
work_dir = './outputs/subjective/'
|
||||
|
||||
summarizer = dict(
|
||||
type=Corev2Summarizer, # Custom summarizer
|
||||
match_method='smart', # Answer extraction
|
||||
)
|
||||
```
|
||||
|
||||
In addition, you can also change the response order of the two models, please refer to `config/eval_subjective_compare.py`,
|
||||
when `infer_order` is setting to `random`, the response will be random ordered,
|
||||
when `infer_order` is setting to `double`, the response of two models will be doubled in two ways.
|
||||
|
||||
### Step-2: Evaluation Configuration(Score Mode)
|
||||
|
||||
For `config/eval_subjective_score.py`, it is mainly same with `config/eval_subjective_compare.py`, and you just need to modify the eval mode to `singlescore`.
|
||||
1. First, you need to set `subjective_reader_cfg` to receive the relevant fields returned from the custom Dataset class and specify the output fields when saving files.
|
||||
2. Then, you need to specify the root path `data_path` of the dataset and the dataset filename `subjective_all_sets`. If there are multiple sub-files, you can add them to this list.
|
||||
3. Specify `subjective_infer_cfg` and `subjective_eval_cfg` to configure the corresponding inference and evaluation prompts.
|
||||
4. Finally, specify additional information such as `mode`, `summarizer`, etc., at the appropriate location. Note that for different subjective datasets, the fields that need to be specified may vary. Additionally, the summarizer class for the respective dataset also needs to be implemented to perform data statistics; you can refer to the summarizer implementations of other datasets, located in `opencompass/opencompass/summarizers/subjective`. A minimal skeleton illustrating these steps is sketched below.
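The following skeleton follows the four steps above; the dataset class, summarizer, paths, and field names are placeholders rather than an actual OpenCompass dataset, and the prompt configs are left as stubs:

```python
# Hypothetical skeleton; all names (dataset class, summarizer, paths) are placeholders.
from opencompass.datasets import MyCustomSubjectiveDataset  # Step-1 class, hypothetical
from opencompass.summarizers import MyCustomSummarizer      # hypothetical summarizer

# Step 1: fields returned by the dataset class and the output column used when saving files.
subjective_reader_cfg = dict(
    input_columns=['question', 'capability', 'others'],
    output_column='judge',
)

# Step 2: dataset root path and sub-file names (add further sub-files to this list).
data_path = 'data/subjective/my_dataset'
subjective_all_sets = ['my_subset']

# Step 3: the inference and judge prompts live in these two configs; they are left
# as stubs here and follow the same PromptTemplate pattern as the built-in datasets.
subjective_infer_cfg = dict()   # fill in prompt_template / retriever / inferencer
subjective_eval_cfg = dict()    # fill in an LMEvaluator with the judge prompt

# Step 4: assemble the dataset dicts, including `mode` and the dataset's summarizer.
mydatasets = [
    dict(
        abbr=f'{_name}',
        type=MyCustomSubjectiveDataset,
        path=data_path,
        name=_name,
        reader_cfg=subjective_reader_cfg,
        infer_cfg=subjective_infer_cfg,
        eval_cfg=subjective_eval_cfg,
        mode='singlescore',
        summarizer=dict(type=MyCustomSummarizer),
    ) for _name in subjective_all_sets
]
```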
|
||||
|
||||
### Step-3: Launch the Evaluation
|
||||
|
||||
@ -152,67 +139,9 @@ The `-r` parameter allows the reuse of model inference and GPT-4 evaluation resu
|
||||
The response of JudgeLLM will be output to `output/.../results/timestamp/xxmodel/xxdataset/.json`.
|
||||
The evaluation report will be output to `output/.../summary/timestamp/report.csv`.
|
||||
|
||||
Opencompass has supported lots of JudgeLLM, actually, you can take any model as JudgeLLM in opencompass configs.
|
||||
And we list the popular open-source JudgeLLM here:
|
||||
|
||||
1. Auto-J, refer to `configs/models/judge_llm/auto_j`
|
||||
|
||||
Consider cite the following paper if you find it helpful:
|
||||
|
||||
```bibtex
|
||||
@article{li2023generative,
|
||||
title={Generative judge for evaluating alignment},
|
||||
author={Li, Junlong and Sun, Shichao and Yuan, Weizhe and Fan, Run-Ze and Zhao, Hai and Liu, Pengfei},
|
||||
journal={arXiv preprint arXiv:2310.05470},
|
||||
year={2023}
|
||||
}
|
||||
@misc{2023opencompass,
|
||||
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
|
||||
author={OpenCompass Contributors},
|
||||
howpublished = {\url{https://github.com/open-compass/opencompass}},
|
||||
year={2023}
|
||||
}
|
||||
```
|
||||
|
||||
2. JudgeLM, refer to `configs/models/judge_llm/judgelm`
|
||||
|
||||
```bibtex
|
||||
@article{zhu2023judgelm,
|
||||
title={JudgeLM: Fine-tuned Large Language Models are Scalable Judges},
|
||||
author={Zhu, Lianghui and Wang, Xinggang and Wang, Xinlong},
|
||||
journal={arXiv preprint arXiv:2310.17631},
|
||||
year={2023}
|
||||
}
|
||||
@misc{2023opencompass,
|
||||
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
|
||||
author={OpenCompass Contributors},
|
||||
howpublished = {\url{https://github.com/open-compass/opencompass}},
|
||||
year={2023}
|
||||
}
|
||||
```
|
||||
|
||||
3. PandaLM, refer to `configs/models/judge_llm/pandalm`
|
||||
|
||||
Consider cite the following paper if you find it helpful:
|
||||
|
||||
```bibtex
|
||||
@article{wang2023pandalm,
|
||||
title={PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization},
|
||||
author={Wang, Yidong and Yu, Zhuohao and Zeng, Zhengran and Yang, Linyi and Wang, Cunxiang and Chen, Hao and Jiang, Chaoya and Xie, Rui and Wang, Jindong and Xie, Xing and others},
|
||||
journal={arXiv preprint arXiv:2306.05087},
|
||||
year={2023}
|
||||
}
|
||||
@misc{2023opencompass,
|
||||
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
|
||||
author={OpenCompass Contributors},
|
||||
howpublished = {\url{https://github.com/open-compass/opencompass}},
|
||||
year={2023}
|
||||
}
|
||||
```
|
||||
|
||||
## Multi-round Subjective Evaluation in OpenCompass
|
||||
|
||||
In OpenCompass, we also support subjective multi-turn dialogue evaluation. For instance, the evaluation of MT-Bench can be referred to in `configs/eval_subjective_mtbench.py`.
|
||||
In OpenCompass, we also support subjective multi-turn dialogue evaluation. For instance, the evaluation of MT-Bench can be referred to in `configs/datasets/subjective/multiround`.
|
||||
|
||||
In the multi-turn dialogue evaluation, you need to organize the data format into the following dialogue structure:
|
||||
|
||||
@ -238,82 +167,3 @@ In the multi-turn dialogue evaluation, you need to organize the data format into
|
||||
```
|
||||
|
||||
It is important to note that, because the different question types in MTBench use different temperature settings, the original data file needs to be divided into three subsets by temperature and inferred separately, with a different temperature set for each subset. For the specific settings, please refer to `configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py`.
|
||||
|
||||
Consider cite the following paper if you find it helpful:
|
||||
|
||||
```bibtex
|
||||
@misc{zheng2023judging,
|
||||
title={Judging LLM-as-a-judge with MT-Bench and Chatbot Arena},
|
||||
author={Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric. P Xing and Hao Zhang and Joseph E. Gonzalez and Ion Stoica},
|
||||
year={2023},
|
||||
eprint={2306.05685},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.CL}
|
||||
}
|
||||
@misc{2023opencompass,
|
||||
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
|
||||
author={OpenCompass Contributors},
|
||||
howpublished = {\url{https://github.com/open-compass/opencompass}},
|
||||
year={2023}
|
||||
}
|
||||
```
|
||||
|
||||
## Practice: AlignBench Evaluation
|
||||
|
||||
### Dataset
|
||||
|
||||
```bash
|
||||
mkdir -p ./data/subjective/
|
||||
|
||||
cd ./data/subjective
|
||||
git clone https://github.com/THUDM/AlignBench.git
|
||||
|
||||
# data format conversion
|
||||
python ../../../tools/convert_alignmentbench.py --mode json --jsonl data/data_release.jsonl
|
||||
|
||||
```
|
||||
|
||||
### Configuration
|
||||
|
||||
Please edit the config `configs/eval_subjective_alignbench.py` according to your demand.
|
||||
|
||||
### Evaluation
|
||||
|
||||
```bash
|
||||
HF_EVALUATE_OFFLINE=1 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python run.py workspace/eval_subjective_alignbench.py
|
||||
```
|
||||
|
||||
### Submit to Official Leaderboard(Optional)
|
||||
|
||||
If you need to submit your prediction into official leaderboard, you can use `tools/convert_alignmentbench.py` for format conversion.
|
||||
|
||||
- Make sure you have the following results
|
||||
|
||||
```bash
|
||||
outputs/
|
||||
└── 20231214_173632
|
||||
├── configs
|
||||
├── logs
|
||||
├── predictions # model's response
|
||||
├── results
|
||||
└── summary
|
||||
```
|
||||
|
||||
- Convert the data
|
||||
|
||||
```bash
|
||||
python tools/convert_alignmentbench.py --mode csv --exp-folder outputs/20231214_173632
|
||||
```
|
||||
|
||||
- Get `.csv` in `submission/` for submission
|
||||
|
||||
```bash
|
||||
outputs/
|
||||
└── 20231214_173632
|
||||
├── configs
|
||||
├── logs
|
||||
├── predictions
|
||||
├── results
|
||||
├── submission # 可提交文件
|
||||
└── summary
|
||||
```
|
||||
|
@ -15,11 +15,64 @@
|
||||
|
||||
## 目前已支持的主观评测数据集
|
||||
|
||||
1. AlignBench(https://github.com/THUDM/AlignBench)
|
||||
2. MTBench (https://github.com/lm-sys/FastChat)
|
||||
3. AlpacaEvalv2 (https://github.com/tatsu-lab/alpaca_eval)
|
||||
4. ArenaHard (https://github.com/lm-sys/arena-hard/tree/main)
|
||||
5. CompassArena(内部数据集)
|
||||
1. AlignBench 中文Scoring数据集(https://github.com/THUDM/AlignBench)
|
||||
2. MTBench 英文Scoring数据集,两轮对话(https://github.com/lm-sys/FastChat)
|
||||
3. MTBench101 英文Scoring数据集,多轮对话(https://github.com/mtbench101/mt-bench-101)
|
||||
4. AlpacaEvalv2 英文Compare数据集(https://github.com/tatsu-lab/alpaca_eval)
|
||||
5. ArenaHard 英文Compare数据集,主要面向coding(https://github.com/lm-sys/arena-hard/tree/main)
|
||||
6. Fofo 英文Socring数据集(https://github.com/SalesforceAIResearch/FoFo/)
|
||||
|
||||
## 启动主观评测
|
||||
|
||||
类似于已有的客观评测方式,可以在configs/eval_subjective.py中进行相关配置
|
||||
|
||||
### 基本参数models, datasets 和 judgemodels的指定
|
||||
|
||||
类似于客观评测的方式,导入需要评测的models和datasets,例如
|
||||
|
||||
```
|
||||
with read_base():
|
||||
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
|
||||
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
|
||||
from .models.qwen.hf_qwen_7b import models
|
||||
```
|
||||
|
||||
值得注意的是,由于主观评测的模型设置参数通常与客观评测不同,往往需要设置`do_sample`的方式进行推理而不是`greedy`,故可以在配置文件中自行修改相关参数,例如
|
||||
|
||||
```
|
||||
models = [
|
||||
dict(
|
||||
type=HuggingFaceChatGLM3,
|
||||
abbr='chatglm3-6b-hf2',
|
||||
path='THUDM/chatglm3-6b',
|
||||
tokenizer_path='THUDM/chatglm3-6b',
|
||||
model_kwargs=dict(
|
||||
device_map='auto',
|
||||
trust_remote_code=True,
|
||||
),
|
||||
tokenizer_kwargs=dict(
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True,
|
||||
),
|
||||
generation_kwargs=dict(
|
||||
do_sample=True,
|
||||
),
|
||||
meta_template=api_meta_template,
|
||||
max_out_len=2048,
|
||||
max_seq_len=4096,
|
||||
batch_size=8,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
)
|
||||
]
|
||||
```
|
||||
|
||||
judgemodel通常被设置为GPT4等强力模型,可以直接按照config文件中的配置填入自己的API key,或使用自定义的模型作为judgemodel
|
||||
|
||||
### 其他参数的指定
|
||||
|
||||
除了基本参数以外,还可以在config中修改`infer`和`eval`字段里的partitioner,从而设置更合适的分片方式,目前支持的分片方式主要有三种:NaivePartitioner、SizePartitioner 和 NumWorkerPartitioner
|
||||
以及可以指定自己的workdir用以保存相关文件。
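下面给出一个示意性的 `eval` 分片配置草稿,使用本次新增的 NumWorkerPartitioner(其中的数值仅为示例,`models` 与 `judge_models` 假定已在上文定义):

```python
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

# 按预设的 worker 数量切分主观评测任务(num_worker 取值仅为示例)
eval = dict(
    partitioner=dict(type=SubjectiveNumWorkerPartitioner, num_worker=8,
                     models=models, judge_models=judge_models),
    runner=dict(type=LocalRunner, max_num_workers=8,
                task=dict(type=SubjectiveEvalTask)),
)

work_dir = 'outputs/subjective/'
```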
|
||||
|
||||
## 自定义主观数据集评测
|
||||
|
||||
@ -32,6 +85,9 @@
|
||||
|
||||
### 第一步:数据准备
|
||||
|
||||
这一步需要准备好数据集文件以及在`Opencompass/datasets/subjective/`下实现自己数据集的类,将读取到的数据以`list of dict`的格式return
|
||||
|
||||
实际上可以按照自己喜欢的任意格式进行数据准备(csv, json, jsonl)等皆可,不过为了方便上手,推荐按照已有的主观数据集的格式进行构建或按照如下的json格式进行构建。
|
||||
对于对战模式和打分模式,我们各提供了一个demo测试集如下:
|
||||
|
||||
```python
|
||||
@ -64,85 +120,19 @@
|
||||
|
||||
以上三个字段是必要的,用户也可以添加其他字段,如果需要对每个问题的prompt进行单独处理,可以在'others'字段中进行一些额外设置,并在Dataset类中添加相应的字段。
|
||||
|
||||
### 第二步:构建评测配置(对战模式)
|
||||
### 第二步:构建评测配置
|
||||
|
||||
对于两回答比较,更详细的config setting请参考 `config/eval_subjective_compare.py`,下面我们提供了部分简略版的注释,方便用户理解配置文件的含义。
|
||||
以Alignbench为例`configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py`,
|
||||
|
||||
```python
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
|
||||
|
||||
from opencompass.partitioners import NaivePartitioner
|
||||
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
|
||||
from opencompass.runners import LocalRunner
|
||||
from opencompass.runners import SlurmSequentialRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
|
||||
from opencompass.summarizers import Corev2Summarizer
|
||||
|
||||
with read_base():
|
||||
# 导入预设模型
|
||||
from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
|
||||
from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
|
||||
from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
|
||||
from .models.openai.gpt_4 import models as gpt4_model
|
||||
from .datasets.subjective_cmp.subjective_corev2 import subjective_datasets
|
||||
|
||||
# 评测数据集
|
||||
datasets = [*subjective_datasets]
|
||||
|
||||
# 待测模型列表
|
||||
models = [*hf_qwen_7b_chat, *hf_chatglm3_6b]
|
||||
|
||||
# 推理配置
|
||||
infer = dict(
|
||||
partitioner=dict(type=NaivePartitioner),
|
||||
runner=dict(
|
||||
type=SlurmSequentialRunner,
|
||||
partition='llmeval',
|
||||
quotatype='auto',
|
||||
max_num_workers=256,
|
||||
task=dict(type=OpenICLInferTask)),
|
||||
)
|
||||
# 评测配置
|
||||
eval = dict(
|
||||
partitioner=dict(
|
||||
type=SubjectiveNaivePartitioner,
|
||||
mode='m2n', # m个模型 与 n个模型进行对战
|
||||
# 在m2n模式下,需要指定base_models和compare_models,将会对base_models和compare_models生成对应的两两pair(去重且不会与自身进行比较)
|
||||
base_models = [*hf_qwen_14b_chat], # 用于对比的基线模型
|
||||
compare_models = [*hf_baichuan2_7b, *hf_chatglm3_6b] # 待评测模型
|
||||
),
|
||||
runner=dict(
|
||||
type=SlurmSequentialRunner,
|
||||
partition='llmeval',
|
||||
quotatype='auto',
|
||||
max_num_workers=256,
|
||||
task=dict(
|
||||
type=SubjectiveEvalTask,
|
||||
judge_cfg=gpt4_model # 评价模型
|
||||
)),
|
||||
)
|
||||
work_dir = './outputs/subjective/' #指定工作目录,在此工作目录下,若使用--reuse参数启动评测,将自动复用该目录下已有的所有结果
|
||||
|
||||
summarizer = dict(
|
||||
type=Corev2Summarizer, #自定义数据集Summarizer
|
||||
match_method='smart', #自定义答案提取方式
|
||||
)
|
||||
```
|
||||
|
||||
此外,在数据集的配置config中,还可以选择两回答比较时的回答顺序,请参考`config/eval_subjective_compare.py`,
|
||||
当`infer_order`设置为`random`时,将对两模型的回复顺序进行随机打乱,
|
||||
当`infer_order`设置为`double`时,将把两模型的回复按两种先后顺序进行判断。
|
||||
|
||||
### 第二步:构建评测配置(打分模式)
|
||||
|
||||
对于单回答打分,更详细的config setting请参考 `config/eval_subjective_score.py`,该config的大部分都与两回答比较的config相同,只需要修改评测模式即可,将评测模式设置为`singlescore`。
|
||||
1. 首先需要设置`subjective_reader_cfg`,用以接收从自定义的Dataset类里return回来的相关字段并指定保存文件时的output字段
|
||||
2. 然后需要指定数据集的根路径`data_path`以及数据集的文件名`subjective_all_sets`,如果有多个子文件,在这个list里进行添加即可
|
||||
3. 指定`subjective_infer_cfg`和`subjective_eval_cfg`,配置好相应的推理和评测的prompt
|
||||
4. 最后在相应的位置指定`mode`,`summarizer`等额外信息,注意,对于不同的主观数据集,所需指定的字段可能不尽相同。此外,相应数据集的summarizer类也需要自己实现以进行数据的统计,可以参考其他数据集的summarizer实现,位于`opencompass/opencompass/summarizers/subjective`
|
||||
|
||||
### 第三步 启动评测并输出评测结果
|
||||
|
||||
```shell
|
||||
python run.py configs/eval_subjective_score.py -r
|
||||
python run.py configs/eval_subjective.py -r
|
||||
```
|
||||
|
||||
- `-r` 参数支持复用模型推理和评估结果。
|
||||
@ -150,69 +140,9 @@ python run.py configs/eval_subjective_score.py -r
|
||||
JudgeLLM的评测回复会保存在 `output/.../results/timestamp/xxmodel/xxdataset/.json`
|
||||
评测报告则会输出到 `output/.../summary/timestamp/report.csv`。
|
||||
|
||||
Opencompass 已经支持了很多的JudgeLLM,实际上,你可以将Opencompass中所支持的所有模型都当作JudgeLLM使用。
|
||||
我们列出目前比较流行的开源JudgeLLM:
|
||||
|
||||
1. Auto-J,请参考 `configs/models/judge_llm/auto_j`
|
||||
|
||||
如果使用了该方法,请添加引用:
|
||||
|
||||
```bibtex
|
||||
@article{li2023generative,
|
||||
title={Generative judge for evaluating alignment},
|
||||
author={Li, Junlong and Sun, Shichao and Yuan, Weizhe and Fan, Run-Ze and Zhao, Hai and Liu, Pengfei},
|
||||
journal={arXiv preprint arXiv:2310.05470},
|
||||
year={2023}
|
||||
}
|
||||
@misc{2023opencompass,
|
||||
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
|
||||
author={OpenCompass Contributors},
|
||||
howpublished = {\url{https://github.com/open-compass/opencompass}},
|
||||
year={2023}
|
||||
}
|
||||
```
|
||||
|
||||
2. JudgeLM,请参考 `configs/models/judge_llm/judgelm`
|
||||
|
||||
如果使用了该方法,请添加引用:
|
||||
|
||||
```bibtex
|
||||
@article{zhu2023judgelm,
|
||||
title={JudgeLM: Fine-tuned Large Language Models are Scalable Judges},
|
||||
author={Zhu, Lianghui and Wang, Xinggang and Wang, Xinlong},
|
||||
journal={arXiv preprint arXiv:2310.17631},
|
||||
year={2023}
|
||||
}
|
||||
@misc{2023opencompass,
|
||||
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
|
||||
author={OpenCompass Contributors},
|
||||
howpublished = {\url{https://github.com/open-compass/opencompass}},
|
||||
year={2023}
|
||||
}
|
||||
```
|
||||
|
||||
3. PandaLM,请参考 `configs/models/judge_llm/pandalm`
|
||||
|
||||
如果使用了该方法,请添加引用:
|
||||
|
||||
```bibtex
|
||||
@article{wang2023pandalm,
|
||||
title={PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization},
|
||||
author={Wang, Yidong and Yu, Zhuohao and Zeng, Zhengran and Yang, Linyi and Wang, Cunxiang and Chen, Hao and Jiang, Chaoya and Xie, Rui and Wang, Jindong and Xie, Xing and others},
|
||||
journal={arXiv preprint arXiv:2306.05087},
|
||||
year={2023}
|
||||
}
|
||||
@misc{2023opencompass,
|
||||
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
|
||||
author={OpenCompass Contributors},
|
||||
howpublished = {\url{https://github.com/open-compass/opencompass}},
|
||||
year={2023}
|
||||
}
|
||||
```
|
||||
|
||||
## 主观多轮对话评测
|
||||
|
||||
在OpenCompass中我们同样支持了主观的多轮对话评测,以MT-Bench为例,对MTBench的评测可以参见`configs/eval_subjective_mtbench.py`
|
||||
在OpenCompass中我们同样支持了主观的多轮对话评测,以MT-Bench为例,对MTBench的评测可以参见`configs/datasets/subjective/multiround`
|
||||
|
||||
在多轮对话评测中,你需要将数据格式整理为如下的dialogue格式
|
||||
|
||||
@ -238,84 +168,3 @@ Opencompass 已经支持了很多的JudgeLLM,实际上,你可以将Opencompa
|
||||
```
|
||||
|
||||
值得注意的是,由于MTBench各不同的题目类型设置了不同的温度,因此我们需要将原始数据文件按照温度分成三个不同的子集以分别推理,针对不同的子集我们可以设置不同的温度,具体设置参加`configs\datasets\subjective\multiround\mtbench_single_judge_diff_temp.py`
|
||||
|
||||
如果使用了该方法,请添加引用:
|
||||
|
||||
```bibtex
|
||||
@misc{zheng2023judging,
|
||||
title={Judging LLM-as-a-judge with MT-Bench and Chatbot Arena},
|
||||
author={Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric. P Xing and Hao Zhang and Joseph E. Gonzalez and Ion Stoica},
|
||||
year={2023},
|
||||
eprint={2306.05685},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.CL}
|
||||
}
|
||||
@misc{2023opencompass,
|
||||
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
|
||||
author={OpenCompass Contributors},
|
||||
howpublished = {\url{https://github.com/open-compass/opencompass}},
|
||||
year={2023}
|
||||
}
|
||||
```
|
||||
|
||||
## 实战:AlignBench 主观评测
|
||||
|
||||
### 数据集准备
|
||||
|
||||
```bash
|
||||
mkdir -p ./data/subjective/
|
||||
|
||||
cd ./data/subjective
|
||||
git clone https://github.com/THUDM/AlignBench.git
|
||||
|
||||
# data format conversion
|
||||
python ../../../tools/convert_alignmentbench.py --mode json --jsonl data/data_release.jsonl
|
||||
|
||||
```
|
||||
|
||||
### 配置文件
|
||||
|
||||
请根据需要修改配置文件 `configs/eval_subjective_alignbench.py`
|
||||
|
||||
### 启动评测
|
||||
|
||||
按如下方式执行命令后,将会开始答案推理和主观打分,如只需进行推理,可以通过制定 `-m infer`实现
|
||||
|
||||
```bash
|
||||
HF_EVALUATE_OFFLINE=1 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python run.py configs/eval_subjective_alignbench.py
|
||||
```
|
||||
|
||||
### 提交官方评测(Optional)
|
||||
|
||||
完成评测后,如需提交官方榜单进行评测,可以使用它`tools/convert_alignmentbench.py`进行格式转换。
|
||||
|
||||
- 请确保已完成推理,并获得如下所示的文件:
|
||||
|
||||
```bash
|
||||
outputs/
|
||||
└── 20231214_173632
|
||||
├── configs
|
||||
├── logs
|
||||
├── predictions # 模型回复
|
||||
├── results
|
||||
└── summary
|
||||
```
|
||||
|
||||
- 执行如下命令获得可用于提交的结果
|
||||
|
||||
```bash
|
||||
python tools/convert_alignmentbench.py --mode csv --exp-folder outputs/20231214_173632
|
||||
```
|
||||
|
||||
- 进入 `submission`文件夹获得可用于提交的`.csv`文件
|
||||
|
||||
```bash
|
||||
outputs/
|
||||
└── 20231214_173632
|
||||
├── configs
|
||||
├── logs
|
||||
├── predictions
|
||||
├── results
|
||||
├── submission # 可提交文件
|
||||
└── summary
|
||||
```
|
||||
|
@ -4,6 +4,7 @@ import argparse
|
||||
import getpass
|
||||
import os
|
||||
import os.path as osp
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
|
||||
from mmengine.config import Config, DictAction
|
||||
@ -345,13 +346,37 @@ def main():
|
||||
# visualize
|
||||
if args.mode in ['all', 'eval', 'viz']:
|
||||
summarizer_cfg = cfg.get('summarizer', {})
|
||||
if not summarizer_cfg or summarizer_cfg.get('type', None) is None:
|
||||
summarizer_cfg['type'] = DefaultSummarizer
|
||||
summarizer_cfg['config'] = cfg
|
||||
summarizer = build_from_cfg(summarizer_cfg)
|
||||
summarizer.summarize(time_str=cfg_time_str)
|
||||
|
||||
|
||||
# For subjective summarizer
|
||||
if summarizer_cfg.get('function', None):
|
||||
main_summarizer_cfg = deepcopy(summarizer_cfg)
|
||||
grouped_datasets = {}
|
||||
for dataset in cfg.datasets:
|
||||
prefix = dataset['abbr'].split('_')[0]
|
||||
if prefix not in grouped_datasets:
|
||||
grouped_datasets[prefix] = []
|
||||
grouped_datasets[prefix].append(dataset)
|
||||
all_grouped_lists = []
|
||||
for prefix in grouped_datasets:
|
||||
all_grouped_lists.append(grouped_datasets[prefix])
|
||||
dataset_score_container = []
|
||||
for dataset in all_grouped_lists:
|
||||
temp_cfg = deepcopy(cfg)
|
||||
temp_cfg.datasets = dataset
|
||||
summarizer_cfg = dict(type=dataset[0]['summarizer']['type'], config=temp_cfg)
|
||||
summarizer = build_from_cfg(summarizer_cfg)
|
||||
dataset_score = summarizer.summarize(time_str=cfg_time_str)
|
||||
if dataset_score:
|
||||
dataset_score_container.append(dataset_score)
|
||||
main_summarizer_cfg['config'] = cfg
|
||||
main_summarizer = build_from_cfg(main_summarizer_cfg)
|
||||
main_summarizer.summarize(time_str=cfg_time_str, subjective_scores=dataset_score_container)
|
||||
else:
|
||||
if not summarizer_cfg or summarizer_cfg.get('type', None) is None:
|
||||
summarizer_cfg['type'] = DefaultSummarizer
|
||||
summarizer_cfg['config'] = cfg
|
||||
summarizer = build_from_cfg(summarizer_cfg)
|
||||
summarizer.summarize(time_str=cfg_time_str)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
@ -88,7 +88,9 @@ class AlignmentBenchDataset(SubjectiveCmpDataset):
|
||||
path: str,
|
||||
name: str,
|
||||
alignment_bench_config_path: Optional[str] = '',
|
||||
alignment_bench_config_name: Optional[str] = ''):
|
||||
alignment_bench_config_name: Optional[str] = '',
|
||||
*args,
|
||||
**kwargs):
|
||||
if alignment_bench_config_path != '':
|
||||
alignmentbench_config = Config(alignment_bench_config_path,
|
||||
alignment_bench_config_name)
|
||||
@ -106,17 +108,3 @@ class AlignmentBenchDataset(SubjectiveCmpDataset):
|
||||
alignbench_dataset.append(data)
|
||||
dataset = Dataset.from_list(alignbench_dataset)
|
||||
return dataset
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
data = {
|
||||
'question': '高音单簧管和高音萨克斯的调性相同吗?如果相同,请说出他们的调性,如果不同,请分别说出他们的调性',
|
||||
'capability': '专业能力',
|
||||
'others': {
|
||||
'subcategory': '音乐',
|
||||
'reference': '高音单簧管和高音萨克斯的调性不同。高音单簧管的调性通常为E♭,而高音萨克斯的调性则为B♭。\n',
|
||||
'question_id': 1
|
||||
}
|
||||
}
|
||||
prefix = prompt_construct(data, alignmentbench_config)
|
||||
print(prefix)
|
||||
|
@ -11,7 +11,7 @@ from ..base import BaseDataset
|
||||
@LOAD_DATASET.register_module()
|
||||
class ArenaHardDataset(BaseDataset):
|
||||
|
||||
def load(self, path: str, name: str):
|
||||
def load(self, path: str, name: str, *args, **kwargs):
|
||||
filename = osp.join(path, f'{name}.jsonl')
|
||||
dataset = DatasetDict()
|
||||
raw_data = []
|
||||
|
@ -8,11 +8,7 @@ from .subjective_cmp import SubjectiveCmpDataset
|
||||
@LOAD_DATASET.register_module()
|
||||
class CompassArenaDataset(SubjectiveCmpDataset):
|
||||
|
||||
def load(
|
||||
self,
|
||||
path: str,
|
||||
name: str,
|
||||
):
|
||||
def load(self, path: str, name: str, *args, **kwargs):
|
||||
dataset = list(super().load(path, name))
|
||||
creation_dataset = []
|
||||
for data in dataset:
|
||||
|
@ -77,7 +77,7 @@ Choice: [[C]]
|
||||
@LOAD_DATASET.register_module()
|
||||
class CompassBenchDataset(BaseDataset):
|
||||
|
||||
def load(self, path: str, name: str):
|
||||
def load(self, path: str, name: str, *args, **kwargs):
|
||||
filename = osp.join(path, f'{name}.json')
|
||||
raw_data = []
|
||||
with open(filename, 'r', encoding='utf-8') as f:
|
||||
|
@ -105,7 +105,7 @@ Choice: [[C]]
|
||||
@LOAD_DATASET.register_module()
|
||||
class CompassBenchControlLengthBiasDataset(BaseDataset):
|
||||
|
||||
def load(self, path: str, name: str):
|
||||
def load(self, path: str, name: str, *args, **kwargs):
|
||||
filename = osp.join(path, f'{name}.json')
|
||||
raw_data = []
|
||||
with open(filename, 'r', encoding='utf-8') as f:
|
||||
|
@ -12,7 +12,7 @@ from ..base import BaseDataset
|
||||
@LOAD_DATASET.register_module()
|
||||
class FofoDataset(BaseDataset):
|
||||
|
||||
def load(self, path: str, name: str):
|
||||
def load(self, path: str, name: str, *args, **kwargs):
|
||||
filename = osp.join(path, f'{name}.json')
|
||||
raw_data = []
|
||||
with open(filename, 'r', encoding='utf-8') as f:
|
||||
|
@ -2,7 +2,6 @@
|
||||
import json
|
||||
import os.path as osp
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from datasets import Dataset, DatasetDict
|
||||
|
||||
@ -166,7 +165,13 @@ def prompt_construct(problem, multi_turn=False, judge_type='single'):
|
||||
@LOAD_DATASET.register_module()
|
||||
class MTBenchDataset(BaseDataset):
|
||||
|
||||
def load(self, path: str, name: str, multi_turn=True, judge_type='single'):
|
||||
def load(self,
|
||||
path: str,
|
||||
name: str,
|
||||
judge_type='single',
|
||||
multi_turn=True,
|
||||
*args,
|
||||
**kwargs):
|
||||
filename = osp.join(path, f'{name}.json')
|
||||
dataset = DatasetDict()
|
||||
raw_data = []
|
||||
|
@ -2,7 +2,6 @@
|
||||
import json
|
||||
import os.path as osp
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from datasets import Dataset, DatasetDict
|
||||
from torch.utils.data import DataLoader
|
||||
@ -258,7 +257,7 @@ def add_format(question, answer):
|
||||
@LOAD_DATASET.register_module()
|
||||
class MTBench101Dataset(BaseDataset):
|
||||
|
||||
def load(self, path: str, name: str):
|
||||
def load(self, path: str, name: str, *args, **kwargs):
|
||||
import copy
|
||||
|
||||
filename = osp.join(path, f'{name}.jsonl')
|
||||
|
@ -11,7 +11,7 @@ from ..base import BaseDataset
|
||||
@LOAD_DATASET.register_module()
|
||||
class SubjectiveCmpDataset(BaseDataset):
|
||||
|
||||
def load(self, path: str, name: str):
|
||||
def load(self, path: str, name: str, *args, **kwargs):
|
||||
filename = osp.join(path, f'{name}.json')
|
||||
dataset = DatasetDict()
|
||||
raw_data = []
|
||||
|
@ -102,6 +102,25 @@ def remove_already_tasks(tasks, work_dir, meta_judge_model):
|
||||
return tasks_to_keep
|
||||
|
||||
|
||||
def get_model_combinations(
|
||||
mode,
|
||||
models: List[ConfigDict],
|
||||
base_models: Optional[List[ConfigDict]] = [],
|
||||
compare_models: Optional[List[ConfigDict]] = []) -> List:
|
||||
if mode == 'allpair':
|
||||
assert len(models) > 1
|
||||
return combinations(models, 2)
|
||||
elif mode == 'm2n':
|
||||
assert len(base_models) > 0 and len(compare_models) > 0
|
||||
model_combinations = list(product(base_models, compare_models))
|
||||
unique_combinations = remove_duplicate_pairs(
|
||||
[combo for combo in model_combinations if combo[0] != combo[1]])
|
||||
return unique_combinations
|
||||
elif mode == 'fixed':
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
@PARTITIONERS.register_module()
|
||||
class SubjectiveNaivePartitioner(NaivePartitioner):
|
||||
"""Naive task partitioner for subjective evaluation. Compared to
|
||||
@ -113,46 +132,25 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
|
||||
to the task config.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
mode: str,
|
||||
out_dir: str,
|
||||
models: Optional[List[ConfigDict]] = [],
|
||||
base_models: Optional[List[ConfigDict]] = [],
|
||||
compare_models: Optional[List[ConfigDict]] = [],
|
||||
judge_models: Optional[List[ConfigDict]] = [],
|
||||
meta_judge_model: Optional[ConfigDict] = None,
|
||||
model_pairs: Optional[List[Tuple]] = None,
|
||||
keep_keys: Optional[List[str]] = None,
|
||||
infer_order: Optional[str] = 'random'):
|
||||
def __init__(
|
||||
self,
|
||||
out_dir: str,
|
||||
models: Optional[List[ConfigDict]] = [],
|
||||
base_models: Optional[List[ConfigDict]] = [],
|
||||
compare_models: Optional[List[ConfigDict]] = [],
|
||||
judge_models: Optional[List[ConfigDict]] = [],
|
||||
meta_judge_model: Optional[ConfigDict] = None,
|
||||
model_pairs: Optional[List[Tuple]] = None,
|
||||
keep_keys: Optional[List[str]] = None,
|
||||
):
|
||||
super().__init__(out_dir=out_dir, keep_keys=keep_keys)
|
||||
assert mode in ['singlescore', 'allpair', 'm2n', 'fixed']
|
||||
assert infer_order in ['random', 'double']
|
||||
self.mode = mode
|
||||
|
||||
self.models = models
|
||||
self.base_models = base_models
|
||||
self.compare_models = compare_models
|
||||
self.model_pairs = model_pairs
|
||||
self.judge_models = judge_models
|
||||
self.meta_judge_model = meta_judge_model
|
||||
self.infer_order = infer_order
|
||||
|
||||
def get_model_combinations(
|
||||
self,
|
||||
models: List[ConfigDict],
|
||||
base_models: Optional[List[ConfigDict]] = [],
|
||||
compare_models: Optional[List[ConfigDict]] = []) -> List:
|
||||
if self.mode == 'allpair':
|
||||
assert len(models) > 1
|
||||
return combinations(models, 2)
|
||||
elif self.mode == 'm2n':
|
||||
assert len(base_models) > 0 and len(compare_models) > 0
|
||||
model_combinations = list(product(base_models, compare_models))
|
||||
unique_combinations = remove_duplicate_pairs([
|
||||
combo for combo in model_combinations if combo[0] != combo[1]
|
||||
])
|
||||
return unique_combinations
|
||||
elif self.mode == 'fixed':
|
||||
pass
|
||||
|
||||
def partition(self,
|
||||
models: List[ConfigDict],
|
||||
@ -187,34 +185,46 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
|
||||
models = self.models if self.models != [] else models
|
||||
base_models, compare_models = self.base_models, self.compare_models
|
||||
judge_models, meta_judge_model = self.judge_models, self.meta_judge_model
|
||||
if self.mode == 'singlescore':
|
||||
models = models
|
||||
else:
|
||||
models = self.get_model_combinations(models, base_models,
|
||||
compare_models)
|
||||
model_dataset_combinations = [{'models': models, 'datasets': datasets}]
|
||||
tasks = super().partition(
|
||||
model_dataset_combinations=model_dataset_combinations,
|
||||
work_dir=work_dir,
|
||||
out_dir=out_dir,
|
||||
add_cfg=add_cfg)
|
||||
all_tasks = []
|
||||
for dataset in datasets:
|
||||
mode = dataset['mode']
|
||||
infer_order = dataset.get('infer_order', None)
|
||||
assert mode in ['singlescore', 'allpair', 'm2n', 'fixed']
|
||||
assert infer_order in ['random', 'double', None]
|
||||
if mode == 'singlescore':
|
||||
temp_models = models
|
||||
else:
|
||||
temp_models = get_model_combinations(mode, models,
|
||||
dataset['base_models'],
|
||||
models)
|
||||
model_dataset_combinations = [{
|
||||
'models': temp_models,
|
||||
'datasets': [dataset]
|
||||
}]
|
||||
|
||||
# We need to add judge models and meta-judge-model as new tasks
|
||||
# When there is no meta-judge-model, we assign all judge models to each tasks
|
||||
# When there is a meta-judge-model, we add an additional task stage
|
||||
tasks = replicate_tasks_with_judge_models(tasks, judge_models,
|
||||
meta_judge_model)
|
||||
tasks = super().partition(
|
||||
model_dataset_combinations=model_dataset_combinations,
|
||||
work_dir=work_dir,
|
||||
out_dir=out_dir,
|
||||
add_cfg=add_cfg)
|
||||
|
||||
# We also need to check and remove the already done tasks
|
||||
tasks = remove_already_tasks(tasks, work_dir, meta_judge_model)
|
||||
if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
|
||||
tasks[0], list):
|
||||
# Refer to meta review judge
|
||||
for task_stage in tasks:
|
||||
for task in task_stage:
|
||||
task['infer_order'] = self.infer_order
|
||||
else:
|
||||
# Refer to just have review judge
|
||||
for task in tasks:
|
||||
task['infer_order'] = self.infer_order
|
||||
return tasks
|
||||
# We need to add judge models and meta-judge-model as new tasks
|
||||
# When there is no meta-judge-model, we assign all judge models to each task
|
||||
# When there is a meta-judge-model, we add an additional task stage
|
||||
tasks = replicate_tasks_with_judge_models(tasks, judge_models,
|
||||
meta_judge_model)
|
||||
|
||||
# We also need to check and remove the already done tasks
|
||||
tasks = remove_already_tasks(tasks, work_dir, meta_judge_model)
|
||||
if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
|
||||
tasks[0], list):
|
||||
# Refer to meta review judge
|
||||
for task_stage in tasks:
|
||||
for task in task_stage:
|
||||
task['infer_order'] = infer_order
|
||||
else:
|
||||
# Refer to just have review judge
|
||||
for task in tasks:
|
||||
task['infer_order'] = infer_order
|
||||
all_tasks += tasks
|
||||
return all_tasks
|
||||
|
209
opencompass/partitioners/sub_num_worker.py
Normal file
@ -0,0 +1,209 @@
|
||||
# flake8: noqa: E501
|
||||
import copy
|
||||
import math
|
||||
import os.path as osp
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import mmengine
|
||||
from mmengine.config import Config, ConfigDict
|
||||
|
||||
from opencompass.registry import PARTITIONERS
|
||||
from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
|
||||
get_infer_output_path)
|
||||
|
||||
from .sub_naive import (SubjectiveNaivePartitioner, get_model_combinations,
|
||||
remove_already_tasks,
|
||||
replicate_tasks_with_judge_models)
|
||||
|
||||
|
||||
@PARTITIONERS.register_module()
|
||||
class SubjectiveNumWorkerPartitioner(SubjectiveNaivePartitioner):
|
||||
"""Task partitioner based on the pre-defined number of workers.
|
||||
|
||||
Args:
|
||||
out_dir (str): The output directory of tasks.
|
||||
num_worker (int): The number of workers. default: 8.
|
||||
min_task_size (int): The minimum size of a task. default: 16.
|
||||
dataset_size_path (str): The path to the dataset size cache file.
|
||||
keep_keys (list[str]): The keys to be kept from the experiment config
|
||||
to the task config.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
out_dir: str,
|
||||
models: Optional[List[ConfigDict]] = [],
|
||||
base_models: Optional[List[ConfigDict]] = [],
|
||||
compare_models: Optional[List[ConfigDict]] = [],
|
||||
judge_models: Optional[List[ConfigDict]] = [],
|
||||
meta_judge_model: Optional[ConfigDict] = None,
|
||||
model_pairs: Optional[List[Tuple]] = None,
|
||||
num_worker: int = 8,
|
||||
num_worker_split: Optional[int] = None,
|
||||
min_task_size: int = 16,
|
||||
strategy: str = 'heuristic',
|
||||
dataset_size_path: str = '.cache/dataset_size.json',
|
||||
keep_keys: Optional[List[str]] = None):
|
||||
super().__init__(
|
||||
out_dir=out_dir,
|
||||
keep_keys=keep_keys,
|
||||
models=models,
|
||||
base_models=base_models,
|
||||
compare_models=compare_models,
|
||||
judge_models=judge_models,
|
||||
meta_judge_model=meta_judge_model,
|
||||
model_pairs=model_pairs,
|
||||
)
|
||||
if strategy == 'split' and num_worker_split is not None:
|
||||
self.logger.warning('num_worker_split is ignored with split.')
|
||||
|
||||
self.num_worker = num_worker
|
||||
self.num_worker_split = num_worker_split or num_worker
|
||||
self.min_task_size = min_task_size
|
||||
self.dataset_size_path = dataset_size_path
|
||||
assert strategy in ('heuristic', 'split'), \
|
||||
f'Unsupported partition strategy: {strategy}. '\
|
||||
'Supported strategies are: `heuristic`, `split` .'
|
||||
self.strategy = strategy
|
||||
|
||||
def partition(self,
|
||||
models: List[ConfigDict],
|
||||
datasets: List[ConfigDict],
|
||||
work_dir: str,
|
||||
out_dir: str,
|
||||
add_cfg: Dict = {}) -> List[ConfigDict]:
|
||||
|
||||
# intentionally avoid any sort here,
|
||||
# for the user's ability to manipulate the order
|
||||
models = self.models if self.models != [] else models
|
||||
judge_models, meta_judge_model = self.judge_models, self.meta_judge_model
|
||||
self.num_worker = int(self.num_worker / len(datasets))
|
||||
all_tasks = []
|
||||
for dataset in datasets:
|
||||
mode = dataset['mode']
|
||||
infer_order = dataset.get('infer_order', None)
|
||||
assert mode in ['singlescore', 'allpair', 'm2n', 'fixed']
|
||||
assert infer_order in ['random', 'double', None]
|
||||
if mode == 'singlescore':
|
||||
temp_models = models
|
||||
else:
|
||||
temp_models = get_model_combinations(mode, models,
|
||||
dataset['base_models'],
|
||||
models)
|
||||
model_dataset_combinations = [{
|
||||
'models': temp_models,
|
||||
'datasets': [dataset]
|
||||
}]
|
||||
|
||||
tasks = []
|
||||
for comb in model_dataset_combinations:
|
||||
for model in comb['models']:
|
||||
chunks = []
|
||||
for dataset in comb['datasets']:
|
||||
filename = get_infer_output_path(
|
||||
model, dataset, out_dir)
|
||||
# skip the task if the task output exists
|
||||
if osp.exists(filename):
|
||||
continue
|
||||
dataset_size = self.get_size(dataset)
|
||||
if self.num_worker <= 1:
|
||||
chunks.append(dataset)
|
||||
elif dataset_size <= self.min_task_size:
|
||||
chunks.append(dataset)
|
||||
else:
|
||||
root, ext = osp.splitext(filename)
|
||||
dataset_splits = self.split_dataset(dataset)
|
||||
for i, dataset_split in enumerate(dataset_splits):
|
||||
if not osp.exists(f'{root}_{i}{ext}'):
|
||||
chunks.append(dataset_split)
|
||||
|
||||
if self.strategy == 'heuristic':
|
||||
buckets = [[] for _ in range(self.num_worker_split)]
|
||||
for i, chunk in enumerate(chunks):
|
||||
buckets[i % self.num_worker_split].append(chunk)
|
||||
|
||||
for bucket in buckets:
|
||||
if len(bucket) > 0:
|
||||
tasks.append(
|
||||
Config({
|
||||
'models': [model],
|
||||
'datasets': [bucket],
|
||||
'work_dir': work_dir,
|
||||
**add_cfg
|
||||
}))
|
||||
elif self.strategy == 'split':
|
||||
for dataset in chunks:
|
||||
tasks.append(
|
||||
Config({
|
||||
'models': [model],
|
||||
'datasets': [[dataset]],
|
||||
'work_dir': work_dir,
|
||||
**add_cfg
|
||||
}))
|
||||
# We need to add judge models and meta-judge-model as new tasks
|
||||
# When there is no meta-judge-model, we assign all judge models to each task
|
||||
# When there is a meta-judge-model, we add an additional task stage
|
||||
tasks = replicate_tasks_with_judge_models(tasks, judge_models,
|
||||
meta_judge_model)
|
||||
|
||||
# We also need to check and remove the already done tasks
|
||||
tasks = remove_already_tasks(tasks, work_dir, meta_judge_model)
|
||||
if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
|
||||
tasks[0], list):
|
||||
# Refers to the meta-review judge stage
|
||||
for task_stage in tasks:
|
||||
for task in task_stage:
|
||||
task['infer_order'] = infer_order
|
||||
else:
|
||||
# Refers to having only the review-judge stage
|
||||
for task in tasks:
|
||||
task['infer_order'] = infer_order
|
||||
all_tasks += tasks
|
||||
return all_tasks
|
||||
|
||||
@property
|
||||
def dataset_size(self):
|
||||
if not hasattr(self, '_dataset_size'):
|
||||
if osp.exists(self.dataset_size_path):
|
||||
self._dataset_size = mmengine.load(self.dataset_size_path)
|
||||
else:
|
||||
self._dataset_size = {}
|
||||
return self._dataset_size
|
||||
|
||||
def split_dataset(self, dataset_cfg: ConfigDict) -> List[ConfigDict]:
|
||||
"""Split dataset into several parts."""
|
||||
dataset_size = self.get_size(dataset_cfg)
|
||||
split_configs = []
|
||||
abbr = dataset_abbr_from_cfg(dataset_cfg)
|
||||
# evenly distribute the task
|
||||
num_split = self.num_worker
|
||||
step = max(math.ceil(dataset_size / num_split), self.min_task_size)
|
||||
for part, i in enumerate(range(0, dataset_size, step)):
|
||||
cfg = copy.deepcopy(dataset_cfg)
|
||||
cfg['abbr'] = abbr + f'_{part}'
|
||||
test_range = cfg['reader_cfg'].get('test_range', '')
|
||||
cfg['reader_cfg']['test_range'] = f'{test_range}[{i}:{i+step}]'
|
||||
split_configs.append(cfg)
|
||||
return split_configs
|
||||
|
||||
def get_size(self, dataset: ConfigDict) -> int:
|
||||
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||
|
||||
test_range = dataset.reader_cfg.get('test_range', '')
|
||||
|
||||
if dataset_abbr in self.dataset_size:
|
||||
actual_size = eval('len(range(self.dataset_size[dataset_abbr])'
|
||||
f'{test_range})')
|
||||
return actual_size
|
||||
|
||||
dataset = build_dataset_from_cfg(dataset)
|
||||
self.dataset_size[dataset_abbr] = len(dataset.test)
|
||||
|
||||
mmengine.mkdir_or_exist('.cache/')
|
||||
mmengine.dump(self.dataset_size,
|
||||
self.dataset_size_path,
|
||||
indent=4,
|
||||
ensure_ascii=False)
|
||||
|
||||
actual_size = eval('len(range(self.dataset_size[dataset_abbr])'
|
||||
f'{test_range})')
|
||||
return actual_size
|
@ -12,7 +12,8 @@ from opencompass.registry import PARTITIONERS
|
||||
from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
|
||||
get_infer_output_path)
|
||||
|
||||
from .sub_naive import (SubjectiveNaivePartitioner, remove_already_tasks,
|
||||
from .sub_naive import (SubjectiveNaivePartitioner, get_model_combinations,
|
||||
remove_already_tasks,
|
||||
replicate_tasks_with_judge_models)
|
||||
|
||||
|
||||
@ -36,31 +37,31 @@ class SubjectiveSizePartitioner(SubjectiveNaivePartitioner):
|
||||
to the task config.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
mode: str,
|
||||
out_dir: str,
|
||||
models: Optional[List[ConfigDict]] = [],
|
||||
base_models: Optional[List[ConfigDict]] = [],
|
||||
compare_models: Optional[List[ConfigDict]] = [],
|
||||
judge_models: Optional[List[ConfigDict]] = [],
|
||||
meta_judge_model: Optional[ConfigDict] = None,
|
||||
model_pairs: Optional[List[Tuple]] = None,
|
||||
max_task_size: int = 40000,
|
||||
gen_task_coef: int = 20,
|
||||
strategy: str = 'heuristic',
|
||||
dataset_size_path: str = '.cache/dataset_size.json',
|
||||
keep_keys: Optional[List[str]] = None,
|
||||
infer_order: Optional[str] = 'random'):
|
||||
super().__init__(out_dir=out_dir,
|
||||
keep_keys=keep_keys,
|
||||
mode=mode,
|
||||
models=models,
|
||||
base_models=base_models,
|
||||
compare_models=compare_models,
|
||||
judge_models=judge_models,
|
||||
meta_judge_model=meta_judge_model,
|
||||
model_pairs=model_pairs,
|
||||
infer_order=infer_order)
|
||||
def __init__(
|
||||
self,
|
||||
out_dir: str,
|
||||
models: Optional[List[ConfigDict]] = [],
|
||||
base_models: Optional[List[ConfigDict]] = [],
|
||||
compare_models: Optional[List[ConfigDict]] = [],
|
||||
judge_models: Optional[List[ConfigDict]] = [],
|
||||
meta_judge_model: Optional[ConfigDict] = None,
|
||||
model_pairs: Optional[List[Tuple]] = None,
|
||||
max_task_size: int = 40000,
|
||||
gen_task_coef: int = 20,
|
||||
strategy: str = 'heuristic',
|
||||
dataset_size_path: str = '.cache/dataset_size.json',
|
||||
keep_keys: Optional[List[str]] = None,
|
||||
):
|
||||
super().__init__(
|
||||
out_dir=out_dir,
|
||||
keep_keys=keep_keys,
|
||||
models=models,
|
||||
base_models=base_models,
|
||||
compare_models=compare_models,
|
||||
judge_models=judge_models,
|
||||
meta_judge_model=meta_judge_model,
|
||||
model_pairs=model_pairs,
|
||||
)
|
||||
self.max_task_size = max_task_size
|
||||
self.gen_task_coef = gen_task_coef
|
||||
self.dataset_size_path = dataset_size_path
|
||||
@ -105,76 +106,94 @@ class SubjectiveSizePartitioner(SubjectiveNaivePartitioner):
|
||||
models = self.models if self.models != [] else models
|
||||
base_models, compare_models = self.base_models, self.compare_models
|
||||
judge_models, meta_judge_model = self.judge_models, self.meta_judge_model
|
||||
if self.mode == 'singlescore':
|
||||
models = models
|
||||
else:
|
||||
models = super().get_model_combinations(models, base_models,
|
||||
compare_models)
|
||||
model_dataset_combinations = [{'models': models, 'datasets': datasets}]
|
||||
tasks = []
|
||||
for comb in model_dataset_combinations:
|
||||
comb['datasets'] = sorted(comb['datasets'],
|
||||
key=lambda x: self.get_cost(x),
|
||||
reverse=True)
|
||||
for model in comb['models']:
|
||||
chunks = [] # elements: tuple(size, dataset_chunk)
|
||||
for dataset in comb['datasets']:
|
||||
filename = get_infer_output_path(model, dataset, out_dir)
|
||||
# skip the task if the task output exists
|
||||
# if osp.exists(filename):
|
||||
# continue
|
||||
dataset_size = self.get_cost(dataset)
|
||||
if dataset_size > self.max_task_size:
|
||||
root, ext = osp.splitext(filename)
|
||||
dataset_splits = self.split_dataset(dataset)
|
||||
for i, dataset_split in enumerate(dataset_splits):
|
||||
if not osp.exists(f'{root}_{i}{ext}'):
|
||||
chunks.append(
|
||||
(self.max_task_size, dataset_split))
|
||||
else:
|
||||
chunks.append((dataset_size, dataset))
|
||||
self.max_task_size *= len(datasets)
|
||||
all_tasks = []
|
||||
for dataset in datasets:
|
||||
mode = dataset['mode']
|
||||
infer_order = dataset.get('infer_order', None)
|
||||
assert mode in ['singlescore', 'allpair', 'm2n', 'fixed']
|
||||
assert infer_order in ['random', 'double', None]
|
||||
if mode == 'singlescore':
|
||||
temp_models = models
|
||||
else:
|
||||
temp_models = get_model_combinations(mode, models,
|
||||
dataset['base_models'],
|
||||
models)
|
||||
model_dataset_combinations = [{
|
||||
'models': temp_models,
|
||||
'datasets': [dataset]
|
||||
}]
|
||||
|
||||
if self.strategy == 'heuristic':
|
||||
chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
|
||||
current_size, current_chunks = 0, []
|
||||
for index in range(len(chunks)):
|
||||
current_size += chunks[index][0]
|
||||
current_chunks.append(chunks[index][1])
|
||||
if index == len(chunks) - 1 or current_size + chunks[
|
||||
index + 1][0] > self.max_task_size:
|
||||
tasks = []
|
||||
for comb in model_dataset_combinations:
|
||||
comb['datasets'] = sorted(comb['datasets'],
|
||||
key=lambda x: self.get_cost(x),
|
||||
reverse=True)
|
||||
for model in comb['models']:
|
||||
chunks = [] # elements: tuple(size, dataset_chunk)
|
||||
for dataset in comb['datasets']:
|
||||
filename = get_infer_output_path(
|
||||
model, dataset, out_dir)
|
||||
# skip the task if the task output exists
|
||||
# if osp.exists(filename):
|
||||
# continue
|
||||
dataset_size = self.get_cost(dataset)
|
||||
if dataset_size > self.max_task_size:
|
||||
root, ext = osp.splitext(filename)
|
||||
dataset_splits = self.split_dataset(dataset)
|
||||
for i, dataset_split in enumerate(dataset_splits):
|
||||
if not osp.exists(f'{root}_{i}{ext}'):
|
||||
chunks.append(
|
||||
(self.max_task_size, dataset_split))
|
||||
else:
|
||||
chunks.append((dataset_size, dataset))
|
||||
|
||||
if self.strategy == 'heuristic':
|
||||
chunks = sorted(chunks,
|
||||
key=lambda x: x[0],
|
||||
reverse=True)
|
||||
current_size, current_chunks = 0, []
|
||||
for index in range(len(chunks)):
|
||||
current_size += chunks[index][0]
|
||||
current_chunks.append(chunks[index][1])
|
||||
if index == len(
|
||||
chunks) - 1 or current_size + chunks[
|
||||
index + 1][0] > self.max_task_size:
|
||||
tasks.append(
|
||||
Config({
|
||||
'models': [model],
|
||||
'datasets': [current_chunks],
|
||||
'work_dir': work_dir,
|
||||
**add_cfg
|
||||
}))
|
||||
current_size, current_chunks = 0, []
|
||||
elif self.strategy == 'split':
|
||||
for _, dataset in chunks:
|
||||
tasks.append(
|
||||
Config({
|
||||
'models': [model],
|
||||
'datasets': [current_chunks],
|
||||
'datasets': [[dataset]],
|
||||
'work_dir': work_dir,
|
||||
**add_cfg
|
||||
}))
|
||||
current_size, current_chunks = 0, []
|
||||
elif self.strategy == 'split':
|
||||
for _, dataset in chunks:
|
||||
tasks.append(
|
||||
Config({
|
||||
'models': [model],
|
||||
'datasets': [[dataset]],
|
||||
'work_dir': work_dir,
|
||||
**add_cfg
|
||||
}))
|
||||
|
||||
tasks = replicate_tasks_with_judge_models(tasks, judge_models,
|
||||
meta_judge_model)
|
||||
tasks = remove_already_tasks(tasks, work_dir, meta_judge_model)
|
||||
tasks = replicate_tasks_with_judge_models(tasks, judge_models,
|
||||
meta_judge_model)
|
||||
tasks = remove_already_tasks(tasks, work_dir, meta_judge_model)
|
||||
|
||||
if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
|
||||
tasks[0], list):
|
||||
# Refers to the meta-review judge stage
|
||||
for task_stage in tasks:
|
||||
for task in task_stage:
|
||||
task['infer_order'] = self.infer_order
|
||||
else:
|
||||
# Refers to having only the review-judge stage
|
||||
for task in tasks:
|
||||
task['infer_order'] = self.infer_order
|
||||
return tasks
|
||||
if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
|
||||
tasks[0], list):
|
||||
# Refers to the meta-review judge stage
|
||||
for task_stage in tasks:
|
||||
for task in task_stage:
|
||||
task['infer_order'] = infer_order
|
||||
else:
|
||||
# Refers to having only the review-judge stage
|
||||
for task in tasks:
|
||||
task['infer_order'] = infer_order
|
||||
|
||||
all_tasks += tasks
|
||||
return all_tasks
|
||||
|
||||
@property
|
||||
def dataset_size(self):
|
||||
|
@ -5,13 +5,12 @@ from .alpacaeval import AlpacaSummarizer
|
||||
from .arenahard import ArenaHardSummarizer
|
||||
from .compass_arena import CompassArenaSummarizer
|
||||
from .compassbench import CompassBenchSummarizer
|
||||
from .compassbench_th import CompassBenchTHSummarizer
|
||||
from .corev2 import Corev2Summarizer
|
||||
from .creationbench import CreationBenchSummarizer
|
||||
from .flames import FlamesSummarizer
|
||||
from .fofo import FofoSummarizer
|
||||
from .information_retrival import IRSummarizer
|
||||
from .mtbench import MTBenchSummarizer
|
||||
from .mtbench101 import MTBench101Summarizer
|
||||
from .multiround import MultiroundSummarizer
|
||||
from .subjective import SubjectiveSummarizer
|
||||
from .wildbench import WildBenchPairSummarizer, WildBenchSingleSummarizer
|
||||
|
@ -271,7 +271,6 @@ def get_capability_results(judged_answers,
|
||||
capability_avg_ratings['总分'] /= len(temp_list)
|
||||
capability_avg_ratings['总分'] = round(capability_avg_ratings['总分'], 2)
|
||||
scores = {model: capability_avg_ratings}
|
||||
|
||||
with open(fout, 'a+', newline='') as csvfile:
|
||||
writer = csv.writer(csvfile)
|
||||
if fout_flag == 0:
|
||||
@ -298,6 +297,15 @@ def get_capability_results(judged_answers,
|
||||
row.append(scores[model][sub_category])
|
||||
writer.writerow(row)
|
||||
|
||||
scores = scores[model]
|
||||
scores.pop('中文推理总分', None)
|
||||
scores.pop('中文语言总分', None)
|
||||
|
||||
# Creating a new dictionary with '总分' as the first item
|
||||
updated_scores = {'总分': scores.pop('总分')}
|
||||
updated_scores.update(scores)
|
||||
return updated_scores
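The tail of `get_capability_results` just rebuilds the dict so the overall score comes first; a toy run with invented values:

# Toy illustration of the reordering above (scores are made up).
scores = {'中文推理': 6.1, '总分': 6.5, '中文语言': 6.9,
          '中文推理总分': 6.1, '中文语言总分': 6.9}
scores.pop('中文推理总分', None)
scores.pop('中文语言总分', None)
updated_scores = {'总分': scores.pop('总分')}
updated_scores.update(scores)
print(updated_scores)  # {'总分': 6.5, '中文推理': 6.1, '中文语言': 6.9}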
|
||||
|
||||
|
||||
class AlignmentBenchSummarizer:
|
||||
"""Do the subjectivity analyze based on evaluation results.
|
||||
@ -338,42 +346,42 @@ class AlignmentBenchSummarizer:
|
||||
Returns:
|
||||
pd.DataFrame: The summary results.
|
||||
"""
|
||||
all_scores = {}
|
||||
for judge_model in self.judge_models:
|
||||
score_by_judgemodel = {}
|
||||
judge_abbr = model_abbr_from_cfg(judge_model)
|
||||
dataset_cfgs = self.cfg['datasets']
|
||||
dataset = dataset_cfgs[0]  # AlignmentBench has only one subfile
|
||||
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
||||
fout_flag, fout_flag2 = 0, 0
|
||||
if self.judge_type == 'general':
|
||||
fout = osp.join(
|
||||
output_dir,
|
||||
'Alignbench-judged-by--' + judge_abbr + '-dimension.csv')
|
||||
fout2 = osp.join(
|
||||
output_dir,
|
||||
'Alignbench-judged-by--' + judge_abbr + '-capability.csv')
|
||||
|
||||
for eval_model_abbr in self.eval_model_abbrs:
|
||||
subdir = eval_model_abbr + '_judged-by--' + judge_abbr
|
||||
subdir_path = os.path.join(results_folder, subdir)
|
||||
model = eval_model_abbr
|
||||
if os.path.isdir(subdir_path):
|
||||
model = eval_model_abbr
|
||||
judged_answers, references = get_judgeanswer_and_reference(
|
||||
dataset, subdir_path, self.judge_function)
|
||||
if self.judge_type == 'general':
|
||||
fout = osp.join(
|
||||
output_dir,
|
||||
'judged-by--' + judge_abbr + '-dimension.csv')
|
||||
fout2 = osp.join(
|
||||
output_dir,
|
||||
'judged-by--' + judge_abbr + '-capability.csv')
|
||||
for dataset in dataset_cfgs:
|
||||
judged_answers, references = get_judgeanswer_and_reference(
|
||||
dataset, subdir_path, self.judge_function)
|
||||
if self.judge_type == 'general':
|
||||
get_dimension_results(judged_answers, references,
|
||||
fout, fout_flag, model)
|
||||
fout_flag += 1
|
||||
get_capability_results(judged_answers, references,
|
||||
fout2, fout_flag2, model,
|
||||
self.category)
|
||||
fout_flag2 += 1
|
||||
get_dimension_results(judged_answers, references, fout,
|
||||
fout_flag, model)
|
||||
fout_flag += 1
|
||||
scores = get_capability_results(judged_answers, references,
|
||||
fout2, fout_flag2, model,
|
||||
self.category)
|
||||
|
||||
score_by_judgemodel[model] = scores
|
||||
fout_flag2 += 1
|
||||
else:
|
||||
score_by_judgemodel[model] = None
|
||||
print(subdir_path + ' does not exist! Please check!')
|
||||
if self.judge_type == 'general':
|
||||
with open(fout, 'r') as f:
|
||||
x = from_csv(f, delimiter=',')
|
||||
print(x)
|
||||
print(fout)
|
||||
with open(fout2, 'r') as f:
|
||||
x = from_csv(f, delimiter=',')
|
||||
print(x)
|
||||
print(fout2)
|
||||
|
||||
all_scores[judge_abbr] = score_by_judgemodel
|
||||
return {'Alignbench': all_scores}
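For orientation, the value returned here is a nested mapping: benchmark name -> judge model -> evaluated model -> capability scores. The abbreviations and numbers below are hypothetical.

# Hypothetical shape of the returned scores.
example_return = {
    'Alignbench': {
        'gpt-4-judge': {                 # one entry per judge model
            'my-chat-model': {           # one entry per evaluated model
                '总分': 6.5,
                '中文推理': 6.1,
                '中文语言': 6.9,
            },
            'missing-model': None,       # results folder absent -> None
        },
    },
}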
|
||||
|
@ -80,10 +80,9 @@ class AlpacaSummarizer:
|
||||
def __init__(self, config: ConfigDict, judge_type='v2') -> None:
|
||||
self.tasks = []
|
||||
self.cfg = config
|
||||
self.base_models = self.cfg['eval']['partitioner']['base_models']
|
||||
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
|
||||
self.judge_abbr = model_abbr_from_cfg(
|
||||
self.cfg['judge_models'][0]) # We will reorganize the summarizers
|
||||
self.base_models = self.cfg['datasets'][0]['base_models']
|
||||
self.compare_models = self.cfg['eval']['partitioner']['models']
|
||||
self.judge_models = self.cfg.get('judge_models', None)
|
||||
self.judge_type = judge_type
|
||||
assert self.judge_type in ['v1', 'v2']
|
||||
self.judge_map = {
|
||||
@ -102,22 +101,34 @@ class AlpacaSummarizer:
|
||||
Returns:
|
||||
pd.DataFrame: The summary results.
|
||||
"""
|
||||
dataset_cfgs = self.cfg['datasets']
|
||||
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
||||
model_combinations = list(
|
||||
product(self.base_models, self.compare_models))
|
||||
unique_combinations = remove_duplicate_pairs(
|
||||
[combo for combo in model_combinations if combo[0] != combo[1]])
|
||||
all_scores = {}
|
||||
for judge_model in self.judge_models:
|
||||
score_by_judgemodel = {}
|
||||
judge_abbr = model_abbr_from_cfg(judge_model)
|
||||
dataset_cfgs = self.cfg['datasets']
|
||||
dataset = dataset_cfgs[0]  # AlpacaEval has only one subfile
|
||||
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
||||
model_combinations = list(
|
||||
product(self.base_models, self.compare_models))
|
||||
unique_combinations = remove_duplicate_pairs([
|
||||
combo for combo in model_combinations if combo[0] != combo[1]
|
||||
])
|
||||
|
||||
for model_pair in unique_combinations:
|
||||
model1, model2 = model_pair[0]['abbr'], model_pair[1]['abbr']
|
||||
subdir = model1 + '_' + model2 + '_judged-by--' + judge_abbr
|
||||
subdir_path = os.path.join(results_folder, subdir)
|
||||
filename = osp.realpath(
|
||||
osp.join(subdir_path, dataset_abbr + '.json'))
|
||||
partial_filename = osp.realpath(
|
||||
osp.join(subdir_path, dataset_abbr + '_0.json'))
|
||||
if osp.exists(osp.realpath(filename)) or osp.exists(
|
||||
osp.realpath(partial_filename)):
|
||||
fout = osp.join(
|
||||
output_dir,
|
||||
'AlpacaEval2-judged-by--' + judge_abbr + '.csv')
|
||||
|
||||
for model_pair in unique_combinations:
|
||||
model1, model2, judge_model = model_pair[0]['abbr'], model_pair[1][
|
||||
'abbr'], self.judge_abbr
|
||||
subdir = model1 + '_' + model2 + '_judged-by--' + self.judge_abbr
|
||||
subdir_path = os.path.join(results_folder, subdir)
|
||||
if os.path.isdir(subdir_path):
|
||||
fout = osp.join(output_dir,
|
||||
'judged-by--' + judge_model + '-report.csv')
|
||||
for dataset in dataset_cfgs:
|
||||
judged_answers, references = get_judgeanswer_and_reference(
|
||||
dataset, subdir_path, self.judge_function)
|
||||
win_model1, win_model2, categories = defaultdict(
|
||||
@ -155,9 +166,11 @@ class AlpacaSummarizer:
|
||||
win_model2[capability] = round(
|
||||
(win_model2[capability] /
|
||||
categories[capability]) * 100, 2)
|
||||
|
||||
scores = {
|
||||
'win_' + model1: win_model1,
|
||||
'win_' + model2: win_model2
|
||||
# 'win_' + model1: win_model1,  # only model2's win rate is reported, since model1 is the single base model in AlpacaEval
|
||||
'win_' + model2:
|
||||
win_model2
|
||||
}
|
||||
rows = list(scores.keys())
|
||||
columns = list(scores[rows[0]].keys())
|
||||
@ -169,8 +182,11 @@ class AlpacaSummarizer:
|
||||
writer.writerow(
|
||||
[row] +
|
||||
[scores[row][column] for column in columns])
|
||||
else:
|
||||
print(subdir_path + ' does not exist! Please check!')
|
||||
with open(fout, 'r') as f:
|
||||
x = from_csv(f)
|
||||
print(x)
|
||||
win_model2_update = {'total': win_model2.pop('total')}
|
||||
win_model2_update.update(win_model2)
|
||||
score_by_judgemodel[model2] = win_model2_update
|
||||
else:
|
||||
score_by_judgemodel[model2] = None
|
||||
# print(subdir_path + ' does not exist! Please check!')
|
||||
all_scores[judge_abbr] = score_by_judgemodel
|
||||
return {'AlpacaEval': all_scores}
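The per-capability values written to the CSV are simple percentages of judged questions won; a toy computation matching the rounding above (counts invented, ties may contribute 0.5 depending on the summary type):

# Toy win-rate computation mirroring the loop above.
win_model2 = {'total': 95.0, 'math': 20.0}   # raw win counts for model2
categories = {'total': 160.0, 'math': 40.0}  # judged questions per capability
for capability in categories:
    win_model2[capability] = round(
        (win_model2[capability] / categories[capability]) * 100, 2)
print(win_model2)  # {'total': 59.38, 'math': 50.0}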
|
||||
|
@ -232,8 +232,8 @@ class ArenaHardSummarizer:
|
||||
summary_type='single') -> None:
|
||||
self.tasks = []
|
||||
self.cfg = config
|
||||
self.base_models = self.cfg['eval']['partitioner']['base_models']
|
||||
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
|
||||
self.base_models = self.cfg['datasets'][0]['base_models']
|
||||
self.compare_models = self.cfg['eval']['partitioner']['models']
|
||||
self.judge_models = self.cfg.get('judge_models', None)
|
||||
self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
|
||||
self.judge_type = judge_type
|
||||
@ -251,23 +251,28 @@ class ArenaHardSummarizer:
|
||||
if self.meta_judge_model is not None:
|
||||
self.judge_models.append(self.meta_judge_model)
|
||||
|
||||
scores = {}
|
||||
all_scores = {}
|
||||
|
||||
for idx, judge_model_cfg in enumerate(self.judge_models):
|
||||
score_by_judgemodel = {}
|
||||
judge_model = model_abbr_from_cfg(judge_model_cfg)
|
||||
for dataset in self.cfg['datasets']:
|
||||
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||
battles = pd.DataFrame()
|
||||
print('Turning judgment results into battles...')
|
||||
for model_pair in unique_combinations:
|
||||
model1 = model_pair[0]['abbr']
|
||||
model2 = model_pair[1]['abbr']
|
||||
model1 = model_pair[0]['abbr'] # base model, in ArenaHard it is gpt4-0314
|
||||
model2 = model_pair[1]['abbr']  # compare model, i.e. the model under evaluation
|
||||
if idx == len(self.judge_models):
|
||||
subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
|
||||
else:
|
||||
subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
|
||||
subdir_path = os.path.join(results_folder, subdir)
|
||||
if not os.path.isdir(subdir_path):
|
||||
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||
filename = osp.realpath(osp.join(subdir_path, dataset_abbr + '.json'))
|
||||
partial_filename = osp.realpath(osp.join(subdir_path, dataset_abbr + '_0.json'))
|
||||
if not osp.exists(osp.realpath(filename)) and not osp.exists(osp.realpath(partial_filename)):
|
||||
score_by_judgemodel[model2] = None
|
||||
print(subdir_path + ' does not exist! Please check!')
|
||||
continue
|
||||
|
||||
@ -279,7 +284,7 @@ class ArenaHardSummarizer:
|
||||
|
||||
np.random.seed(42)
|
||||
bootstrap_elo_lu = get_bootstrap_result(battles, compute_mle_elo, 100)
|
||||
bootstrap_elo_lu.to_json(os.path.join(output_dir,'bootstrapping_results'+ judge_model+'.jsonl'), lines=True, orient='records')
|
||||
bootstrap_elo_lu.to_json(os.path.join(output_dir,'arena_hard_bootstrapping_results_judged-by--'+ judge_model+'.jsonl'), lines=True, orient='records')
|
||||
|
||||
stats = pd.DataFrame()
|
||||
stats['results'] = None
|
||||
@ -292,8 +297,11 @@ class ArenaHardSummarizer:
|
||||
stats.at[i, 'score'] = bootstrap_online_elo[model]
|
||||
stats.at[i, 'lower'] = np.percentile(bootstrap_elo_lu[model], 2.5)
|
||||
stats.at[i, 'upper'] = np.percentile(bootstrap_elo_lu[model], 97.5)
|
||||
if model == 'gpt4-0314':
|
||||
stats.at[i, 'avg_tokens'] = 423
|
||||
if model == model1:
|
||||
if model1 == 'gpt4-0314':
|
||||
stats.at[i, 'avg_tokens'] = 423
|
||||
else:
|
||||
stats.at[i, 'avg_tokens'] = 0  # unexpected base model, token count unknown
|
||||
else:
|
||||
file_name = os.path.join(output_dir.split('summary')[0], 'predictions', model, dataset_abbr+'.json')
|
||||
model_preds = load_model_preds(file_name)
|
||||
@ -304,16 +312,20 @@ class ArenaHardSummarizer:
|
||||
stats.at[i, 'avg_tokens'] = pred_length
|
||||
stats.at[i, 'results'] = bootstrap_elo_lu[model].tolist()
|
||||
stats.sort_values(by='model', inplace=True)
|
||||
stats['score'] = get_win_rate_column(stats, 'score', 'gpt4-0314').tolist()
|
||||
stats['lower'] = get_win_rate_column(stats, 'lower', 'gpt4-0314').tolist()
|
||||
stats['upper'] = get_win_rate_column(stats, 'upper', 'gpt4-0314').tolist()
|
||||
stats['score'] = get_win_rate_column(stats, 'score', model1).tolist()
|
||||
stats['lower'] = get_win_rate_column(stats, 'lower', model1).tolist()
|
||||
stats['upper'] = get_win_rate_column(stats, 'upper', model1).tolist()
|
||||
decimal = 1
|
||||
stats.sort_values(by='score', ascending=False, inplace=True)
|
||||
for _, row in stats.iterrows():
|
||||
interval = str((round(row['lower'] - row['score'], decimal), round(row['upper'] - row['score'], decimal)))
|
||||
print(f"{row['model'] : <30} | score: {round(row['score'], decimal) : ^5} | 95% CI: {interval : ^12} | average #tokens: {int(row['avg_tokens'])}")
|
||||
if row['model'] != model1:
|
||||
score_by_judgemodel[row['model']] = {'score': row['score']}
|
||||
stats.to_json(os.path.join(output_dir,'arena_hard_leaderboard_judged-by--'+judge_model+'.json'), orient='records', indent=4)
|
||||
stats.to_csv(os.path.join(output_dir,'arena_hard_leaderboard_judged-by--'+judge_model+'.csv'))
|
||||
all_scores[judge_model] = score_by_judgemodel
|
||||
return {'ArenaHard': all_scores}
|
||||
|
||||
def summarize(
|
||||
self,
|
||||
@ -327,4 +339,4 @@ class ArenaHardSummarizer:
|
||||
Returns:
|
||||
pd.DataFrame: The summary results.
|
||||
"""
|
||||
self.get_score(time_str)
|
||||
return self.get_score(time_str)
|
||||
|
@ -70,8 +70,8 @@ class CompassArenaSummarizer:
|
||||
summary_type='single') -> None:
|
||||
self.tasks = []
|
||||
self.cfg = config
|
||||
self.base_models = self.cfg['eval']['partitioner']['base_models']
|
||||
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
|
||||
self.base_models = self.cfg['datasets'][0]['base_models']
|
||||
self.compare_models = self.cfg['eval']['partitioner']['models']
|
||||
self.judge_models = self.cfg.get('judge_models', None)
|
||||
self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
|
||||
self.judge_type = judge_type
|
||||
@ -107,6 +107,9 @@ class CompassArenaSummarizer:
|
||||
print(subdir_path + ' does not exist! Please check!')
|
||||
continue
|
||||
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
|
||||
if len(judged_answers) == 0:
|
||||
scores[judge_model][dataset_abbr][model2] = {}
|
||||
continue
|
||||
if self.check_pos_bias:
|
||||
bias_num = check_position_bias(judged_answers, references)
|
||||
else:
|
||||
@ -175,8 +178,9 @@ class CompassArenaSummarizer:
|
||||
# scores['win_' + model1] = win_model1
|
||||
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
||||
|
||||
|
||||
all_scores = {}
|
||||
for idx, judge_model in enumerate(self.judge_models):
|
||||
score_by_judgemodel = {}
|
||||
judge_abbr = model_abbr_from_cfg(judge_model)
|
||||
for dataset in self.cfg['datasets']:
|
||||
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||
@ -198,18 +202,16 @@ class CompassArenaSummarizer:
|
||||
row.append(s)
|
||||
table.append(row)
|
||||
txt = tabulate(table, headers=headers)
|
||||
print(txt)
|
||||
|
||||
if idx == len(self.judge_models):
|
||||
output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
|
||||
output_filename = osp.join(output_dir, dataset_abbr + '-summarized-by--' + judge_abbr + '-report.csv')
|
||||
else:
|
||||
output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
|
||||
output_filename = osp.join(output_dir, dataset_abbr + '-judged-by--' + judge_abbr + '-report.csv')
|
||||
|
||||
with open(output_filename, 'w') as f:
|
||||
f.write(','.join(headers) + '\n')
|
||||
for line in table:
|
||||
f.write(','.join(line) + '\n')
|
||||
print(output_filename)
|
||||
|
||||
table = []
|
||||
summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
|
||||
@ -227,14 +229,21 @@ class CompassArenaSummarizer:
|
||||
row.append(s)
|
||||
table.append(row)
|
||||
txt = tabulate(table, headers=headers)
|
||||
print(txt)
|
||||
|
||||
if idx == len(self.judge_models):
|
||||
output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-overall-report.csv')
|
||||
output_filename = osp.join(output_dir, 'compassarena-overall-summarized-by--' + judge_abbr + '.csv')
|
||||
else:
|
||||
output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-overall-report.csv')
|
||||
output_filename = osp.join(output_dir, 'compassarena-overall-judged-by--' + judge_abbr + '.csv')
|
||||
|
||||
table = [[row[0]] + [f'{x:.2f}' if not isinstance(x, str) else x for x in row[1:]] for row in table]
|
||||
with open(output_filename, 'w') as f:
|
||||
f.write(','.join(headers) + '\n')
|
||||
for line in table:
|
||||
f.write(','.join(line) + '\n')
|
||||
print(output_filename)
|
||||
|
||||
for idx, model in enumerate(summarizer_model_abbrs):
|
||||
score_by_judgemodel[model] = {}
|
||||
for subset in table:
|
||||
score_by_judgemodel[model][subset[0]] = subset[idx+1]
|
||||
all_scores[judge_abbr]=score_by_judgemodel
|
||||
return {'CompassArena': all_scores}
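The final loop above transposes the overall table (rows are subsets, columns are models) into one dict per model; a minimal sketch with made-up values:

# Sketch of the table -> per-model dict transposition (values invented).
table = [['knowledge', '55.00', '48.50'],
         ['reasoning', '61.25', '52.00']]
summarizer_model_abbrs = ['model-A', 'model-B']
score_by_judgemodel = {}
for idx, model in enumerate(summarizer_model_abbrs):
    score_by_judgemodel[model] = {}
    for subset in table:
        score_by_judgemodel[model][subset[0]] = subset[idx + 1]
print(score_by_judgemodel['model-A'])  # {'knowledge': '55.00', 'reasoning': '61.25'}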
|
||||
|
@ -71,8 +71,8 @@ class CompassBenchSummarizer:
|
||||
summary_type='single') -> None:
|
||||
self.tasks = []
|
||||
self.cfg = config
|
||||
self.base_models = self.cfg['eval']['partitioner']['base_models']
|
||||
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
|
||||
self.base_models = self.cfg['datasets'][0]['base_models']
|
||||
self.compare_models = self.cfg['eval']['partitioner']['models']
|
||||
self.judge_models = self.cfg.get('judge_models', None)
|
||||
self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
|
||||
self.judge_type = judge_type
|
||||
@ -108,6 +108,9 @@ class CompassBenchSummarizer:
|
||||
print(subdir_path + ' does not exist! Please check!')
|
||||
continue
|
||||
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
|
||||
if len(judged_answers) == 0:
|
||||
scores[judge_model][dataset_abbr][model2] = {}
|
||||
continue
|
||||
if self.check_pos_bias:
|
||||
bias_num = check_position_bias(judged_answers, references)
|
||||
else:
|
||||
@ -189,8 +192,9 @@ class CompassBenchSummarizer:
|
||||
# scores['win_' + model1] = win_model1
|
||||
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
||||
all_judge_file_list = []
|
||||
|
||||
all_scores = {}
|
||||
for idx, judge_model in enumerate(self.judge_models):
|
||||
score_by_judgemodel = {}
|
||||
judge_abbr = model_abbr_from_cfg(judge_model)
|
||||
for dataset in self.cfg['datasets']:
|
||||
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||
@ -220,24 +224,25 @@ class CompassBenchSummarizer:
|
||||
# print(txt)
|
||||
|
||||
if idx == len(self.judge_models):
|
||||
output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
|
||||
output_filename = osp.join(output_dir, dataset_abbr + '-summarized-by--' + judge_abbr + '-report.csv')
|
||||
else:
|
||||
output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
|
||||
output_filename = osp.join(output_dir, dataset_abbr + '-judged-by--' + judge_abbr + '-report.csv')
|
||||
|
||||
with open(output_filename, 'w') as f:
|
||||
f.write(','.join(headers) + '\n')
|
||||
for line in table:
|
||||
f.write(','.join(line) + '\n')
|
||||
print(output_filename)
|
||||
all_judge_file_list.append(output_filename)
|
||||
|
||||
for idx, model in enumerate(summarizer_model_abbrs):
|
||||
score_by_judgemodel[model] = {'overall': table[0][idx+1]}
|
||||
all_scores[judge_abbr]=score_by_judgemodel
|
||||
dfs = [pd.read_csv(file) for file in all_judge_file_list]
|
||||
|
||||
if len(dfs) > 1:
|
||||
average_df = copy.deepcopy(dfs[0])
|
||||
for col in dfs[0].columns[1:]:
|
||||
for i in range(1, len(dfs[0])):
|
||||
for i in range(0, len(dfs[0])):
|
||||
average_df[col][i] = round(sum(df[col][i] for df in dfs) / len(dfs), 2)
|
||||
average_csv_path = osp.join(output_dir, 'Averaged-' + dataset_abbr + '-report.csv')
|
||||
average_csv_path = osp.join(output_dir, 'CompassBench-Averaged-' + dataset_abbr + '-report.csv')
|
||||
average_df.to_csv(average_csv_path, index=False)
|
||||
print(average_csv_path)
|
||||
return {'CompassBench': all_scores}
|
||||
|
@ -1,353 +0,0 @@
|
||||
# flake8: noqa
|
||||
# yapf: disable
|
||||
import copy
|
||||
import os
|
||||
import os.path as osp
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from itertools import product
|
||||
|
||||
import mmengine
|
||||
import pandas as pd
|
||||
from mmengine import ConfigDict
|
||||
from tabulate import tabulate
|
||||
|
||||
from opencompass.partitioners.sub_naive import remove_duplicate_pairs
|
||||
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
|
||||
|
||||
|
||||
def model_abbr_from_cfg_used_in_summarizer(model):
|
||||
if model.get('summarizer_abbr', None):
|
||||
return model['summarizer_abbr']
|
||||
else:
|
||||
return model_abbr_from_cfg(model)
|
||||
|
||||
def post_process_compass_arena(s):
|
||||
if result := re.findall(r'(?:选择:|Choice: )\[\[([ABC])\]\]', s):
|
||||
return result[0]
|
||||
else:
|
||||
return None
|
||||
|
||||
|
||||
|
||||
def get_outdir(cfg, time_str):
|
||||
"""Get out put path.
|
||||
|
||||
Args:
|
||||
cfg (ConfigDict): The running config.
|
||||
time_str (str): Current time.
|
||||
"""
|
||||
work_dir = cfg['work_dir']
|
||||
output_path = osp.join(work_dir, 'summary', f'summary_{time_str}.txt')
|
||||
output_dir = osp.join(osp.split(output_path)[0], f'{time_str}')
|
||||
mmengine.mkdir_or_exist(output_dir)
|
||||
results_folder = osp.join(work_dir, 'results')
|
||||
return output_dir, results_folder
|
||||
|
||||
|
||||
def get_judgeanswer_and_reference(dataset, subdir_path, post_process):
|
||||
"""Extract judgements (scores) and references.
|
||||
|
||||
Args:
|
||||
dataset (ConfigDict): Dataset config.
|
||||
subdir_path (str): Model path in results dir.
|
||||
post_process (function): The pre-defined extract function.
|
||||
"""
|
||||
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||
filename = osp.join(subdir_path, dataset_abbr + '.json')
|
||||
partial_filename = osp.join(subdir_path, dataset_abbr + '_0.json')
|
||||
if osp.exists(osp.realpath(filename)):
|
||||
result = mmengine.load(filename)
|
||||
elif osp.exists(osp.realpath(partial_filename)):
|
||||
filename = partial_filename
|
||||
result = {}
|
||||
i = 1
|
||||
partial_dict_flag = 0
|
||||
while osp.exists(osp.realpath(filename)):
|
||||
res = mmengine.load(filename)
|
||||
for k, v in res.items():
|
||||
result[partial_dict_flag] = v
|
||||
partial_dict_flag += 1
|
||||
filename = osp.join(subdir_path,
|
||||
dataset_abbr + '_' + str(i) + '.json')
|
||||
i += 1
|
||||
else:
|
||||
result = {}
|
||||
|
||||
if len(result) == 0:
|
||||
print('*' * 100)
|
||||
print('There are no results for ' + filename + ' or ' +
|
||||
partial_filename)
|
||||
print('*' * 100)
|
||||
assert len(result) > 0
|
||||
|
||||
judged_answers = []
|
||||
references = []
|
||||
result_items = []
|
||||
for k, v in result.items():
|
||||
processed_judge = post_process(v['prediction'])
|
||||
if processed_judge is not None:
|
||||
judged_answers.append(processed_judge)
|
||||
references.append(v['gold'])
|
||||
result_items.append(v)
|
||||
# else:
|
||||
# print(v['prediction'])
|
||||
# print('-' * 128)
|
||||
if len(judged_answers) != len(result):
|
||||
print(
|
||||
f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements, please check!'
|
||||
)
|
||||
if len(judged_answers) == 0:
|
||||
print('*' * 100)
|
||||
print(
|
||||
'There are no extracted judgements, please change your judge model or check your prompt!!!'
|
||||
)
|
||||
print('*' * 100)
|
||||
assert len(judged_answers) > 0
|
||||
return judged_answers, references, result_items
|
||||
|
||||
|
||||
|
||||
def check_position_bias(judged_answers, references, banned_choice=['C']):
|
||||
"""Check position bias for judgellm's judgement.
|
||||
|
||||
Args:
|
||||
judged_answers: The successfully extracted judgement.
|
||||
references: The references contain the original question, which is used to locate the same question across the two position-swapped judgements.
|
||||
"""
|
||||
position_bias_flag = 0
|
||||
position_bias_dict = {}
|
||||
for judge, ref in zip(judged_answers, references):
|
||||
question = ref['question']
|
||||
question_hash = hash(question)
|
||||
if question_hash not in position_bias_dict:
|
||||
position_bias_dict[question_hash] = {
|
||||
'question': question,
|
||||
'judge': judge
|
||||
}
|
||||
else:
|
||||
first_judge = position_bias_dict[question_hash]['judge']
|
||||
if judge == first_judge and first_judge not in banned_choice and judge not in banned_choice:
|
||||
# If the second (position-swapped) verdict matches the first one, position bias is present.
|
||||
position_bias_flag += 1
|
||||
return position_bias_flag
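A toy run of the check above (using the function as defined here): every question is judged twice with the two answers in swapped positions, so receiving the same letter both times, when it is not a banned tie choice, counts as position bias.

# Toy example: two questions, each judged twice with swapped answer order.
judged_answers = ['A', 'A',   # same verdict both times -> position bias
                  'A', 'B']   # flipped verdict         -> consistent
references = [{'question': 'q1'}, {'question': 'q1'},
              {'question': 'q2'}, {'question': 'q2'}]
print(check_position_bias(judged_answers, references))  # -> 1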
|
||||
|
||||
|
||||
def count_chinese_characters(text):
|
||||
words = re.findall(r'[\u4e00-\u9fff]', text)
|
||||
return len(words)
|
||||
|
||||
|
||||
def count_english_words(text):
|
||||
words = re.findall(r'\b[a-zA-Z]+\b', text)
|
||||
return len(words)
|
||||
|
||||
|
||||
class CompassBenchTHSummarizer:
|
||||
"""Do the subjectivity analyze based on evaluation results.
|
||||
|
||||
Args:
|
||||
config (ConfigDict): The configuration object of the evaluation task.
|
||||
It's expected to be filled out at runtime.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
config: ConfigDict,
|
||||
judge_type='general',
|
||||
check_pos_bias=True,
|
||||
summary_type='single',
|
||||
word_count_threshold=None) -> None:
|
||||
self.tasks = []
|
||||
self.cfg = config
|
||||
self.base_models = self.cfg['eval']['partitioner']['base_models']
|
||||
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
|
||||
self.judge_models = self.cfg.get('judge_models', None)
|
||||
self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
|
||||
self.judge_type = judge_type
|
||||
assert self.judge_type in ['general']
|
||||
self.judge_map = {'general': post_process_compass_arena}
|
||||
self.judge_function = self.judge_map[self.judge_type]
|
||||
self.check_pos_bias = check_pos_bias
|
||||
self.summary_type = summary_type
|
||||
self.word_count_threshold = word_count_threshold
|
||||
|
||||
def get_score(self, time_str):
|
||||
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
||||
model_combinations = list(product(self.base_models, self.compare_models))
|
||||
unique_combinations = remove_duplicate_pairs([combo for combo in model_combinations if combo[0] != combo[1]])
|
||||
|
||||
if self.meta_judge_model is not None:
|
||||
self.judge_models.append(self.meta_judge_model)
|
||||
|
||||
scores = {}
|
||||
|
||||
for idx, judge_model_cfg in enumerate(self.judge_models):
|
||||
judge_model = model_abbr_from_cfg(judge_model_cfg)
|
||||
for dataset in self.cfg['datasets']:
|
||||
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||
for model_pair in unique_combinations:
|
||||
model1 = model_pair[0]['abbr']
|
||||
model2 = model_pair[1]['abbr']
|
||||
if idx == len(self.judge_models):
|
||||
subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
|
||||
else:
|
||||
subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
|
||||
subdir_path = os.path.join(results_folder, subdir)
|
||||
if not os.path.isdir(subdir_path):
|
||||
print(subdir_path + ' is not exist! please check!')
|
||||
continue
|
||||
judged_answers, references, result_items = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
|
||||
if self.check_pos_bias:
|
||||
bias_num = check_position_bias(judged_answers, references)
|
||||
else:
|
||||
bias_num = 0
|
||||
win_model1 = defaultdict(float)
|
||||
win_model2 = defaultdict(float)
|
||||
categories = defaultdict(float)
|
||||
difficulties = defaultdict(float)
|
||||
languages = defaultdict(float)
|
||||
model1 = references[0]['answer1']
|
||||
model2 = references[0]['answer2']
|
||||
for prediction, reference, result_item in zip(judged_answers, references, result_items):
|
||||
categories[dataset_abbr] += 1
|
||||
categories[reference['category']] += 1
|
||||
difficulties['Level-' + str(reference['level'])] += 1
|
||||
languages['Lan-' + reference['lan']] += 1
|
||||
|
||||
if prediction == 'A':
|
||||
if reference['answer1'] == model1:
|
||||
score_1, score_2 = 1, 0
|
||||
else:
|
||||
score_1, score_2 = 0, 1
|
||||
elif prediction == 'B':
|
||||
if reference['answer1'] == model1:
|
||||
score_1, score_2 = 0, 1
|
||||
else:
|
||||
score_1, score_2 = 1, 0
|
||||
elif prediction == 'C':
|
||||
if self.summary_type == 'half_add':
|
||||
score_1, score_2 = 0.5, 0.5
|
||||
else:
|
||||
score_1, score_2 = 0, 0
|
||||
|
||||
# Apply length-based score correction
|
||||
if self.word_count_threshold is not None:
|
||||
try:
|
||||
if reference['lan'] == 'zh':
|
||||
answer1 = re.search(r'\[回答1开始\](.*)\[回答1结束\]', result_item['origin_prompt'][0]['prompt'], re.DOTALL | re.MULTILINE).group(1).strip()
|
||||
answer2 = re.search(r'\[回答2开始\](.*)\[回答2结束\]', result_item['origin_prompt'][0]['prompt'], re.DOTALL | re.MULTILINE).group(1).strip()
|
||||
else:
|
||||
answer1 = re.search(r'\[Response 1 Start\](.*)\[Response 1 End\]', result_item['origin_prompt'][0]['prompt'], re.DOTALL | re.MULTILINE).group(1).strip()
|
||||
answer2 = re.search(r'\[Response 2 Start\](.*)\[Response 2 End\]', result_item['origin_prompt'][0]['prompt'], re.DOTALL | re.MULTILINE).group(1).strip()
|
||||
word_count1 = count_chinese_characters(answer1) + count_english_words(answer1)
|
||||
word_count2 = count_chinese_characters(answer2) + count_english_words(answer2)
|
||||
if score_1 == 1 and score_2 == 0 and word_count1 - word_count2 > self.word_count_threshold:
|
||||
score_1, score_2 = 0.5, 0.5
|
||||
elif score_1 == 0 and score_2 == 1 and word_count2 - word_count1 > self.word_count_threshold:
|
||||
score_1, score_2 = 0.5, 0.5
|
||||
except Exception as e:
|
||||
print(e)
|
||||
from IPython import embed; embed(); exit()
|
||||
|
||||
win_model1[reference['category']] += score_1
|
||||
win_model1[dataset_abbr] += score_1
|
||||
win_model1['Level-' + str(reference['level'])] += score_1
|
||||
win_model1['Lan-' + reference['lan']] += score_1
|
||||
win_model2[reference['category']] += score_2
|
||||
win_model2[dataset_abbr] += score_2
|
||||
win_model2['Level-' + str(reference['level'])] += score_2
|
||||
win_model2['Lan-' + reference['lan']] += score_2
|
||||
for category in categories:
|
||||
win_model1[category] = win_model1[category] / categories[category] * 100
|
||||
win_model1[category] = round(win_model1[category], 2)
|
||||
win_model2[category] = win_model2[category] / categories[category] * 100
|
||||
win_model2[category] = round(win_model2[category], 2)
|
||||
win_model1['position_bias'] = bias_num
|
||||
win_model2['position_bias'] = bias_num
|
||||
for difficulty in difficulties:
|
||||
win_model1[difficulty] = win_model1[difficulty] / difficulties[difficulty] * 100
|
||||
win_model2[difficulty] = win_model2[difficulty] / difficulties[difficulty] * 100
|
||||
for language in languages:
|
||||
win_model1[language] = win_model1[language] / languages[language] * 100
|
||||
win_model2[language] = win_model2[language] / languages[language] * 100
|
||||
|
||||
if judge_model not in scores:
|
||||
scores[judge_model] = {}
|
||||
if dataset_abbr not in scores[judge_model]:
|
||||
scores[judge_model][dataset_abbr] = {}
|
||||
scores[judge_model][dataset_abbr][model2] = win_model2
|
||||
|
||||
return scores, difficulties, languages
|
||||
|
||||
def summarize(
|
||||
self,
|
||||
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
|
||||
):
|
||||
"""Summarize the subjectivity analysis based on evaluation results.
|
||||
|
||||
Args:
|
||||
time_str (str): Timestamp for file naming.
|
||||
|
||||
Returns:
|
||||
pd.DataFrame: The summary results.
|
||||
"""
|
||||
|
||||
|
||||
scores, difficulties, languages = self.get_score(time_str)
|
||||
# scores['win_' + model1] = win_model1
|
||||
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
||||
all_judge_file_list = []
|
||||
|
||||
for idx, judge_model in enumerate(self.judge_models):
|
||||
judge_abbr = model_abbr_from_cfg(judge_model)
|
||||
for dataset in self.cfg['datasets']:
|
||||
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||
summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
|
||||
one_column = list(scores[judge_abbr][dataset_abbr].values())[0]
|
||||
detail_headers = [i for i in one_column.keys() if i not in [dataset_abbr, 'position_bias'] and i not in difficulties and i not in languages]
|
||||
row_headers = [dataset_abbr, 'position_bias']
|
||||
for difficulty in difficulties:
|
||||
row_headers += [difficulty]
|
||||
for language in languages:
|
||||
row_headers += [language]
|
||||
row_headers += detail_headers
|
||||
headers = [''] + summarizer_model_abbrs
|
||||
table = []
|
||||
for row_header in row_headers:
|
||||
row = [row_header]
|
||||
for model_cfg in self.compare_models:
|
||||
model_abbr = model_abbr_from_cfg(model_cfg)
|
||||
s = scores[judge_abbr][dataset_abbr][model_abbr].get(row_header, '')
|
||||
if isinstance(s, float):
|
||||
s = f'{s:.2f}'
|
||||
if isinstance(s, int):
|
||||
s = str(s)
|
||||
row.append(s)
|
||||
table.append(row)
|
||||
txt = tabulate(table, headers=headers)
|
||||
# print(txt)
|
||||
|
||||
if idx == len(self.judge_models):
|
||||
output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
|
||||
else:
|
||||
output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
|
||||
|
||||
with open(output_filename, 'w') as f:
|
||||
f.write(','.join(headers) + '\n')
|
||||
for line in table:
|
||||
f.write(','.join(line) + '\n')
|
||||
print(output_filename)
|
||||
# print(output_filename)
|
||||
all_judge_file_list.append(output_filename)
|
||||
|
||||
|
||||
dfs = [pd.read_csv(file) for file in all_judge_file_list]
|
||||
|
||||
average_df = copy.deepcopy(dfs[0])
|
||||
for col in dfs[0].columns[1:]:
|
||||
for i in range(1, len(dfs[0])):
|
||||
average_df[col][i] = round(sum(df[col][i] for df in dfs) / len(dfs), 2)
|
||||
average_csv_path = osp.join(output_dir, 'Averaged-' + dataset_abbr + '-report.csv')
|
||||
average_df.to_csv(average_csv_path, index=False)
|
||||
print(average_csv_path)
|
@ -83,10 +83,13 @@ class FofoSummarizer:
|
||||
scores[domain].append(score)
|
||||
if format_type == 'general':
|
||||
scores[format_name].append(score)
|
||||
single_model_scores = {
|
||||
task: sum(score) / len(score)
|
||||
for task, score in scores.items()
|
||||
}
|
||||
if len(judged_answers) == 0:
|
||||
single_model_scores = {}
|
||||
else:
|
||||
single_model_scores = {
|
||||
task: sum(score) / len(score)
|
||||
for task, score in scores.items()
|
||||
}
|
||||
if judge_model not in total_scores:
|
||||
total_scores[judge_model] = {}
|
||||
if dataset_abbr not in total_scores[judge_model]:
|
||||
@ -107,11 +110,13 @@ class FofoSummarizer:
|
||||
Returns:
|
||||
pd.DataFrame: The summary results.
|
||||
"""
|
||||
all_scores = {}
|
||||
scores = self.get_score(time_str)
|
||||
print(scores)
|
||||
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
||||
for idx, judge_model in enumerate(self.judge_models):
|
||||
judge_abbr = model_abbr_from_cfg(judge_model)
|
||||
score_by_judgemodel = {}
|
||||
score_saver = {}
|
||||
for dataset in self.cfg['datasets']:
|
||||
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||
summarizer_model_abbrs = self.eval_model_abbrs
|
||||
@ -119,7 +124,7 @@ class FofoSummarizer:
|
||||
format_types = ['Json', 'CSV', 'XML', 'YAML', 'Markdown']
|
||||
row_headers = [
|
||||
i for i in one_column.keys()
|
||||
if i not in [dataset_abbr] + format_types
|
||||
if i not in [dataset_abbr] + format_types + ['overall']
|
||||
]
|
||||
row_headers = ['overall'] + format_types + row_headers
|
||||
headers = [dataset_abbr] + summarizer_model_abbrs
|
||||
@ -136,19 +141,24 @@ class FofoSummarizer:
|
||||
row.append(s)
|
||||
table.append(row)
|
||||
txt = tabulate(table, headers=headers)
|
||||
print(txt)
|
||||
|
||||
score_saver[dataset_abbr] = [s for s in table[0][1:]]
|
||||
if idx == len(self.judge_models):
|
||||
output_filename = osp.join(
|
||||
output_dir, 'summarized-by--' + judge_abbr + '-' +
|
||||
dataset_abbr + '-report.csv')
|
||||
output_dir, dataset_abbr + '-summarized-by--' +
|
||||
judge_abbr + '-report.csv')
|
||||
else:
|
||||
output_filename = osp.join(
|
||||
output_dir, 'judged-by--' + judge_abbr + '-' +
|
||||
dataset_abbr + '-report.csv')
|
||||
output_dir, dataset_abbr + '-judged-by--' +
|
||||
judge_abbr + '-report.csv')
|
||||
|
||||
with open(output_filename, 'w') as f:
|
||||
f.write(','.join(headers) + '\n')
|
||||
for line in table:
|
||||
f.write(','.join(line) + '\n')
|
||||
print(output_filename)
|
||||
for idx, model in enumerate(summarizer_model_abbrs):
|
||||
score_by_judgemodel[model] = {}
|
||||
for subset_name, subset_scores in score_saver.items():
|
||||
score_by_judgemodel[model][subset_name] = subset_scores[
|
||||
idx]
|
||||
all_scores[judge_abbr] = score_by_judgemodel
|
||||
return {'Fofo': all_scores}
|
||||
|
@ -1,138 +0,0 @@
|
||||
# flake8: noqa: E501
|
||||
import csv
|
||||
import os
|
||||
import os.path as osp
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
import numpy as np
|
||||
from mmengine import ConfigDict
|
||||
|
||||
try:
|
||||
from prettytable import from_csv
|
||||
except ImportError:
|
||||
from_csv = None
|
||||
|
||||
from opencompass.utils import model_abbr_from_cfg
|
||||
|
||||
from .subjective_post_process import post_process_autoj
|
||||
from .utils import get_judgeanswer_and_reference, get_outdir
|
||||
|
||||
|
||||
def post_process_ir(judgement: str):
|
||||
"""Input a string like below:
|
||||
|
||||
Conclusion: [[Correct]]\nReasoning: xxx
|
||||
and extract the score
|
||||
"""
|
||||
matches = re.findall(r'\[\[(.*?)\]\]', judgement)
|
||||
if matches:
|
||||
matches = matches[0]
|
||||
if matches in ['Correct', 'Wrong', '对', '错']:
|
||||
if matches == 'Correct' or matches == '对':
|
||||
return {'score': 1}
|
||||
else:
|
||||
return {'score': 0}
|
||||
else:
|
||||
return None
|
||||
else:
|
||||
return None
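For reference, the extractor above turns a judgement string into a score dict, or None when no verdict can be parsed; for example, assuming the function as defined here:

print(post_process_ir('Conclusion: [[Correct]]\nReasoning: the passage is relevant.'))
# {'score': 1}
print(post_process_ir('Conclusion: [[Wrong]]\nReasoning: the passage is unrelated.'))
# {'score': 0}
print(post_process_ir('No explicit verdict.'))
# None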
|
||||


def get_results(
    judged_answers,
    references,
    fout,
    fout_flag,
    model,
):
    capability_ratings = defaultdict(int)
    capability_counts = defaultdict(int)
    for ans, ref in zip(judged_answers, references):
        lan = ref['others']['lan']
        capability_ratings['total'] += ans['score']
        capability_counts['total'] += 1
        capability_ratings[lan] += ans['score']
        capability_counts[lan] += 1

    capability_avg_ratings = defaultdict(float)

    for capability, total_score in capability_ratings.items():
        capability_avg_ratings[
            capability] = total_score / capability_counts[capability]

    scores = {model: capability_avg_ratings}

    with open(fout, 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if fout_flag == 0:
            num_header = [str(i) for i in range(4)]
            writer.writerow(num_header)

            header = ['模型']
            for category in capability_avg_ratings:
                header.append(category)
            writer.writerow(header)

        row = [model]
        for category in capability_avg_ratings:
            row.append(scores[model][category])
        writer.writerow(row)


class IRSummarizer:
    """Do the subjectivity analyze based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
    """

    def __init__(self, config: ConfigDict, judge_type='autoj') -> None:
        self.tasks = []
        self.cfg = config
        self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
        self.eval_model_abbrs = [
            model_abbr_from_cfg(model) for model in self.eval_model_cfgs
        ]
        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
        self.judge_type = judge_type
        assert self.judge_type in ['general', 'autoj']
        self.judge_map = {
            'general': post_process_ir,
            'autoj': post_process_autoj,
        }
        self.judge_function = self.judge_map[self.judge_type]

    def summarize(self,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
        """Summarize the subjectivity analysis based on evaluation results.

        Args:
            time_str (str): Timestamp for file naming.

        Returns:
            pd.DataFrame: The summary results.
        """
        dataset_cfgs = self.cfg['datasets']
        output_dir, results_folder = get_outdir(self.cfg, time_str)
        fout_flag = 0
        for eval_model_abbr in self.eval_model_abbrs:
            subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                model, judge_model = eval_model_abbr, self.judge_abbr
                fout = osp.join(output_dir,
                                'judged-by--' + judge_model + '.csv')
                for dataset in dataset_cfgs:
                    judged_answers, references = get_judgeanswer_and_reference(
                        dataset, subdir_path, self.judge_function)
                    get_results(judged_answers, references, fout, fout_flag,
                                model)
                    fout_flag += 1
            else:
                print(subdir_path + ' is not exist! please check!')
        with open(fout, 'r') as f:
            x = from_csv(f)
            print(x)
@ -16,6 +16,7 @@ from opencompass.utils import model_abbr_from_cfg
from .compass_arena import CompassArenaSummarizer
from .utils import get_judgeanswer_and_reference, get_outdir

COLUMNS = ['total', 'writing', 'roleplay', 'reasoning', 'math', 'coding', 'extraction', 'stem', 'humanities']

def model_abbr_from_cfg_used_in_summarizer(model):
    if model.get('summarizer_abbr', None):
@ -57,22 +58,24 @@ def get_capability_results(
    fout_flag,
    model_abbr,
):
    columns = COLUMNS
    capability_ratings = defaultdict(int)
    capability_counts = defaultdict(int)
    for ans, ref in zip(judged_answers, references):
        capability_ratings['total'] += ans['score']
        capability_counts['total'] += 1
        capability_ratings[ref['capability']] += ans['score']
        capability_counts[ref['capability']] += 1

    capability_avg_ratings = defaultdict(float)
    if len(judged_answers) == 0:
        for column in columns:
            capability_avg_ratings[column] = ''
    else:
        for ans, ref in zip(judged_answers, references):
            capability_ratings['total'] += ans['score']
            capability_counts['total'] += 1
            capability_ratings[ref['capability']] += ans['score']
            capability_counts[ref['capability']] += 1

        for capability, total_score in capability_ratings.items():
            s = total_score / capability_counts[capability]
            s = round(s, 2)
            capability_avg_ratings[capability] = s
    columns = list(capability_avg_ratings.keys())
    columns.insert(0, columns.pop(columns.index('total')))
    for capability, total_score in capability_ratings.items():
        s = total_score / capability_counts[capability]
        s = round(s, 2)
        capability_avg_ratings[capability] = s

    with open(fout, 'a+', newline='') as csvfile:
        writer = csv.writer(csvfile)
@ -98,7 +101,7 @@ class MTBenchSummarizer(CompassArenaSummarizer):
        elif self.judge_type == 'pair':
            self.base_models = self.cfg['eval']['partitioner']['base_models']
            self.compare_models = self.cfg['eval']['partitioner']['compare_models']
        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
        self.judge_models = self.cfg.get('judge_models', None)
        self.judge_map = {
            'single': post_process_mtbench_single,
            'pair': post_process_mtbench_pair
@ -120,34 +123,34 @@ class MTBenchSummarizer(CompassArenaSummarizer):
        # self.judge_type == 'single'
        dataset_cfgs = self.cfg['datasets']
        output_dir, results_folder = get_outdir(self.cfg, time_str)
        fout_flag = 0
        for eval_model_cfg in self.eval_model_cfgs:
            eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
            show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
            subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + self.judge_abbr)
            if os.path.isdir(subdir_path):
                fout = osp.join(output_dir, 'judged-by--' + self.judge_abbr + '-capability.csv')
                overall_judged_answers, overall_references = [], []
                for dataset in dataset_cfgs:
                    judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
                    overall_judged_answers += judged_answers
                    overall_references += references
                get_capability_results(overall_judged_answers, overall_references, fout, fout_flag, show_model_abbr)
                fout_flag += 1
            else:
                print(subdir_path + ' is not exist! please check!')
        with open(fout, 'r') as f:
            csv_reader = csv.reader(f)
            header = next(csv_reader)
            table = [line for line in csv_reader]
        all_scores = {}
        for judge_model in self.judge_models:
            fout_flag = 0
            score_by_judgemodel = {}
            judge_abbr = model_abbr_from_cfg(judge_model)
            for eval_model_cfg in self.eval_model_cfgs:
                eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
                show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
                subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + judge_abbr)
                if os.path.isdir(subdir_path):
                    fout = osp.join(output_dir, 'MTBench-judged-by--' + judge_abbr + '-capability.csv')
                    overall_judged_answers, overall_references = [], []
                    for dataset in dataset_cfgs:
                        judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
                        overall_judged_answers += judged_answers
                        overall_references += references
                    get_capability_results(overall_judged_answers, overall_references, fout, fout_flag, show_model_abbr)
                    fout_flag += 1
                else:
                    print(subdir_path + ' is not exist! please check!')
            with open(fout, 'r') as f:
                csv_reader = csv.reader(f)
                header = next(csv_reader)
                table = [line for line in csv_reader]

            new_header = [''] + [line[0] for line in table]
            new_table = [[h] + line[1:] for h, line in zip(header[1:], table)]
            new_table = [[h] + [line[i] for line in table] for i, h in enumerate(header[1:], start=1)]
            t = tabulate(new_table, headers=new_header)
            with open(fout, 'w') as f:
                f.write(','.join(new_header) + '\n')
                for line in new_table:
                    f.write(','.join(map(str, line)) + '\n')
            print(t)
            print(fout)
            for model_score in table:
                score_by_judgemodel[model_score[0]] = {}
                for idx, column in enumerate(COLUMNS):
                    score_by_judgemodel[model_score[0]][column] = model_score[idx+1]
            all_scores[judge_abbr] = score_by_judgemodel
        return {'MTbench': all_scores}
@ -50,8 +50,8 @@ def post_process_mtbench101(judgement: str):
    return {'score': score, 'judgement': judgement}


def get_final_results(judged_answers, references, output_dir, fout_flag,
                      model):
def get_final_results(judged_answers, references, output_dir, fout_flag, model,
                      judgemodel):

    task_multi_id_scores = defaultdict(list)
    task_scores = defaultdict(list)
@ -72,22 +72,21 @@ def get_final_results(judged_answers, references, output_dir, fout_flag,
        task: sum(scores) / len(scores) if scores else 0
        for task, scores in task_scores.items()
    }

    fout = osp.join(output_dir, 'task_score.csv')
    average_score = round(
        sum(final_task_scores.values()) / len(final_task_scores), 2)
    fout = osp.join(output_dir,
                    'MTBench101-task_score-judged-by--' + judgemodel + '.csv')

    columns = list(final_task_scores.keys())

    print('================task_score=====================')
    print(final_task_scores)

    with open(fout, 'a+', newline='') as csvfile:

        writer = csv.writer(csvfile)
        if fout_flag == 0:
            writer.writerow(['model'] + columns)
        writer.writerow([model] +
            writer.writerow(['model', 'average'] + columns)
        writer.writerow([model, average_score] +
                        [final_task_scores[column] for column in columns])
    return 0
    return average_score


class MTBench101Summarizer(CompassArenaSummarizer):
@ -107,7 +106,7 @@ class MTBench101Summarizer(CompassArenaSummarizer):
        self.eval_model_abbrs = [
            model_abbr_from_cfg(model) for model in self.eval_model_cfgs
        ]

        self.judge_models = self.cfg.get('judge_models', None)
        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])

        self.judge_function = post_process_mtbench101
@ -122,21 +121,27 @@ class MTBench101Summarizer(CompassArenaSummarizer):
        Returns:
            pd.DataFrame: The summary results.
        """
        dataset_cfgs = self.cfg['datasets']
        dataset = self.cfg['datasets'][0]  # MTBench101 has just one subfile
        output_dir, results_folder = get_outdir(self.cfg, time_str)
        fout_flag = 0
        for eval_model_abbr in self.eval_model_abbrs:
            subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                model, judge_model = eval_model_abbr, self.judge_abbr

                for dataset in dataset_cfgs:
                    print()
        all_scores = {}
        for judge_model in self.judge_models:
            fout_flag = 0
            score_by_judgemodel = {}
            judge_abbr = model_abbr_from_cfg(judge_model)
            for eval_model_abbr in self.eval_model_abbrs:
                subdir = eval_model_abbr + '_judged-by--' + judge_abbr
                subdir_path = os.path.join(results_folder, subdir)
                if os.path.isdir(subdir_path):
                    judged_answers, references = get_judgeanswer_and_reference(
                        dataset, subdir_path, self.judge_function)
                    get_final_results(judged_answers, references, output_dir,
                                      fout_flag, model)
                    model_average_score = get_final_results(
                        judged_answers, references, output_dir, fout_flag,
                        eval_model_abbr, judge_abbr)
                    fout_flag += 1
            else:
                print(subdir_path + ' is not exist! please check!')
                    score_by_judgemodel[eval_model_abbr] = {
                        'average': model_average_score
                    }
                else:
                    print(subdir_path + ' is not exist! please check!')
            all_scores[judge_abbr] = score_by_judgemodel
        return {'MTBench101': all_scores}
105 opencompass/summarizers/subjective/subjective.py Normal file
@ -0,0 +1,105 @@
# flake8: noqa: E501
import os.path as osp
from datetime import datetime

import pandas as pd
from mmengine import ConfigDict

from .utils import get_outdir


# Flatten the nested structure and ensure consistent order of models across datasets
def flatten_data(data):
    flat_data = {}
    models_order = set()
    for dataset in data:
        for dataset_name, judgemodel_scores in dataset.items():
            for judgemodel_name, model_scores in judgemodel_scores.items():
                if judgemodel_name not in flat_data:
                    flat_data[judgemodel_name] = {}
                if dataset_name not in flat_data[judgemodel_name]:
                    flat_data[judgemodel_name][dataset_name] = {}
                for model_name, scores in model_scores.items():
                    models_order.add(model_name)
                    if scores is not None:
                        for score_name, score_value in scores.items():
                            flat_data[
                                judgemodel_name][dataset_name].setdefault(
                                    score_name,
                                    {}).setdefault(model_name, score_value)
                    else:
                        for score_name in flat_data[judgemodel_name][
                                dataset_name]:
                            flat_data[judgemodel_name][dataset_name][
                                score_name].setdefault(model_name, None)

    # Ensure consistent order of models
    consistent_models_order = sorted(list(models_order))

    for judgemodel_name in flat_data:
        for dataset_name in flat_data[judgemodel_name]:
            for score_name in flat_data[judgemodel_name][dataset_name]:
                for model_name in consistent_models_order:
                    flat_data[judgemodel_name][dataset_name][
                        score_name].setdefault(model_name, None)

    return flat_data, consistent_models_order
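To make the reshaping concrete, here is a small input and the result flatten_data would produce for it (all names and scores are invented for the sketch):

# Illustrative only: flatten_data regroups per-dataset score dicts into
# {judge_model: {dataset: {score_name: {model: value}}}}
scores = [
    {'alignment_bench': {'gpt4-judge': {'model-a': {'total': 6.1},
                                        'model-b': None}}},
]
flat, order = flatten_data(scores)
# flat == {'gpt4-judge': {'alignment_bench': {'total': {'model-a': 6.1,
#                                                       'model-b': None}}}}
# order == ['model-a', 'model-b']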


class SubjectiveSummarizer:
    """Do the subjectivity analyze based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
    """

    def __init__(self, config: ConfigDict, function: str) -> None:
        self.cfg = config
        self.function = function

    def summarize(
        self,
        subjective_scores: list,
        time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
    ):
        """Summarize the subjectivity analysis based on evaluation results.

        Args:
            subjective_scores (list of dicts): Container of saving score information for each datasets and models
            time_str (str): Timestamp for file naming.

        Returns:
            None
        """
        output_dir, results_folder = get_outdir(self.cfg, time_str)

        flat_data, models_order = flatten_data(subjective_scores)

        # Create a DataFrame for each judgemodel with models as rows and datasets as columns
        judgemodel_dfs_final_corrected = {}
        for judgemodel_name, datasets_scores in flat_data.items():
            dfs = {}  # Dictionary to hold DataFrames for each dataset
            for dataset_name, scores in datasets_scores.items():
                # Create a DataFrame with models as index and datasets as columns
                df = pd.DataFrame.from_dict(scores,
                                            orient='index',
                                            columns=models_order)
                # Insert a new row at the top for the dataset names
                df.insert(0, 'Detailed Scores', list(scores.keys()))
                df.insert(0, 'Dataset',
                          [dataset_name for _ in range(len(df.index))])
                dfs[dataset_name] = df

            # Concatenate all DataFrames for the current judgemodel
            judgemodel_df = pd.concat(dfs.values(), ignore_index=True)
            judgemodel_dfs_final_corrected[judgemodel_name] = judgemodel_df

        # Save each DataFrame to a separate CSV file
        for judgemodel_name, df in judgemodel_dfs_final_corrected.items():
            fout = osp.join(
                output_dir, 'Subjective_all_results-judged-by--' +
                judgemodel_name + '.csv')
            print('Your subjective evaluation results have been saved at ' +
                  str(fout))
            df.to_csv(fout, index=False)
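The class above is meant to be selected from an evaluation config rather than instantiated by hand. A minimal sketch of such a config entry, assuming SubjectiveSummarizer is exported from opencompass.summarizers like the other subjective summarizers and that the function keyword mirrors the __init__ signature shown above:

from opencompass.summarizers import SubjectiveSummarizer

# Hypothetical wiring in an eval config; the exact key names may differ
# in your setup.
summarizer = dict(type=SubjectiveSummarizer, function='subjective')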
@ -55,7 +55,6 @@ def get_judgeanswer_and_reference(dataset, subdir_path, post_process):
            print('There are no results for ' + filename + ' or ' +
                  partial_filename)
        print('*' * 100)
    assert len(result) > 0

    judged_answers = []
    references = []
@ -67,15 +66,10 @@ def get_judgeanswer_and_reference(dataset, subdir_path, post_process):
        # else:
        #     print(v['prediction'])
        #     print('-' * 128)
    if len(judged_answers) != len(result):
        print(
            f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements, please check!'
        )
    if len(judged_answers) == 0:
    if len(judged_answers) <= 0.95 * len(result):
        print('*' * 100)
        print(
            'There are no extracted judgements, please change your judge model or check your prompt!!!'
            f'For your {filename} judge. Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements, please check!'
        )
        print('*' * 100)
    assert len(judged_answers) > 0
    return judged_answers, references
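For reference, the threshold comparison introduced above boils down to the following standalone check (a sketch with illustrative names, not part of utils.py):

# Illustrative only: the warning fires when 5% or more of the judgements
# could not be parsed
def needs_warning(num_parsed: int, num_total: int, threshold: float = 0.95) -> bool:
    return num_parsed <= threshold * num_total

assert needs_warning(95, 100) is True   # 95 <= 95.0
assert needs_warning(96, 100) is False  # 96 > 95.0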
@ -56,7 +56,7 @@ class SubjectiveEvalTask(BaseTask):
        self.judge_cfg = copy.deepcopy(judge_cfg)
        self.judge_models = judge_models
        self.infer_order = cfg.get('infer_order')
        self.given_pred = cfg.eval.get('given_pred', [])
        self.given_pred = cfg['datasets'][0][0].get('given_pred', [])

    def get_command(self, cfg_path, template):
        """Get the command template for the task.