[Refactor] Reorganize subjective eval (#1284)

* fix pip version

* fix pip version

* reorganize subjective eval

* reorg sub

* reorg subeval

* reorg subeval

* update subjective doc

* reorg subeval

* reorg subeval
bittersweet1999 2024-07-05 22:11:37 +08:00 committed by GitHub
parent aadcfa625f
commit 68ca48496b
61 changed files with 1196 additions and 2589 deletions
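
In short: each subjective benchmark config now exports its own named dataset list (alignbench_datasets, alpacav2_datasets, arenahard_datasets, ...) and the per-benchmark settings (mode, summarizer, and base_models/given_pred for pairwise sets) move onto the dataset entries themselves, so a single entry config can import and concatenate several benchmarks. A minimal sketch of that usage, following the new configs/eval_subjective.py added below (import paths and names copied from the diff):

from mmengine.config import read_base

with read_base():
    # each benchmark config now exports its own list instead of a shared
    # `subjective_datasets` name, so several benchmarks can be mixed in one run
    from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
    from .datasets.subjective.fofo.fofo_judge import fofo_datasets

# the lists already carry their mode / summarizer (and base_models / given_pred
# for pairwise sets), so they are simply concatenated
datasets = [*alignbench_datasets, *fofo_datasets]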

View File

@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset
from opencompass.summarizers import AlignmentBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'ref'],
@ -14,7 +15,7 @@ subjective_all_sets = [
]
data_path ='data/subjective/alignment_bench'
subjective_datasets = []
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
@ -59,7 +60,7 @@ for _name in subjective_all_sets:
pred_role='BOT',
)
subjective_datasets.append(
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
@ -67,5 +68,7 @@ for _name in subjective_all_sets:
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='autoj')
))

View File

@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset
from opencompass.summarizers import AlignmentBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
@ -17,7 +18,7 @@ data_path ='data/subjective/alignment_bench'
alignment_bench_config_path = 'data/subjective/alignment_bench/config'
alignment_bench_config_name = 'multi-dimension'
subjective_datasets = []
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
@ -50,7 +51,7 @@ for _name in subjective_all_sets:
pred_role='BOT',
)
subjective_datasets.append(
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
@ -60,5 +61,7 @@ for _name in subjective_all_sets:
alignment_bench_config_name=alignment_bench_config_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
))

View File

@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset
from opencompass.summarizers import AlignmentBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'ref'],
@ -14,7 +15,7 @@ subjective_all_sets = [
]
data_path ='data/subjective/alignment_bench'
subjective_datasets = []
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
@ -47,7 +48,7 @@ for _name in subjective_all_sets:
pred_role='BOT',
)
subjective_datasets.append(
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
@ -55,5 +56,7 @@ for _name in subjective_all_sets:
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='judgelm')
))

View File

@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset
from opencompass.summarizers import AlignmentBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
@ -10,14 +11,14 @@ subjective_reader_cfg = dict(
)
subjective_all_sets = [
'alignment_bench_v1_1',
'alignment_bench_v1_1', # Changed to Alignbench_v1_1 since 06/15/2024, refer to https://github.com/THUDM/AlignBench
]
data_path ='data/subjective/alignment_bench'
alignment_bench_config_path = 'data/subjective/alignment_bench/config'
alignment_bench_config_name = 'multi-dimension'
subjective_datasets = []
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
@ -50,7 +51,7 @@ for _name in subjective_all_sets:
pred_role='BOT',
)
subjective_datasets.append(
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
@ -60,5 +61,7 @@ for _name in subjective_all_sets:
alignment_bench_config_name=alignment_bench_config_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
))
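
For reference, the four AlignBench configs above differ mainly in which judge the newly embedded summarizer uses; the dataset entries themselves all switch to single-score judging. A condensed sketch of the added fields (values copied from the hunks above; alignbench_extra is an illustrative name, not part of the commit):

from opencompass.summarizers import AlignmentBenchSummarizer

# fields added to every AlignBench dataset entry in this commit
alignbench_extra = dict(
    mode='singlescore',
    # 'autoj' and 'judgelm' in the first and third configs; 'general' in the two
    # configs that read the critiquellm_prefix column (including the v1_1 set)
    summarizer=dict(type=AlignmentBenchSummarizer, judge_type='autoj'),
)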

View File

@ -15,7 +15,7 @@ subjective_all_sets = [
]
subjective_datasets = []
alpacav1_datasets = []
gpt4_prompt = """
I want you to create a leaderboard of different of large-language models. To do so, I will give you the instructions (prompts) given to the models, and the responses of two models. Please rank the models based on which responses would be preferred by humans. All inputs and outputs should be python dictionaries.
@ -85,7 +85,7 @@ for _name in subjective_all_sets:
pred_role='BOT',
)
subjective_datasets.append(
alpacav1_datasets.append(
dict(
abbr=f'{_name}',
type=SubjectiveCmpDataset,

View File

@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import SubjectiveCmpDataset
from opencompass.summarizers import AlpacaSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -15,7 +16,7 @@ subjective_all_sets = [
]
subjective_datasets = []
alpacav2_datasets = []
gpt4_prompt = """
I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
@ -48,6 +49,17 @@ Evaluate the models based on the quality and relevance of their outputs, and sel
## Best Model Identifier
"""
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
@ -87,7 +99,7 @@ for _name in subjective_all_sets:
pred_role='BOT',
)
subjective_datasets.append(
alpacav2_datasets.append(
dict(
abbr=f'{_name}',
type=SubjectiveCmpDataset,
@ -95,5 +107,10 @@ for _name in subjective_all_sets:
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='random',
base_models=gpt4,
summarizer=dict(type=AlpacaSummarizer, judge_type='v2'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
))

View File

@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import ArenaHardDataset
from opencompass.summarizers import ArenaHardSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -15,12 +16,15 @@ subjective_all_sets = [
]
subjective_datasets = []
arenahard_datasets = []
system_prompt = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
judge_prompt = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{prediction}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{prediction2}\n<|The End of Assistant B's Answer|>"
gpt4 = [dict(
abbr='gpt4-0314',
)]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
@ -60,13 +64,18 @@ for _name in subjective_all_sets:
pred_role='BOT',
)
subjective_datasets.append(
arenahard_datasets.append(
dict(
abbr=f'{_name}',
abbr='arenahard',
type=ArenaHardDataset,
path='./data/subjective/arena_hard',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
summarizer = dict(type=ArenaHardSummarizer),
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}]
))

View File

@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'ref'],
@ -11,7 +12,7 @@ subjective_reader_cfg = dict(
data_path ='data/subjective/compass_arena'
subjective_datasets = []
compassarena_datasets = []
base_prompt = """
@ -101,6 +102,10 @@ creation_prompt = """
sub_map = {'language': language_prompt, 'knowledge': knowledge_prompt, 'reason_v2': reason_prompt, 'math_v2': math_prompt, 'creationv2_zh': creation_prompt}
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name, _prompt in sub_map.items():
subjective_infer_cfg = dict(
prompt_template=dict(
@ -132,13 +137,18 @@ for _name, _prompt in sub_map.items():
pred_role='BOT',
)
subjective_datasets.append(
compassarena_datasets.append(
dict(
abbr=f'{_name}',
abbr=f'compassarena_{_name}',
type=CompassArenaDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
))
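
The pairwise benchmarks (AlpacaEval v2 and Arena-Hard above, CompassArena here, and the CompassBench configs below) gain the analogous fields: the comparison mode, the baseline model, and given_pred, which points the evaluator at pre-committed baseline predictions so the GPT-4 baseline does not have to be re-inferred. A sketch of the fields added to each CompassArena entry (values copied from the hunk above; compassarena_extra is an illustrative name):

from opencompass.summarizers import CompassArenaSummarizer

# the baseline is declared with just an abbr; given_pred supplies its cached predictions
gpt4 = [dict(abbr='gpt4-turbo')]

compassarena_extra = dict(
    mode='m2n',            # compare models against the base models, as in the removed m2n eval configs below
    infer_order='double',  # presentation order for pairwise judging ('random' is used for AlpacaEval v2)
    base_models=gpt4,
    summarizer=dict(type=CompassArenaSummarizer, summary_type='half_add'),
    given_pred=[{'abbr': 'gpt4-turbo', 'path': './data/subjective/alpaca_eval/gpt4-turbo'}],
)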

View File

@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassBenchDataset
from opencompass.summarizers import CompassBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'judge_prompt'],
@ -11,10 +12,14 @@ subjective_reader_cfg = dict(
data_path ='data/subjective/compassbench'
subjective_datasets = []
compassbench_datasets = []
versions = ['CompassBenchV1.1']
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for version_abbr in versions:
subjective_infer_cfg = dict(
prompt_template=dict(
@ -46,7 +51,7 @@ for version_abbr in versions:
pred_role='BOT',
)
subjective_datasets.append(
compassbench_datasets.append(
dict(
abbr=version_abbr,
type=CompassBenchDataset,
@ -54,5 +59,10 @@ for version_abbr in versions:
name=version_abbr,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
summarizer=dict(type=CompassBenchSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
))

View File

@ -3,7 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassBenchControlLengthBiasDataset
from opencompass.summarizers import CompassBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'judge_prompt'],
output_column='judge',
@ -11,10 +11,12 @@ subjective_reader_cfg = dict(
data_path ='data/subjective/compassbench'
subjective_datasets = []
compassbench_datasets = []
versions = ['CompassBenchV1.1']
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for version_abbr in versions:
subjective_infer_cfg = dict(
prompt_template=dict(
@ -46,7 +48,7 @@ for version_abbr in versions:
pred_role='BOT',
)
subjective_datasets.append(
compassbench_datasets.append(
dict(
abbr=version_abbr,
type=CompassBenchControlLengthBiasDataset,
@ -54,5 +56,10 @@ for version_abbr in versions:
name=version_abbr,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
summarizer=dict(type=CompassBenchSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
))

View File

@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassBenchControlLengthBiasDataset
from opencompass.summarizers import CompassBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'judge_prompt'],
@ -11,10 +12,15 @@ subjective_reader_cfg = dict(
data_path ='data/subjective/compassbench'
subjective_datasets = []
compassbench_datasets = []
versions = ['CompassBenchV1.1.patch', 'CompassBenchV1.1.patch.en']
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for version_abbr in versions:
subjective_infer_cfg = dict(
prompt_template=dict(
@ -46,7 +52,7 @@ for version_abbr in versions:
pred_role='BOT',
)
subjective_datasets.append(
compassbench_datasets.append(
dict(
abbr=version_abbr,
type=CompassBenchControlLengthBiasDataset,
@ -54,5 +60,10 @@ for version_abbr in versions:
name=version_abbr,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
summarizer=dict(type=CompassBenchSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
))

View File

@ -3,7 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassBenchControlLengthBiasDataset
from opencompass.summarizers import CompassBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'judge_prompt'],
output_column='judge',
@ -11,10 +11,12 @@ subjective_reader_cfg = dict(
data_path ='data/subjective/compassbench'
subjective_datasets = []
compassbench_datasets = []
versions = ['CompassBenchV1.2']
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for version_abbr in versions:
subjective_infer_cfg = dict(
prompt_template=dict(
@ -46,7 +48,7 @@ for version_abbr in versions:
pred_role='BOT',
)
subjective_datasets.append(
compassbench_datasets.append(
dict(
abbr=version_abbr,
type=CompassBenchControlLengthBiasDataset,
@ -54,5 +56,10 @@ for version_abbr in versions:
name=version_abbr,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
summarizer=dict(type=CompassBenchSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
))

View File

@ -3,6 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -44,7 +45,7 @@ Please evaluate the formatting of the model's responses by checking if they comp
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
subjective_datasets = []
fofo_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
@ -84,7 +85,7 @@ for _name in subjective_all_sets:
pred_role='BOT',
)
subjective_datasets.append(
fofo_datasets.append(
dict(
abbr=f'{_name}',
type=FofoDataset,
@ -92,5 +93,7 @@ for _name in subjective_all_sets:
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=FofoSummarizer, judge_type='general')
))

View File

@ -3,7 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBench101Dataset
from opencompass.summarizers import MTBench101Summarizer
subjective_reader_cfg = dict(
input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'],
@ -15,7 +15,7 @@ subjective_all_sets = [
]
data_path ='data/subjective/'
subjective_datasets = []
mtbench101_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
@ -50,7 +50,7 @@ for _name in subjective_all_sets:
pred_role='BOT',
)
subjective_datasets.append(
mtbench101_datasets.append(
dict(
abbr=f'{_name}',
type=MTBench101Dataset,
@ -58,5 +58,7 @@ for _name in subjective_all_sets:
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=MTBench101Summarizer, judge_type='single')
))

View File

@ -3,7 +3,7 @@ from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBenchDataset
from opencompass.summarizers import MTBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'],
@ -15,7 +15,7 @@ subjective_all_sets = [
]
data_path ='data/subjective/mtbench'
subjective_datasets = []
mtbench_datasets = []
for _name in subjective_all_sets:
temperature = float(_name.split('_')[1])
@ -52,7 +52,7 @@ for _name in subjective_all_sets:
pred_role='BOT',
)
subjective_datasets.append(
mtbench_datasets.append(
dict(
abbr=f'{_name}',
type=MTBenchDataset,
@ -60,5 +60,7 @@ for _name in subjective_all_sets:
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=MTBenchSummarizer, judge_type='single')
))

configs/eval_subjective.py (new file, 136 lines)
View File

@ -0,0 +1,136 @@
from mmengine.config import read_base
with read_base():
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import alpacav2_datasets
from .datasets.subjective.compassarena.compassarena_compare import compassarena_datasets
from .datasets.subjective.arena_hard.arena_hard_compare import arenahard_datasets
from .datasets.subjective.compassbench.compassbench_compare import compassbench_datasets
from .datasets.subjective.fofo.fofo_judge import fofo_datasets
from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import mtbench_datasets
from .datasets.subjective.multiround.mtbench101_judge import mtbench101_datasets
from .models.chatglm.hf_chatglm3_6b import models
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.partitioners.sub_num_worker import SubjectiveNumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import SubjectiveSummarizer
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
for model in models:
model['generation_kwargs'] = dict(do_sample=True)
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(
do_sample=True,
),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
),dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf2',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(
do_sample=True,
),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
),dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf3',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(
do_sample=True,
),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
datasets = [*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets, *compassarena_datasets, *compassbench_datasets, *fofo_datasets, *mtbench_datasets, *mtbench101_datasets]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=OpenICLInferTask)),
)
# -------------Evalation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=16,
max_out_len=2048,
max_seq_len=2048,
batch_size=8,
temperature=0,
)]
judge_models = [models[0]]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(type=SubjectiveNaivePartitioner, models=models, judge_models=judge_models,),
runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=SubjectiveSummarizer, function='subjective')
work_dir = 'outputs/subjective/'
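
Because mode, summarizer and base_models now live on the dataset entries, the eval stage above no longer encodes any benchmark-specific setup: the partitioner only receives models and judge_models, and one SubjectiveSummarizer replaces the per-benchmark summarizers that the removed configs below set at the top level. A trimmed sketch of the contrast (both forms appear verbatim in this commit; models and judge_models are the lists defined in the config above):

from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.summarizers import SubjectiveSummarizer
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

# removed style (e.g. the deleted eval_subjective_alignbench.py below):
#   eval = dict(partitioner=dict(type=SubjectiveSizePartitioner, mode='singlescore',
#                                models=models, judge_models=judge_models), ...)
#   summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')

# new style: benchmark-agnostic eval stage plus a single generic summarizer
eval = dict(
    partitioner=dict(type=SubjectiveNaivePartitioner, models=models, judge_models=judge_models),
    runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=SubjectiveSummarizer, function='subjective')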

View File

@ -1,79 +0,0 @@
from mmengine.config import read_base
with read_base():
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import AlignmentBenchSummarizer
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(
do_sample=True,
),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
datasets = [*subjective_datasets]
# -------------Evalation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=16,
max_out_len=2048,
max_seq_len=2048,
batch_size=8,
temperature=0,
)]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner, max_task_size=1000, mode='singlescore', models=models, judge_models=judge_models,
),
runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
work_dir = 'outputs/alignment_bench/'

View File

@ -1,104 +0,0 @@
from mmengine.config import read_base
with read_base():
from .datasets.subjective.alpaca_eval.alpacav1_judgeby_gpt4 import subjective_datasets as alpacav1
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import AlpacaSummarizer
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(
do_sample=True,
),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
datasets = [*alpacav2]
gpt4 = dict(
abbr='gpt4-turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=2048,
max_seq_len=4096,
batch_size=4,
retry=20,
temperature=1,
) # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions
# -------------Evalation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=1024,
max_seq_len=4096,
batch_size=2,
retry=20,
temperature=0,
)]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner, max_task_size=1000, mode='m2n', base_models=[gpt4], compare_models=models,
infer_order='random',
judge_models=judge_models
),
runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
given_pred = [{'abbr':'gpt4-turbo', 'path':''}]
)
work_dir = 'outputs/alpaca/'
summarizer = dict(type=AlpacaSummarizer, judge_type='v2')

View File

@ -1,104 +0,0 @@
from opencompass.models import HuggingFaceCausalLM
from copy import deepcopy
from opencompass.models import TurboMindModel
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import ArenaHardSummarizer
with read_base():
from .datasets.subjective.arena_hard.arena_hard_compare import subjective_datasets
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
_meta_template = dict(
round=[
dict(role='HUMAN', begin='<|begin_of_text|>user<|end_header_id|>\n\n', end='<|eot_id|>'),
dict(role='BOT', begin='<|begin_of_text|>assistant<|end_header_id|>\n\n', end='<|eot_id|>', generate=True),
],
)
models = [
dict(
type=HuggingFaceCausalLM,
abbr='llama-3-8b-instruct-hf',
path='meta-llama/Meta-Llama-3-8B-Instruct',
model_kwargs=dict(device_map='auto'),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
),
meta_template=_meta_template,
max_out_len=4096,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
generation_kwargs={'eos_token_id': [128001, 128009]},
batch_padding=True,
)
]
datasets = [*subjective_datasets]
work_dir = 'outputs/arena_hard/'
# -------------Inferen Stage ----------------------------------------
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000000),
runner=dict(
type=LocalRunner,
max_num_workers=32,
task=dict(type=OpenICLInferTask)),
)
judge_models = [dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='',
meta_template=api_meta_template,
query_per_second=1,
max_out_len=4096,
max_seq_len=8192,
batch_size=10,
retry=10,
temperature = 0,
)]
## ------------- Evaluation Configuration
gpt4_0314 = dict(
abbr='gpt4-0314',
type=OpenAI,
)
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner,
max_task_size=1000000,
mode='m2n',
infer_order='double',
base_models=[gpt4_0314],
compare_models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner, max_num_workers=16, task=dict(type=SubjectiveEvalTask)),
given_pred = [{'abbr':'gpt4-0314', 'path':''}]
)
summarizer = dict(
type=ArenaHardSummarizer
)

View File

@ -1,106 +0,0 @@
from os import getenv as gv
from opencompass.models import HuggingFaceCausalLM
from mmengine.config import read_base
with read_base():
from .datasets.subjective.compassarena.compassarena_compare import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import CompassArenaSummarizer
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(
do_sample=True,
),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=1,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
datasets = [*subjective_datasets]
gpt4 = dict(
abbr='gpt4-turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=2048,
max_seq_len=4096,
batch_size=4,
retry=20,
temperature=1,
) # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions
# -------------Evalation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=1024,
max_seq_len=4096,
batch_size=2,
retry=20,
temperature=0,
)]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner,
strategy='split',
max_task_size=10000,
mode='m2n',
infer_order='double',
base_models=[gpt4],
compare_models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
given_pred = [{'abbr':'gpt4-turbo', 'path':''}]
)
work_dir = 'outputs/compass_arena_debug/'
summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add')

View File

@ -1,137 +0,0 @@
from os import getenv as gv
from opencompass.models import HuggingFaceCausalLM
from mmengine.config import read_base
with read_base():
from .datasets.subjective.compassbench.compassbench_compare import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import CompassBenchSummarizer
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
# -------------Inference Stage ----------------------------------------
from opencompass.models import HuggingFacewithChatTemplate
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='internlm2-chat-7b-hf',
path='internlm/internlm2-chat-7b',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
stop_words=['</s>', '<|im_end|>'],
generation_kwargs=dict(
do_sample=True,
),
)
]
datasets = [*subjective_datasets]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(
type=SlurmSequentialRunner,
partition='llmeval',
quotatype='reserved',
max_num_workers=256,
task=dict(type=OpenICLInferTask),
),
)
gpt4 = dict(
abbr='gpt4-turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=2048,
max_seq_len=4096,
batch_size=4,
retry=20,
temperature=1,
) # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions
# -------------Evalation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=1024,
max_seq_len=4096,
batch_size=2,
retry=20,
temperature=0,
)]
judge_models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='internlm102b',
path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
stop_words=['</s>', '<|im_end|>'],
),
dict(
type=HuggingFacewithChatTemplate,
abbr='internlm102b2',
path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
stop_words=['</s>', '<|im_end|>'],
),
dict(
type=HuggingFacewithChatTemplate,
abbr='internlm102b3',
path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=4),
stop_words=['</s>', '<|im_end|>'],
)
]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner,
strategy='split',
max_task_size=10000000,
mode='m2n',
infer_order='double',
base_models=[gpt4],
compare_models=models,
judge_models=judge_models,
),
runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
#given_pred = [{'abbr':'gpt4-turbo', 'path':''}]
)
work_dir = 'outputs/compassbench/'
summarizer = dict(type=CompassBenchSummarizer, summary_type='half_add')

View File

@ -1,77 +0,0 @@
from mmengine.config import read_base
with read_base():
from .datasets.subjective.creationbench.creationbench_judgeby_gpt4_withref import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import CreationBenchSummarizer
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(
do_sample=True,
),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=1,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
datasets = [*subjective_datasets]
# -------------Evalation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_model = dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=16,
max_out_len=2048,
max_seq_len=2048,
batch_size=8,
temperature=0,
)
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(type=SubjectiveNaivePartitioner, mode='singlescore', models=models),
runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)),
)
summarizer = dict(type=CreationBenchSummarizer, judge_type='general')
work_dir = 'outputs/creationbench/'

View File

@ -1,69 +0,0 @@
from mmengine.config import read_base
with read_base():
from .datasets.subjective.fofo.fofo_judge import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.models import HuggingFacewithChatTemplate
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import FofoSummarizer
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
models = [
dict(
type=HuggingFacewithChatTemplate,
abbr='internlm2-chat-1.8b-hf',
path='internlm/internlm2-chat-1_8b',
max_out_len=1024,
batch_size=8,
run_cfg=dict(num_gpus=1),
stop_words=['</s>', '<|im_end|>'],
generation_kwargs=dict(
do_sample=True,
),
)
]
datasets = [*subjective_datasets]
# -------------Evalation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=16,
max_out_len=2048,
max_seq_len=2048,
batch_size=8,
temperature=0,
)]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner, max_task_size=10000, mode='singlescore', models=models, judge_models=judge_models,
),
runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=FofoSummarizer, judge_type='general')
work_dir = 'outputs/fofo/'

View File

@ -1,111 +0,0 @@
from opencompass.models import HuggingFaceCausalLM
from copy import deepcopy
from opencompass.models import TurboMindModel
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import MultiroundSummarizer
with read_base():
from .datasets.subjective.multiround.functionalmt_zh_judgeby_gpt4 import subjective_datasets
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
_meta_template = dict(
round=[
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
],
eos_token_id=151645,
)
models = [
dict(
type=HuggingFaceCausalLM,
abbr='qwen1.5-7b-chat-hf',
path='Qwen/Qwen1.5-7B-Chat',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
generation_kwargs=dict(
do_sample=True,
),
meta_template=_meta_template,
pad_token_id=151645,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>',
)
]
datasets = [*subjective_datasets]
work_dir = 'outputs/multiround/'
# -------------Inferen Stage ----------------------------------------
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(
type=SlurmSequentialRunner,
partition='your part',
quotatype='auto',
max_num_workers=256,
task=dict(type=OpenICLInferTask)),
)
judge_models = [dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='',
meta_template=api_meta_template,
query_per_second=1,
max_out_len=1024,
max_seq_len=4096,
batch_size=10,
retry=10,
temperature = 0,
)]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner,
max_task_size=1000,
mode='singlescore',
models = models,
judge_models=judge_models
),
runner=dict(
type=SlurmSequentialRunner,
partition='your part',
quotatype='auto',
max_num_workers=256,
task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(
type=MultiroundSummarizer
)

View File

@ -1,92 +0,0 @@
from mmengine.config import read_base
with read_base():
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import AlignmentBenchSummarizer
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(
do_sample=True,
),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=1,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
datasets = [*subjective_datasets]
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(
type=SlurmSequentialRunner,
partition='llmeval',
quotatype='auto',
max_num_workers=256,
task=dict(type=OpenICLInferTask),
),
)
# -------------Evalation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [dict(
type=HuggingFaceCausalLM,
abbr='pandalm-7b-v1-hf',
path='WeOpenML/PandaLM-7B-v1',
tokenizer_path='WeOpenML/PandaLM-7B-v1',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
max_out_len=512,
max_seq_len=2048,
batch_size=8,
model_kwargs=dict(device_map='auto', trust_remote_code=True),
run_cfg=dict(num_gpus=1, num_procs=1),
)]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(type=SubjectiveNaivePartitioner, mode='singlescore', models=models, judge_models=judge_models),
runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=AlignmentBenchSummarizer)
work_dir = 'outputs/pandalm'

View File

@ -1,84 +0,0 @@
from mmengine.config import read_base
with read_base():
from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import MTBenchSummarizer
api_meta_template = dict(
round=[
dict(role='SYSTEM', api_role='SYSTEM'),
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
_meta_template = dict(
round=[
dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'),
dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True),
],
)
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
models = [
dict(
type=HuggingFaceCausalLM,
abbr='qwen-7b-chat-hf',
path='Qwen/Qwen-7B-Chat',
tokenizer_path='Qwen/Qwen-7B-Chat',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
pad_token_id=151643,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>',
)
]
datasets = [*subjective_datasets]
# -------------Evalation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-0613', # To compare with the official leaderboard, please use gpt4-0613
key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=16,
max_out_len=2048,
max_seq_len=2048,
batch_size=8,
temperature=0,
)]
## single evaluation
eval = dict(
partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models, judge_models=judge_models),
runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=MTBenchSummarizer, judge_type='single')
work_dir = 'outputs/mtbench/'

View File

@ -1,94 +0,0 @@
from mmengine.config import read_base
with read_base():
from .datasets.subjective.multiround.mtbench101_judge import subjective_datasets
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import MTBench101Summarizer
# ---------------------------------------------------------------------------------------------------------
api_meta_template = dict(
round=[
dict(role='SYSTEM', api_role='SYSTEM'),
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(
do_sample=True,
),
meta_template=api_meta_template,
max_out_len=4096,
max_seq_len=4096,
batch_size=1,
run_cfg=dict(num_gpus=2, num_procs=1),
)
]
datasets = [*subjective_datasets]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=10000),
runner=dict(
type=SlurmSequentialRunner,
partition='llm_dev2',
quotatype='auto',
max_num_workers=32,
task=dict(type=OpenICLInferTask),
),
)
# -------------Evalation Stage ----------------------------------------
## ------------- JudgeLLM Configuration
judge_models = [dict(
abbr='GPT4-Turbo',
type=OpenAI,
path='gpt-4-1106-preview', # To compare with the official leaderboard, please use gpt-4-1106-preview
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=16,
max_out_len=4096,
max_seq_len=4096,
batch_size=8,
temperature=0.8,
)]
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(type=SubjectiveSizePartitioner, max_task_size=100000, mode='singlescore', models=models, judge_models=judge_models),
runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
)
summarizer = dict(type=MTBench101Summarizer, judge_type='single')
work_dir = 'outputs/mtbench101/'

View File

@ -1,21 +0,0 @@
from mmengine.config import read_base
with read_base():
from ..datasets.subjective.alignbench.alignbench_judgeby_critiquellm import subjective_datasets
from .model_cfg import models, judge_model, given_pred, infer, gpt4, runner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.summarizers import AlignmentBenchSummarizer
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
datasets = [*subjective_datasets]
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner, mode='singlescore', models=models
),
runner=runner,
)
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
work_dir = 'outputs/alignment_bench/'

View File

@ -1,29 +0,0 @@
from mmengine.config import read_base
with read_base():
from ..datasets.subjective.alpaca_eval.alpacav1_judgeby_gpt4 import subjective_datasets as alpacav1
from ..datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
from .model_cfg import models, judge_model, given_pred, infer, gpt4, runner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.summarizers import AlpacaSummarizer
from opencompass.tasks.outer_eval.alpacaeval import AlpacaEvalTask
datasets = [*alpacav2]
gpt4_judge = dict(
abbr='GPT4-Turbo',
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
config='weighted_alpaca_eval_gpt4_turbo'
)
## ------------- Evaluation Configuration
eval = dict(
partitioner=dict(
type=NaivePartitioner
),
runner=dict(
type=LocalRunner,
max_num_workers=256,
task=dict(type=AlpacaEvalTask, judge_cfg=gpt4_judge),
)
)
work_dir = 'outputs/alpaca/'

View File

@ -1,28 +0,0 @@
from os import getenv as gv
from opencompass.models import HuggingFaceCausalLM
from mmengine.config import read_base
with read_base():
from ..datasets.subjective.compassarena.compassarena_compare import subjective_datasets
from .model_cfg import models, judge_model, given_pred, infer, gpt4, runner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.summarizers import CompassArenaSummarizer
datasets = [*subjective_datasets]
eval = dict(
partitioner=dict(
type=SubjectiveSizePartitioner,
strategy='split',
max_task_size=10000,
mode='m2n',
base_models=[gpt4],
compare_models=models,
),
runner=runner,
given_pred=given_pred
)
work_dir = 'outputs/compass_arena/'
summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add')

View File

@ -1,25 +0,0 @@
from mmengine.config import read_base
with read_base():
from ..datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets
# from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets
from .model_cfg import models, judge_model, given_pred, infer, gpt4, runner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.summarizers import MTBenchSummarizer
datasets = [*subjective_datasets]
for model in models:
if 'generation_kwargs' in model:
if 'do_sample' in model['generation_kwargs']:
del model['generation_kwargs']['do_sample']
eval = dict(
partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models),
runner=runner
)
summarizer = dict(type=MTBenchSummarizer, judge_type='single')
work_dir = 'outputs/mtbench/'

View File

@ -1,84 +0,0 @@
from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
# -------------Inference Stage ----------------------------------------
# For subjective evaluation, we often set do sample for models
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(
do_sample=True,
),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=1,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
judge_model = dict(
abbr='GPT4-Turbo',
type=OpenAI, path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
url='',
meta_template=api_meta_template,
query_per_second=1,
max_out_len=1024,
max_seq_len=4096,
batch_size=1,
retry=30,
temperature = 0
)
infer = dict(
partitioner=dict(type=SizePartitioner, strategy='split', max_task_size=10000),
runner=dict(
type=SlurmSequentialRunner,
partition='llmeval',
quotatype='auto',
max_num_workers=256,
task=dict(type=OpenICLInferTask),
),
)
runner=dict(type=LocalRunner, max_num_workers=12, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model))
gpt4 = dict(
abbr='gpt4-turbo',
type=OpenAI,
path='gpt-4-1106-preview',
key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
meta_template=api_meta_template,
query_per_second=1,
max_out_len=2048,
max_seq_len=4096,
batch_size=4,
retry=20,
temperature=1,
)
given_pred = [{'abbr':'gpt4-turbo', 'path':'your path'}]

View File

@ -13,13 +13,65 @@ A popular evaluation method involves
We support the use of GPT-4 (or other JudgeLLMs) for the subjective evaluation of models based on the above methods.
## Current Supported Subjective Evaluation Datasets
## Currently Supported Subjective Evaluation Datasets
1. AlignBench (https://github.com/THUDM/AlignBench)
2. MTBench (https://github.com/lm-sys/FastChat)
3. AlpacaEvalv2 (https://github.com/tatsu-lab/alpaca_eval)
4. ArenaHard (https://github.com/lm-sys/arena-hard/tree/main)
5. CompassArena (Internal dataset)
1. AlignBench Chinese Scoring Dataset (https://github.com/THUDM/AlignBench)
2. MTBench English Scoring Dataset, two-turn dialogue (https://github.com/lm-sys/FastChat)
3. MTBench101 English Scoring Dataset, multi-turn dialogue (https://github.com/mtbench101/mt-bench-101)
4. AlpacaEvalv2 English Compare Dataset (https://github.com/tatsu-lab/alpaca_eval)
5. ArenaHard English Compare Dataset, mainly focused on coding (https://github.com/lm-sys/arena-hard/tree/main)
6. Fofo English Scoring Dataset (https://github.com/SalesforceAIResearch/FoFo/)
## Initiating Subjective Evaluation
Similar to existing objective evaluation methods, you can configure related settings in `configs/eval_subjective.py`.
### Basic Parameters: Specifying models, datasets, and judgemodels
Similar to objective evaluation, import the models and datasets that need to be evaluated, for example:
```
with read_base():
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
from .models.qwen.hf_qwen_7b import models
```
It is worth noting that the model settings for subjective evaluation often differ from those for objective evaluation: inference usually needs `do_sample` rather than greedy decoding. You can modify the relevant parameters in the configuration file as needed, for example:
```
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf2',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(
do_sample=True,
),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
```
The judge model is usually set to a powerful model such as GPT-4. You can either fill in your API key directly following the configuration in the config file, or use a custom model as the judge model.
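For example, an API-based judge model can be declared as below. This is a minimal sketch mirroring the judge configs elsewhere in this repository; `OpenAI` and `api_meta_template` are assumed to be imported/defined as in those configs, and the key falls back to `$OPENAI_API_KEY` when left empty.
```python
judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,                      # from opencompass.models
    path='gpt-4-1106-preview',
    key='',                           # read from $OPENAI_API_KEY if left empty
    meta_template=api_meta_template,
    query_per_second=16,
    max_out_len=4096,
    max_seq_len=4096,
    batch_size=8,
    temperature=0,                    # keep the judge deterministic
)]
```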
### Specifying Other Parameters
In addition to the basic parameters, you can modify the `infer` and `eval` fields in the config to choose a more suitable partitioning strategy. Three partitioners are currently supported: NaivePartitioner, SizePartitioner, and NumWorkerPartitioner. You can also specify your own `work_dir` to control where related files are saved.
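A typical `eval` block then looks like the sketch below (partitioner choice, task size, and worker counts are only illustrative; the usual imports from `opencompass.partitioners`, `opencompass.runners`, and `opencompass.tasks` are assumed):
```python
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        max_task_size=100000,
        models=models,
        judge_models=judge_models,
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=32,
        task=dict(type=SubjectiveEvalTask),
    ),
)
work_dir = 'outputs/subjective/'
```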
## Subjective Evaluation with Custom Dataset
@ -32,6 +84,9 @@ The specific process includes:
### Step-1: Data Preparation
This step requires preparing the dataset file and implementing your own dataset class under `opencompass/datasets/subjective/`, which returns the loaded data as a `list of dict`.
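For reference, a custom loader can be as small as the sketch below. The class name and the exact fields are placeholders for illustration, not an existing OpenCompass class; the real implementations under `opencompass/datasets/subjective/` follow the same pattern.
```python
import json
import os.path as osp

from datasets import Dataset, DatasetDict

from opencompass.registry import LOAD_DATASET

from ..base import BaseDataset


@LOAD_DATASET.register_module()
class MySubjectiveDataset(BaseDataset):

    def load(self, path: str, name: str, *args, **kwargs):
        filename = osp.join(path, f'{name}.json')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            for item in json.load(f):
                # keep whatever fields your reader_cfg and prompts expect
                raw_data.append({
                    'question': item['question'],
                    'capability': item['capability'],
                    'others': item.get('others', {}),
                })
        return DatasetDict({'test': Dataset.from_list(raw_data)})
```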
In practice, you can prepare the data in any format you like (csv, json, jsonl, etc.). However, to make it easier to get started, it is recommended to follow the format of the existing subjective datasets or the json format below.
We provide a mini test set for **Compare Mode** and **Score Mode**, as shown below:
```python
@ -66,80 +121,12 @@ If you want to modify prompt on each single question, you can full some other in
### Step-2: Evaluation Configuration (Compare Mode)
For `configs/eval_subjective_compare.py`, we provide some annotations to help users understand the configuration file.
Taking Alignbench as an example, `configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py`:
```python
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import Corev2Summarizer
with read_base():
# Pre-defined models
from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
from .models.openai.gpt_4 import models as gpt4_model
from .datasets.subjective_cmp.subjective_corev2 import subjective_datasets
# Evaluation datasets
datasets = [*subjective_datasets]
# Model to be evaluated
models = [*hf_qwen_7b_chat, *hf_chatglm3_6b]
# Inference configuration
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(
type=SlurmSequentialRunner,
partition='llmeval',
quotatype='auto',
max_num_workers=256,
task=dict(type=OpenICLInferTask)),
)
# Evaluation configuration
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
mode='m2n', # m-model v.s n-model
# Under m2n setting
# must specify base_models and compare_models, program will generate pairs between base_models compare_models.
base_models = [*hf_qwen_14b_chat], # Baseline model
compare_models = [*hf_baichuan2_7b, *hf_chatglm3_6b] # model to be evaluated
),
runner=dict(
type=SlurmSequentialRunner,
partition='llmeval',
quotatype='auto',
max_num_workers=256,
task=dict(
type=SubjectiveEvalTask,
judge_cfg=gpt4_model # Judge model
)),
)
work_dir = './outputs/subjective/'
summarizer = dict(
type=Corev2Summarizer, # Custom summarizer
match_method='smart', # Answer extraction
)
```
In addition, you can also change the response order of the two models; please refer to `configs/eval_subjective_compare.py`:
when `infer_order` is set to `random`, the two responses are presented in a random order,
when `infer_order` is set to `double`, the responses of the two models are each judged in both orders.
### Step-2: Evaluation Configuration (Score Mode)
For `configs/eval_subjective_score.py`, the setup is largely the same as `configs/eval_subjective_compare.py`; you just need to set the eval mode to `singlescore`.
1. First, you need to set `subjective_reader_cfg` to receive the relevant fields returned from the custom Dataset class and specify the output fields when saving files.
2. Then, you need to specify the root path `data_path` of the dataset and the dataset filename `subjective_all_sets`. If there are multiple sub-files, you can add them to this list.
3. Specify `subjective_infer_cfg` and `subjective_eval_cfg` to configure the corresponding inference and evaluation prompts.
4. Finally, specify additional information such as `mode`, `summarizer`, etc., at the appropriate location. Note that for different subjective datasets, the fields that need to be specified may vary. Additionally, the summarizer class for the respective dataset also needs to be implemented to perform data statistics. You can refer to the summarizer implementations of other datasets, located in `opencompass/opencompass/summarizers/subjective`.
### Step-3: Launch the Evaluation
@ -152,67 +139,9 @@ The `-r` parameter allows the reuse of model inference and GPT-4 evaluation resu
The response of JudgeLLM will be output to `output/.../results/timestamp/xxmodel/xxdataset/.json`.
The evaluation report will be output to `output/.../summary/timestamp/report.csv`.
OpenCompass supports many JudgeLLMs; in fact, you can use any model available in OpenCompass configs as a JudgeLLM.
We list some popular open-source JudgeLLMs here:
1. Auto-J, refer to `configs/models/judge_llm/auto_j`
Consider citing the following paper if you find it helpful:
```bibtex
@article{li2023generative,
title={Generative judge for evaluating alignment},
author={Li, Junlong and Sun, Shichao and Yuan, Weizhe and Fan, Run-Ze and Zhao, Hai and Liu, Pengfei},
journal={arXiv preprint arXiv:2310.05470},
year={2023}
}
@misc{2023opencompass,
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
author={OpenCompass Contributors},
howpublished = {\url{https://github.com/open-compass/opencompass}},
year={2023}
}
```
2. JudgeLM, refer to `configs/models/judge_llm/judgelm`
```bibtex
@article{zhu2023judgelm,
title={JudgeLM: Fine-tuned Large Language Models are Scalable Judges},
author={Zhu, Lianghui and Wang, Xinggang and Wang, Xinlong},
journal={arXiv preprint arXiv:2310.17631},
year={2023}
}
@misc{2023opencompass,
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
author={OpenCompass Contributors},
howpublished = {\url{https://github.com/open-compass/opencompass}},
year={2023}
}
```
3. PandaLM, refer to `configs/models/judge_llm/pandalm`
Consider citing the following paper if you find it helpful:
```bibtex
@article{wang2023pandalm,
title={PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization},
author={Wang, Yidong and Yu, Zhuohao and Zeng, Zhengran and Yang, Linyi and Wang, Cunxiang and Chen, Hao and Jiang, Chaoya and Xie, Rui and Wang, Jindong and Xie, Xing and others},
journal={arXiv preprint arXiv:2306.05087},
year={2023}
}
@misc{2023opencompass,
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
author={OpenCompass Contributors},
howpublished = {\url{https://github.com/open-compass/opencompass}},
year={2023}
}
```
## Multi-round Subjective Evaluation in OpenCompass
In OpenCompass, we also support subjective multi-turn dialogue evaluation. For instance, the evaluation of MT-Bench can be referred to in `configs/eval_subjective_mtbench.py`.
In OpenCompass, we also support subjective multi-turn dialogue evaluation. For instance, the evaluation of MT-Bench can be referred to in `configs/datasets/subjective/multiround`.
In the multi-turn dialogue evaluation, you need to organize the data format into the following dialogue structure:
@ -238,82 +167,3 @@ In the multi-turn dialogue evaluation, you need to organize the data format into
```
It is important to note that, because the different question types in MTBench use different temperature settings, we need to split the original data file into three subsets according to temperature and run inference on them separately, setting the appropriate temperature for each subset. For the specific settings, please refer to `configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py`.
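If you need to rebuild such subsets for your own data, a rough sketch is given below. The category-to-temperature mapping follows the common MT-Bench convention and is an assumption here, not something read from the OpenCompass configs.
```python
import json
from collections import defaultdict

# Assumed MT-Bench convention: creative categories are sampled, factual ones are (near-)greedy.
TEMPERATURE_BY_CATEGORY = {
    'writing': 0.7, 'roleplay': 0.7,
    'stem': 0.1, 'humanities': 0.1,
    'reasoning': 0.0, 'math': 0.0, 'coding': 0.0, 'extraction': 0.0,
}


def split_by_temperature(src_path='question.jsonl'):
    buckets = defaultdict(list)
    with open(src_path, encoding='utf-8') as f:
        for line in f:
            item = json.loads(line)
            temperature = TEMPERATURE_BY_CATEGORY.get(item['category'], 0.1)
            buckets[temperature].append(item)
    # one file per temperature, e.g. mtbench_0.0.json / mtbench_0.1.json / mtbench_0.7.json
    for temperature, items in buckets.items():
        with open(f'mtbench_{temperature}.json', 'w', encoding='utf-8') as f:
            json.dump(items, f, ensure_ascii=False, indent=2)
```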
Consider citing the following paper if you find it helpful:
```bibtex
@misc{zheng2023judging,
title={Judging LLM-as-a-judge with MT-Bench and Chatbot Arena},
author={Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric. P Xing and Hao Zhang and Joseph E. Gonzalez and Ion Stoica},
year={2023},
eprint={2306.05685},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{2023opencompass,
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
author={OpenCompass Contributors},
howpublished = {\url{https://github.com/open-compass/opencompass}},
year={2023}
}
```
## Practice: AlignBench Evaluation
### Dataset
```bash
mkdir -p ./data/subjective/
cd ./data/subjective
git clone https://github.com/THUDM/AlignBench.git
# data format conversion
python ../../../tools/convert_alignmentbench.py --mode json --jsonl data/data_release.jsonl
```
### Configuration
Please edit the config `configs/eval_subjective_alignbench.py` according to your demand.
### Evaluation
```bash
HF_EVALUATE_OFFLINE=1 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python run.py workspace/eval_subjective_alignbench.py
```
### Submit to the Official Leaderboard (Optional)
If you need to submit your predictions to the official leaderboard, you can use `tools/convert_alignmentbench.py` for format conversion.
- Make sure you have the following results
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
├── predictions # model's response
├── results
└── summary
```
- Convert the data
```bash
python tools/convert_alignmentbench.py --mode csv --exp-folder outputs/20231214_173632
```
- Get `.csv` in `submission/` for submission
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
├── predictions
├── results
├── submission # submission files
└── summary
```

View File

@ -15,11 +15,64 @@
## 目前已支持的主观评测数据集
1. AlignBench(https://github.com/THUDM/AlignBench)
2. MTBench(https://github.com/lm-sys/FastChat)
3. AlpacaEvalv2(https://github.com/tatsu-lab/alpaca_eval)
4. ArenaHard (https://github.com/lm-sys/arena-hard/tree/main)
5. CompassArena(内部数据集)
1. AlignBench 中文Scoring数据集(https://github.com/THUDM/AlignBench)
2. MTBench 英文Scoring数据集,两轮对话(https://github.com/lm-sys/FastChat)
3. MTBench101 英文Scoring数据集,多轮对话(https://github.com/mtbench101/mt-bench-101)
4. AlpacaEvalv2 英文Compare数据集(https://github.com/tatsu-lab/alpaca_eval)
5. ArenaHard 英文Compare数据集,主要面向coding(https://github.com/lm-sys/arena-hard/tree/main)
6. Fofo 英文Scoring数据集(https://github.com/SalesforceAIResearch/FoFo/)
## 启动主观评测
类似于已有的客观评测方式,可以在configs/eval_subjective.py中进行相关配置
### 基本参数:models, datasets 和 judgemodels的指定
类似于客观评测的方式,导入需要评测的models和datasets,例如
```
with read_base():
from .datasets.subjective.alignbench.alignbench_judgeby_critiquellm import alignbench_datasets
from .datasets.subjective.alpaca_eval.alpacav2_judgeby_gpt4 import subjective_datasets as alpacav2
from .models.qwen.hf_qwen_7b import models
```
值得注意的是,由于主观评测的模型设置参数通常与客观评测不同,往往需要设置`do_sample`的方式进行推理而不是`greedy`,故可以在配置文件中自行修改相关参数,例如
```
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-hf2',
path='THUDM/chatglm3-6b',
tokenizer_path='THUDM/chatglm3-6b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
generation_kwargs=dict(
do_sample=True,
),
meta_template=api_meta_template,
max_out_len=2048,
max_seq_len=4096,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
```
judgemodel通常被设置为GPT4等强力模型,可以直接按照config文件中的配置填入自己的API key,或使用自定义的模型作为judgemodel。
### 其他参数的指定
除了基本参数以外,还可以在config中修改`infer`和`eval`字段里的partitioner,从而设置更合适的分片方式。目前支持的分片方式主要有三种:NaivePartitioner、SizePartitioner 和 NumWorkerPartitioner。
此外,还可以指定自己的workdir,用以保存相关文件。
## 自定义主观数据集评测
@ -32,6 +85,9 @@
### 第一步:数据准备
这一步需要准备好数据集文件,以及在`opencompass/datasets/subjective/`下实现自己数据集的类,将读取到的数据以`list of dict`的格式返回。
实际上可以按照自己喜欢的任意格式进行数据准备(csv, json, jsonl)等皆可不过为了方便上手推荐按照已有的主观数据集的格式进行构建或按照如下的json格式进行构建。
对于对战模式和打分模式我们各提供了一个demo测试集如下
```python
@ -64,85 +120,19 @@
以上三个字段是必要的用户也可以添加其他字段如果需要对每个问题的prompt进行单独处理可以在'others'字段中进行一些额外设置并在Dataset类中添加相应的字段。
### 第二步:构建评测配置(对战模式)
### 第二步:构建评测配置
对于两回答比较,更详细的config setting请参考 `configs/eval_subjective_compare.py`,下面我们提供了部分简略版的注释,方便用户理解配置文件的含义。
以Alignbench为例`configs/datasets/subjective/alignbench/alignbench_judgeby_critiquellm.py`
```python
from mmengine.config import read_base
from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
from opencompass.partitioners import NaivePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import Corev2Summarizer
with read_base():
# 导入预设模型
from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat
from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat
from .models.openai.gpt_4 import models as gpt4_model
from .datasets.subjective_cmp.subjective_corev2 import subjective_datasets
# 评测数据集
datasets = [*subjective_datasets]
# 待测模型列表
models = [*hf_qwen_7b_chat, *hf_chatglm3_6b]
# 推理配置
infer = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(
type=SlurmSequentialRunner,
partition='llmeval',
quotatype='auto',
max_num_workers=256,
task=dict(type=OpenICLInferTask)),
)
# 评测配置
eval = dict(
partitioner=dict(
type=SubjectiveNaivePartitioner,
mode='m2n', # m个模型 与 n个模型进行对战
# 在m2n模式下需要指定base_models和compare_models将会对base_models和compare_models生成对应的两两pair去重且不会与自身进行比较
base_models = [*hf_qwen_14b_chat], # 用于对比的基线模型
compare_models = [*hf_baichuan2_7b, *hf_chatglm3_6b] # 待评测模型
),
runner=dict(
type=SlurmSequentialRunner,
partition='llmeval',
quotatype='auto',
max_num_workers=256,
task=dict(
type=SubjectiveEvalTask,
judge_cfg=gpt4_model # 评价模型
)),
)
work_dir = './outputs/subjective/' #指定工作目录,在此工作目录下,若使用--reuse参数启动评测将自动复用该目录下已有的所有结果
summarizer = dict(
type=Corev2Summarizer, #自定义数据集Summarizer
match_method='smart', #自定义答案提取方式
)
```
此外,在数据集的配置config中还可以选择两回答比较时的回答顺序,请参考`configs/eval_subjective_compare.py`,
当`infer_order`设置为`random`时,将对两模型的回复顺序进行随机打乱,
当`infer_order`设置为`double`时,将把两模型的回复按两种先后顺序进行判断。
### 第二步:构建评测配置(打分模式)
对于单回答打分,更详细的config setting请参考 `configs/eval_subjective_score.py`,该config的大部分都与两回答比较的config相同,只需要修改评测模式即可,将评测模式设置为`singlescore`。
1. 首先需要设置`subjective_reader_cfg`用以接收从自定义的Dataset类里return回来的相关字段并指定保存文件时的output字段
2. 然后需要指定数据集的根路径`data_path`以及数据集的文件名`subjective_all_sets`如果有多个子文件在这个list里进行添加即可
3. 指定`subjective_infer_cfg`和`subjective_eval_cfg`配置好相应的推理和评测的prompt
4. 最后在相应的位置指定`mode``summarizer`等额外信息注意对于不同的主观数据集所需指定的字段可能不尽相同。此外相应数据集的summarizer类也需要自己实现以进行数据的统计可以参考其他数据集的summarizer实现位于`opencompass/opencompass/summarizers/subjective`
### 第三步 启动评测并输出评测结果
```shell
python run.py configs/eval_subjective_score.py -r
python run.py configs/eval_subjective.py -r
```
- `-r` 参数支持复用模型推理和评估结果。
@ -150,69 +140,9 @@ python run.py configs/eval_subjective_score.py -r
JudgeLLM的评测回复会保存在 `output/.../results/timestamp/xxmodel/xxdataset/.json`
评测报告则会输出到 `output/.../summary/timestamp/report.csv`
Opencompass 已经支持了很多的JudgeLLM,实际上,你可以将Opencompass中所支持的所有模型都当作JudgeLLM使用。
我们列出目前比较流行的开源JudgeLLM:
1. Auto-J请参考 `configs/models/judge_llm/auto_j`
如果使用了该方法,请添加引用:
```bibtex
@article{li2023generative,
title={Generative judge for evaluating alignment},
author={Li, Junlong and Sun, Shichao and Yuan, Weizhe and Fan, Run-Ze and Zhao, Hai and Liu, Pengfei},
journal={arXiv preprint arXiv:2310.05470},
year={2023}
}
@misc{2023opencompass,
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
author={OpenCompass Contributors},
howpublished = {\url{https://github.com/open-compass/opencompass}},
year={2023}
}
```
2. JudgeLM请参考 `configs/models/judge_llm/judgelm`
如果使用了该方法,请添加引用:
```bibtex
@article{zhu2023judgelm,
title={JudgeLM: Fine-tuned Large Language Models are Scalable Judges},
author={Zhu, Lianghui and Wang, Xinggang and Wang, Xinlong},
journal={arXiv preprint arXiv:2310.17631},
year={2023}
}
@misc{2023opencompass,
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
author={OpenCompass Contributors},
howpublished = {\url{https://github.com/open-compass/opencompass}},
year={2023}
}
```
3. PandaLM请参考 `configs/models/judge_llm/pandalm`
如果使用了该方法,请添加引用:
```bibtex
@article{wang2023pandalm,
title={PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization},
author={Wang, Yidong and Yu, Zhuohao and Zeng, Zhengran and Yang, Linyi and Wang, Cunxiang and Chen, Hao and Jiang, Chaoya and Xie, Rui and Wang, Jindong and Xie, Xing and others},
journal={arXiv preprint arXiv:2306.05087},
year={2023}
}
@misc{2023opencompass,
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
author={OpenCompass Contributors},
howpublished = {\url{https://github.com/open-compass/opencompass}},
year={2023}
}
```
## 主观多轮对话评测
在OpenCompass中我们同样支持了主观的多轮对话评测以MT-Bench为例对MTBench的评测可以参见`configs/eval_subjective_mtbench.py`
在OpenCompass中我们同样支持了主观的多轮对话评测以MT-Bench为例对MTBench的评测可以参见`configs/datasets/subjective/multiround`
在多轮对话评测中你需要将数据格式整理为如下的dialogue格式
@ -238,84 +168,3 @@ Opencompass 已经支持了很多的JudgeLLM实际上你可以将Opencompa
```
值得注意的是,由于MTBench各不同的题目类型设置了不同的温度,因此我们需要将原始数据文件按照温度分成三个不同的子集以分别推理,针对不同的子集我们可以设置不同的温度,具体设置参见`configs/datasets/subjective/multiround/mtbench_single_judge_diff_temp.py`。
如果使用了该方法,请添加引用:
```bibtex
@misc{zheng2023judging,
title={Judging LLM-as-a-judge with MT-Bench and Chatbot Arena},
author={Lianmin Zheng and Wei-Lin Chiang and Ying Sheng and Siyuan Zhuang and Zhanghao Wu and Yonghao Zhuang and Zi Lin and Zhuohan Li and Dacheng Li and Eric. P Xing and Hao Zhang and Joseph E. Gonzalez and Ion Stoica},
year={2023},
eprint={2306.05685},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{2023opencompass,
title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
author={OpenCompass Contributors},
howpublished = {\url{https://github.com/open-compass/opencompass}},
year={2023}
}
```
## 实战AlignBench 主观评测
### 数据集准备
```bash
mkdir -p ./data/subjective/
cd ./data/subjective
git clone https://github.com/THUDM/AlignBench.git
# data format conversion
python ../../../tools/convert_alignmentbench.py --mode json --jsonl data/data_release.jsonl
```
### 配置文件
请根据需要修改配置文件 `configs/eval_subjective_alignbench.py`
### 启动评测
按如下方式执行命令后,将会开始答案推理和主观打分,如只需进行推理,可以通过制定 `-m infer`实现
```bash
HF_EVALUATE_OFFLINE=1 HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 python run.py configs/eval_subjective_alignbench.py
```
### 提交官方评测Optional
完成评测后,如需提交官方榜单进行评测,可以使用它`tools/convert_alignmentbench.py`进行格式转换。
- 请确保已完成推理,并获得如下所示的文件:
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
├── predictions # 模型回复
├── results
└── summary
```
- 执行如下命令获得可用于提交的结果
```bash
python tools/convert_alignmentbench.py --mode csv --exp-folder outputs/20231214_173632
```
- 进入 `submission`文件夹获得可用于提交的`.csv`文件
```bash
outputs/
└── 20231214_173632
├── configs
├── logs
├── predictions
├── results
├── submission # 可提交文件
└── summary
```

View File

@ -4,6 +4,7 @@ import argparse
import getpass
import os
import os.path as osp
from copy import deepcopy
from datetime import datetime
from mmengine.config import Config, DictAction
@ -345,13 +346,37 @@ def main():
# visualize
if args.mode in ['all', 'eval', 'viz']:
summarizer_cfg = cfg.get('summarizer', {})
if not summarizer_cfg or summarizer_cfg.get('type', None) is None:
summarizer_cfg['type'] = DefaultSummarizer
summarizer_cfg['config'] = cfg
summarizer = build_from_cfg(summarizer_cfg)
summarizer.summarize(time_str=cfg_time_str)
# For subjective summarizer
if summarizer_cfg.get('function', None):
main_summarizer_cfg = deepcopy(summarizer_cfg)
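# Group datasets by the prefix of their abbr (the benchmark name), so that
# each subjective benchmark is first summarized by its own summarizer.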
grouped_datasets = {}
for dataset in cfg.datasets:
prefix = dataset['abbr'].split('_')[0]
if prefix not in grouped_datasets:
grouped_datasets[prefix] = []
grouped_datasets[prefix].append(dataset)
all_grouped_lists = []
for prefix in grouped_datasets:
all_grouped_lists.append(grouped_datasets[prefix])
dataset_score_container = []
for dataset in all_grouped_lists:
temp_cfg = deepcopy(cfg)
temp_cfg.datasets = dataset
summarizer_cfg = dict(type=dataset[0]['summarizer']['type'], config=temp_cfg)
summarizer = build_from_cfg(summarizer_cfg)
dataset_score = summarizer.summarize(time_str=cfg_time_str)
if dataset_score:
dataset_score_container.append(dataset_score)
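# All per-benchmark scores are collected; now run the top-level subjective
# summarizer over them to produce the combined report.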
main_summarizer_cfg['config'] = cfg
main_summarizer = build_from_cfg(main_summarizer_cfg)
main_summarizer.summarize(time_str=cfg_time_str, subjective_scores=dataset_score_container)
else:
if not summarizer_cfg or summarizer_cfg.get('type', None) is None:
summarizer_cfg['type'] = DefaultSummarizer
summarizer_cfg['config'] = cfg
summarizer = build_from_cfg(summarizer_cfg)
summarizer.summarize(time_str=cfg_time_str)
if __name__ == '__main__':
main()

View File

@ -88,7 +88,9 @@ class AlignmentBenchDataset(SubjectiveCmpDataset):
path: str,
name: str,
alignment_bench_config_path: Optional[str] = '',
alignment_bench_config_name: Optional[str] = ''):
alignment_bench_config_name: Optional[str] = '',
*args,
**kwargs):
if alignment_bench_config_path != '':
alignmentbench_config = Config(alignment_bench_config_path,
alignment_bench_config_name)
@ -106,17 +108,3 @@ class AlignmentBenchDataset(SubjectiveCmpDataset):
alignbench_dataset.append(data)
dataset = Dataset.from_list(alignbench_dataset)
return dataset
if __name__ == '__main__':
data = {
'question': '高音单簧管和高音萨克斯的调性相同吗?如果相同,请说出他们的调性,如果不同,请分别说出他们的调性',
'capability': '专业能力',
'others': {
'subcategory': '音乐',
'reference': '高音单簧管和高音萨克斯的调性不同。高音单簧管的调性通常为E♭而高音萨克斯的调性则为B♭。\n',
'question_id': 1
}
}
prefix = prompt_construct(data, alignmentbench_config)
print(prefix)

View File

@ -11,7 +11,7 @@ from ..base import BaseDataset
@LOAD_DATASET.register_module()
class ArenaHardDataset(BaseDataset):
def load(self, path: str, name: str):
def load(self, path: str, name: str, *args, **kwargs):
filename = osp.join(path, f'{name}.jsonl')
dataset = DatasetDict()
raw_data = []

View File

@ -8,11 +8,7 @@ from .subjective_cmp import SubjectiveCmpDataset
@LOAD_DATASET.register_module()
class CompassArenaDataset(SubjectiveCmpDataset):
def load(
self,
path: str,
name: str,
):
def load(self, path: str, name: str, *args, **kwargs):
dataset = list(super().load(path, name))
creation_dataset = []
for data in dataset:

View File

@ -77,7 +77,7 @@ Choice: [[C]]
@LOAD_DATASET.register_module()
class CompassBenchDataset(BaseDataset):
def load(self, path: str, name: str):
def load(self, path: str, name: str, *args, **kwargs):
filename = osp.join(path, f'{name}.json')
raw_data = []
with open(filename, 'r', encoding='utf-8') as f:

View File

@ -105,7 +105,7 @@ Choice: [[C]]
@LOAD_DATASET.register_module()
class CompassBenchControlLengthBiasDataset(BaseDataset):
def load(self, path: str, name: str):
def load(self, path: str, name: str, *args, **kwargs):
filename = osp.join(path, f'{name}.json')
raw_data = []
with open(filename, 'r', encoding='utf-8') as f:

View File

@ -12,7 +12,7 @@ from ..base import BaseDataset
@LOAD_DATASET.register_module()
class FofoDataset(BaseDataset):
def load(self, path: str, name: str):
def load(self, path: str, name: str, *args, **kwargs):
filename = osp.join(path, f'{name}.json')
raw_data = []
with open(filename, 'r', encoding='utf-8') as f:

View File

@ -2,7 +2,6 @@
import json
import os.path as osp
import re
from typing import Optional
from datasets import Dataset, DatasetDict
@ -166,7 +165,13 @@ def prompt_construct(problem, multi_turn=False, judge_type='single'):
@LOAD_DATASET.register_module()
class MTBenchDataset(BaseDataset):
def load(self, path: str, name: str, multi_turn=True, judge_type='single'):
def load(self,
path: str,
name: str,
judge_type='single',
multi_turn=True,
*args,
**kwargs):
filename = osp.join(path, f'{name}.json')
dataset = DatasetDict()
raw_data = []

View File

@ -2,7 +2,6 @@
import json
import os.path as osp
import re
from typing import Optional
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
@ -258,7 +257,7 @@ def add_format(question, answer):
@LOAD_DATASET.register_module()
class MTBench101Dataset(BaseDataset):
def load(self, path: str, name: str):
def load(self, path: str, name: str, *args, **kwargs):
import copy
filename = osp.join(path, f'{name}.jsonl')

View File

@ -11,7 +11,7 @@ from ..base import BaseDataset
@LOAD_DATASET.register_module()
class SubjectiveCmpDataset(BaseDataset):
def load(self, path: str, name: str):
def load(self, path: str, name: str, *args, **kwargs):
filename = osp.join(path, f'{name}.json')
dataset = DatasetDict()
raw_data = []

View File

@ -102,6 +102,25 @@ def remove_already_tasks(tasks, work_dir, meta_judge_model):
return tasks_to_keep
def get_model_combinations(
mode,
models: List[ConfigDict],
base_models: Optional[List[ConfigDict]] = [],
compare_models: Optional[List[ConfigDict]] = []) -> List:
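"""Build the model pairs to be judged: all 2-combinations of `models`
for 'allpair', the deduplicated base_models x compare_models product
for 'm2n', and None for 'fixed'."""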
if mode == 'allpair':
assert len(models) > 1
return combinations(models, 2)
elif mode == 'm2n':
assert len(base_models) > 0 and len(compare_models) > 0
model_combinations = list(product(base_models, compare_models))
unique_combinations = remove_duplicate_pairs(
[combo for combo in model_combinations if combo[0] != combo[1]])
return unique_combinations
elif mode == 'fixed':
pass
return None
@PARTITIONERS.register_module()
class SubjectiveNaivePartitioner(NaivePartitioner):
"""Naive task partitioner for subjective evaluation. Compared to
@ -113,46 +132,25 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
to the task config.
"""
def __init__(self,
mode: str,
out_dir: str,
models: Optional[List[ConfigDict]] = [],
base_models: Optional[List[ConfigDict]] = [],
compare_models: Optional[List[ConfigDict]] = [],
judge_models: Optional[List[ConfigDict]] = [],
meta_judge_model: Optional[ConfigDict] = None,
model_pairs: Optional[List[Tuple]] = None,
keep_keys: Optional[List[str]] = None,
infer_order: Optional[str] = 'random'):
def __init__(
self,
out_dir: str,
models: Optional[List[ConfigDict]] = [],
base_models: Optional[List[ConfigDict]] = [],
compare_models: Optional[List[ConfigDict]] = [],
judge_models: Optional[List[ConfigDict]] = [],
meta_judge_model: Optional[ConfigDict] = None,
model_pairs: Optional[List[Tuple]] = None,
keep_keys: Optional[List[str]] = None,
):
super().__init__(out_dir=out_dir, keep_keys=keep_keys)
assert mode in ['singlescore', 'allpair', 'm2n', 'fixed']
assert infer_order in ['random', 'double']
self.mode = mode
self.models = models
self.base_models = base_models
self.compare_models = compare_models
self.model_pairs = model_pairs
self.judge_models = judge_models
self.meta_judge_model = meta_judge_model
self.infer_order = infer_order
def get_model_combinations(
self,
models: List[ConfigDict],
base_models: Optional[List[ConfigDict]] = [],
compare_models: Optional[List[ConfigDict]] = []) -> List:
if self.mode == 'allpair':
assert len(models) > 1
return combinations(models, 2)
elif self.mode == 'm2n':
assert len(base_models) > 0 and len(compare_models) > 0
model_combinations = list(product(base_models, compare_models))
unique_combinations = remove_duplicate_pairs([
combo for combo in model_combinations if combo[0] != combo[1]
])
return unique_combinations
elif self.mode == 'fixed':
pass
def partition(self,
models: List[ConfigDict],
@ -187,34 +185,46 @@ class SubjectiveNaivePartitioner(NaivePartitioner):
models = self.models if self.models != [] else models
base_models, compare_models = self.base_models, self.compare_models
judge_models, meta_judge_model = self.judge_models, self.meta_judge_model
if self.mode == 'singlescore':
models = models
else:
models = self.get_model_combinations(models, base_models,
compare_models)
model_dataset_combinations = [{'models': models, 'datasets': datasets}]
tasks = super().partition(
model_dataset_combinations=model_dataset_combinations,
work_dir=work_dir,
out_dir=out_dir,
add_cfg=add_cfg)
all_tasks = []
for dataset in datasets:
mode = dataset['mode']
infer_order = dataset.get('infer_order', None)
assert mode in ['singlescore', 'allpair', 'm2n', 'fixed']
assert infer_order in ['random', 'double', None]
if mode == 'singlescore':
temp_models = models
else:
temp_models = get_model_combinations(mode, models,
dataset['base_models'],
models)
model_dataset_combinations = [{
'models': temp_models,
'datasets': [dataset]
}]
# We need to add judge models and meta-judge-model as new tasks
# When there is no meta-judge-model, we assign all judge models to each task
# When there is a meta-judge-model, we add an additional task stage
tasks = replicate_tasks_with_judge_models(tasks, judge_models,
meta_judge_model)
tasks = super().partition(
model_dataset_combinations=model_dataset_combinations,
work_dir=work_dir,
out_dir=out_dir,
add_cfg=add_cfg)
# We also need to check and remove the already done tasks
tasks = remove_already_tasks(tasks, work_dir, meta_judge_model)
if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
tasks[0], list):
# Refer to meta review judge
for task_stage in tasks:
for task in task_stage:
task['infer_order'] = self.infer_order
else:
# Refer to just have review judge
for task in tasks:
task['infer_order'] = self.infer_order
return tasks
# We need to add judge models and meta-judge-model as new tasks
# When there is no meta-judge-model, we assign all judge models to each task
# When there is a meta-judge-model, we add an additional task stage
tasks = replicate_tasks_with_judge_models(tasks, judge_models,
meta_judge_model)
# We also need to check and remove the already done tasks
tasks = remove_already_tasks(tasks, work_dir, meta_judge_model)
if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
tasks[0], list):
# Refer to meta review judge
for task_stage in tasks:
for task in task_stage:
task['infer_order'] = infer_order
else:
# Refer to just have review judge
for task in tasks:
task['infer_order'] = infer_order
all_tasks += tasks
return all_tasks

View File

@ -0,0 +1,209 @@
# flake8: noqa: E501
import copy
import math
import os.path as osp
from typing import Dict, List, Optional, Tuple
import mmengine
from mmengine.config import Config, ConfigDict
from opencompass.registry import PARTITIONERS
from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
get_infer_output_path)
from .sub_naive import (SubjectiveNaivePartitioner, get_model_combinations,
remove_already_tasks,
replicate_tasks_with_judge_models)
@PARTITIONERS.register_module()
class SubjectiveNumWorkerPartitioner(SubjectiveNaivePartitioner):
"""Task partitioner based on the pre-defined number of workers.
Args:
out_dir (str): The output directory of tasks.
num_worker (int): The number of workers. default: 8.
min_task_size (int): The minimum size of a task. default: 16.
dataset_size_path (str): The path to the dataset size cache file.
keep_keys (list[str]): The keys to be kept from the experiment config
to the task config.
"""
def __init__(self,
out_dir: str,
models: Optional[List[ConfigDict]] = [],
base_models: Optional[List[ConfigDict]] = [],
compare_models: Optional[List[ConfigDict]] = [],
judge_models: Optional[List[ConfigDict]] = [],
meta_judge_model: Optional[ConfigDict] = None,
model_pairs: Optional[List[Tuple]] = None,
num_worker: int = 8,
num_worker_split: Optional[int] = None,
min_task_size: int = 16,
strategy: str = 'heuristic',
dataset_size_path: str = '.cache/dataset_size.json',
keep_keys: Optional[List[str]] = None):
super().__init__(
out_dir=out_dir,
keep_keys=keep_keys,
models=models,
base_models=base_models,
compare_models=compare_models,
judge_models=judge_models,
meta_judge_model=meta_judge_model,
model_pairs=model_pairs,
)
if strategy == 'split' and num_worker_split is not None:
self.logger.warning('num_worker_split is ignored with split.')
self.num_worker = num_worker
self.num_worker_split = num_worker_split or num_worker
self.min_task_size = min_task_size
self.dataset_size_path = dataset_size_path
assert strategy in ('heuristic', 'split'), \
f'Unsupported partition strategy: {strategy}. '\
'Supported strategies are: `heuristic`, `split` .'
self.strategy = strategy
def partition(self,
models: List[ConfigDict],
datasets: List[ConfigDict],
work_dir: str,
out_dir: str,
add_cfg: Dict = {}) -> List[ConfigDict]:
# intentionally avoid any sort here,
# for the user's ability to manipulate the order
models = self.models if self.models != [] else models
judge_models, meta_judge_model = self.judge_models, self.meta_judge_model
self.num_worker = int(self.num_worker / len(datasets))
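# Each dataset is partitioned on its own below, so split the worker budget evenly across datasets.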
all_tasks = []
for dataset in datasets:
mode = dataset['mode']
infer_order = dataset.get('infer_order', None)
assert mode in ['singlescore', 'allpair', 'm2n', 'fixed']
assert infer_order in ['random', 'double', None]
if mode == 'singlescore':
temp_models = models
else:
temp_models = get_model_combinations(mode, models,
dataset['base_models'],
models)
model_dataset_combinations = [{
'models': temp_models,
'datasets': [dataset]
}]
tasks = []
for comb in model_dataset_combinations:
for model in comb['models']:
chunks = []
for dataset in comb['datasets']:
filename = get_infer_output_path(
model, dataset, out_dir)
# skip the task if the task output exists
if osp.exists(filename):
continue
dataset_size = self.get_size(dataset)
if self.num_worker <= 1:
chunks.append(dataset)
elif dataset_size <= self.min_task_size:
chunks.append(dataset)
else:
root, ext = osp.splitext(filename)
dataset_splits = self.split_dataset(dataset)
for i, dataset_split in enumerate(dataset_splits):
if not osp.exists(f'{root}_{i}{ext}'):
chunks.append(dataset_split)
if self.strategy == 'heuristic':
buckets = [[] for _ in range(self.num_worker_split)]
for i, chunk in enumerate(chunks):
buckets[i % self.num_worker_split].append(chunk)
for bucket in buckets:
if len(bucket) > 0:
tasks.append(
Config({
'models': [model],
'datasets': [bucket],
'work_dir': work_dir,
**add_cfg
}))
elif self.strategy == 'split':
for dataset in chunks:
tasks.append(
Config({
'models': [model],
'datasets': [[dataset]],
'work_dir': work_dir,
**add_cfg
}))
# We need to add judge models and meta-judge-model as new tasks
# When there is no meta-judge-model, we assign all judge models to each task
# When there is a meta-judge-model, we add an additional task stage
tasks = replicate_tasks_with_judge_models(tasks, judge_models,
meta_judge_model)
# We also need to check and remove the already done tasks
tasks = remove_already_tasks(tasks, work_dir, meta_judge_model)
if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
tasks[0], list):
# Refer to meta review judge
for task_stage in tasks:
for task in task_stage:
task['infer_order'] = infer_order
else:
# Refer to just have review judge
for task in tasks:
task['infer_order'] = infer_order
all_tasks += tasks
return all_tasks
@property
def dataset_size(self):
if not hasattr(self, '_dataset_size'):
if osp.exists(self.dataset_size_path):
self._dataset_size = mmengine.load(self.dataset_size_path)
else:
self._dataset_size = {}
return self._dataset_size
def split_dataset(self, dataset_cfg: ConfigDict) -> List[ConfigDict]:
"""Split dataset into several parts."""
dataset_size = self.get_size(dataset_cfg)
split_configs = []
abbr = dataset_abbr_from_cfg(dataset_cfg)
# evenly distribute the task
num_split = self.num_worker
step = max(math.ceil(dataset_size / num_split), self.min_task_size)
for part, i in enumerate(range(0, dataset_size, step)):
cfg = copy.deepcopy(dataset_cfg)
cfg['abbr'] = abbr + f'_{part}'
test_range = cfg['reader_cfg'].get('test_range', '')
cfg['reader_cfg']['test_range'] = f'{test_range}[{i}:{i+step}]'
split_configs.append(cfg)
return split_configs
def get_size(self, dataset: ConfigDict) -> int:
dataset_abbr = dataset_abbr_from_cfg(dataset)
test_range = dataset.reader_cfg.get('test_range', '')
if dataset_abbr in self.dataset_size:
actual_size = eval('len(range(self.dataset_size[dataset_abbr])'
f'{test_range})')
return actual_size
dataset = build_dataset_from_cfg(dataset)
self.dataset_size[dataset_abbr] = len(dataset.test)
mmengine.mkdir_or_exist('.cache/')
mmengine.dump(self.dataset_size,
self.dataset_size_path,
indent=4,
ensure_ascii=False)
actual_size = eval('len(range(self.dataset_size[dataset_abbr])'
f'{test_range})')
return actual_size

View File

@ -12,7 +12,8 @@ from opencompass.registry import PARTITIONERS
from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg,
get_infer_output_path)
from .sub_naive import (SubjectiveNaivePartitioner, remove_already_tasks,
from .sub_naive import (SubjectiveNaivePartitioner, get_model_combinations,
remove_already_tasks,
replicate_tasks_with_judge_models)
@ -36,31 +37,31 @@ class SubjectiveSizePartitioner(SubjectiveNaivePartitioner):
to the task config.
"""
def __init__(self,
mode: str,
out_dir: str,
models: Optional[List[ConfigDict]] = [],
base_models: Optional[List[ConfigDict]] = [],
compare_models: Optional[List[ConfigDict]] = [],
judge_models: Optional[List[ConfigDict]] = [],
meta_judge_model: Optional[ConfigDict] = None,
model_pairs: Optional[List[Tuple]] = None,
max_task_size: int = 40000,
gen_task_coef: int = 20,
strategy: str = 'heuristic',
dataset_size_path: str = '.cache/dataset_size.json',
keep_keys: Optional[List[str]] = None,
infer_order: Optional[str] = 'random'):
super().__init__(out_dir=out_dir,
keep_keys=keep_keys,
mode=mode,
models=models,
base_models=base_models,
compare_models=compare_models,
judge_models=judge_models,
meta_judge_model=meta_judge_model,
model_pairs=model_pairs,
infer_order=infer_order)
def __init__(
self,
out_dir: str,
models: Optional[List[ConfigDict]] = [],
base_models: Optional[List[ConfigDict]] = [],
compare_models: Optional[List[ConfigDict]] = [],
judge_models: Optional[List[ConfigDict]] = [],
meta_judge_model: Optional[ConfigDict] = None,
model_pairs: Optional[List[Tuple]] = None,
max_task_size: int = 40000,
gen_task_coef: int = 20,
strategy: str = 'heuristic',
dataset_size_path: str = '.cache/dataset_size.json',
keep_keys: Optional[List[str]] = None,
):
super().__init__(
out_dir=out_dir,
keep_keys=keep_keys,
models=models,
base_models=base_models,
compare_models=compare_models,
judge_models=judge_models,
meta_judge_model=meta_judge_model,
model_pairs=model_pairs,
)
self.max_task_size = max_task_size
self.gen_task_coef = gen_task_coef
self.dataset_size_path = dataset_size_path
@ -105,76 +106,94 @@ class SubjectiveSizePartitioner(SubjectiveNaivePartitioner):
models = self.models if self.models != [] else models
base_models, compare_models = self.base_models, self.compare_models
judge_models, meta_judge_model = self.judge_models, self.meta_judge_model
if self.mode == 'singlescore':
models = models
else:
models = super().get_model_combinations(models, base_models,
compare_models)
model_dataset_combinations = [{'models': models, 'datasets': datasets}]
tasks = []
for comb in model_dataset_combinations:
comb['datasets'] = sorted(comb['datasets'],
key=lambda x: self.get_cost(x),
reverse=True)
for model in comb['models']:
chunks = [] # elements: tuple(size, dataset_chunk)
for dataset in comb['datasets']:
filename = get_infer_output_path(model, dataset, out_dir)
# skip the task if the task output exists
# if osp.exists(filename):
# continue
dataset_size = self.get_cost(dataset)
if dataset_size > self.max_task_size:
root, ext = osp.splitext(filename)
dataset_splits = self.split_dataset(dataset)
for i, dataset_split in enumerate(dataset_splits):
if not osp.exists(f'{root}_{i}{ext}'):
chunks.append(
(self.max_task_size, dataset_split))
else:
chunks.append((dataset_size, dataset))
self.max_task_size *= len(datasets)
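# Partitioning now runs once per dataset, so scale the per-task size budget by the number of datasets.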
all_tasks = []
for dataset in datasets:
mode = dataset['mode']
infer_order = dataset.get('infer_order', None)
assert mode in ['singlescore', 'allpair', 'm2n', 'fixed']
assert infer_order in ['random', 'double', None]
if mode == 'singlescore':
temp_models = models
else:
temp_models = get_model_combinations(mode, models,
dataset['base_models'],
models)
model_dataset_combinations = [{
'models': temp_models,
'datasets': [dataset]
}]
if self.strategy == 'heuristic':
chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
current_size, current_chunks = 0, []
for index in range(len(chunks)):
current_size += chunks[index][0]
current_chunks.append(chunks[index][1])
if index == len(chunks) - 1 or current_size + chunks[
index + 1][0] > self.max_task_size:
tasks = []
for comb in model_dataset_combinations:
comb['datasets'] = sorted(comb['datasets'],
key=lambda x: self.get_cost(x),
reverse=True)
for model in comb['models']:
chunks = [] # elements: tuple(size, dataset_chunk)
for dataset in comb['datasets']:
filename = get_infer_output_path(
model, dataset, out_dir)
# skip the task if the task output exists
# if osp.exists(filename):
# continue
dataset_size = self.get_cost(dataset)
if dataset_size > self.max_task_size:
root, ext = osp.splitext(filename)
dataset_splits = self.split_dataset(dataset)
for i, dataset_split in enumerate(dataset_splits):
if not osp.exists(f'{root}_{i}{ext}'):
chunks.append(
(self.max_task_size, dataset_split))
else:
chunks.append((dataset_size, dataset))
if self.strategy == 'heuristic':
chunks = sorted(chunks,
key=lambda x: x[0],
reverse=True)
current_size, current_chunks = 0, []
for index in range(len(chunks)):
current_size += chunks[index][0]
current_chunks.append(chunks[index][1])
if index == len(
chunks) - 1 or current_size + chunks[
index + 1][0] > self.max_task_size:
tasks.append(
Config({
'models': [model],
'datasets': [current_chunks],
'work_dir': work_dir,
**add_cfg
}))
current_size, current_chunks = 0, []
elif self.strategy == 'split':
for _, dataset in chunks:
tasks.append(
Config({
'models': [model],
'datasets': [current_chunks],
'datasets': [[dataset]],
'work_dir': work_dir,
**add_cfg
}))
current_size, current_chunks = 0, []
elif self.strategy == 'split':
for _, dataset in chunks:
tasks.append(
Config({
'models': [model],
'datasets': [[dataset]],
'work_dir': work_dir,
**add_cfg
}))
tasks = replicate_tasks_with_judge_models(tasks, judge_models,
meta_judge_model)
tasks = remove_already_tasks(tasks, work_dir, meta_judge_model)
tasks = replicate_tasks_with_judge_models(tasks, judge_models,
meta_judge_model)
tasks = remove_already_tasks(tasks, work_dir, meta_judge_model)
if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
tasks[0], list):
# Refer to meta review judge
for task_stage in tasks:
for task in task_stage:
task['infer_order'] = self.infer_order
else:
# Refer to just have review judge
for task in tasks:
task['infer_order'] = self.infer_order
return tasks
if isinstance(tasks, list) and len(tasks) != 0 and isinstance(
tasks[0], list):
# Refer to meta review judge
for task_stage in tasks:
for task in task_stage:
task['infer_order'] = infer_order
else:
# Refer to just have review judge
for task in tasks:
task['infer_order'] = infer_order
all_tasks += tasks
return all_tasks
@property
def dataset_size(self):

View File

@ -5,13 +5,12 @@ from .alpacaeval import AlpacaSummarizer
from .arenahard import ArenaHardSummarizer
from .compass_arena import CompassArenaSummarizer
from .compassbench import CompassBenchSummarizer
from .compassbench_th import CompassBenchTHSummarizer
from .corev2 import Corev2Summarizer
from .creationbench import CreationBenchSummarizer
from .flames import FlamesSummarizer
from .fofo import FofoSummarizer
from .information_retrival import IRSummarizer
from .mtbench import MTBenchSummarizer
from .mtbench101 import MTBench101Summarizer
from .multiround import MultiroundSummarizer
from .subjective import SubjectiveSummarizer
from .wildbench import WildBenchPairSummarizer, WildBenchSingleSummarizer

View File

@ -271,7 +271,6 @@ def get_capability_results(judged_answers,
capability_avg_ratings['总分'] /= len(temp_list)
capability_avg_ratings['总分'] = round(capability_avg_ratings['总分'], 2)
scores = {model: capability_avg_ratings}
with open(fout, 'a+', newline='') as csvfile:
writer = csv.writer(csvfile)
if fout_flag == 0:
@ -298,6 +297,15 @@ def get_capability_results(judged_answers,
row.append(scores[model][sub_category])
writer.writerow(row)
scores = scores[model]
scores.pop('中文推理总分', None)
scores.pop('中文语言总分', None)
# Creating a new dictionary with '总分' as the first item
updated_scores = {'总分': scores.pop('总分')}
updated_scores.update(scores)
return updated_scores
class AlignmentBenchSummarizer:
"""Do the subjectivity analyze based on evaluation results.
@ -338,42 +346,42 @@ class AlignmentBenchSummarizer:
Returns:
pd.DataFrame: The summary results.
"""
all_scores = {}
for judge_model in self.judge_models:
score_by_judgemodel = {}
judge_abbr = model_abbr_from_cfg(judge_model)
dataset_cfgs = self.cfg['datasets']
dataset = dataset_cfgs[0] # Alignbench just have only one subfile
output_dir, results_folder = get_outdir(self.cfg, time_str)
fout_flag, fout_flag2 = 0, 0
if self.judge_type == 'general':
fout = osp.join(
output_dir,
'Alignbench-judged-by--' + judge_abbr + '-dimension.csv')
fout2 = osp.join(
output_dir,
'Alignbench-judged-by--' + judge_abbr + '-capability.csv')
for eval_model_abbr in self.eval_model_abbrs:
subdir = eval_model_abbr + '_judged-by--' + judge_abbr
subdir_path = os.path.join(results_folder, subdir)
model = eval_model_abbr
if os.path.isdir(subdir_path):
model = eval_model_abbr
judged_answers, references = get_judgeanswer_and_reference(
dataset, subdir_path, self.judge_function)
if self.judge_type == 'general':
fout = osp.join(
output_dir,
'judged-by--' + judge_abbr + '-dimension.csv')
fout2 = osp.join(
output_dir,
'judged-by--' + judge_abbr + '-capability.csv')
for dataset in dataset_cfgs:
judged_answers, references = get_judgeanswer_and_reference(
dataset, subdir_path, self.judge_function)
if self.judge_type == 'general':
get_dimension_results(judged_answers, references,
fout, fout_flag, model)
fout_flag += 1
get_capability_results(judged_answers, references,
fout2, fout_flag2, model,
self.category)
fout_flag2 += 1
get_dimension_results(judged_answers, references, fout,
fout_flag, model)
fout_flag += 1
scores = get_capability_results(judged_answers, references,
fout2, fout_flag2, model,
self.category)
score_by_judgemodel[model] = scores
fout_flag2 += 1
else:
score_by_judgemodel[model] = None
print(subdir_path + ' does not exist! Please check!')
if self.judge_type == 'general':
with open(fout, 'r') as f:
x = from_csv(f, delimiter=',')
print(x)
print(fout)
with open(fout2, 'r') as f:
x = from_csv(f, delimiter=',')
print(x)
print(fout2)
all_scores[judge_abbr] = score_by_judgemodel
return {'Alignbench': all_scores}

View File

@ -80,10 +80,9 @@ class AlpacaSummarizer:
def __init__(self, config: ConfigDict, judge_type='v2') -> None:
self.tasks = []
self.cfg = config
self.base_models = self.cfg['eval']['partitioner']['base_models']
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
self.judge_abbr = model_abbr_from_cfg(
self.cfg['judge_models'][0]) # We will reorganize the summarizers
self.base_models = self.cfg['datasets'][0]['base_models']
self.compare_models = self.cfg['eval']['partitioner']['models']
self.judge_models = self.cfg.get('judge_models', None)
self.judge_type = judge_type
assert self.judge_type in ['v1', 'v2']
self.judge_map = {
@ -102,22 +101,34 @@ class AlpacaSummarizer:
Returns:
pd.DataFrame: The summary results.
"""
dataset_cfgs = self.cfg['datasets']
output_dir, results_folder = get_outdir(self.cfg, time_str)
model_combinations = list(
product(self.base_models, self.compare_models))
unique_combinations = remove_duplicate_pairs(
[combo for combo in model_combinations if combo[0] != combo[1]])
all_scores = {}
for judge_model in self.judge_models:
score_by_judgemodel = {}
judge_abbr = model_abbr_from_cfg(judge_model)
dataset_cfgs = self.cfg['datasets']
dataset = dataset_cfgs[0] # AlpacaEval just have only one subfile
dataset_abbr = dataset_abbr_from_cfg(dataset)
output_dir, results_folder = get_outdir(self.cfg, time_str)
model_combinations = list(
product(self.base_models, self.compare_models))
unique_combinations = remove_duplicate_pairs([
combo for combo in model_combinations if combo[0] != combo[1]
])
for model_pair in unique_combinations:
model1, model2 = model_pair[0]['abbr'], model_pair[1]['abbr']
subdir = model1 + '_' + model2 + '_judged-by--' + judge_abbr
subdir_path = os.path.join(results_folder, subdir)
filename = osp.realpath(
osp.join(subdir_path, dataset_abbr + '.json'))
partial_filename = osp.realpath(
osp.join(subdir_path, dataset_abbr + '_0.json'))
if osp.exists(osp.realpath(filename)) or osp.exists(
osp.realpath(partial_filename)):
fout = osp.join(
output_dir,
'AlpacaEval2-judged-by--' + judge_abbr + '.csv')
for model_pair in unique_combinations:
model1, model2, judge_model = model_pair[0]['abbr'], model_pair[1][
'abbr'], self.judge_abbr
subdir = model1 + '_' + model2 + '_judged-by--' + self.judge_abbr
subdir_path = os.path.join(results_folder, subdir)
if os.path.isdir(subdir_path):
fout = osp.join(output_dir,
'judged-by--' + judge_model + '-report.csv')
for dataset in dataset_cfgs:
judged_answers, references = get_judgeanswer_and_reference(
dataset, subdir_path, self.judge_function)
win_model1, win_model2, categories = defaultdict(
@ -155,9 +166,11 @@ class AlpacaSummarizer:
win_model2[capability] = round(
(win_model2[capability] /
categories[capability]) * 100, 2)
scores = {
'win_' + model1: win_model1,
'win_' + model2: win_model2
#'win_' + model1: win_model1, # We only show the winrate of model2, because model1 is the base model and AlpacaEval uses a single base model
'win_' + model2:
win_model2
}
rows = list(scores.keys())
columns = list(scores[rows[0]].keys())
@ -169,8 +182,11 @@ class AlpacaSummarizer:
writer.writerow(
[row] +
[scores[row][column] for column in columns])
else:
print(subdir_path + ' is not exist! please check!')
with open(fout, 'r') as f:
x = from_csv(f)
print(x)
win_model2_update = {'total': win_model2.pop('total')}
win_model2_update.update(win_model2)
score_by_judgemodel[model2] = win_model2_update
else:
score_by_judgemodel[model2] = None
# print(subdir_path + ' is not exist! please check!')
all_scores[judge_abbr] = score_by_judgemodel
return {'AlpacaEval': all_scores}

View File

@ -232,8 +232,8 @@ class ArenaHardSummarizer:
summary_type='single') -> None:
self.tasks = []
self.cfg = config
self.base_models = self.cfg['eval']['partitioner']['base_models']
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
self.base_models = self.cfg['datasets'][0]['base_models']
self.compare_models = self.cfg['eval']['partitioner']['models']
self.judge_models = self.cfg.get('judge_models', None)
self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
self.judge_type = judge_type
@ -251,23 +251,28 @@ class ArenaHardSummarizer:
if self.meta_judge_model is not None:
self.judge_models.append(self.meta_judge_model)
scores = {}
all_scores = {}
for idx, judge_model_cfg in enumerate(self.judge_models):
score_by_judgemodel = {}
judge_model = model_abbr_from_cfg(judge_model_cfg)
for dataset in self.cfg['datasets']:
dataset_abbr = dataset_abbr_from_cfg(dataset)
battles = pd.DataFrame()
print('Turning judgment results into battles...')
for model_pair in unique_combinations:
model1 = model_pair[0]['abbr']
model2 = model_pair[1]['abbr']
model1 = model_pair[0]['abbr'] # base model; in ArenaHard this is gpt4-0314
model2 = model_pair[1]['abbr'] # compare model, i.e. the model under evaluation
if idx == len(self.judge_models):
subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
else:
subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
subdir_path = os.path.join(results_folder, subdir)
if not os.path.isdir(subdir_path):
dataset_abbr = dataset_abbr_from_cfg(dataset)
filename = osp.realpath(osp.join(subdir_path, dataset_abbr + '.json'))
partial_filename = osp.realpath(osp.join(subdir_path, dataset_abbr + '_0.json'))
if not osp.exists(osp.realpath(filename)) and not osp.exists(osp.realpath(partial_filename)):
score_by_judgemodel[model2] = None
print(subdir_path + ' is not exist! please check!')
continue
@ -279,7 +284,7 @@ class ArenaHardSummarizer:
np.random.seed(42)
bootstrap_elo_lu = get_bootstrap_result(battles, compute_mle_elo, 100)
bootstrap_elo_lu.to_json(os.path.join(output_dir,'bootstrapping_results'+ judge_model+'.jsonl'), lines=True, orient='records')
bootstrap_elo_lu.to_json(os.path.join(output_dir,'arena_hard_bootstrapping_results_judged-by--'+ judge_model+'.jsonl'), lines=True, orient='records')
stats = pd.DataFrame()
stats['results'] = None
@ -292,8 +297,11 @@ class ArenaHardSummarizer:
stats.at[i, 'score'] = bootstrap_online_elo[model]
stats.at[i, 'lower'] = np.percentile(bootstrap_elo_lu[model], 2.5)
stats.at[i, 'upper'] = np.percentile(bootstrap_elo_lu[model], 97.5)
if model == 'gpt4-0314':
stats.at[i, 'avg_tokens'] = 423
if model == model1:
if model1 == 'gpt4-0314':
stats.at[i, 'avg_tokens'] = 423
else:
stats.at[i, 'avg_tokens'] = 0 # base model other than gpt4-0314, no reference token count available
else:
file_name = os.path.join(output_dir.split('summary')[0], 'predictions', model, dataset_abbr+'.json')
model_preds = load_model_preds(file_name)
@ -304,16 +312,20 @@ class ArenaHardSummarizer:
stats.at[i, 'avg_tokens'] = pred_length
stats.at[i, 'results'] = bootstrap_elo_lu[model].tolist()
stats.sort_values(by='model', inplace=True)
stats['score'] = get_win_rate_column(stats, 'score', 'gpt4-0314').tolist()
stats['lower'] = get_win_rate_column(stats, 'lower', 'gpt4-0314').tolist()
stats['upper'] = get_win_rate_column(stats, 'upper', 'gpt4-0314').tolist()
stats['score'] = get_win_rate_column(stats, 'score', model1).tolist()
stats['lower'] = get_win_rate_column(stats, 'lower', model1).tolist()
stats['upper'] = get_win_rate_column(stats, 'upper', model1).tolist()
decimal = 1
stats.sort_values(by='score', ascending=False, inplace=True)
for _, row in stats.iterrows():
interval = str((round(row['lower'] - row['score'], decimal), round(row['upper'] - row['score'], decimal)))
print(f"{row['model'] : <30} | score: {round(row['score'], decimal) : ^5} | 95% CI: {interval : ^12} | average #tokens: {int(row['avg_tokens'])}")
if row['model'] != model1:
score_by_judgemodel[row['model']] = {'score': row['score']}
stats.to_json(os.path.join(output_dir,'arena_hard_leaderboard_judged-by--'+judge_model+'.json'), orient='records', indent=4)
stats.to_csv(os.path.join(output_dir,'arena_hard_leaderboard_judged-by--'+judge_model+'.csv'))
all_scores[judge_model] = score_by_judgemodel
return {'ArenaHard': all_scores}
def summarize(
self,
@ -327,4 +339,4 @@ class ArenaHardSummarizer:
Returns:
pd.DataFrame: The summary results.
"""
self.get_score(time_str)
return self.get_score(time_str)
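
get_win_rate_column converts the bootstrapped Elo-style ratings into win rates against the configurable base model (model1) rather than against a hard-coded gpt4-0314. The repository's own helper is not reproduced here; the sketch below only illustrates the standard Elo expected-score formula that this kind of conversion is typically based on, with invented model names and ratings:

import numpy as np
import pandas as pd

def win_rate_vs_base(stats: pd.DataFrame, column: str, base_model: str) -> pd.Series:
    """Convert Elo-style ratings in `column` into expected win rates against `base_model`."""
    base_rating = stats.loc[stats['model'] == base_model, column].iloc[0]
    # Elo expected score of each model against the base model, expressed as a percentage.
    return (1.0 / (1.0 + 10 ** ((base_rating - stats[column]) / 400))) * 100

stats = pd.DataFrame({'model': ['base-model', 'candidate-a'], 'score': [1000.0, 1100.0]})
print(np.round(win_rate_vs_base(stats, 'score', 'base-model'), 1))
# the base model maps to 50.0 by construction; a +100 Elo gap maps to roughly 64.0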

View File

@ -70,8 +70,8 @@ class CompassArenaSummarizer:
summary_type='single') -> None:
self.tasks = []
self.cfg = config
self.base_models = self.cfg['eval']['partitioner']['base_models']
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
self.base_models = self.cfg['datasets'][0]['base_models']
self.compare_models = self.cfg['eval']['partitioner']['models']
self.judge_models = self.cfg.get('judge_models', None)
self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
self.judge_type = judge_type
@ -107,6 +107,9 @@ class CompassArenaSummarizer:
print(subdir_path + ' is not exist! please check!')
continue
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
if len(judged_answers) == 0:
scores[judge_model][dataset_abbr][model2] = {}
continue
if self.check_pos_bias:
bias_num = check_position_bias(judged_answers, references)
else:
@ -175,8 +178,9 @@ class CompassArenaSummarizer:
# scores['win_' + model1] = win_model1
output_dir, results_folder = get_outdir(self.cfg, time_str)
all_scores = {}
for idx, judge_model in enumerate(self.judge_models):
score_by_judgemodel = {}
judge_abbr = model_abbr_from_cfg(judge_model)
for dataset in self.cfg['datasets']:
dataset_abbr = dataset_abbr_from_cfg(dataset)
@ -198,18 +202,16 @@ class CompassArenaSummarizer:
row.append(s)
table.append(row)
txt = tabulate(table, headers=headers)
print(txt)
if idx == len(self.judge_models):
output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
output_filename = osp.join(output_dir, dataset_abbr + '-summarized-by--' + judge_abbr + '-report.csv')
else:
output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
output_filename = osp.join(output_dir, dataset_abbr + '-judged-by--' + judge_abbr + '-report.csv')
with open(output_filename, 'w') as f:
f.write(','.join(headers) + '\n')
for line in table:
f.write(','.join(line) + '\n')
print(output_filename)
table = []
summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
@ -227,14 +229,21 @@ class CompassArenaSummarizer:
row.append(s)
table.append(row)
txt = tabulate(table, headers=headers)
print(txt)
if idx == len(self.judge_models):
output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-overall-report.csv')
output_filename = osp.join(output_dir, 'compassarena-overall-summarized-by--' + judge_abbr + '.csv')
else:
output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-overall-report.csv')
output_filename = osp.join(output_dir, 'compassarena-overall-judged-by--' + judge_abbr + '.csv')
table = [[row[0]] + [f'{x:.2f}' if not isinstance(x, str) else x for x in row[1:]] for row in table]
with open(output_filename, 'w') as f:
f.write(','.join(headers) + '\n')
for line in table:
f.write(','.join(line) + '\n')
print(output_filename)
for idx, model in enumerate(summarizer_model_abbrs):
score_by_judgemodel[model] = {}
for subset in table:
score_by_judgemodel[model][subset[0]] = subset[idx+1]
all_scores[judge_abbr]=score_by_judgemodel
return {'CompassArena': all_scores}
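
The closing loop above turns the tabulated report (one row per subset, one column per compared model) into the per-judge, per-model mapping that the new SubjectiveSummarizer consumes. A small standalone illustration of that transposition with made-up subset and model names:

# table rows: [subset_name, score_for_model_0, score_for_model_1, ...]
summarizer_model_abbrs = ['model-a', 'model-b']
table = [
    ['knowledge', '61.20', '55.00'],
    ['reasoning', '48.75', '52.10'],
]

score_by_judgemodel = {}
for idx, model in enumerate(summarizer_model_abbrs):
    score_by_judgemodel[model] = {}
    for subset in table:
        score_by_judgemodel[model][subset[0]] = subset[idx + 1]

print(score_by_judgemodel)
# {'model-a': {'knowledge': '61.20', 'reasoning': '48.75'},
#  'model-b': {'knowledge': '55.00', 'reasoning': '52.10'}}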

View File

@ -71,8 +71,8 @@ class CompassBenchSummarizer:
summary_type='single') -> None:
self.tasks = []
self.cfg = config
self.base_models = self.cfg['eval']['partitioner']['base_models']
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
self.base_models = self.cfg['datasets'][0]['base_models']
self.compare_models = self.cfg['eval']['partitioner']['models']
self.judge_models = self.cfg.get('judge_models', None)
self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
self.judge_type = judge_type
@ -108,6 +108,9 @@ class CompassBenchSummarizer:
print(subdir_path + ' is not exist! please check!')
continue
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
if len(judged_answers) == 0:
scores[judge_model][dataset_abbr][model2] = {}
continue
if self.check_pos_bias:
bias_num = check_position_bias(judged_answers, references)
else:
@ -189,8 +192,9 @@ class CompassBenchSummarizer:
# scores['win_' + model1] = win_model1
output_dir, results_folder = get_outdir(self.cfg, time_str)
all_judge_file_list = []
all_scores = {}
for idx, judge_model in enumerate(self.judge_models):
score_by_judgemodel = {}
judge_abbr = model_abbr_from_cfg(judge_model)
for dataset in self.cfg['datasets']:
dataset_abbr = dataset_abbr_from_cfg(dataset)
@ -220,24 +224,25 @@ class CompassBenchSummarizer:
# print(txt)
if idx == len(self.judge_models):
output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
output_filename = osp.join(output_dir, dataset_abbr + '-summarized-by--' + judge_abbr + '-report.csv')
else:
output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
output_filename = osp.join(output_dir, dataset_abbr + '-judged-by--' + judge_abbr + '-report.csv')
with open(output_filename, 'w') as f:
f.write(','.join(headers) + '\n')
for line in table:
f.write(','.join(line) + '\n')
print(output_filename)
all_judge_file_list.append(output_filename)
for idx, model in enumerate(summarizer_model_abbrs):
score_by_judgemodel[model] = {'overall': table[0][idx+1]}
all_scores[judge_abbr]=score_by_judgemodel
dfs = [pd.read_csv(file) for file in all_judge_file_list]
if len(dfs) > 1:
average_df = copy.deepcopy(dfs[0])
for col in dfs[0].columns[1:]:
for i in range(1, len(dfs[0])):
for i in range(0, len(dfs[0])):
average_df[col][i] = round(sum(df[col][i] for df in dfs) / len(dfs), 2)
average_csv_path = osp.join(output_dir, 'Averaged-' + dataset_abbr + '-report.csv')
average_csv_path = osp.join(output_dir, 'CompassBench-Averaged-' + dataset_abbr + '-report.csv')
average_df.to_csv(average_csv_path, index=False)
print(average_csv_path)
return {'CompassBench': all_scores}
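
When several judge models (or a meta judge) are configured, the per-judge report CSVs are averaged cell by cell; note that the inner loop now starts at row 0, so the first data row is included in the average. A hedged pandas sketch of the same cell-wise averaging over invented report contents:

import pandas as pd

# Two per-judge reports with identical layout: the first column is the row label,
# the remaining columns hold one score per evaluated model.
df_judge1 = pd.DataFrame({'subset': ['overall', 'reasoning'], 'model-a': [50.0, 48.0]})
df_judge2 = pd.DataFrame({'subset': ['overall', 'reasoning'], 'model-a': [54.0, 46.0]})
dfs = [df_judge1, df_judge2]

average_df = dfs[0].copy()
for col in dfs[0].columns[1:]:
    # iterate over every data row, including row 0
    for i in range(len(dfs[0])):
        average_df.loc[i, col] = round(sum(df.loc[i, col] for df in dfs) / len(dfs), 2)

print(average_df)
# subset 'overall' -> 52.0, 'reasoning' -> 47.0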

View File

@ -1,353 +0,0 @@
# flake8: noqa
# yapf: disable
import copy
import os
import os.path as osp
import re
from collections import defaultdict
from datetime import datetime
from itertools import product
import mmengine
import pandas as pd
from mmengine import ConfigDict
from tabulate import tabulate
from opencompass.partitioners.sub_naive import remove_duplicate_pairs
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
def model_abbr_from_cfg_used_in_summarizer(model):
if model.get('summarizer_abbr', None):
return model['summarizer_abbr']
else:
return model_abbr_from_cfg(model)
def post_process_compass_arena(s):
if result := re.findall(r'(?:选择:|Choice: )\[\[([ABC])\]\]', s):
return result[0]
else:
return None
def get_outdir(cfg, time_str):
"""Get out put path.
Args:
cfg (ConfigDict): The running config.
time_str (str): Current time.
"""
work_dir = cfg['work_dir']
output_path = osp.join(work_dir, 'summary', f'summary_{time_str}.txt')
output_dir = osp.join(osp.split(output_path)[0], f'{time_str}')
mmengine.mkdir_or_exist(output_dir)
results_folder = osp.join(work_dir, 'results')
return output_dir, results_folder
def get_judgeanswer_and_reference(dataset, subdir_path, post_process):
"""Extract judgements (scores) and references.
Args:
dataset (ConfigDict): Dataset config.
subdir_path (str): Model path in results dir.
post_process (function): The pre-defined extract function.
"""
dataset_abbr = dataset_abbr_from_cfg(dataset)
filename = osp.join(subdir_path, dataset_abbr + '.json')
partial_filename = osp.join(subdir_path, dataset_abbr + '_0.json')
if osp.exists(osp.realpath(filename)):
result = mmengine.load(filename)
elif osp.exists(osp.realpath(partial_filename)):
filename = partial_filename
result = {}
i = 1
partial_dict_flag = 0
while osp.exists(osp.realpath(filename)):
res = mmengine.load(filename)
for k, v in res.items():
result[partial_dict_flag] = v
partial_dict_flag += 1
filename = osp.join(subdir_path,
dataset_abbr + '_' + str(i) + '.json')
i += 1
else:
result = {}
if len(result) == 0:
print('*' * 100)
print('There are no results for ' + filename + ' or ' +
partial_filename)
print('*' * 100)
assert len(result) > 0
judged_answers = []
references = []
result_items = []
for k, v in result.items():
processed_judge = post_process(v['prediction'])
if processed_judge is not None:
judged_answers.append(processed_judge)
references.append(v['gold'])
result_items.append(v)
# else:
# print(v['prediction'])
# print('-' * 128)
if len(judged_answers) != len(result):
print(
f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements, please check!'
)
if len(judged_answers) == 0:
print('*' * 100)
print(
'There are no extracted judgements, please change your judge model or check your prompt!!!'
)
print('*' * 100)
assert len(judged_answers) > 0
return judged_answers, references, result_items
def check_position_bias(judged_answers, references, banned_choice=['C']):
"""Check position bias for judgellm's judgement.
Args:
judged_answers: The successfully extracted judgement.
references: The references contains original question, which is used to located the same question for different position judgement.
"""
position_bias_flag = 0
position_bias_dict = {}
for judge, ref in zip(judged_answers, references):
question = ref['question']
question_hash = hash(question)
if question_hash not in position_bias_dict:
position_bias_dict[question_hash] = {
'question': question,
'judge': judge
}
else:
first_judge = position_bias_dict[question_hash]['judge']
if judge == first_judge and first_judge not in banned_choice and judge not in banned_choice:
# If the second (position-swapped) judgement matches the first, count it as position bias.
position_bias_flag += 1
return position_bias_flag
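
Although this file is removed by the refactor, the position-bias check it carried is a common pattern in pairwise judging: every question is judged twice with the answers swapped, and picking the same letter in both orders (ties excluded) indicates a preference for a position rather than for an answer. A standalone illustration of that idea, not the repository's exact helper:

def count_position_bias(judged_answers, references, banned_choice=('C',)):
    """Count questions where both position-swapped judgements pick the same side."""
    first_verdicts, bias = {}, 0
    for judge, ref in zip(judged_answers, references):
        q = ref['question']
        if q not in first_verdicts:
            first_verdicts[q] = judge          # first-order judgement
        elif judge == first_verdicts[q] and judge not in banned_choice:
            bias += 1                          # same letter after swapping -> position bias
    return bias

judged = ['A', 'A', 'A', 'B']                  # q1 judged 'A' in both orders -> biased
refs = [{'question': 'q1'}, {'question': 'q1'},
        {'question': 'q2'}, {'question': 'q2'}]
print(count_position_bias(judged, refs))       # 1
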
def count_chinese_characters(text):
words = re.findall(r'[\u4e00-\u9fff]', text)
return len(words)
def count_english_words(text):
words = re.findall(r'\b[a-zA-Z]+\b', text)
return len(words)
class CompassBenchTHSummarizer:
"""Do the subjectivity analyze based on evaluation results.
Args:
config (ConfigDict): The configuration object of the evaluation task.
It's expected to be filled out at runtime.
"""
def __init__(self,
config: ConfigDict,
judge_type='general',
check_pos_bias=True,
summary_type='single',
word_count_threshold=None) -> None:
self.tasks = []
self.cfg = config
self.base_models = self.cfg['eval']['partitioner']['base_models']
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
self.judge_models = self.cfg.get('judge_models', None)
self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
self.judge_type = judge_type
assert self.judge_type in ['general']
self.judge_map = {'general': post_process_compass_arena}
self.judge_function = self.judge_map[self.judge_type]
self.check_pos_bias = check_pos_bias
self.summary_type = summary_type
self.word_count_threshold = word_count_threshold
def get_score(self, time_str):
output_dir, results_folder = get_outdir(self.cfg, time_str)
model_combinations = list(product(self.base_models, self.compare_models))
unique_combinations = remove_duplicate_pairs([combo for combo in model_combinations if combo[0] != combo[1]])
if self.meta_judge_model is not None:
self.judge_models.append(self.meta_judge_model)
scores = {}
for idx, judge_model_cfg in enumerate(self.judge_models):
judge_model = model_abbr_from_cfg(judge_model_cfg)
for dataset in self.cfg['datasets']:
dataset_abbr = dataset_abbr_from_cfg(dataset)
for model_pair in unique_combinations:
model1 = model_pair[0]['abbr']
model2 = model_pair[1]['abbr']
if idx == len(self.judge_models):
subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
else:
subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
subdir_path = os.path.join(results_folder, subdir)
if not os.path.isdir(subdir_path):
print(subdir_path + ' is not exist! please check!')
continue
judged_answers, references, result_items = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
if self.check_pos_bias:
bias_num = check_position_bias(judged_answers, references)
else:
bias_num = 0
win_model1 = defaultdict(float)
win_model2 = defaultdict(float)
categories = defaultdict(float)
difficulties = defaultdict(float)
languages = defaultdict(float)
model1 = references[0]['answer1']
model2 = references[0]['answer2']
for prediction, reference, result_item in zip(judged_answers, references, result_items):
categories[dataset_abbr] += 1
categories[reference['category']] += 1
difficulties['Level-' + str(reference['level'])] += 1
languages['Lan-' + reference['lan']] += 1
if prediction == 'A':
if reference['answer1'] == model1:
score_1, score_2 = 1, 0
else:
score_1, score_2 = 0, 1
elif prediction == 'B':
if reference['answer1'] == model1:
score_1, score_2 = 0, 1
else:
score_1, score_2 = 1, 0
elif prediction == 'C':
if self.summary_type == 'half_add':
score_1, score_2 = 0.5, 0.5
else:
score_1, score_2 = 0, 0
# Apply a length-bias correction to the scores
if self.word_count_threshold is not None:
try:
if reference['lan'] == 'zh':
answer1 = re.search(r'\[回答1开始\](.*)\[回答1结束\]', result_item['origin_prompt'][0]['prompt'], re.DOTALL | re.MULTILINE).group(1).strip()
answer2 = re.search(r'\[回答2开始\](.*)\[回答2结束\]', result_item['origin_prompt'][0]['prompt'], re.DOTALL | re.MULTILINE).group(1).strip()
else:
answer1 = re.search(r'\[Response 1 Start\](.*)\[Response 1 End\]', result_item['origin_prompt'][0]['prompt'], re.DOTALL | re.MULTILINE).group(1).strip()
answer2 = re.search(r'\[Response 2 Start\](.*)\[Response 2 End\]', result_item['origin_prompt'][0]['prompt'], re.DOTALL | re.MULTILINE).group(1).strip()
word_count1 = count_chinese_characters(answer1) + count_english_words(answer1)
word_count2 = count_chinese_characters(answer2) + count_english_words(answer2)
if score_1 == 1 and score_2 == 0 and word_count1 - word_count2 > self.word_count_threshold:
score_1, score_2 = 0.5, 0.5
elif score_1 == 0 and score_2 == 1 and word_count2 - word_count1 > self.word_count_threshold:
score_1, score_2 = 0.5, 0.5
except Exception as e:
print(e)
from IPython import embed; embed(); exit()
win_model1[reference['category']] += score_1
win_model1[dataset_abbr] += score_1
win_model1['Level-' + str(reference['level'])] += score_1
win_model1['Lan-' + reference['lan']] += score_1
win_model2[reference['category']] += score_2
win_model2[dataset_abbr] += score_2
win_model2['Level-' + str(reference['level'])] += score_2
win_model2['Lan-' + reference['lan']] += score_2
for category in categories:
win_model1[category] = win_model1[category] / categories[category] * 100
win_model1[category] = round(win_model1[category], 2)
win_model2[category] = win_model2[category] / categories[category] * 100
win_model2[category] = round(win_model2[category], 2)
win_model1['position_bias'] = bias_num
win_model2['position_bias'] = bias_num
for difficulty in difficulties:
win_model1[difficulty] = win_model1[difficulty] / difficulties[difficulty] * 100
win_model2[difficulty] = win_model2[difficulty] / difficulties[difficulty] * 100
for language in languages:
win_model1[language] = win_model1[language] / languages[language] * 100
win_model2[language] = win_model2[language] / languages[language] * 100
if judge_model not in scores:
scores[judge_model] = {}
if dataset_abbr not in scores[judge_model]:
scores[judge_model][dataset_abbr] = {}
scores[judge_model][dataset_abbr][model2] = win_model2
return scores, difficulties, languages
def summarize(
self,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
):
"""Summarize the subjectivity analysis based on evaluation results.
Args:
time_str (str): Timestamp for file naming.
Returns:
pd.DataFrame: The summary results.
"""
scores, difficulties, languages = self.get_score(time_str)
# scores['win_' + model1] = win_model1
output_dir, results_folder = get_outdir(self.cfg, time_str)
all_judge_file_list = []
for idx, judge_model in enumerate(self.judge_models):
judge_abbr = model_abbr_from_cfg(judge_model)
for dataset in self.cfg['datasets']:
dataset_abbr = dataset_abbr_from_cfg(dataset)
summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
one_column = list(scores[judge_abbr][dataset_abbr].values())[0]
detail_headers = [i for i in one_column.keys() if i not in [dataset_abbr, 'position_bias'] and i not in difficulties and i not in languages]
row_headers = [dataset_abbr, 'position_bias']
for difficulty in difficulties:
row_headers += [difficulty]
for language in languages:
row_headers += [language]
row_headers += detail_headers
headers = [''] + summarizer_model_abbrs
table = []
for row_header in row_headers:
row = [row_header]
for model_cfg in self.compare_models:
model_abbr = model_abbr_from_cfg(model_cfg)
s = scores[judge_abbr][dataset_abbr][model_abbr].get(row_header, '')
if isinstance(s, float):
s = f'{s:.2f}'
if isinstance(s, int):
s = str(s)
row.append(s)
table.append(row)
txt = tabulate(table, headers=headers)
# print(txt)
if idx == len(self.judge_models):
output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
else:
output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
with open(output_filename, 'w') as f:
f.write(','.join(headers) + '\n')
for line in table:
f.write(','.join(line) + '\n')
print(output_filename)
# print(output_filename)
all_judge_file_list.append(output_filename)
dfs = [pd.read_csv(file) for file in all_judge_file_list]
average_df = copy.deepcopy(dfs[0])
for col in dfs[0].columns[1:]:
for i in range(1, len(dfs[0])):
average_df[col][i] = round(sum(df[col][i] for df in dfs) / len(dfs), 2)
average_csv_path = osp.join(output_dir, 'Averaged-' + dataset_abbr + '-report.csv')
average_df.to_csv(average_csv_path, index=False)
print(average_csv_path)

View File

@ -83,10 +83,13 @@ class FofoSummarizer:
scores[domain].append(score)
if format_type == 'general':
scores[format_name].append(score)
single_model_scores = {
task: sum(score) / len(score)
for task, score in scores.items()
}
if len(judged_answers) == 0:
single_model_scores = {}
else:
single_model_scores = {
task: sum(score) / len(score)
for task, score in scores.items()
}
if judge_model not in total_scores:
total_scores[judge_model] = {}
if dataset_abbr not in total_scores[judge_model]:
@ -107,11 +110,13 @@ class FofoSummarizer:
Returns:
pd.DataFrame: The summary results.
"""
all_scores = {}
scores = self.get_score(time_str)
print(scores)
output_dir, results_folder = get_outdir(self.cfg, time_str)
for idx, judge_model in enumerate(self.judge_models):
judge_abbr = model_abbr_from_cfg(judge_model)
score_by_judgemodel = {}
score_saver = {}
for dataset in self.cfg['datasets']:
dataset_abbr = dataset_abbr_from_cfg(dataset)
summarizer_model_abbrs = self.eval_model_abbrs
@ -119,7 +124,7 @@ class FofoSummarizer:
format_types = ['Json', 'CSV', 'XML', 'YAML', 'Markdown']
row_headers = [
i for i in one_column.keys()
if i not in [dataset_abbr] + format_types
if i not in [dataset_abbr] + format_types + ['overall']
]
row_headers = ['overall'] + format_types + row_headers
headers = [dataset_abbr] + summarizer_model_abbrs
@ -136,19 +141,24 @@ class FofoSummarizer:
row.append(s)
table.append(row)
txt = tabulate(table, headers=headers)
print(txt)
score_saver[dataset_abbr] = [s for s in table[0][1:]]
if idx == len(self.judge_models):
output_filename = osp.join(
output_dir, 'summarized-by--' + judge_abbr + '-' +
dataset_abbr + '-report.csv')
output_dir, dataset_abbr + '-summarized-by--' +
judge_abbr + '-report.csv')
else:
output_filename = osp.join(
output_dir, 'judged-by--' + judge_abbr + '-' +
dataset_abbr + '-report.csv')
output_dir, dataset_abbr + '-judged-by--' +
judge_abbr + '-report.csv')
with open(output_filename, 'w') as f:
f.write(','.join(headers) + '\n')
for line in table:
f.write(','.join(line) + '\n')
print(output_filename)
for idx, model in enumerate(summarizer_model_abbrs):
score_by_judgemodel[model] = {}
for subset_name, subset_scores in score_saver.items():
score_by_judgemodel[model][subset_name] = subset_scores[
idx]
all_scores[judge_abbr] = score_by_judgemodel
return {'Fofo': all_scores}
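
The new length check above makes the empty-judgement case explicit: a subset for which no judgement could be parsed ends up with an empty score dict instead of spurious numbers. A small sketch of the guarded per-task averaging with invented task names:

from collections import defaultdict

def average_per_task(scores: dict) -> dict:
    """Average the collected scores per task; return {} when nothing was judged."""
    if not scores:
        return {}
    return {task: sum(vals) / len(vals) for task, vals in scores.items()}

scores = defaultdict(list)
for task, score in [('Json', 1), ('Json', 0), ('overall', 1), ('overall', 0)]:
    scores[task].append(score)

print(average_per_task(scores))   # {'Json': 0.5, 'overall': 0.5}
print(average_per_task({}))       # {} when no judgement could be extracted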

View File

@ -1,138 +0,0 @@
# flake8: noqa: E501
import csv
import os
import os.path as osp
import re
from collections import defaultdict
from datetime import datetime
import numpy as np
from mmengine import ConfigDict
try:
from prettytable import from_csv
except ImportError:
from_csv = None
from opencompass.utils import model_abbr_from_cfg
from .subjective_post_process import post_process_autoj
from .utils import get_judgeanswer_and_reference, get_outdir
def post_process_ir(judgement: str):
"""Input a string like below:
Conclusion: [[Correct]]\nReasoning: xxx
and extract the score
"""
matches = re.findall(r'\[\[(.*?)\]\]', judgement)
if matches:
matches = matches[0]
if matches in ['Correct', 'Wrong', '对', '错']:
if matches == 'Correct' or matches == '对':
return {'score': 1}
else:
return {'score': 0}
else:
return None
else:
return None
def get_results(
judged_answers,
references,
fout,
fout_flag,
model,
):
capability_ratings = defaultdict(int)
capability_counts = defaultdict(int)
for ans, ref in zip(judged_answers, references):
lan = ref['others']['lan']
capability_ratings['total'] += ans['score']
capability_counts['total'] += 1
capability_ratings[lan] += ans['score']
capability_counts[lan] += 1
capability_avg_ratings = defaultdict(float)
for capability, total_score in capability_ratings.items():
capability_avg_ratings[
capability] = total_score / capability_counts[capability]
scores = {model: capability_avg_ratings}
with open(fout, 'a+', newline='') as csvfile:
writer = csv.writer(csvfile)
if fout_flag == 0:
num_header = [str(i) for i in range(4)]
writer.writerow(num_header)
header = ['模型']
for category in capability_avg_ratings:
header.append(category)
writer.writerow(header)
row = [model]
for category in capability_avg_ratings:
row.append(scores[model][category])
writer.writerow(row)
class IRSummarizer:
"""Do the subjectivity analyze based on evaluation results.
Args:
config (ConfigDict): The configuration object of the evaluation task.
It's expected to be filled out at runtime.
"""
def __init__(self, config: ConfigDict, judge_type='autoj') -> None:
self.tasks = []
self.cfg = config
self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
self.eval_model_abbrs = [
model_abbr_from_cfg(model) for model in self.eval_model_cfgs
]
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
self.judge_type = judge_type
assert self.judge_type in ['general', 'autoj']
self.judge_map = {
'general': post_process_ir,
'autoj': post_process_autoj,
}
self.judge_function = self.judge_map[self.judge_type]
def summarize(self,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
"""Summarize the subjectivity analysis based on evaluation results.
Args:
time_str (str): Timestamp for file naming.
Returns:
pd.DataFrame: The summary results.
"""
dataset_cfgs = self.cfg['datasets']
output_dir, results_folder = get_outdir(self.cfg, time_str)
fout_flag = 0
for eval_model_abbr in self.eval_model_abbrs:
subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
subdir_path = os.path.join(results_folder, subdir)
if os.path.isdir(subdir_path):
model, judge_model = eval_model_abbr, self.judge_abbr
fout = osp.join(output_dir,
'judged-by--' + judge_model + '.csv')
for dataset in dataset_cfgs:
judged_answers, references = get_judgeanswer_and_reference(
dataset, subdir_path, self.judge_function)
get_results(judged_answers, references, fout, fout_flag,
model)
fout_flag += 1
else:
print(subdir_path + ' is not exist! please check!')
with open(fout, 'r') as f:
x = from_csv(f)
print(x)

View File

@ -16,6 +16,7 @@ from opencompass.utils import model_abbr_from_cfg
from .compass_arena import CompassArenaSummarizer
from .utils import get_judgeanswer_and_reference, get_outdir
COLUMNS = ['total', 'writing', 'roleplay', 'reasoning', 'math', 'coding', 'extraction', 'stem', 'humanities']
def model_abbr_from_cfg_used_in_summarizer(model):
if model.get('summarizer_abbr', None):
@ -57,22 +58,24 @@ def get_capability_results(
fout_flag,
model_abbr,
):
columns = COLUMNS
capability_ratings = defaultdict(int)
capability_counts = defaultdict(int)
for ans, ref in zip(judged_answers, references):
capability_ratings['total'] += ans['score']
capability_counts['total'] += 1
capability_ratings[ref['capability']] += ans['score']
capability_counts[ref['capability']] += 1
capability_avg_ratings = defaultdict(float)
if len(judged_answers) == 0:
for column in columns:
capability_avg_ratings[column] = ''
else:
for ans, ref in zip(judged_answers, references):
capability_ratings['total'] += ans['score']
capability_counts['total'] += 1
capability_ratings[ref['capability']] += ans['score']
capability_counts[ref['capability']] += 1
for capability, total_score in capability_ratings.items():
s = total_score / capability_counts[capability]
s = round(s, 2)
capability_avg_ratings[capability] = s
columns = list(capability_avg_ratings.keys())
columns.insert(0, columns.pop(columns.index('total')))
for capability, total_score in capability_ratings.items():
s = total_score / capability_counts[capability]
s = round(s, 2)
capability_avg_ratings[capability] = s
with open(fout, 'a+', newline='') as csvfile:
writer = csv.writer(csvfile)
@ -98,7 +101,7 @@ class MTBenchSummarizer(CompassArenaSummarizer):
elif self.judge_type == 'pair':
self.base_models = self.cfg['eval']['partitioner']['base_models']
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
self.judge_models = self.cfg.get('judge_models', None)
self.judge_map = {
'single': post_process_mtbench_single,
'pair': post_process_mtbench_pair
@ -120,34 +123,34 @@ class MTBenchSummarizer(CompassArenaSummarizer):
# self.judge_type == 'single'
dataset_cfgs = self.cfg['datasets']
output_dir, results_folder = get_outdir(self.cfg, time_str)
fout_flag = 0
for eval_model_cfg in self.eval_model_cfgs:
eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + self.judge_abbr)
if os.path.isdir(subdir_path):
fout = osp.join(output_dir, 'judged-by--' + self.judge_abbr + '-capability.csv')
overall_judged_answers, overall_references = [], []
for dataset in dataset_cfgs:
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
overall_judged_answers += judged_answers
overall_references += references
get_capability_results(overall_judged_answers, overall_references, fout, fout_flag, show_model_abbr)
fout_flag += 1
else:
print(subdir_path + ' is not exist! please check!')
with open(fout, 'r') as f:
csv_reader = csv.reader(f)
header = next(csv_reader)
table = [line for line in csv_reader]
all_scores = {}
for judge_model in self.judge_models:
fout_flag = 0
score_by_judgemodel = {}
judge_abbr = model_abbr_from_cfg(judge_model)
for eval_model_cfg in self.eval_model_cfgs:
eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + judge_abbr)
if os.path.isdir(subdir_path):
fout = osp.join(output_dir, 'MTBench-judged-by--' + judge_abbr + '-capability.csv')
overall_judged_answers, overall_references = [], []
for dataset in dataset_cfgs:
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
overall_judged_answers += judged_answers
overall_references += references
get_capability_results(overall_judged_answers, overall_references, fout, fout_flag, show_model_abbr)
fout_flag += 1
else:
print(subdir_path + ' is not exist! please check!')
with open(fout, 'r') as f:
csv_reader = csv.reader(f)
header = next(csv_reader)
table = [line for line in csv_reader]
new_header = [''] + [line[0] for line in table]
new_table = [[h] + line[1:] for h, line in zip(header[1:], table)]
new_table = [[h] + [line[i] for line in table] for i, h in enumerate(header[1:], start=1)]
t = tabulate(new_table, headers=new_header)
with open(fout, 'w') as f:
f.write(','.join(new_header) + '\n')
for line in new_table:
f.write(','.join(map(str, line)) + '\n')
print(t)
print(fout)
for model_score in table:
score_by_judgemodel[model_score[0]] = {}
for idx, column in enumerate(COLUMNS):
score_by_judgemodel[model_score[0]][column] = model_score[idx+1]
all_scores[judge_abbr] = score_by_judgemodel
return {'MTbench': all_scores}
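
The rewritten new_table comprehension transposes the capability CSV so that each capability becomes a row and each evaluated model a column, which is the layout the per-model score dict below it reads from. A tiny example of that transposition with made-up model and capability names:

header = ['model', 'total', 'writing', 'reasoning']
table = [
    ['model-a', '8.10', '7.90', '8.30'],
    ['model-b', '7.40', '7.20', '7.60'],
]

new_header = [''] + [line[0] for line in table]          # ['', 'model-a', 'model-b']
new_table = [[h] + [line[i] for line in table]
             for i, h in enumerate(header[1:], start=1)]

print(new_table)
# [['total', '8.10', '7.40'], ['writing', '7.90', '7.20'], ['reasoning', '8.30', '7.60']]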

View File

@ -50,8 +50,8 @@ def post_process_mtbench101(judgement: str):
return {'score': score, 'judgement': judgement}
def get_final_results(judged_answers, references, output_dir, fout_flag,
model):
def get_final_results(judged_answers, references, output_dir, fout_flag, model,
judgemodel):
task_multi_id_scores = defaultdict(list)
task_scores = defaultdict(list)
@ -72,22 +72,21 @@ def get_final_results(judged_answers, references, output_dir, fout_flag,
task: sum(scores) / len(scores) if scores else 0
for task, scores in task_scores.items()
}
fout = osp.join(output_dir, 'task_score.csv')
average_score = round(
sum(final_task_scores.values()) / len(final_task_scores), 2)
fout = osp.join(output_dir,
'MTBench101-task_score-judged-by--' + judgemodel + '.csv')
columns = list(final_task_scores.keys())
print('================task_score=====================')
print(final_task_scores)
with open(fout, 'a+', newline='') as csvfile:
writer = csv.writer(csvfile)
if fout_flag == 0:
writer.writerow(['model'] + columns)
writer.writerow([model] +
writer.writerow(['model', 'average'] + columns)
writer.writerow([model, average_score] +
[final_task_scores[column] for column in columns])
return 0
return average_score
class MTBench101Summarizer(CompassArenaSummarizer):
@ -107,7 +106,7 @@ class MTBench101Summarizer(CompassArenaSummarizer):
self.eval_model_abbrs = [
model_abbr_from_cfg(model) for model in self.eval_model_cfgs
]
self.judge_models = self.cfg.get('judge_models', None)
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
self.judge_function = post_process_mtbench101
@ -122,21 +121,27 @@ class MTBench101Summarizer(CompassArenaSummarizer):
Returns:
pd.DataFrame: The summary results.
"""
dataset_cfgs = self.cfg['datasets']
dataset = self.cfg['datasets'][0] # MTBench101 has just one subfile
output_dir, results_folder = get_outdir(self.cfg, time_str)
fout_flag = 0
for eval_model_abbr in self.eval_model_abbrs:
subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
subdir_path = os.path.join(results_folder, subdir)
if os.path.isdir(subdir_path):
model, judge_model = eval_model_abbr, self.judge_abbr
for dataset in dataset_cfgs:
print()
all_scores = {}
for judge_model in self.judge_models:
fout_flag = 0
score_by_judgemodel = {}
judge_abbr = model_abbr_from_cfg(judge_model)
for eval_model_abbr in self.eval_model_abbrs:
subdir = eval_model_abbr + '_judged-by--' + judge_abbr
subdir_path = os.path.join(results_folder, subdir)
if os.path.isdir(subdir_path):
judged_answers, references = get_judgeanswer_and_reference(
dataset, subdir_path, self.judge_function)
get_final_results(judged_answers, references, output_dir,
fout_flag, model)
model_average_score = get_final_results(
judged_answers, references, output_dir, fout_flag,
eval_model_abbr, judge_abbr)
fout_flag += 1
else:
print(subdir_path + ' is not exist! please check!')
score_by_judgemodel[eval_model_abbr] = {
'average': model_average_score
}
else:
print(subdir_path + ' is not exist! please check!')
all_scores[judge_abbr] = score_by_judgemodel
return {'MTBench101': all_scores}
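
The 'average' column added to the MTBench101 task-score CSV is the unweighted mean of the per-task scores, rounded to two decimals, and the same number is stored per model in score_by_judgemodel. A worked micro-example with invented task scores:

final_task_scores = {'CM': 8.2, 'SI': 7.9, 'AR': 8.6}

average_score = round(sum(final_task_scores.values()) / len(final_task_scores), 2)
print(average_score)                      # 8.23
print(['some-model', average_score] +     # CSV row: model, average, then the per-task scores
      [final_task_scores[c] for c in final_task_scores])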

View File

@ -0,0 +1,105 @@
# flake8: noqa: E501
import os.path as osp
from datetime import datetime
import pandas as pd
from mmengine import ConfigDict
from .utils import get_outdir
# Flatten the nested structure and ensure consistent order of models across datasets
def flatten_data(data):
flat_data = {}
models_order = set()
for dataset in data:
for dataset_name, judgemodel_scores in dataset.items():
for judgemodel_name, model_scores in judgemodel_scores.items():
if judgemodel_name not in flat_data:
flat_data[judgemodel_name] = {}
if dataset_name not in flat_data[judgemodel_name]:
flat_data[judgemodel_name][dataset_name] = {}
for model_name, scores in model_scores.items():
models_order.add(model_name)
if scores is not None:
for score_name, score_value in scores.items():
flat_data[
judgemodel_name][dataset_name].setdefault(
score_name,
{}).setdefault(model_name, score_value)
else:
for score_name in flat_data[judgemodel_name][
dataset_name]:
flat_data[judgemodel_name][dataset_name][
score_name].setdefault(model_name, None)
# Ensure consistent order of models
consistent_models_order = sorted(list(models_order))
for judgemodel_name in flat_data:
for dataset_name in flat_data[judgemodel_name]:
for score_name in flat_data[judgemodel_name][dataset_name]:
for model_name in consistent_models_order:
flat_data[judgemodel_name][dataset_name][
score_name].setdefault(model_name, None)
return flat_data, consistent_models_order
class SubjectiveSummarizer:
"""Do the subjectivity analyze based on evaluation results.
Args:
config (ConfigDict): The configuration object of the evaluation task.
It's expected to be filled out at runtime.
"""
def __init__(self, config: ConfigDict, function: str) -> None:
self.cfg = config
self.function = function
def summarize(
self,
subjective_scores: list,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
):
"""Summarize the subjectivity analysis based on evaluation results.
Args:
subjective_scores (list of dicts): Container holding the score information for each dataset and model.
time_str (str): Timestamp for file naming.
Returns:
None
"""
output_dir, results_folder = get_outdir(self.cfg, time_str)
flat_data, models_order = flatten_data(subjective_scores)
# Create a DataFrame for each judgemodel with models as rows and datasets as columns
judgemodel_dfs_final_corrected = {}
for judgemodel_name, datasets_scores in flat_data.items():
dfs = {} # Dictionary to hold DataFrames for each dataset
for dataset_name, scores in datasets_scores.items():
# Create a DataFrame with models as index and datasets as columns
df = pd.DataFrame.from_dict(scores,
orient='index',
columns=models_order)
# Insert a new row at the top for the dataset names
df.insert(0, 'Detailed Scores', list(scores.keys()))
df.insert(0, 'Dataset',
[dataset_name for _ in range(len(df.index))])
dfs[dataset_name] = df
# Concatenate all DataFrames for the current judgemodel
judgemodel_df = pd.concat(dfs.values(), ignore_index=True)
judgemodel_dfs_final_corrected[judgemodel_name] = judgemodel_df
# Save each DataFrame to a separate CSV file
for judgemodel_name, df in judgemodel_dfs_final_corrected.items():
fout = osp.join(
output_dir, 'Subjective_all_results-judged-by--' +
judgemodel_name + '.csv')
print('Your subjective evaluation results have been saved at ' +
str(fout))
df.to_csv(fout, index=False)
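
To make the overall data flow concrete: each dataset summarizer now returns a dict shaped like {'BenchName': {judge_abbr: {model_abbr: {score_name: value}}}}, and SubjectiveSummarizer flattens a list of such dicts into one CSV per judge model. A hedged usage sketch of flatten_data with entirely invented benchmark, judge and model names; it assumes the flatten_data helper defined above is available in the same module or session:

subjective_scores = [
    {'AlpacaEval': {'judge-x': {'model-a': {'total': 40.3}, 'model-b': None}}},
    {'Fofo':       {'judge-x': {'model-a': {'overall': 0.71}, 'model-b': {'overall': 0.65}}}},
]

flat_data, models_order = flatten_data(subjective_scores)
print(models_order)                        # ['model-a', 'model-b']
print(flat_data['judge-x']['AlpacaEval'])
# {'total': {'model-a': 40.3, 'model-b': None}}  -> models with no result stay as None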

View File

@ -55,7 +55,6 @@ def get_judgeanswer_and_reference(dataset, subdir_path, post_process):
print('There are no results for ' + filename + ' or ' +
partial_filename)
print('*' * 100)
assert len(result) > 0
judged_answers = []
references = []
@ -67,15 +66,10 @@ def get_judgeanswer_and_reference(dataset, subdir_path, post_process):
# else:
# print(v['prediction'])
# print('-' * 128)
if len(judged_answers) != len(result):
print(
f'Among {len(result)} judgements, successfully extracted {len(judged_answers)} judgements, please check!'
)
if len(judged_answers) == 0:
if len(judged_answers) <= 0.95 * len(result):
print('*' * 100)
print(
'There are no extracted judgements, please change your judge model or check your prompt!!!'
f'For {filename}: among {len(result)} judgements, only {len(judged_answers)} were successfully extracted, please check!'
)
print('*' * 100)
assert len(judged_answers) > 0
return judged_answers, references
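
With the hard assertion removed, a judge whose output occasionally fails to parse no longer aborts the whole summary; the helper now only warns when fewer than roughly 95% of the judgements could be extracted. A minimal sketch of a post_process function in this style together with that check (the [[n]] verdict format and the threshold are illustrative):

import re

def post_process_score(judgement: str):
    """Return {'score': int} if a [[n]] verdict can be found, else None."""
    match = re.search(r'\[\[(\d+)\]\]', judgement)
    return {'score': int(match.group(1))} if match else None

predictions = ['Good answer. [[8]]', 'No verdict given.', '[[6]]', '[[9]]']
judged = [p for p in map(post_process_score, predictions) if p is not None]

if len(judged) <= 0.95 * len(predictions):
    # warn instead of asserting, so one unparsable judgement does not stop the summary
    print(f'Only {len(judged)}/{len(predictions)} judgements could be extracted, please check!')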

View File

@ -56,7 +56,7 @@ class SubjectiveEvalTask(BaseTask):
self.judge_cfg = copy.deepcopy(judge_cfg)
self.judge_models = judge_models
self.infer_order = cfg.get('infer_order')
self.given_pred = cfg.eval.get('given_pred', [])
self.given_pred = cfg['datasets'][0][0].get('given_pred', [])
def get_command(self, cfg_path, template):
"""Get the command template for the task.