diff --git a/configs/datasets/subjective/alpaca_eval/alpacav1_judgeby_gpt4.py b/configs/datasets/subjective/alpaca_eval/alpacav1_judgeby_gpt4.py index b5f89cd8..1c956f0c 100644 --- a/configs/datasets/subjective/alpaca_eval/alpacav1_judgeby_gpt4.py +++ b/configs/datasets/subjective/alpaca_eval/alpacav1_judgeby_gpt4.py @@ -65,7 +65,6 @@ for _name in subjective_all_sets: subjective_eval_cfg = dict( evaluator=dict( type=LMEvaluator, - infer_order='random', prompt_template=dict( type=PromptTemplate, template=dict( diff --git a/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py b/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py index b45b7622..d67de9a4 100644 --- a/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py +++ b/configs/datasets/subjective/alpaca_eval/alpacav2_judgeby_gpt4.py @@ -67,7 +67,6 @@ for _name in subjective_all_sets: subjective_eval_cfg = dict( evaluator=dict( type=LMEvaluator, - infer_order='random', prompt_template=dict( type=PromptTemplate, template=dict( diff --git a/configs/datasets/subjective/compassarena/compassarena_compare.py b/configs/datasets/subjective/compassarena/compassarena_compare.py index b06659b3..59464dbc 100644 --- a/configs/datasets/subjective/compassarena/compassarena_compare.py +++ b/configs/datasets/subjective/compassarena/compassarena_compare.py @@ -119,7 +119,6 @@ for _name, _prompt in sub_map.items(): subjective_eval_cfg = dict( evaluator=dict( type=LMEvaluator, - infer_order='double', prompt_template=dict( type=PromptTemplate, template=dict(round=[ diff --git a/configs/datasets/subjective/compassarena/compassarena_compare_moe.py b/configs/datasets/subjective/compassarena/compassarena_compare_moe.py new file mode 100644 index 00000000..bd195f5b --- /dev/null +++ b/configs/datasets/subjective/compassarena/compassarena_compare_moe.py @@ -0,0 +1,156 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import LMEvaluator +from opencompass.datasets import CompassArenaDataset + +subjective_reader_cfg = dict( + input_columns=['question', 'ref'], + output_column='judge', + ) + +data_path ="data/subjective/compass_arena" + +subjective_datasets = [] + +base_prompt = """ + +[回答1开始] +{prediction} +[回答1结束] + +[回答2开始] +{prediction2} +[回答2结束] + +根据评分要求,在以下 3 个选项中做出选择: +A. 回答1更好 +B. 回答2更好 +C. 回答1、2平局 +并提供你的解释原因。 + +如果你认为回答1更好,你的输出应形如: +选择:A +原因:blahblah blahblah\n + +如果你认为回答2更好,你的输出应形如: +选择:B +原因:blahblah blahblah\n + +如果你认为回答1、2打成平手,你的输出应形如: +选择:C +原因:blahblah blahblah\n +""" + +knowledge_prompt = """ +请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。 +评分要求(重要性依次递减): +1. 更好的回答能与参考答案吻合或表明参考答案的意思。 +2. 在都准确答对问题的前提下,更好的回答能对知识点进行额外补充,且补充的知识准确无误。 +3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。 + +[用户问题] +{question} + +[参考答案] +{ref} +""" + base_prompt + + +language_prompt = """ +请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。 +评分要求(重要性依次递减): +1. 在有明确的参考答案的情况下,越贴近参考答案或表明了参考答案的意思的回答越好。 +2. 更好的回答在语言表达上更流畅,更加符合与人类对话的习惯,包括语气、情调等 +3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误。 + +[用户问题] +{question} + +[参考答案] +{ref} +""" + base_prompt + + +math_prompt = """ +请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。 +评分要求(重要性依次递减): +1. 更好的回答的答案能和参考答案一致。 +2. 若两个回答的答案都与参考答案不一致,则更好的回答的推理过程应更加合理。 +3. 
更好的回答更加符合与人类对话的习惯,包括语气、情调等。 + +[用户问题] +{question} + +[参考答案] +{ref} +""" + base_prompt + +reason_prompt = math_prompt + +creation_prompt = """ +请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。 +评分要求(重要性依次递减): +1. 好的回答必须首先符合用户问题里的各种需求,不能跑题 +2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答 +3. 好的回答必须具有创造性的词语和表达丰富度 + +[用户问题] +{question} +""" + base_prompt + +sub_map = {"knowledge": knowledge_prompt, "language": language_prompt, "math_v2": math_prompt, "reason_v2": reason_prompt, "creationv2_zh": creation_prompt} + +meta_prompt = """ +\n你是一个评判专家,请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。\n评分要求(重要性依次递减):\n1. 好的回答必须首先符合用户问题里的各种需求,不能跑题 \n2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答\n3. 好的回答必须具有创造性的词语和表达丰富度\n\n[用户问题]\n{question}\n[回答1开始]\n{prediction}\n[回答1结束]\n[回答2开始]\n{prediction2}\n[回答2结束]\n此外,还有两个其他评判专家的评判意见供你参考。\n[评判意见1]\n{judgement}\n[评判意见2]\n{judgement2}\n\n最终请你综合其他评判专家的评判意见与你自己的意见,在以下 3 个选项中做出选择:\nA. 回答1更好\nB. 回答2更好\nC. 回答1、2平局\n并提供你的解释原因。\n\n如果你认为回答1更好,你的输出应形如:\n选择:A\n原因:blahblah blahblah\n\n\n如果你认为回答2更好,你的输出应形如:\n选择:B\n原因:blahblah blahblah\n\n\n如果你认为回答1、2打成平手,你的输出应形如:\n选择:C\n原因:blahblah blahblah\n\n +""" +for _name, _prompt in sub_map.items(): + subjective_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt="{question}" + ), + ]), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048), + ) + + subjective_eval_cfg = dict( + evaluator=dict( + type=LMEvaluator, + prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt = _prompt + ), + ]), + ), + meta_review_prompt_template=dict( + type=PromptTemplate, + template=dict(round=[ + dict( + role='HUMAN', + prompt = meta_prompt + ), + ]), + ), + ), + pred_role="BOT", + ) + + subjective_datasets.append( + dict( + abbr=f"{_name}", + type=CompassArenaDataset, + path=data_path, + name=_name, + reader_cfg=subjective_reader_cfg, + infer_cfg=subjective_infer_cfg, + eval_cfg=subjective_eval_cfg + )) diff --git a/configs/datasets/subjective/information_retrival/ir_judgedby_autoj.py b/configs/datasets/subjective/information_retrival/ir_judgedby_autoj.py deleted file mode 100644 index 78885d72..00000000 --- a/configs/datasets/subjective/information_retrival/ir_judgedby_autoj.py +++ /dev/null @@ -1,71 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import LMEvaluator -from opencompass.datasets import IRDataset - -subjective_reader_cfg = dict( - input_columns=['question', 'capability', 'ref'], - output_column='judge', - ) - -subjective_all_sets = [ - "information_retrieval", -] -data_path ="data/subjective/" - -subjective_datasets = [] - -for _name in subjective_all_sets: - subjective_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict(round=[ - dict( - role='HUMAN', - prompt="{question}" - ), - ]), - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=512), - ) - - subjective_eval_cfg = dict( - evaluator=dict( - type=LMEvaluator, - prompt_template=dict( - type=PromptTemplate, - template=dict(round=[ - dict( - role='HUMAN', - prompt = """为上传的针对给定用户问题的回应撰写评论, 并为该回复打分: - -[BEGIN DATA] -*** -[用户问询]: {question} -*** -[回应]: {prediction} -*** -[参考答案]: {ref} -*** -[END DATA] - -请根据参考答案为这个回应撰写评论. 
在这之后, 你应该按照如下格式给这个回应一个最终的1-10范围的评分: "[[评分]]", 例如: "评分: [[5]]".""" - ), - ]), - ), - ), - pred_role="BOT", - ) - - subjective_datasets.append( - dict( - abbr=f"{_name}", - type=IRDataset, - path=data_path, - name=_name, - reader_cfg=subjective_reader_cfg, - infer_cfg=subjective_infer_cfg, - eval_cfg=subjective_eval_cfg - )) \ No newline at end of file diff --git a/configs/datasets/subjective/information_retrival/ir_judgedby_gpt4.py b/configs/datasets/subjective/information_retrival/ir_judgedby_gpt4.py deleted file mode 100644 index 45351799..00000000 --- a/configs/datasets/subjective/information_retrival/ir_judgedby_gpt4.py +++ /dev/null @@ -1,59 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import LMEvaluator -from opencompass.datasets import IRDataset - -subjective_reader_cfg = dict( - input_columns=['question', 'capability', 'gpt4_prefix', 'gpt4_suffix', 'ref'], - output_column='judge', - ) - -subjective_all_sets = [ - "information_retrieval", -] -data_path ="data/subjective/" - -subjective_datasets = [] - -for _name in subjective_all_sets: - subjective_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict(round=[ - dict( - role='HUMAN', - prompt="{question}" - ), - ]), - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=512), - ) - - subjective_eval_cfg = dict( - evaluator=dict( - type=LMEvaluator, - prompt_template=dict( - type=PromptTemplate, - template=dict(round=[ - dict( - role='HUMAN', - prompt = "{gpt4_prefix}{prediction}{gpt4_suffix}" - ), - ]), - ), - ), - pred_role="BOT", - ) - - subjective_datasets.append( - dict( - abbr=f"{_name}", - type=IRDataset, - path=data_path, - name=_name, - reader_cfg=subjective_reader_cfg, - infer_cfg=subjective_infer_cfg, - eval_cfg=subjective_eval_cfg - )) diff --git a/configs/eval_subjective_alignbench.py b/configs/eval_subjective_alignbench.py index 0563ff87..e4a10df1 100644 --- a/configs/eval_subjective_alignbench.py +++ b/configs/eval_subjective_alignbench.py @@ -44,7 +44,7 @@ models = [ meta_template=api_meta_template, max_out_len=2048, max_seq_len=4096, - batch_size=1, + batch_size=8, run_cfg=dict(num_gpus=1, num_procs=1), ) ] @@ -54,7 +54,7 @@ datasets = [*subjective_datasets] # -------------Evalation Stage ---------------------------------------- ## ------------- JudgeLLM Configuration -judge_model = dict( +judge_models = [dict( abbr='GPT4-Turbo', type=OpenAI, path='gpt-4-1106-preview', @@ -65,18 +65,14 @@ judge_model = dict( max_seq_len=2048, batch_size=8, temperature=0, -) +)] ## ------------- Evaluation Configuration eval = dict( partitioner=dict( - type=SubjectiveNaivePartitioner, mode='singlescore', models=models - ), - runner=dict( - type=LocalRunner, - max_num_workers=2, - task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model), + type=SubjectiveSizePartitioner, max_task_size=1000, mode='singlescore', models=models, judge_models=judge_models, ), + runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)), ) summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general') diff --git a/configs/eval_subjective_alpacaeval_oc.py b/configs/eval_subjective_alpacaeval_oc.py index 13d1971b..7220ed20 100644 --- a/configs/eval_subjective_alpacaeval_oc.py +++ b/configs/eval_subjective_alpacaeval_oc.py @@ -47,7 +47,7 @@ models = [ 
meta_template=api_meta_template, max_out_len=2048, max_seq_len=4096, - batch_size=1, + batch_size=8, run_cfg=dict(num_gpus=1, num_procs=1), ) ] @@ -73,7 +73,7 @@ gpt4 = dict( # -------------Evalation Stage ---------------------------------------- ## ------------- JudgeLLM Configuration -judge_model = dict( +judge_models = [dict( abbr='GPT4-Turbo', type=OpenAI, path='gpt-4-1106-preview', @@ -85,21 +85,20 @@ judge_model = dict( batch_size=2, retry=20, temperature=0, -) +)] ## ------------- Evaluation Configuration eval = dict( partitioner=dict( - type=SubjectiveSizePartitioner, max_task_size=1000, mode='m2n', base_models=[gpt4], compare_models=models - ), - runner=dict( - type=SlurmSequentialRunner, - partition='llmeval', - quotatype='auto', - max_num_workers=256, - task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model), + type=SubjectiveSizePartitioner, max_task_size=1000, mode='m2n', base_models=[gpt4], compare_models=models, + infer_order='random', + judge_models=judge_models ), + runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)), + given_pred = [{'abbr':'gpt4-turbo', 'path':''}] ) work_dir = 'outputs/alpaca/' + + summarizer = dict(type=AlpacaSummarizer, judge_type='v2') \ No newline at end of file diff --git a/configs/eval_subjective_compassarena.py b/configs/eval_subjective_compassarena.py index 5e1f93ee..e3d12218 100644 --- a/configs/eval_subjective_compassarena.py +++ b/configs/eval_subjective_compassarena.py @@ -72,7 +72,7 @@ gpt4 = dict( # -------------Evalation Stage ---------------------------------------- ## ------------- JudgeLLM Configuration -judge_model = dict( +judge_models = [dict( abbr='GPT4-Turbo', type=OpenAI, path='gpt-4-1106-preview', @@ -84,7 +84,7 @@ judge_model = dict( batch_size=2, retry=20, temperature=0, -) +)] ## ------------- Evaluation Configuration eval = dict( @@ -93,16 +93,13 @@ eval = dict( strategy='split', max_task_size=10000, mode='m2n', + infer_order='double', base_models=[gpt4], compare_models=models, + judge_models=judge_models, ), - runner=dict( - type=SlurmSequentialRunner, - partition='llm_dev2', - quotatype='auto', - max_num_workers=32, - task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model), - ), + runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)), + given_pred = [{'abbr':'gpt4-turbo', 'path':''}] ) work_dir = 'outputs/compass_arena_debug/' diff --git a/configs/eval_subjective_judge_pandalm.py b/configs/eval_subjective_judge_pandalm.py index 2fbe7e91..64e55ae1 100644 --- a/configs/eval_subjective_judge_pandalm.py +++ b/configs/eval_subjective_judge_pandalm.py @@ -63,7 +63,7 @@ infer = dict( # -------------Evalation Stage ---------------------------------------- ## ------------- JudgeLLM Configuration -judge_model = dict( +judge_models = [dict( type=HuggingFaceCausalLM, abbr='pandalm-7b-v1-hf', path='WeOpenML/PandaLM-7B-v1', @@ -79,12 +79,12 @@ judge_model = dict( batch_size=8, model_kwargs=dict(device_map='auto', trust_remote_code=True), run_cfg=dict(num_gpus=1, num_procs=1), -) +)] ## ------------- Evaluation Configuration eval = dict( - partitioner=dict(type=SubjectiveNaivePartitioner, mode='singlescore', models=models), - runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)), + partitioner=dict(type=SubjectiveNaivePartitioner, mode='singlescore', models=models, judge_models=judge_models), + runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)), ) summarizer = 
dict(type=AlignmentBenchSummarizer) diff --git a/configs/eval_subjective_mtbench.py b/configs/eval_subjective_mtbench.py index c8dbb23c..59335cfb 100644 --- a/configs/eval_subjective_mtbench.py +++ b/configs/eval_subjective_mtbench.py @@ -2,7 +2,6 @@ from mmengine.config import read_base with read_base(): from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets - # from .datasets.subjective.multiround.mtbench_pair_judge import subjective_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI from opencompass.models.openai_api import OpenAIAllesAPIN @@ -62,7 +61,7 @@ datasets = [*subjective_datasets] # -------------Evalation Stage ---------------------------------------- ## ------------- JudgeLLM Configuration -judge_model = dict( +judge_models = [dict( abbr='GPT4-Turbo', type=OpenAI, path='gpt-4-0613', # To compare with the official leaderboard, please use gpt4-0613 @@ -73,23 +72,12 @@ judge_model = dict( max_seq_len=2048, batch_size=8, temperature=0, -) -## ------------- Evaluation Configuration -# ## pair evaluation -# eval = dict( -# partitioner=dict( -# type=SubjectiveSizePartitioner, max_task_size=100, mode='m2n', base_models=[gpt4], compare_models=models -# ), -# runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)), -# ) - -# summarizer = dict(type=MTBenchSummarizer, judge_type='pair') - +)] ## single evaluation eval = dict( - partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models), - runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask, judge_cfg=judge_model)), + partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models, judge_models=judge_models), + runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)), ) summarizer = dict(type=MTBenchSummarizer, judge_type='single') diff --git a/opencompass/openicl/icl_evaluator/lm_evaluator.py b/opencompass/openicl/icl_evaluator/lm_evaluator.py index 699648d2..05477692 100644 --- a/opencompass/openicl/icl_evaluator/lm_evaluator.py +++ b/opencompass/openicl/icl_evaluator/lm_evaluator.py @@ -27,10 +27,12 @@ def extract_dicts(data): return predictions -def order_preds_and_record_references(predictions, - references, - infer_order, - seed=2680): +def order_preds_and_record_references( + predictions, + references, + infer_order, + seed=666, +): """Order predictions based on args and recording regrading references. 
Args: @@ -85,17 +87,19 @@ class LMEvaluator: prompt_template: ConfigDict, judge_cfg: ConfigDict, output_path: str, - infer_order: Optional[str] = 'random', + meta_review_prompt_template: Optional[ConfigDict] = None, dataset_cfg: Optional[ConfigDict] = None, postprocessor: ConfigDict = dict(type=first_number_postprocess) ) -> None: - assert infer_order in ['random', 'double'] self.output_path = output_path out_dir, out_name = osp.split(output_path) if not out_dir: out_dir = './' self.prompt_tmpl = ICL_PROMPT_TEMPLATES.build(prompt_template) + if meta_review_prompt_template is not None: + self.meta_review_prompt_tmpl = ICL_PROMPT_TEMPLATES.build( + meta_review_prompt_template) max_out_len = judge_cfg.get('max_out_len', None) batch_size = judge_cfg.get('batch_size', None) @@ -108,16 +112,20 @@ class LMEvaluator: self.postprocessor = get_type_from_cfg(postprocessor) self.logger = get_logger() self.dataset_cfg = dataset_cfg - self.infer_order = infer_order - def score(self, predictions, references: Optional[List] = None) -> Dict: + def score(self, + predictions, + judgements: Optional[List] = None, + references: Optional[List] = None, + meta: Optional[bool] = False, + infer_order: Optional[str] = 'random') -> Dict: dup_indices = [] if type(predictions) == list: """Apply to multi-model comparison.""" references = [{} for _ in range(len(predictions[0]['model_preds'])) ] if references is None else references predictions, references = order_preds_and_record_references( - predictions, references, self.infer_order) + predictions, references, infer_order) # calculate dupicated predictions numbers total_predictions_num = len(predictions[0]) @@ -135,6 +143,9 @@ class LMEvaluator: ] if references is None else references predictions = [predictions['model_preds']] + # Due to the rarity of identical predictions, we have temporarily disabled the plagiarism detection feature. + dup_indices = [] + if len(dup_indices) != 0: # remove dupicated predictions for index in sorted(dup_indices, reverse=True): @@ -149,6 +160,14 @@ class LMEvaluator: for i in range(len(predictions)): key = 'prediction' if i == 0 else f'prediction{i + 1}' pred_dict[key] = predictions[i] + if judgements: + for i in range(len(judgements)): + key = 'judgement' if i == 0 else f'judgement{i + 1}' + pred_dict[key] = judgements[i]['model_preds'] + for j in range(len(references)): + references[j]['judge_model' + + str(i + 1)] = judgements[i]['model_name'] + elif isinstance( predictions[0][0], list ): #multi round for format like [[[{'round':1, 'user':'', 'assistant':''}, {'round':2, 'user':'', 'assistant':''}], [{'round':1, 'user':'', 'assistant':''}, {'round':2, 'user':'', 'assistant':''}]]] @@ -158,11 +177,13 @@ class LMEvaluator: key = 'prediction' if i == 0 else f'prediction{i}' key += '_r' + str(j + 1) pred_dict[key] = multiround_predictions[j] - + if judgements: + raise NotImplementedError( + 'Not applied meta-reivew judge on multi-round dataset') if self.dataset_cfg: dataset = build_dataset_from_cfg(self.dataset_cfg) - if self.infer_order == 'double': + if infer_order == 'double': new_ds = { k: dataset.test[k] * 2 for k in dataset.test.column_names @@ -179,7 +200,6 @@ class LMEvaluator: print( f'Among total {total_predictions_num} predictions, there are {len(dup_indices)} predictions totally same, which are removed!' 
) - for k, v in pred_dict.items(): dataset.reader.dataset['test'] = dataset.test.add_column(k, v) dataset.reader.input_columns.append(k) @@ -201,8 +221,13 @@ class LMEvaluator: **pred_dict) dataset.reader.output_column = 'reference' retriever = ZeroRetriever(dataset) - self.inferencer.inference(retriever=retriever, - prompt_template=self.prompt_tmpl) + if meta: + self.inferencer.inference( + retriever=retriever, + prompt_template=self.meta_review_prompt_tmpl) + else: + self.inferencer.inference(retriever=retriever, + prompt_template=self.prompt_tmpl) output = mmengine.load(self.output_path) return self.postprocess(output) diff --git a/opencompass/partitioners/base.py b/opencompass/partitioners/base.py index 1b24d259..9a94ee6c 100644 --- a/opencompass/partitioners/base.py +++ b/opencompass/partitioners/base.py @@ -1,3 +1,4 @@ +# flake8: noqa: E501 import inspect from abc import abstractmethod from copy import deepcopy @@ -81,11 +82,21 @@ class BasePartitioner: work_dir=work_dir, out_dir=self.out_dir, add_cfg=add_cfg) - - self.logger.info(f'Partitioned into {len(tasks)} tasks.') - for i, task in enumerate(tasks): - self.logger.debug(f'Task {i}: {task_abbr_from_cfg(task)}') - + if isinstance(tasks, list) and len(tasks) != 0 and isinstance( + tasks[0], list): + self.logger.info( + f'Now we are in the subjective evluation! Partitioned into 2 stages and {len(tasks[0])} tasks in first stage, {len(tasks[1])} tasks in second stage.' + ) + cnt = 0 + for task_part in tasks: + for task in task_part: + self.logger.debug( + f'Task {cnt}: {task_abbr_from_cfg(task)}') + cnt += 1 + else: + self.logger.info(f'Partitioned into {len(tasks)} tasks.') + for i, task in enumerate(tasks): + self.logger.debug(f'Task {i}: {task_abbr_from_cfg(task)}') return tasks def parse_model_dataset_args(self, cfg: ConfigDict): diff --git a/opencompass/partitioners/sub_naive.py b/opencompass/partitioners/sub_naive.py index e21193b0..45cc0c47 100644 --- a/opencompass/partitioners/sub_naive.py +++ b/opencompass/partitioners/sub_naive.py @@ -1,14 +1,20 @@ +# flake8: noqa: E501 +import copy +import os.path as osp from itertools import combinations, product from typing import Dict, List, Optional, Tuple from mmengine.config import ConfigDict from opencompass.registry import PARTITIONERS +from opencompass.utils import (deal_with_judge_model_abbr, + get_infer_output_path, model_abbr_from_cfg) from .naive import NaivePartitioner def remove_duplicate_pairs(model_combinations): + # For compare mode, we need to remove redundant pairs first combo_dict = {} for i, combo in enumerate(model_combinations): sorted_names = tuple(sorted((combo[0]['abbr'], combo[1]['abbr']))) @@ -20,6 +26,82 @@ def remove_duplicate_pairs(model_combinations): return new_model_combinations +def replicate_tasks_with_judge_models(tasks, judge_models, meta_judge_model): + # When all tasks are already partitioned, we add judge_models and meta_judge_model as additional args. 
+ if meta_judge_model: + replicated_tasks = [[], []] + else: + replicated_tasks = [] + for task in tasks: + replicated_task_dicts = [task.copy() for _ in range(len(judge_models))] + for idx, replicated_task in enumerate(replicated_task_dicts): + replicated_task['judge_model'] = judge_models[idx] + if meta_judge_model: + meta_task = task.copy() + meta_task['meta_judge_model'] = meta_judge_model + meta_task['judge_models'] = judge_models + replicated_tasks[1].append(meta_task) + replicated_tasks[0].extend(replicated_task_dicts) + else: + replicated_tasks.extend(replicated_task_dicts) + return replicated_tasks + + +def remove_already_tasks(tasks, work_dir, meta_judge_model): + # Check and remove the already finished subjective evaluation tasks + if isinstance(tasks, list) and len(tasks) != 0 and isinstance( + tasks[0], list): + tasks_to_keep = [[], []] + for i in range(2): + for task in tasks[i]: + temp_task = copy.deepcopy(task) + to_delete_index = [ + ] # To deal with the situation that the partition strategy is not split, which means that there will be a task contains multi dataset, and when we need to re-start, we need to remove the already done tasks. + for idx, dataset in enumerate(task['datasets'][0]): + if i == 0: + filename = get_infer_output_path( + deal_with_judge_model_abbr(task['models'][0], + task['judge_model'], + False), dataset, + osp.join(work_dir, 'results')) + else: + filename = get_infer_output_path( + deal_with_judge_model_abbr( + task['models'][0], task['meta_judge_model'], + True), dataset, osp.join(work_dir, 'results')) + if osp.exists(filename): + to_delete_index.append(idx) + temp_task['datasets'][0] = [ + temp_task['datasets'][0][j] + for j in range(len(temp_task['datasets'][0])) + if j not in to_delete_index + ] + if len(temp_task['datasets'][0]) != 0: + tasks_to_keep[i].append(temp_task) + else: + tasks_to_keep = [] + for task in tasks: + temp_task = copy.deepcopy(task) + to_delete_index = [ + ] # To deal with the situation that the partition strategy is not split, which means that there will be a task contains multi dataset, and when we need to re-start, we need to remove the already done tasks. + for idx, dataset in enumerate(task['datasets'][0]): + filename = get_infer_output_path( + deal_with_judge_model_abbr(task['models'][0], + task['judge_model']), dataset, + osp.join(work_dir, 'results')) + if osp.exists(filename): + to_delete_index.append(idx) + # Remove the already done tasks + temp_task['datasets'][0] = [ + temp_task['datasets'][0][j] + for j in range(len(temp_task['datasets'][0])) + if j not in to_delete_index + ] + if len(temp_task['datasets'][0]) != 0: + tasks_to_keep.append(temp_task) + return tasks_to_keep + + @PARTITIONERS.register_module() class SubjectiveNaivePartitioner(NaivePartitioner): """Naive task partitioner for subjective evaluation. 
Compared to @@ -37,15 +119,22 @@ class SubjectiveNaivePartitioner(NaivePartitioner): models: Optional[List[ConfigDict]] = [], base_models: Optional[List[ConfigDict]] = [], compare_models: Optional[List[ConfigDict]] = [], + judge_models: Optional[List[ConfigDict]] = [], + meta_judge_model: Optional[ConfigDict] = None, model_pairs: Optional[List[Tuple]] = None, - keep_keys: Optional[List[str]] = None): + keep_keys: Optional[List[str]] = None, + infer_order: Optional[str] = 'random'): super().__init__(out_dir=out_dir, keep_keys=keep_keys) assert mode in ['singlescore', 'allpair', 'm2n', 'fixed'] + assert infer_order in ['random', 'double'] self.mode = mode self.models = models self.base_models = base_models self.compare_models = compare_models self.model_pairs = model_pairs + self.judge_models = judge_models + self.meta_judge_model = meta_judge_model + self.infer_order = infer_order def get_model_combinations( self, @@ -97,14 +186,35 @@ class SubjectiveNaivePartitioner(NaivePartitioner): """ models = self.models if self.models != [] else models base_models, compare_models = self.base_models, self.compare_models + judge_models, meta_judge_model = self.judge_models, self.meta_judge_model if self.mode == 'singlescore': models = models else: models = self.get_model_combinations(models, base_models, compare_models) model_dataset_combinations = [{'models': models, 'datasets': datasets}] - return super().partition( + tasks = super().partition( model_dataset_combinations=model_dataset_combinations, work_dir=work_dir, out_dir=out_dir, add_cfg=add_cfg) + + # We need to add judge models and meta-judge-model as new tasks + # When there is no meta-judge-model, we assign all judge models to each tasks + # When there is a meta-judge-model, we add an additional task stage + tasks = replicate_tasks_with_judge_models(tasks, judge_models, + meta_judge_model) + + # We also need to check and remove the already done tasks + tasks = remove_already_tasks(tasks, work_dir, meta_judge_model) + if isinstance(tasks, list) and len(tasks) != 0 and isinstance( + tasks[0], list): + # Refer to meta review judge + for task_stage in tasks: + for task in task_stage: + task['infer_order'] = self.infer_order + else: + # Refer to just have review judge + for task in tasks: + task['infer_order'] = self.infer_order + return tasks diff --git a/opencompass/partitioners/sub_size.py b/opencompass/partitioners/sub_size.py index 624cef7b..fa51ee4f 100644 --- a/opencompass/partitioners/sub_size.py +++ b/opencompass/partitioners/sub_size.py @@ -1,3 +1,4 @@ +# flake8: noqa: E501 import copy import math import os.path as osp @@ -11,7 +12,8 @@ from opencompass.registry import PARTITIONERS from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg, get_infer_output_path) -from .sub_naive import SubjectiveNaivePartitioner +from .sub_naive import (SubjectiveNaivePartitioner, remove_already_tasks, + replicate_tasks_with_judge_models) @PARTITIONERS.register_module() @@ -40,19 +42,25 @@ class SubjectiveSizePartitioner(SubjectiveNaivePartitioner): models: Optional[List[ConfigDict]] = [], base_models: Optional[List[ConfigDict]] = [], compare_models: Optional[List[ConfigDict]] = [], + judge_models: Optional[List[ConfigDict]] = [], + meta_judge_model: Optional[ConfigDict] = None, model_pairs: Optional[List[Tuple]] = None, max_task_size: int = 40000, gen_task_coef: int = 20, strategy: str = 'heuristic', dataset_size_path: str = '.cache/dataset_size.json', - keep_keys: Optional[List[str]] = None): + keep_keys: Optional[List[str]] = None, 
+ infer_order: Optional[str] = 'random'): super().__init__(out_dir=out_dir, keep_keys=keep_keys, mode=mode, models=models, base_models=base_models, compare_models=compare_models, - model_pairs=model_pairs) + judge_models=judge_models, + meta_judge_model=meta_judge_model, + model_pairs=model_pairs, + infer_order=infer_order) self.max_task_size = max_task_size self.gen_task_coef = gen_task_coef self.dataset_size_path = dataset_size_path @@ -96,13 +104,13 @@ class SubjectiveSizePartitioner(SubjectiveNaivePartitioner): """ models = self.models if self.models != [] else models base_models, compare_models = self.base_models, self.compare_models + judge_models, meta_judge_model = self.judge_models, self.meta_judge_model if self.mode == 'singlescore': models = models else: models = super().get_model_combinations(models, base_models, compare_models) model_dataset_combinations = [{'models': models, 'datasets': datasets}] - tasks = [] for comb in model_dataset_combinations: comb['datasets'] = sorted(comb['datasets'], @@ -113,8 +121,8 @@ class SubjectiveSizePartitioner(SubjectiveNaivePartitioner): for dataset in comb['datasets']: filename = get_infer_output_path(model, dataset, out_dir) # skip the task if the task output exists - if osp.exists(filename): - continue + # if osp.exists(filename): + # continue dataset_size = self.get_cost(dataset) if dataset_size > self.max_task_size: root, ext = osp.splitext(filename) @@ -151,6 +159,21 @@ class SubjectiveSizePartitioner(SubjectiveNaivePartitioner): 'work_dir': work_dir, **add_cfg })) + + tasks = replicate_tasks_with_judge_models(tasks, judge_models, + meta_judge_model) + tasks = remove_already_tasks(tasks, work_dir, meta_judge_model) + + if isinstance(tasks, list) and len(tasks) != 0 and isinstance( + tasks[0], list): + # Refer to meta review judge + for task_stage in tasks: + for task in task_stage: + task['infer_order'] = self.infer_order + else: + # Refer to just have review judge + for task in tasks: + task['infer_order'] = self.infer_order return tasks @property diff --git a/opencompass/summarizers/subjective/alignmentbench.py b/opencompass/summarizers/subjective/alignmentbench.py index 49e57bb3..d114a157 100644 --- a/opencompass/summarizers/subjective/alignmentbench.py +++ b/opencompass/summarizers/subjective/alignmentbench.py @@ -309,7 +309,7 @@ class AlignmentBenchSummarizer: self.eval_model_abbrs = [ model_abbr_from_cfg(model) for model in self.eval_model_cfgs ] - self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model']) + self.judge_models = self.cfg.get('judge_models', None) self.judge_type = judge_type assert self.judge_type in [ 'general', 'autoj', 'judgelm', 'general_plus' @@ -333,33 +333,36 @@ class AlignmentBenchSummarizer: Returns: pd.DataFrame: The summary results. 
""" - dataset_cfgs = self.cfg['datasets'] - output_dir, results_folder = get_outdir(self.cfg, time_str) - fout_flag, fout_flag2 = 0, 0 - for eval_model_abbr in self.eval_model_abbrs: - subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr - subdir_path = os.path.join(results_folder, subdir) - if os.path.isdir(subdir_path): - model, judge_model = eval_model_abbr, self.judge_abbr - if self.judge_type == 'general': - fout = osp.join( - output_dir, - 'judged-by--' + judge_model + '-dimension.csv') - fout2 = osp.join( - output_dir, - 'judged-by--' + judge_model + '-capability.csv') - for dataset in dataset_cfgs: - judged_answers, references = get_judgeanswer_and_reference( - dataset, subdir_path, self.judge_function) + for judge_model in self.judge_models: + judge_abbr = model_abbr_from_cfg(judge_model) + dataset_cfgs = self.cfg['datasets'] + output_dir, results_folder = get_outdir(self.cfg, time_str) + fout_flag, fout_flag2 = 0, 0 + for eval_model_abbr in self.eval_model_abbrs: + subdir = eval_model_abbr + '_judged-by--' + judge_abbr + subdir_path = os.path.join(results_folder, subdir) + if os.path.isdir(subdir_path): + model = eval_model_abbr if self.judge_type == 'general': - get_dimension_results(judged_answers, references, fout, - fout_flag, model) - fout_flag += 1 - get_capability_results(judged_answers, references, fout2, - fout_flag2, model, self.category) - fout_flag2 += 1 - else: - print(subdir_path + ' is not exist! please check!') + fout = osp.join( + output_dir, + 'judged-by--' + judge_abbr + '-dimension.csv') + fout2 = osp.join( + output_dir, + 'judged-by--' + judge_abbr + '-capability.csv') + for dataset in dataset_cfgs: + judged_answers, references = get_judgeanswer_and_reference( + dataset, subdir_path, self.judge_function) + if self.judge_type == 'general': + get_dimension_results(judged_answers, references, + fout, fout_flag, model) + fout_flag += 1 + get_capability_results(judged_answers, references, + fout2, fout_flag2, model, + self.category) + fout_flag2 += 1 + else: + print(subdir_path + ' is not exist! 
please check!') if self.judge_type == 'general': with open(fout, 'r') as f: x = from_csv(f) diff --git a/opencompass/summarizers/subjective/alpacaeval.py b/opencompass/summarizers/subjective/alpacaeval.py index ac7e3f48..d9e858dc 100644 --- a/opencompass/summarizers/subjective/alpacaeval.py +++ b/opencompass/summarizers/subjective/alpacaeval.py @@ -82,7 +82,8 @@ class AlpacaSummarizer: self.cfg = config self.base_models = self.cfg['eval']['partitioner']['base_models'] self.compare_models = self.cfg['eval']['partitioner']['compare_models'] - self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model']) + self.judge_abbr = model_abbr_from_cfg( + self.cfg['judge_models'][0]) # We will reorganize the summarizers self.judge_type = judge_type assert self.judge_type in ['v1', 'v2'] self.judge_map = { diff --git a/opencompass/summarizers/subjective/compass_arena.py b/opencompass/summarizers/subjective/compass_arena.py index d23c9804..e62d260c 100644 --- a/opencompass/summarizers/subjective/compass_arena.py +++ b/opencompass/summarizers/subjective/compass_arena.py @@ -67,7 +67,9 @@ class CompassArenaSummarizer: self.cfg = config self.base_models = self.cfg['eval']['partitioner']['base_models'] self.compare_models = self.cfg['eval']['partitioner']['compare_models'] - self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model']) + self.judge_models = self.cfg.get('judge_models', None) + self.meta_judge_model = self.cfg.eval.partitioner.get( + 'meta_judge_model', None) self.judge_type = judge_type assert self.judge_type in ['general'] self.judge_map = { @@ -95,109 +97,135 @@ class CompassArenaSummarizer: product(self.base_models, self.compare_models)) unique_combinations = remove_duplicate_pairs( [combo for combo in model_combinations if combo[0] != combo[1]]) - judge_model = self.judge_abbr + fout_list = [] - for dataset in dataset_cfgs: - dataset_abbr = dataset_abbr_from_cfg(dataset) - fout = osp.join( - output_dir, 'judged-by--' + judge_model + '-' + dataset_abbr + - '-report.csv') - fout_list.append(fout) - for model_pair in unique_combinations: - model1, model2, = model_pair[0]['abbr'], model_pair[1]['abbr'], - subdir = model1 + '_' + model2 + '_judged-by--' + judge_model - subdir_path = os.path.join(results_folder, subdir) - if os.path.isdir(subdir_path): - judged_answers, references = get_judgeanswer_and_reference( - dataset, - subdir_path, - self.judge_function, - ) - if self.check_pos_bias: - bias_num = check_position_bias(judged_answers, - references) - else: - bias_num = 0 - win_model1, win_model2, categories = defaultdict( - float), defaultdict(float), defaultdict(float) - model1, model2 = references[0]['answer1'], references[0][ - 'answer2'] - for prediction, reference in zip(judged_answers, - references): - if self.summary_type == 'single': - if prediction == 'A': - categories['total'] += 1 - categories[reference['capability']] += 1 - if reference['answer1'] == model1: - win_model1[reference['capability']] += 1 - win_model1['total'] += 1 - else: - win_model2[reference['capability']] += 1 - win_model2['total'] += 1 - elif prediction == 'B': - categories['total'] += 1 - categories[reference['capability']] += 1 - if reference['answer1'] == model1: - win_model2[reference['capability']] += 1 - win_model2['total'] += 1 - else: - win_model1[reference['capability']] += 1 - win_model1['total'] += 1 - elif self.summary_type == 'half_add': - categories['total'] += 1 - categories[reference['capability']] += 1 - if prediction == 'A': - if reference['answer1'] == model1: - 
win_model1[reference['capability']] += 1 - win_model1['total'] += 1 - else: - win_model2[reference['capability']] += 1 - win_model2['total'] += 1 - elif prediction == 'B': - if reference['answer1'] == model1: - win_model2[reference['capability']] += 1 - win_model2['total'] += 1 - else: - win_model1[reference['capability']] += 1 - win_model1['total'] += 1 - elif prediction == 'C': - win_model1[reference['capability']] += 0.5 - win_model1['total'] += 0.5 - win_model2[reference['capability']] += 0.5 - win_model2['total'] += 0.5 - for capability in categories: - if capability not in win_model1: - win_model1[capability] = 0.0 - else: - win_model1[capability] = round( - (win_model1[capability] / - categories[capability]) * 100, 2) - if capability not in win_model2: - win_model2[capability] = 0.0 - else: - win_model2[capability] = round( - (win_model2[capability] / - categories[capability]) * 100, 2) - win_model1['position_bias'] = bias_num - win_model2['position_bias'] = bias_num - scores = { - 'win_' + model1: win_model1, - 'win_' + model2: win_model2 - } - rows = list(scores.keys()) - columns = list(scores[rows[0]].keys()) - columns.insert(0, columns.pop(columns.index('total'))) - columns.insert(1, - columns.pop(columns.index('position_bias'))) - with open(fout, 'a+', newline='') as csvfile: - writer = csv.writer(csvfile) - writer.writerow([model1 + '_vs_' + model2] + columns) - for row in rows: - writer.writerow( - [row] + - [scores[row][column] for column in columns]) + pre_len = len(self.judge_models) + if self.meta_judge_model is not None: + self.judge_models.append(self.meta_judge_model) + meta_judge_model_abbr = model_abbr_from_cfg(self.meta_judge_model) + else: + meta_judge_model_abbr = None + for idx, judge_model in enumerate(self.judge_models): + judge_model = model_abbr_from_cfg(judge_model) + for dataset in dataset_cfgs: + dataset_abbr = dataset_abbr_from_cfg(dataset) + if idx == pre_len: + fout = osp.join( + output_dir, 'summarized-by--' + judge_model + '-' + + dataset_abbr + '-report.csv') else: - print(subdir_path + ' is not exist! 
please check!') + fout = osp.join( + output_dir, 'judged-by--' + judge_model + '-' + + dataset_abbr + '-report.csv') + fout_list.append(fout) + for model_pair in unique_combinations: + model1, model2, = model_pair[0]['abbr'], model_pair[1][ + 'abbr'], + if idx == pre_len: + subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model + else: + subdir = model1 + '_' + model2 + '_judged-by--' + judge_model + subdir_path = os.path.join(results_folder, subdir) + if os.path.isdir(subdir_path): + judged_answers, references = get_judgeanswer_and_reference( + dataset, + subdir_path, + self.judge_function, + ) + if self.check_pos_bias: + bias_num = check_position_bias( + judged_answers, references) + else: + bias_num = 0 + win_model1, win_model2, categories = defaultdict( + float), defaultdict(float), defaultdict(float) + model1, model2 = references[0]['answer1'], references[ + 0]['answer2'] + for prediction, reference in zip( + judged_answers, references): + if self.summary_type == 'single': + if prediction == 'A': + categories['total'] += 1 + categories[reference['capability']] += 1 + if reference['answer1'] == model1: + win_model1[ + reference['capability']] += 1 + win_model1['total'] += 1 + else: + win_model2[ + reference['capability']] += 1 + win_model2['total'] += 1 + elif prediction == 'B': + categories['total'] += 1 + categories[reference['capability']] += 1 + if reference['answer1'] == model1: + win_model2[ + reference['capability']] += 1 + win_model2['total'] += 1 + else: + win_model1[ + reference['capability']] += 1 + win_model1['total'] += 1 + elif self.summary_type == 'half_add': + categories['total'] += 1 + categories[reference['capability']] += 1 + if prediction == 'A': + if reference['answer1'] == model1: + win_model1[ + reference['capability']] += 1 + win_model1['total'] += 1 + else: + win_model2[ + reference['capability']] += 1 + win_model2['total'] += 1 + elif prediction == 'B': + if reference['answer1'] == model1: + win_model2[ + reference['capability']] += 1 + win_model2['total'] += 1 + else: + win_model1[ + reference['capability']] += 1 + win_model1['total'] += 1 + elif prediction == 'C': + win_model1[reference['capability']] += 0.5 + win_model1['total'] += 0.5 + win_model2[reference['capability']] += 0.5 + win_model2['total'] += 0.5 + for capability in categories: + if capability not in win_model1: + win_model1[capability] = 0.0 + else: + win_model1[capability] = round( + (win_model1[capability] / + categories[capability]) * 100, 2) + if capability not in win_model2: + win_model2[capability] = 0.0 + else: + win_model2[capability] = round( + (win_model2[capability] / + categories[capability]) * 100, 2) + win_model1['position_bias'] = bias_num + win_model2['position_bias'] = bias_num + scores = { + 'win_' + model1: win_model1, + 'win_' + model2: win_model2 + } + rows = list(scores.keys()) + columns = list(scores[rows[0]].keys()) + columns.insert(0, columns.pop(columns.index('total'))) + columns.insert( + 1, columns.pop(columns.index('position_bias'))) + with open(fout, 'a+', newline='') as csvfile: + writer = csv.writer(csvfile) + writer.writerow([model1 + '_vs_' + model2] + + columns) + for row in rows: + writer.writerow([row] + [ + scores[row][column] for column in columns + ]) + else: + print(subdir_path + ' is not exist! 
please check!') for fout in fout_list: with open(fout, 'r') as f: x = from_csv(f) diff --git a/opencompass/summarizers/subjective/mtbench.py b/opencompass/summarizers/subjective/mtbench.py index 8d80544f..5da2e538 100644 --- a/opencompass/summarizers/subjective/mtbench.py +++ b/opencompass/summarizers/subjective/mtbench.py @@ -98,7 +98,7 @@ class MTBenchSummarizer(CompassArenaSummarizer): self.base_models = self.cfg['eval']['partitioner']['base_models'] self.compare_models = self.cfg['eval']['partitioner'][ 'compare_models'] - self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model']) + self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0]) self.judge_map = { 'single': post_process_mtbench_single, 'pair': post_process_mtbench_pair diff --git a/opencompass/tasks/subjective_eval.py b/opencompass/tasks/subjective_eval.py index 30847f79..a455f38d 100644 --- a/opencompass/tasks/subjective_eval.py +++ b/opencompass/tasks/subjective_eval.py @@ -1,10 +1,11 @@ +# flake8: noqa: E501 import argparse import copy import fnmatch import os.path as osp import random import time -from typing import List, Union +from typing import List, Optional, Union import mmengine from mmengine.config import Config, ConfigDict @@ -14,6 +15,7 @@ from opencompass.registry import ICL_EVALUATORS, MODELS, TEXT_POSTPROCESSORS from opencompass.tasks.base import BaseTask from opencompass.tasks.openicl_eval import extract_role_pred from opencompass.utils import (build_dataset_from_cfg, dataset_abbr_from_cfg, + deal_with_judge_model_abbr, get_infer_output_path, get_logger, model_abbr_from_cfg, task_abbr_from_cfg) @@ -35,21 +37,25 @@ class SubjectiveEvalTask(BaseTask): def __init__(self, cfg: ConfigDict): super().__init__(cfg) self.logger = get_logger() - judge_cfg = cfg.eval.runner.task.get('judge_cfg', {}) - if type(judge_cfg) != ConfigDict: - print('*' * 100) - print('Due to different Judge model needs different summarizer and' - " prompts, we don't support multi judge model evaluation at " - 'one time, please do not use list to set your judge cfg, jus' - 't use a dict or list[0] should be fine. If you want to eval' - 'uation multi judge model in one script, we suggest you to u' - 'se a bash or bat script to start multi configs evaluation!') - print('*' * 100) - assert type(judge_cfg) == ConfigDict + judge_cfg = cfg.get('judge_model', None) + meta_judge_cfg = cfg.get('meta_judge_model', None) + judge_models = cfg.get('judge_models', None) + + if judge_cfg is None and meta_judge_cfg is None: + assert judge_cfg is not None, 'Both judge_cfg and meta_judge_cfg are None, but judge_models must be provided.' + + if meta_judge_cfg is not None: + assert judge_models is not None, 'meta_judge_cfg is provided, but judge_models are missing.' 
+ judge_cfg = meta_judge_cfg # Relpace judge_cfg to meta_judge_cfg when it is not None + self.meta = True + else: + self.meta = False run_cfg = judge_cfg.get('run_cfg', {}) self.num_gpus = run_cfg.get('num_gpus', 0) self.num_procs = run_cfg.get('num_procs', 1) self.judge_cfg = copy.deepcopy(judge_cfg) + self.judge_models = judge_models + self.infer_order = cfg.get('infer_order') self.given_pred = cfg.eval.get('given_pred', []) def get_command(self, cfg_path, template): @@ -78,17 +84,15 @@ class SubjectiveEvalTask(BaseTask): # Load Dataset eval_cfg = dataset_cfg.get('eval_cfg') output_column = dataset_cfg['reader_cfg']['output_column'] - if type(model_cfg) == ConfigDict: - model_cfg = (model_cfg, ) - model_cfg += ({ - 'abbr': - 'judged-by--' + model_abbr_from_cfg(self.judge_cfg) - }, ) out_path = get_infer_output_path( - model_cfg, dataset_cfg, osp.join(self.work_dir, 'results')) + deal_with_judge_model_abbr(model_cfg, self.judge_cfg, + self.meta), dataset_cfg, + osp.join(self.work_dir, 'results')) if osp.exists(out_path): continue - self._score(model_cfg, dataset_cfg, eval_cfg, output_column) + + self._score(model_cfg, dataset_cfg, eval_cfg, output_column, + self.meta) def _load_model_pred( self, @@ -194,7 +198,139 @@ class SubjectiveEvalTask(BaseTask): 'model_preds': pred_strs } - def _score(self, model_cfg, dataset_cfg, eval_cfg, output_column): + def _load_model_judgements( + self, + model_cfg: Union[ConfigDict, List[ConfigDict]], + dataset_cfg: ConfigDict, + eval_cfg: ConfigDict, + judge_cfg: Union[ConfigDict, List[ConfigDict]], + ) -> Union[None, List[str]]: + + if isinstance(judge_cfg, (tuple, list)): + return [ + self._load_model_judgements(model_cfg, dataset_cfg, eval_cfg, + j) for j in judge_cfg + ] + + pred_strs = None + model_cfg = [model_cfg] if isinstance(model_cfg, + ConfigDict) else model_cfg + # There will be 5 situations, so we need to deal with them + # 1.There are no partitions in infer and judge stage + # 2.No partition in infer stage, but use partition in judge stage + # 3.Use partition in infer stage, but not use partition in judge stage + # 4.Use both partition, with same partition size + # 5.Use both partition, but different partition size + + # If take SubjectSizePartition, get new filename without _0 + if 'test_range' in dataset_cfg['reader_cfg']: + filename = get_infer_output_path( + deal_with_judge_model_abbr([m for m in model_cfg], judge_cfg), + dataset_cfg, osp.join(self.work_dir, 'results')) + root, ext = osp.splitext(filename) + last_underscore_index = root.rfind('_') + root = root[:last_underscore_index] + filename = root + ext + # If take SubjectNaivePartition, get filename + else: + filename = get_infer_output_path( + deal_with_judge_model_abbr([m for m in model_cfg], judge_cfg), + dataset_cfg, osp.join(self.work_dir, 'results')) + # Get partition name + root, ext = osp.splitext(filename) + partial_filename = root + '_0' + ext + + # If no predictions get in predictions dir + if not osp.exists(osp.realpath(filename)) and not osp.exists( + osp.realpath(partial_filename)): + return {'error': 'No judgements found.'} + else: + # If use Naive partition in infer stage + if osp.exists(osp.realpath(filename)): + preds = mmengine.load(filename) + pred_strs = [ + preds[str(i)]['prediction'] for i in range(len(preds)) + ] + # If use Size partition in infer stage + else: + filename = partial_filename + pred_strs = [] + i = 1 + while osp.exists(osp.realpath(filename)): + preds = mmengine.load(filename) + filename = root + f'_{i}' + ext + i += 1 + pred_strs += [ + 
preds[str(i)]['prediction'] for i in range(len(preds)) + ] + # Get all judgements in pred_strs + # If take SubjectSizePartition, get new pred_strs based on test_range + if 'test_range' in dataset_cfg['reader_cfg']: + test_range = dataset_cfg['reader_cfg']['test_range'] + if self.infer_order == 'double': + # When set infer_order as double, we need to select the judgements to meet the predctions which will be doubled later + start = 0 + end = None + pred_strs_length = len(pred_strs) + # Split the string by the ':', the test_range is a string shapes like '[0:15]' + parts = test_range.strip('[]').split(':') + # Check if the start index is provided + if parts[0]: + start = int(parts[0]) + # Check if the end index is provided + if len(parts) > 1 and parts[1]: + end = int(parts[1]) + else: + # If the end is not provided, determine the default end based on the length of 'pred_strs' + end = int(pred_strs_length / 2) + assert pred_strs_length % 2 == 0, "Since you have set the infer_order as 'double', the length of 'pred_strs' must be even." + assert end <= pred_strs_length / 2, "The 'end' value must not exceed half of the 'pred_strs' length." + # Reset the newly start and end + start *= 2 + end *= 2 + pred_strs = eval('pred_strs[' + str(start) + ':' + str(end) + + ']') + else: + pred_strs = eval('pred_strs' + test_range) + # If take SubjectNaivePartition, get all pred_strs + else: + pred_strs = pred_strs + if ('pred_role' in eval_cfg and 'meta_template' in judge_cfg + and not MODELS.get(judge_cfg['type']).is_api + and isinstance(pred_strs[0], str)): + # Create a prompt template for role config parsing + from opencompass.models.base import LMTemplateParser + parser = LMTemplateParser(judge_cfg['meta_template']) + role = parser.roles[eval_cfg['pred_role']] + pred_strs = [ + extract_role_pred(pred, role.get('begin', None), + role.get('end', None)) for pred in pred_strs + ] + + # Postprocess predictions if necessary + ds_abbr = dataset_abbr_from_cfg(dataset_cfg) + model_postprocessors = judge_cfg.get('pred_postprocessor', {}) + pred_postprocessor = None + for pattern in model_postprocessors.keys(): + if fnmatch.fnmatch(ds_abbr, pattern): + pred_postprocessor = model_postprocessors[pattern] + break + if 'pred_postprocessor' in eval_cfg or pred_postprocessor: + kwargs = pred_postprocessor or eval_cfg['pred_postprocessor'] + proc = TEXT_POSTPROCESSORS.get(kwargs.pop('type')) + pred_strs = [proc(s, **kwargs) for s in pred_strs] + + return { + 'model_name': model_abbr_from_cfg(judge_cfg), + 'model_preds': pred_strs + } + + def _score(self, + model_cfg, + dataset_cfg, + eval_cfg, + output_column, + meta=False): test_set = build_dataset_from_cfg(dataset_cfg).test # Postprocess dataset if necessary if 'dataset_postprocessor' in eval_cfg: @@ -208,27 +344,32 @@ class SubjectiveEvalTask(BaseTask): test_set = test_set.map(postprocess) # Get out_path - out_path = get_infer_output_path(model_cfg, dataset_cfg, - osp.join(self.work_dir, 'results')) - new_model_cfg = [] - for m_cfg in model_cfg: - if len(m_cfg) > 1: - new_model_cfg.append(m_cfg) - if len(new_model_cfg) == 1: - new_model_cfg = new_model_cfg[0] - model_preds = self._load_model_pred(new_model_cfg, dataset_cfg, - eval_cfg, self.given_pred) + out_path = get_infer_output_path( + deal_with_judge_model_abbr(model_cfg, self.judge_cfg, self.meta), + dataset_cfg, osp.join(self.work_dir, 'results')) + if meta: + model_preds = self._load_model_pred(model_cfg, dataset_cfg, + eval_cfg, self.given_pred) + model_judges = self._load_model_judgements(model_cfg, 
dataset_cfg, + eval_cfg, + self.judge_models) + else: + model_preds = self._load_model_pred(model_cfg, dataset_cfg, + eval_cfg, self.given_pred) + model_judges = None if not self.judge_cfg: - raise ValueError('missing "eval.runner.task.judge_cfg"') + raise ValueError('missing "eval.judge_cfg"') eval_cfg['evaluator']['judge_cfg'] = self.judge_cfg eval_cfg['evaluator']['dataset_cfg'] = dataset_cfg eval_cfg['evaluator']['output_path'] = out_path icl_evaluator = ICL_EVALUATORS.build(eval_cfg['evaluator']) references = (test_set[output_column] if output_column else None) - if 'error' not in model_preds: result = icl_evaluator.score(predictions=model_preds, - references=references) + judgements=model_judges, + references=references, + meta=meta, + infer_order=self.infer_order) else: result = model_preds @@ -259,17 +400,24 @@ class SubjectiveEvalTask(BaseTask): output_paths = [] for model, datasets in zip(self.model_cfgs, self.dataset_cfgs): for dataset in datasets: - if type(model) == ConfigDict: + if isinstance(model, ConfigDict): model = (model, ) - model += ({ - 'abbr': - 'judged-by--' + model_abbr_from_cfg(self.judge_cfg) - }, ) + if self.meta: + model += ({ + 'abbr': + 'summarized-by--' + model_abbr_from_cfg(self.judge_cfg) + }, ) + else: + model += ({ + 'abbr': + 'judged-by--' + model_abbr_from_cfg(self.judge_cfg) + }, ) output_paths.append( get_infer_output_path( model, dataset, osp.join(self.work_dir, self.output_subdir), file_extension)) + model = model[:-1] return output_paths diff --git a/opencompass/utils/abbr.py b/opencompass/utils/abbr.py index 4661d94f..44012324 100644 --- a/opencompass/utils/abbr.py +++ b/opencompass/utils/abbr.py @@ -46,3 +46,25 @@ def get_infer_output_path(model_cfg: ConfigDict, model_abbr = model_abbr_from_cfg(model_cfg) dataset_abbr = dataset_abbr_from_cfg(dataset_cfg) return osp.join(root_path, model_abbr, f'{dataset_abbr}.{file_extension}') + + +def deal_with_judge_model_abbr(model_cfg, judge_model_cfg, meta=False): + if isinstance(model_cfg, ConfigDict): + model_cfg = (model_cfg, ) + if meta: + for m_cfg in model_cfg: + if 'summarized-by--' in m_cfg['abbr']: + return model_cfg + model_cfg += ({ + 'abbr': + 'summarized-by--' + model_abbr_from_cfg(judge_model_cfg) + }, ) + else: + for m_cfg in model_cfg: + if 'judged-by--' in m_cfg['abbr']: + return model_cfg + model_cfg += ({ + 'abbr': + 'judged-by--' + model_abbr_from_cfg(judge_model_cfg) + }, ) + return model_cfg diff --git a/run.py b/run.py index a991fab4..09c90184 100644 --- a/run.py +++ b/run.py @@ -341,7 +341,14 @@ def main(): if args.dry_run: return runner = RUNNERS.build(cfg.eval.runner) - runner(tasks) + + # For meta-review-judge in subjective evaluation + if isinstance(tasks, list) and len(tasks) != 0 and isinstance( + tasks[0], list): + for task_part in tasks: + runner(task_part) + else: + runner(tasks) # visualize if args.mode in ['all', 'eval', 'viz']:
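Usage note (not part of the diff): with these changes the judge configuration moves out of `task=dict(type=SubjectiveEvalTask, judge_cfg=...)` and into the partitioner, which now takes a list of `judge_models`, an optional `meta_judge_model` that adds a second meta-review task stage, and the `infer_order` that previously lived in each dataset's evaluator config. Below is a minimal sketch of the resulting eval block, modeled on the updated configs in this diff; only judge fields visible in the diff are shown, and `gpt4` / `models` are assumed to be defined earlier in the config file exactly as in the changed configs above.

from opencompass.models import OpenAI
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks.subjective_eval import SubjectiveEvalTask

# One or more judge models; the partitioner replicates every eval task once per judge.
judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    max_seq_len=2048,
    batch_size=8,
    temperature=0,
)]  # api key, meta_template, etc. as in the existing judge configs

eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        strategy='split',
        max_task_size=10000,
        mode='m2n',
        infer_order='double',        # moved here from the dataset evaluator cfg
        base_models=[gpt4],          # `gpt4` and `models` defined as in the configs above
        compare_models=models,
        judge_models=judge_models,
        # meta_judge_model=...,      # optional: enables the second, meta-review stage
    ),
    # SubjectiveEvalTask no longer receives judge_cfg; the partitioner attaches
    # judge_model / meta_judge_model (and infer_order) to every task it emits.
    runner=dict(type=LocalRunner, max_num_workers=2, task=dict(type=SubjectiveEvalTask)),
    given_pred=[{'abbr': 'gpt4-turbo', 'path': ''}],
)

When `meta_judge_model` is set, the partitioner returns two task lists and run.py runs them stage by stage (see the run.py change above); the summarizer then reads `judge_models` from the config, e.g. CompassArenaSummarizer writes one 'judged-by--' report per judge plus a 'summarized-by--' report for the meta judge.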