From 6f98c8d9abe95fd86d81265257482e737cace5f3 Mon Sep 17 00:00:00 2001
From: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com>
Date: Mon, 22 Apr 2024 12:06:03 +0800
Subject: [PATCH] [Fix] Fix MultiRound Subjective Evaluation (#1043)

* fix multiround

* fix
---
 .../functionalmt_zh_judgeby_gpt4.py          |   1 +
 .../eval_subjective_functional_multiround.py | 111 ++++++++++++++++++
 .../openicl/icl_evaluator/lm_evaluator.py    |  21 +++-
 .../icl_inferencer/icl_chat_inferencer.py    |  16 +--
 .../summarizers/subjective/multiround.py     |   3 +-
 5 files changed, 131 insertions(+), 21 deletions(-)
 create mode 100644 configs/eval_subjective_functional_multiround.py

diff --git a/configs/datasets/subjective/multiround/functionalmt_zh_judgeby_gpt4.py b/configs/datasets/subjective/multiround/functionalmt_zh_judgeby_gpt4.py
index 87aea66e..0da6d8e9 100644
--- a/configs/datasets/subjective/multiround/functionalmt_zh_judgeby_gpt4.py
+++ b/configs/datasets/subjective/multiround/functionalmt_zh_judgeby_gpt4.py
@@ -30,6 +30,7 @@ for _name in subjective_all_sets:
     subjective_eval_cfg = dict(
         evaluator=dict(
             type=LMEvaluator,
+            pack_all_predictions=True,
             prompt_template=dict(
                 type=PromptTemplate,
                 template=dict(round=[
diff --git a/configs/eval_subjective_functional_multiround.py b/configs/eval_subjective_functional_multiround.py
new file mode 100644
index 00000000..dfbf29b7
--- /dev/null
+++ b/configs/eval_subjective_functional_multiround.py
@@ -0,0 +1,111 @@
+from opencompass.models import HuggingFaceCausalLM
+from copy import deepcopy
+from opencompass.models import TurboMindModel
+from mmengine.config import read_base
+
+from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
+from opencompass.partitioners import NaivePartitioner, SizePartitioner
+from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
+from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.runners import SlurmSequentialRunner
+from opencompass.tasks import OpenICLInferTask
+from opencompass.tasks.subjective_eval import SubjectiveEvalTask
+from opencompass.summarizers import MultiroundSummarizer
+
+with read_base():
+    from .datasets.subjective.multiround.functionalmt_zh_judgeby_gpt4 import subjective_datasets
+
+api_meta_template = dict(
+    round=[
+        dict(role='HUMAN', api_role='HUMAN'),
+        dict(role='BOT', api_role='BOT', generate=True),
+    ]
+)
+
+_meta_template = dict(
+    round=[
+        dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
+        dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
+    ],
+    eos_token_id=151645,
+)
+
+models = [
+    dict(
+        type=HuggingFaceCausalLM,
+        abbr='qwen1.5-7b-chat-hf',
+        path="Qwen/Qwen1.5-7B-Chat",
+        model_kwargs=dict(
+            device_map='auto',
+            trust_remote_code=True
+        ),
+        tokenizer_kwargs=dict(
+            padding_side='left',
+            truncation_side='left',
+            trust_remote_code=True,
+            use_fast=False,
+        ),
+        generation_kwargs=dict(
+            do_sample=True,
+        ),
+        meta_template=_meta_template,
+        pad_token_id=151645,
+        max_out_len=100,
+        max_seq_len=2048,
+        batch_size=8,
+        run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<|im_end|>',
+    )
+]
+
+datasets = [*subjective_datasets]
+
+work_dir = 'outputs/multiround/'
+# ------------- Inference Stage ----------------------------------------
+
+
+infer = dict(
+    partitioner=dict(type=SizePartitioner, max_task_size=1000),
+    runner=dict(
+        type=SlurmSequentialRunner,
+        partition='your part',
+        quotatype='auto',
+        max_num_workers=256,
+        task=dict(type=OpenICLInferTask)),
+)
+
+judge_models = [dict(
+    abbr='GPT4-Turbo',
+    type=OpenAI,
+    path='gpt-4-1106-preview',
+    key='',
+    meta_template=api_meta_template,
+    query_per_second=1,
+    max_out_len=1024,
+    max_seq_len=4096,
+    batch_size=10,
+    retry=10,
+    temperature=0,
+)]
+
+## ------------- Evaluation Configuration
+eval = dict(
+    partitioner=dict(
+        type=SubjectiveSizePartitioner,
+        max_task_size=1000,
+        mode='singlescore',
+        models=models,
+        judge_models=judge_models
+    ),
+    runner=dict(
+        type=SlurmSequentialRunner,
+        partition='your part',
+        quotatype='auto',
+        max_num_workers=256,
+        task=dict(type=SubjectiveEvalTask)),
+)
+
+summarizer = dict(
+    type=MultiroundSummarizer
+)
\ No newline at end of file
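Note on the new config: it ships with two deliberate placeholders, the Slurm
`partition='your part'` in both runners and the judge's empty OpenAI `key`,
which must be filled in before launching (typically via
`python run.py configs/eval_subjective_functional_multiround.py`). A minimal
pre-flight check is sketched below; the helper is hypothetical, not part of the
patch, and assumes opencompass is installed and the check runs from the
repository root:

    # check_multiround_cfg.py -- hypothetical pre-flight helper, not in the patch.
    from mmengine.config import Config

    cfg = Config.fromfile('configs/eval_subjective_functional_multiround.py')

    # Both stages submit to Slurm; a placeholder partition fails at submit time.
    for stage in ('infer', 'eval'):
        if cfg[stage]['runner'].get('partition') in ('', 'your part'):
            print(f"fill in a real Slurm partition for the '{stage}' runner")

    # The judge is an OpenAI model; an empty key fails at evaluation time.
    for judge in cfg['eval']['partitioner']['judge_models']:
        if not judge.get('key'):
            print(f"judge '{judge['abbr']}' still has an empty API key")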
diff --git a/opencompass/openicl/icl_evaluator/lm_evaluator.py b/opencompass/openicl/icl_evaluator/lm_evaluator.py
index 79de767e..bb3d502e 100644
--- a/opencompass/openicl/icl_evaluator/lm_evaluator.py
+++ b/opencompass/openicl/icl_evaluator/lm_evaluator.py
@@ -75,9 +75,11 @@ class LMEvaluator:
             keywords, ``{prediction}`` and ``{reference}``, referring to
             the prediction and optionally the reference answer.
         judge_cfg (ConfigDict): The config of language model as a judge.
+        meta_review_prompt_template (ConfigDict, optional): Prompt template for the meta judge model.
         output_path (str): The path to prediction output.
         dataset_cfg (ConfigDict, optional): The config of the dataset to be
             evaluated.
+        pack_all_predictions (bool, optional): For multiround evaluation, whether to pack all rounds into a single judging request or to judge every round separately.
         postprocessor (ConfigDict): The model prediction's postprocessor
             config.
     """
@@ -88,6 +90,7 @@ class LMEvaluator:
         judge_cfg: ConfigDict,
         output_path: str,
         meta_review_prompt_template: Optional[ConfigDict] = None,
+        pack_all_predictions: Optional[bool] = False,
         dataset_cfg: Optional[ConfigDict] = None,
         postprocessor: ConfigDict = dict(type=first_number_postprocess)
     ) -> None:
@@ -112,6 +115,7 @@ class LMEvaluator:
         self.postprocessor = get_type_from_cfg(postprocessor)
         self.logger = get_logger()
         self.dataset_cfg = dataset_cfg
+        self.pack_all_predictions = pack_all_predictions
 
     def score(self,
               predictions,
@@ -171,12 +175,17 @@ class LMEvaluator:
             elif isinstance(
                     predictions[0][0], list
             ):  #multi round for format like [[[{'round':1, 'user':'', 'assistant':''}, {'round':2, 'user':'', 'assistant':''}], [{'round':1, 'user':'', 'assistant':''}, {'round':2, 'user':'', 'assistant':''}]]]
-                for i in range(len(predictions)):
-                    multiround_predictions = extract_dicts(predictions[i])
-                    for j in range(len(multiround_predictions)):
-                        key = 'prediction' if i == 0 else f'prediction{i}'
-                        key += '_r' + str(j + 1)
-                        pred_dict[key] = multiround_predictions[j]
+                if self.pack_all_predictions:
+                    for i in range(len(predictions)):
+                        key = 'prediction' if i == 0 else f'prediction{i + 1}'
+                        pred_dict[key] = predictions[i]
+                else:
+                    for i in range(len(predictions)):
+                        multiround_predictions = extract_dicts(predictions[i])
+                        for j in range(len(multiround_predictions)):
+                            key = 'prediction' if i == 0 else f'prediction{i}'
+                            key += '_r' + str(j + 1)
+                            pred_dict[key] = multiround_predictions[j]
                 if judgements:
                     raise NotImplementedError(
                         'Not applied meta-review judge on multi-round dataset')
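Note on the evaluator change: what `pack_all_predictions` toggles is only the
shape of the placeholder dict handed to the judge's prompt template. The sketch
below is illustrative and self-contained, not part of the patch: `split_rounds`
is a hypothetical stand-in for the `extract_dicts` helper that lm_evaluator.py
imports, and the per-sample dimension (each model really carries a list of
dialogues) is collapsed to a single dialogue. Note also the naming asymmetry
carried over from the original code: the packed branch labels a second model
`prediction2`, while the per-round branch labels it `prediction1_r*`.

    # Illustration only: the two key layouts a judge template can reference.
    from typing import Dict, List

    def split_rounds(dialogue: List[dict]) -> List[str]:
        # Hypothetical stand-in for extract_dicts: one string per round.
        return [f"{turn['user']} -> {turn['assistant']}" for turn in dialogue]

    def build_pred_dict(predictions: List[List[dict]],
                        pack_all_predictions: bool) -> Dict[str, object]:
        pred_dict: Dict[str, object] = {}
        if pack_all_predictions:
            # One placeholder per model, holding the whole multi-round dialogue.
            for i, dialogue in enumerate(predictions):
                key = 'prediction' if i == 0 else f'prediction{i + 1}'
                pred_dict[key] = dialogue
        else:
            # One placeholder per model and per round: prediction_r1, _r2, ...
            for i, dialogue in enumerate(predictions):
                for j, round_text in enumerate(split_rounds(dialogue)):
                    key = 'prediction' if i == 0 else f'prediction{i}'
                    pred_dict[f'{key}_r{j + 1}'] = round_text
        return pred_dict

    dialogue = [{'round': 1, 'user': 'hi', 'assistant': 'hello'},
                {'round': 2, 'user': 'sum 1+1', 'assistant': '2'}]
    print(build_pred_dict([dialogue], True))   # {'prediction': [...whole dialogue...]}
    print(build_pred_dict([dialogue], False))  # {'prediction_r1': ..., 'prediction_r2': ...}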
diff --git a/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py b/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py
index 544aaf85..681afc35 100644
--- a/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py
+++ b/opencompass/openicl/icl_inferencer/icl_chat_inferencer.py
@@ -172,8 +172,6 @@ class ChatInferencer(BaseInferencer):
                  output_json_filepath: Optional[str] = './icl_inference_output',
                  output_json_filename: Optional[str] = 'predictions',
                  save_every: Optional[int] = 1,
-                 temperature: Optional[float] = 0.0,
-                 do_sample: Optional[bool] = False,
                  infer_mode: str = 'last',
                  max_out_len: int = 512,
                  **kwargs) -> None:
@@ -185,8 +183,6 @@ class ChatInferencer(BaseInferencer):
         )
         assert infer_mode in ['last', 'every', 'every_with_gt']
         self.infer_mode = infer_mode
-        self.temperature = temperature
-        self.do_sample = do_sample
         self.model: BaseModel
         self._set_meta_template(self.model)
 
@@ -353,16 +349,8 @@ class ChatInferencer(BaseInferencer):
 
         for i in assistant_indices:
             history = chat[:i]
-            if self.do_sample:
-                output = self.model.generate_from_template(
-                    [history],
-                    do_sample=self.do_sample,
-                    temperature=self.temperature,
-                    max_out_len=self.max_out_len)[0]
-            else:
-                output = self.model.generate_from_template(
-                    [history], do_sample=False,
-                    max_out_len=self.max_out_len)[0]
+            output = self.model.generate_from_template(
+                [history], max_out_len=self.max_out_len)[0]
             chat[i]['content'] = output
             if not self.dialogue_mode:
                 output_handler.save_multiround_results(
diff --git a/opencompass/summarizers/subjective/multiround.py b/opencompass/summarizers/subjective/multiround.py
index 7a12bf53..f869b417 100644
--- a/opencompass/summarizers/subjective/multiround.py
+++ b/opencompass/summarizers/subjective/multiround.py
@@ -128,7 +128,8 @@ class MultiroundSummarizer:
         self.eval_model_abbrs = [
             model_abbr_from_cfg(model) for model in self.eval_model_cfgs
         ]
-        self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_model'])
+        self.judge_abbr = model_abbr_from_cfg(
+            self.cfg['eval']['partitioner']['judge_models'][0])
 
     def summarize(self,
                   time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
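Note on the ChatInferencer change: with the per-inferencer `do_sample` and
`temperature` arguments removed, decoding behaviour is now controlled entirely
by the model config, which is why the example config above sets
`generation_kwargs=dict(do_sample=True)`. A sketch of the equivalent knob (the
temperature value is illustrative, not from the patch):

    # Sampling options now live on the model config, not on ChatInferencer.
    generation_kwargs = dict(
        do_sample=True,   # replaces ChatInferencer(do_sample=True)
        temperature=0.7,  # replaces ChatInferencer(temperature=...); value illustrative
    )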
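Note on the summarizer change: MultiroundSummarizer used to read a single
`cfg['judge_model']`, which no longer exists now that the subjective
partitioners take a `judge_models` list, so the fix reads the first entry. A
defensive version of the same lookup, as a sketch assuming the
`model_abbr_from_cfg` helper the summarizer already imports:

    from opencompass.utils import model_abbr_from_cfg

    def first_judge_abbr(cfg) -> str:
        judges = cfg['eval']['partitioner'].get('judge_models', [])
        if not judges:
            raise KeyError('expected eval.partitioner.judge_models in the config; '
                           "the old single 'judge_model' field is no longer read")
        # Only the first judge is summarized, matching the patch.
        return model_abbr_from_cfg(judges[0])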