From 65fad8e2ac7fae458de8c06fa19458bfd8b61efe Mon Sep 17 00:00:00 2001 From: klein Date: Fri, 26 Jul 2024 11:19:04 +0800 Subject: [PATCH] [Fix] minor update wildbench (#1335) * update crb * update crbbench * update crbbench * update crbbench * minor update wildbench * [Fix] Update doc of wildbench, and merge wildbench into subjective * [Fix] Update doc of wildbench, and merge wildbench into subjective, fix crbbench * Update crb.md * Update crb_pair_judge.py * Update crb_single_judge.py * Update subjective_evaluation.md * Update openai_api.py * [Update] update wildbench readme * [Update] update wildbench readme * [Update] update wildbench readme, remove crb * Delete configs/eval_subjective_wildbench_pair.py * Delete configs/eval_subjective_wildbench_single.py * Update __init__.py --------- Co-authored-by: bittersweet1999 <148421775+bittersweet1999@users.noreply.github.com> --- .../subjective/wildbench/wildbench.md | 10 +- .../wildbench/wildbench_pair_judge.py | 27 ++- .../wildbench/wildbench_single_judge.py | 6 +- configs/eval_subjective.py | 5 +- configs/eval_subjective_wildbench_pair.py | 180 ------------------ configs/eval_subjective_wildbench_single.py | 135 ------------- .../advanced_guides/subjective_evaluation.md | 1 + .../advanced_guides/subjective_evaluation.md | 1 + opencompass/datasets/subjective/wildbench.py | 8 +- 9 files changed, 43 insertions(+), 330 deletions(-) delete mode 100644 configs/eval_subjective_wildbench_pair.py delete mode 100644 configs/eval_subjective_wildbench_single.py diff --git a/configs/datasets/subjective/wildbench/wildbench.md b/configs/datasets/subjective/wildbench/wildbench.md index e4567ba1..c6101233 100644 --- a/configs/datasets/subjective/wildbench/wildbench.md +++ b/configs/datasets/subjective/wildbench/wildbench.md @@ -19,12 +19,16 @@ wildbench The wildbench.jsonl file is the preprocessed dataset, and the other three are the references used for scoring. -Once you download the dataset, you have to modify the path defined in `configs/datasets/subjective/wildbench/wildbench_pair_judge.py` and `configs/datasets/subjective/wildbench/wildbench_single_judge.py` +Once you have downloaded the dataset, modify the data path defined in `configs/datasets/subjective/wildbench/wildbench_pair_judge.py` and `configs/datasets/subjective/wildbench/wildbench_single_judge.py`. + +Note that you also have to modify the `given_pred` paths in lines 57-61 of `configs/datasets/subjective/wildbench/wildbench_pair_judge.py`. ## Run Wildbench is evaluated through `configs/eval_subjective.py`: import `wildbench_datasets` from `configs/datasets/subjective/wildbench/wildbench_pair_judge.py` (or `wildbench_single_datasets` from `wildbench_single_judge.py`) and add it to the `datasets` list. -Please modify the path for `give_pred` (line 171) in `configs/eval_subjective_wildbench_pair.py` to your path. - Note that if you evaluate wildbench with other models, please set `max_out_len` to 4096. + +## Acknowledgement + +We greatly appreciate the authors of the [wildbench dataset](https://github.com/allenai/WildBench). If you find it useful in your research, please consider citing them. 
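For reference, here is a minimal sketch (not part of the patch itself) of the user-editable fields in `configs/datasets/subjective/wildbench/wildbench_pair_judge.py` that the readme above asks you to adjust. The paths shown are the defaults from this patch; point them at your local copy of the data. The fields are shown as plain variables for readability, whereas in the config `given_pred` is a key of the dataset dict.

```python
# Sketch of the fields to edit in wildbench_pair_judge.py (paths are the
# defaults shipped with this patch; replace them with your local locations).
data_path = './data/WildBench/wildbench.jsonl'  # preprocessed dataset

# Pre-computed reference predictions for the baseline models, so the baselines
# do not need to be re-inferred during the pairwise evaluation.
given_pred = [
    {'abbr': 'gpt4-turbo', 'path': './data/WildBench/gpt4'},
    {'abbr': 'llama-2-70b-chat-hf', 'path': './data/WildBench/llama2-70b'},
    {'abbr': 'HaiKu', 'path': './data/WildBench/claude'},
]
```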
diff --git a/configs/datasets/subjective/wildbench/wildbench_pair_judge.py b/configs/datasets/subjective/wildbench/wildbench_pair_judge.py index e0a34c70..5037ae45 100644 --- a/configs/datasets/subjective/wildbench/wildbench_pair_judge.py +++ b/configs/datasets/subjective/wildbench/wildbench_pair_judge.py @@ -13,7 +13,7 @@ subjective_reader_cfg = dict( data_path ='./data/WildBench/wildbench.jsonl' -subjective_datasets = [] +wildbench_datasets = [] subjective_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, @@ -33,14 +33,33 @@ subjective_eval_cfg = dict( ), pred_role='BOT', ) +gpt4 = dict( + abbr='gpt4-turbo', +) -subjective_datasets.append( +claude = dict( + abbr='HaiKu', +) + +llama_2_70b = dict( + abbr='llama-2-70b-chat-hf', +) + +wildbench_datasets.append( dict( abbr='wildbench', type=WildBenchDataset, path=data_path, - mode='pair', + eval_mode='pair', reader_cfg=subjective_reader_cfg, infer_cfg=subjective_infer_cfg, - eval_cfg=subjective_eval_cfg + eval_cfg=subjective_eval_cfg, + given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/WildBench/gpt4'}, + {'abbr': 'llama-2-70b-chat-hf', 'path':'./data/WildBench/llama2-70b'}, + {'abbr': 'HaiKu', 'path':'./data/WildBench/claude'}, + {'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/WildBench/llama2-70b'}, + {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/WildBench/llama2-70b'}], + mode='m2n', # m models are compared against n models + infer_order='random', + base_models = [llama_2_70b, gpt4, claude] )) diff --git a/configs/datasets/subjective/wildbench/wildbench_single_judge.py b/configs/datasets/subjective/wildbench/wildbench_single_judge.py index be11abcb..7c0df0de 100644 --- a/configs/datasets/subjective/wildbench/wildbench_single_judge.py +++ b/configs/datasets/subjective/wildbench/wildbench_single_judge.py @@ -12,7 +12,7 @@ subjective_reader_cfg = dict( data_path ='./data/WildBench/wildbench.jsonl' -subjective_datasets = [] +wildbench_single_datasets = [] # the question is a list, how to process it subjective_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, @@ -35,12 +35,12 @@ subjective_eval_cfg = dict( pred_role='BOT', ) -subjective_datasets.append( +wildbench_single_datasets.append( dict( abbr='wildbench', type=WildBenchDataset, path=data_path, - mode='single', + eval_mode='single', reader_cfg=subjective_reader_cfg, infer_cfg=subjective_infer_cfg, eval_cfg=subjective_eval_cfg diff --git a/configs/eval_subjective.py b/configs/eval_subjective.py index 523374d7..13d1228d 100644 --- a/configs/eval_subjective.py +++ b/configs/eval_subjective.py @@ -7,6 +7,7 @@ with read_base(): from .datasets.subjective.arena_hard.arena_hard_compare import arenahard_datasets from .datasets.subjective.compassbench.compassbench_compare import compassbench_datasets from .datasets.subjective.fofo.fofo_judge import fofo_datasets + from .datasets.subjective.wildbench.wildbench_pair_judge import wildbench_datasets from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import mtbench_datasets from .datasets.subjective.multiround.mtbench101_judge import mtbench101_datasets from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI @@ -55,7 +56,9 @@ models = [ ) ] -datasets = [*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets, *compassarena_datasets, *compassbench_datasets, *fofo_datasets, *mtbench_datasets, *mtbench101_datasets] # add datasets you want + +datasets = [*alignbench_datasets, *alpacav2_datasets, *arenahard_datasets, *compassarena_datasets, *compassbench_datasets, *fofo_datasets, *mtbench_datasets, *mtbench101_datasets, 
*wildbench_datasets] # add datasets you want + infer = dict( partitioner=dict(type=NaivePartitioner), diff --git a/configs/eval_subjective_wildbench_pair.py b/configs/eval_subjective_wildbench_pair.py deleted file mode 100644 index 652793cf..00000000 --- a/configs/eval_subjective_wildbench_pair.py +++ /dev/null @@ -1,180 +0,0 @@ -from mmengine.config import read_base - -with read_base(): - # from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets - from .datasets.subjective.wildbench.wildbench_pair_judge import subjective_datasets - from .models.openai.gpt_4 import models as gpt4_models - from .models.hf_llama.hf_llama2_70b_chat import models as llama2_models - # from .models.gemma.hf_gemma_2b_it import models - # from .models.hf_llama.hf_llama3_70b_instruct import models as llama3_model - # # from .models.hf_internlm.hf_internlm2_chat_7b import models - # from .models.yi.hf_yi_1_5_34b_chat import models as yi_model - # from .models.qwen.hf_qwen1_5_72b_chat import models as qwen_model - -from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI -from opencompass.partitioners import NaivePartitioner, SizePartitioner -from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner -from opencompass.partitioners.sub_size import SubjectiveSizePartitioner -from opencompass.runners import LocalRunner -from opencompass.runners import SlurmSequentialRunner -from opencompass.tasks import OpenICLInferTask -from opencompass.tasks.subjective_eval import SubjectiveEvalTask -from opencompass.summarizers import WildBenchPairSummarizer -from opencompass.models.claude_api.claude_api import Claude -from opencompass.models import HuggingFacewithChatTemplate - - -models = sum([v for k, v in locals().items() if k.endswith('_model')], []) - -api_meta_template = dict( - round=[ - dict(role='SYSTEM', api_role='SYSTEM'), - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), - ] -) - -# _meta_template = dict( -# round=[ -# dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), -# dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True), -# ], -# ) -# -------------Inference Stage ---------------------------------------- -# For subjective evaluation, we often set do sample for models - -models = [ - dict( - type=HuggingFacewithChatTemplate, - abbr='llama-3-8b-instruct-hf', - path='meta-llama/Meta-Llama-3-8B-Instruct', - max_out_len=4096, - batch_size=8, - run_cfg=dict(num_gpus=1), - stop_words=['<|end_of_text|>', '<|eot_id|>'], - ), - dict( - type=HuggingFacewithChatTemplate, - abbr='yi-1.5-6b-chat-hf', - path='01-ai/Yi-1.5-6B-Chat', - max_out_len=4096, - batch_size=8, - run_cfg=dict(num_gpus=1), - ), - dict( - type=HuggingFacewithChatTemplate, - abbr='qwen1.5-7b-chat-hf', - path='Qwen/Qwen1.5-7B-Chat', - max_out_len=4096, - batch_size=8, - run_cfg=dict(num_gpus=1), - ), - # dict( - # type=HuggingFacewithChatTemplate, - # abbr='llama-3-70b-instruct-hf', - # path='meta-llama/Meta-Llama-3-70B-Instruct', - # max_out_len=4096, - # batch_size=8, - # run_cfg=dict(num_gpus=4), - # stop_words=['<|end_of_text|>', '<|eot_id|>'], - # ), - # dict( - # type=HuggingFacewithChatTemplate, - # abbr='yi-1.5-34b-chat-hf', - # path='01-ai/Yi-1.5-34B-Chat', - # max_out_len=4096, - # batch_size=8, - # run_cfg=dict(num_gpus=2), - # ), - # dict( - # type=HuggingFacewithChatTemplate, - # abbr='qwen1.5-72b-chat-hf', - # path='Qwen/Qwen1.5-72B-Chat', - # max_out_len=4096, - # 
batch_size=8, - # run_cfg=dict(num_gpus=8), - # ) -] - -datasets = [*subjective_datasets] - -# -------------Evalation Stage ---------------------------------------- - -## ------------- JudgeLLM Configuration -judge_models = [dict( - abbr='GPT4-Turbo', - type=OpenAI, - path='gpt-4-0613', # To compare with the official leaderboard, please use gpt4-0613 - key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - meta_template=api_meta_template, - query_per_second=16, - max_out_len=2048, - max_seq_len=2048, - batch_size=8, - temperature=0, -)] - -gpt4 = dict( - abbr='gpt4-turbo', - type=OpenAI, - path='gpt-4-0409-preview', - key='', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - meta_template=api_meta_template, - query_per_second=1, - max_out_len=2048, - max_seq_len=4096, - batch_size=4, - retry=20, - temperature=1, -) # Re-inference gpt4's predictions or you can choose to use the pre-commited gpt4's predictions - -claude = dict(abbr='HaiKu', - type=Claude, - path='claude-2', - key='YOUR_CLAUDE_KEY', - query_per_second=1, - max_out_len=2048, max_seq_len=2048, batch_size=2, - ) -## single evaluation -# eval = dict( -# partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models, judge_models=judge_models), -# runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)), -# ) -infer = dict( - partitioner=dict(type=SizePartitioner, max_task_size=1000, strategy='split'), - runner=dict( - type=SlurmSequentialRunner, - max_num_workers=64, - quotatype='reserved', - partition='llmeval', - task=dict(type=OpenICLInferTask)), -) - -eval = dict( - partitioner=dict( - type=SubjectiveNaivePartitioner, - mode='m2n', # m个模型 与 n个模型进行对战 - infer_order='random', - # 在m2n模式下,需要指定base_models和compare_models,将会对base_models和compare_models生成对应的两两pair(去重且不会与自身进行比较) - base_models = [*llama2_models, gpt4, claude], # 用于对比的基线模型 - compare_models = models, # 待评测模型 - judge_models=judge_models - ), - runner=dict( - type=LocalRunner, - # partition='llmeval', - # quotatype='auto', - max_num_workers=3, - task=dict( - type=SubjectiveEvalTask - )), - given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/WildBench/gpt4'}, - {'abbr': 'llama-2-70b-chat-hf', 'path':'./data/WildBench/llama2-70b'}, - {'abbr': 'HaiKu', 'path':'./data/WildBench/claude'}, - {'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/WildBench/llama2-70b'}, - {'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/WildBench/llama2-70b'}] -) - -summarizer = dict(type=WildBenchPairSummarizer) - -work_dir = 'outputs/wildbench/' diff --git a/configs/eval_subjective_wildbench_single.py b/configs/eval_subjective_wildbench_single.py deleted file mode 100644 index 5e053488..00000000 --- a/configs/eval_subjective_wildbench_single.py +++ /dev/null @@ -1,135 +0,0 @@ -from mmengine.config import read_base - -with read_base(): - # from .datasets.subjective.multiround.mtbench_single_judge_diff_temp import subjective_datasets - from .datasets.subjective.wildbench.wildbench_single_judge import subjective_datasets - # from .models.gemma.hf_gemma_2b_it import models as gemma_2b_models - # from .models.hf_llama.hf_llama3_70b_instruct import models as llama3_model - # # from .models.hf_internlm.hf_internlm2_chat_7b import models - # from .models.yi.hf_yi_1_5_34b_chat import models as yi_model - # from .models.qwen.hf_qwen1_5_72b_chat import models as qwen_model - -from opencompass.models import 
HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI -from opencompass.partitioners import NaivePartitioner, SizePartitioner -from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner -from opencompass.partitioners.sub_size import SubjectiveSizePartitioner -from opencompass.runners import LocalRunner -from opencompass.runners import SlurmSequentialRunner -from opencompass.tasks import OpenICLInferTask -from opencompass.tasks.subjective_eval import SubjectiveEvalTask -from opencompass.summarizers import WildBenchSingleSummarizer -from opencompass.models import HuggingFacewithChatTemplate - - -# models = sum([v for k, v in locals().items() if k.endswith("_model")], []) - -api_meta_template = dict( - round=[ - dict(role='SYSTEM', api_role='SYSTEM'), - dict(role='HUMAN', api_role='HUMAN'), - dict(role='BOT', api_role='BOT', generate=True), - ] -) - -# _meta_template = dict( -# round=[ -# dict(role='HUMAN', begin='\n<|im_start|>user\n', end='<|im_end|>'), -# dict(role='BOT', begin='\n<|im_start|>assistant\n', end='<|im_end|>', generate=True), -# ], -# ) -# -------------Inference Stage ---------------------------------------- -# For subjective evaluation, we often set do sample for models -# set max_out_len to 4096. -models = [ - dict( - type=HuggingFacewithChatTemplate, - abbr='llama-3-8b-instruct-hf', - path='meta-llama/Meta-Llama-3-8B-Instruct', - max_out_len=4096, - batch_size=8, - run_cfg=dict(num_gpus=1), - stop_words=['<|end_of_text|>', '<|eot_id|>'], - ), - dict( - type=HuggingFacewithChatTemplate, - abbr='yi-1.5-6b-chat-hf', - path='01-ai/Yi-1.5-6B-Chat', - max_out_len=4096, - batch_size=8, - run_cfg=dict(num_gpus=1), - ), - dict( - type=HuggingFacewithChatTemplate, - abbr='qwen1.5-7b-chat-hf', - path='Qwen/Qwen1.5-7B-Chat', - max_out_len=4096, - batch_size=8, - run_cfg=dict(num_gpus=1), - ), - # dict( - # type=HuggingFacewithChatTemplate, - # abbr='llama-3-70b-instruct-hf', - # path='meta-llama/Meta-Llama-3-70B-Instruct', - # max_out_len=4096, - # batch_size=8, - # run_cfg=dict(num_gpus=4), - # stop_words=['<|end_of_text|>', '<|eot_id|>'], - # ), - # dict( - # type=HuggingFacewithChatTemplate, - # abbr='yi-1.5-34b-chat-hf', - # path='01-ai/Yi-1.5-34B-Chat', - # max_out_len=4096, - # batch_size=8, - # run_cfg=dict(num_gpus=2), - # ), - # dict( - # type=HuggingFacewithChatTemplate, - # abbr='qwen1.5-72b-chat-hf', - # path='Qwen/Qwen1.5-72B-Chat', - # max_out_len=4096, - # batch_size=8, - # run_cfg=dict(num_gpus=4), - # ) -] - -datasets = [*subjective_datasets] - -# -------------Evalation Stage ---------------------------------------- - -## ------------- JudgeLLM Configuration -judge_models = [dict( - abbr='GPT4-Turbo', - type=OpenAI, - path='gpt-4-0613', # To compare with the official leaderboard, please use gpt4-0613 - key='xxxx', # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well - meta_template=api_meta_template, - query_per_second=16, - max_out_len=2048, - max_seq_len=2048, - batch_size=8, - temperature=0, -)] - - -infer = dict( - partitioner=dict(type=SizePartitioner, max_task_size=1000, strategy='split'), - runner=dict( - type=SlurmSequentialRunner, - max_num_workers=64, - quotatype='reserved', - partition='llmeval', - task=dict(type=OpenICLInferTask)), -) - -## single evaluation -eval = dict( - partitioner=dict(type=SubjectiveSizePartitioner, strategy='split', max_task_size=10000, mode='singlescore', models=models, judge_models=judge_models), - runner=dict(type=LocalRunner, - max_num_workers=2, - 
task=dict(type=SubjectiveEvalTask)), -) - -summarizer = dict(type=WildBenchSingleSummarizer) - -work_dir = 'outputs/wildbench/' diff --git a/docs/en/advanced_guides/subjective_evaluation.md b/docs/en/advanced_guides/subjective_evaluation.md index 50a41114..24a55921 100644 --- a/docs/en/advanced_guides/subjective_evaluation.md +++ b/docs/en/advanced_guides/subjective_evaluation.md @@ -21,6 +21,7 @@ We support the use of GPT-4 (or other JudgeLLM) for the subjective evaluation of 4. AlpacaEvalv2 English Compare Dataset (https://github.com/tatsu-lab/alpaca_eval) 5. ArenaHard English Compare Dataset, mainly focused on coding (https://github.com/lm-sys/arena-hard/tree/main) 6. Fofo English Scoring Dataset (https://github.com/SalesforceAIResearch/FoFo/) +7. Wildbench English Score and Compare Dataset (https://github.com/allenai/WildBench) ## Initiating Subjective Evaluation diff --git a/docs/zh_cn/advanced_guides/subjective_evaluation.md b/docs/zh_cn/advanced_guides/subjective_evaluation.md index d5b40c06..2dfb509e 100644 --- a/docs/zh_cn/advanced_guides/subjective_evaluation.md +++ b/docs/zh_cn/advanced_guides/subjective_evaluation.md @@ -21,6 +21,7 @@ 4. AlpacaEvalv2 英文Compare数据集(https://github.com/tatsu-lab/alpaca_eval) 5. ArenaHard 英文Compare数据集,主要面向coding(https://github.com/lm-sys/arena-hard/tree/main) 6. Fofo 英文Socring数据集(https://github.com/SalesforceAIResearch/FoFo/) +7. Wildbench 英文Score和Compare数据集(https://github.com/allenai/WildBench) ## 启动主观评测 diff --git a/opencompass/datasets/subjective/wildbench.py b/opencompass/datasets/subjective/wildbench.py index 8f0995f5..65d8ec27 100644 --- a/opencompass/datasets/subjective/wildbench.py +++ b/opencompass/datasets/subjective/wildbench.py @@ -209,7 +209,7 @@ def parse_conversation(conversation): @LOAD_DATASET.register_module() class WildBenchDataset(BaseDataset): - def load(self, path: str, K=-1, mode='pair'): + def load(self, path: str, K=-1, eval_mode='pair', *args, **kwargs): dataset = DatasetDict() raw_data = [] with open(path, 'r', encoding='utf-8') as file: @@ -222,13 +222,13 @@ class WildBenchDataset(BaseDataset): for checklist_item in item['checklist']: checklist_mardkdown += f'- {checklist_item}\n' - if mode == 'single': + if eval_mode == 'single': prompt = score_prompt - elif mode == 'pair': + elif eval_mode == 'pair': prompt = pair_prompt else: assert NotImplementedError( - f'Mode {mode} not in single or pair.') + f'Eval mode {eval_mode} not in single or pair.') prompt = prompt.replace('{history}', history) prompt = prompt.replace('{user_query}', last_query)
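As an illustration of the renamed `eval_mode` argument above, here is a minimal sketch (not the repository code) of how the mode selects the judge prompt: 'single' picks the single-model scoring prompt and 'pair' the pairwise comparison prompt. The sketch raises directly, whereas the patched loader keeps its `assert NotImplementedError(...)` fallback, which never fires because the exception instance is always truthy.

```python
# Illustrative sketch of the eval_mode dispatch introduced by this patch:
# 'single' -> scoring prompt, 'pair' -> comparison prompt, anything else fails loudly.
def select_prompt(eval_mode: str, score_prompt: str, pair_prompt: str) -> str:
    if eval_mode == 'single':
        return score_prompt   # single-model scoring
    if eval_mode == 'pair':
        return pair_prompt    # pairwise comparison against a baseline
    raise NotImplementedError(f'Eval mode {eval_mode} not in single or pair.')
```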