Merge branch 'open-compass:main' into main

Commit f65ce1796f by bittersweet1999, 2024-10-21 13:40:03 +08:00, committed by GitHub
(No known key found for this signature in database; GPG Key ID: B5690EEEBB952194)
71 changed files with 3107 additions and 61 deletions

View File

@ -84,7 +84,7 @@ internlm2_5-20b-chat-turbomind:
race-high: 91
internlm2-chat-1.8b-turbomind:
gsm8k: 30
gsm8k: 40
race-middle: 82
race-high: 83
@ -121,7 +121,7 @@ llama-3-8b-instruct-hf:
llama-3_1-8b-instruct-turbomind:
gsm8k: 79
race-middle: 82
race-high: 82
race-high: 88
llama-3-8b-instruct-turbomind:
gsm8k: 77

View File

@ -73,7 +73,7 @@ jobs:
cuda_env: [dsw_cu11, dsw_cu12]
runs-on: ${{ matrix.cuda_env }}
environment: 'prod'
timeout-minutes: 420 #7hours
timeout-minutes: 600 #10hours
steps:
- name: Clone repository
uses: actions/checkout@v2

View File

@ -59,6 +59,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2024.10.14\]** We now support the OpenAI multilingual QA dataset [MMMLU](https://huggingface.co/datasets/openai/MMMLU). Feel free to give it a try! 🔥🔥🔥
- **\[2024.09.19\]** We now support [Qwen2.5](https://huggingface.co/Qwen) (0.5B to 72B) with multiple backends (huggingface/vllm/lmdeploy). Feel free to give them a try! 🔥🔥🔥
- **\[2024.09.17\]** We now support OpenAI o1 (`o1-mini-2024-09-12` and `o1-preview-2024-09-12`). Feel free to give them a try! 🔥🔥🔥
- **\[2024.09.05\]** We now support answer extraction through model post-processing to provide a more accurate representation of the model's capabilities. As part of this update, we have integrated [XFinder](https://github.com/IAAR-Shanghai/xFinder) as our first post-processing model. For more detailed information, please refer to the [documentation](opencompass/utils/postprocessors/xfinder/README.md), and give it a try! 🔥🔥🔥

View File

@ -59,6 +59,7 @@
## 🚀 最新进展 <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
- **\[2024.10.14\]** 现已支持OpenAI多语言问答数据集[MMMLU](https://huggingface.co/datasets/openai/MMMLU),欢迎尝试! 🔥🔥🔥
- **\[2024.09.19\]** 现已支持[Qwen2.5](https://huggingface.co/Qwen)(0.5B to 72B) ,可以使用多种推理后端(huggingface/vllm/lmdeploy), 欢迎尝试! 🔥🔥🔥
- **\[2024.09.17\]** 现已支持OpenAI o1 模型(`o1-mini-2024-09-12` and `o1-preview-2024-09-12`), 欢迎尝试! 🔥🔥🔥
- **\[2024.09.05\]** OpenCompass 现在支持通过模型后处理来进行答案提取,以更准确地展示模型的能力。作为此次更新的一部分,我们集成了 [XFinder](https://github.com/IAAR-Shanghai/xFinder) 作为首个后处理模型。具体信息请参阅 [文档](opencompass/utils/postprocessors/xfinder/README.md),欢迎尝试! 🔥🔥🔥

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .compassbench_v1_3_objective_gen_068af0 import compassbench_aug_datasets # noqa: F401, F403

View File

@ -0,0 +1,74 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets.compassbench_obj import CompassBenchObjectiveV1_3, compassbench_objective_v1_3_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
prompt_cn = {
'single_choice_cn': '以下是一道单项选择题请你根据你了解的知识给出正确的答案选项。请你一步步推理并在最后用“答案选项为X”来回答其中X是ABCD中你认为正确的选项序号\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
'cloze_cn': '以下是一道填空题,请你根据你了解的知识一步步思考后把你的最终答案放到\\boxed{}中。\n下面是你要回答的题目:\n{question}\n让我们一步步解决这个问题:',
}
prompt_en = {
'single_choice_en': "Here is a single-choice question. Please give the correct answer based on your knowledge. Please reason step by step and answer with 'The answer is X' at the end, where X is the option number you think is correct.\nHere is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
'cloze_en': "Here is a fill-in-the-blank question. Please think step by step based on your knowledge and put your final answer in \\boxed{}. Here is the question you need to answer:\n{question}\nLet's solve this problem step by step:",
}
douknow_sets = {
'knowledge': ['single_choice_cn'],
'math': ['single_choice_cn'],
}
# Set up the prompts
CircularEval = True
compassbench_aug_datasets = []
for _split in list(douknow_sets.keys()):
for _name in douknow_sets[_split]:
if 'cn' in _name:
single_choice_prompts = prompt_cn
cloze_prompts = prompt_cn
else:
single_choice_prompts = prompt_en
cloze_prompts = prompt_en
douknow_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt= single_choice_prompts[_name],
),
dict(role='BOT', prompt='{answer}'),] if 'choice' in _name else cloze_prompts[_name],
),
ice_token='</E>',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
douknow_eval_cfg = dict(
evaluator=dict(type=CircularEvaluator if CircularEval else AccEvaluator) if 'single_choice' in _name else dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD' ) if 'single_choice' in _name else dict(type=compassbench_objective_v1_3_postprocess, name=_name))
compassbench_aug_datasets.append(
dict(
type=CompassBenchObjectiveV1_3,
path=f'./data/compassbench_v1_3/{_split}/{_name}.jsonl',
name='circular_' + _name if CircularEval else _name,
abbr='compassbench-' + _split + '-' + _name + 'circular'if CircularEval else '',
reader_cfg=dict(
input_columns=['question'],
output_column='answer'
),
infer_cfg=douknow_infer_cfg,
eval_cfg=douknow_eval_cfg,
))
del _split, _name

View File

@ -0,0 +1,49 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import GPQADataset, GPQAEvaluator
from opencompass.utils import first_option_postprocess
gpqa_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer')
hint = f'对下面的单项选择题,请直接给出正确答案的选项。'
question_and_options = 'Question: {question}\n(A){A}\n(B){B}\n(C){C}\n(D){D}\n'
gpqa_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template={
opt: f'{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']},
),
prompt_template=dict(
type=PromptTemplate,
template={
opt: f'{hint}\n</E>{question_and_options}\nAnswer: {opt}' for opt in ['A', 'B', 'C', 'D']
},
ice_token='</E>'
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=PPLInferencer))
gpqa_eval_cfg = dict(evaluator=dict(type=AccwithDetailsEvaluator))
gpqa_datasets = []
gpqa_subsets = {
# 'extended': 'gpqa_extended.csv',
# 'main': 'gpqa_main.csv',
'diamond': 'gpqa_diamond.csv'
}
for split in list(gpqa_subsets.keys()):
gpqa_datasets.append(
dict(
abbr='GPQA_' + split,
type=GPQADataset,
path='./data/gpqa/',
name=gpqa_subsets[split],
reader_cfg=gpqa_reader_cfg,
infer_cfg=gpqa_infer_cfg,
eval_cfg=gpqa_eval_cfg)
)

View File

@ -0,0 +1,47 @@
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MMLUProDataset, MMLUProBaseEvaluator
with read_base():
from .mmlu_pro_categories import categories
mmlu_pro_datasets = []
for category in categories:
hint = f'Answer the following multiple choice question about {category}, and give your answer option directly.'
question_and_options = 'Question:\n{question}\nOptions:\n{options_str}'
mmlu_pro_reader_cfg = dict(
input_columns=['question', 'cot_content', 'options_str'],
output_column='answer_string',
train_split='validation',
test_split='test',
)
mmlu_pro_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=f'{question_and_options}\nAnswer: {{answer}}'),
prompt_template=dict(
type=PromptTemplate,
template=f'{hint}\n</E>{question_and_options}\nAnswer: ',
ice_token='</E>'
),
retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4]),
inferencer=dict(type=GenInferencer, max_out_len=100)
)
mmlu_pro_eval_cfg = dict(
evaluator=dict(type=MMLUProBaseEvaluator)
)
mmlu_pro_datasets.append(
dict(
abbr=f'mmlu_pro_{category.replace(" ", "_")}',
type=MMLUProDataset,
path='opencompass/mmlu_pro',
category=category,
reader_cfg=mmlu_pro_reader_cfg,
infer_cfg=mmlu_pro_infer_cfg,
eval_cfg=mmlu_pro_eval_cfg,
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
from opencompass.summarizers import AlignmentBenchSummarizer
from opencompass.datasets import AlignmentBenchDataset
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
@ -46,7 +47,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
@ -63,4 +63,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
))

View File

@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
output_column='judge',
)
subjective_all_sets = [
'alignment_bench',
]
data_path ='data/subjective/alignment_bench'
alignment_bench_config_path = 'data/subjective/alignment_bench/config'
alignment_bench_config_name = 'multi-dimension'
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
path=data_path,
name=_name,
alignment_bench_config_path=alignment_bench_config_path,
alignment_bench_config_name=alignment_bench_config_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
from opencompass.summarizers import AlignmentBenchSummarizer
from opencompass.datasets import AlignmentBenchDataset
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
@ -46,7 +47,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
@ -63,4 +63,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
))

View File

@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
output_column='judge',
)
subjective_all_sets = [
'alignment_bench_v1_1', # Changed to Alignbench_v1_1 since 06/15/2024, refer to https://github.com/THUDM/AlignBench
]
data_path ='data/subjective/alignment_bench'
alignment_bench_config_path = 'data/subjective/alignment_bench/config'
alignment_bench_config_name = 'multi-dimension'
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
path=data_path,
name=_name,
alignment_bench_config_path=alignment_bench_config_path,
alignment_bench_config_name=alignment_bench_config_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlpacaEvalDataset, alpacaeval_postprocess
from opencompass.datasets import SubjectiveCmpDataset
from opencompass.summarizers import AlpacaSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -94,7 +95,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alpacaeval_postprocess),
),
pred_role='BOT',
)
@ -102,7 +102,7 @@ for _name in subjective_all_sets:
alpacav2_datasets.append(
dict(
abbr=f'{_name}',
type=AlpacaEvalDataset,
type=SubjectiveCmpDataset,
path='./data/subjective/alpaca_eval',
name=_name,
reader_cfg=subjective_reader_cfg,
@ -111,5 +111,6 @@ for _name in subjective_all_sets:
mode='m2n',
infer_order='random',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}],
summarizer=dict(type=AlpacaSummarizer, judge_type='v2'),
))

View File

@ -0,0 +1,115 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlpacaEvalDataset, alpacaeval_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'alpaca_eval',
]
alpacav2_datasets = []
gpt4_prompt = """
I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
## Instruction
{
"instruction": "{question}",
}
## Model Outputs
Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
{
{
"model_identifier": "m",
"output": "{prediction}"
},
{
"model_identifier": "M",
"output": "{prediction2}"
}
}
## Task
Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.
## Best Model Identifier
"""
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers.')
],
round=[
dict(
role='HUMAN',
prompt = gpt4_prompt
),
]),
),
dict_postprocessor=dict(type=alpacaeval_postprocess),
),
pred_role='BOT',
)
alpacav2_datasets.append(
dict(
abbr=f'{_name}',
type=AlpacaEvalDataset,
path='./data/subjective/alpaca_eval',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='random',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import ArenaHardDataset, arenahard_postprocess
from opencompass.datasets import ArenaHardDataset
from opencompass.summarizers import ArenaHardSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -59,7 +60,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=arenahard_postprocess),
),
pred_role='BOT',
)
@ -76,5 +76,6 @@ for _name in subjective_all_sets:
mode='m2n',
infer_order='double',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}]
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}],
summarizer = dict(type=ArenaHardSummarizer),
))

View File

@ -0,0 +1,80 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import ArenaHardDataset, arenahard_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'arenahard',
]
arenahard_datasets = []
system_prompt = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
judge_prompt = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{prediction}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{prediction2}\n<|The End of Assistant B's Answer|>"
gpt4 = [dict(
abbr='gpt4-0314',
)]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt=system_prompt)
],
round=[
dict(
role='HUMAN',
prompt = judge_prompt
),
]),
),
dict_postprocessor=dict(type=arenahard_postprocess),
),
pred_role='BOT',
)
arenahard_datasets.append(
dict(
abbr='arenahard',
type=ArenaHardDataset,
path='./data/subjective/arena_hard',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}]
))

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset, compassarena_postprocess
from opencompass.datasets import CompassArenaDataset
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
@ -15,23 +15,29 @@ data_path ='data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答12平局
并提供你的解释原因
如果你认为回答1更好你的输出应形如
选择A
原因blahblah blahblah\n
如果你认为回答2更好你的输出应形如
选择B
原因blahblah blahblah\n
如果你认为回答12打成平手你的输出应形如
选择C
原因blahblah blahblah\n
@ -43,8 +49,10 @@ knowledge_prompt = """
1. 更好的回答能与参考答案吻合或表明参考答案的意思
2. 在都准确答对问题的前提下更好的回答能对知识点进行额外补充且补充的知识准确无误
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -56,8 +64,10 @@ language_prompt = """
1. 在有明确的参考答案的情况下越贴近参考答案或表明了参考答案的意思的回答越好
2. 更好的回答在语言表达上更流畅更加符合与人类对话的习惯包括语气情调等
3. 在都准确答对问题的前提下更好的回答能进行额外补充且补充的内容准确无误
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -69,8 +79,10 @@ math_prompt = """
1. 更好的回答的答案能和参考答案一致
2. 若两个回答的答案都与参考答案不一致则更好的回答的推理过程应更加合理
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -83,6 +95,7 @@ creation_prompt = """
1. 好的回答必须首先符合用户问题里的各种需求不能跑题
2. 好的回答必须具有逻辑连贯性围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
""" + base_prompt
@ -120,7 +133,6 @@ for _name, _prompt in sub_map.items():
),
]),
),
dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True),
),
pred_role='BOT',
)
@ -137,6 +149,6 @@ for _name, _prompt in sub_map.items():
mode='m2n',
infer_order='double',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))

View File

@ -0,0 +1,142 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset, compassarena_postprocess
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'ref'],
output_column='judge',
)
data_path ='data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答12平局
并提供你的解释原因
如果你认为回答1更好你的输出应形如
选择A
原因blahblah blahblah\n
如果你认为回答2更好你的输出应形如
选择B
原因blahblah blahblah\n
如果你认为回答12打成平手你的输出应形如
选择C
原因blahblah blahblah\n
"""
knowledge_prompt = """
请根据提供的 评分要求用户问题参考答案 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 更好的回答能与参考答案吻合或表明参考答案的意思
2. 在都准确答对问题的前提下更好的回答能对知识点进行额外补充且补充的知识准确无误
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
language_prompt = """
请根据提供的 评分要求用户问题 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 在有明确的参考答案的情况下越贴近参考答案或表明了参考答案的意思的回答越好
2. 更好的回答在语言表达上更流畅更加符合与人类对话的习惯包括语气情调等
3. 在都准确答对问题的前提下更好的回答能进行额外补充且补充的内容准确无误
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
math_prompt = """
请根据提供的 评分要求用户问题参考答案 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 更好的回答的答案能和参考答案一致
2. 若两个回答的答案都与参考答案不一致则更好的回答的推理过程应更加合理
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
reason_prompt = math_prompt
creation_prompt = """
请根据提供的 评分要求用户问题 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 好的回答必须首先符合用户问题里的各种需求不能跑题
2. 好的回答必须具有逻辑连贯性围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
""" + base_prompt
sub_map = {'language': language_prompt, 'knowledge': knowledge_prompt, 'reason_v2': reason_prompt, 'math_v2': math_prompt, 'creationv2_zh': creation_prompt}
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name, _prompt in sub_map.items():
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = _prompt
),
]),
),
dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True),
),
pred_role='BOT',
)
compassarena_datasets.append(
dict(
abbr=f'compassarena_{_name}',
type=CompassArenaDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -112,7 +113,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
@ -127,4 +127,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=FofoSummarizer, judge_type='general')
))

View File

@ -0,0 +1,130 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'fofo_test_prompts', 'fofo_test_prompts_cn',
]
base_prompt_en = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
base_prompt_cn = """
我希望你创建一个排行榜用于评估来自各种大型语言模型的回答格式的正确性为了完成这个任务你将需要分析给模型的文本提示以及它们对应的回答具体来说请确保你的评估输出正确地格式化为JSON字符串我将为此提供提示和回答
以下是提示内容
{
"instruction": "{question}",
}
以下是模型的输出结果
[
{
"model": "model",
"answer": "{prediction}"
},
]
请通过检查模型回答是否符合提示中声明的格式规范来评估模型回答的格式进行彻底的格式检查并提供格式正确或错误的详细解释你的反馈应包括模型的名称接着是格式正确性的状态'1'表示正确'0'表示错误将你的推理以每个评估模型的单个字符串中的 bullet 点形式呈现换句话说你应该生成以下输出
```json
[
{
'model': <模型名称>,
'format_correctness': <正确性>,
'reasons': <格式正确性的原因>
}
]
```
请注意你的回答应是一个正确格式化的JSON字符串不应包含任何额外的内容我们将在Python中直接将其作为JSON字符串加载
"""
fofo_datasets = []
for _name in subjective_all_sets:
if '_cn' in _name:
base_prompt = base_prompt_cn
else:
base_prompt = base_prompt_en
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = base_prompt
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
fofo_datasets.append(
dict(
abbr=f'{_name}',
type=FofoDataset,
path='./data/subjective/fofo',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -80,7 +81,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
@ -95,4 +95,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=FofoSummarizer, judge_type='general')
))

View File

@ -0,0 +1,98 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'fofo_test_prompts'
]
base_prompt = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
fofo_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = base_prompt
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
fofo_datasets.append(
dict(
abbr=f'{_name}',
type=FofoDataset,
path='./data/subjective/fofo',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FollowBenchDataset, followbench_postprocess
from opencompass.datasets import FollowBenchDataset
from opencompass.summarizers import FollowBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['instruction', 'judge_prompt',],
@ -43,7 +44,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=followbench_postprocess),
),
pred_role='BOT',
)
@ -59,4 +59,5 @@ for _name in subjective_all_sets:
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
summarizer = dict(type=FollowBenchSummarizer,)
))

View File

@ -0,0 +1,62 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FollowBenchDataset, followbench_postprocess
subjective_reader_cfg = dict(
input_columns=['instruction', 'judge_prompt',],
output_column='judge',
)
subjective_all_sets = [
'followbench_llmeval_cn', 'followbench_llmeval_en',
]
data_path ='data/subjective/followbench/converted_data'
followbench_llmeval_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{instruction}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{judge_prompt}'
),
]),
),
dict_postprocessor=dict(type=followbench_postprocess),
),
pred_role='BOT',
)
followbench_llmeval_datasets.append(
dict(
abbr=f'{_name}',
type=FollowBenchDataset,
path=data_path,
name=_name,
mode='singlescore',
cate='llm',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
))

View File

@ -0,0 +1,51 @@
# Guideline for evaluating HelloBench on Diverse LLMs
HelloBench is a comprehensive, in-the-wild, and open-ended benchmark for evaluating LLMs' performance in long text generation. More details can be found in the [🌐GitHub Repo](https://github.com/Quehry/HelloBench) and the [📖Paper](https://arxiv.org/abs/2409.16191).
## Detailed instructions for evaluating HelloBench in OpenCompass
1. Clone OpenCompass
```shell
cd ~
git clone git@github.com:open-compass/opencompass.git
cd opencompass
```
2. Download the HelloBench data from this [Google Drive URL](https://drive.google.com/file/d/1EJTmMFgCs2pDy9l0wB5idvp3XzjYEsi9/view?usp=sharing), unzip it, and put it under OPENCOMPASS_PATH/data/HelloBench so that you end up with a layout like this:
```
~/opencompass/data/
└── HelloBench
├── chat.jsonl
├── heuristic_text_generation.jsonl
├── length_constrained_data
│ ├── heuristic_text_generation_16k.jsonl
│ ├── heuristic_text_generation_2k.jsonl
│ ├── heuristic_text_generation_4k.jsonl
│ └── heuristic_text_generation_8k.jsonl
├── open_ended_qa.jsonl
├── summarization.jsonl
└── text_completion.jsonl
```
3. Set up OpenCompass
```
cd ~/opencompass
pip install -e .
```
4. Configure your launch in configs/eval_hellobench.py (a minimal sketch is shown after this list)
- set the models to be evaluated
- set your judge model (we recommend gpt4o-mini)
5. Launch it!
```
python run.py configs/eval_hellobench.py
```
6. After that, you can find the results in outputs/hellobench/xxx/summary.
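As a reference for step 4, here is a minimal sketch of what configs/eval_hellobench.py could look like. The dataset import path, the example model config, and the judge-model fields are illustrative assumptions, not the exact contents of the file shipped in this commit:
```python
# configs/eval_hellobench.py -- minimal sketch; import paths and judge fields are assumed
from mmengine.config import read_base
from opencompass.models import OpenAI

with read_base():
    # HelloBench dataset configs added in this commit (path assumed)
    from .datasets.subjective.hellobench.hellobench import hellobench_datasets
    # models to be evaluated -- swap in your own model configs
    from .models.hf_internlm.hf_internlm2_5_7b_chat import models

datasets = hellobench_datasets

# judge model consumed by the LMEvaluator (gpt4o-mini as recommended above)
judge_models = [dict(
    type=OpenAI,
    abbr='gpt4o-mini',
    path='gpt-4o-mini',
    key='ENV',            # reads OPENAI_API_KEY from the environment
    max_out_len=4096,
    batch_size=8,
)]
```
How the judge model is wired into the eval stage (partitioner and task) follows the same pattern as the other subjective eval configs in the repository.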

View File

@ -0,0 +1,111 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import HelloBenchDataset, hellobench_postprocess
system_prompt = """You are a helpful evaluator. Your task is to evaluate the checklists of the responses given by the Large Language Models (LLMs) based on user instructions. These checklists consist of yes or no questions."""
user_prompt = """Your core task is to evaluate the checklists based on the users instruction and LLMs response, with each checklist item being a yes or no question indicating a specific aspect that the LLMs response should meet. You need to judge the checklist item based on the instruction and response. The evaluation results are scored from 0 to 1, with 5 scores in total, which are:
0: The response fails to meet the checklist requirements, demonstrating substantial need for improvement across multiple areas.
0.25: The response partially meets some checklist requirements, but significant elements remain unaddressed.
0.5: The response meets several checklist requirements, yet the overall evaluation appears ambiguous or unclear.
0.75: The response aligns with most checklist requirements, though there are still minor areas that could be refined or enhanced.
1: The response fully satisfies all checklist requirements, with no identifiable issues or areas for improvement. It means this response is already perfect; you can't find any significant flaws in it.
Here is the instruction:
{{\"instruction\": {instruction}}}
Here is the response given by LLM:
{{\"response\": {prediction}}}
Since the response may be rather long, I am specifically reminding you here that the response has ended.
Here are checklists of this instruction:
{{\"checklists\": {formatted_checklists}}}
To further remind you, I will repeat my requirements:
Your core task is to evaluate the checklists based on the users instruction and LLMs response, with each checklist item being a yes or no question indicating a specific aspect that the LLMs response should meet. You need to judge the checklist item based on the instruction and response. The evaluation results are scored from 0 to 1, with 5 scores in total, which are:
0: The response fails to meet the checklist requirements, demonstrating substantial need for improvement across multiple areas.
0.25: The response partially meets some checklist requirements, but significant elements remain unaddressed.
0.5: The response meets several checklist requirements, yet the overall evaluation appears ambiguous or unclear.
0.75: The response aligns with most checklist requirements, though there are still minor areas that could be refined or enhanced.
1: The response fully satisfies all checklist requirements, with no identifiable issues or areas for improvement. It means this response is already perfect; you can't find any significant flaws in it.
Always provide the reason for your evaluation results. You should be strict but fair in your evaluation. A score of 1 means that the response perfectly meets all the checklist requirements and you think there are really no room for improvements. When giving a score of 1, you need to carefully consider whether this checklist has been perfectly satisfied.
Evaluate all the checklists and return the evaluation results of the checklists. Output a Python List consisting of the Python Dictionary formatted as follows:
[{{\"checklist_id\": \"the id of the checklist\", \"reason\": \"The reason for your evaluation results\", \"evaluation_score\": \"Your evaluation score for this checklist\"}},{{\"checklist_id\": \"the id of the checklist\",
\"reason\": \"The reason for your evaluation results\", \"evaluation_score\": \"Your evaluation score for this checklist\"}}]
There are total {num_checklist} checklists that you need to evaluate. The length of the output list is equal to the number of checklists and you should give an evaluation score for each checklist. You shoule be very very very strict to the evalution to further compare the responses from different models. Your response must be a valid Python List and should contain nothing else, as it will be directly executed in Python."""
subjective_reader_cfg = dict(
input_columns=['instruction', 'formatted_checklists', 'num_checklist'],
output_column='judgement',
)
hellobench_categories = [
'open_ended_qa',
'summarization',
'chat',
'text_completion',
'heuristic_text_generation',
]
data_path ='data/HelloBench'
hellobench_datasets = []
for category_name in hellobench_categories:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{instruction}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=16384),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt=system_prompt)
],
round=[
dict(
role='HUMAN',
prompt = user_prompt
),
]),
),
dict_postprocessor=dict(type=hellobench_postprocess,),
),
pred_role='BOT',
)
hellobench_datasets.append(
dict(
abbr=f'HelloBench-{category_name}',
type=HelloBenchDataset,
path=data_path,
category_name=category_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBench101Dataset, mtbench101_postprocess
from opencompass.datasets import MTBench101Dataset
from opencompass.summarizers import MTBench101Summarizer
subjective_reader_cfg = dict(
input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'],
@ -45,7 +46,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=mtbench101_postprocess),
),
pred_role='BOT',
)
@ -60,4 +60,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=MTBench101Summarizer, judge_type='single')
))

View File

@ -0,0 +1,63 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBench101Dataset, mtbench101_postprocess
subjective_reader_cfg = dict(
input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'],
output_column='judge',
)
subjective_all_sets = [
'mtbench101',
]
data_path ='data/subjective/'
mtbench101_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}')
],
round=[
dict(
role='HUMAN',
prompt = '{prompt_template}'
),
]),
),
dict_postprocessor=dict(type=mtbench101_postprocess),
),
pred_role='BOT',
)
mtbench101_datasets.append(
dict(
abbr=f'{_name}',
type=MTBench101Dataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBenchDataset, mtbench_postprocess
from opencompass.datasets import MTBenchDataset
from opencompass.summarizers import MTBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'],
@ -47,7 +48,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=mtbench_postprocess),
),
pred_role='BOT',
)
@ -62,4 +62,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=MTBenchSummarizer, judge_type='single')
))

View File

@ -0,0 +1,65 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBenchDataset, mtbench_postprocess
subjective_reader_cfg = dict(
input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'],
output_column='judge',
)
subjective_all_sets = [
'mtbench_0.0','mtbench_0.1','mtbench_0.7'
]
data_path ='data/subjective/mtbench'
mtbench_datasets = []
for _name in subjective_all_sets:
temperature = float(_name.split('_')[1])
do_sample = False if temperature == 0.0 else True
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=1024, temperature=temperature, do_sample=do_sample,infer_mode='every'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}')
],
round=[
dict(
role='HUMAN',
prompt = '{prompt_template}'
),
]),
),
dict_postprocessor=dict(type=mtbench_postprocess),
),
pred_role='BOT',
)
mtbench_datasets.append(
dict(
abbr=f'{_name}',
type=MTBenchDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WildBenchDataset, wildbench_postprocess
from opencompass.datasets import WildBenchDataset
from opencompass.summarizers import WildBenchPairSummarizer
subjective_reader_cfg = dict(
@ -30,7 +30,6 @@ subjective_eval_cfg = dict(
type=PromptTemplate,
template="""{prompt}"""
),
dict_postprocessor=dict(type=wildbench_postprocess),
),
pred_role='BOT',
)
@ -63,4 +62,5 @@ wildbench_datasets.append(
mode='m2n', # m个模型 与 n个模型进行对战
infer_order='random',
base_models = [llama_2_70b, gpt4, claude],
summarizer = dict(type=WildBenchPairSummarizer),
))

View File

@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WildBenchDataset, wildbench_postprocess
from opencompass.summarizers import WildBenchPairSummarizer
subjective_reader_cfg = dict(
input_columns=['dialogue', 'prompt'],
output_column='judge',
)
data_path ='./data/subjective/WildBench/wildbench.jsonl'
wildbench_datasets = []
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}"""
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=512, infer_mode='last'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template="""{prompt}"""
),
dict_postprocessor=dict(type=wildbench_postprocess),
),
pred_role='BOT',
)
gpt4 = dict(
abbr='gpt4-turbo',
)
claude = dict(
abbr='HaiKu',
)
llama_2_70b = dict(
abbr='llama-2-70b-chat-hf',
)
wildbench_datasets.append(
dict(
abbr='wildbench',
type=WildBenchDataset,
path=data_path,
eval_mode='pair',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/subjective/WildBench/gpt4'},
{'abbr': 'llama-2-70b-chat-hf', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'HaiKu', 'path':'./data/subjective/WildBench/claude'},
{'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
mode='m2n', # m个模型 与 n个模型进行对战
infer_order='random',
base_models = [llama_2_70b, gpt4, claude],
))

View File

@ -0,0 +1,35 @@
# MMMLU
## Dataset Description
Multilingual Massive Multitask Language Understanding (MMMLU)
The MMLU is a widely recognized benchmark of general knowledge attained by AI models. It covers a broad range of topics from 57 different categories, spanning elementary-level knowledge up to advanced professional subjects like law, physics, history, and computer science.
We translated the MMLU's test set into 14 languages using professional human translators. Relying on human translators for this evaluation increases confidence in the accuracy of the translations, especially for low-resource languages like Yoruba. We are publishing the professional human translations and the code we use to run the evaluations.
This effort reflects our commitment to improving the multilingual capabilities of AI models, ensuring they perform accurately across languages, particularly for underrepresented communities. By prioritizing high-quality translations, we aim to make AI technology more inclusive and effective for users worldwide.
MMMLU contains the MMLU test set translated into the following locales:
- AR_XY (Arabic)
- BN_BD (Bengali)
- DE_DE (German)
- ES_LA (Spanish)
- FR_FR (French)
- HI_IN (Hindi)
- ID_ID (Indonesian)
- IT_IT (Italian)
- JA_JP (Japanese)
- KO_KR (Korean)
- PT_BR (Brazilian Portuguese)
- SW_KE (Swahili)
- YO_NG (Yoruba)
- ZH_CN (Simplified Chinese)
## How to Use
Download the dataset from this [link](https://hf-mirror.com/datasets/openai/MMMLU):
```python
from datasets import load_dataset

# load the default configuration
ds = load_dataset("openai/MMMLU", "default")
# or load the by_language configuration
ds = load_dataset("openai/MMMLU", "by_language")
```
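To run MMMLU through OpenCompass, the mmmlu_gen config added in this commit can be pulled into an eval config. Below is a minimal sketch, assuming the dataset config sits under configs/datasets/mmmlu/ and using hf_internlm2_5_7b_chat purely as an example model config:
```python
# hypothetical configs/eval_mmmlu.py -- import paths are assumptions based on this commit's layout
from mmengine.config import read_base

with read_base():
    # mmmlu_gen re-exports mmmlu_datasets (see the mmmlu config files in this commit)
    from .datasets.mmmlu.mmmlu_gen import mmmlu_datasets
    # any model config shipped with OpenCompass works here; this one is only an example
    from .models.hf_internlm.hf_internlm2_5_7b_chat import models

datasets = mmmlu_datasets
```
It is then launched like any other config, e.g. `python run.py configs/eval_mmmlu.py`.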

View File

@ -0,0 +1,138 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMMLUDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
from mmengine.config import read_base
with read_base():
from .mmmlu_prompt import (get_few_shot_prompts_ar,
get_few_shot_prompts_bn,
get_few_shot_prompts_de,
get_few_shot_prompts_es,
get_few_shot_prompts_fr,
get_few_shot_prompts_hi,
get_few_shot_prompts_id,
get_few_shot_prompts_it,
get_few_shot_prompts_ja,
get_few_shot_prompts_ko,
get_few_shot_prompts_pt,
get_few_shot_prompts_zh,
get_few_shot_prompts_sw,
get_few_shot_prompts_yo)
mmmlu_reader_cfg = dict(
input_columns=['input', 'A', 'B', 'C', 'D','subject'],
output_column='target',
train_split='test')
mmmlu_all_sets = [
'mmlu_AR-XY',
'mmlu_BN-BD',
'mmlu_DE-DE',
'mmlu_ES-LA',
'mmlu_FR-FR',
'mmlu_HI-IN',
'mmlu_ID-ID',
'mmlu_IT-IT',
'mmlu_JA-JP',
'mmlu_KO-KR',
'mmlu_PT-BR',
'mmlu_SW-KE',
'mmlu_YO-NG',
'mmlu_ZH-CN',
]
mmmlu_datasets = []
for _name in mmmlu_all_sets:
if 'AR' in _name:
_hint = f'هناك سؤال اختيار واحد. أجب عن السؤال بالرد على A أو B أو C أو D, يرجى استخدام واحدة من الرموز A، B، C، أو D لتمثيل خيارات الإجابة في ردك'
_prompt = f'يتعلق بـ {{subject}} \nالسؤال: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nالإجابة:'
_round = get_few_shot_prompts_ar(_hint, _prompt)
elif 'BN' in _name:
_hint = f'এটি একটি একক পছন্দের প্রশ্ন। এ, বি, সি বা ডি উত্তর দিয়ে প্রশ্নের উত্তর দিন।, আপনার উত্তরে ইংরেজি বর্ণ A, B, C এবং D এর মধ্যে একটি ব্যবহার করুন'
_prompt = f'এটি {{subject}} সম্পর্কে \nপ্রশ্ন: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nউত্তর:'
_round = get_few_shot_prompts_bn(_hint, _prompt)
elif 'DE' in _name:
_hint = f'Es gibt eine Einzelwahlfrage. Beantworte die Frage, indem du A, B, C oder D antwortest.'
_prompt = f'Es geht um {{subject}} \nFrage: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAntwort:'
_round = get_few_shot_prompts_de(_hint, _prompt)
elif 'ES' in _name:
_hint = f'Hay una pregunta de elección única. Responde a la pregunta respondiendo A, B, C o D.'
_prompt = f'Se trata de {{subject}} \nPregunta: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRespuesta:'
_round = get_few_shot_prompts_es(_hint, _prompt)
elif 'FR' in _name:
_hint = f'Il y a une question à choix unique. Répondez à la question en répondant A, B, C ou D.'
_prompt = f'''C'est à propos de {{subject}} \nQuestion : {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRéponse :'''
_round = get_few_shot_prompts_fr(_hint, _prompt)
elif 'HI' in _name:
_hint = f'यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D में से कोई भी उत्तर देकर दें।'
_prompt = f'यह {{subject}} के बारे में है \nप्रश्न: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nउत्तर:'
_round = get_few_shot_prompts_hi(_hint, _prompt)
elif 'ID' in _name:
_hint = f'Ada pertanyaan pilihan tunggal. Jawablah pertanyaan dengan menjawab A, B, C, atau D.'
_prompt = f'Ini tentang {{subject}} \nPertanyaan: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nJawaban:'
_round = get_few_shot_prompts_id(_hint, _prompt)
elif 'IT' in _name:
_hint = f'Ci sono domande a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.'
_prompt = f'Si tratta di {{subject}} \nDomanda: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRisposta:'
_round = get_few_shot_prompts_it(_hint, _prompt)
elif 'JA' in _name:
_hint = f'単一選択肢の質問があります。この質問にはA、B、C、またはDで答えてください。'
_prompt = f'これは {{subject}} に関することです \n質問: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n回答:'
_round = get_few_shot_prompts_ja(_hint, _prompt)
elif 'KO' in _name:
_hint = f'단일 선택 질문이 있습니다. A, B, C 또는 D로 답변해 주세요.'
_prompt = f'이것은 {{subject}}에 관한 것입니다 \n질문: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n답변:'
_round = get_few_shot_prompts_ko(_hint, _prompt)
elif 'PT' in _name:
_hint = f'Há uma pergunta de escolha única. Responda à pergunta escolhendo A, B, C ou D.'
_prompt = f'É sobre {{subject}} \nPergunta: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nResposta:'
_round = get_few_shot_prompts_pt(_hint, _prompt)
elif 'ZH' in _name:
_hint = f'这里有一个单项选择题。请通过选择 A、B、C 或 D 来回答该问题。'
_prompt = f'这是关于 {{subject}} 的内容\n问题:{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:'
_round = get_few_shot_prompts_zh(_hint, _prompt)
elif 'SW' in _name:
_hint = f'Kuna swali moja la chaguo. Jibu swali kwa kujibu A, B, C au D.'
_prompt = f'Hii ni kuhusu {{subject}}.\nSwali: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nJibu:'
_round = get_few_shot_prompts_sw(_hint, _prompt)
elif 'YO' in _name:
_hint = f'Ibeere kan wa ti o ni yiyan kan. Fesi si ibeere naa nipa fesi A, B, C tabi D.'
_prompt = f'Eyi jẹ nipa {{subject}}.\nIbeere: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nFesi:'
_round = get_few_shot_prompts_yo(_hint, _prompt)
else:
_hint = f'There is a single choice question. Answer the question by replying A, B, C or D.'
_prompt = f'it is about {{subject}} \nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer:'
_round = [dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+'Please answer only with option A, B, C or D. \nAnswer:')]
mmmlu_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=_round
),
ice_token='</E>',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
mmmlu_eval_cfg = dict(
evaluator=dict(type=AccwithDetailsEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
mmmlu_datasets.append(
dict(
abbr=f'openai_m{_name}',
type=MMMLUDataset,
path='openai/MMMLU',
name=_name,
reader_cfg=mmmlu_reader_cfg,
infer_cfg=mmmlu_infer_cfg,
eval_cfg=mmmlu_eval_cfg,
))
del _name, _hint, _prompt, _round

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .mmmlu_gen_c51a84 import mmmlu_datasets # noqa: F401, F403

View File

@ -0,0 +1,105 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMMLUDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
mmmlu_reader_cfg = dict(
input_columns=['input', 'A', 'B', 'C', 'D','subject'],
output_column='target',
train_split='test')
mmmlu_all_sets = [
'mmlu_AR-XY',
'mmlu_BN-BD',
'mmlu_DE-DE',
'mmlu_ES-LA',
'mmlu_FR-FR',
'mmlu_HI-IN',
'mmlu_ID-ID',
'mmlu_IT-IT',
'mmlu_JA-JP',
'mmlu_KO-KR',
'mmlu_PT-BR',
'mmlu_SW-KE',
'mmlu_YO-NG',
'mmlu_ZH-CN',
]
mmmlu_datasets = []
for _name in mmmlu_all_sets:
if 'AR' in _name:
_hint = f'هناك سؤال اختيار واحد. أجب عن السؤال بالرد على A أو B أو C أو D, يرجى استخدام واحدة من الرموز A، B، C، أو D لتمثيل خيارات الإجابة في ردك'
_prompt = f'يتعلق بـ {{subject}} \nالسؤال: {{input}}\nأ. {{A}}\nب. {{B}}\nج. {{C}}\nد. {{D}}\nالإجابة:'
elif 'BN' in _name:
_hint = f'এটি একটি একক পছন্দের প্রশ্ন। এ, বি, সি বা ডি উত্তর দিয়ে প্রশ্নের উত্তর দিন।, আপনার উত্তরে ইংরেজি বর্ণ A, B, C এবং D এর মধ্যে একটি ব্যবহার করুন'
_prompt = f'এটি {{subject}} এর সম্পর্কে \nপ্রশ্ন: {{input}}\nএ. {{A}}\nবি. {{B}}\nসি. {{C}}\nডি. {{D}}\nউত্তর:'
elif 'DE' in _name:
_hint = f'Es gibt eine Einzelwahlfrage. Beantworte die Frage, indem du A, B, C oder D antwortest.'
_prompt = f'Es geht um {{subject}} \nFrage: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAntwort:'
elif 'ES' in _name:
_hint = f'Hay una pregunta de elección única. Responde a la pregunta respondiendo A, B, C o D.'
_prompt = f'Se trata de {{subject}} \nPregunta: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRespuesta:'
elif 'FR' in _name:
_hint = f'Il y a une question à choix unique. Répondez à la question en répondant A, B, C ou D.'
_prompt = f'''C'est à propos de {{subject}} \nQuestion : {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRéponse :'''
elif 'HI' in _name:
_hint = f'यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D में से कोई भी उत्तर देकर दें।'
_prompt = f'यह {{subject}} के बारे में है \nप्रश्न: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nउत्तर:'
elif 'ID' in _name:
_hint = f'Ada pertanyaan pilihan tunggal. Jawablah pertanyaan dengan menjawab A, B, C, atau D.'
_prompt = f'Ini tentang {{subject}} \nPertanyaan: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nJawaban:'
elif 'IT' in _name:
_hint = f'Ci sono domande a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.'
_prompt = f'Si tratta di {{subject}} \nDomanda: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRisposta:'
elif 'JA' in _name:
_hint = f'単一選択肢の質問があります。この質問にはA、B、C、またはDで答えてください。'
_prompt = f'これは {{subject}} に関することです \n質問: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n回答:'
elif 'KO' in _name:
_hint = f'단일 선택 질문이 있습니다. A, B, C 또는 D로 답변해 주세요.'
_prompt = f'이것은 {{subject}}에 관한 것입니다 \n질문: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n답변:'
elif 'PT' in _name:
_hint = f'Há uma pergunta de escolha única. Responda à pergunta escolhendo A, B, C ou D.'
_prompt = f'É sobre {{subject}} \nPergunta: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nResposta:'
elif 'ZH' in _name:
_hint = f'这里有一个单项选择题。请通过选择 A、B、C 或 D 来回答该问题。'
_prompt = f'这是关于 {{subject}} 的内容\n问题:{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:'
else:
_hint = f'There is a single choice question. Answer the question by replying A, B, C or D.'
_prompt = f'it is about {{subject}} \nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer:'
mmmlu_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=f'{_hint}\n {_prompt}'
),
],
),
ice_token='</E>',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
mmmlu_eval_cfg = dict(
evaluator=dict(type=AccwithDetailsEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
mmmlu_datasets.append(
dict(
abbr=f'openai_m{_name}',
type=MMMLUDataset,
path='openai/MMMLU',
name=_name,
reader_cfg=mmmlu_reader_cfg,
infer_cfg=mmmlu_infer_cfg,
eval_cfg=mmmlu_eval_cfg,
))
del _name, _hint, _prompt

View File

@ -0,0 +1,213 @@
_hint = "This is a hint that helps you solve the question."
_prompt = "Please enter your question here."
def get_few_shot_prompts_ar(_hint, _prompt):
return [
dict(role='HUMAN', prompt="هناك سؤال اختيار من متعدد. أجب عن السؤال بالرد A أو B أو C أو D.\nيتعلق بـ الجبر المجرد\nالسؤال: ابحث عن أقصى حد ممكن لترتيب بعض العناصر في Z_4 x Z_6.\n A.4\nB.6\nC.12\nD.24\nلنفكر خطوة بخطوة\nالإجابة:"),
dict(role='BOT', prompt='C'),
dict(role='HUMAN', prompt="هناك سؤال اختيار من متعدد. أجب عن السؤال بالرد A أو B أو C أو المدرسة الثانوية\nالسؤال: أي من الأديان أدناه هو دين عالمي؟ A. الطاوية\n B. الإسلام\n C. الشنتو\n D. الكونفوشيوسية\nلنفكر خطوة بخطوة\nالإجابة:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="هناك سؤال اختيار من متعدد. أجب عن السؤال بالرد A أو B أو C أو D.\nيتعلق بـ تعلم الآلة\nالسؤال: في كعكة يان لوكون، الكرز في الأعلى هو: \nA. التعلم المعزز\nB. التعلم الذاتي المراقب\nC. التعلم غير المراقب\nD. التعلم المراقب\nلنفكر خطوة بخطوة\nالإجابة:"),
dict(role='BOT', prompt="A"),
dict(role='HUMAN', prompt="هناك سؤال اختيار من متعدد. أجب عن السؤال بالرد A أو B أو C أو D.\nيتعلق بـ الفلسفة\nالسؤال: يقترح سقراط أن المقدس هو جزء واحد من:\nA. ما هو حكيم.\nB. ما هو عادل.\nC. ما هو جميل.\nD. ما هو قانوني.\nلنفكر خطوة بخطوة\nالإجابة:"),
dict(role='BOT', prompt='B'),
dict(role='HUMAN', prompt="هذه سؤال اختيار واحد. أجب عن السؤال بالرد A أو B أو C أو D.\nيتعلق الأمر بتاريخ الولايات المتحدة في المدارس الثانوية.\nسؤال: هذه السؤال يشير إلى المعلومات التالية. ربما، مع ذلك، أنا أكثر وعيًا بأهمية الحريات المدنية في هذه اللحظة المحددة من تاريخنا من أي شخص آخر، لأنني أسافر عبر البلاد وألتقي بالناس وأرى أشياء حدثت لأناس عاديين، أدرك ما يعنيه للديمقراطية الحفاظ على حرياتنا المدنية. طوال السنوات كان علينا أن نقاتل من أجل الحرية المدنية، ونعلم أن هناك أوقاتًا تصبح فيها الأضواء خافتة، وكلما حدث ذلك تكون الديمقراطية في خطر. الآن، إلى حد كبير بسبب الحالة المضطربة للعالم ككل، اختفت الحريات المدنية في العديد من البلدان الأخرى. من المستحيل، بالطبع، أن تكون في حالة حرب وأن تحافظ على حرية الصحافة وحرية التعبير وحرية التجمع. إنها تختفي تلقائيًا. وهكذا في العديد من البلدان التي كانت آمنة عادة، أصبحت اليوم قد اختفت. في بلدان أخرى، حتى قبل أن تأتي الحرب، لم تختف فقط حرية الصحافة وحرية التجمع وحرية التعبير، بل اختفت أيضًا حرية الدين. ولذلك، نحن هنا في هذا البلد، لدينا مسؤولية كبيرة. نحن في السلام. ليس لدينا سبب من المخاوف التي تتحكم في العديد من الشعوب الأخرى في جميع أنحاء العالم؛ لذلك يجب علينا أن نحافظ على حريات الديمقراطية. —إلينور روزفلت، خطاب إلى الاتحاد الأمريكي للحريات المدنية، شيكاغو، إلينوي، 14 مارس 1940.\nفي خطابها، أشارت إلينور روزفلت إلى التهديد السابق للحريات المدنية الذي أنشأته أي مما يلي؟\nA. الحرب العالمية الأولى\nB. الصفقة الجديدة\nC. الحرب الباردة\nD. الكساد العظيم\nدعونا نفكر خطوة بخطوة.\nالجواب:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"\nفقط يجب الرد على الخيار A أو B أو C أو D. \nالإجابة هي:"),
]
def get_few_shot_prompts_bn(_hint, _prompt):
return [
dict(role='HUMAN', prompt="এটি একটি একটি বিকল্প প্রশ্ন। A, B, C অথবা D এর মাধ্যমে উত্তর দিন।\nএটি মেশিন লার্নিং সম্পর্কে\nপ্রশ্ন: ইয়ান লেকুনের কেকের উপর চেরি হল: \nA. শক্তিশালীকরণ শেখা\nB. স্ব-নিরীক্ষিত শেখা\nC. অ-নিরীক্ষিত শেখা\nD. নিরীক্ষিত শেখা\nশুধু বিকল্প A, B, C বা D এর উত্তর দিন\nউত্তর:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt="এটি একটি একটি বিকল্প প্রশ্ন। A, B, C অথবা D এর মাধ্যমে উত্তর দিন।\nএটি বিমূর্ত বীজগণিত সম্পর্কে\nপ্রশ্ন: Z_4 x Z_6 এর কোন একটি উপাদানের জন্য সর্বাধিক সম্ভাব্য র‍্যাঙ্ক খুঁজুন।\nA.4\nB.6\nC.12\nD.24\nশুধু বিকল্প A, B, C বা D এর উত্তর দিন\nউত্তর:"),
dict(role='BOT', prompt="C"),
dict(role='HUMAN', prompt="এটি একটি একটি বিকল্প প্রশ্ন। A, B, C অথবা D এর মাধ্যমে উত্তর দিন।\nএটি উচ্চ বিদ্যালয়ের ভূগোল সম্পর্কে\nপ্রশ্ন: নিচের কোন ধর্ম একটি বিশ্বজনীন ধর্ম? \nA. তাওবাদ\nB. ইসলাম\nC. শিন্টোবাদ\nD. কনফুসিয়াসবাদ\nশুধু বিকল্প A, B, C বা D এর উত্তর দিন\nউত্তর:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="এটি একটি একটি বিকল্প প্রশ্ন। A, B, C অথবা D এর মাধ্যমে উত্তর দিন।\nএটি দর্শনশাস্ত্র সম্পর্কে\nপ্রশ্ন: সক্রেটিস নির্দেশ করেন যে পবিত্র হচ্ছে:\nA. যা বিজ্ঞ\nB. যা ন্যায়িক\nC. যা সুন্দর\nD. যা আইনগত\nশুধু বিকল্প A, B, C বা D এর উত্তর দিন\nউত্তর:"),
dict(role='BOT', prompt='B'),
dict(role='HUMAN', prompt="এটি একটি একক পছন্দের প্রশ্ন। প্রশ্নের উত্তর A, B, C অথবা D দিয়ে দিন।\nএটি উচ্চ বিদ্যালয়ের মার্কিন ইতিহাস সম্পর্কে।\nপ্রশ্ন: এই প্রশ্নটি নিম্নলিখিত তথ্যের সাথে সম্পর্কিত। তবে, शायद আমি আমাদের ইতিহাসের এই নির্ভরযোগ্য মুহূর্তে নাগরিক স্বাধীনতার গুরুত্ব সম্পর্কে অন্য যে কারো চেয়ে বেশি সচেতন, কারণ আমি দেশজুড়ে ভ্রমণ করি এবং মানুষদের সঙ্গে দেখা করি এবং ছোট মানুষদের সাথে ঘটে যাওয়া ঘটনার কথা দেখি। আমি বুঝতে পারি যে আমাদের নাগরিক স্বাধীনতাগুলো রক্ষা করা কীভাবে গণতন্ত্রের জন্য গুরুত্বপূর্ণ। আমরা সাল জুড়ে নাগরিক স্বাধীনতার জন্য লড়াই করতে হয়েছে, এবং আমরা জানি যে এমন সময় আসে যখন আলো ধীরে ধীরে ম্লান হয়, এবং যখনই তা ঘটে, গণতন্ত্র বিপদে পড়ে। এখন, বিশালাংশে বিশ্বজুড়ে অস্থির পরিস্থিতির কারণে, অনেক অন্যান্য দেশে নাগরিক স্বাধীনতা হারিয়ে গেছে। যুদ্ধ চলাকালীন সংবাদপত্রের স্বাধীনতা, বক্তৃতার স্বাধীনতা এবং সমাবেশের স্বাধীনতা বজায় রাখা অবশ্যই অসম্ভব। সেগুলি স্বয়ংক্রিয়ভাবে消失 হয়ে যায়। এবং তাই বহু দেশে যেগুলি সাধারণত নিরাপদ ছিল, আজ তারা gone হয়ে গেছে। অন্যান্য দেশে, এমনকি যুদ্ধ আসার আগেই, শুধুমাত্র সংবাদপত্রের স্বাধীনতা, সমাবেশের স্বাধীনতা, এবং বক্তৃতার স্বাধীনতা হারিয়ে যায়নি, তবে ধর্মের স্বাধীনতাও消失 হয়ে গেছে। এবং তাই আমরা জানি যে এই দেশে আমাদের একটি গুরুতর দায়িত্ব আছে। আমরা শান্তিতে আছি। আমাদের কাছে বিশ্বের অনেক অন্যান্য লোকজনের নিয়ন্ত্রণ করা ভয়ের জন্য কোন কারণ নেই; অতএব, আমাদের গণতন্ত্রের স্বাধীনতাগুলোকে রক্ষা করতে হবে। —এলিনর রুজভেল্ট, আমেরিকান সিভিল লিবারটিজ ইউনিয়নের সম্বোধন, শিকাগো, ইলিনয়, ১৪ই মার্চ, ১৯৪০।\nতার বক্তৃতায়, এলিনর রুজভেল্ট পূর্ববর্তী নাগরিক স্বাধীনতার প্রতি হুমকি সম্পর্কে কোনটি উল্লেখ করেছেন?\nA. বিশ্বযুদ্ধ I\nB. নয়া চুক্তি\nC. ঠাণ্ডা যুদ্ধ\nD. গ্রেট ডিপ্রেশন\nশুধু বিকল্প A, B, C বা D এর উত্তর দিন\nউত্তর:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"শুধু বিকল্প A, B, C বা D এর উত্তর দিন, \nউত্তর হলো:"),
]
def get_few_shot_prompts_de(_hint, _prompt):
return [
dict(role='HUMAN', prompt="Das ist eine einzelne Auswahlfrage. Beantworte die Frage, indem du A, B, C oder D antwortest.\nEs geht um maschinelles Lernen.\nFrage: In Yann LeCuns Kuchen ist die Kirsche oben:\nA. Verstärkendes Lernen\nB. Selbstüberwachtes Lernen\nC. Unüberwachtes Lernen\nD. Überwachtes Lernen\nBitte beantworten Sie nur mit Option A, B, C oder D.\nAntwort:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt="Das ist eine einzelne Auswahlfrage. Beantworte die Frage, indem du A, B, C oder D antwortest.\nEs geht um abstrakte Algebra.\nFrage: Finde die maximal mögliche Ordnung für ein Element von Z_4 x Z_6.\nA. 4\nB. 6\nC. 12\nD. 24\nBitte beantworten Sie nur mit Option A, B, C oder D.\nAntwort:"),
dict(role='BOT', prompt="C"),
dict(role='HUMAN', prompt="Das ist eine einzelne Auswahlfrage. Beantworte die Frage, indem du A, B, C oder D antwortest.\nEs geht um Geografie in der High School.\nFrage: Welche der folgenden Religionen ist eine universalisierende Religion? \nA. Taoismus\nB. Islam\nC. Shintoismus\nD. Konfuzianismus\nBitte beantworten Sie nur mit Option A, B, C oder D.\nAntwort:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="Das ist eine einzelne Auswahlfrage. Beantworte die Frage, indem du A, B, C oder D antwortest.\nEs geht um Philosophie.\nFrage: Sokrates schlägt vor, dass das Heilige ein Teil von:\nA. was weise ist.\nB. was gerecht ist.\nC. was schön ist.\nD. was legal ist.\nBitte beantworten Sie nur mit Option A, B, C oder D.\nAntwort:"),
dict(role='BOT', prompt='B'),
dict(role='HUMAN', prompt="Dies ist eine Einzelwahlfrage. Beantworten Sie die Frage, indem Sie A, B, C oder D antworten.\nEs geht um die amerikanische Geschichte in der High School.\nFrage: Diese Frage bezieht sich auf die folgenden Informationen. Vielleicht bin ich mir jedoch in diesem bestimmten Moment unserer Geschichte mehr bewusst, wie wichtig die Bürgerrechte sind als jeder andere, weil ich durch das Land reise, Menschen treffe und Dinge sehe, die kleinen Menschen passiert sind. Ich erkenne, was es bedeutet, die Bürgerrechte zu bewahren, um die Demokratie zu erhalten. Im Verlauf der Jahre mussten wir für die Bürgerrechte kämpfen, und wir wissen, dass es Zeiten gibt, in denen das Licht eher schwach wird, und jedes Mal, wenn das passiert, ist die Demokratie in Gefahr. Jetzt, größtenteils aufgrund des angespannten Zustands der Welt als Ganzes, sind die Bürgerrechte in vielen anderen Ländern verschwunden. Es ist unmöglich, im Krieg zu sein und gleichzeitig die Pressefreiheit, die Meinungsfreiheit und die Versammlungsfreiheit aufrechtzuerhalten. Sie verschwinden automatisch. Und so sind in vielen Ländern, in denen sie normalerweise sicher waren, heute verschwunden. In anderen Ländern verschwanden nicht nur die Pressefreiheit und die Versammlungsfreiheit und die Meinungsfreiheit, sondern auch die Religionsfreiheit, selbst bevor der Krieg kam. Und so wissen wir hier in diesem Land, dass wir eine ernsthafte Verantwortung haben. Wir sind in Frieden. Wir haben keinen Grund für die Ängste, die so viele andere Menschen auf der ganzen Welt regieren; daher müssen wir die Freiheiten der Demokratie schützen. —Eleanor Roosevelt, Ansprache an die Amerikanische Vereinigung für Bürgerrechte, Chicago, Illinois, 14. März 1940.\nIn ihrer Rede spielte Eleanor Roosevelt auf die frühere Bedrohung der Bürgerrechte an, die durch welches der folgenden Dinge geschaffen wurde?\nA. Erster Weltkrieg\nB. New Deal\nC. Kalter Krieg\nD. Große Depression\nBitte beantworten Sie nur mit Option A, B, C oder D.\nAntwort:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"Bitte beantworten Sie nur mit Option A, B, C oder D. \nAntwort:"),
]
def get_few_shot_prompts_es(_hint, _prompt):
return [
dict(role='HUMAN', prompt="Esta es una pregunta de opción única. Responde a la pregunta contestando A, B, C o D.\nSe trata de aprendizaje automático.\nPregunta: En el pastel de Yann LeCun, la cereza en la cima es: \nA. aprendizaje por refuerzo\nB. aprendizaje auto-supervisado\nC. aprendizaje no supervisado\nD. aprendizaje supervisado\nSolo necesitas responder con la opción A, B, C o D.\nRespuesta:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt="Esta es una pregunta de opción única. Responde a la pregunta contestando A, B, C o D.\nSe trata de álgebra abstracta.\nPregunta: Encuentra el orden máximo posible para algún elemento de Z_4 x Z_6.\nA. 4\nB. 6\nC. 12\nD. 24\nSolo necesitas responder con la opción A, B, C o D.\nRespuesta:"),
dict(role='BOT', prompt="C"),
dict(role='HUMAN', prompt="Esta es una pregunta de opción única. Responde a la pregunta contestando A, B, C o D.\nSe trata de geografía de secundaria.\nPregunta: ¿Cuál de la siguiente religión es una religión universalizante? \nA. Taoísmo\nB. Islam\nC. Shintoísmo\nD. Confucianismo\nSolo necesitas responder con la opción A, B, C o D.\nRespuesta:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="Esta es una pregunta de opción única. Responde a la pregunta contestando A, B, C o D.\nSe trata de filosofía.\nPregunta: Sócrates sugiere que lo sagrado es una parte de:\nA. lo que es prudente.\nB. lo que es justo.\nC. lo que es bello.\nD. lo que es legal.\nSolo necesitas responder con la opción A, B, C o D.\nRespuesta:"),
dict(role='BOT', prompt='B'),
dict(role='HUMAN', prompt="Esta es una pregunta de opción única. Responde a la pregunta contestando A, B, C o D.\nSe trata de la historia de EE.UU. en la escuela secundaria.\nPregunta: Esta pregunta se refiere a la siguiente información. Sin embargo, quizás soy más consciente de la importancia de las libertades civiles en este momento particular de nuestra historia que cualquier otra persona, porque viajo por el país, encuentro personas y veo cosas que han sucedido a las personas pequeñas, me doy cuenta de lo que significa para la democracia preservar nuestras libertades civiles. A lo largo de los años hemos tenido que luchar por la libertad civil, y sabemos que hay momentos en que la luz se vuelve bastante tenue, y cada vez que eso sucede, la democracia está en peligro. Ahora, en gran parte debido al estado problemático del mundo en su conjunto, las libertades civiles han desaparecido en muchos otros países. Es imposible, por supuesto, estar en guerra y mantener la libertad de prensa, la libertad de expresión y la libertad de reunión. Desaparecen automáticamente. Y así, en muchos países donde normalmente eran seguras, hoy han desaparecido. En otros países, incluso antes de que llegara la guerra, no solo la libertad de prensa y la libertad de reunión, y la libertad de expresión desaparecieron, sino que también desapareció la libertad de religión. Y así sabemos que aquí en este país, tenemos una grave responsabilidad. Estamos en paz. No tenemos razón para los temores que gobiernan a tantas otras personas en todo el mundo; por lo tanto, tenemos que proteger las libertades de la democracia. —Eleanor Roosevelt, Discurso ante la Unión Americana de Libertades Civiles, Chicago, Illinois, 14 de marzo de 1940.\nEn su discurso, Eleanor Roosevelt aludió a la amenaza anterior a las libertades civiles creada por cuál de las siguientes?\nA. Primera Guerra Mundial\nB. El New Deal\nC. La Guerra Fría\nD. La Gran Depresión\nSolo necesitas responder con la opción A, B, C o D.\nRespuesta:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"Solo necesitas responder con la opción A, B, C o D. \nRespuesta:"),
]
def get_few_shot_prompts_fr(_hint, _prompt):
return [
dict(role='HUMAN', prompt="C'est une question à choix unique. Répondez à la question en répondant A, B, C ou D.\nIl s'agit d'apprentissage automatique.\nQuestion : Dans le gâteau de Yann LeCun, la cerise sur le dessus est :\nA. apprentissage par renforcement\nB. apprentissage auto-supervisé\nC. apprentissage non supervisé\nD. apprentissage supervisé\nVous n'avez qu'à répondre avec l'option A, B, C ou D.\nRéponse :"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt="C'est une question à choix unique. Répondez à la question en répondant A, B, C ou D.\nIl s'agit d'algèbre abstraite.\nQuestion : Trouvez l'ordre maximum possible pour un élément de Z_4 x Z_6.\nA. 4\nB. 6\nC. 12\nD. 24\nVous n'avez qu'à répondre avec l'option A, B, C ou D.\nRéponse :"),
dict(role='BOT', prompt="C"),
dict(role='HUMAN', prompt="C'est une question à choix unique. Répondez à la question en répondant A, B, C ou D.\nIl s'agit de géographie de lycée.\nQuestion : Laquelle des religions suivantes est une religion universalisante ?\nA. Taoïsme\nB. Islam\nC. Shintoïsme\nD. Confucianisme\nVous n'avez qu'à répondre avec l'option A, B, C ou D.\nRéponse :"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="C'est une question à choix unique. Répondez à la question en répondant A, B, C ou D.\nIl s'agit de philosophie.\nQuestion : Socrate suggère que le sacré est une partie de :\nA. ce qui est prudent.\nB. ce qui est juste.\nC. ce qui est beau.\nD. ce qui est légal.\nVous n'avez qu'à répondre avec l'option A, B, C ou D.\nRéponse :"),
dict(role='BOT', prompt='B'),
dict(role='HUMAN', prompt="C'est une question à choix unique. Répondez à la question en répondant A, B, C ou D.\nC'est sur l'histoire des États-Unis au lycée.\nQuestion : Cette question se réfère aux informations suivantes. Peut-être, cependant, je suis plus conscient de l'importance des libertés civiles à ce moment particulier de notre histoire que quiconque, car je voyage à travers le pays, rencontre des gens et vois des choses qui sont arrivées à des petites personnes, je réalise ce que signifie pour la démocratie de préserver nos libertés civiles. Au fil des ans, nous avons dû nous battre pour la liberté civile, et nous savons qu'il y a des moments où la lumière devient plutôt faible, et chaque fois que cela se produit, la démocratie est en danger. Maintenant, en grande partie à cause de l'état troublé du monde dans son ensemble, les libertés civiles ont disparu dans de nombreux autres pays. Il est impossible, bien sûr, d'être en guerre et de maintenir la liberté de la presse, la liberté d'expression et la liberté de réunion. Elles disparaissent automatiquement. Et donc dans de nombreux pays où elles étaient normalement en sécurité, aujourd'hui, elles sont parties. Dans d'autres pays, même avant l'arrivée de la guerre, non seulement la liberté de la presse et la liberté de réunion, et la liberté d'expression ont disparu, mais la liberté de religion a aussi disparu. Et donc nous savons ici dans ce pays, nous avons une grave responsabilité. Nous sommes en paix. Nous n'avons aucune raison pour les peurs qui gouvernent tant d'autres peuples à travers le monde ; par conséquent, nous devons garder les libertés de la démocratie. —Eleanor Roosevelt, Adresse à l'Union Américaine pour les Libertés Civiles, Chicago, Illinois, 14 mars 1940\nDans son discours, Eleanor Roosevelt a fait allusion à la menace antérieure pour les libertés civiles créée par laquelle des suivantes ?\nA. Première Guerre Mondiale\nB. Le New Deal\nC. La Guerre froide\nD. La Grande Dépression\nVous n'avez qu'à répondre avec l'option A, B, C ou D.\nRéponse :"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"Vous n'avez qu'à répondre avec l'option A, B, C ou D. \nRéponse :"),
]
def get_few_shot_prompts_hi(_hint, _prompt):
return [
dict(role='HUMAN', prompt="यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D के जवाब देकर दें।\nयह मशीन लर्निंग के बारे में है\nप्रश्न: Yann LeCun की केक में, ऊपर का चेरी है: \nA.रिवॉर्ड लर्निंग\nB.सेल्फ-सुपरवाइज्ड लर्निंग\nC.अनसुपरवाइज्ड लर्निंग\nD.सुपरवाइज्ड लर्निंग\nकृपया केवल विकल्प A, B, C या D का उत्तर दें।\nउत्तर:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt="यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D के जवाब देकर दें।\nयह अमूर्त बीजगणित के बारे में है\nप्रश्न: Z_4 x Z_6 के कुछ तत्व के लिए अधिकतम संभावित क्रम ढूंढें।\n A.4\nB.6\nC.12\nD.24\nकृपया केवल विकल्प A, B, C या D का उत्तर दें।\nउत्तर:"),
dict(role='BOT', prompt="C"),
dict(role='HUMAN', prompt="यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D के जवाब देकर दें।\nयह उच्च विद्यालय भूगोल के बारे में है\nप्रश्न: नीचे कौन सा धर्म सार्वभौमिक धर्म है? A.ताओवाद\n B.इस्लाम\n C.शिंतो धर्म\n D.कन्फ्यूशियसवाद\nकृपया केवल विकल्प A, B, C या D का उत्तर दें।\nउत्तर:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D के जवाब देकर दें।\nयह दर्शनशास्त्र के बारे में है\nप्रश्न: सुकरात सुझाते हैं कि पवित्र एक भाग है:\nA.जो विवेकी है।\nB.जो न्यायपूर्ण है।\nC.जो सुंदर है।\nD.जो कानूनी है।\nकृपया केवल विकल्प A, B, C या D का उत्तर दें।\nउत्तर:"),
dict(role='BOT', prompt='B'),
dict(role='HUMAN', prompt="यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D के जवाब देकर दें।\nयह दर्शनशास्त्र के बारे में है\nप्रश्न: यह प्रश्न नीचे दी गई जानकारी के संदर्भ में है। शायद, फिर भी, मैं हमारे इतिहास के इस विशेष क्षण में नागरिक स्वतंत्रताओं के महत्व के प्रति अन्य किसी से अधिक जागरूक हूँ, क्योंकि मैं देश भर में यात्रा करता हूँ और लोगों से मिलता हूँ और उन चीज़ों को देखता हूँ जो छोटे लोगों के साथ हुई हैं, मैं समझता हूँ कि हमारी नागरिक स्वतंत्रताओं को बनाए रखना लोकतंत्र का क्या अर्थ है। वर्षों से, हमें नागरिक स्वतंत्रता के लिए लड़ना पड़ा है, और हम जानते हैं कि ऐसे क्षण होते हैं जब रोशनी कम हो जाती है, और हर बार ऐसा होने पर लोकतंत्र खतरे में होता है। अब, मुख्यतः पूरी दुनिया की चिंताजनक स्थिति के कारण, कई अन्य देशों में नागरिक स्वतंत्रताएँ गायब हो गई हैं। यह निश्चित रूप से असंभव है, युद्ध में रहकर प्रेस की स्वतंत्रता और भाषण की स्वतंत्रता और सभा की स्वतंत्रता को बनाए रखना। वे स्वचालित रूप से गायब हो जाती हैं। और इसलिए कई देशों में जहाँ सामान्यतः वे सुरक्षित थीं, आज वे गायब हो गई हैं। अन्य देशों में, यहाँ तक कि युद्ध आने से पहले, न केवल प्रेस की स्वतंत्रता और सभा की स्वतंत्रता और भाषण की स्वतंत्रता गायब हो गई, बल्कि धर्म की स्वतंत्रता भी गायब हो गई। और इसलिए हम यहाँ इस देश में जानते हैं, हमें एक गंभीर जिम्मेदारी है। हम शांति में हैं। हमारे पास उन डर के लिए कोई कारण नहीं है जो दुनिया भर में tantos अन्य लोगों को प्रभावित करते हैं; इसलिए, हमें लोकतंत्र की स्वतंत्रताओं की रक्षा करनी चाहिए। —Eleanor Roosevelt, American Civil Liberties Union को दिए गए संबोधन में, शिकागो, इलिनोइस, 14 मार्च, 1940 उसकी स्पीच में, Eleanor Roosevelt ने नागरिक स्वतंत्रताओं के लिए पूर्व के खतरे का उल्लेख किसके द्वारा किया?\nA.विश्व युद्ध I\nB.न्यू डील\nC.शीत युद्ध\nD.महान मंदी\nकृपया केवल विकल्प A, B, C या D का उत्तर दें।\nउत्तर:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"आपको केवल विकल्प A, B, C या D के साथ जवाब देना है। \nउत्तर:"),
]
def get_few_shot_prompts_id(_hint, _prompt):
return [
dict(role='HUMAN', prompt="Ini adalah pertanyaan pilihan ganda. Jawablah pertanyaan ini dengan menjawab A, B, C, atau D.\nIni tentang pembelajaran mesin.\nPertanyaan: Dalam kue Yann LeCun, ceri di atas adalah:\nA. pembelajaran penguatan\nB. pembelajaran mandiri\nC. pembelajaran tak terawasi\nD. pembelajaran terawasi\nAnda hanya perlu menjawab dengan opsi A, B, C, atau D.\nJawaban:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt="Ini adalah pertanyaan pilihan ganda. Jawablah pertanyaan ini dengan menjawab A, B, C, atau D.\nIni tentang aljabar abstrak.\nPertanyaan: Temukan urutan maksimum yang mungkin untuk beberapa elemen dari Z_4 x Z_6.\nA. 4\nB. 6\nC. 12\nD. 24\nAnda hanya perlu menjawab dengan opsi A, B, C, atau D.\nJawaban:"),
dict(role='BOT', prompt="C"),
dict(role='HUMAN', prompt="Ini adalah pertanyaan pilihan ganda. Jawablah pertanyaan ini dengan menjawab A, B, C, atau D.\nIni tentang geografi sekolah menengah.\nPertanyaan: Agama mana di bawah ini yang merupakan agama universal?\nA. Taoisme\nB. Islam\nC. Shintoisme\nD. Konfusianisme\nAnda hanya perlu menjawab dengan opsi A, B, C, atau D.\nJawaban:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="Ini adalah pertanyaan pilihan ganda. Jawablah pertanyaan ini dengan menjawab A, B, C, atau D.\nIni tentang filsafat.\nPertanyaan: Socrates menyarankan bahwa yang suci adalah salah satu bagian dari:\nA. apa yang bijak.\nB. apa yang adil.\nC. apa yang indah.\nD. apa yang legal.\nAnda hanya perlu menjawab dengan opsi A, B, C, atau D.\nJawaban:"),
dict(role='BOT', prompt='B'),
dict(role='HUMAN', prompt="Ini adalah pertanyaan pilihan ganda. Jawab pertanyaan dengan menjawab A, B, C, atau D.\nIni tentang sejarah AS di sekolah menengah.\nPertanyaan: Pertanyaan ini merujuk pada informasi berikut. Namun, mungkin saya lebih sadar akan pentingnya kebebasan sipil pada momen tertentu dalam sejarah kita daripada siapa pun, karena saya berkeliling negara dan bertemu orang-orang serta melihat hal-hal yang terjadi pada orang-orang kecil, saya menyadari apa artinya bagi demokrasi untuk memelihara kebebasan sipil kita. Selama bertahun-tahun kita harus berjuang untuk kebebasan sipil, dan kita tahu bahwa ada kalanya cahaya menjadi redup, dan setiap kali itu terjadi, demokrasi berada dalam bahaya. Sekarang, sebagian besar karena keadaan dunia yang bermasalah secara keseluruhan, kebebasan sipil telah menghilang di banyak negara lain. Tentu saja, adalah mustahil untuk berperang dan tetap mempertahankan kebebasan pers, kebebasan berbicara, dan kebebasan berkumpul. Mereka menghilang secara otomatis. Dan jadi di banyak negara di mana biasanya mereka aman, hari ini mereka sudah hilang. Di negara-negara lain, bahkan sebelum perang datang, tidak hanya kebebasan pers dan kebebasan berkumpul, serta kebebasan berbicara yang hilang, tetapi kebebasan beragama juga hilang. Dan jadi kami tahu di negara ini, kami memiliki tanggung jawab yang berat. Kami berada dalam keadaan damai. Kami tidak punya alasan untuk ketakutan yang mengatur begitu banyak orang di seluruh dunia; oleh karena itu, kami harus menjaga kebebasan demokrasi. —Eleanor Roosevelt, Pidato kepada Asosiasi Kebebasan Sipil Amerika, Chicago, Illinois, 14 Maret 1940\nDalam pidatonya, Eleanor Roosevelt merujuk pada ancaman sebelumnya terhadap kebebasan sipil yang diciptakan oleh mana di antara berikut ini?\nA. Perang Dunia I\nB. New Deal\nC. Perang Dingin\nD. Depresi Besar\nAnda hanya perlu menjawab dengan opsi A, B, C, atau D.\nJawaban:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"Anda hanya perlu menjawab dengan opsi A, B, C, atau D. \nJawaban:"),
]
def get_few_shot_prompts_it(_hint, _prompt):
return [
dict(role='HUMAN', prompt="Si tratta di una domanda a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.\nÈ riguardo al machine learning.\nDomanda: Nella torta di Yann LeCun, la ciliegina sulla torta è:\nA. apprendimento per rinforzo\nB. apprendimento auto-supervisionato\nC. apprendimento non supervisionato\nD. apprendimento supervisionato\nÈ sufficiente rispondere con l'opzione A, B, C o D.\nRisposta:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt="Si tratta di una domanda a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.\nÈ riguardo all'algebra astratta.\nDomanda: Trova l'ordine massimo possibile per alcuni elementi di Z_4 x Z_6.\nA. 4\nB. 6\nC. 12\nD. 24\nPensiamo passo dopo passo.\nRisposta:"),
dict(role='BOT', prompt="C"),
dict(role='HUMAN', prompt="Si tratta di una domanda a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.\nÈ riguardo alla geografia delle scuole superiori.\nDomanda: Quale religione qui sotto è una religione universalista?\nA. Taoismo\nB. Islam\nC. Shintoismo\nD. Confucianesimo\nÈ sufficiente rispondere con l'opzione A, B, C o D.\nRisposta:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="Si tratta di una domanda a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.\nÈ riguardo alla filosofia.\nDomanda: Socrate suggerisce che il sacro è una parte di:\nA. ciò che è prudente.\nB. ciò che è giusto.\nC. ciò che è bello.\nD. ciò che è legale.\nÈ sufficiente rispondere con l'opzione A, B, C o D.\nRisposta:"),
dict(role='BOT', prompt='B'),
dict(role='HUMAN', prompt="Questa è una domanda a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.\nRiguarda la storia degli Stati Uniti delle scuole superiori.\nDomanda: Questa domanda si riferisce alle seguenti informazioni. Tuttavia, forse sono più consapevole dell'importanza delle libertà civili in questo particolare momento della nostra storia rispetto a chiunque altro, perché viaggio per il paese, incontro persone e vedo cose che sono accadute a persone comuni, mi rendo conto di cosa significhi per la democrazia preservare le nostre libertà civili. Negli anni abbiamo dovuto combattere per la libertà civile e sappiamo che ci sono momenti in cui la luce si fa piuttosto fioca, e ogni volta che ciò accade, la democrazia è in pericolo. Ora, principalmente a causa dello stato travagliato del mondo nel suo insieme, le libertà civili sono scomparse in molti altri paesi. È impossibile, naturalmente, essere in guerra e mantenere la libertà di stampa, la libertà di parola e la libertà di riunione. Esse scompaiono automaticamente. E così, in molti paesi dove normalmente erano sicure, oggi sono svanite. In altri paesi, anche prima che arrivasse la guerra, non solo la libertà di stampa e la libertà di riunione, e la libertà di parola sono scomparse, ma anche la libertà di religione è scomparsa. E così sappiamo qui in questo paese, abbiamo una grave responsabilità. Siamo in pace. Non abbiamo ragione per le paure che governano così tante altre persone nel mondo; pertanto, dobbiamo difendere le libertà della democrazia. —Eleanor Roosevelt, Discorso all'Unione Americana per le Libertà Civili, Chicago, Illinois, 14 marzo 1940.\nNel suo discorso, Eleanor Roosevelt alluse alla minaccia precedente alle libertà civili creata da quale delle seguenti?\nA. Prima Guerra Mondiale\nB. Il New Deal\nC. Guerra Fredda\nD. Grande Depressione\nÈ sufficiente rispondere con l'opzione A, B, C o D.\nRisposta:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"È sufficiente rispondere con l'opzione A, B, C o D. \nRisposta:"),
]
def get_few_shot_prompts_ja(_hint, _prompt):
return [
dict(role='HUMAN', prompt="これは単一選択の質問です。A、B、C、またはDで回答してください。\nこれは機械学習に関するものです。\n質問:ヤン・ルカンのケーキにおいて、一番上のチェリーは:\nA. 強化学習\nB. 自己監督学習\nC. 教師なし学習\nD. 教師あり学習\n選択肢A、B、C、またはDで答えるだけで大丈夫です。\n回答:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt="これは単一選択の質問です。A、B、C、またはDで回答してください。\nこれは抽象代数学に関するものです。\n質問Z_4 x Z_6 のいくつかの要素の最大可能な順序を求めなさい。\nA. 4\nB. 6\nC. 12\nD. 24\n選択肢A、B、C、またはDで答えるだけで大丈夫です。\n回答:"),
dict(role='BOT', prompt="C"),
dict(role='HUMAN', prompt="これは単一選択の質問です。A、B、C、またはDで回答してください。\nこれは高校の地理に関するものです。\n質問:以下のどの宗教が普遍化宗教ですか?\nA. 道教\nB. イスラム教\nC. 神道\nD. 儒教\n選択肢A、B、C、またはDで答えるだけで大丈夫です。\n回答:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="これは単一選択の質問です。A、B、C、またはDで回答してください。\nこれは哲学に関するものです。\n質問:ソクラテスは、聖なるものが以下のどれの一部であると示唆していますか?\nA. 賢明なもの\nB. 正義のあるもの\nC. 美しいもの\nD. 合法的なもの\n選択肢A、B、C、またはDで答えるだけで大丈夫です。\n回答:"),
dict(role='BOT', prompt='B'),
dict(role='HUMAN', prompt="これは単一選択の質問です。A、B、C、またはDで回答してください。\nこれはアメリカの歴史についてです。\n質問:この質問は次の情報を参照しています。しかし、私はこの特定の歴史の瞬間における市民の自由の重要性に他の誰よりも敏感であるかもしれません。なぜなら、私は国を旅し、人々に会い、小さな人々に起こったことを見てきたからです。民主主義が市民の自由を守ることが何を意味するのかを理解しています。私たちは市民の自由のために戦わなければならなかった年月があり、光がかなり薄暗くなる時期があることを知っています。そのたびに民主主義は危険にさらされます。今、主に世界全体の不安定な状態によって、多くの他の国で市民の自由が消失しています。もちろん、戦争をしていては報道の自由、言論の自由、集会の自由を保つことは不可能です。それらは自動的に消えてしまいます。そして、通常は安全であった多くの国では、今日、これらはなくなりました。他の国々では、戦争が来る前から、報道の自由や集会の自由、言論の自由だけでなく、宗教の自由も消えていました。したがって、私たちはこの国で重大な責任を負っていることを知っています。私たちは平和です。他の国々の多くの人々が抱える恐れの理由はないので、私たちは民主主義の自由を守らなければなりません。 —エレア・ルーズベルト、1940年3月14日イリイ州シカゴでのアメリカ市民自由連合への演説\n彼女の演説で、エレノア・ルーズベルトは市民の自由に対する以前の脅威をどのように言及しましたか?\nA. 第一次世界大戦\nB. ニューディール\nC. 冷戦\nD. 大恐慌\n選択肢A、B、C、またはDで答えるだけで大丈夫です。\n回答:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"選択肢A、B、C、またはDで答えるだけで大丈夫です。\n回答:"),
]
def get_few_shot_prompts_ko(_hint, _prompt):
return [
dict(role='HUMAN', prompt="단일 선택 질문입니다. A, B, C 또는 D로 답하십시오.\n이 질문은 기계 학습에 관한 것입니다.\n질문: 얀 르쿤의 케이크에서 가장 위에 있는 체리는:\nA. 강화 학습\nB. 자기 지도 학습\nC. 비지도 학습\nD. 지도 학습\n옵션 A, B, C 또는 D로만 대답하면 됩니다. \n답변:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt="단일 선택 질문입니다. A, B, C 또는 D로 답하십시오.\n이 질문은 추상 대수학에 관한 것입니다.\n질문: Z_4 x Z_6의 어떤 요소의 최대 가능한 순서를 찾으세요.\nA. 4\nB. 6\nC. 12\nD. 24\n옵션 A, B, C 또는 D로만 대답하면 됩니다. \n답변:"),
dict(role='BOT', prompt="C"),
dict(role='HUMAN', prompt="단일 선택 질문입니다. A, B, C 또는 D로 답하십시오.\n이 질문은 고등학교 지리에 관한 것입니다.\n질문: 아래의 어떤 종교가 보편화 종교입니까?\nA. 도교\nB. 이슬람교\nC. 신도\nD. 유교\n옵션 A, B, C 또는 D로만 대답하면 됩니다. \n답변:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="단일 선택 질문입니다. A, B, C 또는 D로 답하십시오.\n이 질문은 철학에 관한 것입니다.\n질문: 소크라테스는 신성한 것이 다음 중 어떤 것의 일부라고 제안합니까?\nA. 신중한 것\nB. 정의로운 것\nC. 아름다운 것\nD. 합법적인 것\n옵션 A, B, C 또는 D로만 대답하면 됩니다. \n답변:"),
dict(role='BOT', prompt='B'),
dict(role='HUMAN', prompt="이것은 단일 선택 질문입니다. A, B, C 또는 D로 답하십시오.\n이는 미국 역사에 관한 것입니다.\n질문: 이 질문은 다음 정보를 참조합니다. 그러나 어쩌면 나는 이 특정 역사적 순간에 시민 권리의 중요성에 대해 다른 누구보다 더 잘 인식하고 있습니다. 왜냐하면 나는 나라를 여행하며 사람들을 만나고 작은 사람들에게 일어난 일들을 보았기 때문입니다. 나는 민주주의가 시민 권리를 보존한다는 것이 무엇을 의미하는지 깨닫습니다. 우리는 시민 권리를 위해 싸워야 했던 여러 해를 거쳐 왔으며, 빛이 흐려지는 순간이 있음을 알고 있습니다. 그럴 때마다 민주주의가 위험에 처한 것처럼 느껴집니다. 지금은 전 세계의 불안정한 상태로 인해 많은 다른 국가에서 시민 권리가 사라졌습니다. 전쟁 중에 언론의 자유와 표현의 자유, 집회의 자유를 유지하는 것은 불가능합니다. 그것들은 자동으로 사라집니다. 그리고 따라서 일반적으로 안전했던 많은 국가에서는 오늘날 그것들이 사라졌습니다. 다른 국가에서는 전쟁이 오기 전에도 언론의 자유와 집회의 자유, 표현의 자유가 사라졌을 뿐만 아니라 종교의 자유도 사라졌습니다. 그래서 우리는 이 나라에서 중대한 책임을 지고 있다는 것을 압니다. 우리는 평화로운 상태에 있습니다. 전 세계의 많은 사람들이 느끼는 두려움에 대한 이유가 없으므로 우리는 민주주의의 자유를 지켜야 합니다. —엘리노르 루즈벨트, 1940년 3월 14일 일리노이주 시카고에서 미국 시민 자유 연합에 대한 연설\n그녀의 연설에서 엘리노르 루즈벨트는 시민 권리에 대한 이전의 위협이 어떤 것에 의해 발생했는지 언급했습니다.\nA. 제1차 세계 대전\nB. 뉴딜\nC. 냉전\nD. 대공황\n옵션 A, B, C 또는 D로만 대답하면 됩니다. \n답변:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"옵션 A, B, C 또는 D로만 대답하면 됩니다. \n답변:"),
]
def get_few_shot_prompts_pt(_hint, _prompt):
return [
dict(role='HUMAN', prompt="Esta é uma pergunta de escolha única. Responda à pergunta respondendo A, B, C ou D.\nÉ sobre aprendizado de máquina.\nPergunta: No bolo de Yann LeCun, a cereja no topo é:\nA. aprendizado por reforço\nB. aprendizado auto-supervisionado\nC. aprendizado não supervisionado\nD. aprendizado supervisionado\nVocê só precisa responder com a opção A, B, C ou D.\nResposta:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt="Esta é uma pergunta de escolha única. Responda à pergunta respondendo A, B, C ou D.\nÉ sobre álgebra abstrata.\nPergunta: Encontre a ordem máxima possível para algum elemento de Z_4 x Z_6.\nA. 4\nB. 6\nC. 12\nD. 24\nVocê só precisa responder com a opção A, B, C ou D.\nResposta:"),
dict(role='BOT', prompt="C"),
dict(role='HUMAN', prompt="Esta é uma pergunta de escolha única. Responda à pergunta respondendo A, B, C ou D.\nÉ sobre geografia do ensino médio.\nPergunta: Qual religião abaixo é uma religião universalizante?\nA. Taoísmo\nB. Islamismo\nC. Xintoísmo\nD. Confucionismo\nVocê só precisa responder com a opção A, B, C ou D.\nResposta:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="Esta é uma pergunta de escolha única. Responda à pergunta respondendo A, B, C ou D.\nÉ sobre filosofia.\nPergunta: Sócrates sugere que o sagrado é uma parte de:\nA. o que é prudente.\nB. o que é justo.\nC. o que é belo.\nD. o que é legal.\nVocê só precisa responder com a opção A, B, C ou D.\nResposta:"),
dict(role='BOT', prompt='B'),
dict(role='HUMAN', prompt="Esta é uma pergunta de escolha única. Responda à pergunta respondendo A, B, C ou D.\nÉ sobre história dos Estados Unidos do ensino médio.\nPergunta: Esta pergunta se refere à seguinte informação. Talvez, no entanto, eu esteja mais consciente da importância das liberdades civis neste momento particular da nossa história do que qualquer outra pessoa, porque eu viajo pelo país e conheço pessoas e vejo coisas que aconteceram com pessoas pequenas, percebo o que significa para a democracia preservar nossas liberdades civis. Ao longo dos anos, tivemos que lutar pela liberdade civil, e sabemos que há momentos em que a luz fica bastante fraca, e toda vez que isso acontece, a democracia está em perigo. Agora, em grande parte por causa do estado problemático do mundo como um todo, as liberdades civis desapareceram em muitos outros países. É impossível, é claro, estar em guerra e manter a liberdade de imprensa, a liberdade de expressão e a liberdade de reunião. Elas desaparecem automaticamente. E assim, em muitos países onde normalmente estavam seguras, hoje desapareceram. Em outros países, mesmo antes da guerra chegar, não apenas a liberdade de imprensa e a liberdade de reunião, e a liberdade de expressão desapareceram, mas a liberdade de religião também desapareceu. E assim, sabemos aqui neste país, temos uma grave responsabilidade. Estamos em paz. Não temos razão para os medos que governam tantas outras pessoas ao redor do mundo; portanto, temos que proteger as liberdades da democracia. —Eleanor Roosevelt, Discurso à União Americana pelas Liberdades Civis, Chicago, Illinois, 14 de março de 1940\nEm seu discurso, Eleanor Roosevelt aludiu à ameaça anterior às liberdades civis criada por qual das seguintes?\nA. Primeira Guerra Mundial\nB. O Novo Pacto\nC. A Guerra Fria\nD. A Grande Depressão\nVocê só precisa responder com a opção A, B, C ou D.\nResposta:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"Você só precisa responder com a opção A, B, C ou D. \nResposta:"),
]
def get_few_shot_prompts_zh(_hint, _prompt):
return [
dict(role='HUMAN', prompt="这是一个单项选择题。请回复 A、B、C 或 D。\n这是关于美国历史的。\n问题这个问题参考以下信息。或许我对我们历史这一特定时刻公民自由重要性的认识比其他任何人都要深刻因为我在全国各地旅行见到人们看到那些发生在小人物身上的事情我意识到民主意味着要保护我们的公民自由。在这些年里我们不得不为公民自由而奋斗我们知道有时光芒会变得微弱每当这种情况发生时民主就处于危险之中。现在主要由于整个世界的动荡状态许多其他国家的公民自由已经消失。当然在战争中保持新闻自由、言论自由和集会自由是不可能的。它们会自动消失。因此在许多通常是安全的国家里今天它们已经消失。在其他国家即使在战争到来之前不仅新闻自由、集会自由和言论自由消失了宗教自由也消失了。因此我们知道在这个国家我们有着重大的责任。我们处于和平状态。我们没有理由去害怕其他世界上许多人所感受到的恐惧因此我们必须保护民主的自由。——埃莉诺·罗斯福1940年3月14日在伊利诺伊州芝加哥的美国公民自由联盟演讲\n在她的演讲中,埃莉诺·罗斯福提到了哪一事件对公民自由造成了早期威胁?\nA. 第一次世界大战\nB. 新政\nC. 冷战\nD. 大萧条\n只需选择 A、B、C 或 D 来回答该问题。\n答案:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt="这是一个单项选择题。请回复 A、B、C 或 D。\n这是关于高中地理的。\n问题:以下哪个宗教是普世宗教?\nA. 道教\nB. 伊斯兰教\nC. 神道教\nD. 儒教\n只需选择 A、B、C 或 D 来回答该问题。\n答案:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="这是一个单项选择题。请回复 A、B、C 或 D。\n这是关于哲学的。\n问题:苏格拉底建议神圣是以下哪个部分:\nA. 什么是谨慎的。\nB. 什么是正义的。\nC. 什么是美的。\nD. 什么是合法的。\n只需选择 A、B、C 或 D 来回答该问题。\n答案:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="这是一个单项选择题。请回复 A、B、C 或 D。\n这是关于抽象代数的。\n问题:找到 Z_4 x Z_6 中某个元素的最大可能阶数。\nA. 4\nB. 6\nC. 12\nD. 24\n只需选择 A、B、C 或 D 来回答该问题。\n答案:"),
dict(role='BOT', prompt='C'),
dict(role='HUMAN', prompt="这是一个单项选择题。请回复 A、B、C 或 D。\n这是关于机器学习的。\n问题:在 Yann LeCun 的蛋糕中,最上面的樱桃是:\nA. 强化学习\nB. 自监督学习\nC. 无监督学习\nD. 监督学习\n只需选择 A、B、C 或 D 来回答该问题。\n答案:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"只需选择 A、B、C 或 D 来回答该问题。 \n回答:"),
]
def get_few_shot_prompts_sw(_hint, _prompt):
return [
dict(role='HUMAN', prompt="Kuna swali moja la chaguo. Jibu swali kwa kujibu A, B, C au D.\nit ni kuhusu filosofia\nSwali: Swali hili linahusiana na taarifa ifuatayo. Huenda, hata hivyo, mimi nina ufahamu zaidi wa umuhimu wa uhuru wa kiraia katika wakati huu maalum wa historia yetu kuliko mtu mwingine yeyote, kwa sababu nasafiri nchini na kukutana na watu na kuona mambo yanayotokea kwa watu wadogo, ninatambua inamaanisha nini kwa demokrasia kuhifadhi uhuru wetu wa kiraia. kwa kupitia miaka yote tumepaswa kupigania uhuru wa kiraia, na tunajua kuwa kuna nyakati ambapo mwanga unakuwa dhaifu, na kila wakati hii inatokea, demokrasia iko katika hatari. Sasa, hasa kwa sababu ya hali ya machafuko ya ulimwengu kwa jumla, uhuru wa kiraia umepotea katika nchi nyingi nyingine. Haiwezekani, kwa kweli, kuwa katika vita na kudumisha uhuru wa vyombo vya habari na uhuru wa kusema na uhuru wa kukusanyika. Vinapotea kiotomatiki. Na hivyo katika nchi nyingi ambapo kwa kawaida walikuwa salama, leo wameondoka. Katika nchi zingine, hata kabla ya vita kuja, si tu uhuru wa vyombo vya habari na uhuru wa kukusanyika, na uhuru wa kusema umepotea, bali pia uhuru wa dini umepotea. Na hivyo tunajua hapa katika nchi hii, tuna wajibu mzito. Tuko katika amani. Hatuna sababu ya hofu ambazo zinatawala watu wengi wengine duniani; kwa hiyo, ni lazima tulinde uhuru wa demokrasia. —Eleanor Roosevelt, Hotuba kwa Muungano wa Uhuru wa Kiraia wa Marekani, Chicago, Illinois, Machi 14, 1940 Katika hotuba yake, Eleanor Roosevelt alizungumzia tishio la awali kwa uhuru wa kiraia lililotolewa na ipi kati ya yafuatayo? \nA.Vita vya Kwanza vya Dunia\nB.Mkataba Mpya\nC.Vita vya Baridi\nD.Mapinduzi Makuu\nChagua tu A, B, C au D kujibu swali hili.\nJibu:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt="Kuna swali moja la chaguo. Jibu swali kwa kujibu A, B, C au D.\nit ni kuhusu jiografia ya shule ya sekondari\nSwali: Dini ipi hapa chini ni dini ya kueneza? \nA.Taoism\nB.Islam\nC.Shintoism\nD.Confucianism\nChagua tu A, B, C au D kujibu swali hili.\nJibu:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="Kuna swali moja la chaguo. Jibu swali kwa kujibu A, B, C au D.\nit ni kuhusu filosofia\nSwali: Socrates anapendekeza kwamba kitakatifu ni sehemu moja ya:\nA. kile kilicho busara.\nB. kile kilicho haki.\nC. kile kilicho kizuri.\nD. kile kilicho halali.\nChagua tu A, B, C au D kujibu swali hili.\nJibu:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="Kuna swali moja la chaguo. Jibu swali kwa kujibu A, B, C au D.\nit ni kuhusu algebra ya kiabstract\nSwali: Pata kipindi chenye uwezo mkubwa zaidi kwa kipengele baadhi ya Z_4 x Z_6.\nA.4\nB.6\nC.12\nD.24\nHebu tufikirie hatua kwa hatua\nJibu:"),
dict(role='BOT', prompt='C'),
dict(role='HUMAN', prompt="Kuna swali moja la chaguo. Jibu swali kwa kujibu A, B, C au D.\nit ni kuhusu kujifunza kwa mashine\nSwali: Katika keki ya Yann LeCun, cherii juu ni: \nA.kujifunza kwa nguvu\nB.kujifunza kwa kujisimamia\nC.kujifunza bila usimamizi\nD.kujifunza kwa usimamizi\nChagua tu A, B, C au D kujibu swali hili.\nJibu:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"Chagua tu A, B, C au D kujibu swali hili. \nJibu:"),
]
def get_few_shot_prompts_yo(_hint, _prompt):
return [
dict(role='HUMAN', prompt="Ibeere kan wa ti o ni yiyan kan. Fesi si ibeere naa nipa fesi A, B, C tabi D.\nit jẹ nipa filosofía\nIbeere: Ibeere yii tọka si alaye ti n bọ. Boya, sibẹsibẹ, Mo ni imọ diẹ sii nipa pataki awọn ominira ilu ni akoko pataki yii ti itan wa ju ẹnikẹni miiran lọ, nitori Mo n rin irin-ajo kọja ilẹ naa ati pe Mo pade awọn eniyan ati pe Mo ri awọn nkan ti o ti ṣẹlẹ si awọn eniyan kekere, Mo mọ ohun ti o tumọ si fun ijọba mimọ lati pa awọn ominira ilu wa. Ni gbogbo ọdun, a ti ni lati ja fun ominira ilu, ati pe a mọ pe awọn akoko wa nigba ti ina di dimu, ati nigbami ti eyi ṣẹlẹ, ijọba wa ni ewu. Bayi, ni pataki nitori ipo iṣoro agbaye ni apapọ, awọn ominira ilu ti parẹ ni ọpọlọpọ awọn orilẹ-ede miiran. O jẹ alailẹgbẹ, dajudaju, lati wa ni ogun ati ki o ṣetọju ominira iwe irohin ati ominira ẹtọ ati ominira apejọ. Wọn parẹ laifọwọyi. Ati pe nitorina ni ọpọlọpọ awọn orilẹ-ede nibiti wọn ti jẹ ailewu ni deede, loni wọn ti parẹ. Ni awọn orilẹ-ede miiran, paapaa ṣaaju ki ogun wa, kii ṣe nikan ominira iwe irohin ati ominira apejọ , ati ominira ẹtọ ti parẹ, ṣugbọn ominira ẹsin ti parẹ. Ati pe nitorina a mọ nibi ninu orilẹ-ede yii, a ni ojuse pataki. A wa ni alaafia. A ko ni idi fun awọn bẹru ti o ṣakoso ọpọlọpọ awọn eniyan miiran ni gbogbo agbaye; nitorina, a ni lati daabobo awọn ominira ti ijọba mimọ. —Eleanor Roosevelt, Ikede si American Civil Liberties Union, Chicago, Illinois, Oṣu Kẹta 14, 1940 Ninu ọrọ rẹ, Eleanor Roosevelt daba pe ewu ti tẹlẹ si awọn ominira ilu ṣẹda nipasẹ eyi ti o tẹle? \nA.Ija Agbaye I\nB.Iwọn Ilana Tuntun\nC.Ijakadi Tutu\nD.Ipe wọn nla\nKan yan A, B, C tabi D lati fesi si ibeere naa.\nFesi:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt="Ibeere kan wa ti o ni yiyan kan. Fesi si ibeere naa nipa fesi A, B, C tabi D.\nit jẹ nipa геography ile-iwe gíga\nIbeere: Igbagbọ wo ni isalẹ jẹ igbagbọ agbaye? A.Taoism\n B.Islam\n C.Shintoism\n D.Confucianism\nKan yan A, B, C tabi D lati fesi si ibeere naa.\nFesi:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="Ibeere kan wa ti o ni yiyan kan. Fesi si ibeere naa nipa fesi A, B, C tabi D.\nit jẹ nipa filosofía\nIbeere: Socrates daba pe mímọ̀ jẹ apakan kan ti:\nA. ohun ti o jẹ ọlọgbọn.\nB. ohun ti o jẹ ododo.\nC. ohun ti o jẹ ẹwa.\nD. ohun ti o jẹ ofin.\nKan yan A, B, C tabi D lati fesi si ibeere naa.\nFesi:"),
dict(role='BOT', prompt="B"),
dict(role='HUMAN', prompt="Ibeere kan wa ti o ni yiyan kan. Fesi si ibeere naa nipa fesi A, B, C tabi D.\nit jẹ nipa aljebra igba\nIbeere: Wa aṣẹ to pọju fun diẹ ninu awọn eroja ti Z_4 x Z_6.\n A.4\nB.6\nC.12\nD.24\nKan yan A, B, C tabi D lati fesi si ibeere naa.\nFesi:"),
dict(role='BOT', prompt='C'),
dict(role='HUMAN', prompt="Ibeere kan wa ti o ni yiyan kan. Fesi si ibeere naa nipa fesi A, B, C tabi D.\nit jẹ nipa ikẹkọ ẹrọ\nIbeere: Ninu akara Yann LeCun, eso cherry lori oke ni: \nA.ikẹkọ imudara\nB.ikẹkọ ara-ṣaaju\nC.ikẹkọ aibojumu\nD.ikẹkọ ti a fojusi\nKan yan A, B, C tabi D lati fesi si ibeere naa.\nFesi:"),
dict(role='BOT', prompt='A'),
dict(role='HUMAN', prompt=f'{_hint}\n{_prompt}\n'+"Kan yan A, B, C tabi D lati fesi si ibeere naa. \nFesi:"),
]

View File

@ -0,0 +1,41 @@
# MMMLU-Lite
## Introduction
A lite version of the MMMLU dataset, maintained as a community release by [OpenCompass](https://github.com/open-compass/opencompass). Because the original dataset is large (about 200k questions), we created this lite version to make it easier to use. We sample 25 examples from each subject of every language with a fixed seed to ensure reproducibility, which leaves 19,950 examples in the lite version, about 10% of the original dataset.
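For reference, the arithmetic behind that figure is 14 locales × 57 subjects × 25 samples = 19,950. The snippet below is only an illustrative sketch of the sampling scheme, not the exact script used to build the release; the seed value and the `Subject` column name are assumptions here.
```python
import random

from datasets import load_dataset

SEED = 0  # assumed value; the seed used for the actual release is not stated here
rng = random.Random(SEED)

ds = load_dataset("openai/MMMLU", "by_language")
lite = {}
for locale, split in ds.items():
    # group this locale's questions by subject ("Subject" is an assumed column name)
    by_subject = {}
    for row in split:
        by_subject.setdefault(row["Subject"], []).append(row)
    # draw 25 questions per subject, reproducibly, with the fixed seed
    lite[locale] = [q for rows in by_subject.values() for q in rng.sample(rows, 25)]

print(sum(len(v) for v in lite.values()))  # 14 locales x 57 subjects x 25 = 19950
```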
## Dataset Description
Multilingual Massive Multitask Language Understanding (MMMLU)
The MMLU is a widely recognized benchmark of general knowledge attained by AI models. It covers a broad range of topics from 57 different categories, spanning elementary-level knowledge up to advanced professional subjects like law, physics, history, and computer science.
We translated the MMLU's test set into 14 languages using professional human translators. Relying on human translators for this evaluation increases confidence in the accuracy of the translations, especially for low-resource languages like Yoruba. We are publishing the professional human translations and the code we use to run the evaluations.
This effort reflects our commitment to improving the multilingual capabilities of AI models, ensuring they perform accurately across languages, particularly for underrepresented communities. By prioritizing high-quality translations, we aim to make AI technology more inclusive and effective for users worldwide.
MMMLU contains the MMLU test set translated into the following locales:
- AR_XY (Arabic)
- BN_BD (Bengali)
- DE_DE (German)
- ES_LA (Spanish)
- FR_FR (French)
- HI_IN (Hindi)
- ID_ID (Indonesian)
- IT_IT (Italian)
- JA_JP (Japanese)
- KO_KR (Korean)
- PT_BR (Brazilian Portuguese)
- SW_KE (Swahili)
- YO_NG (Yoruba)
- ZH_CN (Simplified Chinese)
## How to Use
Download the dataset from this [link](https://hf-mirror.com/datasets/openai/MMMLU), or load it directly with the `datasets` library:
```python
from datasets import load_dataset

# load every locale in a single configuration
ds = load_dataset("openai/MMMLU", "default")

# or load the test set grouped by language
ds = load_dataset("openai/MMMLU", "by_language")
```
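To run the lite sets inside OpenCompass, a run config only needs to import `mmmlu_lite_datasets` (defined by the `mmmlu_lite_gen` config in this directory) next to a model config. This is a minimal sketch; the import path and the commented model config are placeholders, so match them to where these files live in your checkout.
```python
from mmengine.config import read_base

with read_base():
    # Illustrative path - point it at the mmmlu_lite dataset config in your checkout.
    from .datasets.mmmlu_lite.mmmlu_lite_gen import mmmlu_lite_datasets
    # from .models.hf_internlm.hf_internlm2_5_7b_chat import models  # placeholder model config

datasets = mmmlu_lite_datasets
```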

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .mmmlu_lite_gen_c51a84 import mmmlu_lite_datasets # noqa: F401, F403

View File

@ -0,0 +1,106 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import MMMLULiteDataset
from opencompass.utils.text_postprocessors import first_option_postprocess
mmmlu_lite_reader_cfg = dict(
input_columns=['input', 'A', 'B', 'C', 'D','subject'],
output_column='target',
train_split='test')
mmmlu_lite_all_sets = [
'mmlu_lite_AR-XY',
'mmlu_lite_BN-BD',
'mmlu_lite_DE-DE',
'mmlu_lite_ES-LA',
'mmlu_lite_FR-FR',
'mmlu_lite_HI-IN',
'mmlu_lite_ID-ID',
'mmlu_lite_IT-IT',
'mmlu_lite_JA-JP',
'mmlu_lite_KO-KR',
'mmlu_lite_PT-BR',
'mmlu_lite_SW-KE',
'mmlu_lite_YO-NG',
'mmlu_lite_ZH-CN',
]
mmmlu_lite_datasets = []
for _name in mmmlu_lite_all_sets:
if 'AR' in _name:
_hint = f'هناك سؤال اختيار واحد. أجب عن السؤال بالرد على A أو B أو C أو D, يرجى استخدام واحدة من الرموز A، B، C، أو D لتمثيل خيارات الإجابة في ردك'
_prompt = f'يتعلق بـ {{subject}} \nالسؤال: {{input}}\nأ. {{A}}\nب. {{B}}\nج. {{C}}\nد. {{D}}\nالإجابة:'
elif 'BN' in _name:
_hint = f'এটি একটি একক পছন্দের প্রশ্ন। এ, বি, সি বা ডি উত্তর দিয়ে প্রশ্নের উত্তর দিন।, আপনার উত্তরে ইংরেজি বর্ণ A, B, C এবং D এর মধ্যে একটি ব্যবহার করুন'
_prompt = f'এটি {{subject}} এর সম্পর্কে \nপ্রশ্ন: {{input}}\nএ. {{A}}\nবি. {{B}}\nসি. {{C}}\nডি. {{D}}\nউত্তর:'
elif 'DE' in _name:
_hint = f'Es gibt eine Einzelwahlfrage. Beantworte die Frage, indem du A, B, C oder D antwortest.'
_prompt = f'Es geht um {{subject}} \nFrage: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAntwort:'
elif 'ES' in _name:
_hint = f'Hay una pregunta de elección única. Responde a la pregunta respondiendo A, B, C o D.'
_prompt = f'Se trata de {{subject}} \nPregunta: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRespuesta:'
elif 'FR' in _name:
_hint = f'Il y a une question à choix unique. Répondez à la question en répondant A, B, C ou D.'
_prompt = f'''C'est à propos de {{subject}} \nQuestion : {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRéponse :'''
elif 'HI' in _name:
_hint = f'यह एक एकल विकल्प प्रश्न है। प्रश्न का उत्तर A, B, C या D में से कोई भी उत्तर देकर दें।'
_prompt = f'यह {{subject}} के बारे में है \nप्रश्न: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nउत्तर:'
elif 'ID' in _name:
_hint = f'Ada pertanyaan pilihan tunggal. Jawablah pertanyaan dengan menjawab A, B, C, atau D.'
_prompt = f'Ini tentang {{subject}} \nPertanyaan: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nJawaban:'
elif 'IT' in _name:
_hint = f'Ci sono domande a scelta singola. Rispondi alla domanda rispondendo A, B, C o D.'
_prompt = f'Si tratta di {{subject}} \nDomanda: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nRisposta:'
elif 'JA' in _name:
_hint = f'単一選択肢の質問があります。この質問にはA、B、C、またはDで答えてください。'
_prompt = f'これは {{subject}} に関することです \n質問: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n回答:'
elif 'KO' in _name:
_hint = f'단일 선택 질문이 있습니다. A, B, C 또는 D로 답변해 주세요.'
_prompt = f'이것은 {{subject}}에 관한 것입니다 \n질문: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n답변:'
elif 'PT' in _name:
_hint = f'Há uma pergunta de escolha única. Responda à pergunta escolhendo A, B, C ou D.'
_prompt = f'É sobre {{subject}} \nPergunta: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nResposta:'
elif 'ZH' in _name:
_hint = f'这里有一个单项选择题。请通过选择 A、B、C 或 D 来回答该问题。'
_prompt = f'这是关于 {{subject}} 的内容\n问题:{{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:'
else:
_hint = f'There is a single choice question. Answer the question by replying A, B, C or D.'
_prompt = f'it is about {{subject}} \nQuestion: {{input}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nAnswer:'
mmmlu_lite_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin='</E>',
round=[
dict(
role='HUMAN',
prompt=f'{_hint}\n {_prompt}'
),
],
),
ice_token='</E>',
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
mmmlu_lite_eval_cfg = dict(
evaluator=dict(type=AccwithDetailsEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
mmmlu_lite_datasets.append(
dict(
abbr=f'openai_m{_name}',
type=MMMLULiteDataset,
        # path='opencompass/mmmlu_lite',
        path='./data/mmmlu_lite',
name=f'openai_m{_name}',
reader_cfg=mmmlu_lite_reader_cfg,
infer_cfg=mmmlu_lite_infer_cfg,
eval_cfg=mmmlu_lite_eval_cfg,
))
del _name, _hint, _prompt
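For orientation, here is a minimal sketch of how the `mmmlu_lite_datasets` list defined above could be pulled into an evaluation config through mmengine's `read_base` mechanism; the relative import path and the surrounding eval config are assumptions for illustration, not part of this commit.

```python
# Hypothetical eval config sketch: aggregate the mmmlu_lite dataset configs
# defined above so they can be passed to an OpenCompass run.
from mmengine.config import read_base

with read_base():
    # Assumed import path for the config above; adjust to its real location.
    from .mmmlu_lite_gen import mmmlu_lite_datasets  # noqa: F401, F403

# Collect all per-language subsets into the run's dataset list.
datasets = [*mmmlu_lite_datasets]
```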

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
from opencompass.summarizers import AlignmentBenchSummarizer
from opencompass.datasets import AlignmentBenchDataset
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
@ -46,7 +47,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
@ -63,4 +63,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
))

View File

@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
output_column='judge',
)
subjective_all_sets = [
'alignment_bench',
]
data_path ='data/subjective/alignment_bench'
alignment_bench_config_path = 'data/subjective/alignment_bench/config'
alignment_bench_config_name = 'multi-dimension'
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
path=data_path,
name=_name,
alignment_bench_config_path=alignment_bench_config_path,
alignment_bench_config_name=alignment_bench_config_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
from opencompass.summarizers import AlignmentBenchSummarizer
from opencompass.datasets import AlignmentBenchDataset
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
@ -46,7 +47,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
@ -63,4 +63,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=AlignmentBenchSummarizer, judge_type='general')
))

View File

@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset, alignbench_postprocess
subjective_reader_cfg = dict(
input_columns=['question', 'capability', 'critiquellm_prefix'],
output_column='judge',
)
subjective_all_sets = [
'alignment_bench_v1_1', # Changed to Alignbench_v1_1 since 06/15/2024, refer to https://github.com/THUDM/AlignBench
]
data_path ='data/subjective/alignment_bench'
alignment_bench_config_path = 'data/subjective/alignment_bench/config'
alignment_bench_config_name = 'multi-dimension'
alignbench_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n'
),
]),
),
dict_postprocessor=dict(type=alignbench_postprocess, judge_type='general'),
),
pred_role='BOT',
)
alignbench_datasets.append(
dict(
abbr=f'{_name}',
type=AlignmentBenchDataset,
path=data_path,
name=_name,
alignment_bench_config_path=alignment_bench_config_path,
alignment_bench_config_name=alignment_bench_config_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlpacaEvalDataset, alpacaeval_postprocess
from opencompass.datasets import SubjectiveCmpDataset
from opencompass.summarizers import AlpacaSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -94,7 +95,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=alpacaeval_postprocess),
),
pred_role='BOT',
)
@ -102,7 +102,7 @@ for _name in subjective_all_sets:
alpacav2_datasets.append(
dict(
abbr=f'{_name}',
type=AlpacaEvalDataset,
type=SubjectiveCmpDataset,
path='./data/subjective/alpaca_eval',
name=_name,
reader_cfg=subjective_reader_cfg,
@ -111,5 +111,6 @@ for _name in subjective_all_sets:
mode='m2n',
infer_order='random',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}],
summarizer=dict(type=AlpacaSummarizer, judge_type='v2'),
))

View File

@ -0,0 +1,115 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlpacaEvalDataset, alpacaeval_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'alpaca_eval',
]
alpacav2_datasets = []
gpt4_prompt = """
I require a leaderboard for various large language models. I'll provide you with prompts given to these models and their corresponding outputs. Your task is to assess these responses, and select the model that produces the best output from a human perspective.
## Instruction
{
"instruction": "{question}",
}
## Model Outputs
Here are the unordered outputs from the models. Each output is associated with a specific model, identified by a unique model identifier.
{
{
"model_identifier": "m",
"output": "{prediction}"
},
{
"model_identifier": "M",
"output": "{prediction2}"
}
}
## Task
Evaluate the models based on the quality and relevance of their outputs, and select the model that generated the best output. Answer by providing the model identifier of the best model. We will use your output as the name of the best model, so make sure your output only contains one of the following model identifiers and nothing else (no quotes, no spaces, no new lines, ...): m or M.
## Best Model Identifier
"""
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
],
reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='You are a highly efficient assistant, who evaluates and selects the best large language model (LLMs) based on the quality of their responses to a given instruction. This process will be used to create a leaderboard reflecting the most accurate and human-preferred answers.')
],
round=[
dict(
role='HUMAN',
prompt = gpt4_prompt
),
]),
),
dict_postprocessor=dict(type=alpacaeval_postprocess),
),
pred_role='BOT',
)
alpacav2_datasets.append(
dict(
abbr=f'{_name}',
type=AlpacaEvalDataset,
path='./data/subjective/alpaca_eval',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='random',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/alpaca_eval/gpt4-turbo'}]
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import ArenaHardDataset, arenahard_postprocess
from opencompass.datasets import ArenaHardDataset
from opencompass.summarizers import ArenaHardSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -59,7 +60,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=arenahard_postprocess),
),
pred_role='BOT',
)
@ -76,5 +76,6 @@ for _name in subjective_all_sets:
mode='m2n',
infer_order='double',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}]
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}],
summarizer = dict(type=ArenaHardSummarizer),
))

View File

@ -0,0 +1,80 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import ArenaHardDataset, arenahard_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'arenahard',
]
arenahard_datasets = []
system_prompt = "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
judge_prompt = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's Answer|>\n{prediction}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{prediction2}\n<|The End of Assistant B's Answer|>"
gpt4 = [dict(
abbr='gpt4-0314',
)]
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt=system_prompt)
],
round=[
dict(
role='HUMAN',
prompt = judge_prompt
),
]),
),
dict_postprocessor=dict(type=arenahard_postprocess),
),
pred_role='BOT',
)
arenahard_datasets.append(
dict(
abbr='arenahard',
type=ArenaHardDataset,
path='./data/subjective/arena_hard',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
given_pred = [{'abbr':'gpt4-0314', 'path':'./data/subjective/arena_hard'}]
))

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset, compassarena_postprocess
from opencompass.datasets import CompassArenaDataset
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
@ -15,23 +15,29 @@ data_path ='data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答12平局
并提供你的解释原因
如果你认为回答1更好你的输出应形如
选择A
原因blahblah blahblah\n
如果你认为回答2更好你的输出应形如
选择B
原因blahblah blahblah\n
如果你认为回答12打成平手你的输出应形如
选择C
原因blahblah blahblah\n
@ -43,8 +49,10 @@ knowledge_prompt = """
1. 更好的回答能与参考答案吻合或表明参考答案的意思
2. 在都准确答对问题的前提下更好的回答能对知识点进行额外补充且补充的知识准确无误
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -56,8 +64,10 @@ language_prompt = """
1. 在有明确的参考答案的情况下越贴近参考答案或表明了参考答案的意思的回答越好
2. 更好的回答在语言表达上更流畅更加符合与人类对话的习惯包括语气情调等
3. 在都准确答对问题的前提下更好的回答能进行额外补充且补充的内容准确无误
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -69,8 +79,10 @@ math_prompt = """
1. 更好的回答的答案能和参考答案一致
2. 若两个回答的答案都与参考答案不一致则更好的回答的推理过程应更加合理
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
@ -83,6 +95,7 @@ creation_prompt = """
1. 好的回答必须首先符合用户问题里的各种需求不能跑题
2. 好的回答必须具有逻辑连贯性围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
""" + base_prompt
@ -120,7 +133,6 @@ for _name, _prompt in sub_map.items():
),
]),
),
dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True),
),
pred_role='BOT',
)
@ -137,6 +149,6 @@ for _name, _prompt in sub_map.items():
mode='m2n',
infer_order='double',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))

View File

@ -0,0 +1,142 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassArenaDataset, compassarena_postprocess
from opencompass.summarizers import CompassArenaSummarizer
subjective_reader_cfg = dict(
input_columns=['question', 'ref'],
output_column='judge',
)
data_path ='data/subjective/compass_arena'
compassarena_datasets = []
base_prompt = """
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答12平局
并提供你的解释原因
如果你认为回答1更好你的输出应形如
选择A
原因blahblah blahblah\n
如果你认为回答2更好你的输出应形如
选择B
原因blahblah blahblah\n
如果你认为回答12打成平手你的输出应形如
选择C
原因blahblah blahblah\n
"""
knowledge_prompt = """
请根据提供的 评分要求用户问题参考答案 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 更好的回答能与参考答案吻合或表明参考答案的意思
2. 在都准确答对问题的前提下更好的回答能对知识点进行额外补充且补充的知识准确无误
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
language_prompt = """
请根据提供的 评分要求用户问题 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 在有明确的参考答案的情况下越贴近参考答案或表明了参考答案的意思的回答越好
2. 更好的回答在语言表达上更流畅更加符合与人类对话的习惯包括语气情调等
3. 在都准确答对问题的前提下更好的回答能进行额外补充且补充的内容准确无误
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
math_prompt = """
请根据提供的 评分要求用户问题参考答案 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 更好的回答的答案能和参考答案一致
2. 若两个回答的答案都与参考答案不一致则更好的回答的推理过程应更加合理
3. 更好的回答更加符合与人类对话的习惯包括语气情调等
[用户问题]
{question}
[参考答案]
{ref}
""" + base_prompt
reason_prompt = math_prompt
creation_prompt = """
请根据提供的 评分要求用户问题 以及 相应的两个回答回答1回答2判断两个回答中哪一个更好
评分要求重要性依次递减:
1. 好的回答必须首先符合用户问题里的各种需求不能跑题
2. 好的回答必须具有逻辑连贯性围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度
[用户问题]
{question}
""" + base_prompt
sub_map = {'language': language_prompt, 'knowledge': knowledge_prompt, 'reason_v2': reason_prompt, 'math_v2': math_prompt, 'creationv2_zh': creation_prompt}
gpt4 = [dict(
abbr='gpt4-turbo',
)]
for _name, _prompt in sub_map.items():
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = _prompt
),
]),
),
dict_postprocessor=dict(type=compassarena_postprocess, summary_type='half_add', check_pos_bias=True),
),
pred_role='BOT',
)
compassarena_datasets.append(
dict(
abbr=f'compassarena_{_name}',
type=CompassArenaDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='m2n',
infer_order='double',
base_models=gpt4,
# summarizer = dict(type=CompassArenaSummarizer, summary_type='half_add'),
given_pred = [{'abbr':'gpt4-turbo', 'path':'./data/subjective/compass_arena/gpt4-turbo'}]
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -112,7 +113,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
@ -127,4 +127,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=FofoSummarizer, judge_type='general')
))

View File

@ -0,0 +1,130 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'fofo_test_prompts', 'fofo_test_prompts_cn',
]
base_prompt_en = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
base_prompt_cn = """
我希望你创建一个排行榜用于评估来自各种大型语言模型的回答格式的正确性为了完成这个任务你将需要分析给模型的文本提示以及它们对应的回答具体来说请确保你的评估输出正确地格式化为JSON字符串我将为此提供提示和回答
以下是提示内容
{
"instruction": "{question}",
}
以下是模型的输出结果
[
{
"model": "model",
"answer": "{prediction}"
},
]
请通过检查模型回答是否符合提示中声明的格式规范来评估模型回答的格式进行彻底的格式检查并提供格式正确或错误的详细解释你的反馈应包括模型的名称接着是格式正确性的状态'1'表示正确'0'表示错误将你的推理以每个评估模型的单个字符串中的 bullet 点形式呈现换句话说你应该生成以下输出
```json
[
{
'model': <模型名称>,
'format_correctness': <正确性>,
'reasons': <格式正确性的原因>
}
]
```
请注意你的回答应是一个正确格式化的JSON字符串不应包含任何额外的内容我们将在Python中直接将其作为JSON字符串加载
"""
fofo_datasets = []
for _name in subjective_all_sets:
if '_cn' in _name:
base_prompt = base_prompt_cn
else:
base_prompt = base_prompt_en
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = base_prompt
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
fofo_datasets.append(
dict(
abbr=f'{_name}',
type=FofoDataset,
path='./data/subjective/fofo',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from opencompass.datasets import FofoDataset
from opencompass.summarizers import FofoSummarizer
from mmengine.config import read_base
subjective_reader_cfg = dict(
@ -80,7 +81,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
@ -95,4 +95,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=FofoSummarizer, judge_type='general')
))

View File

@ -0,0 +1,98 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FofoDataset, fofo_postprocess
from mmengine.config import read_base
subjective_reader_cfg = dict(
input_columns=['question'],
output_column='judge',
)
subjective_all_sets = [
'fofo_test_prompts'
]
base_prompt = """
I would like you to create a leaderboard that evaluates the correctness of the format of answers from various large language models. To accomplish this, you will need to analyze the text prompts given to the models and their corresponding answers. Specifically, please ensure that your evaluation outputs are properly formatted as a json string. I will provide both the prompts and the responses for this purpose.
Here is the prompt:
{
"instruction": "{question}",
}
Here are the outputs of the models:
[
{
"model": "model",
"answer": "{prediction}"
},
]
Please evaluate the formatting of the model's responses by checking if they comply with the format specifications stated in the prompt. Perform a thorough format check and provide a detailed explanation for why the format is correct or incorrect. Your feedback should include the name of the model, followed by the format correctness status represented as '1' for correct and '0' for incorrect. Present your reasoning as bullet points within a single string for each model assessed. In other words, you should produce the following output:
```json
[
{
'model': <model-name>,
'format_correctness': <correctness>,
'reasons': <reasons-of-format-correctness>
}
]
```
Please note that your response should be a properly formatted JSON string and should not contain any additional content. We will load it directly as a JSON string in Python.
"""
fofo_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{question}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=4096),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt = base_prompt
),
]),
),
dict_postprocessor=dict(type=fofo_postprocess),
),
pred_role='BOT',
)
fofo_datasets.append(
dict(
abbr=f'{_name}',
type=FofoDataset,
path='./data/subjective/fofo',
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FollowBenchDataset, followbench_postprocess
from opencompass.datasets import FollowBenchDataset
from opencompass.summarizers import FollowBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['instruction', 'judge_prompt',],
@ -43,7 +44,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=followbench_postprocess),
),
pred_role='BOT',
)
@ -59,4 +59,5 @@ for _name in subjective_all_sets:
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
summarizer = dict(type=FollowBenchSummarizer,)
))

View File

@ -0,0 +1,62 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import FollowBenchDataset, followbench_postprocess
subjective_reader_cfg = dict(
input_columns=['instruction', 'judge_prompt',],
output_column='judge',
)
subjective_all_sets = [
'followbench_llmeval_cn', 'followbench_llmeval_en',
]
data_path ='data/subjective/followbench/converted_data'
followbench_llmeval_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt='{instruction}'
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt = '{judge_prompt}'
),
]),
),
dict_postprocessor=dict(type=followbench_postprocess),
),
pred_role='BOT',
)
followbench_llmeval_datasets.append(
dict(
abbr=f'{_name}',
type=FollowBenchDataset,
path=data_path,
name=_name,
mode='singlescore',
cate='llm',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
))

View File

@ -48,4 +48,4 @@ pip install -e .
python run.py configs/eval_hellobench.py
```
6. After that, you can find the results in outputs/hellobench/xxx/summary
6. After that, you can find the results in outputs/hellobench/xxx/summary

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBench101Dataset, mtbench101_postprocess
from opencompass.datasets import MTBench101Dataset
from opencompass.summarizers import MTBench101Summarizer
subjective_reader_cfg = dict(
input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'],
@ -45,7 +46,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=mtbench101_postprocess),
),
pred_role='BOT',
)
@ -60,4 +60,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=MTBench101Summarizer, judge_type='single')
))

View File

@ -0,0 +1,63 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBench101Dataset, mtbench101_postprocess
subjective_reader_cfg = dict(
input_columns=['dialogue','task','multi_id','turn_id','system_prompt','prompt_template'],
output_column='judge',
)
subjective_all_sets = [
'mtbench101',
]
data_path ='data/subjective/'
mtbench101_datasets = []
for _name in subjective_all_sets:
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=4096, infer_mode='last'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}')
],
round=[
dict(
role='HUMAN',
prompt = '{prompt_template}'
),
]),
),
dict_postprocessor=dict(type=mtbench101_postprocess),
),
pred_role='BOT',
)
mtbench101_datasets.append(
dict(
abbr=f'{_name}',
type=MTBench101Dataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,8 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBenchDataset, mtbench_postprocess
from opencompass.datasets import MTBenchDataset
from opencompass.summarizers import MTBenchSummarizer
subjective_reader_cfg = dict(
input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'],
@ -47,7 +48,6 @@ for _name in subjective_all_sets:
),
]),
),
dict_postprocessor=dict(type=mtbench_postprocess),
),
pred_role='BOT',
)
@ -62,4 +62,5 @@ for _name in subjective_all_sets:
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
summarizer = dict(type=MTBenchSummarizer, judge_type='single')
))

View File

@ -0,0 +1,65 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import MTBenchDataset, mtbench_postprocess
subjective_reader_cfg = dict(
input_columns=['dialogue', 'capability', 'system_prompt', 'prompt_template'],
output_column='judge',
)
subjective_all_sets = [
'mtbench_0.0','mtbench_0.1','mtbench_0.7'
]
data_path ='data/subjective/mtbench'
mtbench_datasets = []
for _name in subjective_all_sets:
temperature = float(_name.split('_')[1])
do_sample = False if temperature == 0.0 else True
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=1024, temperature=temperature, do_sample=do_sample,infer_mode='every'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}')
],
round=[
dict(
role='HUMAN',
prompt = '{prompt_template}'
),
]),
),
dict_postprocessor=dict(type=mtbench_postprocess),
),
pred_role='BOT',
)
mtbench_datasets.append(
dict(
abbr=f'{_name}',
type=MTBenchDataset,
path=data_path,
name=_name,
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
mode='singlescore',
))

View File

@ -2,7 +2,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WildBenchDataset, wildbench_postprocess
from opencompass.datasets import WildBenchDataset
from opencompass.summarizers import WildBenchPairSummarizer
subjective_reader_cfg = dict(
@ -30,7 +30,6 @@ subjective_eval_cfg = dict(
type=PromptTemplate,
template="""{prompt}"""
),
dict_postprocessor=dict(type=wildbench_postprocess),
),
pred_role='BOT',
)
@ -63,4 +62,5 @@ wildbench_datasets.append(
mode='m2n', # m models are pitted against n models
infer_order='random',
base_models = [llama_2_70b, gpt4, claude],
summarizer = dict(type=WildBenchPairSummarizer),
))

View File

@ -0,0 +1,66 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import ChatInferencer, GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import WildBenchDataset, wildbench_postprocess
from opencompass.summarizers import WildBenchPairSummarizer
subjective_reader_cfg = dict(
input_columns=['dialogue', 'prompt'],
output_column='judge',
)
data_path ='./data/subjective/WildBench/wildbench.jsonl'
wildbench_datasets = []
subjective_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{dialogue}"""
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=ChatInferencer, max_seq_len=4096, max_out_len=512, infer_mode='last'),
)
subjective_eval_cfg = dict(
evaluator=dict(
type=LMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template="""{prompt}"""
),
dict_postprocessor=dict(type=wildbench_postprocess),
),
pred_role='BOT',
)
gpt4 = dict(
abbr='gpt4-turbo',
)
claude = dict(
abbr='HaiKu',
)
llama_2_70b = dict(
abbr='llama-2-70b-chat-hf',
)
wildbench_datasets.append(
dict(
abbr='wildbench',
type=WildBenchDataset,
path=data_path,
eval_mode='pair',
reader_cfg=subjective_reader_cfg,
infer_cfg=subjective_infer_cfg,
eval_cfg=subjective_eval_cfg,
given_pred = [{'abbr': 'gpt4-turbo', 'path':'./data/subjective/WildBench/gpt4'},
{'abbr': 'llama-2-70b-chat-hf', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'HaiKu', 'path':'./data/subjective/WildBench/claude'},
{'abbr': 'llama-2-70b-chat-turbomind', 'path':'./data/subjective/WildBench/llama2-70b'},
{'abbr': 'llama-2-70b-chat-vllm', 'path':'./data/subjective/WildBench/llama2-70b'}],
        mode='m2n', # m models are pitted against n models
infer_order='random',
base_models = [llama_2_70b, gpt4, claude],
))

View File

@ -0,0 +1,5 @@
categories = ['mmlu_AR-XY','mmlu_BN-BD','mmlu_DE-DE','mmlu_ES-LA','mmlu_FR-FR','mmlu_HI-IN','mmlu_ID-ID','mmlu_IT-IT','mmlu_JA-JP','mmlu_KO-KR','mmlu_PT-BR','mmlu_SW-KE','mmlu_YO-NG','mmlu_ZH-CN']
mmmlu_summary_groups = [
{'name': 'mmmlu', 'subsets': [f'openai_m{c}' for c in categories]},
]

View File

@ -0,0 +1,25 @@
from mmengine.config import read_base
with read_base():
from .groups.mmmlu import mmmlu_summary_groups
summarizer = dict(
dataset_abbrs=[
'openai_mmmlu_AR-XY',
'openai_mmmlu_BN-BD',
'openai_mmmlu_DE-DE',
'openai_mmmlu_ES-LA',
'openai_mmmlu_FR-FR',
'openai_mmmlu_HI-IN',
'openai_mmmlu_ID-ID',
'openai_mmmlu_IT-IT',
'openai_mmmlu_JA-JP',
'openai_mmmlu_KO-KR',
'openai_mmmlu_PT-BR',
'openai_mmmlu_SW-KE',
'openai_mmmlu_YO-NG',
'openai_mmmlu_ZH-CN',
'mmmlu',
],
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)

View File

@ -0,0 +1,26 @@
categories = ['mmlu_lite_AR-XY','mmlu_lite_BN-BD','mmlu_lite_DE-DE','mmlu_lite_ES-LA','mmlu_lite_FR-FR','mmlu_lite_HI-IN','mmlu_lite_ID-ID','mmlu_lite_IT-IT','mmlu_lite_JA-JP','mmlu_lite_KO-KR','mmlu_lite_PT-BR','mmlu_lite_SW-KE','mmlu_lite_YO-NG','mmlu_lite_ZH-CN']
mmmlu_summary_groups = [
{'name': 'mmmlu_lite', 'subsets': [f'openai_m{c}' for c in categories]},
]
summarizer = dict(
dataset_abbrs=[
'openai_mmmlu_lite_AR-XY',
'openai_mmmlu_lite_BN-BD',
'openai_mmmlu_lite_DE-DE',
'openai_mmmlu_lite_ES-LA',
'openai_mmmlu_lite_FR-FR',
'openai_mmmlu_lite_HI-IN',
'openai_mmmlu_lite_ID-ID',
'openai_mmmlu_lite_IT-IT',
'openai_mmmlu_lite_JA-JP',
'openai_mmmlu_lite_KO-KR',
'openai_mmmlu_lite_PT-BR',
'openai_mmmlu_lite_SW-KE',
'openai_mmmlu_lite_YO-NG',
'openai_mmmlu_lite_ZH-CN',
'mmmlu_lite'
],
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)

View File

@ -81,6 +81,7 @@ from .mgsm import * # noqa: F401, F403
from .mmlu import * # noqa: F401, F403
from .mmlu_pro import * # noqa: F401, F403
from .MMLUArabic import * # noqa: F401, F403
from .mmmlu import * # noqa: F401, F403
from .multirc import * # noqa: F401, F403
from .narrativeqa import * # noqa: F401, F403
from .natural_question import * # noqa: F401, F403

View File

@ -0,0 +1,52 @@
# flake8: noqa
# yapf: disable
import json
import os
from datasets import Dataset, DatasetDict, load_dataset
from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path
from .base import BaseDataset
@LOAD_DATASET.register_module()
class MMMLUDataset(BaseDataset):
@staticmethod
def load(path: str, name: str):
dataset = DatasetDict()
subset = name.split('_')[1].replace('-', '_')
for split in ['test']:
data = load_dataset(path=path,
name=subset,
split=split,
trust_remote_code=True)
dataset_list = []
for item in data:
dataset_list.append({
'input': item['Question'],
'A': item['A'],
'B': item['B'],
'C': item['C'],
'D': item['D'],
'target': item['Answer'],
'subject': item['Subject'].replace('_', ' ')
})
dataset[split] = Dataset.from_list(dataset_list)
return dataset
@LOAD_DATASET.register_module()
class MMMLULiteDataset(BaseDataset):
@staticmethod
def load(path: str, name: str):
dataset = DatasetDict()
path = os.path.join(path, name + '.jsonl')
dataset_list = []
with open(path, 'r') as f:
dataset_list = [json.loads(line) for line in f.readlines()]
dataset['test'] = Dataset.from_list(dataset_list)
return dataset
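As a sanity check, here is a hedged sketch of the per-line record that `MMMLULiteDataset` is expected to read from `./data/mmmlu_lite/<name>.jsonl`, following the columns declared in `mmmlu_lite_reader_cfg` (`input`, `A`-`D`, `target`, `subject`); the concrete question and options are invented placeholders.

```python
import json

# Illustrative placeholder record; the field names mirror mmmlu_lite_reader_cfg,
# while the question text and options are made up for demonstration only.
sample = {
    'input': 'Which planet is known as the Red Planet?',
    'A': 'Venus',
    'B': 'Mars',
    'C': 'Jupiter',
    'D': 'Saturn',
    'target': 'B',
    'subject': 'astronomy',
}

# Each line of the .jsonl file would carry one such object.
print(json.dumps(sample, ensure_ascii=False))
```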

View File

@ -102,7 +102,13 @@ class TurboMindModelwithChatTemplate(BaseModel):
messages = _format_with_fast_chat_template(messages, self.fastchat_template)
else:
messages = [self.tokenizer.apply_chat_template(m, add_generation_prompt=True, tokenize=False) for m in messages]
# LMDeploy tokenizes prompts with AutoTokenizer, whose default is "add_special_tokens=True".
# OpenCompass already adds the bos_token to the prompt, so prompts should be tokenized with
# "add_special_tokens=False". Since LMDeploy's pipeline API does not expose that argument,
# we strip the bos_token from the messages as a workaround.
if self.tokenizer.bos_token:
bos_token = self.tokenizer.bos_token
messages = [message.removeprefix(bos_token) if message.startswith(bos_token) else message for message in messages]
stop_words = list(set(self.stop_words + stopping_criteria))
DEFAULT_GEN_CONFIG = {
@ -129,8 +135,7 @@ class TurboMindModelwithChatTemplate(BaseModel):
results = []
outputs = self.pipe(messages, gen_config=gen_config, do_preprocess=False)
for output in outputs:
text = self.tokenizer.decode(output.token_ids)
results.append(text)
results.append(output.text)
for s in stop_words:
results = [r.split(s)[0] for r in results]
@ -162,4 +167,4 @@ class TurboMindModelwithChatTemplate(BaseModel):
else:
filtered = {k: v for k, v in engine_config.items() if hasattr(PytorchEngineConfig, k)}
backend_config = PytorchEngineConfig(**filtered)
return pipeline(model_path, backend_config=backend_config, log_level='INFO', max_log_len=10)
return pipeline(model_path, backend_config=backend_config, log_level='WARNING')

View File

@ -245,7 +245,6 @@ class ChatInferencer(BaseInferencer):
logger.info('Starting inference process...')
for datum in tqdm(dataloader, disable=not self.is_main_process):
chat = datum[0]
if self.infer_mode == 'last':
self.infer_last(chat, index, output_handler)
elif self.infer_mode == 'every':

View File

@ -7,6 +7,7 @@ evaluate>=0.3.0
func_timeout
fuzzywuzzy
h5py
huggingface_hub<=0.24.7
immutabledict
jieba
json5