Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
Commit 53c6725d19 ("fix")
Parent: 51f5792f7c
@@ -30,10 +30,10 @@ aime2024_eval_cfg = dict(
aime2024_datasets = [
dict(
abbr='aime2024',
type=Aime2024Dataset,
path='opencompass/aime2024',
type=CustomDataset,
path='opencompass/aime2025',
reader_cfg=aime2024_reader_cfg,
infer_cfg=aime2024_infer_cfg,
eval_cfg=aime2024_eval_cfg
eval_cfg=aime2024_eval_cfg,
)
]
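For context, a dataset list such as aime2024_datasets is normally pulled into a run config through mmengine's read_base. A minimal sketch, assuming the config file above is importable from the packaged dataset configs; the exact module path is an assumption, not taken from this commit:

from mmengine.config import read_base

with read_base():
    # Assumed module path; substitute the actual location of the config above.
    from opencompass.configs.datasets.aime2024.aime2024_llmjudge_gen import \
        aime2024_datasets

# The run config then exposes the collected dataset entries.
datasets = [*aime2024_datasets]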
@@ -1,14 +1,26 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import Aime2024Dataset, MATHEvaluator, math_postprocess_v2
from opencompass.datasets import CustomDataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess

aime2024_reader_cfg = dict(input_columns=['question'], output_column='answer')

aime2024_reader_cfg = dict(
input_columns=['question'],
output_column='answer'
aime2024_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt='{question}\nRemember to put your final answer within \\boxed{}.',
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)

GRADER_TEMPLATE = """
@@ -19,35 +31,19 @@ GRADER_TEMPLATE = """
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.

5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.

<Original Question Begin>: {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Original Question Begin>: \n{question}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()

aime2024_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{question}\nPlease reason step by step, and put your final answer within \\boxed{}.'),
],
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=2048)
)

aime2024_eval_cfg = dict(
evaluator=dict(
type=GenericLLMEvaluator,
@@ -58,33 +54,31 @@ aime2024_eval_cfg = dict(
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
]),
),
dataset_cfg=dict(
type=Aime2024Dataset,
path='opencompass/aime2024',
type=CustomDataset,
path='opencompass/aime2025',
reader_cfg=aime2024_reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
)

aime2024_datasets = [
dict(
abbr='aime2024',
type=Aime2024Dataset,
path='opencompass/aime2024',
type=CustomDataset,
path='opencompass/aime2025',
reader_cfg=aime2024_reader_cfg,
infer_cfg=aime2024_infer_cfg,
eval_cfg=aime2024_eval_cfg
eval_cfg=aime2024_eval_cfg,
)
]
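In the eval_cfg above, GenericLLMEvaluator collects the judge's A/B verdicts and the dict_postprocessor turns them into a score. A minimal sketch of that kind of verdict-to-score conversion, assuming one judge reply per sample; this is illustrative and is not the actual implementation of generic_llmjudge_postprocess:

import re

def parse_judge_verdicts(judge_outputs):
    # Hypothetical sketch: count replies graded "A" (correct) versus "B"
    # (incorrect) and report an accuracy percentage.
    correct = 0
    for reply in judge_outputs:
        match = re.search(r'\b([AB])\b', reply.strip())
        if match and match.group(1) == 'A':
            correct += 1
    return {'accuracy': 100 * correct / max(len(judge_outputs), 1)}

print(parse_judge_verdicts(['A', 'B', 'A']))  # {'accuracy': 66.66...}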
@@ -4,7 +4,6 @@ from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import CMMLUDataset
from opencompass.utils.text_postprocessors import match_answer_pattern
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess

@@ -75,14 +74,12 @@ cmmlu_subject_mapping = {
'traditional_chinese_medicine': '中医中药',
'virology': '病毒学',
'world_history': '世界历史',
'world_religions': '世界宗教'
'world_religions': '世界宗教',
}

QUERY_TEMPLATE = """
你回答的最后一行**必须**是以下格式 '答案: $选项' (不带引号), 其中选项是ABCD之一.

{question}

A) {A}
B) {B}
C) {C}
@@ -97,14 +94,11 @@ GRADER_TEMPLATE = """
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.

<Original Question Begin>: \n {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
@@ -140,14 +134,13 @@ for _name in cmmlu_all_sets:
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
]),
),
dataset_cfg=dict(
type=CMMLUDataset,
@@ -157,7 +150,7 @@ for _name in cmmlu_all_sets:
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'
test_split='test',
),
),
judge_cfg=dict(),
@@ -175,10 +168,12 @@ for _name in cmmlu_all_sets:
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer',
train_split='dev',
test_split='test'),
test_split='test',
),
infer_cfg=cmmlu_infer_cfg,
eval_cfg=cmmlu_eval_cfg,
mode='singlescore',
))
)
)

del _name, _ch_name
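The per-subject entries are produced by the "for _name in cmmlu_all_sets" loop referenced in the hunk headers above. A minimal sketch of that pattern with two subjects only; the abbr prefix and dataset path are assumptions, not taken from this commit:

# Sketch: one dataset entry per subject, the same shape the real loop produces.
subject_mapping = {
    'virology': '病毒学',
    'world_history': '世界历史',
}

cmmlu_datasets_sketch = []
for _name, _ch_name in subject_mapping.items():
    cmmlu_datasets_sketch.append(
        dict(
            abbr=f'cmmlu-{_name}',        # assumed naming scheme
            path='opencompass/cmmlu',     # assumed dataset path
            name=_name,
            # the shared reader/infer/eval cfgs defined above would be reused here
            mode='singlescore',
        )
    )

print([d['abbr'] for d in cmmlu_datasets_sketch])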
@@ -2,8 +2,9 @@ from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import DropOpenAIDataset, DropOpenAIEvaluator, generic_llmjudge_postprocess
from opencompass.datasets import DropOpenAIDataset
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess

with read_base():
from .drop_examples import drop_examples # noqa: F401, F403
@@ -17,11 +18,6 @@ drop_reader_cfg = dict(

template = f'You will be asked to read a passage and answer a question. Some examples of passages and Q&A are provided below.\n\n{drop_examples}\n\n# Your Task\n\n---\n{{prompt}}\n\nThink step by step, then write a line of the form "Answer: $ANSWER" at the end of your response.'

drop_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=[dict(role='HUMAN', prompt=template)])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))

GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

@@ -30,22 +26,25 @@ GRADER_TEMPLATE = """
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.

<Original Question Begin>: {question}\n {options_str} \n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Original Question Begin>: {prompt}\n \n<Original Question End>\n\n
<Gold Target Begin>: \n{answers}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()

drop_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[dict(role='HUMAN', prompt=template)]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)

drop_eval_cfg = dict(
evaluator=dict(
@@ -57,14 +56,13 @@ drop_eval_cfg = dict(
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(
role='HUMAN',
prompt = GRADER_TEMPLATE
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
]),
),
dataset_cfg=dict(
type=DropOpenAIDataset,
@@ -76,8 +74,6 @@ drop_eval_cfg = dict(
),
pred_role='BOT',
)

drop_datasets = [
dict(
abbr='drop',
@@ -85,5 +81,6 @@ drop_datasets = [
path='data/drop_simple_eval/dev.jsonl',
reader_cfg=drop_reader_cfg,
infer_cfg=drop_infer_cfg,
eval_cfg=drop_eval_cfg)
eval_cfg=drop_eval_cfg,
)
]
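The drop template above relies on Python f-string escaping: {drop_examples} is substituted immediately, while the doubled braces in {{prompt}} survive as a literal {prompt} placeholder that is filled per example later. A minimal, self-contained illustration of that two-stage formatting (the example strings are stand-ins):

# Stand-in for the imported few-shot block.
drop_examples = 'Passage: ...\nQuestion: ...\nAnswer: ...'

# Stage 1: the f-string fills {drop_examples} now; {{prompt}} stays literal.
template = f'Some examples are provided below.\n\n{drop_examples}\n\n# Your Task\n\n---\n{{prompt}}'
assert '{prompt}' in template  # still a placeholder at this point

# Stage 2: per-example substitution, as the prompt templating would do later.
rendered = template.format(prompt='Passage: ... Question: How many yards ...?')
print(rendered.splitlines()[-1])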
@@ -1,10 +1,11 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccwithDetailsEvaluator
from opencompass.datasets import HellaswagDatasetwithICE, generic_llmjudge_postprocess
from opencompass.datasets import HellaswagDatasetwithICE
from opencompass.utils.text_postprocessors import first_option_postprocess
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets import generic_llmjudge_postprocess

hellaswag_reader_cfg = dict(
input_columns=['ctx', 'A', 'B', 'C', 'D'],
@@ -13,6 +14,14 @@ hellaswag_reader_cfg = dict(
test_split='val',
)

align_prompt = """Continue the following text without adding any additional information or formatting:
{ctx}
A) {A}
B) {B}
C) {C}
D) {D}
What is the right option?'"""

GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.

@@ -21,45 +30,27 @@ GRADER_TEMPLATE = """
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.

<Original Question Begin>: {question}\n {options_str} \n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Original Question Begin>: {ctx}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{label}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()

hellaswag_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=f'{{ctx}}\nA) {{A}}\nB) {{B}}\nC) {{C}}\nD) {{D}}\nWhat is the right option?'),
dict(role='BOT', prompt='{label}\n'),
]
),
),
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='HUMAN', prompt='Continue the following text without adding any additional information or formatting:\n'),
'</E>',
],
round=[
dict(role='HUMAN', prompt=f'{{ctx}}\nA) {{A}}\nB) {{B}}\nC) {{C}}\nD) {{D}}\nWhat is the right option?'),
dict(role='BOT', prompt='{label}\n'),
dict(role='HUMAN', prompt=align_prompt),
],
),
ice_token='</E>',
),
retriever=dict(type=FixKRetriever, fix_id_list=list(range(10))),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)

@@ -73,14 +64,13 @@ hellaswag_eval_cfg = dict(
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
]),
),
dataset_cfg=dict(
type=HellaswagDatasetwithICE,
@@ -90,8 +80,6 @@ hellaswag_eval_cfg = dict(
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),

pred_role='BOT',
)

hellaswag_datasets = [
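With ZeroRetriever the few-shot ice block is gone and align_prompt is rendered directly from each row's columns. A toy rendering with made-up values, assuming plain str.format-style substitution stands in for the real prompt templating:

# Sketch: how an align_prompt-style template gets filled from one HellaSwag row.
align_prompt = (
    'Continue the following text without adding any additional information or formatting:\n'
    '{ctx}\n'
    'A) {A}\n'
    'B) {B}\n'
    'C) {C}\n'
    'D) {D}\n'
    'What is the right option?'
)

row = dict(
    ctx='A man is sitting on a roof. He',
    A='starts pulling up roofing tiles.',
    B='is using wrap to wrap a pair of skis.',
    C='is ripping level tiles off.',
    D="holds a rubik's cube.",
)
print(align_prompt.format(**row))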
@@ -15,10 +15,8 @@ with read_base():
# Please download the dataset from https://people.eecs.berkeley.edu/~hendrycks/data.tar

QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering.

Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of ABCD.
{input}

A) {A}
B) {B}
C) {C}
@@ -33,16 +31,13 @@ GRADER_TEMPLATE = """
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.

<Original Question Begin>: {question}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Original Question Begin>: {input}\n A) {A}\n B) {B}\n C) {C}\n D) {D}\n<Original Question End>\n\n
<Gold Target Begin>: \n{target}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
@@ -50,7 +45,8 @@ GRADER_TEMPLATE = """
mmlu_reader_cfg = dict(
input_columns=['input', 'A', 'B', 'C', 'D'],
output_column='target',
train_split='dev')
train_split='dev',
)

mmlu_datasets = []
for name in mmlu_all_sets:
@@ -77,14 +73,13 @@ for name in mmlu_all_sets:
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
]),
),
dataset_cfg=dict(
type=MMLUDataset,
@@ -97,7 +92,6 @@ for name in mmlu_all_sets:
),
pred_role='BOT',
)

mmlu_datasets.append(
dict(
abbr=f'lukaemon_mmlu_{name}',
@@ -107,4 +101,6 @@ for name in mmlu_all_sets:
reader_cfg=mmlu_reader_cfg,
infer_cfg=mmlu_infer_cfg,
eval_cfg=mmlu_eval_cfg,
))
mode='singlescore',
)
)
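judge_cfg=dict() is left empty in these dataset configs and is typically filled with the judge model's configuration at evaluation time. A rough sketch of that wiring; the backend name and parameters below are placeholders, not taken from this commit:

# Hedged sketch (illustrative only): attach a judge-model config to each
# dataset's evaluator. Backend name and arguments are placeholders.
judge_model_cfg = dict(
    type='OpenAISDK',       # placeholder judge backend
    path='gpt-4o-mini',     # placeholder judge model name
    temperature=0.0,
    max_out_len=1024,
)

# Stand-in for the dataset entries built by the loop above.
datasets = [dict(abbr='lukaemon_mmlu_anatomy',
                 eval_cfg=dict(evaluator=dict(judge_cfg=dict())))]

for d in datasets:
    d['eval_cfg']['evaluator']['judge_cfg'] = judge_model_cfg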
@@ -10,13 +10,10 @@ with read_base():

QUERY_TEMPLATE = """
Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of Options(e.g. one of ABCDEFGHIJKLMNOP). Think step by step before answering.

Question:\n
{question}

Options:\n
{options_str}

""".strip()

GRADER_TEMPLATE = """
@@ -27,14 +24,11 @@ GRADER_TEMPLATE = """
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.

Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.

<Original Question Begin>: {question}\n {options_str} \n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
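The QUERY_TEMPLATE above expects an {options_str} block listing up to sixteen lettered options. A hypothetical helper, not part of this commit, that builds such a block from a plain list of option texts:

import string

def build_options_str(options):
    # Label options A, B, C, ... in order, one per line.
    letters = string.ascii_uppercase
    return '\n'.join(f'{letters[i]}) {opt}' for i, opt in enumerate(options))

print(build_options_str(['red', 'green', 'blue']))
# A) red
# B) green
# C) blue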
@@ -1,8 +1,6 @@
from opencompass.datasets import MusrDataset, MusrEvaluator
from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer
from opencompass.datasets import generic_llmjudge_postprocess
from opencompass.datasets import MusrDataset, generic_llmjudge_postprocess
from opencompass.evaluator import GenericLLMEvaluator

from opencompass.openicl import PromptTemplate, ZeroRetriever, GenInferencer

GRADER_TEMPLATE = """
Please as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
@@ -12,217 +10,80 @@ GRADER_TEMPLATE = """
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, such as some answers may be a mathematical expression, some answers may be a textual description, as long as the meaning expressed is the same. And some formulas are expressed in different ways, but they are equivalent and correct.

5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.

Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.

<Original Question Begin>: {question}\n {options_str} \n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Original Question Begin>: {system_prompt}\n{prompt}\n<Original Question End>\n\n
<Gold Target Begin>: \n{gold_answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n

Judging the correctness of candidates' answers:
""".strip()

# Common configuration components
reader_cfg = dict(
input_columns=[
'context',
'question_text',
'question',
'answer',
'choices',
'choices_str',
'intermediate_trees',
'intermediate_data',
'prompt',
'system_prompt',
'gold_answer',
'scidx',
'self_consistency_n',
'ablation_name',
],
output_column='gold_answer',
)

infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}',
)
],
round=[
dict(role='HUMAN', prompt='{prompt}'),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)

# Dataset configurations
DATASET_CONFIGS = {
'murder_mysteries': {
'abbr': 'musr_murder_mysteries',
'name': 'murder_mysteries',
'path': 'opencompass/musr',
'reader_cfg': dict(
input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
output_column='gold_answer',
),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}'
)
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=MusrDataset,
path='opencompass/musr',
name='murder_mysteries',
reader_cfg=dict(
input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
output_column='gold_answer',
),
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
},
'object_placements': {
'abbr': 'musr_object_placements',
'name': 'object_placements',
'path': 'opencompass/musr',
'reader_cfg': dict(
input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
output_column='gold_answer',
),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}'
)
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=MusrDataset,
path='opencompass/musr',
name='object_placements',
reader_cfg=dict(
input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
output_column='gold_answer',
),
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
},
'team_allocation': {
'abbr': 'musr_team_allocation',
'name': 'team_allocation',
'path': 'opencompass/musr',
'reader_cfg': dict(
input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
output_column='gold_answer',
),
'infer_cfg': dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt='{system_prompt}'
)
],
round=[
dict(
role='HUMAN',
prompt='{prompt}'
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
),
'eval_cfg': dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
],
round=[
dict(
role='HUMAN',
prompt=GRADER_TEMPLATE
),
]),
),
dataset_cfg=dict(
type=MusrDataset,
path='opencompass/musr',
name='team_allocation',
reader_cfg=dict(
input_columns=['context', 'question_text', 'question', 'answer', 'choices', 'choices_str', 'intermediate_trees', 'intermediate_data', 'prompt', 'system_prompt', 'gold_answer', 'scidx', 'self_consistency_n', 'ablation_name'],
output_column='gold_answer',
),
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
pred_role='BOT',
)
},
}

# Create dataset configurations
musr_datasets = []

for config in DATASET_CONFIGS.values():
@@ -231,8 +92,35 @@ for config in DATASET_CONFIGS.values():
type=MusrDataset,
path=config['path'],
name=config['name'],
reader_cfg=config['reader_cfg'],
infer_cfg=config['infer_cfg'],
eval_cfg=config['eval_cfg'],
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=dict(
evaluator=dict(
type=GenericLLMEvaluator,
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(
role='SYSTEM',
fallback_role='HUMAN',
prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.",
)
],
round=[
dict(role='HUMAN', prompt=GRADER_TEMPLATE),
],
),
),
dataset_cfg=dict(
type=MusrDataset,
path=config['path'],
name=config['name'],
reader_cfg=reader_cfg,
),
judge_cfg=dict(),
dict_postprocessor=dict(type=generic_llmjudge_postprocess),
),
),
)
musr_datasets.append(dataset)
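The shared reader_cfg and infer_cfg plus the loop over DATASET_CONFIGS replace three nearly identical hand-written blocks. A miniature sketch of the same deduplication pattern, with the column list shortened for brevity:

# Sketch: shared config objects plus a small per-split table instead of
# three copies of the same dict literal.
shared_reader_cfg = dict(
    input_columns=['prompt', 'system_prompt'],   # shortened column list
    output_column='gold_answer',
)

splits = ['murder_mysteries', 'object_placements', 'team_allocation']

musr_sketch = [
    dict(
        abbr=f'musr_{name}',
        path='opencompass/musr',
        name=name,
        reader_cfg=shared_reader_cfg,   # one object reused by every split
    )
    for name in splits
]
print([d['abbr'] for d in musr_sketch])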
@@ -186,17 +186,29 @@ class DefaultSummarizer:
eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
else:
group_metrics = list(functools.reduce(lambda a, b: a & b, [set(dataset_metrics[dataset_abbr]) for dataset_abbr in sg['subsets']]))
if need_smart_metric and len(group_metrics) > 1:
group_metrics.append(default_metric)
for metric in group_metrics:
for dataset_abbr in sg['subsets']:
scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
else:
group_metrics = [default_metric]
for dataset_abbr in sg['subsets']:
metric = dataset_metrics[dataset_abbr][0]
scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
if metric == default_metric:
metric_default = dataset_metrics[dataset_abbr][0]
scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric_default] = \
parsed_results[model_abbr][dataset_abbr][metric_default]
eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))
else:
scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = \
parsed_results[model_abbr][dataset_abbr][metric]
eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
# if need_smart_metric and len(group_metrics) > 1:
# for metric in group_metrics:
# for dataset_abbr in sg['subsets']:
# scores.setdefault(metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
# eval_modes.append(dataset_eval_mode.get(sg['subsets'][0], 'unknown'))
# else:
# group_metrics = [default_metric]
# for dataset_abbr in sg['subsets']:
# metric = dataset_metrics[dataset_abbr][0]
# scores.setdefault(default_metric, {})[dataset_abbr + '@' + metric] = parsed_results[model_abbr][dataset_abbr][metric]
# eval_modes.append(dataset_eval_mode.get(dataset_abbr, 'unknown'))

result = {}
for metric in scores:
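The new branch above buckets each subset under the default metric only when the subset's first reported metric matches it, and under the subset's own metric name otherwise. A simplified, standalone sketch of that selection logic (model handling omitted; names of the toy datasets and values are made up):

def group_scores(subsets, dataset_metrics, parsed_results, default_metric='accuracy'):
    # Each subset contributes its first reported metric; scores land under the
    # default metric when they match it, otherwise under their own metric name.
    scores = {}
    for dataset_abbr in subsets:
        metric = dataset_metrics[dataset_abbr][0]
        bucket = default_metric if metric == default_metric else metric
        scores.setdefault(bucket, {})[f'{dataset_abbr}@{metric}'] = \
            parsed_results[dataset_abbr][metric]
    return scores

print(group_scores(
    subsets=['ds_a', 'ds_b'],
    dataset_metrics={'ds_a': ['accuracy'], 'ds_b': ['rouge1']},
    parsed_results={'ds_a': {'accuracy': 71.2}, 'ds_b': {'rouge1': 33.4}},
))
# {'accuracy': {'ds_a@accuracy': 71.2}, 'rouge1': {'ds_b@rouge1': 33.4}}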