import os
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import BBHDataset, BBHEvaluator, bbh_mcq_postprocess, BBHEvaluator_mcq, generic_llmjudge_postprocess
from opencompass.evaluator import GenericLLMEvaluator
# BBH provides the question in the 'input' column and the reference answer in 'target'.
bbh_reader_cfg = dict(input_columns=['input'], output_column='target')
# BBH subsets whose answers are multiple-choice options.
bbh_multiple_choice_sets = [
    'temporal_sequences',
    'disambiguation_qa',
    'date_understanding',
    'tracking_shuffled_objects_three_objects',
    'penguins_in_a_table',
    'geometric_shapes',
    'snarks',
    'ruin_names',
    'tracking_shuffled_objects_seven_objects',
    'tracking_shuffled_objects_five_objects',
    'logical_deduction_three_objects',
    'hyperbaton',
    'logical_deduction_five_objects',
    'logical_deduction_seven_objects',
    'movie_recommendation',
    'salient_translation_error_detection',
    'reasoning_about_colored_objects',
]
# BBH subsets whose answers are free-form text.
bbh_free_form_sets = [
    'multistep_arithmetic_two',
    'navigate',
    'dyck_languages',
    'word_sorting',
    'sports_understanding',
    'boolean_expressions',
    'object_counting',
    'formal_fallacies',
    'causal_judgement',
    'web_of_lies',
]
# Prompt for the LLM judge. The judge sees the original question, the gold answer and the
# candidate prediction, and must reply with a single letter: A (correct) or B (incorrect).
GRADER_TEMPLATE = """
Please, as a grading expert, judge whether the final answers given by the candidates below are consistent with the standard answers, that is, whether the candidates answered correctly.
Here are some evaluation criteria:
1. Please refer to the given standard answer. You don't need to re-generate the answer to the question because the standard answer has been given. You only need to judge whether the candidate's answer is consistent with the standard answer according to the form of the question. Don't try to answer the original question. You can assume that the standard answer is definitely correct.
2. Because the candidate's answer may be different from the standard answer in the form of expression, before making a judgment, please understand the question and the standard answer first, and then judge whether the candidate's answer is correct, but be careful not to try to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, multiple-select questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is enough. For multiple-select questions and multiple-blank fill-in-the-blank questions, the candidate needs to answer all the corresponding options or blanks correctly to be considered correct.
4. Some answers may be expressed in different ways, for example, some answers may be a mathematical expression and some may be a textual description, as long as the meaning expressed is the same. Some formulas are expressed in different ways, but they are equivalent and correct.
5. If the prediction is given with \\boxed{}, please ignore the \\boxed{} and only judge whether the candidate's answer is consistent with the standard answer.
Please judge whether the following answers are consistent with the standard answer based on the above criteria. Grade the predicted answer of this new question as one of:
A: CORRECT
B: INCORRECT
Just return the letters "A" or "B", with no text around it.
Here is your task. Simply reply with either CORRECT, INCORRECT. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
<Original Question Begin>: {question}\n{options_str}\n<Original Question End>\n\n
<Gold Target Begin>: \n{answer}\n<Gold Target End>\n\n
<Predicted Answer Begin>: \n{prediction}\n<Predicted End>\n\n
Judging the correctness of candidates' answers:
""".strip()
bbh_datasets = []
# Build one dataset entry per multiple-choice subset, using the few-shot chain-of-thought
# prompt shipped in lib_prompt/<subset>.txt as the in-context examples.
for _name in bbh_multiple_choice_sets:
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer))
    # LLM-as-judge evaluation: GenericLLMEvaluator renders GRADER_TEMPLATE for each prediction
    # and parses the judge's A/B verdict via generic_llmjudge_postprocess. judge_cfg is left
    # empty here and is expected to be supplied at run time.
    bbh_eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=GRADER_TEMPLATE
                        ),
                    ]),
            ),
            dataset_cfg=dict(
                type=BBHDataset,
                path='opencompass/bbh',
                reader_cfg=bbh_reader_cfg,
                name=_name,
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
        pred_role='BOT',
    )
    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))

# Same construction for the free-form subsets.
for _name in bbh_free_form_sets:
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{_name}.txt'), 'r') as f:
        _hint = f.read()
    bbh_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt=f"Follow the given examples and answer the question.\n{_hint}\n\nQ: {{input}}\nA: Let's think step by step."
                )
            ])),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer))
    bbh_eval_cfg = dict(
        evaluator=dict(
            type=GenericLLMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin=[
                        dict(
                            role='SYSTEM',
                            fallback_role='HUMAN',
                            prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                    ],
                    round=[
                        dict(
                            role='HUMAN',
                            prompt=GRADER_TEMPLATE
                        ),
                    ]),
            ),
            dataset_cfg=dict(
                type=BBHDataset,
                path='opencompass/bbh',
                reader_cfg=bbh_reader_cfg,
                name=_name,
            ),
            judge_cfg=dict(),
            dict_postprocessor=dict(type=generic_llmjudge_postprocess),
        ),
        pred_role='BOT',
    )
    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path='opencompass/bbh',
            name=_name,
            abbr='bbh-' + _name,
            reader_cfg=bbh_reader_cfg,
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
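
# A minimal usage sketch, assuming the standard OpenCompass config layout (the module path,
# dataset variable import and judge settings below are placeholders, not part of this file):
#
#   from mmengine.config import read_base
#
#   with read_base():
#       # hypothetical relative path to this file inside your config package
#       from .bbh_llm_judge_gen import bbh_datasets
#
#   datasets = [*bbh_datasets]
#   # judge_cfg is left empty in each eval_cfg above; supply a judge model there (or via your
#   # evaluation runner) before launching the run.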