OpenCompass/opencompass/datasets/calm/evaluation/labeling/CLADDER.py

from .common_answers import common_true_list, common_false_list, common_option_1_list, common_option_2_list, common_option_3_list, common_option_4_list, common_start_true_dict, common_start_false_dict, common_start_op1_dict, common_start_op2_dict, common_start_op3_dict, common_start_op4_dict    
def get_gt_label(item):
    """Map a sample's ground-truth answer to a binary label.

    Args:
        item: Mapping that provides the ground-truth answer under the
            ``"gt_answer"`` key.

    Returns:
        ``1`` when the answer is ``"yes"``, ``0`` when it is ``"no"``.

    Raises:
        KeyError: If ``item`` has no ``"gt_answer"`` entry.
        ValueError: If the answer is neither ``"yes"`` nor ``"no"``.
    """
    gt_answer = item["gt_answer"]
    if gt_answer == "yes":
        return 1
    if gt_answer == "no":
        return 0
    # Fail with an explicit message instead of hitting an unbound local
    # variable when the answer is outside the two supported values.
    raise ValueError(
        f"Unsupported gt_answer {gt_answer!r}; expected 'yes' or 'no'."
    )

def get_pred_label(model_response, item, prompt_style, type):
    """Derive a binary prediction label from a free-form model response.

    The response is stripped and lower-cased, then matched in three stages:

    1. If it starts with an affirmative opening phrase the label is ``1``.
    2. Otherwise, if it starts with a negative opening phrase the label
       is ``0``.
    3. Otherwise the response body is searched for affirmative and negative
       phrases; a negative phrase that occurs before the earliest
       affirmative one wins.

    Args:
        model_response: Raw text produced by the model.
        item: Unused here; kept for signature compatibility.
        prompt_style: Unused here; kept for signature compatibility.
        type: Unused here; kept for signature compatibility.

    Returns:
        ``1`` for an affirmative answer, ``0`` for a negative answer, and
        ``-1`` when no known phrase is found.
    """
    model_response = model_response.strip().lower()

    # Some models echo part of the question before answering. Preprocessing
    # usually strips it, but fragments can survive, so every trailing slice
    # of each opening phrase is accepted as a valid start. The dict key is
    # the shortest slice length that is allowed for the phrases under it.
    start_option1_list = [
        phrase[-size:]
        for min_size, phrases in common_start_true_dict.items()
        for phrase in phrases
        for size in range(min_size, len(phrase) + 1)
    ]
    start_option2_list = [
        phrase[-size:]
        for min_size, phrases in common_start_false_dict.items()
        for phrase in phrases
        for size in range(min_size, len(phrase) + 1)
    ]

    inner_option1_list = ["method 1 is more correct","使用方法1更准确"]+common_true_list
    inner_option2_list = ["method 2 is more correct","method 2 is correct","correct to use method 2", "方法2比方法1更准确","方法2"]+common_false_list

    if model_response.startswith(tuple(start_option1_list)):
        return 1
    if model_response.startswith(tuple(start_option2_list)):
        return 0

    # Look at every affirmative phrase rather than stopping at the first
    # one that matches in list order: the comparison below needs the
    # earliest affirmative position in the response.
    positive_positions = [
        position
        for position in (
            model_response.find(option) for option in inner_option1_list
        )
        if position > -1
    ]
    if positive_positions:
        low_index = min(positive_positions)
        # A negative phrase appearing before the first affirmative one
        # overrides the affirmative reading.
        negative_comes_first = any(
            -1 < model_response.find(option) < low_index
            for option in inner_option2_list
        )
        return 0 if negative_comes_first else 1

    if any(option in model_response for option in inner_option2_list):
        return 0
    return -1
Calm dataset (#1287) * add calm dataset * modify config max_out_len * update README * Modify README * update README * update README * update README * update README * update README * add summarizer and modify readme * delete summarizer config comment * update summarizer * modify same response to all questions * update README 2024-07-26 11:48:16 +08:00			`from .common_answers import common_true_list, common_false_list, common_option_1_list, common_option_2_list, common_option_3_list, common_option_4_list, common_start_true_dict, common_start_false_dict, common_start_op1_dict, common_start_op2_dict, common_start_op3_dict, common_start_op4_dict`
			`def get_gt_label(item):`
			`if item["gt_answer"] == "yes":`
			`gt_label = 1`
			`elif item["gt_answer"] == "no":`
			`gt_label = 0`
			`return gt_label`

			`def get_pred_label(model_response, item, prompt_style, type):`
			`model_response = model_response.strip().lower()`
			`low_index = len(model_response)`
			`start_str1_dict = common_start_true_dict`
			`start_str2_dict = common_start_false_dict`

			`start_option1_list,start_option2_list = [],[]`
			`# some of the model will give response containing the question, we usually preprocess the response to remove the question part, but sometimes due to the model's response format, some of the question part is not removed, so here we are checking the response with the question part as well.`
			`for key in start_str1_dict.keys():`
			`for str1 in start_str1_dict[key]:`
			`for i in range(key, len(str1)+1):`
			`start_option1_list.append(str1[-i:])`
			`for key in start_str2_dict.keys():`
			`for str2 in start_str2_dict[key]:`
			`for i in range(key, len(str2)+1):`
			`start_option2_list.append(str2[-i:])`

			`inner_option1_list = ["method 1 is more correct","使用方法1更准确"]+common_true_list`
			`inner_option2_list = ["method 2 is more correct","method 2 is correct","correct to use method 2", "方法2比方法1更准确","方法2"]+common_false_list`
			`if model_response.startswith(tuple(start_option1_list)):`
			`label = 1`
			`elif model_response.startswith(tuple(start_option2_list)):`
			`label = 0`
			`elif any(model_response.find(option)>-1 and (low_index:=min(low_index, model_response.find(option)))>-1 for option in inner_option1_list):`
			`label = 1`
			`if any(option in model_response and model_response.find(option) < low_index for option in inner_option2_list):`
			`label = 0`
			`elif any(response in model_response for response in inner_option2_list):`
			`label = 0`
			`else:`
			`return -1`
			`return label`