diff --git a/examples/eval_OpenHuEval_HuProverbRea_OE.py b/examples/eval_OpenHuEval_HuProverbRea_OE.py index c937e289..bf6ed8ed 100644 --- a/examples/eval_OpenHuEval_HuProverbRea_OE.py +++ b/examples/eval_OpenHuEval_HuProverbRea_OE.py @@ -3,9 +3,9 @@ from mmengine.config import read_base with read_base(): from opencompass.configs.datasets.OpenHuEval.HuProverbRea.HuProverbRea_OE import HuProverbRea_datasets - # from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model - # from opencompass.configs.models.openai.gpt_4o_2024_11_20 import models as gpt_4o_2024_11_20_model - # from opencompass.configs.models.deepseek.deepseek_v3_api_aliyun import models as deepseek_v3_api_aliyun_model + from opencompass.configs.models.openai.gpt_4o_mini_20240718 import models as gpt_4o_mini_20240718_model + from opencompass.configs.models.openai.gpt_4o_2024_11_20 import models as gpt_4o_2024_11_20_model + from opencompass.configs.models.deepseek.deepseek_v3_api_aliyun import models as deepseek_v3_api_aliyun_model from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import models as lmdeploy_qwen2_5_7b_instruct_model from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_72b_instruct import models as lmdeploy_qwen2_5_72b_instruct_model @@ -15,8 +15,8 @@ with read_base(): from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import models as lmdeploy_internlm3_8b_instruct_model from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model - # from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import models as deepseek_r1_api_aliyun_model - # from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model + from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import models as deepseek_r1_api_aliyun_model + from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as 
o1_mini_2024_09_12_model # from opencompass.configs.models.openai.o3_mini_2025_01_31 import models as o3_mini_2025_01_31_model datasets = HuProverbRea_datasets @@ -30,6 +30,12 @@ for model in models: 'type': 'rm__before_eval' } } + if model['abbr'].startswith('QwQ'): + model['pred_postprocessor'] = { + 'OpenHuEval_*': { + 'type': 'extract_qwq_answer_before_eval' + } + } del model work_dir = './outputs/' + __file__.split('/')[-1].split('.')[0] + '/' # do NOT modify this line, yapf: disable, pylint: disable diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index be45ba52..056b5776 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -241,3 +241,26 @@ def remove_reasoning_part_before_evaluation(text: str): return text[reasoning_end + 8:] else: return text + + +@TEXT_POSTPROCESSORS.register_module('extract_qwq_answer_before_eval') +def extract_answer_before_evaluation(text: str): + """Overall, there are three situations in responses of QwQ: + + 1. There is a **Final Answer** title in the whole context. + 2. There is only one sentence in the context. + 3. There is more than one sentence in the context, \ + and the last one is the answer. + """ + if '**Final Answer**' in text: + answer = text.split('\n\n**Final Answer**\n\n')[-1] + else: + if '\n\nHuman' in text: + text = text.split('\n\nHuman')[0] + + text_split = text.split('.') + if text_split[-1] == '': + answer = text_split[-2] + '.' + else: + answer = text_split[-1] + '.' + return answer