diff --git a/examples/eval_OpenHuEval_HuLifeQA.py b/examples/eval_OpenHuEval_HuLifeQA.py index f4ed6d62..4b50c6a7 100644 --- a/examples/eval_OpenHuEval_HuLifeQA.py +++ b/examples/eval_OpenHuEval_HuLifeQA.py @@ -19,6 +19,7 @@ with read_base(): from opencompass.configs.models.openai.o1_mini_2024_09_12 import models as o1_mini_2024_09_12_model from opencompass.configs.models.qwq.lmdeploy_qwq_32b_preview import models as lmdeploy_qwq_32b_preview_model + from opencompass.configs.models.qwq.qwq_32b import models as qwq_32b_model from opencompass.configs.models.deepseek.deepseek_r1_api_aliyun import models as deepseek_r1_api_aliyun_model from opencompass.configs.models.deepseek.deepseek_r1_distill_llama_8b_api_aliyun import models as deepseek_r1_distill_llama_8b_api_aliyun_model from opencompass.configs.models.deepseek.deepseek_r1_distill_qwen_7b_api_aliyun import models as deepseek_r1_distill_qwen_7b_api_aliyun_model @@ -58,6 +59,7 @@ models = [ *gpt_4o_mini_20240718_model, *gpt_4o_20241120_model, *o1_mini_2024_09_12_model, + *qwq_32b_model, *deepseek_v3_api_aliyun_model, *deepseek_r1_api_aliyun_model, *deepseek_r1_distill_llama_8b_api_aliyun_model, diff --git a/opencompass/configs/models/qwq/qwq_32b.py b/opencompass/configs/models/qwq/qwq_32b.py new file mode 100644 index 00000000..f997bd24 --- /dev/null +++ b/opencompass/configs/models/qwq/qwq_32b.py @@ -0,0 +1,19 @@ +from opencompass.models import Qwen + +api_meta_template = dict(round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True), +], ) + +models = [ + dict( + abbr='QwQ-32B', + type=Qwen, + path='qwq-32b', + key='ENV', # The key will be read from $DASHSCOPE_API_KEY (comma-separated for multiple keys), but you can write down your key here as well + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=4096, + batch_size=8), +] diff --git a/opencompass/models/qwen_api.py b/opencompass/models/qwen_api.py index d22c0785..1d70e58e 100644 --- a/opencompass/models/qwen_api.py 
+++ b/opencompass/models/qwen_api.py @@ -1,3 +1,5 @@ +import os +import random import time from concurrent.futures import ThreadPoolExecutor from typing import Dict, List, Optional, Union @@ -43,7 +45,16 @@ class Qwen(BaseAPIModel): retry=retry, generation_kwargs=generation_kwargs) import dashscope - dashscope.api_key = key + if isinstance(key, str): + if key == 'ENV': + if 'DASHSCOPE_API_KEY' not in os.environ: + raise ValueError('DASHSCOPE API key is not set.') + self.keys = os.getenv('DASHSCOPE_API_KEY').split(',') + else: + self.keys = [key] + else: + self.keys = key + self.path = path self.dashscope = dashscope def generate( @@ -131,7 +142,9 @@ class Qwen(BaseAPIModel): self.acquire() try: response = self.dashscope.Generation.call( + api_key=random.choice(self.keys), model=self.path, + stream=True, **data, ) except Exception as err: @@ -148,34 +161,51 @@ class Qwen(BaseAPIModel): # to slow down the request self.wait() continue + # + reasoning_content = "" # accumulates the full reasoning (thinking) text + answer_content = "" # accumulates the full answer text + is_answering = False # whether the thinking phase has ended and the answer has started + for chunk in response: + if (chunk.output.choices[0].message.content == "" and + chunk.output.choices[0].message.reasoning_content == ""): + pass + else: + if (chunk.output.choices[0].message.reasoning_content != "" and + chunk.output.choices[0].message.content == ""): + reasoning_content += chunk.output.choices[0].message.reasoning_content + elif chunk.output.choices[0].message.content != "": + if not is_answering: + is_answering = True + answer_content += chunk.output.choices[0].message.content + reasoning_content = '' + reasoning_content + '' + return reasoning_content + answer_content + # if response.status_code == 200: + # try: + # msg = response.output.text + # self.logger.debug(msg) + # return msg + # except KeyError: + # print(response) + # self.logger.error(str(response.status_code)) + # time.sleep(1) + # continue + # if response.status_code == 429: + # print(response) + # time.sleep(2) + # continue + # if 
response.status_code == 400: + # print('=' * 128) + # print(response) + # msg = 'Output data may contain inappropriate content.' + # return msg - if response.status_code == 200: - try: - msg = response.output.text - self.logger.debug(msg) - return msg - except KeyError: - print(response) - self.logger.error(str(response.status_code)) - time.sleep(1) - continue - if response.status_code == 429: - print(response) - time.sleep(2) - continue - if response.status_code == 400: - print('=' * 128) - print(response) - msg = 'Output data may contain inappropriate content.' - return msg - - if ('Range of input length should be ' in response.message - or # input too long - 'Input data may contain inappropriate content.' - in response.message): # bad input - print(response.message) - return '' - print(response) - max_num_retries += 1 + # if ('Range of input length should be ' in response.message + # or # input too long + # 'Input data may contain inappropriate content.' + # in response.message): # bad input + # print(response.message) + # return '' + # print(response) + # max_num_retries += 1 raise RuntimeError(response.message)