diff --git a/configs/datasets/collections/base_core.py b/configs/datasets/collections/base_core.py
index ab81496d..927f8d04 100644
--- a/configs/datasets/collections/base_core.py
+++ b/configs/datasets/collections/base_core.py
@@ -15,6 +15,6 @@ with read_base():
     from ..math.math_evaluatorv2_gen_9d2049 import math_datasets
     from ..TheoremQA.TheoremQA_post_v2_gen_2c2583 import TheoremQA_datasets
     from ..humaneval.humaneval_gen_d2537e import humaneval_datasets
-    from ..mbpp.sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets
+    from ..mbpp.deprecated_sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets

 datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
diff --git a/configs/datasets/collections/base_medium.py b/configs/datasets/collections/base_medium.py
index 86002a99..3b61cae0 100644
--- a/configs/datasets/collections/base_medium.py
+++ b/configs/datasets/collections/base_medium.py
@@ -7,7 +7,7 @@ with read_base():
     from ..GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
     from ..bbh.bbh_gen_5b92b0 import bbh_datasets
     from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
     from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
     from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
     from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
diff --git a/configs/datasets/collections/base_medium_llama.py b/configs/datasets/collections/base_medium_llama.py
index 1de485c9..968e3bfd 100644
--- a/configs/datasets/collections/base_medium_llama.py
+++ b/configs/datasets/collections/base_medium_llama.py
@@ -7,7 +7,7 @@ with read_base():
     from ..GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
     from ..bbh.bbh_gen_5b92b0 import bbh_datasets
     from ..humaneval.humaneval_gen_a82cae import humaneval_datasets
-    from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
     from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
     from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
     from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
diff --git a/configs/datasets/collections/base_small.py b/configs/datasets/collections/base_small.py
index 3778162e..5fd914e4 100644
--- a/configs/datasets/collections/base_small.py
+++ b/configs/datasets/collections/base_small.py
@@ -11,7 +11,7 @@ with read_base():
     from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_868415 import cluewsc_datasets
     from ..FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
     from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
     from ..lambada.lambada_gen_217e11 import lambada_datasets
     from ..storycloze.storycloze_ppl_496661 import storycloze_datasets
     from ..SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
diff --git a/configs/datasets/collections/chat_core.py b/configs/datasets/collections/chat_core.py
index f87d9bfd..62d3976a 100644
--- a/configs/datasets/collections/chat_core.py
+++ b/configs/datasets/collections/chat_core.py
@@ -15,6 +15,6 @@ with read_base():
     from ..math.math_evaluatorv2_gen_cecb31 import math_datasets
     from ..TheoremQA.TheoremQA_post_v2_gen_ef26ca import TheoremQA_datasets
     from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from ..mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
+    from ..mbpp.deprecated_sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets

 datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
diff --git a/configs/datasets/collections/chat_medium.py b/configs/datasets/collections/chat_medium.py
index 557d1455..bf2fef58 100644
--- a/configs/datasets/collections/chat_medium.py
+++ b/configs/datasets/collections/chat_medium.py
@@ -7,7 +7,7 @@ with read_base():
     from ..GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
     from ..bbh.bbh_gen_5b92b0 import bbh_datasets
     from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
     from ..CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
     from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
     from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
diff --git a/configs/datasets/collections/chat_small.py b/configs/datasets/collections/chat_small.py
index 6314e46c..dce15420 100644
--- a/configs/datasets/collections/chat_small.py
+++ b/configs/datasets/collections/chat_small.py
@@ -12,7 +12,7 @@ with read_base():
     from ..FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
     from ..FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
     from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
-    from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
     from ..lambada.lambada_gen_217e11 import lambada_datasets
     from ..storycloze.storycloze_gen_7f656a import storycloze_datasets
     from ..SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
diff --git a/configs/datasets/collections/leaderboard/qwen.py b/configs/datasets/collections/leaderboard/qwen.py
index d14a8376..6e0c8cab 100644
--- a/configs/datasets/collections/leaderboard/qwen.py
+++ b/configs/datasets/collections/leaderboard/qwen.py
@@ -44,7 +44,7 @@ with read_base():
     from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
     from ...drop.drop_gen_8a9ed9 import drop_datasets
     from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
-    from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ...mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
     from ...bbh.bbh_gen_5bf00b import bbh_datasets
diff --git a/configs/datasets/collections/leaderboard/qwen_chat.py b/configs/datasets/collections/leaderboard/qwen_chat.py
index 892db561..d1c4d851 100644
--- a/configs/datasets/collections/leaderboard/qwen_chat.py
+++ b/configs/datasets/collections/leaderboard/qwen_chat.py
@@ -44,7 +44,7 @@ with read_base():
     from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
     from ...drop.drop_gen_8a9ed9 import drop_datasets
     from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
-    from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
+    from ...mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
     from ...bbh.bbh_gen_5b92b0 import bbh_datasets
diff --git a/configs/datasets/gpqa/gpqa_gen_015262.py b/configs/datasets/gpqa/gpqa_gen_015262.py
new file mode 100644
index 00000000..227c0d49
--- /dev/null
+++ b/configs/datasets/gpqa/gpqa_gen_015262.py
@@ -0,0 +1,46 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GPQADataset, GPQAEvaluator
+from opencompass.utils import first_option_postprocess
+
+gpqa_reader_cfg = dict(
+    input_columns=['question', 'A', 'B', 'C', 'D'],
+    output_column='answer')
+
+gpqa_infer_cfg = dict(
prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='What is the correct answer to this question: {question}\nChoices:\n' + '(A){A}\n' + '(B){B}\n' + '(C){C}\n' + '(D){D}\n' + 'Format your response as follows: "The correct answer is (insert answer here)"'), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator), + pred_postprocessor=dict(type=first_option_postprocess, options='ABCD')) + +gpqa_datasets = [] +gpqa_subsets = { + 'extended': 'gpqa_extended.csv', + 'main': 'gpqa_main.csv', + 'diamond': 'gpqa_diamond.csv' +} + +for split in list(gpqa_subsets.keys()): + gpqa_datasets.append( + dict( + abbr='GPQA_' + split, + type=GPQADataset, + path='./data/gpqa/', + name=gpqa_subsets[split], + reader_cfg=gpqa_reader_cfg, + infer_cfg=gpqa_infer_cfg, + eval_cfg=gpqa_eval_cfg) + ) diff --git a/configs/datasets/mbpp/mbpp_gen_1e1056.py b/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py similarity index 100% rename from configs/datasets/mbpp/mbpp_gen_1e1056.py rename to configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py diff --git a/configs/datasets/mbpp/mbpp_gen_6590b0.py b/configs/datasets/mbpp/deprecated_mbpp_gen_6590b0.py similarity index 100% rename from configs/datasets/mbpp/mbpp_gen_6590b0.py rename to configs/datasets/mbpp/deprecated_mbpp_gen_6590b0.py diff --git a/configs/datasets/mbpp/mbpp_gen_caa7ab.py b/configs/datasets/mbpp/deprecated_mbpp_gen_caa7ab.py similarity index 100% rename from configs/datasets/mbpp/mbpp_gen_caa7ab.py rename to configs/datasets/mbpp/deprecated_mbpp_gen_caa7ab.py diff --git a/configs/datasets/mbpp/mbpp_passk_gen_1e1056.py b/configs/datasets/mbpp/deprecated_mbpp_passk_gen_1e1056.py similarity index 100% rename from configs/datasets/mbpp/mbpp_passk_gen_1e1056.py rename to configs/datasets/mbpp/deprecated_mbpp_passk_gen_1e1056.py diff --git a/configs/datasets/mbpp/mbpp_repeat10_gen_1e1056.py b/configs/datasets/mbpp/deprecated_mbpp_repeat10_gen_1e1056.py similarity index 100% rename from configs/datasets/mbpp/mbpp_repeat10_gen_1e1056.py rename to configs/datasets/mbpp/deprecated_mbpp_repeat10_gen_1e1056.py diff --git a/configs/datasets/mbpp/sanitized_mbpp_gen_1e1056.py b/configs/datasets/mbpp/deprecated_sanitized_mbpp_gen_1e1056.py similarity index 100% rename from configs/datasets/mbpp/sanitized_mbpp_gen_1e1056.py rename to configs/datasets/mbpp/deprecated_sanitized_mbpp_gen_1e1056.py diff --git a/configs/datasets/mbpp/sanitized_mbpp_gen_cb43ef.py b/configs/datasets/mbpp/deprecated_sanitized_mbpp_gen_cb43ef.py similarity index 100% rename from configs/datasets/mbpp/sanitized_mbpp_gen_cb43ef.py rename to configs/datasets/mbpp/deprecated_sanitized_mbpp_gen_cb43ef.py diff --git a/configs/datasets/mbpp/sanitized_mbpp_passk_gen_1e1056.py b/configs/datasets/mbpp/deprecated_sanitized_mbpp_passk_gen_1e1056.py similarity index 100% rename from configs/datasets/mbpp/sanitized_mbpp_passk_gen_1e1056.py rename to configs/datasets/mbpp/deprecated_sanitized_mbpp_passk_gen_1e1056.py diff --git a/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_1e1056.py b/configs/datasets/mbpp/deprecated_sanitized_mbpp_repeat10_gen_1e1056.py similarity index 100% rename from configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_1e1056.py rename to configs/datasets/mbpp/deprecated_sanitized_mbpp_repeat10_gen_1e1056.py diff --git a/configs/datasets/mbpp/mbpp_gen.py b/configs/datasets/mbpp/mbpp_gen.py index 930cb73d..e374ac06 100644 --- 
a/configs/datasets/mbpp/mbpp_gen.py +++ b/configs/datasets/mbpp/mbpp_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mbpp_gen_1e1056 import mbpp_datasets # noqa: F401, F403 + from .mbpp_gen_830460 import mbpp_datasets # noqa: F401, F403 diff --git a/configs/datasets/mbpp/mbpp_gen_5d6316.py b/configs/datasets/mbpp/mbpp_gen_5d6316.py deleted file mode 100644 index 2224d2fb..00000000 --- a/configs/datasets/mbpp/mbpp_gen_5d6316.py +++ /dev/null @@ -1,46 +0,0 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.datasets import MBPPDataset, MBPPEvaluator2 - -mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_list_2") - -# This prompt is used for WizardLMCode series -# You can use other config file for basic 3-shot generation -mbpp_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - round=[ - dict( - role="HUMAN", - prompt="""Below is an instruction that describes a task. Write a response that appropriately completes the request. - -### Instruction: -Create a Python script for this problem: - -{text} -Test examples: -{test_list} - -### Response:""", - ), - ] - ), - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer, max_out_len=512), -) - -mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator2), pred_role="BOT") - -mbpp_datasets = [ - dict( - type=MBPPDataset, - abbr="mbpp", - path="./data/mbpp/mbpp.jsonl", - reader_cfg=mbpp_reader_cfg, - infer_cfg=mbpp_infer_cfg, - eval_cfg=mbpp_eval_cfg, - ) -] diff --git a/configs/datasets/mbpp/mbpp_gen_78c1bc.py b/configs/datasets/mbpp/mbpp_gen_830460.py similarity index 82% rename from configs/datasets/mbpp/mbpp_gen_78c1bc.py rename to configs/datasets/mbpp/mbpp_gen_830460.py index d228ad60..769b1e03 100644 --- a/configs/datasets/mbpp/mbpp_gen_78c1bc.py +++ b/configs/datasets/mbpp/mbpp_gen_830460.py @@ -10,13 +10,13 @@ mbpp_infer_cfg = dict( type=PromptTemplate, template=dict( round=[ - dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"), + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"), dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), - dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n"), + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. 
Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"), dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), - dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"), + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"), dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"), @@ -25,7 +25,7 @@ mbpp_infer_cfg = dict( ), ), retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer), + inferencer=dict(type=GenInferencer, max_out_len=512), ) mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT") diff --git a/configs/datasets/mbpp/mbpp_passk_gen_830460.py b/configs/datasets/mbpp/mbpp_passk_gen_830460.py new file mode 100644 index 00000000..c1ce05f3 --- /dev/null +++ b/configs/datasets/mbpp/mbpp_passk_gen_830460.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator + +mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column") + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"), + dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. 
Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"), + dict(role="BOT", prompt="[BEGIN]\n"), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT") + +mbpp_datasets = [ + dict( + type=MBPPDataset_V2, + abbr="mbpp_passk", + path="./data/mbpp/mbpp.jsonl", + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg, + ) +] diff --git a/configs/datasets/mbpp/mbpp_repeat10_gen_830460.py b/configs/datasets/mbpp/mbpp_repeat10_gen_830460.py new file mode 100644 index 00000000..e3b5c36b --- /dev/null +++ b/configs/datasets/mbpp/mbpp_repeat10_gen_830460.py @@ -0,0 +1,45 @@ +# This config is used for pass@k evaluation with dataset repetition +# That model cannot generate multiple response for single input +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator + +mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column") + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"), + dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. 
Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"), + dict(role="BOT", prompt="[BEGIN]\n"), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT") + +mbpp_datasets = [ + dict( + type=MBPPDataset_V2, + abbr="mbpp_repeat10", + path="./data/mbpp/mbpp.jsonl", + num_repeats=10, + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg, + ) +] diff --git a/configs/datasets/mbpp/sanitized_mbpp_gen_830460.py b/configs/datasets/mbpp/sanitized_mbpp_gen_830460.py new file mode 100644 index 00000000..4f1eba57 --- /dev/null +++ b/configs/datasets/mbpp/sanitized_mbpp_gen_830460.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_list_2") + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n",), + dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n ",), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. 
Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n",), + dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n ",), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n",), + dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n ",), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n",), + dict(role="BOT", prompt="[BEGIN]\n"), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT") + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr="sanitized_mbpp", + path="./data/mbpp/sanitized-mbpp.jsonl", + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/configs/datasets/mbpp/sanitized_mbpp_passk_gen_830460.py b/configs/datasets/mbpp/sanitized_mbpp_passk_gen_830460.py new file mode 100644 index 00000000..a8d79cb6 --- /dev/null +++ b/configs/datasets/mbpp/sanitized_mbpp_passk_gen_830460.py @@ -0,0 +1,42 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column") + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"), + dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. 
Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"), + dict(role="BOT", prompt="[BEGIN]\n"), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT") + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr="sanitized_mbpp_passk", + path="./data/mbpp/sanitized-mbpp.jsonl", + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_830460.py b/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_830460.py new file mode 100644 index 00000000..6035c8c2 --- /dev/null +++ b/configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_830460.py @@ -0,0 +1,43 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator + +sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column") + +sanitized_mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"), + dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. 
Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"), + dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), + + dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"), + dict(role="BOT", prompt="[BEGIN]\n"), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512), +) + +sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT") + +sanitized_mbpp_datasets = [ + dict( + type=SanitizedMBPPDataset, + abbr="sanitized_mbpp_repeat10", + path="./data/mbpp/sanitized-mbpp.jsonl", + num_repeats=10, + reader_cfg=sanitized_mbpp_reader_cfg, + infer_cfg=sanitized_mbpp_infer_cfg, + eval_cfg=sanitized_mbpp_eval_cfg, + ) +] diff --git a/configs/datasets/mbpp_cn/mbpp_cn_gen_1d1481.py b/configs/datasets/mbpp_cn/deprecated_mbpp_cn_gen_1d1481.py similarity index 100% rename from configs/datasets/mbpp_cn/mbpp_cn_gen_1d1481.py rename to configs/datasets/mbpp_cn/deprecated_mbpp_cn_gen_1d1481.py diff --git a/configs/datasets/mbpp_cn/mbpp_cn_passk_gen_1d1481.py b/configs/datasets/mbpp_cn/deprecated_mbpp_cn_passk_gen_1d1481.py similarity index 100% rename from configs/datasets/mbpp_cn/mbpp_cn_passk_gen_1d1481.py rename to configs/datasets/mbpp_cn/deprecated_mbpp_cn_passk_gen_1d1481.py diff --git a/configs/datasets/mbpp_cn/mbpp_cn_repeat10_gen_1d1481.py b/configs/datasets/mbpp_cn/deprecated_mbpp_cn_repeat10_gen_1d1481.py similarity index 100% rename from configs/datasets/mbpp_cn/mbpp_cn_repeat10_gen_1d1481.py rename to configs/datasets/mbpp_cn/deprecated_mbpp_cn_repeat10_gen_1d1481.py diff --git a/configs/datasets/mbpp_cn/mbpp_cn_gen.py b/configs/datasets/mbpp_cn/mbpp_cn_gen.py index 48a4a3de..dcf24e43 100644 --- a/configs/datasets/mbpp_cn/mbpp_cn_gen.py +++ b/configs/datasets/mbpp_cn/mbpp_cn_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from .mbpp_cn_gen_1d1481 import mbpp_cn_datasets # noqa: F401, F403 \ No newline at end of file + from .mbpp_cn_gen_9114d5 import mbpp_cn_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/configs/datasets/mbpp_cn/mbpp_cn_gen_9114d5.py b/configs/datasets/mbpp_cn/mbpp_cn_gen_9114d5.py new file mode 100644 index 00000000..d17380f5 --- /dev/null +++ b/configs/datasets/mbpp_cn/mbpp_cn_gen_9114d5.py @@ -0,0 +1,64 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPDataset, MBPPEvaluator + 
+mbpp_reader_cfg = dict( + input_columns=['text', 'test_list'], output_column='test_list_2') + +mbpp_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role="HUMAN", + prompt= + "你是一名专业的 Python 程序员,你的任务是:编写一个函数,从给定的两个元组列表中查找相似的元素。 你的代码应该通过这些测试:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n" + ), + dict( + role="BOT", + prompt= + "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n " + ), + dict( + role="HUMAN", + prompt= + "你是一名专业的 Python 程序员,你的任务是:编写一个 Python 函数来识别一个整数是否不是素数。 你的代码应该通过这些测试:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n" + ), + dict( + role="BOT", + prompt= + "[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n " + ), + dict( + role="HUMAN", + prompt= + "你是一名专业的 Python 程序员,你的任务是:编写一个函数,使用堆队列算法从给定的数字列表中查找最大整数。 你的代码应该通过这些测试:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n" + ), + dict( + role="BOT", + prompt= + "[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n " + ), + dict( + role="HUMAN", + prompt= + "你是一名专业的 Python 程序员,你的任务是: {text} 你的代码应该通过这些测试:\n\n {test_list} \n" + ), + dict(role="BOT", prompt="[BEGIN]\n"), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT") + +mbpp_cn_datasets = [ + dict( + type=MBPPDataset, + abbr='mbpp_cn', + path='./data/mbpp_cn/mbpp_cn.jsonl', + reader_cfg=mbpp_reader_cfg, + infer_cfg=mbpp_infer_cfg, + eval_cfg=mbpp_eval_cfg) +] diff --git a/configs/datasets/mbpp_plus/mbpp_plus_gen_94815c.py b/configs/datasets/mbpp_plus/deprecated_mbpp_plus_gen_94815c.py similarity index 100% rename from configs/datasets/mbpp_plus/mbpp_plus_gen_94815c.py rename to configs/datasets/mbpp_plus/deprecated_mbpp_plus_gen_94815c.py diff --git a/configs/datasets/mbpp_plus/mbpp_plus_gen.py b/configs/datasets/mbpp_plus/mbpp_plus_gen.py index 5a1ce3da..240b145d 100644 --- a/configs/datasets/mbpp_plus/mbpp_plus_gen.py +++ b/configs/datasets/mbpp_plus/mbpp_plus_gen.py @@ -1,4 +1,4 @@ from mmengine.config import read_base with read_base(): - from.mbpp_plus_gen_94815c import mbpp_plus_datasets # noqa: F401, F403 \ No newline at end of file + from.mbpp_plus_gen_0b836a import mbpp_plus_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/configs/datasets/mbpp_plus/mbpp_plus_gen_0b836a.py b/configs/datasets/mbpp_plus/mbpp_plus_gen_0b836a.py new file mode 100644 index 00000000..b9c1c8ff --- /dev/null +++ b/configs/datasets/mbpp_plus/mbpp_plus_gen_0b836a.py @@ -0,0 +1,64 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets import MBPPEvaluator, MBPPPlusDataset + +mbpp_plus_reader_cfg = dict( + 
input_columns=['text', 'test_list'], output_column='task_id') + +mbpp_plus_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: Write a function to find the shared elements from the given two lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n" + ), + dict( + role="BOT", + prompt= + "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\n return tuple(set(test_tup1) & set(test_tup2))' \n[DONE] \n\n " + ), + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n" + ), + dict( + role="BOT", + prompt= + "[BEGIN]\n 'import math\ndef is_not_prime(n):\n if n == 1:\n return True\n for i in range(2, int(math.sqrt(n))+1):\n if n % i == 0:\n return True\n return False' \n[DONE] \n\n " + ), + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: Write a function to find the n largest integers from a given list of numbers, returned in descending order. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n" + ), + dict( + role="BOT", + prompt= + "[BEGIN]\n 'import heapq as hq\ndef heap_queue_largest(nums: list,n: int) -> list:\n largest_nums = hq.nlargest(n, nums)\n return largest_nums' \n[DONE] \n\n " + ), + dict( + role="HUMAN", + prompt= + "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n" + ), + dict(role="BOT", prompt="[BEGIN]\n"), + ], )), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer, max_out_len=512)) + +mbpp_plus_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator, metric='MBPPPlus'), pred_role="BOT") + +mbpp_plus_datasets = [ + dict( + type=MBPPPlusDataset, + abbr='mbpp_plus', + path='./data/mbpp_plus/mbpp_plus.jsonl', + reader_cfg=mbpp_plus_reader_cfg, + infer_cfg=mbpp_plus_infer_cfg, + eval_cfg=mbpp_plus_eval_cfg) +] diff --git a/configs/datasets/needlebench/needlebench_256k/needlebench_256k.py b/configs/datasets/needlebench/needlebench_256k/needlebench_256k.py new file mode 100644 index 00000000..a92f6292 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_256k/needlebench_256k.py @@ -0,0 +1,18 @@ +from mmengine.config import read_base + +with read_base(): + from .needlebench_multi_reasoning_256k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets + from .needlebench_multi_reasoning_256k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets + from .needlebench_multi_reasoning_256k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets + from .needlebench_multi_reasoning_256k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets + from .needlebench_multi_reasoning_256k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets + 
from .needlebench_multi_reasoning_256k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets + from .needlebench_multi_reasoning_256k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets + from .needlebench_multi_reasoning_256k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets + + from .needlebench_single_256k import needlebench_en_datasets as needlebench_origin_en_datasets + from .needlebench_single_256k import needlebench_zh_datasets as needlebench_origin_zh_datasets + from .needlebench_multi_retrieval_256k import needlebench_en_datasets as needlebench_parallel_en_datasets + from .needlebench_multi_retrieval_256k import needlebench_zh_datasets as needlebench_parallel_zh_datasets + +needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), []) diff --git a/configs/datasets/needlebench/needlebench_256k/needlebench_multi_reasoning_256k.py b/configs/datasets/needlebench/needlebench_256k/needlebench_multi_reasoning_256k.py new file mode 100644 index 00000000..13d8d8f2 --- /dev/null +++ b/configs/datasets/needlebench/needlebench_256k/needlebench_multi_reasoning_256k.py @@ -0,0 +1,287 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset +from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchMultiEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) +context_lengths = [32000, 128000, 256000] +depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] + +# ----------English Version---------- +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] + +needle_file_name = 'multi_needle_reasoning_en.json' +diff = 10 +num_needles = 2 +needlebench_2needle_en_datasets = [] +language = 'English' + +for original_context_length in context_lengths: + for depth_percent in 
depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_2needle_en_datasets.append(dataset_dict) + +num_needles = 3 +needlebench_3needle_en_datasets = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_3needle_en_datasets.append(dataset_dict) + +num_needles = 4 +needlebench_4needle_en_datasets = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_4needle_en_datasets.append(dataset_dict) + +num_needles = 5 +needlebench_5needle_en_datasets = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_en_256k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_5needle_en_datasets.append(dataset_dict) + +# ----------Chinese Version---------- +base_path = './data/needlebench' +file_list = ['zh_finance.jsonl'] + +needle_file_name = 'multi_needle_reasoning_zh.json' +diff = 10 +num_needles = 2 +needlebench_2needle_zh_datasets = [] +language = 'Chinese' + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', + 'type': 
NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_2needle_zh_datasets.append(dataset_dict) + +num_needles = 3 +needlebench_3needle_zh_datasets = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_3needle_zh_datasets.append(dataset_dict) + +num_needles = 4 +needlebench_4needle_zh_datasets = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_4needle_zh_datasets.append(dataset_dict) + +num_needles = 5 +needlebench_5needle_zh_datasets = [] + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k', + 'type': NeedleBenchMultiDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': language, + 'needle_file_name': needle_file_name, + 'num_needles': num_needles, + 'diff': diff, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_5needle_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_256k/needlebench_multi_retrieval_256k.py b/configs/datasets/needlebench/needlebench_256k/needlebench_multi_retrieval_256k.py new file mode 100644 index 00000000..5078ac0c --- /dev/null +++ b/configs/datasets/needlebench/needlebench_256k/needlebench_multi_retrieval_256k.py @@ -0,0 +1,109 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset +from 
opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchParallelEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) +context_lengths = [32000, 128000, 256000] +document_depth_percent_intervals = 20 +document_depth_percent_interval_type = "linear" + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_en_datasets = [] +needle_file_name = 'needles.jsonl' +depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_en_256k', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 3000, + 'guide': True, + 'language': 'English', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_en_datasets.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_zh_datasets = [] + +for original_context_length in context_lengths: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'_parallel_zh_256k', + 'type': NeedleBenchParallelDataset, + 'path': base_path, + 'needle_file_name': needle_file_name, + 'length': original_context_length, + 'depths': depths, + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 25, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/datasets/needlebench/needlebench_256k/needlebench_single_256k.py b/configs/datasets/needlebench/needlebench_256k/needlebench_single_256k.py new file mode 100644 index 00000000..d2ae6cec --- /dev/null +++ b/configs/datasets/needlebench/needlebench_256k/needlebench_single_256k.py @@ 
-0,0 +1,110 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset +from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator +from opencompass.datasets.needlebench.origin import needlebench_postprocess +from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess +import math + + +def logistic(x, L=100, x0=50, k=0.1): + return round(L / (1 + math.exp(-k * (x - x0))), 3) + + +def generate_linear_space(start, end, num): + if num == 1: + return [start] + elif num < 1: + raise ValueError("num must be at least 1.") + step = (end - start) / (num - 1) + return [start + step * i for i in range(num)] + + +def generate_depth_percents(intervals, interval_type): + if interval_type == 'linear': + return generate_linear_space(0, 100, intervals) + elif interval_type == 'sigmoid': + linear_space = generate_linear_space(0, 100, intervals) + return [logistic(x) for x in linear_space] + else: + raise ValueError('Unsupported interval type') + + +needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer') + +needlebench_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict(role='HUMAN', prompt='{prompt}'), + dict(role='BOT', prompt='{answer}\n'), + ] + ) + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer)) + +needlebench_eval_cfg = dict( + evaluator=dict(type=NeedleBenchOriginEvaluator), + pred_postprocessor=dict(type=needlebench_postprocess), + dataset_postprocessor=dict(type=needlebench_dataset_postprocess), + pred_role='BOT') + +# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000]) +context_lengths = [32000, 128000, 256000] +depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100] + +base_path = './data/needlebench' +file_list = ['PaulGrahamEssays.jsonl'] +needlebench_en_datasets = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_en_256k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 600, + 'guide': True, + 'language': 'English', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': needlebench_eval_cfg + } + needlebench_en_datasets.append(dataset_dict) + +file_list = ['zh_finance.jsonl'] +needlebench_zh_datasets = [] +needle_file_name = 'needles.jsonl' + +for original_context_length in context_lengths: + for depth_percent in depths_list: + dataset_dict = { + 'abbr': f'Length{original_context_length}' + f'Depth{int(depth_percent)}_origin_zh_256k', + 'type': NeedleBenchOriginDataset, + 'path': base_path, + 'length': original_context_length, + 'depth': int(depth_percent), + 'tokenizer_model': 'gpt-4', + 'file_list': file_list, + 'num_repeats_per_file': 10, + 'length_buffer': 200, + 'guide': True, + 'language': 'Chinese', + 'needle_file_name': needle_file_name, + 'reader_cfg': needlebench_reader_cfg, + 'infer_cfg': needlebench_infer_cfg, + 'eval_cfg': 
needlebench_eval_cfg + } + needlebench_zh_datasets.append(dataset_dict) diff --git a/configs/eval_code_passk.py b/configs/eval_code_passk.py index f0b23a3c..b348da61 100644 --- a/configs/eval_code_passk.py +++ b/configs/eval_code_passk.py @@ -9,8 +9,8 @@ from opencompass.tasks import OpenICLInferTask with read_base(): from .datasets.humaneval.humaneval_passk_gen_8e312c import humaneval_datasets - from .datasets.mbpp.mbpp_passk_gen_1e1056 import mbpp_datasets - from .datasets.mbpp.sanitized_mbpp_passk_gen_1e1056 import sanitized_mbpp_datasets + from .datasets.mbpp.deprecated_mbpp_passk_gen_1e1056 import mbpp_datasets + from .datasets.mbpp.deprecated_sanitized_mbpp_passk_gen_1e1056 import sanitized_mbpp_datasets datasets = [] datasets += humaneval_datasets diff --git a/configs/eval_code_passk_repeat_dataset.py b/configs/eval_code_passk_repeat_dataset.py index 9b99bedc..80708b7d 100644 --- a/configs/eval_code_passk_repeat_dataset.py +++ b/configs/eval_code_passk_repeat_dataset.py @@ -9,8 +9,8 @@ from opencompass.tasks import OpenICLInferTask with read_base(): from .datasets.humaneval.humaneval_repeat10_gen_8e312c import humaneval_datasets - from .datasets.mbpp.mbpp_repeat10_gen_1e1056 import mbpp_datasets - from .datasets.mbpp.sanitized_mbpp_repeat10_gen_1e1056 import sanitized_mbpp_datasets + from .datasets.mbpp.deprecated_mbpp_repeat10_gen_1e1056 import mbpp_datasets + from .datasets.mbpp.deprecated_sanitized_mbpp_repeat10_gen_1e1056 import sanitized_mbpp_datasets datasets = [] datasets += humaneval_datasets diff --git a/configs/eval_internlm2_chat_keyset.py b/configs/eval_internlm2_chat_keyset.py index 27e95a4c..6c3db879 100644 --- a/configs/eval_internlm2_chat_keyset.py +++ b/configs/eval_internlm2_chat_keyset.py @@ -8,7 +8,7 @@ with read_base(): from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets from .datasets.math.math_evaluatorv2_gen_cecb31 import math_datasets from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - from .datasets.mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets + from .datasets.mbpp.deprecated_sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model diff --git a/configs/eval_internlm2_keyset.py b/configs/eval_internlm2_keyset.py index fcb6e7e2..497b11d1 100644 --- a/configs/eval_internlm2_keyset.py +++ b/configs/eval_internlm2_keyset.py @@ -7,7 +7,7 @@ with read_base(): from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets from .datasets.math.math_gen_265cce import math_datasets from .datasets.humaneval.humaneval_gen_a82cae import humaneval_datasets - from .datasets.mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets + from .datasets.mbpp.deprecated_sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets from .models.hf_internlm.hf_internlm2_7b import models as hf_internlm2_7b_model from .models.hf_internlm.hf_internlm2_20b import models as hf_internlm2_20b_model diff --git a/configs/models/deepseek/hf_deepseek_67b_chat.py b/configs/models/deepseek/hf_deepseek_67b_chat.py index d84a6133..8eaadfc7 100644 --- a/configs/models/deepseek/hf_deepseek_67b_chat.py +++ b/configs/models/deepseek/hf_deepseek_67b_chat.py @@ -1,6 +1,7 @@ from opencompass.models import HuggingFaceCausalLM _meta_template = dict( + begin='<|begin▁of▁sentence|>', round=[ dict(role="HUMAN", begin='User: ', end='\n\n'), dict(role="BOT", begin="Assistant: ", 
end='<|end▁of▁sentence|>', generate=True), @@ -12,7 +13,6 @@ models = [ type=HuggingFaceCausalLM, abbr='deepseek-67b-chat-hf', path="deepseek-ai/deepseek-llm-67b-chat", - tokenizer_path='deepseek-ai/deepseek-llm-67b-chat', model_kwargs=dict( device_map='auto', trust_remote_code=True, @@ -28,6 +28,6 @@ models = [ max_seq_len=2048, batch_size=8, run_cfg=dict(num_gpus=4, num_procs=1), - end_str='<|end▁of▁sentence|>', + batch_padding=True, ) ] diff --git a/configs/models/deepseek/hf_deepseek_7b_chat.py b/configs/models/deepseek/hf_deepseek_7b_chat.py index 1791e09d..2531961b 100644 --- a/configs/models/deepseek/hf_deepseek_7b_chat.py +++ b/configs/models/deepseek/hf_deepseek_7b_chat.py @@ -1,6 +1,7 @@ from opencompass.models import HuggingFaceCausalLM _meta_template = dict( + begin='<|begin▁of▁sentence|>', round=[ dict(role="HUMAN", begin='User: ', end='\n\n'), dict(role="BOT", begin="Assistant: ", end='<|end▁of▁sentence|>', generate=True), @@ -12,7 +13,6 @@ models = [ type=HuggingFaceCausalLM, abbr='deepseek-7b-chat-hf', path="deepseek-ai/deepseek-llm-7b-chat", - tokenizer_path='deepseek-ai/deepseek-llm-7b-chat', model_kwargs=dict( device_map='auto', trust_remote_code=True, @@ -28,5 +28,6 @@ models = [ max_seq_len=2048, batch_size=8, run_cfg=dict(num_gpus=1, num_procs=1), + batch_padding=True, ) ] diff --git a/configs/models/deepseek/hf_deepseek_moe_16b_chat.py b/configs/models/deepseek/hf_deepseek_moe_16b_chat.py index 6c45f088..a009ff0c 100644 --- a/configs/models/deepseek/hf_deepseek_moe_16b_chat.py +++ b/configs/models/deepseek/hf_deepseek_moe_16b_chat.py @@ -1,6 +1,7 @@ from opencompass.models import HuggingFaceCausalLM _meta_template = dict( + begin='<|begin▁of▁sentence|>', round=[ dict(role="HUMAN", begin='User: ', end='\n\n'), dict(role="BOT", begin="Assistant: ", end='<|end▁of▁sentence|>', generate=True), @@ -12,7 +13,6 @@ models = [ type=HuggingFaceCausalLM, abbr='deepseek-moe-16b-chat-hf', path="deepseek-ai/deepseek-moe-16b-chat", - tokenizer_path='deepseek-ai/deepseek-moe-16b-chat', model_kwargs=dict( device_map='auto', trust_remote_code=True, @@ -26,7 +26,7 @@ models = [ max_out_len=100, max_seq_len=2048, batch_size=8, - run_cfg=dict(num_gpus=2, num_procs=1), - end_str='<|end▁of▁sentence|>', + run_cfg=dict(num_gpus=1, num_procs=1), + batch_padding=True, ) ] diff --git a/configs/models/gemma/hf_gemma_2b_it.py b/configs/models/gemma/hf_gemma_2b_it.py index b87243d7..0075484b 100644 --- a/configs/models/gemma/hf_gemma_2b_it.py +++ b/configs/models/gemma/hf_gemma_2b_it.py @@ -5,7 +5,6 @@ _meta_template = dict( dict(role="HUMAN", begin='user\n', end='\n'), dict(role="BOT", begin="model\n", end='\n', generate=True), ], - eos_token_id=151645, ) models = [ @@ -24,9 +23,11 @@ models = [ use_fast=False, ), meta_template=_meta_template, + min_out_len=1, max_out_len=100, max_seq_len=2048, batch_size=8, run_cfg=dict(num_gpus=1, num_procs=1), + batch_padding=True, ) ] diff --git a/configs/models/gemma/hf_gemma_7b_it.py b/configs/models/gemma/hf_gemma_7b_it.py index cc679b2f..b913db6e 100644 --- a/configs/models/gemma/hf_gemma_7b_it.py +++ b/configs/models/gemma/hf_gemma_7b_it.py @@ -5,7 +5,6 @@ _meta_template = dict( dict(role="HUMAN", begin='user\n', end='\n'), dict(role="BOT", begin="model\n", end='\n', generate=True), ], - eos_token_id=151645, ) models = [ @@ -29,5 +28,6 @@ models = [ max_seq_len=2048, batch_size=8, run_cfg=dict(num_gpus=1, num_procs=1), + batch_padding=True, ) ] diff --git a/configs/models/hf_internlm/hf_internlm2_chat_1_8b.py 
b/configs/models/hf_internlm/hf_internlm2_chat_1_8b.py index 48daa8e3..81c7d35d 100644 --- a/configs/models/hf_internlm/hf_internlm2_chat_1_8b.py +++ b/configs/models/hf_internlm/hf_internlm2_chat_1_8b.py @@ -6,7 +6,6 @@ _meta_template = dict( dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), ], - eos_token_id=92542 ) models = [ @@ -32,5 +31,6 @@ models = [ run_cfg=dict(num_gpus=1, num_procs=1), end_str='<|im_end|>', generation_kwargs = {"eos_token_id": [2, 92542]}, + batch_padding=True, ) ] diff --git a/configs/models/hf_internlm/hf_internlm2_chat_1_8b_sft.py b/configs/models/hf_internlm/hf_internlm2_chat_1_8b_sft.py index 24823a07..6228ea57 100644 --- a/configs/models/hf_internlm/hf_internlm2_chat_1_8b_sft.py +++ b/configs/models/hf_internlm/hf_internlm2_chat_1_8b_sft.py @@ -6,7 +6,6 @@ _meta_template = dict( dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), ], - eos_token_id=92542 ) models = [ @@ -32,5 +31,6 @@ models = [ run_cfg=dict(num_gpus=1, num_procs=1), end_str='<|im_end|>', generation_kwargs = {"eos_token_id": [2, 92542]}, + batch_padding=True, ) ] diff --git a/configs/models/hf_internlm/hf_internlm2_chat_20b.py b/configs/models/hf_internlm/hf_internlm2_chat_20b.py index 7d6515c3..c35e1701 100644 --- a/configs/models/hf_internlm/hf_internlm2_chat_20b.py +++ b/configs/models/hf_internlm/hf_internlm2_chat_20b.py @@ -6,7 +6,6 @@ _meta_template = dict( dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), ], - eos_token_id=92542 ) models = [ @@ -32,5 +31,6 @@ models = [ run_cfg=dict(num_gpus=2, num_procs=1), end_str='<|im_end|>', generation_kwargs = {"eos_token_id": [2, 92542]}, + batch_padding=True, ) ] diff --git a/configs/models/hf_internlm/hf_internlm2_chat_20b_sft.py b/configs/models/hf_internlm/hf_internlm2_chat_20b_sft.py index bde3c35c..53844f5c 100644 --- a/configs/models/hf_internlm/hf_internlm2_chat_20b_sft.py +++ b/configs/models/hf_internlm/hf_internlm2_chat_20b_sft.py @@ -6,7 +6,6 @@ _meta_template = dict( dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), ], - eos_token_id=92542 ) models = [ @@ -32,5 +31,6 @@ models = [ run_cfg=dict(num_gpus=2, num_procs=1), end_str='<|im_end|>', generation_kwargs = {"eos_token_id": [2, 92542]}, + batch_padding=True, ) ] diff --git a/configs/models/hf_internlm/hf_internlm2_chat_20b_with_system.py b/configs/models/hf_internlm/hf_internlm2_chat_20b_with_system.py index 29373777..3c490bf5 100644 --- a/configs/models/hf_internlm/hf_internlm2_chat_20b_with_system.py +++ b/configs/models/hf_internlm/hf_internlm2_chat_20b_with_system.py @@ -7,7 +7,6 @@ _meta_template = dict( dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), ], - eos_token_id=92542 ) models = [ @@ -33,5 +32,6 @@ models = [ run_cfg=dict(num_gpus=2, num_procs=1), end_str='<|im_end|>', generation_kwargs = {"eos_token_id": [2, 92542]}, + batch_padding=True, ) ] diff --git a/configs/models/hf_internlm/hf_internlm2_chat_7b.py b/configs/models/hf_internlm/hf_internlm2_chat_7b.py index 14cee114..3e0b349d 100644 --- a/configs/models/hf_internlm/hf_internlm2_chat_7b.py +++ 
b/configs/models/hf_internlm/hf_internlm2_chat_7b.py @@ -6,7 +6,6 @@ _meta_template = dict( dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), ], - eos_token_id=92542 ) models = [ @@ -32,5 +31,6 @@ models = [ run_cfg=dict(num_gpus=1, num_procs=1), end_str='<|im_end|>', generation_kwargs = {"eos_token_id": [2, 92542]}, + batch_padding=True, ) ] diff --git a/configs/models/hf_internlm/hf_internlm2_chat_7b_sft.py b/configs/models/hf_internlm/hf_internlm2_chat_7b_sft.py index fe3d5e7e..07164a67 100644 --- a/configs/models/hf_internlm/hf_internlm2_chat_7b_sft.py +++ b/configs/models/hf_internlm/hf_internlm2_chat_7b_sft.py @@ -6,7 +6,6 @@ _meta_template = dict( dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), ], - eos_token_id=92542 ) models = [ @@ -32,5 +31,6 @@ models = [ run_cfg=dict(num_gpus=1, num_procs=1), end_str='<|im_end|>', generation_kwargs = {"eos_token_id": [2, 92542]}, + batch_padding=True, ) ] diff --git a/configs/models/hf_internlm/hf_internlm2_chat_7b_with_system.py b/configs/models/hf_internlm/hf_internlm2_chat_7b_with_system.py index cd000115..e9628f6f 100644 --- a/configs/models/hf_internlm/hf_internlm2_chat_7b_with_system.py +++ b/configs/models/hf_internlm/hf_internlm2_chat_7b_with_system.py @@ -7,7 +7,6 @@ _meta_template = dict( dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), ], - eos_token_id=92542 ) models = [ @@ -33,5 +32,6 @@ models = [ run_cfg=dict(num_gpus=1, num_procs=1), end_str='<|im_end|>', generation_kwargs = {"eos_token_id": [2, 92542]}, + batch_padding=True, ) ] diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py index 77e9c12f..fcad86d9 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py +++ b/configs/models/hf_internlm/lmdeploy_internlm2_chat_20b.py @@ -3,27 +3,31 @@ from opencompass.models.turbomind import TurboMindModel _meta_template = dict( round=[ - dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), - dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', - generate=True), + dict(role="HUMAN", begin="<|im_start|>user\n", end="<|im_end|>\n"), + dict(role="BOT", begin="<|im_start|>assistant\n", end="<|im_end|>\n", generate=True), ], - eos_token_id=92542 ) models = [ dict( type=TurboMindModel, - abbr='internlm2-chat-20b-turbomind', + abbr="internlm2-chat-20b-turbomind", path="internlm/internlm2-chat-20b", meta_template=_meta_template, - engine_config=dict(session_len=210000, - max_batch_size=8, - rope_scaling_factor=3.0, - model_name="internlm2-chat-20b", - tp=2), - gen_config=dict(top_k=1, top_p=0.8, - temperature=1.0, - max_new_tokens=2000,), + engine_config=dict( + session_len=210000, + max_batch_size=8, + rope_scaling_factor=3.0, + model_name="internlm2-chat-20b", + tp=2, + stop_words=[2, 92542], + ), + gen_config=dict( + top_k=1, + top_p=0.8, + temperature=1.0, + max_new_tokens=2000, + ), max_out_len=2000, max_seq_len=210000, batch_size=1, diff --git a/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py b/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py index 6f5e7f91..424fc1c9 100644 --- a/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py +++ 
b/configs/models/hf_internlm/lmdeploy_internlm2_chat_7b.py @@ -3,29 +3,34 @@ from opencompass.models.turbomind import TurboMindModel _meta_template = dict( round=[ - dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), - dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', - generate=True), + dict(role="HUMAN", begin="<|im_start|>user\n", end="<|im_end|>\n"), + dict(role="BOT", begin="<|im_start|>assistant\n", end="<|im_end|>\n", generate=True), ], - eos_token_id=92542 ) models = [ dict( type=TurboMindModel, - abbr='internlm2-chat-7b-turbomind', + abbr="internlm2-chat-7b-turbomind", path="internlm/internlm2-chat-7b", meta_template=_meta_template, - engine_config=dict(session_len=210000, - max_batch_size=8, - rope_scaling_factor=2.0, - model_name="internlm2-chat-7b"), - gen_config=dict(top_k=1, top_p=0.8, - temperature=1.0, - max_new_tokens=2000), + engine_config=dict( + session_len=210000, + max_batch_size=8, + rope_scaling_factor=2.0, + model_name="internlm2-chat-7b", + tp=1, + stop_words=[2, 92542], + ), + gen_config=dict( + top_k=1, + top_p=0.8, + temperature=1.0, + max_new_tokens=2000, + ), max_out_len=2000, max_seq_len=210000, - batch_size=8, + batch_size=1, concurrency=8, run_cfg=dict(num_gpus=1, num_procs=1), ) diff --git a/configs/models/hf_llama/hf_llama2_13b_chat.py b/configs/models/hf_llama/hf_llama2_13b_chat.py index 1c5d2038..ef85562e 100644 --- a/configs/models/hf_llama/hf_llama2_13b_chat.py +++ b/configs/models/hf_llama/hf_llama2_13b_chat.py @@ -2,8 +2,8 @@ from opencompass.models import HuggingFaceCausalLM _meta_template = dict( round=[ - dict(role="HUMAN", begin=' [INST] ', end=' [/INST] '), - dict(role="BOT", begin='', end='', generate=True), + dict(role="HUMAN", begin='[INST] ', end=' [/INST]'), + dict(role="BOT", begin=' ', end=' ', generate=True), ], ) @@ -27,5 +27,6 @@ models = [ batch_size=8, run_cfg=dict(num_gpus=2, num_procs=1), end_str='[INST]', + batch_padding=True, ) ] diff --git a/configs/models/hf_llama/hf_llama2_70b_chat.py b/configs/models/hf_llama/hf_llama2_70b_chat.py index 51a433af..ff25d27d 100644 --- a/configs/models/hf_llama/hf_llama2_70b_chat.py +++ b/configs/models/hf_llama/hf_llama2_70b_chat.py @@ -2,8 +2,8 @@ from opencompass.models import HuggingFaceCausalLM _meta_template = dict( round=[ - dict(role="HUMAN", begin=' [INST] ', end=' [/INST] '), - dict(role="BOT", begin='', end='', generate=True), + dict(role="HUMAN", begin='[INST] ', end=' [/INST]'), + dict(role="BOT", begin=' ', end=' ', generate=True), ], ) @@ -27,5 +27,6 @@ models = [ batch_size=8, run_cfg=dict(num_gpus=4, num_procs=1), end_str='[INST]', + batch_padding=True, ) ] diff --git a/configs/models/hf_llama/hf_llama2_7b_chat.py b/configs/models/hf_llama/hf_llama2_7b_chat.py index 327f98bf..4c880729 100644 --- a/configs/models/hf_llama/hf_llama2_7b_chat.py +++ b/configs/models/hf_llama/hf_llama2_7b_chat.py @@ -2,8 +2,8 @@ from opencompass.models import HuggingFaceCausalLM _meta_template = dict( round=[ - dict(role="HUMAN", begin=' [INST] ', end=' [/INST] '), - dict(role="BOT", begin='', end='', generate=True), + dict(role="HUMAN", begin='[INST] ', end=' [/INST]'), + dict(role="BOT", begin=' ', end=' ', generate=True), ], ) @@ -27,5 +27,6 @@ models = [ batch_size=8, run_cfg=dict(num_gpus=1, num_procs=1), end_str='[INST]', + batch_padding=True, ) ] diff --git a/configs/models/mistral/hf_mistral_7b_instruct_v0_1.py b/configs/models/mistral/hf_mistral_7b_instruct_v0_1.py index 3f8256f4..b8149a51 100644 --- 
a/configs/models/mistral/hf_mistral_7b_instruct_v0_1.py +++ b/configs/models/mistral/hf_mistral_7b_instruct_v0_1.py @@ -4,10 +4,9 @@ from opencompass.models import HuggingFaceCausalLM _meta_template = dict( begin="", round=[ - dict(role="HUMAN", begin='[INST]', end='[/INST]'), - dict(role="BOT", begin="", end='', generate=True), + dict(role="HUMAN", begin='[INST] ', end=' [/INST]'), + dict(role="BOT", begin="", end=' ', generate=True), ], - eos_token_id=2 ) models = [ @@ -30,5 +29,6 @@ models = [ max_seq_len=2048, batch_size=8, run_cfg=dict(num_gpus=1, num_procs=1), + batch_padding=True, ) ] diff --git a/configs/models/mistral/hf_mistral_7b_instruct_v0_2.py b/configs/models/mistral/hf_mistral_7b_instruct_v0_2.py index f65a49b3..e109ca58 100644 --- a/configs/models/mistral/hf_mistral_7b_instruct_v0_2.py +++ b/configs/models/mistral/hf_mistral_7b_instruct_v0_2.py @@ -4,10 +4,9 @@ from opencompass.models import HuggingFaceCausalLM _meta_template = dict( begin="", round=[ - dict(role="HUMAN", begin='[INST]', end='[/INST]'), - dict(role="BOT", begin="", end='', generate=True), + dict(role="HUMAN", begin='[INST] ', end=' [/INST]'), + dict(role="BOT", begin="", end=' ', generate=True), ], - eos_token_id=2 ) models = [ @@ -30,6 +29,6 @@ models = [ max_seq_len=2048, batch_size=8, run_cfg=dict(num_gpus=1, num_procs=1), - end_str='', + batch_padding=True, ) ] diff --git a/configs/models/mixtral/hf_mixtral_8x7b_instruct_v0_1.py b/configs/models/mixtral/hf_mixtral_8x7b_instruct_v0_1.py index c67a732d..0c31f3c8 100644 --- a/configs/models/mixtral/hf_mixtral_8x7b_instruct_v0_1.py +++ b/configs/models/mixtral/hf_mixtral_8x7b_instruct_v0_1.py @@ -4,10 +4,9 @@ from opencompass.models import HuggingFaceCausalLM _meta_template = dict( begin="", round=[ - dict(role="HUMAN", begin='[INST]', end='[/INST]'), - dict(role="BOT", begin="", end='', generate=True), + dict(role="HUMAN", begin='[INST] ', end=' [/INST]'), + dict(role="BOT", begin="", end=' ', generate=True), ], - eos_token_id=2 ) models = [ @@ -30,6 +29,6 @@ models = [ max_seq_len=2048, batch_size=8, run_cfg=dict(num_gpus=2, num_procs=1), - end_str='', + batch_padding=True, ) ] diff --git a/configs/models/openbmb/hf_minicpm_2b_dpo_fp32.py b/configs/models/openbmb/hf_minicpm_2b_dpo_fp32.py index e4a93462..1b40ef2a 100644 --- a/configs/models/openbmb/hf_minicpm_2b_dpo_fp32.py +++ b/configs/models/openbmb/hf_minicpm_2b_dpo_fp32.py @@ -12,7 +12,6 @@ models = [ type=HuggingFace, abbr='minicpm-2b-dpo-hf', path='openbmb/MiniCPM-2B-dpo-fp32', - tokenizer_path='openbmb/MiniCPM-2B-dpo-fp32', model_kwargs=dict( trust_remote_code=True, device_map='auto', @@ -27,6 +26,6 @@ models = [ max_seq_len=2048, batch_size=8, run_cfg=dict(num_gpus=1, num_procs=1), - end_str='<用户>', + batch_padding=True, ) ] diff --git a/configs/models/openbmb/hf_minicpm_2b_sft_fp32.py b/configs/models/openbmb/hf_minicpm_2b_sft_fp32.py index 6a300747..b8ea8c32 100644 --- a/configs/models/openbmb/hf_minicpm_2b_sft_fp32.py +++ b/configs/models/openbmb/hf_minicpm_2b_sft_fp32.py @@ -12,7 +12,6 @@ models = [ type=HuggingFace, abbr='minicpm-2b-sft-hf', path='openbmb/MiniCPM-2B-sft-fp32', - tokenizer_path='openbmb/MiniCPM-2B-sft-fp32', model_kwargs=dict( trust_remote_code=True, device_map='auto', @@ -27,6 +26,6 @@ models = [ max_seq_len=2048, batch_size=8, run_cfg=dict(num_gpus=1, num_procs=1), - end_str='<用户>', + batch_padding=True, ) ] diff --git a/configs/models/others/hf_command_r_plus.py b/configs/models/others/hf_command_r_plus.py new file mode 100644 index 00000000..ce41ab3d --- /dev/null +++ 
b/configs/models/others/hf_command_r_plus.py @@ -0,0 +1,25 @@ +from opencompass.models import HuggingFaceCausalLM + +_meta_template = dict( + round=[ + dict(role="HUMAN", begin='<|START_OF_TURN_TOKEN|><|USER_TOKEN|>', end='<|END_OF_TURN_TOKEN|>'), + dict(role="BOT", begin="<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", end='<|END_OF_TURN_TOKEN|>', generate=True), + ], +) + +models = [ + dict( + type=HuggingFaceCausalLM, + abbr='command-r-plus-hf', + path="CohereForAI/c4ai-command-r-plus", + model_kwargs=dict(device_map='auto', trust_remote_code=True), + tokenizer_kwargs=dict(padding_side='left', truncation_side='left', trust_remote_code=True), + meta_template=_meta_template, + max_out_len=100, + max_seq_len=2048, + batch_size=8, + run_cfg=dict(num_gpus=8, num_procs=1), + end_str='<|END_OF_TURN_TOKEN|>', + batch_padding=True, + ) +] diff --git a/configs/models/others/hf_dbrx_instruct.py b/configs/models/others/hf_dbrx_instruct.py index 263129e2..af0a54b7 100644 --- a/configs/models/others/hf_dbrx_instruct.py +++ b/configs/models/others/hf_dbrx_instruct.py @@ -29,7 +29,6 @@ models = [ batch_size=8, meta_template=_meta_template, run_cfg=dict(num_gpus=8, num_procs=1), - end_str='<|im_end|>', batch_padding=True, ) ] diff --git a/configs/models/qwen/hf_qwen1_5_0_5b_chat.py b/configs/models/qwen/hf_qwen1_5_0_5b_chat.py index 04a6afed..c7413332 100644 --- a/configs/models/qwen/hf_qwen1_5_0_5b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_0_5b_chat.py @@ -5,7 +5,6 @@ _meta_template = dict( dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True), ], - eos_token_id=151645, ) models = [ @@ -24,11 +23,11 @@ models = [ use_fast=False, ), meta_template=_meta_template, - pad_token_id=151645, max_out_len=100, max_seq_len=2048, batch_size=8, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=dict(num_gpus=4, num_procs=1), end_str='<|im_end|>', + batch_padding=True, ) ] diff --git a/configs/models/qwen/hf_qwen1_5_14b_chat.py b/configs/models/qwen/hf_qwen1_5_14b_chat.py index 56c8bc47..f6bff1f9 100644 --- a/configs/models/qwen/hf_qwen1_5_14b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_14b_chat.py @@ -5,7 +5,6 @@ _meta_template = dict( dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True), ], - eos_token_id=151645, ) models = [ @@ -24,11 +23,11 @@ models = [ use_fast=False, ), meta_template=_meta_template, - pad_token_id=151645, max_out_len=100, max_seq_len=2048, batch_size=8, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=dict(num_gpus=4, num_procs=1), end_str='<|im_end|>', + batch_padding=True, ) ] diff --git a/configs/models/qwen/hf_qwen1_5_1_8b_chat.py b/configs/models/qwen/hf_qwen1_5_1_8b_chat.py index 22f425ad..4e090de0 100644 --- a/configs/models/qwen/hf_qwen1_5_1_8b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_1_8b_chat.py @@ -5,7 +5,6 @@ _meta_template = dict( dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True), ], - eos_token_id=151645, ) models = [ @@ -24,11 +23,11 @@ models = [ use_fast=False, ), meta_template=_meta_template, - pad_token_id=151645, max_out_len=100, max_seq_len=2048, batch_size=8, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=dict(num_gpus=4, num_procs=1), end_str='<|im_end|>', + batch_padding=True, ) ] diff --git a/configs/models/qwen/hf_qwen1_5_32b.py 
b/configs/models/qwen/hf_qwen1_5_32b.py new file mode 100644 index 00000000..9ad947af --- /dev/null +++ b/configs/models/qwen/hf_qwen1_5_32b.py @@ -0,0 +1,25 @@ +from opencompass.models import HuggingFaceCausalLM + +models = [ + dict( + type=HuggingFaceCausalLM, + abbr='qwen1.5-32b-hf', + path="Qwen/Qwen1.5-32B", + tokenizer_path='Qwen/Qwen1.5-32B', + model_kwargs=dict( + device_map='auto', + trust_remote_code=True + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False, + ), + pad_token_id=151645, + max_out_len=100, + max_seq_len=2048, + batch_size=8, + run_cfg=dict(num_gpus=2, num_procs=1), + ) +] diff --git a/configs/models/qwen/hf_qwen1_5_32b_chat.py b/configs/models/qwen/hf_qwen1_5_32b_chat.py new file mode 100644 index 00000000..1e215ff6 --- /dev/null +++ b/configs/models/qwen/hf_qwen1_5_32b_chat.py @@ -0,0 +1,33 @@ +from opencompass.models import HuggingFaceCausalLM + +_meta_template = dict( + round=[ + dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'), + dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True), + ], +) + +models = [ + dict( + type=HuggingFaceCausalLM, + abbr='qwen1.5-32b-chat-hf', + path="Qwen/Qwen1.5-32B-Chat", + model_kwargs=dict( + device_map='auto', + trust_remote_code=True + ), + tokenizer_kwargs=dict( + padding_side='left', + truncation_side='left', + trust_remote_code=True, + use_fast=False, + ), + meta_template=_meta_template, + max_out_len=100, + max_seq_len=2048, + batch_size=8, + run_cfg=dict(num_gpus=2, num_procs=1), + end_str='<|im_end|>', + batch_padding=True, + ) +] diff --git a/configs/models/qwen/hf_qwen1_5_4b_chat.py b/configs/models/qwen/hf_qwen1_5_4b_chat.py index b5ed4f89..427c7849 100644 --- a/configs/models/qwen/hf_qwen1_5_4b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_4b_chat.py @@ -5,7 +5,6 @@ _meta_template = dict( dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True), ], - eos_token_id=151645, ) models = [ @@ -24,11 +23,11 @@ models = [ use_fast=False, ), meta_template=_meta_template, - pad_token_id=151645, max_out_len=100, max_seq_len=2048, batch_size=8, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=dict(num_gpus=4, num_procs=1), end_str='<|im_end|>', + batch_padding=True, ) ] diff --git a/configs/models/qwen/hf_qwen1_5_72b_chat.py b/configs/models/qwen/hf_qwen1_5_72b_chat.py index b8b27b4c..f0279442 100644 --- a/configs/models/qwen/hf_qwen1_5_72b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_72b_chat.py @@ -5,7 +5,6 @@ _meta_template = dict( dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True), ], - eos_token_id=151645, ) models = [ @@ -24,11 +23,11 @@ models = [ use_fast=False, ), meta_template=_meta_template, - pad_token_id=151645, max_out_len=100, max_seq_len=2048, batch_size=8, run_cfg=dict(num_gpus=4, num_procs=1), end_str='<|im_end|>', + batch_padding=True, ) ] diff --git a/configs/models/qwen/hf_qwen1_5_7b_chat.py b/configs/models/qwen/hf_qwen1_5_7b_chat.py index 8eb05bee..43825c22 100644 --- a/configs/models/qwen/hf_qwen1_5_7b_chat.py +++ b/configs/models/qwen/hf_qwen1_5_7b_chat.py @@ -5,7 +5,6 @@ _meta_template = dict( dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True), ], - eos_token_id=151645, ) models = [ @@ -24,11 
+23,11 @@ models = [ use_fast=False, ), meta_template=_meta_template, - pad_token_id=151645, max_out_len=100, max_seq_len=2048, batch_size=8, - run_cfg=dict(num_gpus=1, num_procs=1), + run_cfg=dict(num_gpus=4, num_procs=1), end_str='<|im_end|>', + batch_padding=True, ) ] diff --git a/configs/models/yi/hf_yi_34b_chat.py b/configs/models/yi/hf_yi_34b_chat.py index 7ba9b10a..352c58bf 100644 --- a/configs/models/yi/hf_yi_34b_chat.py +++ b/configs/models/yi/hf_yi_34b_chat.py @@ -12,7 +12,6 @@ models = [ type=HuggingFace, abbr='yi-34b-chat-hf', path='01-ai/Yi-34B-Chat', - tokenizer_path='01-ai/Yi-34B-Chat', model_kwargs=dict( trust_remote_code=True, device_map='auto', @@ -26,7 +25,8 @@ models = [ max_out_len=100, max_seq_len=2048, batch_size=8, - run_cfg=dict(num_gpus=4, num_procs=1), + run_cfg=dict(num_gpus=2, num_procs=1), end_str='<|im_end|>', + batch_padding=True, ) ] diff --git a/configs/models/yi/hf_yi_6b_chat.py b/configs/models/yi/hf_yi_6b_chat.py index 273a0e0d..92a46e69 100644 --- a/configs/models/yi/hf_yi_6b_chat.py +++ b/configs/models/yi/hf_yi_6b_chat.py @@ -28,5 +28,6 @@ models = [ batch_size=8, run_cfg=dict(num_gpus=1, num_procs=1), end_str='<|im_end|>', + batch_padding=True, ) ] diff --git a/configs/summarizers/needlebench.py b/configs/summarizers/needlebench.py index 3ecb9f32..ef6ab813 100644 --- a/configs/summarizers/needlebench.py +++ b/configs/summarizers/needlebench.py @@ -17,7 +17,7 @@ def create_m_rs_names_list(context_lengths, depths, needle_counts, for depth in depths ] names_dict[key] = names_list - + multi_needle_list.extend(names_list) if language == 'en': multi_needle_en_list.extend(names_list) @@ -29,7 +29,7 @@ def create_m_rs_names_list(context_lengths, depths, needle_counts, return names_dict -def create_summarizer(context_lengths, depths, dataset_size, +def create_summarizer(context_lengths, depths, dataset_size, sparse_depths=None): needle_counts = ["2", "3", "4", "5"] languages = ["en", "zh"] @@ -40,7 +40,7 @@ def create_summarizer(context_lengths, depths, dataset_size, context_lengths, depths, needle_counts, languages, dataset_size) names_dict.update(multi_reasoning_names) - + single_needle_list = [] single_needle_en_list = [] single_needle_zh_list = [] @@ -133,6 +133,8 @@ context_lengths_128k = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 1 needlebench_128k_summarizer = create_summarizer(context_lengths_128k, depths_list_sparse, "128k") context_lengths_200k = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000]) needlebench_200k_summarizer = create_summarizer(context_lengths_200k, depths_list_sparse, "200k") +context_lengths_256k = list([32000, 128000, 256000]) +needlebench_256k_summarizer = create_summarizer(context_lengths_256k, depths_list_sparse, "256k") context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]) needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k, depths_list_sparse, "1000k") diff --git a/docs/en/advanced_guides/code_eval.md b/docs/en/advanced_guides/code_eval.md index d1b391c5..14748c6c 100644 --- a/docs/en/advanced_guides/code_eval.md +++ b/docs/en/advanced_guides/code_eval.md @@ -4,7 +4,7 @@ This tutorial primarily focuses on evaluating a model's coding proficiency, usin ## pass@1 -If you only need to generate a single response to evaluate the pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and 
[configs/datasets/mbpp/mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/mbpp_gen_1e1056.py), referring to the general [quick start tutorial](../get_started/quick_start.md). +If you only need to generate a single response to evaluate the pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py), referring to the general [quick start tutorial](../get_started/quick_start.md). For multilingual evaluation, please refer to the [Multilingual Code Evaluation Tutorial](./code_eval_service.md). @@ -21,7 +21,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator with read_base(): from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets + from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets mbpp_datasets[0]['type'] = MBPPDataset_V2 mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator @@ -63,7 +63,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator with read_base(): from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets + from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10' humaneval_datasets[0]['num_repeats'] = 10 diff --git a/docs/zh_cn/advanced_guides/code_eval.md b/docs/zh_cn/advanced_guides/code_eval.md index f6554d6d..2eb3c67f 100644 --- a/docs/zh_cn/advanced_guides/code_eval.md +++ b/docs/zh_cn/advanced_guides/code_eval.md @@ -4,7 +4,7 @@ ## pass@1 -如果只需要生成单条回复来评测pass@1的性能,可以直接使用[configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) 和 [configs/datasets/mbpp/mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/mbpp_gen_1e1056.py) 并参考通用的[快速上手教程](../get_started/quick_start.md)即可。 +如果只需要生成单条回复来评测pass@1的性能,可以直接使用[configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) 和 [configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py) 并参考通用的[快速上手教程](../get_started/quick_start.md)即可。 如果要进行多语言评测,可以参考[多语言代码评测教程](./code_eval_service.md)。 @@ -21,7 +21,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator with read_base(): from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets + from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets mbpp_datasets[0]['type'] = MBPPDataset_V2 mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator @@ -64,7 +64,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator with read_base(): from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets - from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets + from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10' 
humaneval_datasets[0]['num_repeats'] = 10 diff --git a/opencompass/cli/main.py b/opencompass/cli/main.py index 0c07906f..0e8f76d8 100644 --- a/opencompass/cli/main.py +++ b/opencompass/cli/main.py @@ -56,6 +56,12 @@ def parse_args(): 'to run', action='store_true', default=False) + parser.add_argument( + '--accelerator', + help='Infer accelerator, support vllm and lmdeploy now.', + choices=['vllm', 'lmdeploy', 'hg'], + default='hg', + type=str) parser.add_argument('-m', '--mode', help='Running mode. You can choose "infer" if you ' diff --git a/opencompass/datasets/apps.py b/opencompass/datasets/apps.py index d2ce4e34..f8ea0ec2 100644 --- a/opencompass/datasets/apps.py +++ b/opencompass/datasets/apps.py @@ -27,11 +27,9 @@ except ImportError: from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET -from opencompass.utils.logging import get_logger from .base import BaseDataset -logger = get_logger() TIMEOUT = 10 @@ -321,7 +319,7 @@ def timeout_handler(signum, frame): try: signal.signal(signal.SIGALRM, timeout_handler) except AttributeError: - logger.warning('signal.SIGALRM is not available on this platform') + print('signal.SIGALRM is not available on this platform') timeout = 4 # seconds diff --git a/opencompass/datasets/mbpp.py b/opencompass/datasets/mbpp.py index e7c0f1ca..51f87c5b 100644 --- a/opencompass/datasets/mbpp.py +++ b/opencompass/datasets/mbpp.py @@ -134,11 +134,20 @@ class MBPPPlusDataset(BaseDataset): multiple responses in special cases. """ + def processing_test(example): + example['test_case'] = example['test_list'] + example['test_list'] = '\n'.join(example['test_list']) + example['test_list_2'] = example['test_list'] + example['test_column'] = dict(test_list_2=example['test_list'], + task_id=example['task_id']) + return example + dataset = [] with open(path, 'r', encoding='utf-8') as f: for line in f: - dataset.extend( - [json.loads(line.strip()) for _ in range(num_repeats)]) + example = json.loads(line.strip()) + example = processing_test(example) + dataset.extend([example for _ in range(num_repeats)]) return Dataset.from_list(dataset) @@ -211,7 +220,7 @@ class MBPPEvaluator(BaseEvaluator): predictions)): pred = self._process_answer(pred) programs = self._process_test(refer, pred) - future = executor.submit(execution, programs, i, 3) + future = executor.submit(execution, programs, i, 10) futures.append(future) details[str(i)] = {} details[str(i)]['origin'] = predictions[i] @@ -262,39 +271,34 @@ class MBPPEvaluator(BaseEvaluator): return {f'mbpp_plus_{k}': score[k] * 100 for k in score} def _process_answer(self, text): - try: - # for chatGLM related text - eval_text = eval(text) - except Exception: - pass - else: - if isinstance(eval_text, str): - text = eval_text - # deal with code block - if '```' in text: - blocks = re.findall(r'```(.*?)```', text, re.DOTALL) - if len(blocks) == 0: - text = text.split('```')[1] # fall back to default strategy - else: - text = blocks[0] # fetch the first code block - if not text.startswith('\n'): # in case starting with ```xxx - text = text[max(text.find('\n') + 1, 0):] + patterns = [ + r"\[BEGIN\]\s*'(.*)'\s*\[DONE\]", + r"BEGIN\s*'(.*)'\s*\[DONE\]", + r"\[BEGIN\]\s*'(.*)'\s*DONE", + r"BEGIN\s*'(.*)'\s*DONE", + r"\[BEGIN\]\s*'(.*)\s*\[DONE\]", + r"BEGIN\s*'(.*)\s*\[DONE\]", + r"\[BEGIN\]\s*'(.*)\s*DONE", + r"BEGIN\s*'(.*)\s*DONE", + r'\[BEGIN\]\s*(.*)\s*\[DONE\]', + r'BEGIN\s*(.*)\s*\[DONE\]', + r'\[BEGIN\]\s*(.*)\s*DONE', + r'BEGIN\s*(.*)\s*DONE', + 
r'```python\s*(.*)\s*```', + r'```\s*(.*)\s*```', + r'(.*)\s*```.*', + r"\[BEGIN\]\s*'(.*)", + r'\[BEGIN\](.*)', + ] + for p in patterns: + match = re.search(p, text, re.DOTALL) + if match: + text = match.group(1) + break + text = text.split('```')[0] + text = re.split(r"'?\s*\[?DONE\]?", text)[0] + text = text.replace('\\_', '_') text = text.strip() - match = re.search(r"('\s*|)(\[DONE\]|DONE)", text) - if match: - text = text[:match.start()] - match = re.search(r"(\[BEGIN\]|BEGIN)('\s*|)", text) - if match: - text = text[match.end():] - text = text.strip() - if text.startswith("'"): - text = text[1:] - if text.endswith("'"): - text = text[:-1] - text = text.replace('\\', '') - match = re.search(r'```python(.*)```', text, re.DOTALL) - if match: - text = match.group(1).strip().split('```')[0].strip() return text def _process_test(self, test_case, pred): @@ -451,7 +455,7 @@ class MBPPPassKEvaluator(MBPPEvaluator): for pred in preds: pred = self._process_answer(pred) programs = self._process_test(test_case, pred) - future = executor.submit(execution, programs, task_id, 3) + future = executor.submit(execution, programs, task_id, 10) futures.append(future) from tqdm import tqdm diff --git a/opencompass/datasets/taco.py b/opencompass/datasets/taco.py index ea339364..79b41297 100644 --- a/opencompass/datasets/taco.py +++ b/opencompass/datasets/taco.py @@ -27,11 +27,9 @@ except ImportError: from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET -from opencompass.utils.logging import get_logger from .base import BaseDataset -logger = get_logger() TIMEOUT = 10 @@ -267,7 +265,7 @@ def timeout_handler(signum, frame): try: signal.signal(signal.SIGALRM, timeout_handler) except AttributeError: - logger.warning('signal.SIGALRM is not available on this platform') + print('signal.SIGALRM is not available on this platform') timeout = 4 # seconds diff --git a/opencompass/models/openai_api.py b/opencompass/models/openai_api.py index 9d86d72a..47a942df 100644 --- a/opencompass/models/openai_api.py +++ b/opencompass/models/openai_api.py @@ -84,7 +84,12 @@ class OpenAI(BaseAPIModel): self.top_logprobs = top_logprobs if isinstance(key, str): - self.keys = [os.getenv('OPENAI_API_KEY') if key == 'ENV' else key] + if key == 'ENV': + if 'OPENAI_API_KEY' not in os.environ: + raise ValueError('OpenAI API key is not set.') + self.keys = os.getenv('OPENAI_API_KEY').split(',') + else: + self.keys = [key] else: self.keys = key @@ -101,12 +106,11 @@ class OpenAI(BaseAPIModel): self.url = openai_api_base self.path = path - def generate( - self, - inputs: List[PromptType], - max_out_len: int = 512, - temperature: float = 0.7, - ) -> List[str]: + def generate(self, + inputs: List[PromptType], + max_out_len: int = 512, + temperature: float = 0.7, + **kwargs) -> List[str]: """Generate results given a list of inputs. 
Args: @@ -412,9 +416,15 @@ class OpenAIAllesAPIN(OpenAI): } for _ in range(self.retry): self.wait() - raw_response = requests.post(self.url, - headers=self.headers, - data=json.dumps(data)) + try: + raw_response = requests.post(self.url, + headers=self.headers, + data=json.dumps(data)) + except requests.ConnectionError: + self.logger.error('Request error, got', + str(raw_response.content)) + time.sleep(1) + continue try: response = raw_response.json() except requests.JSONDecodeError: diff --git a/opencompass/models/qwen_api.py b/opencompass/models/qwen_api.py index 68f8dae8..1f34cd2c 100644 --- a/opencompass/models/qwen_api.py +++ b/opencompass/models/qwen_api.py @@ -161,7 +161,7 @@ class Qwen(BaseAPIModel): time.sleep(1) continue if response.status_code == 429: - print('Rate limited') + print(response) time.sleep(2) continue if response.status_code == 400: diff --git a/opencompass/runners/dlc.py b/opencompass/runners/dlc.py index 79924841..55fcb62a 100644 --- a/opencompass/runners/dlc.py +++ b/opencompass/runners/dlc.py @@ -214,6 +214,16 @@ class DLCRunner(BaseRunner): pod_create_time = None pri_time = None initial_time = datetime.datetime.now() + + url = 'http://pai-console.cb210e3f99cd7403f8de2a630dcc99fc3.cn-wulanchabu.alicontainer.com' # noqa: E501 + logger = get_logger() + logger.debug('') + logger.debug('*' * 168) + logger.debug( + f'{url}/index?workspaceId={self.aliyun_cfg["workspace_id"]}#/dlc2/job/{job_id}/detail' # noqa: E501 + ) + logger.debug('*' * 168) + while True: # 1. Avoid to request dlc too frequently. # 2. DLC job may not be ready immediately after creation. diff --git a/opencompass/runners/slurm_sequential.py b/opencompass/runners/slurm_sequential.py index 3b4dcad5..cf61f23b 100644 --- a/opencompass/runners/slurm_sequential.py +++ b/opencompass/runners/slurm_sequential.py @@ -188,6 +188,7 @@ class SlurmSequentialRunner(BaseRunner): tmpl += f' --gres=gpu:{num_gpus}' for extra_cmd in self.extra_command: tmpl += f' {extra_cmd}' + tmpl += ' -x HOST-10-140-60-7' tmpl += f" -N1 -u -J '{task_name[:512]}'" + ' {task_cmd}' get_cmd = partial(task.get_command, cfg_path=param_file, diff --git a/opencompass/summarizers/needlebench.py b/opencompass/summarizers/needlebench.py index f811e3d9..9e03f960 100644 --- a/opencompass/summarizers/needlebench.py +++ b/opencompass/summarizers/needlebench.py @@ -72,7 +72,7 @@ dataset_mapping_dict = {} needle_counts = ['2', '3', '4', '5'] languages = ['en', 'zh'] -sizes = ['4k', '8k', '32k', '200k', '1000k'] +sizes = ['4k', '8k', '32k', '200k', '256k', '1000k'] types = ['origin', 'parallel'] for needle_count in needle_counts: @@ -190,7 +190,7 @@ def save_results_to_plots(txt_results_save_path): numbers = [2, 3, 4, 5] languages = ['en', 'zh'] size_exists = [] - sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_1000k'] + sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_256k', '_1000k'] for size in sizes_origin: if size in content: @@ -301,6 +301,9 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str): markersize=8, label='Average Depth Score' ) + for x_value, y_value in zip(x_data, y_data): + ax2.text(x_value, y_value, f'{y_value:.2f}', ha='center', va='top') + ax2.set_ylim(0, 100) ax2.set_yticklabels([]) @@ -353,7 +356,7 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str): new_save_path = os.path.join(directory_path, new_filename) plt.savefig(new_save_path, format='png', bbox_inches='tight', pad_inches=0) - print(f'Saved :{new_save_path}') + print(f'Saved: {new_save_path}') plt.close() 
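Note on the MBPP post-processing change earlier in this diff: the rewritten `MBPPEvaluator._process_answer` in `opencompass/datasets/mbpp.py` drops the old eval-and-strip heuristics in favour of an ordered list of regular expressions, takes the first pattern that matches, and then trims any stray `DONE` marker and escaped underscores. Below is a minimal, self-contained sketch of that first-match-wins behaviour; the completion string is hypothetical and only a subset of the patterns from the patch is shown.

```python
import re

# Subset of the patterns added in opencompass/datasets/mbpp.py, ordered from
# most to least specific; the first match wins.
patterns = [
    r"\[BEGIN\]\s*'(.*)'\s*\[DONE\]",   # quoted code between [BEGIN] and [DONE]
    r'\[BEGIN\]\s*(.*)\s*\[DONE\]',     # unquoted variant
    r'```python\s*(.*)\s*```',          # markdown code fence fallback
]

# Hypothetical model completion in the MBPP few-shot format.
completion = "[BEGIN] 'def add(a, b):\n    return a + b' [DONE]"

text = completion
for p in patterns:
    match = re.search(p, text, re.DOTALL)
    if match:
        text = match.group(1)
        break

# Mirror the trailing clean-up from the patch: cut at a stray code fence or
# DONE marker, unescape underscores, and strip surrounding whitespace.
text = text.split('```')[0]
text = re.split(r"'?\s*\[?DONE\]?", text)[0]
text = text.replace('\\_', '_').strip()

print(text)  # def add(a, b):\n    return a + b
```

Under these assumptions the snippet prints only the extracted function body; a completion that wraps its code in a ```python fence instead of the `[BEGIN]`/`[DONE]` markers falls through to the third pattern.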
diff --git a/opencompass/summarizers/subjective/compass_arena.py b/opencompass/summarizers/subjective/compass_arena.py index f6e6c246..764190b2 100644 --- a/opencompass/summarizers/subjective/compass_arena.py +++ b/opencompass/summarizers/subjective/compass_arena.py @@ -1,6 +1,5 @@ -# flake8: noqa: E501 -import ast -import csv +# flake8: noqa +# yapf: disable import os import os.path as osp import re @@ -10,7 +9,7 @@ from itertools import product import mmengine from mmengine import ConfigDict -from prettytable import from_csv +from tabulate import tabulate from opencompass.partitioners.sub_naive import remove_duplicate_pairs from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg @@ -18,6 +17,12 @@ from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg from .utils import get_judgeanswer_and_reference, get_outdir +def model_abbr_from_cfg_used_in_summarizer(model): + if model.get('summarizer_abbr', None): + return model['summarizer_abbr'] + else: + return model_abbr_from_cfg(model) + def post_process_compass_arena(s): if result := re.findall('(?:选择:|Choice: )([ABC])', s): return result[0] @@ -68,17 +73,90 @@ class CompassArenaSummarizer: self.base_models = self.cfg['eval']['partitioner']['base_models'] self.compare_models = self.cfg['eval']['partitioner']['compare_models'] self.judge_models = self.cfg.get('judge_models', None) - self.meta_judge_model = self.cfg.eval.partitioner.get( - 'meta_judge_model', None) + self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None) self.judge_type = judge_type assert self.judge_type in ['general'] - self.judge_map = { - 'general': post_process_compass_arena, - } + self.judge_map = {'general': post_process_compass_arena} self.judge_function = self.judge_map[self.judge_type] self.check_pos_bias = check_pos_bias self.summary_type = summary_type + def get_score(self, time_str): + output_dir, results_folder = get_outdir(self.cfg, time_str) + model_combinations = list(product(self.base_models, self.compare_models)) + unique_combinations = remove_duplicate_pairs([combo for combo in model_combinations if combo[0] != combo[1]]) + + if self.meta_judge_model is not None: + self.judge_models.append(self.meta_judge_model) + + scores = {} + + for idx, judge_model_cfg in enumerate(self.judge_models): + judge_model = model_abbr_from_cfg(judge_model_cfg) + for dataset in self.cfg['datasets']: + dataset_abbr = dataset_abbr_from_cfg(dataset) + for model_pair in unique_combinations: + model1 = model_pair[0]['abbr'] + model2 = model_pair[1]['abbr'] + if idx == len(self.judge_models): + subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model + else: + subdir = model1 + '_' + model2 + '_judged-by--' + judge_model + subdir_path = os.path.join(results_folder, subdir) + if not os.path.isdir(subdir_path): + print(subdir_path + ' is not exist! 
please check!') + continue + judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function) + if self.check_pos_bias: + bias_num = check_position_bias(judged_answers, references) + else: + bias_num = 0 + win_model1 = defaultdict(float) + win_model2 = defaultdict(float) + categories = defaultdict(float) + model1 = references[0]['answer1'] + model2 = references[0]['answer2'] + for prediction, reference in zip(judged_answers, references): + categories[dataset_abbr] += 1 + categories[reference['capability']] += 1 + + if prediction == 'A': + if reference['answer1'] == model1: + score_1, score_2 = 1, 0 + else: + score_1, score_2 = 0, 1 + elif prediction == 'B': + if reference['answer1'] == model1: + score_1, score_2 = 0, 1 + else: + score_1, score_2 = 1, 0 + elif prediction == 'C': + if self.summary_type == 'half_add': + score_1, score_2 = 0.5, 0.5 + else: + score_1, score_2 = 0, 0 + + win_model1[reference['capability']] += score_1 + win_model1[dataset_abbr] += score_1 + win_model2[reference['capability']] += score_2 + win_model2[dataset_abbr] += score_2 + for capability in categories: + win_model1[capability] = win_model1[capability] / categories[capability] * 100 + win_model1[capability] = round(win_model1[capability], 2) + win_model2[capability] = win_model2[capability] / categories[capability] * 100 + win_model2[capability] = round(win_model2[capability], 2) + + win_model1['position_bias'] = bias_num + win_model2['position_bias'] = bias_num + + if judge_model not in scores: + scores[judge_model] = {} + if dataset_abbr not in scores[judge_model]: + scores[judge_model][dataset_abbr] = {} + scores[judge_model][dataset_abbr][model2] = win_model2 + + return scores + def summarize( self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'), @@ -91,143 +169,72 @@ class CompassArenaSummarizer: Returns: pd.DataFrame: The summary results. 
""" - dataset_cfgs = self.cfg['datasets'] - output_dir, results_folder = get_outdir(self.cfg, time_str) - model_combinations = list( - product(self.base_models, self.compare_models)) - unique_combinations = remove_duplicate_pairs( - [combo for combo in model_combinations if combo[0] != combo[1]]) - fout_list = [] - pre_len = len(self.judge_models) - if self.meta_judge_model is not None: - self.judge_models.append(self.meta_judge_model) - meta_judge_model_abbr = model_abbr_from_cfg(self.meta_judge_model) - else: - meta_judge_model_abbr = None + + scores = self.get_score(time_str) + # scores['win_' + model1] = win_model1 + output_dir, results_folder = get_outdir(self.cfg, time_str) + + for idx, judge_model in enumerate(self.judge_models): - judge_model = model_abbr_from_cfg(judge_model) - for dataset in dataset_cfgs: + judge_abbr = model_abbr_from_cfg(judge_model) + for dataset in self.cfg['datasets']: dataset_abbr = dataset_abbr_from_cfg(dataset) - if idx == pre_len: - fout = osp.join( - output_dir, 'summarized-by--' + judge_model + '-' + - dataset_abbr + '-report.csv') + summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models] + one_column = list(scores[judge_abbr][dataset_abbr].values())[0] + row_headers = [i for i in one_column.keys() if i not in [dataset_abbr, 'position_bias']] + row_headers = [dataset_abbr, 'position_bias'] + row_headers + headers = [''] + summarizer_model_abbrs + table = [] + for row_header in row_headers: + row = [row_header] + for model_cfg in self.compare_models: + model_abbr = model_abbr_from_cfg(model_cfg) + s = scores[judge_abbr][dataset_abbr][model_abbr].get(row_header, '') + if isinstance(s, float): + s = f'{s:.2f}' + if isinstance(s, int): + s = str(s) + row.append(s) + table.append(row) + txt = tabulate(table, headers=headers) + print(txt) + + if idx == len(self.judge_models): + output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv') else: - fout = osp.join( - output_dir, 'judged-by--' + judge_model + '-' + - dataset_abbr + '-report.csv') - fout_list.append(fout) - for model_pair in unique_combinations: - model1, model2, = model_pair[0]['abbr'], model_pair[1][ - 'abbr'], - if idx == pre_len: - subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model - else: - subdir = model1 + '_' + model2 + '_judged-by--' + judge_model - subdir_path = os.path.join(results_folder, subdir) - if os.path.isdir(subdir_path): - judged_answers, references = get_judgeanswer_and_reference( - dataset, - subdir_path, - self.judge_function, - ) - if self.check_pos_bias: - bias_num = check_position_bias( - judged_answers, references) - else: - bias_num = 0 - win_model1, win_model2, categories = defaultdict( - float), defaultdict(float), defaultdict(float) - model1, model2 = references[0]['answer1'], references[ - 0]['answer2'] - for prediction, reference in zip( - judged_answers, references): - if self.summary_type == 'single': - if prediction == 'A': - categories['total'] += 1 - categories[reference['capability']] += 1 - if reference['answer1'] == model1: - win_model1[ - reference['capability']] += 1 - win_model1['total'] += 1 - else: - win_model2[ - reference['capability']] += 1 - win_model2['total'] += 1 - elif prediction == 'B': - categories['total'] += 1 - categories[reference['capability']] += 1 - if reference['answer1'] == model1: - win_model2[ - reference['capability']] += 1 - win_model2['total'] += 1 - else: - win_model1[ - reference['capability']] += 1 - win_model1['total'] += 
1 - elif self.summary_type == 'half_add': - categories['total'] += 1 - categories[reference['capability']] += 1 - if prediction == 'A': - if reference['answer1'] == model1: - win_model1[ - reference['capability']] += 1 - win_model1['total'] += 1 - else: - win_model2[ - reference['capability']] += 1 - win_model2['total'] += 1 - elif prediction == 'B': - if reference['answer1'] == model1: - win_model2[ - reference['capability']] += 1 - win_model2['total'] += 1 - else: - win_model1[ - reference['capability']] += 1 - win_model1['total'] += 1 - elif prediction == 'C': - win_model1[reference['capability']] += 0.5 - win_model1['total'] += 0.5 - win_model2[reference['capability']] += 0.5 - win_model2['total'] += 0.5 - for capability in categories: - if capability not in win_model1: - win_model1[capability] = 0.0 - else: - win_model1[capability] = round( - (win_model1[capability] / - categories[capability]) * 100, 2) - if capability not in win_model2: - win_model2[capability] = 0.0 - else: - win_model2[capability] = round( - (win_model2[capability] / - categories[capability]) * 100, 2) - win_model1['position_bias'] = bias_num - win_model2['position_bias'] = bias_num - scores = { - 'win_' + model1: win_model1, - 'win_' + model2: win_model2 - } - rows = list(scores.keys()) - columns = list(scores[rows[0]].keys()) - columns.insert(0, columns.pop(columns.index('total'))) - columns.insert( - 1, columns.pop(columns.index('position_bias'))) - with open(fout, 'a+', newline='') as csvfile: - writer = csv.writer(csvfile) - writer.writerow([model1 + '_vs_' + model2] + - columns) - for row in rows: - writer.writerow([row] + [ - scores[row][column] for column in columns - ]) - else: - print(subdir_path + ' is not exist! please check!') - for fout in fout_list: - with open(fout, 'r') as f: - x = from_csv(f) - print(fout) - print(x) + output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv') + + with open(output_filename, 'w') as f: + f.write(','.join(headers) + '\n') + for line in table: + f.write(','.join(line) + '\n') + print(output_filename) + + table = [] + summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models] + headers = [''] + summarizer_model_abbrs + for dataset in self.cfg['datasets']: + dataset_abbr = dataset_abbr_from_cfg(dataset) + row = [dataset_abbr] + for model_cfg in self.compare_models: + model_abbr = model_abbr_from_cfg(model_cfg) + s = scores[judge_abbr][dataset_abbr][model_abbr].get(dataset_abbr, '') + if isinstance(s, float): + s = f'{s:.2f}' + if isinstance(s, int): + s = str(s) + row.append(s) + table.append(row) + txt = tabulate(table, headers=headers) + print(txt) + + if idx == len(self.judge_models): + output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-overall-report.csv') + else: + output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-overall-report.csv') + with open(output_filename, 'w') as f: + f.write(','.join(headers) + '\n') + for line in table: + f.write(','.join(line) + '\n') + print(output_filename) diff --git a/opencompass/summarizers/subjective/mtbench.py b/opencompass/summarizers/subjective/mtbench.py index 5867769b..c2c8f2cf 100644 --- a/opencompass/summarizers/subjective/mtbench.py +++ b/opencompass/summarizers/subjective/mtbench.py @@ -1,4 +1,5 @@ -# flake8: noqa: E501 +# flake8: noqa +# yapf: disable import csv import os import os.path as osp @@ -8,11 +9,7 @@ from datetime import datetime import numpy as np from mmengine import ConfigDict - 
+            table = []
+            summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
+            headers = [''] + summarizer_model_abbrs
+            for dataset in self.cfg['datasets']:
+                dataset_abbr = dataset_abbr_from_cfg(dataset)
+                row = [dataset_abbr]
+                for model_cfg in self.compare_models:
+                    model_abbr = model_abbr_from_cfg(model_cfg)
+                    s = scores[judge_abbr][dataset_abbr][model_abbr].get(dataset_abbr, '')
+                    if isinstance(s, float):
+                        s = f'{s:.2f}'
+                    if isinstance(s, int):
+                        s = str(s)
+                    row.append(s)
+                table.append(row)
+            txt = tabulate(table, headers=headers)
+            print(txt)
+
+            if self.meta_judge_model is not None and idx == len(self.judge_models) - 1:
+                output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-overall-report.csv')
+            else:
+                output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-overall-report.csv')
+            with open(output_filename, 'w') as f:
+                f.write(','.join(headers) + '\n')
+                for line in table:
+                    f.write(','.join(line) + '\n')
+            print(output_filename)
diff --git a/opencompass/summarizers/subjective/mtbench.py b/opencompass/summarizers/subjective/mtbench.py
index 5867769b..c2c8f2cf 100644
--- a/opencompass/summarizers/subjective/mtbench.py
+++ b/opencompass/summarizers/subjective/mtbench.py
@@ -1,4 +1,5 @@
-# flake8: noqa: E501
+# flake8: noqa
+# yapf: disable
 import csv
 import os
 import os.path as osp
@@ -8,11 +9,7 @@
 from datetime import datetime
 
 import numpy as np
 from mmengine import ConfigDict
-
-try:
-    from prettytable import from_csv
-except ImportError:
-    from_csv = None
+from tabulate import tabulate
 
 from opencompass.utils import model_abbr_from_cfg
 
@@ -20,6 +17,12 @@ from .compass_arena import CompassArenaSummarizer
 from .utils import get_judgeanswer_and_reference, get_outdir
 
 
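+# Display name used in summary reports: prefer an explicit `summarizer_abbr`
+# set on the model config and fall back to the regular model abbreviation.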
+def model_abbr_from_cfg_used_in_summarizer(model):
+    if model.get('summarizer_abbr', None):
+        return model['summarizer_abbr']
+    else:
+        return model_abbr_from_cfg(model)
+
 
 def post_process_mtbench_pair(judgement: str):
     """Input a string like below:
 
@@ -52,7 +55,7 @@
     references,
     fout,
     fout_flag,
-    model,
+    model_abbr,
 ):
     capability_ratings = defaultdict(int)
     capability_counts = defaultdict(int)
@@ -70,12 +73,12 @@
         capability_avg_ratings[capability] = s
     columns = list(capability_avg_ratings.keys())
     columns.insert(0, columns.pop(columns.index('total')))
+
     with open(fout, 'a+', newline='') as csvfile:
         writer = csv.writer(csvfile)
         if fout_flag == 0:
             writer.writerow(['model'] + columns)
-        writer.writerow([model] +
-                        [capability_avg_ratings[column] for column in columns])
+        writer.writerow([model_abbr] + [capability_avg_ratings[column] for column in columns])
 
 
 class MTBenchSummarizer(CompassArenaSummarizer):
@@ -92,13 +95,9 @@
         self.cfg = config
         if self.judge_type == 'single':
             self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
-            self.eval_model_abbrs = [
-                model_abbr_from_cfg(model) for model in self.eval_model_cfgs
-            ]
         elif self.judge_type == 'pair':
             self.base_models = self.cfg['eval']['partitioner']['base_models']
-            self.compare_models = self.cfg['eval']['partitioner'][
-                'compare_models']
+            self.compare_models = self.cfg['eval']['partitioner']['compare_models']
         self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
         self.judge_map = {
             'single': post_process_mtbench_single,
@@ -106,8 +105,7 @@
         }
         self.judge_function = self.judge_map[self.judge_type]
 
-    def summarize(self,
-                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
+    def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
         """Summarize the subjectivity analysis based on evaluation results.
 
         Args:
@@ -116,33 +114,40 @@
         Returns:
             pd.DataFrame: The summary results.
         """
-        if self.judge_type == 'single':
-            dataset_cfgs = self.cfg['datasets']
-            output_dir, results_folder = get_outdir(self.cfg, time_str)
-            fout_flag = 0
-            for eval_model_abbr in self.eval_model_abbrs:
-                subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
-                subdir_path = os.path.join(results_folder, subdir)
-                if os.path.isdir(subdir_path):
-                    model, judge_model = eval_model_abbr, self.judge_abbr
-                    fout = osp.join(
-                        output_dir,
-                        'judged-by--' + judge_model + '-capability.csv')
-                    overall_judged_answers, overall_references = [], []
-                    for dataset in dataset_cfgs:
-                        judged_answers, references = get_judgeanswer_and_reference(
-                            dataset, subdir_path, self.judge_function)
-                        overall_judged_answers += judged_answers
-                        overall_references += references
-                    get_capability_results(overall_judged_answers,
-                                           overall_references, fout, fout_flag,
-                                           model)
-                    fout_flag += 1
-                else:
-                    print(subdir_path + ' is not exist! please check!')
-            with open(fout, 'r') as f:
-                x = from_csv(f)
-            print(x)
-            print(fout)
-        elif self.judge_type == 'pair':
-            super().summarize()
+        if self.judge_type == 'pair':
+            return super().summarize()
+
+        # self.judge_type == 'single'
+        dataset_cfgs = self.cfg['datasets']
+        output_dir, results_folder = get_outdir(self.cfg, time_str)
+        fout_flag = 0
+        for eval_model_cfg in self.eval_model_cfgs:
+            eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
+            show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
+            subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + self.judge_abbr)
+            if os.path.isdir(subdir_path):
+                fout = osp.join(output_dir, 'judged-by--' + self.judge_abbr + '-capability.csv')
+                overall_judged_answers, overall_references = [], []
+                for dataset in dataset_cfgs:
+                    judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
+                    overall_judged_answers += judged_answers
+                    overall_references += references
+                get_capability_results(overall_judged_answers, overall_references, fout, fout_flag, show_model_abbr)
+                fout_flag += 1
+            else:
+                print(subdir_path + ' is not exist! please check!')
+        with open(fout, 'r') as f:
+            csv_reader = csv.reader(f)
+            header = next(csv_reader)
+            table = [line for line in csv_reader]
+
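+        # Transpose the capability CSV so that models become columns and each
+        # metric (total plus the per-capability averages) becomes a row, then
+        # rewrite the file in that layout.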
+        new_header = [''] + [line[0] for line in table]
+        new_table = [[h] + [line[i] for line in table] for i, h in enumerate(header[1:], start=1)]
+        t = tabulate(new_table, headers=new_header)
+        with open(fout, 'w') as f:
+            f.write(','.join(new_header) + '\n')
+            for line in new_table:
+                f.write(','.join(map(str, line)) + '\n')
+        print(t)
+        print(fout)
diff --git a/opencompass/tasks/openicl_eval.py b/opencompass/tasks/openicl_eval.py
index fa43c410..8351d81c 100644
--- a/opencompass/tasks/openicl_eval.py
+++ b/opencompass/tasks/openicl_eval.py
@@ -3,6 +3,7 @@ import copy
 import fnmatch
 import math
 import os.path as osp
+import re
 import statistics
 import time
 from collections import Counter
@@ -38,12 +39,12 @@ def extract_role_pred(s: str, begin_str: Optional[str],
     start = 0
     end = len(s)
 
-    if begin_str:
+    if begin_str and re.fullmatch(r'\s*', begin_str) is None:
         begin_idx = s.find(begin_str)
         if begin_idx != -1:
             start = begin_idx + len(begin_str)
 
-    if end_str:
+    if end_str and re.fullmatch(r'\s*', end_str) is None:
         # TODO: Support calling tokenizer for the accurate eos token
         # and avoid such hardcode
         end_idx = s.find(end_str, start)
diff --git a/opencompass/tasks/outer_eval/alpacaeval.py b/opencompass/tasks/outer_eval/alpacaeval.py
index 006551e2..2701e870 100644
--- a/opencompass/tasks/outer_eval/alpacaeval.py
+++ b/opencompass/tasks/outer_eval/alpacaeval.py
@@ -1,6 +1,7 @@
 # flake8: noqa: E501
 import copy
 import json
+import os
 import os.path as osp
 
 import mmengine
@@ -123,6 +124,10 @@
         command = ''
         if api_key is not None:
             command += f'export OPENAI_API_KEY={api_key}; '
+        else:
+            api_key = os.environ.get('OPENAI_API_KEY', '').split(',')[0]
+            if api_key:
+                command += f'export OPENAI_API_KEY={api_key}; '
         command += f'alpaca_eval --model_outputs {filename} --annotators_config {alpaca_cfg} --output_path {output_path}'
         return template.format(task_cmd=command)
diff --git a/opencompass/utils/run.py b/opencompass/utils/run.py
index 3f3baa77..5a53da0f 100644
--- a/opencompass/utils/run.py
+++ b/opencompass/utils/run.py
@@ -5,6 +5,7 @@ import tabulate
 from mmengine.config import Config
 
 from opencompass.datasets.custom import make_custom_dataset_config
+from opencompass.models import VLLM, HuggingFaceCausalLM, TurboMindModel
 from opencompass.partitioners import NaivePartitioner, SizePartitioner
 from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
 from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
@@ -72,6 +73,10 @@ def get_config_from_arg(args) -> Config:
     if args.config:
         config = Config.fromfile(args.config, format_python_code=False)
         config = try_fill_in_custom_cfgs(config)
+        # set infer accelerator if needed
+        if args.accelerator in ['vllm', 'lmdeploy']:
+            config['models'] = change_accelerator(config['models'],
+                                                  args.accelerator)
        return config
     # parse dataset args
     if not args.datasets and not args.custom_dataset_path:
@@ -137,6 +142,9 @@
             pad_token_id=args.pad_token_id,
             run_cfg=dict(num_gpus=args.num_gpus))
         models.append(model)
+    # set infer accelerator if needed
+    if args.accelerator in ['vllm', 'lmdeploy']:
+        models = change_accelerator(models, args.accelerator)
     # parse summarizer args
     summarizer_arg = args.summarizer if args.summarizer is not None \
         else 'example'
@@ -164,6 +172,93 @@
                           format_python_code=False)
 
 
+def change_accelerator(models, accelerator):
+    models = models.copy()
+    model_accels = []
+    for model in models:
+        get_logger().info(f'Transforming {model["abbr"]} to {accelerator}')
+        # change HuggingFace model to VLLM or TurboMindModel
+        if model['type'] is HuggingFaceCausalLM:
+            # keep a reference to the original HF config so that fields such
+            # as meta_template / end_str can be carried over further below
+            hf_model_cfg = model
+            gen_args = dict()
+            if model.get('generation_kwargs') is not None:
+                generation_kwargs = model['generation_kwargs'].copy()
+                gen_args['temperature'] = 0.001 if generation_kwargs.get(
+                    'temperature'
+                ) is None else generation_kwargs['temperature']
+                gen_args['top_k'] = 1 if generation_kwargs.get(
+                    'top_k') is None else generation_kwargs['top_k']
+                gen_args['top_p'] = 0.9 if generation_kwargs.get(
+                    'top_p') is None else generation_kwargs['top_p']
+                gen_args['stop_token_ids'] = None if generation_kwargs.get(
+                    'eos_token_id'
+                ) is None else generation_kwargs['eos_token_id']
+                generation_kwargs[
+                    'stop_token_ids'] = None if generation_kwargs.get(
+                        'eos_token_id'
+                    ) is None else generation_kwargs['eos_token_id']
+                generation_kwargs.pop('eos_token_id', None)
+            else:
+                # if generation_kwargs is not provided, set default values
+                generation_kwargs = dict()
+                gen_args['temperature'] = 0.0
+                gen_args['top_k'] = 1
+                gen_args['top_p'] = 0.9
+                gen_args['stop_token_ids'] = None
+
+            if accelerator == 'lmdeploy':
+                get_logger().info(
+                    f'Transforming {model["abbr"]} to {accelerator}')
+                model = dict(
+                    type=  # noqa E251
+                    f'{TurboMindModel.__module__}.{TurboMindModel.__name__}',
+                    abbr=model['abbr'].replace('hf', 'lmdeploy')
+                    if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy',
+                    path=model['path'],
+                    engine_config=dict(session_len=model['max_seq_len'],
+                                       max_batch_size=model['batch_size'],
+                                       tp=model['run_cfg']['num_gpus']),
+                    gen_config=dict(top_k=gen_args['top_k'],
+                                    temperature=gen_args['temperature'],
+                                    top_p=gen_args['top_p'],
+                                    max_new_tokens=model['max_out_len'],
+                                    stop_words=gen_args['stop_token_ids']),
+                    max_out_len=model['max_out_len'],
+                    max_seq_len=model['max_seq_len'],
+                    batch_size=model['batch_size'],
+                    concurrency=model['batch_size'],
+                    run_cfg=model['run_cfg'],
+                )
+                for item in ['meta_template']:
+                    if hf_model_cfg.get(item) is not None:
+                        model[item] = hf_model_cfg[item]
+            elif accelerator == 'vllm':
+                get_logger().info(
+                    f'Transforming {model["abbr"]} to {accelerator}')
+
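+                # Mirror the lmdeploy branch: rebuild the config as a VLLM
+                # model, mapping tensor parallelism to the configured GPU
+                # count and reusing the generation settings gathered above.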
+                model = dict(
+                    type=f'{VLLM.__module__}.{VLLM.__name__}',
+                    abbr=model['abbr'].replace('hf', 'vllm')
+                    if '-hf' in model['abbr'] else model['abbr'] + '-vllm',
+                    path=model['path'],
+                    model_kwargs=dict(
+                        tensor_parallel_size=model['run_cfg']['num_gpus']),
+                    max_out_len=model['max_out_len'],
+                    max_seq_len=model['max_seq_len'],
+                    batch_size=model['batch_size'],
+                    generation_kwargs=generation_kwargs,
+                    run_cfg=model['run_cfg'],
+                )
+                for item in ['meta_template', 'end_str']:
+                    if hf_model_cfg.get(item) is not None:
+                        model[item] = hf_model_cfg[item]
+                generation_kwargs.update(
+                    dict(temperature=gen_args['temperature']))
+            else:
+                raise ValueError(f'Unsupported accelerator {accelerator}')
+        model_accels.append(model)
+    return model_accels
+
+
 
 def exec_mm_infer_runner(tasks, args, cfg):
     """execute multimodal infer runner according to args."""
     if args.slurm: