mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Sync] deprecate old mbpps (#1064)
This commit is contained in:
parent
c172401323
commit
8c85edd1cd
@ -15,6 +15,6 @@ with read_base():
|
|||||||
from ..math.math_evaluatorv2_gen_9d2049 import math_datasets
|
from ..math.math_evaluatorv2_gen_9d2049 import math_datasets
|
||||||
from ..TheoremQA.TheoremQA_post_v2_gen_2c2583 import TheoremQA_datasets
|
from ..TheoremQA.TheoremQA_post_v2_gen_2c2583 import TheoremQA_datasets
|
||||||
from ..humaneval.humaneval_gen_d2537e import humaneval_datasets
|
from ..humaneval.humaneval_gen_d2537e import humaneval_datasets
|
||||||
from ..mbpp.sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets
|
from ..mbpp.deprecated_sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets
|
||||||
|
|
||||||
datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
|
datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
|
||||||
|
@ -7,7 +7,7 @@ with read_base():
|
|||||||
from ..GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
|
from ..GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
|
||||||
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
|
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
|
||||||
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
|
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||||
from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
|
from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
|
||||||
from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
|
from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
|
||||||
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
|
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
|
||||||
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
|
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
|
||||||
|
@ -7,7 +7,7 @@ with read_base():
|
|||||||
from ..GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
|
from ..GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
|
||||||
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
|
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
|
||||||
from ..humaneval.humaneval_gen_a82cae import humaneval_datasets
|
from ..humaneval.humaneval_gen_a82cae import humaneval_datasets
|
||||||
from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
|
from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
|
||||||
from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
|
from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
|
||||||
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
|
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
|
||||||
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
|
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
|
||||||
|
@ -11,7 +11,7 @@ with read_base():
|
|||||||
from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_868415 import cluewsc_datasets
|
from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_868415 import cluewsc_datasets
|
||||||
from ..FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
|
from ..FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
|
||||||
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
|
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||||
from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
|
from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
|
||||||
from ..lambada.lambada_gen_217e11 import lambada_datasets
|
from ..lambada.lambada_gen_217e11 import lambada_datasets
|
||||||
from ..storycloze.storycloze_ppl_496661 import storycloze_datasets
|
from ..storycloze.storycloze_ppl_496661 import storycloze_datasets
|
||||||
from ..SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
|
from ..SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets
|
||||||
|
@ -15,6 +15,6 @@ with read_base():
|
|||||||
from ..math.math_evaluatorv2_gen_cecb31 import math_datasets
|
from ..math.math_evaluatorv2_gen_cecb31 import math_datasets
|
||||||
from ..TheoremQA.TheoremQA_post_v2_gen_ef26ca import TheoremQA_datasets
|
from ..TheoremQA.TheoremQA_post_v2_gen_ef26ca import TheoremQA_datasets
|
||||||
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
|
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||||
from ..mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
|
from ..mbpp.deprecated_sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
|
||||||
|
|
||||||
datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
|
datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
|
||||||
|
@ -7,7 +7,7 @@ with read_base():
|
|||||||
from ..GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
|
from ..GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
|
||||||
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
|
from ..bbh.bbh_gen_5b92b0 import bbh_datasets
|
||||||
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
|
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||||
from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
|
from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
|
||||||
from ..CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
|
from ..CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
|
||||||
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
|
from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
|
||||||
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
|
from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets
|
||||||
|
@ -12,7 +12,7 @@ with read_base():
|
|||||||
from ..FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
|
from ..FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
|
||||||
from ..FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
|
from ..FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
|
||||||
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
|
from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||||
from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
|
from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
|
||||||
from ..lambada.lambada_gen_217e11 import lambada_datasets
|
from ..lambada.lambada_gen_217e11 import lambada_datasets
|
||||||
from ..storycloze.storycloze_gen_7f656a import storycloze_datasets
|
from ..storycloze.storycloze_gen_7f656a import storycloze_datasets
|
||||||
from ..SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
|
from ..SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets
|
||||||
|
@ -44,7 +44,7 @@ with read_base():
|
|||||||
from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||||
from ...drop.drop_gen_8a9ed9 import drop_datasets
|
from ...drop.drop_gen_8a9ed9 import drop_datasets
|
||||||
from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
|
from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
|
||||||
from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
|
from ...mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
|
||||||
from ...bbh.bbh_gen_5bf00b import bbh_datasets
|
from ...bbh.bbh_gen_5bf00b import bbh_datasets
|
||||||
|
|
||||||
|
|
||||||
|
@ -44,7 +44,7 @@ with read_base():
|
|||||||
from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||||
from ...drop.drop_gen_8a9ed9 import drop_datasets
|
from ...drop.drop_gen_8a9ed9 import drop_datasets
|
||||||
from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
|
from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
|
||||||
from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
|
from ...mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
|
||||||
from ...bbh.bbh_gen_5b92b0 import bbh_datasets
|
from ...bbh.bbh_gen_5b92b0 import bbh_datasets
|
||||||
|
|
||||||
|
|
||||||
|
46
configs/datasets/gpqa/gpqa_gen_015262.py
Normal file
46
configs/datasets/gpqa/gpqa_gen_015262.py
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.datasets import GPQADataset, GPQAEvaluator
|
||||||
|
from opencompass.utils import first_option_postprocess
|
||||||
|
|
||||||
|
gpqa_reader_cfg = dict(
|
||||||
|
input_columns=['question', 'A', 'B', 'C', 'D'],
|
||||||
|
output_column='answer')
|
||||||
|
|
||||||
|
gpqa_infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(role='HUMAN', prompt='What is the correct answer to this question: {question}\nChoices:\n'
|
||||||
|
'(A){A}\n'
|
||||||
|
'(B){B}\n'
|
||||||
|
'(C){C}\n'
|
||||||
|
'(D){D}\n'
|
||||||
|
'Format your response as follows: "The correct answer is (insert answer here)"'),
|
||||||
|
], )),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer))
|
||||||
|
|
||||||
|
gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
|
||||||
|
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
|
||||||
|
|
||||||
|
gpqa_datasets = []
|
||||||
|
gpqa_subsets = {
|
||||||
|
'extended': 'gpqa_extended.csv',
|
||||||
|
'main': 'gpqa_main.csv',
|
||||||
|
'diamond': 'gpqa_diamond.csv'
|
||||||
|
}
|
||||||
|
|
||||||
|
for split in list(gpqa_subsets.keys()):
|
||||||
|
gpqa_datasets.append(
|
||||||
|
dict(
|
||||||
|
abbr='GPQA_' + split,
|
||||||
|
type=GPQADataset,
|
||||||
|
path='./data/gpqa/',
|
||||||
|
name=gpqa_subsets[split],
|
||||||
|
reader_cfg=gpqa_reader_cfg,
|
||||||
|
infer_cfg=gpqa_infer_cfg,
|
||||||
|
eval_cfg=gpqa_eval_cfg)
|
||||||
|
)
|
@ -1,4 +1,4 @@
|
|||||||
from mmengine.config import read_base
|
from mmengine.config import read_base
|
||||||
|
|
||||||
with read_base():
|
with read_base():
|
||||||
from .mbpp_gen_1e1056 import mbpp_datasets # noqa: F401, F403
|
from .mbpp_gen_830460 import mbpp_datasets # noqa: F401, F403
|
||||||
|
@ -1,46 +0,0 @@
|
|||||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
|
||||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
|
||||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
|
||||||
from opencompass.datasets import MBPPDataset, MBPPEvaluator2
|
|
||||||
|
|
||||||
mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_list_2")
|
|
||||||
|
|
||||||
# This prompt is used for WizardLMCode series
|
|
||||||
# You can use other config file for basic 3-shot generation
|
|
||||||
mbpp_infer_cfg = dict(
|
|
||||||
prompt_template=dict(
|
|
||||||
type=PromptTemplate,
|
|
||||||
template=dict(
|
|
||||||
round=[
|
|
||||||
dict(
|
|
||||||
role="HUMAN",
|
|
||||||
prompt="""Below is an instruction that describes a task. Write a response that appropriately completes the request.
|
|
||||||
|
|
||||||
### Instruction:
|
|
||||||
Create a Python script for this problem:
|
|
||||||
|
|
||||||
{text}
|
|
||||||
Test examples:
|
|
||||||
{test_list}
|
|
||||||
|
|
||||||
### Response:""",
|
|
||||||
),
|
|
||||||
]
|
|
||||||
),
|
|
||||||
),
|
|
||||||
retriever=dict(type=ZeroRetriever),
|
|
||||||
inferencer=dict(type=GenInferencer, max_out_len=512),
|
|
||||||
)
|
|
||||||
|
|
||||||
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator2), pred_role="BOT")
|
|
||||||
|
|
||||||
mbpp_datasets = [
|
|
||||||
dict(
|
|
||||||
type=MBPPDataset,
|
|
||||||
abbr="mbpp",
|
|
||||||
path="./data/mbpp/mbpp.jsonl",
|
|
||||||
reader_cfg=mbpp_reader_cfg,
|
|
||||||
infer_cfg=mbpp_infer_cfg,
|
|
||||||
eval_cfg=mbpp_eval_cfg,
|
|
||||||
)
|
|
||||||
]
|
|
@ -25,7 +25,7 @@ mbpp_infer_cfg = dict(
|
|||||||
),
|
),
|
||||||
),
|
),
|
||||||
retriever=dict(type=ZeroRetriever),
|
retriever=dict(type=ZeroRetriever),
|
||||||
inferencer=dict(type=GenInferencer),
|
inferencer=dict(type=GenInferencer, max_out_len=512),
|
||||||
)
|
)
|
||||||
|
|
||||||
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT")
|
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT")
|
42
configs/datasets/mbpp/mbpp_passk_gen_830460.py
Normal file
42
configs/datasets/mbpp/mbpp_passk_gen_830460.py
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
|
||||||
|
|
||||||
|
mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column")
|
||||||
|
|
||||||
|
mbpp_infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
|
||||||
|
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
|
||||||
|
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
|
||||||
|
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n"),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer, max_out_len=512),
|
||||||
|
)
|
||||||
|
|
||||||
|
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT")
|
||||||
|
|
||||||
|
mbpp_datasets = [
|
||||||
|
dict(
|
||||||
|
type=MBPPDataset_V2,
|
||||||
|
abbr="mbpp_passk",
|
||||||
|
path="./data/mbpp/mbpp.jsonl",
|
||||||
|
reader_cfg=mbpp_reader_cfg,
|
||||||
|
infer_cfg=mbpp_infer_cfg,
|
||||||
|
eval_cfg=mbpp_eval_cfg,
|
||||||
|
)
|
||||||
|
]
|
45
configs/datasets/mbpp/mbpp_repeat10_gen_830460.py
Normal file
45
configs/datasets/mbpp/mbpp_repeat10_gen_830460.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
# This config is used for pass@k evaluation with dataset repetition
|
||||||
|
# for models that cannot generate multiple responses for a single input
|
||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
|
||||||
|
|
||||||
|
mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column")
|
||||||
|
|
||||||
|
mbpp_infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
|
||||||
|
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
|
||||||
|
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
|
||||||
|
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n"),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer, max_out_len=512),
|
||||||
|
)
|
||||||
|
|
||||||
|
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT")
|
||||||
|
|
||||||
|
mbpp_datasets = [
|
||||||
|
dict(
|
||||||
|
type=MBPPDataset_V2,
|
||||||
|
abbr="mbpp_repeat10",
|
||||||
|
path="./data/mbpp/mbpp.jsonl",
|
||||||
|
num_repeats=10,
|
||||||
|
reader_cfg=mbpp_reader_cfg,
|
||||||
|
infer_cfg=mbpp_infer_cfg,
|
||||||
|
eval_cfg=mbpp_eval_cfg,
|
||||||
|
)
|
||||||
|
]
|
42
configs/datasets/mbpp/sanitized_mbpp_gen_830460.py
Normal file
42
configs/datasets/mbpp/sanitized_mbpp_gen_830460.py
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator
|
||||||
|
|
||||||
|
sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_list_2")
|
||||||
|
|
||||||
|
sanitized_mbpp_infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n",),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n ",),
|
||||||
|
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n",),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n ",),
|
||||||
|
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n",),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n ",),
|
||||||
|
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n",),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n"),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer, max_out_len=512),
|
||||||
|
)
|
||||||
|
|
||||||
|
sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT")
|
||||||
|
|
||||||
|
sanitized_mbpp_datasets = [
|
||||||
|
dict(
|
||||||
|
type=SanitizedMBPPDataset,
|
||||||
|
abbr="sanitized_mbpp",
|
||||||
|
path="./data/mbpp/sanitized-mbpp.jsonl",
|
||||||
|
reader_cfg=sanitized_mbpp_reader_cfg,
|
||||||
|
infer_cfg=sanitized_mbpp_infer_cfg,
|
||||||
|
eval_cfg=sanitized_mbpp_eval_cfg,
|
||||||
|
)
|
||||||
|
]
|
42
configs/datasets/mbpp/sanitized_mbpp_passk_gen_830460.py
Normal file
42
configs/datasets/mbpp/sanitized_mbpp_passk_gen_830460.py
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator
|
||||||
|
|
||||||
|
sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column")
|
||||||
|
|
||||||
|
sanitized_mbpp_infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
|
||||||
|
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
|
||||||
|
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
|
||||||
|
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n"),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer, max_out_len=512),
|
||||||
|
)
|
||||||
|
|
||||||
|
sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT")
|
||||||
|
|
||||||
|
sanitized_mbpp_datasets = [
|
||||||
|
dict(
|
||||||
|
type=SanitizedMBPPDataset,
|
||||||
|
abbr="sanitized_mbpp_passk",
|
||||||
|
path="./data/mbpp/sanitized-mbpp.jsonl",
|
||||||
|
reader_cfg=sanitized_mbpp_reader_cfg,
|
||||||
|
infer_cfg=sanitized_mbpp_infer_cfg,
|
||||||
|
eval_cfg=sanitized_mbpp_eval_cfg,
|
||||||
|
)
|
||||||
|
]
|
43
configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_830460.py
Normal file
43
configs/datasets/mbpp/sanitized_mbpp_repeat10_gen_830460.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator
|
||||||
|
|
||||||
|
sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column")
|
||||||
|
|
||||||
|
sanitized_mbpp_infer_cfg = dict(
|
||||||
|
prompt_template=dict(
|
||||||
|
type=PromptTemplate,
|
||||||
|
template=dict(
|
||||||
|
round=[
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
|
||||||
|
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
|
||||||
|
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
|
||||||
|
|
||||||
|
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"),
|
||||||
|
dict(role="BOT", prompt="[BEGIN]\n"),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
),
|
||||||
|
retriever=dict(type=ZeroRetriever),
|
||||||
|
inferencer=dict(type=GenInferencer, max_out_len=512),
|
||||||
|
)
|
||||||
|
|
||||||
|
sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT")
|
||||||
|
|
||||||
|
sanitized_mbpp_datasets = [
|
||||||
|
dict(
|
||||||
|
type=SanitizedMBPPDataset,
|
||||||
|
abbr="sanitized_mbpp_repeat10",
|
||||||
|
path="./data/mbpp/sanitized-mbpp.jsonl",
|
||||||
|
num_repeats=10,
|
||||||
|
reader_cfg=sanitized_mbpp_reader_cfg,
|
||||||
|
infer_cfg=sanitized_mbpp_infer_cfg,
|
||||||
|
eval_cfg=sanitized_mbpp_eval_cfg,
|
||||||
|
)
|
||||||
|
]
|
@ -1,4 +1,4 @@
|
|||||||
from mmengine.config import read_base
|
from mmengine.config import read_base
|
||||||
|
|
||||||
with read_base():
|
with read_base():
|
||||||
from .mbpp_cn_gen_1d1481 import mbpp_cn_datasets # noqa: F401, F403
|
from .mbpp_cn_gen_9114d5 import mbpp_cn_datasets # noqa: F401, F403
|
64
configs/datasets/mbpp_cn/mbpp_cn_gen_9114d5.py
Normal file
64
configs/datasets/mbpp_cn/mbpp_cn_gen_9114d5.py
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.datasets import MBPPDataset, MBPPEvaluator
|
||||||
|
|
||||||
|
mbpp_reader_cfg = dict(
|
||||||
|
input_columns=['text', 'test_list'], output_column='test_list_2')
|
||||||
|
|
||||||
|
# Three-shot generation prompt (Chinese instructions). Each HUMAN turn states a
# task plus its asserts, each BOT turn answers between [BEGIN] and [DONE]; the
# final HUMAN turn is filled from the {text} / {test_list} columns and the last
# BOT turn is left open after "[BEGIN]\n" for the model to complete.
# NOTE(review): the code inside the few-shot answers is indented by a single
# space here; that may be an artifact of how this text was captured -- compare
# with the repository file before relying on the exact whitespace.
mbpp_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role="HUMAN",
                    prompt=
                    "你是一名专业的 Python 程序员,你的任务是:编写一个函数,从给定的两个元组列表中查找相似的元素。 你的代码应该通过这些测试:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"
                ),
                dict(
                    role="BOT",
                    prompt=
                    "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "
                ),
                dict(
                    role="HUMAN",
                    prompt=
                    "你是一名专业的 Python 程序员,你的任务是:编写一个 Python 函数来识别一个整数是否不是素数。 你的代码应该通过这些测试:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"
                ),
                dict(
                    role="BOT",
                    prompt=
                    "[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "
                ),
                dict(
                    role="HUMAN",
                    prompt=
                    "你是一名专业的 Python 程序员,你的任务是:编写一个函数,使用堆队列算法从给定的数字列表中查找最大整数。 你的代码应该通过这些测试:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"
                ),
                dict(
                    role="BOT",
                    prompt=
                    "[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "
                ),
                dict(
                    role="HUMAN",
                    prompt=
                    "你是一名专业的 Python 程序员,你的任务是: {text} 你的代码应该通过这些测试:\n\n {test_list} \n"
                ),
                dict(role="BOT", prompt="[BEGIN]\n"),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))
|
||||||
|
|
||||||
|
# Predictions are taken from the BOT turn and scored by MBPPEvaluator.
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT")

# Single MBPP-CN dataset entry combining the reader, inference and evaluation
# configs defined in this file.
mbpp_cn_datasets = [
    dict(
        type=MBPPDataset,
        abbr='mbpp_cn',
        path='./data/mbpp_cn/mbpp_cn.jsonl',
        reader_cfg=mbpp_reader_cfg,
        infer_cfg=mbpp_infer_cfg,
        eval_cfg=mbpp_eval_cfg)
]
|
@ -1,4 +1,4 @@
|
|||||||
from mmengine.config import read_base
|
from mmengine.config import read_base
|
||||||
|
|
||||||
with read_base():
|
with read_base():
|
||||||
from.mbpp_plus_gen_94815c import mbpp_plus_datasets # noqa: F401, F403
|
from.mbpp_plus_gen_0b836a import mbpp_plus_datasets # noqa: F401, F403
|
64
configs/datasets/mbpp_plus/mbpp_plus_gen_0b836a.py
Normal file
64
configs/datasets/mbpp_plus/mbpp_plus_gen_0b836a.py
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.datasets import MBPPEvaluator, MBPPPlusDataset
|
||||||
|
|
||||||
|
# Columns read from each MBPP-Plus record: 'text' and 'test_list' feed the
# prompt template, 'task_id' is passed through as the output column.
mbpp_plus_reader_cfg = dict(
    input_columns=['text', 'test_list'], output_column='task_id')
|
||||||
|
|
||||||
|
# Three-shot generation prompt. Each HUMAN turn states a task plus its asserts,
# each BOT turn answers between [BEGIN] and [DONE]; the final HUMAN turn is
# filled from the {text} / {test_list} columns and the last BOT turn is left
# open after "[BEGIN]\n" for the model to complete.
# NOTE(review): the code inside the few-shot answers is indented by a single
# space here (nested blocks included); that may be an artifact of how this text
# was captured -- compare with the repository file before relying on it.
mbpp_plus_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role="HUMAN",
                    prompt=
                    "You are an expert Python programmer, and here is your task: Write a function to find the shared elements from the given two lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"
                ),
                dict(
                    role="BOT",
                    prompt=
                    "[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\n return tuple(set(test_tup1) & set(test_tup2))' \n[DONE] \n\n "
                ),
                dict(
                    role="HUMAN",
                    prompt=
                    "You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"
                ),
                dict(
                    role="BOT",
                    prompt=
                    "[BEGIN]\n 'import math\ndef is_not_prime(n):\n if n == 1:\n return True\n for i in range(2, int(math.sqrt(n))+1):\n if n % i == 0:\n return True\n return False' \n[DONE] \n\n "
                ),
                dict(
                    role="HUMAN",
                    prompt=
                    "You are an expert Python programmer, and here is your task: Write a function to find the n largest integers from a given list of numbers, returned in descending order. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"
                ),
                dict(
                    role="BOT",
                    prompt=
                    "[BEGIN]\n 'import heapq as hq\ndef heap_queue_largest(nums: list,n: int) -> list:\n largest_nums = hq.nlargest(n, nums)\n return largest_nums' \n[DONE] \n\n "
                ),
                dict(
                    role="HUMAN",
                    prompt=
                    "You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"
                ),
                dict(role="BOT", prompt="[BEGIN]\n"),
            ], )),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))
|
||||||
|
|
||||||
|
# Predictions are taken from the BOT turn and scored by MBPPEvaluator with
# metric='MBPPPlus'.
mbpp_plus_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator, metric='MBPPPlus'), pred_role="BOT")

# Single MBPP-Plus dataset entry combining the reader, inference and evaluation
# configs defined in this file.
mbpp_plus_datasets = [
    dict(
        type=MBPPPlusDataset,
        abbr='mbpp_plus',
        path='./data/mbpp_plus/mbpp_plus.jsonl',
        reader_cfg=mbpp_plus_reader_cfg,
        infer_cfg=mbpp_plus_infer_cfg,
        eval_cfg=mbpp_plus_eval_cfg)
]
|
@ -0,0 +1,18 @@
|
|||||||
|
from mmengine.config import read_base
|
||||||
|
|
||||||
|
# Pull in every 256k NeedleBench dataset list under a unique alias; the aliases
# must end in '_datasets' so the aggregation below picks them up.
with read_base():
    from .needlebench_multi_reasoning_256k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
    from .needlebench_multi_reasoning_256k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
    from .needlebench_multi_reasoning_256k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
    from .needlebench_multi_reasoning_256k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
    from .needlebench_multi_reasoning_256k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
    from .needlebench_multi_reasoning_256k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
    from .needlebench_multi_reasoning_256k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
    from .needlebench_multi_reasoning_256k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets

    from .needlebench_single_256k import needlebench_en_datasets as needlebench_origin_en_datasets
    from .needlebench_single_256k import needlebench_zh_datasets as needlebench_origin_zh_datasets
    from .needlebench_multi_retrieval_256k import needlebench_en_datasets as needlebench_parallel_en_datasets
    from .needlebench_multi_retrieval_256k import needlebench_zh_datasets as needlebench_parallel_zh_datasets

# Concatenate every module-level list whose name ends in '_datasets'.
needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
|
@ -0,0 +1,287 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset
|
||||||
|
from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator
|
||||||
|
from opencompass.datasets.needlebench.origin import needlebench_postprocess
|
||||||
|
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
|
||||||
|
import math
|
||||||
|
|
||||||
|
|
||||||
|
def logistic(x, L=100, x0=50, k=0.1):
    """Return the logistic curve value at ``x``, rounded to 3 decimals.

    Computes ``L / (1 + exp(-k * (x - x0)))``.

    Args:
        x: Point at which to evaluate the curve.
        L: Numerator of the curve (its upper asymptote for positive ``k``).
        x0: Midpoint; ``logistic(x0)`` equals ``L / 2``.
        k: Steepness factor applied to ``x - x0``.

    Returns:
        The curve value rounded to three decimal places.
    """
    try:
        return round(L / (1 + math.exp(-k * (x - x0))), 3)
    except OverflowError:
        # math.exp overflowed, i.e. the denominator is beyond float range;
        # the quotient is indistinguishable from zero at 3 decimals.
        return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def generate_linear_space(start, end, num):
    """Return ``num`` evenly spaced values from ``start`` to ``end``.

    Args:
        start: First value of the sequence.
        end: Value the sequence runs up to when ``num`` > 1.
        num: Number of points to produce; must be at least 1.

    Returns:
        ``[start]`` when ``num`` is 1, otherwise a list of ``num`` values
        ``start + step * i`` with ``step = (end - start) / (num - 1)``.

    Raises:
        ValueError: If ``num`` is smaller than 1.
    """
    if num < 1:
        raise ValueError("num must be at least 1.")
    if num == 1:
        # A single point has no step; avoid dividing by zero below.
        return [start]
    step = (end - start) / (num - 1)
    return [start + step * i for i in range(num)]
|
||||||
|
|
||||||
|
|
||||||
|
def generate_depth_percents(intervals, interval_type):
    """Return ``intervals`` depth percentages between 0 and 100.

    Args:
        intervals: Number of depth points to generate.
        interval_type: ``'linear'`` for evenly spaced values, or ``'sigmoid'``
            to map each evenly spaced value through ``logistic``.

    Returns:
        A list of ``intervals`` depth values.

    Raises:
        ValueError: If ``interval_type`` is neither ``'linear'`` nor
            ``'sigmoid'``.
    """
    if interval_type not in ('linear', 'sigmoid'):
        raise ValueError('Unsupported interval type')
    linear_space = generate_linear_space(0, 100, intervals)
    if interval_type == 'linear':
        return linear_space
    return [logistic(x) for x in linear_space]
|
||||||
|
|
||||||
|
|
||||||
|
# Reader: the 'prompt' column is the model input, 'answer' the reference.
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')

# Inference: one HUMAN turn carrying the prompt, one BOT turn templated with
# the answer; no in-context retrieval (ZeroRetriever), free-form generation.
needlebench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{prompt}'),
                dict(role='BOT', prompt='{answer}\n'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

# Evaluation: predictions (BOT turn) and references are each post-processed
# before being scored by the multi-needle evaluator.
needlebench_eval_cfg = dict(
    evaluator=dict(type=NeedleBenchMultiEvaluator),
    pred_postprocessor=dict(type=needlebench_postprocess),
    dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
    pred_role='BOT')
|
||||||
|
|
||||||
|
# Disabled finer-grained sweep, kept for reference:
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
# Context lengths and needle depths (percent) swept by every loop below.
context_lengths = [32000, 128000, 256000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
|
||||||
|
|
||||||
|
# ----------English Version----------
# Shared settings for the English multi-needle reasoning datasets.
base_path = './data/needlebench'
file_list = ['PaulGrahamEssays.jsonl']

needle_file_name = 'multi_needle_reasoning_en.json'
diff = 10
num_needles = 2
needlebench_2needle_en_datasets = []
language = 'English'

# One dataset entry per (context length, depth) combination.
for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 600,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_2needle_en_datasets.append(dataset_dict)
|
||||||
|
|
||||||
|
# English, three needles: same sweep as above with num_needles rebound.
num_needles = 3
needlebench_3needle_en_datasets = []

for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 600,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_3needle_en_datasets.append(dataset_dict)
|
||||||
|
|
||||||
|
# English, four needles: same sweep with num_needles rebound.
num_needles = 4
needlebench_4needle_en_datasets = []

for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 600,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_4needle_en_datasets.append(dataset_dict)
|
||||||
|
|
||||||
|
# English, five needles: same sweep with num_needles rebound.
num_needles = 5
needlebench_5needle_en_datasets = []

for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 600,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_5needle_en_datasets.append(dataset_dict)
|
||||||
|
|
||||||
|
# ----------Chinese Version----------
# Shared settings for the Chinese multi-needle reasoning datasets; these
# rebind the names used by the English section above.
base_path = './data/needlebench'
file_list = ['zh_finance.jsonl']

needle_file_name = 'multi_needle_reasoning_zh.json'
diff = 10
num_needles = 2
needlebench_2needle_zh_datasets = []
language = 'Chinese'

# One dataset entry per (context length, depth) combination.
for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 200,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_2needle_zh_datasets.append(dataset_dict)
|
||||||
|
|
||||||
|
# Chinese, three needles: same sweep with num_needles rebound.
num_needles = 3
needlebench_3needle_zh_datasets = []

for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 200,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_3needle_zh_datasets.append(dataset_dict)
|
||||||
|
|
||||||
|
# Chinese, four needles: same sweep with num_needles rebound.
num_needles = 4
needlebench_4needle_zh_datasets = []

for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 200,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_4needle_zh_datasets.append(dataset_dict)
|
||||||
|
|
||||||
|
# Chinese, five needles: same sweep with num_needles rebound.
num_needles = 5
needlebench_5needle_zh_datasets = []

for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
            'type': NeedleBenchMultiDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 200,
            'guide': True,
            'language': language,
            'needle_file_name': needle_file_name,
            'num_needles': num_needles,
            'diff': diff,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_5needle_zh_datasets.append(dataset_dict)
|
@ -0,0 +1,109 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset
|
||||||
|
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator
|
||||||
|
from opencompass.datasets.needlebench.origin import needlebench_postprocess
|
||||||
|
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
|
||||||
|
import math
|
||||||
|
|
||||||
|
|
||||||
|
def logistic(x, L=100, x0=50, k=0.1):
    """Return the logistic curve value at ``x``, rounded to 3 decimals.

    Computes ``L / (1 + exp(-k * (x - x0)))``.

    Args:
        x: Point at which to evaluate the curve.
        L: Numerator of the curve (its upper asymptote for positive ``k``).
        x0: Midpoint; ``logistic(x0)`` equals ``L / 2``.
        k: Steepness factor applied to ``x - x0``.

    Returns:
        The curve value rounded to three decimal places.
    """
    try:
        return round(L / (1 + math.exp(-k * (x - x0))), 3)
    except OverflowError:
        # math.exp overflowed, i.e. the denominator is beyond float range;
        # the quotient is indistinguishable from zero at 3 decimals.
        return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def generate_linear_space(start, end, num):
    """Return ``num`` evenly spaced values from ``start`` to ``end``.

    Args:
        start: First value of the sequence.
        end: Value the sequence runs up to when ``num`` > 1.
        num: Number of points to produce; must be at least 1.

    Returns:
        ``[start]`` when ``num`` is 1, otherwise a list of ``num`` values
        ``start + step * i`` with ``step = (end - start) / (num - 1)``.

    Raises:
        ValueError: If ``num`` is smaller than 1.
    """
    if num < 1:
        raise ValueError("num must be at least 1.")
    if num == 1:
        # A single point has no step; avoid dividing by zero below.
        return [start]
    step = (end - start) / (num - 1)
    return [start + step * i for i in range(num)]
|
||||||
|
|
||||||
|
|
||||||
|
def generate_depth_percents(intervals, interval_type):
    """Return ``intervals`` depth percentages between 0 and 100.

    Args:
        intervals: Number of depth points to generate.
        interval_type: ``'linear'`` for evenly spaced values, or ``'sigmoid'``
            to map each evenly spaced value through ``logistic``.

    Returns:
        A list of ``intervals`` depth values.

    Raises:
        ValueError: If ``interval_type`` is neither ``'linear'`` nor
            ``'sigmoid'``.
    """
    if interval_type not in ('linear', 'sigmoid'):
        raise ValueError('Unsupported interval type')
    linear_space = generate_linear_space(0, 100, intervals)
    if interval_type == 'linear':
        return linear_space
    return [logistic(x) for x in linear_space]
|
||||||
|
|
||||||
|
|
||||||
|
# Reader: the 'prompt' column is the model input, 'answer' the reference.
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')

# Inference: one HUMAN turn carrying the prompt, one BOT turn templated with
# the answer; no in-context retrieval (ZeroRetriever), free-form generation.
needlebench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{prompt}'),
                dict(role='BOT', prompt='{answer}\n'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

# Evaluation: predictions (BOT turn) and references are each post-processed
# before being scored by the parallel-needle evaluator.
needlebench_eval_cfg = dict(
    evaluator=dict(type=NeedleBenchParallelEvaluator),
    pred_postprocessor=dict(type=needlebench_postprocess),
    dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
    pred_role='BOT')
|
||||||
|
|
||||||
|
# Disabled finer-grained sweep, kept for reference:
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
# Context lengths swept by the loops below.
context_lengths = [32000, 128000, 256000]
# NOTE(review): the two settings below are not read anywhere in the visible
# part of this file (the loops use the explicit `depths` list) -- confirm
# whether anything else consumes them before removing.
document_depth_percent_intervals = 20
document_depth_percent_interval_type = "linear"
|
||||||
|
|
||||||
|
# English parallel-retrieval datasets: one entry per context length, each
# carrying the full list of needle depths.
base_path = './data/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]

for original_context_length in context_lengths:
    dataset_dict = {
        # The suffix has no placeholders, so it is a plain string literal.
        'abbr': f'Length{original_context_length}'
        '_parallel_en_256k',
        'type': NeedleBenchParallelDataset,
        'path': base_path,
        'needle_file_name': needle_file_name,
        'length': original_context_length,
        'depths': depths,
        'tokenizer_model': 'gpt-4',
        'file_list': file_list,
        'num_repeats_per_file': 25,
        'length_buffer': 3000,
        'guide': True,
        'language': 'English',
        'reader_cfg': needlebench_reader_cfg,
        'infer_cfg': needlebench_infer_cfg,
        'eval_cfg': needlebench_eval_cfg
    }
    needlebench_en_datasets.append(dataset_dict)
|
||||||
|
|
||||||
|
# Chinese parallel-retrieval datasets: same sweep over a different corpus
# file and with a smaller length buffer.
file_list = ['zh_finance.jsonl']
needlebench_zh_datasets = []

for original_context_length in context_lengths:
    dataset_dict = {
        # The suffix has no placeholders, so it is a plain string literal.
        'abbr': f'Length{original_context_length}'
        '_parallel_zh_256k',
        'type': NeedleBenchParallelDataset,
        'path': base_path,
        'needle_file_name': needle_file_name,
        'length': original_context_length,
        'depths': depths,
        'tokenizer_model': 'gpt-4',
        'file_list': file_list,
        'num_repeats_per_file': 25,
        'length_buffer': 200,
        'guide': True,
        'language': 'Chinese',
        'reader_cfg': needlebench_reader_cfg,
        'infer_cfg': needlebench_infer_cfg,
        'eval_cfg': needlebench_eval_cfg
    }
    needlebench_zh_datasets.append(dataset_dict)
|
@ -0,0 +1,110 @@
|
|||||||
|
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||||
|
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||||
|
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||||
|
from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset
|
||||||
|
from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator
|
||||||
|
from opencompass.datasets.needlebench.origin import needlebench_postprocess
|
||||||
|
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
|
||||||
|
import math
|
||||||
|
|
||||||
|
|
||||||
|
def logistic(x, L=100, x0=50, k=0.1):
    """Return the logistic curve value at ``x``, rounded to 3 decimals.

    Computes ``L / (1 + exp(-k * (x - x0)))``.

    Args:
        x: Point at which to evaluate the curve.
        L: Numerator of the curve (its upper asymptote for positive ``k``).
        x0: Midpoint; ``logistic(x0)`` equals ``L / 2``.
        k: Steepness factor applied to ``x - x0``.

    Returns:
        The curve value rounded to three decimal places.
    """
    try:
        return round(L / (1 + math.exp(-k * (x - x0))), 3)
    except OverflowError:
        # math.exp overflowed, i.e. the denominator is beyond float range;
        # the quotient is indistinguishable from zero at 3 decimals.
        return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def generate_linear_space(start, end, num):
    """Return ``num`` evenly spaced values from ``start`` to ``end``.

    Args:
        start: First value of the sequence.
        end: Value the sequence runs up to when ``num`` > 1.
        num: Number of points to produce; must be at least 1.

    Returns:
        ``[start]`` when ``num`` is 1, otherwise a list of ``num`` values
        ``start + step * i`` with ``step = (end - start) / (num - 1)``.

    Raises:
        ValueError: If ``num`` is smaller than 1.
    """
    if num < 1:
        raise ValueError("num must be at least 1.")
    if num == 1:
        # A single point has no step; avoid dividing by zero below.
        return [start]
    step = (end - start) / (num - 1)
    return [start + step * i for i in range(num)]
|
||||||
|
|
||||||
|
|
||||||
|
def generate_depth_percents(intervals, interval_type):
    """Return ``intervals`` depth percentages between 0 and 100.

    Args:
        intervals: Number of depth points to generate.
        interval_type: ``'linear'`` for evenly spaced values, or ``'sigmoid'``
            to map each evenly spaced value through ``logistic``.

    Returns:
        A list of ``intervals`` depth values.

    Raises:
        ValueError: If ``interval_type`` is neither ``'linear'`` nor
            ``'sigmoid'``.
    """
    if interval_type not in ('linear', 'sigmoid'):
        raise ValueError('Unsupported interval type')
    linear_space = generate_linear_space(0, 100, intervals)
    if interval_type == 'linear':
        return linear_space
    return [logistic(x) for x in linear_space]
|
||||||
|
|
||||||
|
|
||||||
|
# Reader: the 'prompt' column is the model input, 'answer' the reference.
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')

# Inference: one HUMAN turn carrying the prompt, one BOT turn templated with
# the answer; no in-context retrieval (ZeroRetriever), free-form generation.
needlebench_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt='{prompt}'),
                dict(role='BOT', prompt='{answer}\n'),
            ]
        )
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer))

# Evaluation: predictions (BOT turn) and references are each post-processed
# before being scored by the single-needle (origin) evaluator.
needlebench_eval_cfg = dict(
    evaluator=dict(type=NeedleBenchOriginEvaluator),
    pred_postprocessor=dict(type=needlebench_postprocess),
    dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
    pred_role='BOT')
|
||||||
|
|
||||||
|
# Disabled finer-grained sweep, kept for reference:
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
# Context lengths and needle depths (percent) swept by the loops below.
context_lengths = [32000, 128000, 256000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
|
||||||
|
|
||||||
|
# English single-needle datasets: one entry per (context length, depth)
# combination.
base_path = './data/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'

for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_origin_en_256k',
            'type': NeedleBenchOriginDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 600,
            'guide': True,
            'language': 'English',
            'needle_file_name': needle_file_name,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_en_datasets.append(dataset_dict)
|
||||||
|
|
||||||
|
# Chinese single-needle datasets: same sweep over a different corpus file and
# with a smaller length buffer.
file_list = ['zh_finance.jsonl']
needlebench_zh_datasets = []
needle_file_name = 'needles.jsonl'

for original_context_length in context_lengths:
    for depth_percent in depths_list:
        dataset_dict = {
            'abbr': f'Length{original_context_length}'
            f'Depth{int(depth_percent)}_origin_zh_256k',
            'type': NeedleBenchOriginDataset,
            'path': base_path,
            'length': original_context_length,
            'depth': int(depth_percent),
            'tokenizer_model': 'gpt-4',
            'file_list': file_list,
            'num_repeats_per_file': 10,
            'length_buffer': 200,
            'guide': True,
            'language': 'Chinese',
            'needle_file_name': needle_file_name,
            'reader_cfg': needlebench_reader_cfg,
            'infer_cfg': needlebench_infer_cfg,
            'eval_cfg': needlebench_eval_cfg
        }
        needlebench_zh_datasets.append(dataset_dict)
|
@ -9,8 +9,8 @@ from opencompass.tasks import OpenICLInferTask
|
|||||||
|
|
||||||
with read_base():
|
with read_base():
|
||||||
from .datasets.humaneval.humaneval_passk_gen_8e312c import humaneval_datasets
|
from .datasets.humaneval.humaneval_passk_gen_8e312c import humaneval_datasets
|
||||||
from .datasets.mbpp.mbpp_passk_gen_1e1056 import mbpp_datasets
|
from .datasets.mbpp.deprecated_mbpp_passk_gen_1e1056 import mbpp_datasets
|
||||||
from .datasets.mbpp.sanitized_mbpp_passk_gen_1e1056 import sanitized_mbpp_datasets
|
from .datasets.mbpp.deprecated_sanitized_mbpp_passk_gen_1e1056 import sanitized_mbpp_datasets
|
||||||
|
|
||||||
datasets = []
|
datasets = []
|
||||||
datasets += humaneval_datasets
|
datasets += humaneval_datasets
|
||||||
|
@ -9,8 +9,8 @@ from opencompass.tasks import OpenICLInferTask
|
|||||||
|
|
||||||
with read_base():
|
with read_base():
|
||||||
from .datasets.humaneval.humaneval_repeat10_gen_8e312c import humaneval_datasets
|
from .datasets.humaneval.humaneval_repeat10_gen_8e312c import humaneval_datasets
|
||||||
from .datasets.mbpp.mbpp_repeat10_gen_1e1056 import mbpp_datasets
|
from .datasets.mbpp.deprecated_mbpp_repeat10_gen_1e1056 import mbpp_datasets
|
||||||
from .datasets.mbpp.sanitized_mbpp_repeat10_gen_1e1056 import sanitized_mbpp_datasets
|
from .datasets.mbpp.deprecated_sanitized_mbpp_repeat10_gen_1e1056 import sanitized_mbpp_datasets
|
||||||
|
|
||||||
datasets = []
|
datasets = []
|
||||||
datasets += humaneval_datasets
|
datasets += humaneval_datasets
|
||||||
|
@ -8,7 +8,7 @@ with read_base():
|
|||||||
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||||
from .datasets.math.math_evaluatorv2_gen_cecb31 import math_datasets
|
from .datasets.math.math_evaluatorv2_gen_cecb31 import math_datasets
|
||||||
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||||
from .datasets.mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
|
from .datasets.mbpp.deprecated_sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
|
||||||
|
|
||||||
from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
|
from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
|
||||||
from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
|
from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
|
||||||
|
@ -7,7 +7,7 @@ with read_base():
|
|||||||
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
|
||||||
from .datasets.math.math_gen_265cce import math_datasets
|
from .datasets.math.math_gen_265cce import math_datasets
|
||||||
from .datasets.humaneval.humaneval_gen_a82cae import humaneval_datasets
|
from .datasets.humaneval.humaneval_gen_a82cae import humaneval_datasets
|
||||||
from .datasets.mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
|
from .datasets.mbpp.deprecated_sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
|
||||||
|
|
||||||
from .models.hf_internlm.hf_internlm2_7b import models as hf_internlm2_7b_model
|
from .models.hf_internlm.hf_internlm2_7b import models as hf_internlm2_7b_model
|
||||||
from .models.hf_internlm.hf_internlm2_20b import models as hf_internlm2_20b_model
|
from .models.hf_internlm.hf_internlm2_20b import models as hf_internlm2_20b_model
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from opencompass.models import HuggingFaceCausalLM
|
from opencompass.models import HuggingFaceCausalLM
|
||||||
|
|
||||||
_meta_template = dict(
|
_meta_template = dict(
|
||||||
|
begin='<|begin▁of▁sentence|>',
|
||||||
round=[
|
round=[
|
||||||
dict(role="HUMAN", begin='User: ', end='\n\n'),
|
dict(role="HUMAN", begin='User: ', end='\n\n'),
|
||||||
dict(role="BOT", begin="Assistant: ", end='<|end▁of▁sentence|>', generate=True),
|
dict(role="BOT", begin="Assistant: ", end='<|end▁of▁sentence|>', generate=True),
|
||||||
@ -12,7 +13,6 @@ models = [
|
|||||||
type=HuggingFaceCausalLM,
|
type=HuggingFaceCausalLM,
|
||||||
abbr='deepseek-67b-chat-hf',
|
abbr='deepseek-67b-chat-hf',
|
||||||
path="deepseek-ai/deepseek-llm-67b-chat",
|
path="deepseek-ai/deepseek-llm-67b-chat",
|
||||||
tokenizer_path='deepseek-ai/deepseek-llm-67b-chat',
|
|
||||||
model_kwargs=dict(
|
model_kwargs=dict(
|
||||||
device_map='auto',
|
device_map='auto',
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
@ -28,6 +28,6 @@ models = [
|
|||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=4, num_procs=1),
|
run_cfg=dict(num_gpus=4, num_procs=1),
|
||||||
end_str='<|end▁of▁sentence|>',
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from opencompass.models import HuggingFaceCausalLM
|
from opencompass.models import HuggingFaceCausalLM
|
||||||
|
|
||||||
_meta_template = dict(
|
_meta_template = dict(
|
||||||
|
begin='<|begin▁of▁sentence|>',
|
||||||
round=[
|
round=[
|
||||||
dict(role="HUMAN", begin='User: ', end='\n\n'),
|
dict(role="HUMAN", begin='User: ', end='\n\n'),
|
||||||
dict(role="BOT", begin="Assistant: ", end='<|end▁of▁sentence|>', generate=True),
|
dict(role="BOT", begin="Assistant: ", end='<|end▁of▁sentence|>', generate=True),
|
||||||
@ -12,7 +13,6 @@ models = [
|
|||||||
type=HuggingFaceCausalLM,
|
type=HuggingFaceCausalLM,
|
||||||
abbr='deepseek-7b-chat-hf',
|
abbr='deepseek-7b-chat-hf',
|
||||||
path="deepseek-ai/deepseek-llm-7b-chat",
|
path="deepseek-ai/deepseek-llm-7b-chat",
|
||||||
tokenizer_path='deepseek-ai/deepseek-llm-7b-chat',
|
|
||||||
model_kwargs=dict(
|
model_kwargs=dict(
|
||||||
device_map='auto',
|
device_map='auto',
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
@ -28,5 +28,6 @@ models = [
|
|||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from opencompass.models import HuggingFaceCausalLM
|
from opencompass.models import HuggingFaceCausalLM
|
||||||
|
|
||||||
_meta_template = dict(
|
_meta_template = dict(
|
||||||
|
begin='<|begin▁of▁sentence|>',
|
||||||
round=[
|
round=[
|
||||||
dict(role="HUMAN", begin='User: ', end='\n\n'),
|
dict(role="HUMAN", begin='User: ', end='\n\n'),
|
||||||
dict(role="BOT", begin="Assistant: ", end='<|end▁of▁sentence|>', generate=True),
|
dict(role="BOT", begin="Assistant: ", end='<|end▁of▁sentence|>', generate=True),
|
||||||
@ -12,7 +13,6 @@ models = [
|
|||||||
type=HuggingFaceCausalLM,
|
type=HuggingFaceCausalLM,
|
||||||
abbr='deepseek-moe-16b-chat-hf',
|
abbr='deepseek-moe-16b-chat-hf',
|
||||||
path="deepseek-ai/deepseek-moe-16b-chat",
|
path="deepseek-ai/deepseek-moe-16b-chat",
|
||||||
tokenizer_path='deepseek-ai/deepseek-moe-16b-chat',
|
|
||||||
model_kwargs=dict(
|
model_kwargs=dict(
|
||||||
device_map='auto',
|
device_map='auto',
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
@ -26,7 +26,7 @@ models = [
|
|||||||
max_out_len=100,
|
max_out_len=100,
|
||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=2, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
end_str='<|end▁of▁sentence|>',
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -5,7 +5,6 @@ _meta_template = dict(
|
|||||||
dict(role="HUMAN", begin='<start_of_turn>user\n', end='<end_of_turn>\n'),
|
dict(role="HUMAN", begin='<start_of_turn>user\n', end='<end_of_turn>\n'),
|
||||||
dict(role="BOT", begin="<start_of_turn>model\n", end='<end_of_turn>\n', generate=True),
|
dict(role="BOT", begin="<start_of_turn>model\n", end='<end_of_turn>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=151645,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -24,9 +23,11 @@ models = [
|
|||||||
use_fast=False,
|
use_fast=False,
|
||||||
),
|
),
|
||||||
meta_template=_meta_template,
|
meta_template=_meta_template,
|
||||||
|
min_out_len=1,
|
||||||
max_out_len=100,
|
max_out_len=100,
|
||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -5,7 +5,6 @@ _meta_template = dict(
|
|||||||
dict(role="HUMAN", begin='<start_of_turn>user\n', end='<end_of_turn>\n'),
|
dict(role="HUMAN", begin='<start_of_turn>user\n', end='<end_of_turn>\n'),
|
||||||
dict(role="BOT", begin="<start_of_turn>model\n", end='<end_of_turn>\n', generate=True),
|
dict(role="BOT", begin="<start_of_turn>model\n", end='<end_of_turn>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=151645,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -29,5 +28,6 @@ models = [
|
|||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -6,7 +6,6 @@ _meta_template = dict(
|
|||||||
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||||
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=92542
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -32,5 +31,6 @@ models = [
|
|||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
generation_kwargs = {"eos_token_id": [2, 92542]},
|
generation_kwargs = {"eos_token_id": [2, 92542]},
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -6,7 +6,6 @@ _meta_template = dict(
|
|||||||
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||||
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=92542
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -32,5 +31,6 @@ models = [
|
|||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
generation_kwargs = {"eos_token_id": [2, 92542]},
|
generation_kwargs = {"eos_token_id": [2, 92542]},
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -6,7 +6,6 @@ _meta_template = dict(
|
|||||||
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||||
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=92542
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -32,5 +31,6 @@ models = [
|
|||||||
run_cfg=dict(num_gpus=2, num_procs=1),
|
run_cfg=dict(num_gpus=2, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
generation_kwargs = {"eos_token_id": [2, 92542]},
|
generation_kwargs = {"eos_token_id": [2, 92542]},
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -6,7 +6,6 @@ _meta_template = dict(
|
|||||||
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||||
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=92542
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -32,5 +31,6 @@ models = [
|
|||||||
run_cfg=dict(num_gpus=2, num_procs=1),
|
run_cfg=dict(num_gpus=2, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
generation_kwargs = {"eos_token_id": [2, 92542]},
|
generation_kwargs = {"eos_token_id": [2, 92542]},
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -7,7 +7,6 @@ _meta_template = dict(
|
|||||||
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||||
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=92542
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -33,5 +32,6 @@ models = [
|
|||||||
run_cfg=dict(num_gpus=2, num_procs=1),
|
run_cfg=dict(num_gpus=2, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
generation_kwargs = {"eos_token_id": [2, 92542]},
|
generation_kwargs = {"eos_token_id": [2, 92542]},
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -6,7 +6,6 @@ _meta_template = dict(
|
|||||||
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||||
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=92542
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -32,5 +31,6 @@ models = [
|
|||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
generation_kwargs = {"eos_token_id": [2, 92542]},
|
generation_kwargs = {"eos_token_id": [2, 92542]},
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -6,7 +6,6 @@ _meta_template = dict(
|
|||||||
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||||
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=92542
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -32,5 +31,6 @@ models = [
|
|||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
generation_kwargs = {"eos_token_id": [2, 92542]},
|
generation_kwargs = {"eos_token_id": [2, 92542]},
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -7,7 +7,6 @@ _meta_template = dict(
|
|||||||
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||||
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=92542
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -33,5 +32,6 @@ models = [
|
|||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
generation_kwargs = {"eos_token_id": [2, 92542]},
|
generation_kwargs = {"eos_token_id": [2, 92542]},
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -3,27 +3,31 @@ from opencompass.models.turbomind import TurboMindModel
|
|||||||
|
|
||||||
_meta_template = dict(
|
_meta_template = dict(
|
||||||
round=[
|
round=[
|
||||||
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role="HUMAN", begin="<|im_start|>user\n", end="<|im_end|>\n"),
|
||||||
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n',
|
dict(role="BOT", begin="<|im_start|>assistant\n", end="<|im_end|>\n", generate=True),
|
||||||
generate=True),
|
|
||||||
],
|
],
|
||||||
eos_token_id=92542
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
dict(
|
dict(
|
||||||
type=TurboMindModel,
|
type=TurboMindModel,
|
||||||
abbr='internlm2-chat-20b-turbomind',
|
abbr="internlm2-chat-20b-turbomind",
|
||||||
path="internlm/internlm2-chat-20b",
|
path="internlm/internlm2-chat-20b",
|
||||||
meta_template=_meta_template,
|
meta_template=_meta_template,
|
||||||
engine_config=dict(session_len=210000,
|
engine_config=dict(
|
||||||
|
session_len=210000,
|
||||||
max_batch_size=8,
|
max_batch_size=8,
|
||||||
rope_scaling_factor=3.0,
|
rope_scaling_factor=3.0,
|
||||||
model_name="internlm2-chat-20b",
|
model_name="internlm2-chat-20b",
|
||||||
tp=2),
|
tp=2,
|
||||||
gen_config=dict(top_k=1, top_p=0.8,
|
stop_words=[2, 92542],
|
||||||
|
),
|
||||||
|
gen_config=dict(
|
||||||
|
top_k=1,
|
||||||
|
top_p=0.8,
|
||||||
temperature=1.0,
|
temperature=1.0,
|
||||||
max_new_tokens=2000,),
|
max_new_tokens=2000,
|
||||||
|
),
|
||||||
max_out_len=2000,
|
max_out_len=2000,
|
||||||
max_seq_len=210000,
|
max_seq_len=210000,
|
||||||
batch_size=1,
|
batch_size=1,
|
||||||
|
@ -3,29 +3,34 @@ from opencompass.models.turbomind import TurboMindModel
|
|||||||
|
|
||||||
_meta_template = dict(
|
_meta_template = dict(
|
||||||
round=[
|
round=[
|
||||||
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role="HUMAN", begin="<|im_start|>user\n", end="<|im_end|>\n"),
|
||||||
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n',
|
dict(role="BOT", begin="<|im_start|>assistant\n", end="<|im_end|>\n", generate=True),
|
||||||
generate=True),
|
|
||||||
],
|
],
|
||||||
eos_token_id=92542
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
dict(
|
dict(
|
||||||
type=TurboMindModel,
|
type=TurboMindModel,
|
||||||
abbr='internlm2-chat-7b-turbomind',
|
abbr="internlm2-chat-7b-turbomind",
|
||||||
path="internlm/internlm2-chat-7b",
|
path="internlm/internlm2-chat-7b",
|
||||||
meta_template=_meta_template,
|
meta_template=_meta_template,
|
||||||
engine_config=dict(session_len=210000,
|
engine_config=dict(
|
||||||
|
session_len=210000,
|
||||||
max_batch_size=8,
|
max_batch_size=8,
|
||||||
rope_scaling_factor=2.0,
|
rope_scaling_factor=2.0,
|
||||||
model_name="internlm2-chat-7b"),
|
model_name="internlm2-chat-7b",
|
||||||
gen_config=dict(top_k=1, top_p=0.8,
|
tp=1,
|
||||||
|
stop_words=[2, 92542],
|
||||||
|
),
|
||||||
|
gen_config=dict(
|
||||||
|
top_k=1,
|
||||||
|
top_p=0.8,
|
||||||
temperature=1.0,
|
temperature=1.0,
|
||||||
max_new_tokens=2000),
|
max_new_tokens=2000,
|
||||||
|
),
|
||||||
max_out_len=2000,
|
max_out_len=2000,
|
||||||
max_seq_len=210000,
|
max_seq_len=210000,
|
||||||
batch_size=8,
|
batch_size=1,
|
||||||
concurrency=8,
|
concurrency=8,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
)
|
)
|
||||||
|
@ -27,5 +27,6 @@ models = [
|
|||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=2, num_procs=1),
|
run_cfg=dict(num_gpus=2, num_procs=1),
|
||||||
end_str='[INST]',
|
end_str='[INST]',
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -27,5 +27,6 @@ models = [
|
|||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=4, num_procs=1),
|
run_cfg=dict(num_gpus=4, num_procs=1),
|
||||||
end_str='[INST]',
|
end_str='[INST]',
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -27,5 +27,6 @@ models = [
|
|||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
end_str='[INST]',
|
end_str='[INST]',
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -7,7 +7,6 @@ _meta_template = dict(
|
|||||||
dict(role="HUMAN", begin='[INST] ', end=' [/INST]'),
|
dict(role="HUMAN", begin='[INST] ', end=' [/INST]'),
|
||||||
dict(role="BOT", begin="", end='</s> ', generate=True),
|
dict(role="BOT", begin="", end='</s> ', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=2
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -30,5 +29,6 @@ models = [
|
|||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -7,7 +7,6 @@ _meta_template = dict(
|
|||||||
dict(role="HUMAN", begin='[INST] ', end=' [/INST]'),
|
dict(role="HUMAN", begin='[INST] ', end=' [/INST]'),
|
||||||
dict(role="BOT", begin="", end='</s> ', generate=True),
|
dict(role="BOT", begin="", end='</s> ', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=2
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -30,6 +29,6 @@ models = [
|
|||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
end_str='</s>',
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -7,7 +7,6 @@ _meta_template = dict(
|
|||||||
dict(role="HUMAN", begin='[INST] ', end=' [/INST]'),
|
dict(role="HUMAN", begin='[INST] ', end=' [/INST]'),
|
||||||
dict(role="BOT", begin="", end='</s> ', generate=True),
|
dict(role="BOT", begin="", end='</s> ', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=2
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -30,6 +29,6 @@ models = [
|
|||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=2, num_procs=1),
|
run_cfg=dict(num_gpus=2, num_procs=1),
|
||||||
end_str='</s>',
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -12,7 +12,6 @@ models = [
|
|||||||
type=HuggingFace,
|
type=HuggingFace,
|
||||||
abbr='minicpm-2b-dpo-hf',
|
abbr='minicpm-2b-dpo-hf',
|
||||||
path='openbmb/MiniCPM-2B-dpo-fp32',
|
path='openbmb/MiniCPM-2B-dpo-fp32',
|
||||||
tokenizer_path='openbmb/MiniCPM-2B-dpo-fp32',
|
|
||||||
model_kwargs=dict(
|
model_kwargs=dict(
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
device_map='auto',
|
device_map='auto',
|
||||||
@ -27,6 +26,6 @@ models = [
|
|||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
end_str='<用户>',
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -12,7 +12,6 @@ models = [
|
|||||||
type=HuggingFace,
|
type=HuggingFace,
|
||||||
abbr='minicpm-2b-sft-hf',
|
abbr='minicpm-2b-sft-hf',
|
||||||
path='openbmb/MiniCPM-2B-sft-fp32',
|
path='openbmb/MiniCPM-2B-sft-fp32',
|
||||||
tokenizer_path='openbmb/MiniCPM-2B-sft-fp32',
|
|
||||||
model_kwargs=dict(
|
model_kwargs=dict(
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
device_map='auto',
|
device_map='auto',
|
||||||
@ -27,6 +26,6 @@ models = [
|
|||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
end_str='<用户>',
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
25
configs/models/others/hf_command_r_plus.py
Normal file
25
configs/models/others/hf_command_r_plus.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
from opencompass.models import HuggingFaceCausalLM
|
||||||
|
|
||||||
|
_meta_template = dict(
|
||||||
|
round=[
|
||||||
|
dict(role="HUMAN", begin='<|START_OF_TURN_TOKEN|><|USER_TOKEN|>', end='<|END_OF_TURN_TOKEN|>'),
|
||||||
|
dict(role="BOT", begin="<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", end='<|END_OF_TURN_TOKEN|>', generate=True),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=HuggingFaceCausalLM,
|
||||||
|
abbr='command-r-plus-hf',
|
||||||
|
path="CohereForAI/c4ai-command-r-plus",
|
||||||
|
model_kwargs=dict(device_map='auto', trust_remote_code=True),
|
||||||
|
tokenizer_kwargs=dict(padding_side='left', truncation_side='left', trust_remote_code=True),
|
||||||
|
meta_template=_meta_template,
|
||||||
|
max_out_len=100,
|
||||||
|
max_seq_len=2048,
|
||||||
|
batch_size=8,
|
||||||
|
run_cfg=dict(num_gpus=8, num_procs=1),
|
||||||
|
end_str='<|END_OF_TURN_TOKEN|>',
|
||||||
|
batch_padding=True,
|
||||||
|
)
|
||||||
|
]
|
@ -29,7 +29,6 @@ models = [
|
|||||||
batch_size=8,
|
batch_size=8,
|
||||||
meta_template=_meta_template,
|
meta_template=_meta_template,
|
||||||
run_cfg=dict(num_gpus=8, num_procs=1),
|
run_cfg=dict(num_gpus=8, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
|
||||||
batch_padding=True,
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -5,7 +5,6 @@ _meta_template = dict(
|
|||||||
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||||
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
|
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=151645,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -24,11 +23,11 @@ models = [
|
|||||||
use_fast=False,
|
use_fast=False,
|
||||||
),
|
),
|
||||||
meta_template=_meta_template,
|
meta_template=_meta_template,
|
||||||
pad_token_id=151645,
|
|
||||||
max_out_len=100,
|
max_out_len=100,
|
||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=4, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -5,7 +5,6 @@ _meta_template = dict(
|
|||||||
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||||
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
|
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=151645,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -24,11 +23,11 @@ models = [
|
|||||||
use_fast=False,
|
use_fast=False,
|
||||||
),
|
),
|
||||||
meta_template=_meta_template,
|
meta_template=_meta_template,
|
||||||
pad_token_id=151645,
|
|
||||||
max_out_len=100,
|
max_out_len=100,
|
||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=4, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -5,7 +5,6 @@ _meta_template = dict(
|
|||||||
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||||
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
|
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=151645,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -24,11 +23,11 @@ models = [
|
|||||||
use_fast=False,
|
use_fast=False,
|
||||||
),
|
),
|
||||||
meta_template=_meta_template,
|
meta_template=_meta_template,
|
||||||
pad_token_id=151645,
|
|
||||||
max_out_len=100,
|
max_out_len=100,
|
||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=4, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
25
configs/models/qwen/hf_qwen1_5_32b.py
Normal file
25
configs/models/qwen/hf_qwen1_5_32b.py
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
from opencompass.models import HuggingFaceCausalLM
|
||||||
|
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=HuggingFaceCausalLM,
|
||||||
|
abbr='qwen1.5-32b-hf',
|
||||||
|
path="Qwen/Qwen1.5-32B",
|
||||||
|
tokenizer_path='Qwen/Qwen1.5-32B',
|
||||||
|
model_kwargs=dict(
|
||||||
|
device_map='auto',
|
||||||
|
trust_remote_code=True
|
||||||
|
),
|
||||||
|
tokenizer_kwargs=dict(
|
||||||
|
padding_side='left',
|
||||||
|
truncation_side='left',
|
||||||
|
trust_remote_code=True,
|
||||||
|
use_fast=False,
|
||||||
|
),
|
||||||
|
pad_token_id=151645,
|
||||||
|
max_out_len=100,
|
||||||
|
max_seq_len=2048,
|
||||||
|
batch_size=8,
|
||||||
|
run_cfg=dict(num_gpus=2, num_procs=1),
|
||||||
|
)
|
||||||
|
]
|
33
configs/models/qwen/hf_qwen1_5_32b_chat.py
Normal file
33
configs/models/qwen/hf_qwen1_5_32b_chat.py
Normal file
@ -0,0 +1,33 @@
|
|||||||
|
from opencompass.models import HuggingFaceCausalLM
|
||||||
|
|
||||||
|
_meta_template = dict(
|
||||||
|
round=[
|
||||||
|
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||||
|
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
models = [
|
||||||
|
dict(
|
||||||
|
type=HuggingFaceCausalLM,
|
||||||
|
abbr='qwen1.5-32b-chat-hf',
|
||||||
|
path="Qwen/Qwen1.5-32B-Chat",
|
||||||
|
model_kwargs=dict(
|
||||||
|
device_map='auto',
|
||||||
|
trust_remote_code=True
|
||||||
|
),
|
||||||
|
tokenizer_kwargs=dict(
|
||||||
|
padding_side='left',
|
||||||
|
truncation_side='left',
|
||||||
|
trust_remote_code=True,
|
||||||
|
use_fast=False,
|
||||||
|
),
|
||||||
|
meta_template=_meta_template,
|
||||||
|
max_out_len=100,
|
||||||
|
max_seq_len=2048,
|
||||||
|
batch_size=8,
|
||||||
|
run_cfg=dict(num_gpus=2, num_procs=1),
|
||||||
|
end_str='<|im_end|>',
|
||||||
|
batch_padding=True,
|
||||||
|
)
|
||||||
|
]
|
@ -5,7 +5,6 @@ _meta_template = dict(
|
|||||||
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||||
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
|
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=151645,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -24,11 +23,11 @@ models = [
|
|||||||
use_fast=False,
|
use_fast=False,
|
||||||
),
|
),
|
||||||
meta_template=_meta_template,
|
meta_template=_meta_template,
|
||||||
pad_token_id=151645,
|
|
||||||
max_out_len=100,
|
max_out_len=100,
|
||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=4, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -5,7 +5,6 @@ _meta_template = dict(
|
|||||||
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||||
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
|
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=151645,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -24,11 +23,11 @@ models = [
|
|||||||
use_fast=False,
|
use_fast=False,
|
||||||
),
|
),
|
||||||
meta_template=_meta_template,
|
meta_template=_meta_template,
|
||||||
pad_token_id=151645,
|
|
||||||
max_out_len=100,
|
max_out_len=100,
|
||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=4, num_procs=1),
|
run_cfg=dict(num_gpus=4, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -5,7 +5,6 @@ _meta_template = dict(
|
|||||||
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
|
||||||
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
|
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
|
||||||
],
|
],
|
||||||
eos_token_id=151645,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
models = [
|
models = [
|
||||||
@ -24,11 +23,11 @@ models = [
|
|||||||
use_fast=False,
|
use_fast=False,
|
||||||
),
|
),
|
||||||
meta_template=_meta_template,
|
meta_template=_meta_template,
|
||||||
pad_token_id=151645,
|
|
||||||
max_out_len=100,
|
max_out_len=100,
|
||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=4, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -12,7 +12,6 @@ models = [
|
|||||||
type=HuggingFace,
|
type=HuggingFace,
|
||||||
abbr='yi-34b-chat-hf',
|
abbr='yi-34b-chat-hf',
|
||||||
path='01-ai/Yi-34B-Chat',
|
path='01-ai/Yi-34B-Chat',
|
||||||
tokenizer_path='01-ai/Yi-34B-Chat',
|
|
||||||
model_kwargs=dict(
|
model_kwargs=dict(
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
device_map='auto',
|
device_map='auto',
|
||||||
@ -26,7 +25,8 @@ models = [
|
|||||||
max_out_len=100,
|
max_out_len=100,
|
||||||
max_seq_len=2048,
|
max_seq_len=2048,
|
||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=4, num_procs=1),
|
run_cfg=dict(num_gpus=2, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -28,5 +28,6 @@ models = [
|
|||||||
batch_size=8,
|
batch_size=8,
|
||||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||||
end_str='<|im_end|>',
|
end_str='<|im_end|>',
|
||||||
|
batch_padding=True,
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
@ -133,6 +133,8 @@ context_lengths_128k = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 1
|
|||||||
needlebench_128k_summarizer = create_summarizer(context_lengths_128k, depths_list_sparse, "128k")
|
needlebench_128k_summarizer = create_summarizer(context_lengths_128k, depths_list_sparse, "128k")
|
||||||
context_lengths_200k = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000])
|
context_lengths_200k = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000])
|
||||||
needlebench_200k_summarizer = create_summarizer(context_lengths_200k, depths_list_sparse, "200k")
|
needlebench_200k_summarizer = create_summarizer(context_lengths_200k, depths_list_sparse, "200k")
|
||||||
|
context_lengths_256k = list([32000, 128000, 256000])
|
||||||
|
needlebench_256k_summarizer = create_summarizer(context_lengths_256k, depths_list_sparse, "256k")
|
||||||
context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000])
|
context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000])
|
||||||
needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k, depths_list_sparse, "1000k")
|
needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k, depths_list_sparse, "1000k")
|
||||||
|
|
||||||
|
@ -4,7 +4,7 @@ This tutorial primarily focuses on evaluating a model's coding proficiency, usin
|
|||||||
|
|
||||||
## pass@1
|
## pass@1
|
||||||
|
|
||||||
If you only need to generate a single response to evaluate the pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/mbpp_gen_1e1056.py), referring to the general [quick start tutorial](../get_started/quick_start.md).
|
If you only need to generate a single response to evaluate the pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py), referring to the general [quick start tutorial](../get_started/quick_start.md).
|
||||||
|
|
||||||
For multilingual evaluation, please refer to the [Multilingual Code Evaluation Tutorial](./code_eval_service.md).
|
For multilingual evaluation, please refer to the [Multilingual Code Evaluation Tutorial](./code_eval_service.md).
|
||||||
|
|
||||||
@ -21,7 +21,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
|
|||||||
|
|
||||||
with read_base():
|
with read_base():
|
||||||
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||||
from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets
|
from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
|
||||||
|
|
||||||
mbpp_datasets[0]['type'] = MBPPDataset_V2
|
mbpp_datasets[0]['type'] = MBPPDataset_V2
|
||||||
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
|
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
|
||||||
@ -63,7 +63,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
|
|||||||
|
|
||||||
with read_base():
|
with read_base():
|
||||||
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||||
from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets
|
from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
|
||||||
|
|
||||||
humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
|
humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
|
||||||
humaneval_datasets[0]['num_repeats'] = 10
|
humaneval_datasets[0]['num_repeats'] = 10
|
||||||
|
@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
## pass@1
|
## pass@1
|
||||||
|
|
||||||
如果只需要生成单条回复来评测pass@1的性能,可以直接使用[configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) 和 [configs/datasets/mbpp/mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/mbpp_gen_1e1056.py) 并参考通用的[快速上手教程](../get_started/quick_start.md)即可。
|
如果只需要生成单条回复来评测pass@1的性能,可以直接使用[configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) 和 [configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py) 并参考通用的[快速上手教程](../get_started/quick_start.md)即可。
|
||||||
|
|
||||||
如果要进行多语言评测,可以参考[多语言代码评测教程](./code_eval_service.md)。
|
如果要进行多语言评测,可以参考[多语言代码评测教程](./code_eval_service.md)。
|
||||||
|
|
||||||
@ -21,7 +21,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
|
|||||||
|
|
||||||
with read_base():
|
with read_base():
|
||||||
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||||
from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets
|
from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
|
||||||
|
|
||||||
mbpp_datasets[0]['type'] = MBPPDataset_V2
|
mbpp_datasets[0]['type'] = MBPPDataset_V2
|
||||||
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
|
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
|
||||||
@ -64,7 +64,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
|
|||||||
|
|
||||||
with read_base():
|
with read_base():
|
||||||
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
|
||||||
from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets
|
from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
|
||||||
|
|
||||||
humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
|
humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
|
||||||
humaneval_datasets[0]['num_repeats'] = 10
|
humaneval_datasets[0]['num_repeats'] = 10
|
||||||
|
@ -56,6 +56,12 @@ def parse_args():
|
|||||||
'to run',
|
'to run',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
default=False)
|
default=False)
|
||||||
|
parser.add_argument(
|
||||||
|
'--accelerator',
|
||||||
|
help='Infer accelerator, support vllm and lmdeploy now.',
|
||||||
|
choices=['vllm', 'lmdeploy', 'hg'],
|
||||||
|
default='hg',
|
||||||
|
type=str)
|
||||||
parser.add_argument('-m',
|
parser.add_argument('-m',
|
||||||
'--mode',
|
'--mode',
|
||||||
help='Running mode. You can choose "infer" if you '
|
help='Running mode. You can choose "infer" if you '
|
||||||
|
@ -27,11 +27,9 @@ except ImportError:
|
|||||||
|
|
||||||
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
||||||
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
|
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
|
||||||
from opencompass.utils.logging import get_logger
|
|
||||||
|
|
||||||
from .base import BaseDataset
|
from .base import BaseDataset
|
||||||
|
|
||||||
logger = get_logger()
|
|
||||||
TIMEOUT = 10
|
TIMEOUT = 10
|
||||||
|
|
||||||
|
|
||||||
@ -321,7 +319,7 @@ def timeout_handler(signum, frame):
|
|||||||
try:
|
try:
|
||||||
signal.signal(signal.SIGALRM, timeout_handler)
|
signal.signal(signal.SIGALRM, timeout_handler)
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
logger.warning('signal.SIGALRM is not available on this platform')
|
print('signal.SIGALRM is not available on this platform')
|
||||||
timeout = 4 # seconds
|
timeout = 4 # seconds
|
||||||
|
|
||||||
|
|
||||||
|
@ -134,11 +134,20 @@ class MBPPPlusDataset(BaseDataset):
|
|||||||
multiple responses in special cases.
|
multiple responses in special cases.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def processing_test(example):
|
||||||
|
example['test_case'] = example['test_list']
|
||||||
|
example['test_list'] = '\n'.join(example['test_list'])
|
||||||
|
example['test_list_2'] = example['test_list']
|
||||||
|
example['test_column'] = dict(test_list_2=example['test_list'],
|
||||||
|
task_id=example['task_id'])
|
||||||
|
return example
|
||||||
|
|
||||||
dataset = []
|
dataset = []
|
||||||
with open(path, 'r', encoding='utf-8') as f:
|
with open(path, 'r', encoding='utf-8') as f:
|
||||||
for line in f:
|
for line in f:
|
||||||
dataset.extend(
|
example = json.loads(line.strip())
|
||||||
[json.loads(line.strip()) for _ in range(num_repeats)])
|
example = processing_test(example)
|
||||||
|
dataset.extend([example for _ in range(num_repeats)])
|
||||||
return Dataset.from_list(dataset)
|
return Dataset.from_list(dataset)
|
||||||
|
|
||||||
|
|
||||||
@ -211,7 +220,7 @@ class MBPPEvaluator(BaseEvaluator):
|
|||||||
predictions)):
|
predictions)):
|
||||||
pred = self._process_answer(pred)
|
pred = self._process_answer(pred)
|
||||||
programs = self._process_test(refer, pred)
|
programs = self._process_test(refer, pred)
|
||||||
future = executor.submit(execution, programs, i, 3)
|
future = executor.submit(execution, programs, i, 10)
|
||||||
futures.append(future)
|
futures.append(future)
|
||||||
details[str(i)] = {}
|
details[str(i)] = {}
|
||||||
details[str(i)]['origin'] = predictions[i]
|
details[str(i)]['origin'] = predictions[i]
|
||||||
@ -262,39 +271,34 @@ class MBPPEvaluator(BaseEvaluator):
|
|||||||
return {f'mbpp_plus_{k}': score[k] * 100 for k in score}
|
return {f'mbpp_plus_{k}': score[k] * 100 for k in score}
|
||||||
|
|
||||||
def _process_answer(self, text):
|
def _process_answer(self, text):
|
||||||
try:
|
patterns = [
|
||||||
# for chatGLM related text
|
r"\[BEGIN\]\s*'(.*)'\s*\[DONE\]",
|
||||||
eval_text = eval(text)
|
r"BEGIN\s*'(.*)'\s*\[DONE\]",
|
||||||
except Exception:
|
r"\[BEGIN\]\s*'(.*)'\s*DONE",
|
||||||
pass
|
r"BEGIN\s*'(.*)'\s*DONE",
|
||||||
else:
|
r"\[BEGIN\]\s*'(.*)\s*\[DONE\]",
|
||||||
if isinstance(eval_text, str):
|
r"BEGIN\s*'(.*)\s*\[DONE\]",
|
||||||
text = eval_text
|
r"\[BEGIN\]\s*'(.*)\s*DONE",
|
||||||
# deal with code block
|
r"BEGIN\s*'(.*)\s*DONE",
|
||||||
if '```' in text:
|
r'\[BEGIN\]\s*(.*)\s*\[DONE\]',
|
||||||
blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
|
r'BEGIN\s*(.*)\s*\[DONE\]',
|
||||||
if len(blocks) == 0:
|
r'\[BEGIN\]\s*(.*)\s*DONE',
|
||||||
text = text.split('```')[1] # fall back to default strategy
|
r'BEGIN\s*(.*)\s*DONE',
|
||||||
else:
|
r'```python\s*(.*)\s*```',
|
||||||
text = blocks[0] # fetch the first code block
|
r'```\s*(.*)\s*```',
|
||||||
if not text.startswith('\n'): # in case starting with ```xxx
|
r'(.*)\s*```.*',
|
||||||
text = text[max(text.find('\n') + 1, 0):]
|
r"\[BEGIN\]\s*'(.*)",
|
||||||
|
r'\[BEGIN\](.*)',
|
||||||
|
]
|
||||||
|
for p in patterns:
|
||||||
|
match = re.search(p, text, re.DOTALL)
|
||||||
|
if match:
|
||||||
|
text = match.group(1)
|
||||||
|
break
|
||||||
|
text = text.split('```')[0]
|
||||||
|
text = re.split(r"'?\s*\[?DONE\]?", text)[0]
|
||||||
|
text = text.replace('\\_', '_')
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
match = re.search(r"('\s*|)(\[DONE\]|DONE)", text)
|
|
||||||
if match:
|
|
||||||
text = text[:match.start()]
|
|
||||||
match = re.search(r"(\[BEGIN\]|BEGIN)('\s*|)", text)
|
|
||||||
if match:
|
|
||||||
text = text[match.end():]
|
|
||||||
text = text.strip()
|
|
||||||
if text.startswith("'"):
|
|
||||||
text = text[1:]
|
|
||||||
if text.endswith("'"):
|
|
||||||
text = text[:-1]
|
|
||||||
text = text.replace('\\', '')
|
|
||||||
match = re.search(r'```python(.*)```', text, re.DOTALL)
|
|
||||||
if match:
|
|
||||||
text = match.group(1).strip().split('```')[0].strip()
|
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def _process_test(self, test_case, pred):
|
def _process_test(self, test_case, pred):
|
||||||
@ -451,7 +455,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
|
|||||||
for pred in preds:
|
for pred in preds:
|
||||||
pred = self._process_answer(pred)
|
pred = self._process_answer(pred)
|
||||||
programs = self._process_test(test_case, pred)
|
programs = self._process_test(test_case, pred)
|
||||||
future = executor.submit(execution, programs, task_id, 3)
|
future = executor.submit(execution, programs, task_id, 10)
|
||||||
futures.append(future)
|
futures.append(future)
|
||||||
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
@ -27,11 +27,9 @@ except ImportError:
|
|||||||
|
|
||||||
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
||||||
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
|
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
|
||||||
from opencompass.utils.logging import get_logger
|
|
||||||
|
|
||||||
from .base import BaseDataset
|
from .base import BaseDataset
|
||||||
|
|
||||||
logger = get_logger()
|
|
||||||
TIMEOUT = 10
|
TIMEOUT = 10
|
||||||
|
|
||||||
|
|
||||||
@ -267,7 +265,7 @@ def timeout_handler(signum, frame):
|
|||||||
try:
|
try:
|
||||||
signal.signal(signal.SIGALRM, timeout_handler)
|
signal.signal(signal.SIGALRM, timeout_handler)
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
logger.warning('signal.SIGALRM is not available on this platform')
|
print('signal.SIGALRM is not available on this platform')
|
||||||
timeout = 4 # seconds
|
timeout = 4 # seconds
|
||||||
|
|
||||||
|
|
||||||
|
@ -84,7 +84,12 @@ class OpenAI(BaseAPIModel):
|
|||||||
self.top_logprobs = top_logprobs
|
self.top_logprobs = top_logprobs
|
||||||
|
|
||||||
if isinstance(key, str):
|
if isinstance(key, str):
|
||||||
self.keys = [os.getenv('OPENAI_API_KEY') if key == 'ENV' else key]
|
if key == 'ENV':
|
||||||
|
if 'OPENAI_API_KEY' not in os.environ:
|
||||||
|
raise ValueError('OpenAI API key is not set.')
|
||||||
|
self.keys = os.getenv('OPENAI_API_KEY').split(',')
|
||||||
|
else:
|
||||||
|
self.keys = [key]
|
||||||
else:
|
else:
|
||||||
self.keys = key
|
self.keys = key
|
||||||
|
|
||||||
@ -101,12 +106,11 @@ class OpenAI(BaseAPIModel):
|
|||||||
self.url = openai_api_base
|
self.url = openai_api_base
|
||||||
self.path = path
|
self.path = path
|
||||||
|
|
||||||
def generate(
|
def generate(self,
|
||||||
self,
|
|
||||||
inputs: List[PromptType],
|
inputs: List[PromptType],
|
||||||
max_out_len: int = 512,
|
max_out_len: int = 512,
|
||||||
temperature: float = 0.7,
|
temperature: float = 0.7,
|
||||||
) -> List[str]:
|
**kwargs) -> List[str]:
|
||||||
"""Generate results given a list of inputs.
|
"""Generate results given a list of inputs.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -412,9 +416,15 @@ class OpenAIAllesAPIN(OpenAI):
|
|||||||
}
|
}
|
||||||
for _ in range(self.retry):
|
for _ in range(self.retry):
|
||||||
self.wait()
|
self.wait()
|
||||||
|
try:
|
||||||
raw_response = requests.post(self.url,
|
raw_response = requests.post(self.url,
|
||||||
headers=self.headers,
|
headers=self.headers,
|
||||||
data=json.dumps(data))
|
data=json.dumps(data))
|
||||||
|
except requests.ConnectionError:
|
||||||
|
self.logger.error('Request error, got',
|
||||||
|
str(raw_response.content))
|
||||||
|
time.sleep(1)
|
||||||
|
continue
|
||||||
try:
|
try:
|
||||||
response = raw_response.json()
|
response = raw_response.json()
|
||||||
except requests.JSONDecodeError:
|
except requests.JSONDecodeError:
|
||||||
|
@ -161,7 +161,7 @@ class Qwen(BaseAPIModel):
|
|||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
continue
|
continue
|
||||||
if response.status_code == 429:
|
if response.status_code == 429:
|
||||||
print('Rate limited')
|
print(response)
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
continue
|
continue
|
||||||
if response.status_code == 400:
|
if response.status_code == 400:
|
||||||
|
@ -214,6 +214,16 @@ class DLCRunner(BaseRunner):
|
|||||||
pod_create_time = None
|
pod_create_time = None
|
||||||
pri_time = None
|
pri_time = None
|
||||||
initial_time = datetime.datetime.now()
|
initial_time = datetime.datetime.now()
|
||||||
|
|
||||||
|
url = 'http://pai-console.cb210e3f99cd7403f8de2a630dcc99fc3.cn-wulanchabu.alicontainer.com' # noqa: E501
|
||||||
|
logger = get_logger()
|
||||||
|
logger.debug('')
|
||||||
|
logger.debug('*' * 168)
|
||||||
|
logger.debug(
|
||||||
|
f'{url}/index?workspaceId={self.aliyun_cfg["workspace_id"]}#/dlc2/job/{job_id}/detail' # noqa: E501
|
||||||
|
)
|
||||||
|
logger.debug('*' * 168)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
# 1. Avoid to request dlc too frequently.
|
# 1. Avoid to request dlc too frequently.
|
||||||
# 2. DLC job may not be ready immediately after creation.
|
# 2. DLC job may not be ready immediately after creation.
|
||||||
|
@ -188,6 +188,7 @@ class SlurmSequentialRunner(BaseRunner):
|
|||||||
tmpl += f' --gres=gpu:{num_gpus}'
|
tmpl += f' --gres=gpu:{num_gpus}'
|
||||||
for extra_cmd in self.extra_command:
|
for extra_cmd in self.extra_command:
|
||||||
tmpl += f' {extra_cmd}'
|
tmpl += f' {extra_cmd}'
|
||||||
|
tmpl += ' -x HOST-10-140-60-7'
|
||||||
tmpl += f" -N1 -u -J '{task_name[:512]}'" + ' {task_cmd}'
|
tmpl += f" -N1 -u -J '{task_name[:512]}'" + ' {task_cmd}'
|
||||||
get_cmd = partial(task.get_command,
|
get_cmd = partial(task.get_command,
|
||||||
cfg_path=param_file,
|
cfg_path=param_file,
|
||||||
|
@ -72,7 +72,7 @@ dataset_mapping_dict = {}
|
|||||||
|
|
||||||
needle_counts = ['2', '3', '4', '5']
|
needle_counts = ['2', '3', '4', '5']
|
||||||
languages = ['en', 'zh']
|
languages = ['en', 'zh']
|
||||||
sizes = ['4k', '8k', '32k', '200k', '1000k']
|
sizes = ['4k', '8k', '32k', '200k', '256k', '1000k']
|
||||||
types = ['origin', 'parallel']
|
types = ['origin', 'parallel']
|
||||||
|
|
||||||
for needle_count in needle_counts:
|
for needle_count in needle_counts:
|
||||||
@ -190,7 +190,7 @@ def save_results_to_plots(txt_results_save_path):
|
|||||||
numbers = [2, 3, 4, 5]
|
numbers = [2, 3, 4, 5]
|
||||||
languages = ['en', 'zh']
|
languages = ['en', 'zh']
|
||||||
size_exists = []
|
size_exists = []
|
||||||
sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_1000k']
|
sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_256k', '_1000k']
|
||||||
|
|
||||||
for size in sizes_origin:
|
for size in sizes_origin:
|
||||||
if size in content:
|
if size in content:
|
||||||
@ -301,6 +301,9 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str):
|
|||||||
markersize=8,
|
markersize=8,
|
||||||
label='Average Depth Score'
|
label='Average Depth Score'
|
||||||
)
|
)
|
||||||
|
for x_value, y_value in zip(x_data, y_data):
|
||||||
|
ax2.text(x_value, y_value, f'{y_value:.2f}', ha='center', va='top')
|
||||||
|
|
||||||
ax2.set_ylim(0, 100)
|
ax2.set_ylim(0, 100)
|
||||||
|
|
||||||
ax2.set_yticklabels([])
|
ax2.set_yticklabels([])
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
# flake8: noqa: E501
|
# flake8: noqa
|
||||||
import ast
|
# yapf: disable
|
||||||
import csv
|
|
||||||
import os
|
import os
|
||||||
import os.path as osp
|
import os.path as osp
|
||||||
import re
|
import re
|
||||||
@ -10,7 +9,7 @@ from itertools import product
|
|||||||
|
|
||||||
import mmengine
|
import mmengine
|
||||||
from mmengine import ConfigDict
|
from mmengine import ConfigDict
|
||||||
from prettytable import from_csv
|
from tabulate import tabulate
|
||||||
|
|
||||||
from opencompass.partitioners.sub_naive import remove_duplicate_pairs
|
from opencompass.partitioners.sub_naive import remove_duplicate_pairs
|
||||||
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
|
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
|
||||||
@ -18,6 +17,12 @@ from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
|
|||||||
from .utils import get_judgeanswer_and_reference, get_outdir
|
from .utils import get_judgeanswer_and_reference, get_outdir
|
||||||
|
|
||||||
|
|
||||||
|
def model_abbr_from_cfg_used_in_summarizer(model):
|
||||||
|
if model.get('summarizer_abbr', None):
|
||||||
|
return model['summarizer_abbr']
|
||||||
|
else:
|
||||||
|
return model_abbr_from_cfg(model)
|
||||||
|
|
||||||
def post_process_compass_arena(s):
|
def post_process_compass_arena(s):
|
||||||
if result := re.findall('(?:选择:|Choice: )([ABC])', s):
|
if result := re.findall('(?:选择:|Choice: )([ABC])', s):
|
||||||
return result[0]
|
return result[0]
|
||||||
@ -68,17 +73,90 @@ class CompassArenaSummarizer:
|
|||||||
self.base_models = self.cfg['eval']['partitioner']['base_models']
|
self.base_models = self.cfg['eval']['partitioner']['base_models']
|
||||||
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
|
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
|
||||||
self.judge_models = self.cfg.get('judge_models', None)
|
self.judge_models = self.cfg.get('judge_models', None)
|
||||||
self.meta_judge_model = self.cfg.eval.partitioner.get(
|
self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
|
||||||
'meta_judge_model', None)
|
|
||||||
self.judge_type = judge_type
|
self.judge_type = judge_type
|
||||||
assert self.judge_type in ['general']
|
assert self.judge_type in ['general']
|
||||||
self.judge_map = {
|
self.judge_map = {'general': post_process_compass_arena}
|
||||||
'general': post_process_compass_arena,
|
|
||||||
}
|
|
||||||
self.judge_function = self.judge_map[self.judge_type]
|
self.judge_function = self.judge_map[self.judge_type]
|
||||||
self.check_pos_bias = check_pos_bias
|
self.check_pos_bias = check_pos_bias
|
||||||
self.summary_type = summary_type
|
self.summary_type = summary_type
|
||||||
|
|
||||||
|
def get_score(self, time_str):
|
||||||
|
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
||||||
|
model_combinations = list(product(self.base_models, self.compare_models))
|
||||||
|
unique_combinations = remove_duplicate_pairs([combo for combo in model_combinations if combo[0] != combo[1]])
|
||||||
|
|
||||||
|
if self.meta_judge_model is not None:
|
||||||
|
self.judge_models.append(self.meta_judge_model)
|
||||||
|
|
||||||
|
scores = {}
|
||||||
|
|
||||||
|
for idx, judge_model_cfg in enumerate(self.judge_models):
|
||||||
|
judge_model = model_abbr_from_cfg(judge_model_cfg)
|
||||||
|
for dataset in self.cfg['datasets']:
|
||||||
|
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||||
|
for model_pair in unique_combinations:
|
||||||
|
model1 = model_pair[0]['abbr']
|
||||||
|
model2 = model_pair[1]['abbr']
|
||||||
|
if idx == len(self.judge_models):
|
||||||
|
subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
|
||||||
|
else:
|
||||||
|
subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
|
||||||
|
subdir_path = os.path.join(results_folder, subdir)
|
||||||
|
if not os.path.isdir(subdir_path):
|
||||||
|
print(subdir_path + ' is not exist! please check!')
|
||||||
|
continue
|
||||||
|
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
|
||||||
|
if self.check_pos_bias:
|
||||||
|
bias_num = check_position_bias(judged_answers, references)
|
||||||
|
else:
|
||||||
|
bias_num = 0
|
||||||
|
win_model1 = defaultdict(float)
|
||||||
|
win_model2 = defaultdict(float)
|
||||||
|
categories = defaultdict(float)
|
||||||
|
model1 = references[0]['answer1']
|
||||||
|
model2 = references[0]['answer2']
|
||||||
|
for prediction, reference in zip(judged_answers, references):
|
||||||
|
categories[dataset_abbr] += 1
|
||||||
|
categories[reference['capability']] += 1
|
||||||
|
|
||||||
|
if prediction == 'A':
|
||||||
|
if reference['answer1'] == model1:
|
||||||
|
score_1, score_2 = 1, 0
|
||||||
|
else:
|
||||||
|
score_1, score_2 = 0, 1
|
||||||
|
elif prediction == 'B':
|
||||||
|
if reference['answer1'] == model1:
|
||||||
|
score_1, score_2 = 0, 1
|
||||||
|
else:
|
||||||
|
score_1, score_2 = 1, 0
|
||||||
|
elif prediction == 'C':
|
||||||
|
if self.summary_type == 'half_add':
|
||||||
|
score_1, score_2 = 0.5, 0.5
|
||||||
|
else:
|
||||||
|
score_1, score_2 = 0, 0
|
||||||
|
|
||||||
|
win_model1[reference['capability']] += score_1
|
||||||
|
win_model1[dataset_abbr] += score_1
|
||||||
|
win_model2[reference['capability']] += score_2
|
||||||
|
win_model2[dataset_abbr] += score_2
|
||||||
|
for capability in categories:
|
||||||
|
win_model1[capability] = win_model1[capability] / categories[capability] * 100
|
||||||
|
win_model1[capability] = round(win_model1[capability], 2)
|
||||||
|
win_model2[capability] = win_model2[capability] / categories[capability] * 100
|
||||||
|
win_model2[capability] = round(win_model2[capability], 2)
|
||||||
|
|
||||||
|
win_model1['position_bias'] = bias_num
|
||||||
|
win_model2['position_bias'] = bias_num
|
||||||
|
|
||||||
|
if judge_model not in scores:
|
||||||
|
scores[judge_model] = {}
|
||||||
|
if dataset_abbr not in scores[judge_model]:
|
||||||
|
scores[judge_model][dataset_abbr] = {}
|
||||||
|
scores[judge_model][dataset_abbr][model2] = win_model2
|
||||||
|
|
||||||
|
return scores
|
||||||
|
|
||||||
def summarize(
|
def summarize(
|
||||||
self,
|
self,
|
||||||
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
|
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
|
||||||
@ -91,143 +169,72 @@ class CompassArenaSummarizer:
|
|||||||
Returns:
|
Returns:
|
||||||
pd.DataFrame: The summary results.
|
pd.DataFrame: The summary results.
|
||||||
"""
|
"""
|
||||||
dataset_cfgs = self.cfg['datasets']
|
|
||||||
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
|
||||||
model_combinations = list(
|
|
||||||
product(self.base_models, self.compare_models))
|
|
||||||
unique_combinations = remove_duplicate_pairs(
|
|
||||||
[combo for combo in model_combinations if combo[0] != combo[1]])
|
|
||||||
|
|
||||||
fout_list = []
|
|
||||||
pre_len = len(self.judge_models)
|
scores = self.get_score(time_str)
|
||||||
if self.meta_judge_model is not None:
|
# scores['win_' + model1] = win_model1
|
||||||
self.judge_models.append(self.meta_judge_model)
|
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
||||||
meta_judge_model_abbr = model_abbr_from_cfg(self.meta_judge_model)
|
|
||||||
else:
|
|
||||||
meta_judge_model_abbr = None
|
|
||||||
for idx, judge_model in enumerate(self.judge_models):
|
for idx, judge_model in enumerate(self.judge_models):
|
||||||
judge_model = model_abbr_from_cfg(judge_model)
|
judge_abbr = model_abbr_from_cfg(judge_model)
|
||||||
for dataset in dataset_cfgs:
|
for dataset in self.cfg['datasets']:
|
||||||
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||||
if idx == pre_len:
|
summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
|
||||||
fout = osp.join(
|
one_column = list(scores[judge_abbr][dataset_abbr].values())[0]
|
||||||
output_dir, 'summarized-by--' + judge_model + '-' +
|
row_headers = [i for i in one_column.keys() if i not in [dataset_abbr, 'position_bias']]
|
||||||
dataset_abbr + '-report.csv')
|
row_headers = [dataset_abbr, 'position_bias'] + row_headers
|
||||||
|
headers = [''] + summarizer_model_abbrs
|
||||||
|
table = []
|
||||||
|
for row_header in row_headers:
|
||||||
|
row = [row_header]
|
||||||
|
for model_cfg in self.compare_models:
|
||||||
|
model_abbr = model_abbr_from_cfg(model_cfg)
|
||||||
|
s = scores[judge_abbr][dataset_abbr][model_abbr].get(row_header, '')
|
||||||
|
if isinstance(s, float):
|
||||||
|
s = f'{s:.2f}'
|
||||||
|
if isinstance(s, int):
|
||||||
|
s = str(s)
|
||||||
|
row.append(s)
|
||||||
|
table.append(row)
|
||||||
|
txt = tabulate(table, headers=headers)
|
||||||
|
print(txt)
|
||||||
|
|
||||||
|
if idx == len(self.judge_models):
|
||||||
|
output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
|
||||||
else:
|
else:
|
||||||
fout = osp.join(
|
output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
|
||||||
output_dir, 'judged-by--' + judge_model + '-' +
|
|
||||||
dataset_abbr + '-report.csv')
|
with open(output_filename, 'w') as f:
|
||||||
fout_list.append(fout)
|
f.write(','.join(headers) + '\n')
|
||||||
for model_pair in unique_combinations:
|
for line in table:
|
||||||
model1, model2, = model_pair[0]['abbr'], model_pair[1][
|
f.write(','.join(line) + '\n')
|
||||||
'abbr'],
|
print(output_filename)
|
||||||
if idx == pre_len:
|
|
||||||
subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
|
table = []
|
||||||
|
summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
|
||||||
|
headers = [''] + summarizer_model_abbrs
|
||||||
|
for dataset in self.cfg['datasets']:
|
||||||
|
dataset_abbr = dataset_abbr_from_cfg(dataset)
|
||||||
|
row = [dataset_abbr]
|
||||||
|
for model_cfg in self.compare_models:
|
||||||
|
model_abbr = model_abbr_from_cfg(model_cfg)
|
||||||
|
s = scores[judge_abbr][dataset_abbr][model_abbr].get(dataset_abbr, '')
|
||||||
|
if isinstance(s, float):
|
||||||
|
s = f'{s:.2f}'
|
||||||
|
if isinstance(s, int):
|
||||||
|
s = str(s)
|
||||||
|
row.append(s)
|
||||||
|
table.append(row)
|
||||||
|
txt = tabulate(table, headers=headers)
|
||||||
|
print(txt)
|
||||||
|
|
||||||
|
if idx == len(self.judge_models):
|
||||||
|
output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-overall-report.csv')
|
||||||
else:
|
else:
|
||||||
subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
|
output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-overall-report.csv')
|
||||||
subdir_path = os.path.join(results_folder, subdir)
|
with open(output_filename, 'w') as f:
|
||||||
if os.path.isdir(subdir_path):
|
f.write(','.join(headers) + '\n')
|
||||||
judged_answers, references = get_judgeanswer_and_reference(
|
for line in table:
|
||||||
dataset,
|
f.write(','.join(line) + '\n')
|
||||||
subdir_path,
|
print(output_filename)
|
||||||
self.judge_function,
|
|
||||||
)
|
|
||||||
if self.check_pos_bias:
|
|
||||||
bias_num = check_position_bias(
|
|
||||||
judged_answers, references)
|
|
||||||
else:
|
|
||||||
bias_num = 0
|
|
||||||
win_model1, win_model2, categories = defaultdict(
|
|
||||||
float), defaultdict(float), defaultdict(float)
|
|
||||||
model1, model2 = references[0]['answer1'], references[
|
|
||||||
0]['answer2']
|
|
||||||
for prediction, reference in zip(
|
|
||||||
judged_answers, references):
|
|
||||||
if self.summary_type == 'single':
|
|
||||||
if prediction == 'A':
|
|
||||||
categories['total'] += 1
|
|
||||||
categories[reference['capability']] += 1
|
|
||||||
if reference['answer1'] == model1:
|
|
||||||
win_model1[
|
|
||||||
reference['capability']] += 1
|
|
||||||
win_model1['total'] += 1
|
|
||||||
else:
|
|
||||||
win_model2[
|
|
||||||
reference['capability']] += 1
|
|
||||||
win_model2['total'] += 1
|
|
||||||
elif prediction == 'B':
|
|
||||||
categories['total'] += 1
|
|
||||||
categories[reference['capability']] += 1
|
|
||||||
if reference['answer1'] == model1:
|
|
||||||
win_model2[
|
|
||||||
reference['capability']] += 1
|
|
||||||
win_model2['total'] += 1
|
|
||||||
else:
|
|
||||||
win_model1[
|
|
||||||
reference['capability']] += 1
|
|
||||||
win_model1['total'] += 1
|
|
||||||
elif self.summary_type == 'half_add':
|
|
||||||
categories['total'] += 1
|
|
||||||
categories[reference['capability']] += 1
|
|
||||||
if prediction == 'A':
|
|
||||||
if reference['answer1'] == model1:
|
|
||||||
win_model1[
|
|
||||||
reference['capability']] += 1
|
|
||||||
win_model1['total'] += 1
|
|
||||||
else:
|
|
||||||
win_model2[
|
|
||||||
reference['capability']] += 1
|
|
||||||
win_model2['total'] += 1
|
|
||||||
elif prediction == 'B':
|
|
||||||
if reference['answer1'] == model1:
|
|
||||||
win_model2[
|
|
||||||
reference['capability']] += 1
|
|
||||||
win_model2['total'] += 1
|
|
||||||
else:
|
|
||||||
win_model1[
|
|
||||||
reference['capability']] += 1
|
|
||||||
win_model1['total'] += 1
|
|
||||||
elif prediction == 'C':
|
|
||||||
win_model1[reference['capability']] += 0.5
|
|
||||||
win_model1['total'] += 0.5
|
|
||||||
win_model2[reference['capability']] += 0.5
|
|
||||||
win_model2['total'] += 0.5
|
|
||||||
for capability in categories:
|
|
||||||
if capability not in win_model1:
|
|
||||||
win_model1[capability] = 0.0
|
|
||||||
else:
|
|
||||||
win_model1[capability] = round(
|
|
||||||
(win_model1[capability] /
|
|
||||||
categories[capability]) * 100, 2)
|
|
||||||
if capability not in win_model2:
|
|
||||||
win_model2[capability] = 0.0
|
|
||||||
else:
|
|
||||||
win_model2[capability] = round(
|
|
||||||
(win_model2[capability] /
|
|
||||||
categories[capability]) * 100, 2)
|
|
||||||
win_model1['position_bias'] = bias_num
|
|
||||||
win_model2['position_bias'] = bias_num
|
|
||||||
scores = {
|
|
||||||
'win_' + model1: win_model1,
|
|
||||||
'win_' + model2: win_model2
|
|
||||||
}
|
|
||||||
rows = list(scores.keys())
|
|
||||||
columns = list(scores[rows[0]].keys())
|
|
||||||
columns.insert(0, columns.pop(columns.index('total')))
|
|
||||||
columns.insert(
|
|
||||||
1, columns.pop(columns.index('position_bias')))
|
|
||||||
with open(fout, 'a+', newline='') as csvfile:
|
|
||||||
writer = csv.writer(csvfile)
|
|
||||||
writer.writerow([model1 + '_vs_' + model2] +
|
|
||||||
columns)
|
|
||||||
for row in rows:
|
|
||||||
writer.writerow([row] + [
|
|
||||||
scores[row][column] for column in columns
|
|
||||||
])
|
|
||||||
else:
|
|
||||||
print(subdir_path + ' is not exist! please check!')
|
|
||||||
for fout in fout_list:
|
|
||||||
with open(fout, 'r') as f:
|
|
||||||
x = from_csv(f)
|
|
||||||
print(fout)
|
|
||||||
print(x)
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
# flake8: noqa: E501
|
# flake8: noqa
|
||||||
|
# yapf: disable
|
||||||
import csv
|
import csv
|
||||||
import os
|
import os
|
||||||
import os.path as osp
|
import os.path as osp
|
||||||
@ -8,11 +9,7 @@ from datetime import datetime
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from mmengine import ConfigDict
|
from mmengine import ConfigDict
|
||||||
|
from tabulate import tabulate
|
||||||
try:
|
|
||||||
from prettytable import from_csv
|
|
||||||
except ImportError:
|
|
||||||
from_csv = None
|
|
||||||
|
|
||||||
from opencompass.utils import model_abbr_from_cfg
|
from opencompass.utils import model_abbr_from_cfg
|
||||||
|
|
||||||
@ -20,6 +17,12 @@ from .compass_arena import CompassArenaSummarizer
|
|||||||
from .utils import get_judgeanswer_and_reference, get_outdir
|
from .utils import get_judgeanswer_and_reference, get_outdir
|
||||||
|
|
||||||
|
|
||||||
|
def model_abbr_from_cfg_used_in_summarizer(model):
    """Return the abbreviation under which ``model`` is shown in summaries.

    A truthy ``summarizer_abbr`` entry in the model config takes precedence;
    when it is missing or empty the abbreviation is derived with
    ``model_abbr_from_cfg``.

    Args:
        model: Model config mapping (must support ``.get``).

    Returns:
        The ``summarizer_abbr`` value if set, otherwise the result of
        ``model_abbr_from_cfg(model)``.
    """
    summarizer_abbr = model.get('summarizer_abbr', None)
    if not summarizer_abbr:
        return model_abbr_from_cfg(model)
    return summarizer_abbr
|
||||||
|
|
||||||
def post_process_mtbench_pair(judgement: str):
|
def post_process_mtbench_pair(judgement: str):
|
||||||
"""Input a string like below:
|
"""Input a string like below:
|
||||||
|
|
||||||
@ -52,7 +55,7 @@ def get_capability_results(
|
|||||||
references,
|
references,
|
||||||
fout,
|
fout,
|
||||||
fout_flag,
|
fout_flag,
|
||||||
model,
|
model_abbr,
|
||||||
):
|
):
|
||||||
capability_ratings = defaultdict(int)
|
capability_ratings = defaultdict(int)
|
||||||
capability_counts = defaultdict(int)
|
capability_counts = defaultdict(int)
|
||||||
@ -70,12 +73,12 @@ def get_capability_results(
|
|||||||
capability_avg_ratings[capability] = s
|
capability_avg_ratings[capability] = s
|
||||||
columns = list(capability_avg_ratings.keys())
|
columns = list(capability_avg_ratings.keys())
|
||||||
columns.insert(0, columns.pop(columns.index('total')))
|
columns.insert(0, columns.pop(columns.index('total')))
|
||||||
|
|
||||||
with open(fout, 'a+', newline='') as csvfile:
|
with open(fout, 'a+', newline='') as csvfile:
|
||||||
writer = csv.writer(csvfile)
|
writer = csv.writer(csvfile)
|
||||||
if fout_flag == 0:
|
if fout_flag == 0:
|
||||||
writer.writerow(['model'] + columns)
|
writer.writerow(['model'] + columns)
|
||||||
writer.writerow([model] +
|
writer.writerow([model_abbr] + [capability_avg_ratings[column] for column in columns])
|
||||||
[capability_avg_ratings[column] for column in columns])
|
|
||||||
|
|
||||||
|
|
||||||
class MTBenchSummarizer(CompassArenaSummarizer):
|
class MTBenchSummarizer(CompassArenaSummarizer):
|
||||||
@ -92,13 +95,9 @@ class MTBenchSummarizer(CompassArenaSummarizer):
|
|||||||
self.cfg = config
|
self.cfg = config
|
||||||
if self.judge_type == 'single':
|
if self.judge_type == 'single':
|
||||||
self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
|
self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
|
||||||
self.eval_model_abbrs = [
|
|
||||||
model_abbr_from_cfg(model) for model in self.eval_model_cfgs
|
|
||||||
]
|
|
||||||
elif self.judge_type == 'pair':
|
elif self.judge_type == 'pair':
|
||||||
self.base_models = self.cfg['eval']['partitioner']['base_models']
|
self.base_models = self.cfg['eval']['partitioner']['base_models']
|
||||||
self.compare_models = self.cfg['eval']['partitioner'][
|
self.compare_models = self.cfg['eval']['partitioner']['compare_models']
|
||||||
'compare_models']
|
|
||||||
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
|
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
|
||||||
self.judge_map = {
|
self.judge_map = {
|
||||||
'single': post_process_mtbench_single,
|
'single': post_process_mtbench_single,
|
||||||
@ -106,8 +105,7 @@ class MTBenchSummarizer(CompassArenaSummarizer):
|
|||||||
}
|
}
|
||||||
self.judge_function = self.judge_map[self.judge_type]
|
self.judge_function = self.judge_map[self.judge_type]
|
||||||
|
|
||||||
def summarize(self,
|
def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
|
||||||
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
|
|
||||||
"""Summarize the subjectivity analysis based on evaluation results.
|
"""Summarize the subjectivity analysis based on evaluation results.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -116,33 +114,40 @@ class MTBenchSummarizer(CompassArenaSummarizer):
|
|||||||
Returns:
|
Returns:
|
||||||
pd.DataFrame: The summary results.
|
pd.DataFrame: The summary results.
|
||||||
"""
|
"""
|
||||||
if self.judge_type == 'single':
|
if self.judge_type == 'pair':
|
||||||
|
return super().summarize()
|
||||||
|
|
||||||
|
# self.judge_type == 'single'
|
||||||
dataset_cfgs = self.cfg['datasets']
|
dataset_cfgs = self.cfg['datasets']
|
||||||
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
output_dir, results_folder = get_outdir(self.cfg, time_str)
|
||||||
fout_flag = 0
|
fout_flag = 0
|
||||||
for eval_model_abbr in self.eval_model_abbrs:
|
for eval_model_cfg in self.eval_model_cfgs:
|
||||||
subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr
|
eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
|
||||||
subdir_path = os.path.join(results_folder, subdir)
|
show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
|
||||||
|
subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + self.judge_abbr)
|
||||||
if os.path.isdir(subdir_path):
|
if os.path.isdir(subdir_path):
|
||||||
model, judge_model = eval_model_abbr, self.judge_abbr
|
fout = osp.join(output_dir, 'judged-by--' + self.judge_abbr + '-capability.csv')
|
||||||
fout = osp.join(
|
|
||||||
output_dir,
|
|
||||||
'judged-by--' + judge_model + '-capability.csv')
|
|
||||||
overall_judged_answers, overall_references = [], []
|
overall_judged_answers, overall_references = [], []
|
||||||
for dataset in dataset_cfgs:
|
for dataset in dataset_cfgs:
|
||||||
judged_answers, references = get_judgeanswer_and_reference(
|
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
|
||||||
dataset, subdir_path, self.judge_function)
|
|
||||||
overall_judged_answers += judged_answers
|
overall_judged_answers += judged_answers
|
||||||
overall_references += references
|
overall_references += references
|
||||||
get_capability_results(overall_judged_answers,
|
get_capability_results(overall_judged_answers, overall_references, fout, fout_flag, show_model_abbr)
|
||||||
overall_references, fout, fout_flag,
|
|
||||||
model)
|
|
||||||
fout_flag += 1
|
fout_flag += 1
|
||||||
else:
|
else:
|
||||||
print(subdir_path + ' is not exist! please check!')
|
print(subdir_path + ' is not exist! please check!')
|
||||||
with open(fout, 'r') as f:
|
with open(fout, 'r') as f:
|
||||||
x = from_csv(f)
|
csv_reader = csv.reader(f)
|
||||||
print(x)
|
header = next(csv_reader)
|
||||||
|
table = [line for line in csv_reader]
|
||||||
|
|
||||||
|
new_header = [''] + [line[0] for line in table]
|
||||||
|
new_table = [[h] + line[1:] for h, line in zip(header[1:], table)]
|
||||||
|
new_table = [[h] + [line[i] for line in table] for i, h in enumerate(header[1:], start=1)]
|
||||||
|
t = tabulate(new_table, headers=new_header)
|
||||||
|
with open(fout, 'w') as f:
|
||||||
|
f.write(','.join(new_header) + '\n')
|
||||||
|
for line in new_table:
|
||||||
|
f.write(','.join(map(str, line)) + '\n')
|
||||||
|
print(t)
|
||||||
print(fout)
|
print(fout)
|
||||||
elif self.judge_type == 'pair':
|
|
||||||
super().summarize()
|
|
||||||
|
@ -3,6 +3,7 @@ import copy
|
|||||||
import fnmatch
|
import fnmatch
|
||||||
import math
|
import math
|
||||||
import os.path as osp
|
import os.path as osp
|
||||||
|
import re
|
||||||
import statistics
|
import statistics
|
||||||
import time
|
import time
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
@ -38,12 +39,12 @@ def extract_role_pred(s: str, begin_str: Optional[str],
|
|||||||
start = 0
|
start = 0
|
||||||
end = len(s)
|
end = len(s)
|
||||||
|
|
||||||
if begin_str:
|
if begin_str and re.match(r'\s*', begin_str) is None:
|
||||||
begin_idx = s.find(begin_str)
|
begin_idx = s.find(begin_str)
|
||||||
if begin_idx != -1:
|
if begin_idx != -1:
|
||||||
start = begin_idx + len(begin_str)
|
start = begin_idx + len(begin_str)
|
||||||
|
|
||||||
if end_str:
|
if end_str and re.match(r'\s*', end_str) is None:
|
||||||
# TODO: Support calling tokenizer for the accurate eos token
|
# TODO: Support calling tokenizer for the accurate eos token
|
||||||
# and avoid such hardcode
|
# and avoid such hardcode
|
||||||
end_idx = s.find(end_str, start)
|
end_idx = s.find(end_str, start)
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
# flake8: noqa: E501
|
# flake8: noqa: E501
|
||||||
import copy
|
import copy
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
import os.path as osp
|
import os.path as osp
|
||||||
|
|
||||||
import mmengine
|
import mmengine
|
||||||
@ -123,6 +124,10 @@ class AlpacaEvalTask(BaseTask):
|
|||||||
command = ''
|
command = ''
|
||||||
if api_key is not None:
|
if api_key is not None:
|
||||||
command += f'export OPENAI_API_KEY={api_key}; '
|
command += f'export OPENAI_API_KEY={api_key}; '
|
||||||
|
else:
|
||||||
|
api_key = os.environ.get('OPENAI_API_KEY', '').split(',')[0]
|
||||||
|
if api_key:
|
||||||
|
command += f'export OPENAI_API_KEY={api_key}; '
|
||||||
command += f'alpaca_eval --model_outputs {filename} --annotators_config {alpaca_cfg} --output_path {output_path}'
|
command += f'alpaca_eval --model_outputs {filename} --annotators_config {alpaca_cfg} --output_path {output_path}'
|
||||||
return template.format(task_cmd=command)
|
return template.format(task_cmd=command)
|
||||||
|
|
||||||
|
@ -5,6 +5,7 @@ import tabulate
|
|||||||
from mmengine.config import Config
|
from mmengine.config import Config
|
||||||
|
|
||||||
from opencompass.datasets.custom import make_custom_dataset_config
|
from opencompass.datasets.custom import make_custom_dataset_config
|
||||||
|
from opencompass.models import VLLM, HuggingFaceCausalLM, TurboMindModel
|
||||||
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
from opencompass.partitioners import NaivePartitioner, SizePartitioner
|
||||||
from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
|
from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
|
||||||
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
|
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
|
||||||
@ -72,6 +73,10 @@ def get_config_from_arg(args) -> Config:
|
|||||||
if args.config:
|
if args.config:
|
||||||
config = Config.fromfile(args.config, format_python_code=False)
|
config = Config.fromfile(args.config, format_python_code=False)
|
||||||
config = try_fill_in_custom_cfgs(config)
|
config = try_fill_in_custom_cfgs(config)
|
||||||
|
# set infer accelerator if needed
|
||||||
|
if args.accelerator in ['vllm', 'lmdeploy']:
|
||||||
|
config['models'] = change_accelerator(config['models'],
|
||||||
|
args.accelerator)
|
||||||
return config
|
return config
|
||||||
# parse dataset args
|
# parse dataset args
|
||||||
if not args.datasets and not args.custom_dataset_path:
|
if not args.datasets and not args.custom_dataset_path:
|
||||||
@ -137,6 +142,9 @@ def get_config_from_arg(args) -> Config:
|
|||||||
pad_token_id=args.pad_token_id,
|
pad_token_id=args.pad_token_id,
|
||||||
run_cfg=dict(num_gpus=args.num_gpus))
|
run_cfg=dict(num_gpus=args.num_gpus))
|
||||||
models.append(model)
|
models.append(model)
|
||||||
|
# set infer accelerator if needed
|
||||||
|
if args.accelerator in ['vllm', 'lmdeploy']:
|
||||||
|
models = change_accelerator(models, args.accelerator)
|
||||||
# parse summarizer args
|
# parse summarizer args
|
||||||
summarizer_arg = args.summarizer if args.summarizer is not None \
|
summarizer_arg = args.summarizer if args.summarizer is not None \
|
||||||
else 'example'
|
else 'example'
|
||||||
@ -164,6 +172,93 @@ def get_config_from_arg(args) -> Config:
|
|||||||
format_python_code=False)
|
format_python_code=False)
|
||||||
|
|
||||||
|
|
||||||
|
def _value_or(mapping, key, default):
    """Return ``mapping[key]``, or ``default`` if it is missing or ``None``."""
    value = mapping.get(key)
    return default if value is None else value


def change_accelerator(models, accelerator):
    """Rewrite HuggingFace model configs to run on an inference accelerator.

    Every config whose ``type`` is ``HuggingFaceCausalLM`` is replaced by an
    equivalent ``TurboMindModel`` (``accelerator == 'lmdeploy'``) or ``VLLM``
    (``accelerator == 'vllm'``) config. Configs of any other type are passed
    through untouched. The input list and its configs are not modified.

    Args:
        models: List of model config mappings. HuggingFace entries must
            provide ``abbr``, ``path``, ``max_seq_len``, ``max_out_len``,
            ``batch_size`` and ``run_cfg['num_gpus']``.
        accelerator: ``'lmdeploy'`` or ``'vllm'``.

    Returns:
        A new list of model configs, in the same order as ``models``.

    Raises:
        ValueError: If a HuggingFace model is encountered and ``accelerator``
            is not one of the supported values.
    """
    model_accels = []
    for model in models:
        get_logger().info(f'Transforming {model["abbr"]} to {accelerator}')

        # Only HuggingFace causal LMs are converted to VLLM / TurboMindModel.
        if model['type'] is not HuggingFaceCausalLM:
            model_accels.append(model)
            continue

        if model.get('generation_kwargs') is not None:
            # Work on a copy so the caller's config is left intact.
            generation_kwargs = model['generation_kwargs'].copy()
            # The accelerators call this ``stop_token_ids``; a missing
            # ``eos_token_id`` is tolerated instead of raising KeyError.
            eos_token_id = generation_kwargs.pop('eos_token_id', None)
            generation_kwargs['stop_token_ids'] = eos_token_id
            gen_args = dict(
                temperature=_value_or(generation_kwargs, 'temperature',
                                      0.001),
                top_k=_value_or(generation_kwargs, 'top_k', 1),
                top_p=_value_or(generation_kwargs, 'top_p', 0.9),
                stop_token_ids=eos_token_id,
            )
        else:
            # if generation_kwargs is not provided, set default values
            generation_kwargs = dict()
            gen_args = dict(temperature=0.0,
                            top_k=1,
                            top_p=0.9,
                            stop_token_ids=None)

        abbr = model['abbr']
        if accelerator == 'lmdeploy':
            accel_model = dict(
                type=  # noqa E251
                f'{TurboMindModel.__module__}.{TurboMindModel.__name__}',
                abbr=abbr.replace('hf', 'lmdeploy')
                if '-hf' in abbr else abbr + '-lmdeploy',
                path=model['path'],
                engine_config=dict(session_len=model['max_seq_len'],
                                   max_batch_size=model['batch_size'],
                                   tp=model['run_cfg']['num_gpus']),
                gen_config=dict(top_k=gen_args['top_k'],
                                temperature=gen_args['temperature'],
                                top_p=gen_args['top_p'],
                                max_new_tokens=model['max_out_len'],
                                stop_words=gen_args['stop_token_ids']),
                max_out_len=model['max_out_len'],
                max_seq_len=model['max_seq_len'],
                batch_size=model['batch_size'],
                concurrency=model['batch_size'],
                run_cfg=model['run_cfg'],
            )
            carried_keys = ('meta_template', )
        elif accelerator == 'vllm':
            # vLLM reads the sampling temperature from generation_kwargs.
            generation_kwargs['temperature'] = gen_args['temperature']
            accel_model = dict(
                type=f'{VLLM.__module__}.{VLLM.__name__}',
                abbr=abbr.replace('hf', 'vllm')
                if '-hf' in abbr else abbr + '-vllm',
                path=model['path'],
                model_kwargs=dict(
                    tensor_parallel_size=model['run_cfg']['num_gpus']),
                max_out_len=model['max_out_len'],
                max_seq_len=model['max_seq_len'],
                batch_size=model['batch_size'],
                generation_kwargs=generation_kwargs,
                run_cfg=model['run_cfg'],
            )
            carried_keys = ('meta_template', 'end_str')
        else:
            raise ValueError(f'Unsupported accelerator {accelerator}')

        # Carry optional settings over from the *source* config. The new
        # config lives in its own variable so these lookups still see the
        # HuggingFace config rather than the freshly built dict.
        for key in carried_keys:
            if model.get(key) is not None:
                accel_model[key] = model[key]

        model_accels.append(accel_model)
    return model_accels
|
||||||
|
|
||||||
|
|
||||||
def exec_mm_infer_runner(tasks, args, cfg):
|
def exec_mm_infer_runner(tasks, args, cfg):
|
||||||
"""execute multimodal infer runner according to args."""
|
"""execute multimodal infer runner according to args."""
|
||||||
if args.slurm:
|
if args.slurm:
|
||||||
|
Loading…
Reference in New Issue
Block a user