[Sync] deprecate old mbpps (#1064)

Fengzhe Zhou 2024-04-19 20:49:46 +08:00 committed by GitHub
parent c172401323
commit 8c85edd1cd
95 changed files with 1507 additions and 409 deletions

View File

@@ -15,6 +15,6 @@ with read_base():
    from ..math.math_evaluatorv2_gen_9d2049 import math_datasets
    from ..TheoremQA.TheoremQA_post_v2_gen_2c2583 import TheoremQA_datasets
    from ..humaneval.humaneval_gen_d2537e import humaneval_datasets
-   from ..mbpp.sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets
+   from ..mbpp.deprecated_sanitized_mbpp_gen_cb43ef import sanitized_mbpp_datasets

datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
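Aside: the datasets = sum(..., []) line above is the collection idiom all of these configs rely on. Every *_datasets name pulled in under read_base() is a plain Python list, and sum() with an empty-list start value concatenates them into one flat list. A minimal, self-contained sketch of the same pattern (the a_datasets/b_datasets names are illustrative only):

# Two dataset lists, as the read_base() imports would leave behind.
a_datasets = [{'abbr': 'a'}]
b_datasets = [{'abbr': 'b1'}, {'abbr': 'b2'}]

# sum() with [] as its start value concatenates every *_datasets list in scope.
datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
print([d['abbr'] for d in datasets])  # -> ['a', 'b1', 'b2']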

View File

@@ -7,7 +7,7 @@ with read_base():
    from ..GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
    from ..bbh.bbh_gen_5b92b0 import bbh_datasets
    from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
-   from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
+   from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
    from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
    from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
    from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets

View File

@@ -7,7 +7,7 @@ with read_base():
    from ..GaokaoBench.GaokaoBench_mixed_f2038e import GaokaoBench_datasets
    from ..bbh.bbh_gen_5b92b0 import bbh_datasets
    from ..humaneval.humaneval_gen_a82cae import humaneval_datasets
-   from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
+   from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
    from ..CLUE_C3.CLUE_C3_ppl_e24a31 import C3_datasets
    from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
    from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets

View File

@@ -11,7 +11,7 @@ with read_base():
    from ..FewCLUE_cluewsc.FewCLUE_cluewsc_ppl_868415 import cluewsc_datasets
    from ..FewCLUE_eprstmt.FewCLUE_eprstmt_ppl_f1e631 import eprstmt_datasets
    from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
-   from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
+   from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
    from ..lambada.lambada_gen_217e11 import lambada_datasets
    from ..storycloze.storycloze_ppl_496661 import storycloze_datasets
    from ..SuperGLUE_AX_b.SuperGLUE_AX_b_ppl_6db806 import AX_b_datasets

View File

@@ -15,6 +15,6 @@ with read_base():
    from ..math.math_evaluatorv2_gen_cecb31 import math_datasets
    from ..TheoremQA.TheoremQA_post_v2_gen_ef26ca import TheoremQA_datasets
    from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
-   from ..mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
+   from ..mbpp.deprecated_sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets

datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])

View File

@@ -7,7 +7,7 @@ with read_base():
    from ..GaokaoBench.GaokaoBench_gen_5cfe9e import GaokaoBench_datasets
    from ..bbh.bbh_gen_5b92b0 import bbh_datasets
    from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
-   from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
+   from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
    from ..CLUE_C3.CLUE_C3_gen_8c358f import C3_datasets
    from ..CLUE_CMRC.CLUE_CMRC_gen_1bd3c8 import CMRC_datasets
    from ..CLUE_DRCD.CLUE_DRCD_gen_1bd3c8 import DRCD_datasets

View File

@@ -12,7 +12,7 @@ with read_base():
    from ..FewCLUE_cluewsc.FewCLUE_cluewsc_gen_c68933 import cluewsc_datasets
    from ..FewCLUE_eprstmt.FewCLUE_eprstmt_gen_740ea0 import eprstmt_datasets
    from ..humaneval.humaneval_gen_8e312c import humaneval_datasets
-   from ..mbpp.mbpp_gen_1e1056 import mbpp_datasets
+   from ..mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
    from ..lambada.lambada_gen_217e11 import lambada_datasets
    from ..storycloze.storycloze_gen_7f656a import storycloze_datasets
    from ..SuperGLUE_AX_b.SuperGLUE_AX_b_gen_4dfefa import AX_b_datasets

View File

@@ -44,7 +44,7 @@ with read_base():
    from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from ...drop.drop_gen_8a9ed9 import drop_datasets
    from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
-   from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
+   from ...mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
    from ...bbh.bbh_gen_5bf00b import bbh_datasets

View File

@@ -44,7 +44,7 @@ with read_base():
    from ...gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from ...drop.drop_gen_8a9ed9 import drop_datasets
    from ...humaneval.humaneval_gen_a82cae import humaneval_datasets
-   from ...mbpp.mbpp_gen_1e1056 import mbpp_datasets
+   from ...mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
    from ...bbh.bbh_gen_5b92b0 import bbh_datasets

View File

@@ -0,0 +1,46 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GPQADataset, GPQAEvaluator
from opencompass.utils import first_option_postprocess
gpqa_reader_cfg = dict(
input_columns=['question', 'A', 'B', 'C', 'D'],
output_column='answer')
gpqa_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='What is the correct answer to this question: {question}\nChoices:\n'
'(A){A}\n'
'(B){B}\n'
'(C){C}\n'
'(D){D}\n'
'Format your response as follows: "The correct answer is (insert answer here)"'),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
gpqa_eval_cfg = dict(evaluator=dict(type=GPQAEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD'))
gpqa_datasets = []
gpqa_subsets = {
'extended': 'gpqa_extended.csv',
'main': 'gpqa_main.csv',
'diamond': 'gpqa_diamond.csv'
}
for split in list(gpqa_subsets.keys()):
gpqa_datasets.append(
dict(
abbr='GPQA_' + split,
type=GPQADataset,
path='./data/gpqa/',
name=gpqa_subsets[split],
reader_cfg=gpqa_reader_cfg,
infer_cfg=gpqa_infer_cfg,
eval_cfg=gpqa_eval_cfg)
)
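Aside: the eval config above routes predictions through first_option_postprocess with options='ABCD', which reduces a free-form response to the first option letter it can find so the evaluator can match it against the answer column. A rough, self-contained sketch of that behavior (a simplification; the real helper in opencompass.utils handles many more answer phrasings):

import re

def first_option(text, options='ABCD'):
    # Grab the first standalone option letter, with or without parentheses.
    match = re.search(r'\(?\b([{}])\b\)?'.format(options), text)
    return match.group(1) if match else ''

print(first_option('The correct answer is (B)'))  # -> 'B'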

View File

@@ -1,4 +1,4 @@
from mmengine.config import read_base

with read_base():
-   from .mbpp_gen_1e1056 import mbpp_datasets  # noqa: F401, F403
+   from .mbpp_gen_830460 import mbpp_datasets  # noqa: F401, F403

View File

@@ -1,46 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPDataset, MBPPEvaluator2
mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_list_2")
# This prompt is used for the WizardLMCode series.
# You can use other config files for basic 3-shot generation.
mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt="""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Create a Python script for this problem:
{text}
Test examples:
{test_list}
### Response:""",
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator2), pred_role="BOT")
mbpp_datasets = [
dict(
type=MBPPDataset,
abbr="mbpp",
path="./data/mbpp/mbpp.jsonl",
reader_cfg=mbpp_reader_cfg,
infer_cfg=mbpp_infer_cfg,
eval_cfg=mbpp_eval_cfg,
)
]

View File

@@ -25,7 +25,7 @@ mbpp_infer_cfg = dict(
        ),
    ),
    retriever=dict(type=ZeroRetriever),
-   inferencer=dict(type=GenInferencer),
+   inferencer=dict(type=GenInferencer, max_out_len=512),
)

mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT")

View File

@@ -0,0 +1,42 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column")
mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"),
dict(role="BOT", prompt="[BEGIN]\n"),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT")
mbpp_datasets = [
dict(
type=MBPPDataset_V2,
abbr="mbpp_passk",
path="./data/mbpp/mbpp.jsonl",
reader_cfg=mbpp_reader_cfg,
infer_cfg=mbpp_infer_cfg,
eval_cfg=mbpp_eval_cfg,
)
]
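Aside: the few-shot rounds above teach the model to answer in the form [BEGIN]\n '<code>' \n[DONE], so scoring depends on stripping those markers before the tests are executed. A hedged sketch of that extraction step (a simplification of what the MBPP evaluators' post-processing has to do; the regex here is illustrative only):

import re

def extract_code(completion):
    # Pull the code between [BEGIN] and [DONE] (or end of string),
    # dropping the surrounding quotes used by the few-shot examples.
    match = re.search(r"\[BEGIN\]\s*'?(.*?)'?\s*(?:\[DONE\]|$)", completion, re.S)
    return match.group(1) if match else completion

sample = "[BEGIN]\n 'def add(a, b):\n    return a + b' \n[DONE]"
print(extract_code(sample))  # prints the bare function source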

View File

@@ -0,0 +1,45 @@
# This config is used for pass@k evaluation with dataset repetition,
# for models that cannot generate multiple responses for a single input.
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column")
mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"),
dict(role="BOT", prompt="[BEGIN]\n"),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT")
mbpp_datasets = [
dict(
type=MBPPDataset_V2,
abbr="mbpp_repeat10",
path="./data/mbpp/mbpp.jsonl",
num_repeats=10,
reader_cfg=mbpp_reader_cfg,
infer_cfg=mbpp_infer_cfg,
eval_cfg=mbpp_eval_cfg,
)
]
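Aside: num_repeats=10 exists because pass@k needs several samples per problem; when the model (or serving stack) returns only one completion per request, the dataset itself is repeated ten times and MBPPPassKEvaluator aggregates the per-problem results. For reference, the standard unbiased pass@k estimator those n samples feed into (from Chen et al., 2021, the HumanEval paper) is:

from math import comb

def pass_at_k(n, c, k):
    # Unbiased estimate of P(at least one of k sampled solutions passes),
    # given n total samples per problem of which c passed.
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

print(pass_at_k(10, 3, 5))  # -> ~0.917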

View File

@@ -0,0 +1,42 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator
sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_list_2")
sanitized_mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n",),
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n ",),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n",),
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n ",),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n",),
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n ",),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n",),
dict(role="BOT", prompt="[BEGIN]\n"),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT")
sanitized_mbpp_datasets = [
dict(
type=SanitizedMBPPDataset,
abbr="sanitized_mbpp",
path="./data/mbpp/sanitized-mbpp.jsonl",
reader_cfg=sanitized_mbpp_reader_cfg,
infer_cfg=sanitized_mbpp_infer_cfg,
eval_cfg=sanitized_mbpp_eval_cfg,
)
]

View File

@@ -0,0 +1,42 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator
sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column")
sanitized_mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"),
dict(role="BOT", prompt="[BEGIN]\n"),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT")
sanitized_mbpp_datasets = [
dict(
type=SanitizedMBPPDataset,
abbr="sanitized_mbpp_passk",
path="./data/mbpp/sanitized-mbpp.jsonl",
reader_cfg=sanitized_mbpp_reader_cfg,
infer_cfg=sanitized_mbpp_infer_cfg,
eval_cfg=sanitized_mbpp_eval_cfg,
)
]

View File

@@ -0,0 +1,43 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator
sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column")
sanitized_mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"),
dict(role="BOT", prompt="[BEGIN]\n"),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT")
sanitized_mbpp_datasets = [
dict(
type=SanitizedMBPPDataset,
abbr="sanitized_mbpp_repeat10",
path="./data/mbpp/sanitized-mbpp.jsonl",
num_repeats=10,
reader_cfg=sanitized_mbpp_reader_cfg,
infer_cfg=sanitized_mbpp_infer_cfg,
eval_cfg=sanitized_mbpp_eval_cfg,
)
]

View File

@@ -1,4 +1,4 @@
from mmengine.config import read_base

with read_base():
-   from .mbpp_cn_gen_1d1481 import mbpp_cn_datasets  # noqa: F401, F403
+   from .mbpp_cn_gen_9114d5 import mbpp_cn_datasets  # noqa: F401, F403

View File

@@ -0,0 +1,64 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPDataset, MBPPEvaluator
mbpp_reader_cfg = dict(
input_columns=['text', 'test_list'], output_column='test_list_2')
mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt=
"你是一名专业的 Python 程序员,你的任务是:编写一个函数,从给定的两个元组列表中查找相似的元素。 你的代码应该通过这些测试:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"
),
dict(
role="BOT",
prompt=
"[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "
),
dict(
role="HUMAN",
prompt=
"你是一名专业的 Python 程序员,你的任务是:编写一个 Python 函数来识别一个整数是否不是素数。 你的代码应该通过这些测试:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"
),
dict(
role="BOT",
prompt=
"[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "
),
dict(
role="HUMAN",
prompt=
"你是一名专业的 Python 程序员,你的任务是:编写一个函数,使用堆队列算法从给定的数字列表中查找最大整数。 你的代码应该通过这些测试:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"
),
dict(
role="BOT",
prompt=
"[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "
),
dict(
role="HUMAN",
prompt=
"你是一名专业的 Python 程序员,你的任务是: {text} 你的代码应该通过这些测试:\n\n {test_list} \n"
),
dict(role="BOT", prompt="[BEGIN]\n"),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT")
mbpp_cn_datasets = [
dict(
type=MBPPDataset,
abbr='mbpp_cn',
path='./data/mbpp_cn/mbpp_cn.jsonl',
reader_cfg=mbpp_reader_cfg,
infer_cfg=mbpp_infer_cfg,
eval_cfg=mbpp_eval_cfg)
]

View File

@@ -1,4 +1,4 @@
from mmengine.config import read_base

with read_base():
-   from .mbpp_plus_gen_94815c import mbpp_plus_datasets  # noqa: F401, F403
+   from .mbpp_plus_gen_0b836a import mbpp_plus_datasets  # noqa: F401, F403

View File

@@ -0,0 +1,64 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPEvaluator, MBPPPlusDataset
mbpp_plus_reader_cfg = dict(
input_columns=['text', 'test_list'], output_column='task_id')
mbpp_plus_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt=
"You are an expert Python programmer, and here is your task: Write a function to find the shared elements from the given two lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"
),
dict(
role="BOT",
prompt=
"[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\n return tuple(set(test_tup1) & set(test_tup2))' \n[DONE] \n\n "
),
dict(
role="HUMAN",
prompt=
"You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"
),
dict(
role="BOT",
prompt=
"[BEGIN]\n 'import math\ndef is_not_prime(n):\n if n == 1:\n return True\n for i in range(2, int(math.sqrt(n))+1):\n if n % i == 0:\n return True\n return False' \n[DONE] \n\n "
),
dict(
role="HUMAN",
prompt=
"You are an expert Python programmer, and here is your task: Write a function to find the n largest integers from a given list of numbers, returned in descending order. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"
),
dict(
role="BOT",
prompt=
"[BEGIN]\n 'import heapq as hq\ndef heap_queue_largest(nums: list,n: int) -> list:\n largest_nums = hq.nlargest(n, nums)\n return largest_nums' \n[DONE] \n\n "
),
dict(
role="HUMAN",
prompt=
"You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"
),
dict(role="BOT", prompt="[BEGIN]\n"),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
mbpp_plus_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator, metric='MBPPPlus'), pred_role="BOT")
mbpp_plus_datasets = [
dict(
type=MBPPPlusDataset,
abbr='mbpp_plus',
path='./data/mbpp_plus/mbpp_plus.jsonl',
reader_cfg=mbpp_plus_reader_cfg,
infer_cfg=mbpp_plus_infer_cfg,
eval_cfg=mbpp_plus_eval_cfg)
]

View File

@@ -0,0 +1,18 @@
from mmengine.config import read_base
with read_base():
from .needlebench_multi_reasoning_256k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from .needlebench_multi_reasoning_256k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from .needlebench_multi_reasoning_256k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from .needlebench_multi_reasoning_256k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from .needlebench_multi_reasoning_256k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from .needlebench_multi_reasoning_256k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from .needlebench_multi_reasoning_256k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from .needlebench_multi_reasoning_256k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from .needlebench_single_256k import needlebench_en_datasets as needlebench_origin_en_datasets
from .needlebench_single_256k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from .needlebench_multi_retrieval_256k import needlebench_en_datasets as needlebench_parallel_en_datasets
from .needlebench_multi_retrieval_256k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])

View File

@@ -0,0 +1,287 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset
from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError("num must be at least 1.")
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
needlebench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
needlebench_eval_cfg = dict(
evaluator=dict(type=NeedleBenchMultiEvaluator),
pred_postprocessor=dict(type=needlebench_postprocess),
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
pred_role='BOT')
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
context_lengths = [32000, 128000, 256000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
# ----------English Version----------
base_path = './data/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'multi_needle_reasoning_en.json'
diff = 10
num_needles = 2
needlebench_2needle_en_datasets = []
language = 'English'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_2needle_en_datasets.append(dataset_dict)
num_needles = 3
needlebench_3needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_3needle_en_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_4needle_en_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_5needle_en_datasets.append(dataset_dict)
# ----------Chinese Version----------
base_path = './data/needlebench'
file_list = ['zh_finance.jsonl']
needle_file_name = 'multi_needle_reasoning_zh.json'
diff = 10
num_needles = 2
needlebench_2needle_zh_datasets = []
language = 'Chinese'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_2needle_zh_datasets.append(dataset_dict)
num_needles = 3
needlebench_3needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_3needle_zh_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_4needle_zh_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_5needle_zh_datasets.append(dataset_dict)
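Aside: the logistic/generate_linear_space helpers defined at the top of this file support a sigmoid depth schedule, which bunches needle-insertion depths toward the middle of the context rather than spacing them evenly (this config ends up using an explicit depths_list, but the helper's effect is easy to see):

import math

def logistic(x, L=100, x0=50, k=0.1):
    return round(L / (1 + math.exp(-k * (x - x0))), 3)

# A 5-point linear grid over 0..100, pushed through the logistic curve:
print([logistic(x) for x in [0, 25, 50, 75, 100]])
# -> [0.669, 7.586, 50.0, 92.414, 99.331]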

View File

@@ -0,0 +1,109 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError("num must be at least 1.")
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
needlebench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
needlebench_eval_cfg = dict(
evaluator=dict(type=NeedleBenchParallelEvaluator),
pred_postprocessor=dict(type=needlebench_postprocess),
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
pred_role='BOT')
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
context_lengths = [32000, 128000, 256000]
document_depth_percent_intervals = 20
document_depth_percent_interval_type = "linear"
base_path = './data/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'_parallel_en_256k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 3000,
'guide': True,
'language': 'English',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_en_datasets.append(dataset_dict)
file_list = ['zh_finance.jsonl']
needlebench_zh_datasets = []
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'_parallel_zh_256k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_zh_datasets.append(dataset_dict)

View File

@@ -0,0 +1,110 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset
from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError("num must be at least 1.")
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
needlebench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
needlebench_eval_cfg = dict(
evaluator=dict(type=NeedleBenchOriginEvaluator),
pred_postprocessor=dict(type=needlebench_postprocess),
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
pred_role='BOT')
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
context_lengths = [32000, 128000, 256000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
base_path = './data/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_en_256k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': 'English',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_en_datasets.append(dataset_dict)
file_list = ['zh_finance.jsonl']
needlebench_zh_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_zh_256k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_zh_datasets.append(dataset_dict)

View File

@@ -9,8 +9,8 @@ from opencompass.tasks import OpenICLInferTask
with read_base():
    from .datasets.humaneval.humaneval_passk_gen_8e312c import humaneval_datasets
-   from .datasets.mbpp.mbpp_passk_gen_1e1056 import mbpp_datasets
-   from .datasets.mbpp.sanitized_mbpp_passk_gen_1e1056 import sanitized_mbpp_datasets
+   from .datasets.mbpp.deprecated_mbpp_passk_gen_1e1056 import mbpp_datasets
+   from .datasets.mbpp.deprecated_sanitized_mbpp_passk_gen_1e1056 import sanitized_mbpp_datasets

datasets = []
datasets += humaneval_datasets

View File

@@ -9,8 +9,8 @@ from opencompass.tasks import OpenICLInferTask
with read_base():
    from .datasets.humaneval.humaneval_repeat10_gen_8e312c import humaneval_datasets
-   from .datasets.mbpp.mbpp_repeat10_gen_1e1056 import mbpp_datasets
-   from .datasets.mbpp.sanitized_mbpp_repeat10_gen_1e1056 import sanitized_mbpp_datasets
+   from .datasets.mbpp.deprecated_mbpp_repeat10_gen_1e1056 import mbpp_datasets
+   from .datasets.mbpp.deprecated_sanitized_mbpp_repeat10_gen_1e1056 import sanitized_mbpp_datasets

datasets = []
datasets += humaneval_datasets

View File

@@ -8,7 +8,7 @@ with read_base():
    from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from .datasets.math.math_evaluatorv2_gen_cecb31 import math_datasets
    from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
-   from .datasets.mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
+   from .datasets.mbpp.deprecated_sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
    from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
    from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model

View File

@@ -7,7 +7,7 @@ with read_base():
    from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from .datasets.math.math_gen_265cce import math_datasets
    from .datasets.humaneval.humaneval_gen_a82cae import humaneval_datasets
-   from .datasets.mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
+   from .datasets.mbpp.deprecated_sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
    from .models.hf_internlm.hf_internlm2_7b import models as hf_internlm2_7b_model
    from .models.hf_internlm.hf_internlm2_20b import models as hf_internlm2_20b_model

View File

@@ -1,6 +1,7 @@
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
+   begin='<begin▁of▁sentence>',
    round=[
        dict(role="HUMAN", begin='User: ', end='\n\n'),
        dict(role="BOT", begin="Assistant: ", end='<end▁of▁sentence>', generate=True),
@@ -12,7 +13,6 @@ models = [
        type=HuggingFaceCausalLM,
        abbr='deepseek-67b-chat-hf',
        path="deepseek-ai/deepseek-llm-67b-chat",
-       tokenizer_path='deepseek-ai/deepseek-llm-67b-chat',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
@@ -28,6 +28,6 @@ models = [
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=4, num_procs=1),
-       end_str='<end▁of▁sentence>',
+       batch_padding=True,
    )
]
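Aside: the change above adds a template-level begin marker. begin='<begin▁of▁sentence>' is emitted once at the start of the prompt, each round entry then wraps its message in that role's begin/end markers, and the generate=True turn is left open for the model to continue. A much-simplified, self-contained sketch of how such a meta template is rendered (the real logic lives in OpenCompass's template parser; this is only an approximation):

_meta_template = dict(
    begin='<begin▁of▁sentence>',
    round=[
        dict(role='HUMAN', begin='User: ', end='\n\n'),
        dict(role='BOT', begin='Assistant: ', end='<end▁of▁sentence>', generate=True),
    ],
)

def render(template, user_msg):
    out = template.get('begin', '')
    human, bot = template['round']
    out += human['begin'] + user_msg + human['end']
    out += bot['begin']  # generation continues from here; 'end' serves as the stop marker
    return out

print(render(_meta_template, 'Write a haiku about diffs.'))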

View File

@@ -1,6 +1,7 @@
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
+   begin='<begin▁of▁sentence>',
    round=[
        dict(role="HUMAN", begin='User: ', end='\n\n'),
        dict(role="BOT", begin="Assistant: ", end='<end▁of▁sentence>', generate=True),
@@ -12,7 +13,6 @@ models = [
        type=HuggingFaceCausalLM,
        abbr='deepseek-7b-chat-hf',
        path="deepseek-ai/deepseek-llm-7b-chat",
-       tokenizer_path='deepseek-ai/deepseek-llm-7b-chat',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
@@ -28,5 +28,6 @@ models = [
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
+       batch_padding=True,
    )
]

View File

@@ -1,6 +1,7 @@
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
+   begin='<begin▁of▁sentence>',
    round=[
        dict(role="HUMAN", begin='User: ', end='\n\n'),
        dict(role="BOT", begin="Assistant: ", end='<end▁of▁sentence>', generate=True),
@@ -12,7 +13,6 @@ models = [
        type=HuggingFaceCausalLM,
        abbr='deepseek-moe-16b-chat-hf',
        path="deepseek-ai/deepseek-moe-16b-chat",
-       tokenizer_path='deepseek-ai/deepseek-moe-16b-chat',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
@@ -26,7 +26,7 @@ models = [
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
-       run_cfg=dict(num_gpus=2, num_procs=1),
-       end_str='<end▁of▁sentence>',
+       run_cfg=dict(num_gpus=1, num_procs=1),
+       batch_padding=True,
    )
]
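Aside: batch_padding=True, added across these model configs, lets the HuggingFace runner tokenize and score a whole batch of prompts in one forward pass instead of one at a time, which requires padding shorter prompts to a common length. A hedged illustration with the transformers tokenizer API (gpt2 is only a stand-in model here):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('gpt2')
tok.pad_token = tok.eos_token  # gpt2 ships without a pad token
batch = tok(['short prompt', 'a much longer prompt than the first one'],
            padding=True, return_tensors='pt')
print(batch['input_ids'].shape)  # both rows padded to the longer prompt's length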

View File

@@ -5,7 +5,6 @@ _meta_template = dict(
        dict(role="HUMAN", begin='<start_of_turn>user\n', end='<end_of_turn>\n'),
        dict(role="BOT", begin="<start_of_turn>model\n", end='<end_of_turn>\n', generate=True),
    ],
-   eos_token_id=151645,
)

models = [
@@ -24,9 +23,11 @@ models = [
            use_fast=False,
        ),
        meta_template=_meta_template,
+       min_out_len=1,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
+       batch_padding=True,
    )
]

View File

@@ -5,7 +5,6 @@ _meta_template = dict(
        dict(role="HUMAN", begin='<start_of_turn>user\n', end='<end_of_turn>\n'),
        dict(role="BOT", begin="<start_of_turn>model\n", end='<end_of_turn>\n', generate=True),
    ],
-   eos_token_id=151645,
)

models = [
@@ -29,5 +28,6 @@ models = [
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=1, num_procs=1),
+       batch_padding=True,
    )
]

View File

@@ -6,7 +6,6 @@ _meta_template = dict(
        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
    ],
-   eos_token_id=92542
)

models = [
@@ -32,5 +31,6 @@ models = [
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<|im_end|>',
        generation_kwargs = {"eos_token_id": [2, 92542]},
+       batch_padding=True,
    )
]

View File

@@ -6,7 +6,6 @@ _meta_template = dict(
        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
    ],
-   eos_token_id=92542
)

models = [
@@ -32,5 +31,6 @@ models = [
        run_cfg=dict(num_gpus=1, num_procs=1),
        end_str='<|im_end|>',
        generation_kwargs = {"eos_token_id": [2, 92542]},
+       batch_padding=True,
    )
]

View File

@@ -6,7 +6,6 @@ _meta_template = dict(
        dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
        dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
    ],
-   eos_token_id=92542
)

models = [
@@ -32,5 +31,6 @@ models = [
        run_cfg=dict(num_gpus=2, num_procs=1),
        end_str='<|im_end|>',
        generation_kwargs = {"eos_token_id": [2, 92542]},
+       batch_padding=True,
    )
]

View File

@ -6,7 +6,6 @@ _meta_template = dict(
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
], ],
eos_token_id=92542
) )
models = [ models = [
@ -32,5 +31,6 @@ models = [
run_cfg=dict(num_gpus=2, num_procs=1), run_cfg=dict(num_gpus=2, num_procs=1),
end_str='<|im_end|>', end_str='<|im_end|>',
generation_kwargs = {"eos_token_id": [2, 92542]}, generation_kwargs = {"eos_token_id": [2, 92542]},
batch_padding=True,
) )
] ]

View File

@ -7,7 +7,6 @@ _meta_template = dict(
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
], ],
eos_token_id=92542
) )
models = [ models = [
@ -33,5 +32,6 @@ models = [
run_cfg=dict(num_gpus=2, num_procs=1), run_cfg=dict(num_gpus=2, num_procs=1),
end_str='<|im_end|>', end_str='<|im_end|>',
generation_kwargs = {"eos_token_id": [2, 92542]}, generation_kwargs = {"eos_token_id": [2, 92542]},
batch_padding=True,
) )
] ]

View File

@ -6,7 +6,6 @@ _meta_template = dict(
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
], ],
eos_token_id=92542
) )
models = [ models = [
@ -32,5 +31,6 @@ models = [
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>', end_str='<|im_end|>',
generation_kwargs = {"eos_token_id": [2, 92542]}, generation_kwargs = {"eos_token_id": [2, 92542]},
batch_padding=True,
) )
] ]

View File

@ -6,7 +6,6 @@ _meta_template = dict(
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
], ],
eos_token_id=92542
) )
models = [ models = [
@ -32,5 +31,6 @@ models = [
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>', end_str='<|im_end|>',
generation_kwargs = {"eos_token_id": [2, 92542]}, generation_kwargs = {"eos_token_id": [2, 92542]},
batch_padding=True,
) )
] ]

View File

@ -7,7 +7,6 @@ _meta_template = dict(
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True), dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', generate=True),
], ],
eos_token_id=92542
) )
models = [ models = [
@ -33,5 +32,6 @@ models = [
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>', end_str='<|im_end|>',
generation_kwargs = {"eos_token_id": [2, 92542]}, generation_kwargs = {"eos_token_id": [2, 92542]},
batch_padding=True,
) )
] ]

View File

@ -3,27 +3,31 @@ from opencompass.models.turbomind import TurboMindModel
_meta_template = dict( _meta_template = dict(
round=[ round=[
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role="HUMAN", begin="<|im_start|>user\n", end="<|im_end|>\n"),
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', dict(role="BOT", begin="<|im_start|>assistant\n", end="<|im_end|>\n", generate=True),
generate=True),
], ],
eos_token_id=92542
) )
models = [ models = [
dict( dict(
type=TurboMindModel, type=TurboMindModel,
abbr='internlm2-chat-20b-turbomind', abbr="internlm2-chat-20b-turbomind",
path="internlm/internlm2-chat-20b", path="internlm/internlm2-chat-20b",
meta_template=_meta_template, meta_template=_meta_template,
engine_config=dict(session_len=210000, engine_config=dict(
session_len=210000,
max_batch_size=8, max_batch_size=8,
rope_scaling_factor=3.0, rope_scaling_factor=3.0,
model_name="internlm2-chat-20b", model_name="internlm2-chat-20b",
tp=2), tp=2,
gen_config=dict(top_k=1, top_p=0.8, stop_words=[2, 92542],
),
gen_config=dict(
top_k=1,
top_p=0.8,
temperature=1.0, temperature=1.0,
max_new_tokens=2000,), max_new_tokens=2000,
),
max_out_len=2000, max_out_len=2000,
max_seq_len=210000, max_seq_len=210000,
batch_size=1, batch_size=1,

View File

@ -3,29 +3,34 @@ from opencompass.models.turbomind import TurboMindModel
_meta_template = dict( _meta_template = dict(
round=[ round=[
dict(role='HUMAN', begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role="HUMAN", begin="<|im_start|>user\n", end="<|im_end|>\n"),
dict(role='BOT', begin='<|im_start|>assistant\n', end='<|im_end|>\n', dict(role="BOT", begin="<|im_start|>assistant\n", end="<|im_end|>\n", generate=True),
generate=True),
], ],
eos_token_id=92542
) )
models = [ models = [
dict( dict(
type=TurboMindModel, type=TurboMindModel,
abbr='internlm2-chat-7b-turbomind', abbr="internlm2-chat-7b-turbomind",
path="internlm/internlm2-chat-7b", path="internlm/internlm2-chat-7b",
meta_template=_meta_template, meta_template=_meta_template,
engine_config=dict(session_len=210000, engine_config=dict(
session_len=210000,
max_batch_size=8, max_batch_size=8,
rope_scaling_factor=2.0, rope_scaling_factor=2.0,
model_name="internlm2-chat-7b"), model_name="internlm2-chat-7b",
gen_config=dict(top_k=1, top_p=0.8, tp=1,
stop_words=[2, 92542],
),
gen_config=dict(
top_k=1,
top_p=0.8,
temperature=1.0, temperature=1.0,
max_new_tokens=2000), max_new_tokens=2000,
),
max_out_len=2000, max_out_len=2000,
max_seq_len=210000, max_seq_len=210000,
batch_size=8, batch_size=1,
concurrency=8, concurrency=8,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
) )

View File

@ -27,5 +27,6 @@ models = [
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=2, num_procs=1), run_cfg=dict(num_gpus=2, num_procs=1),
end_str='[INST]', end_str='[INST]',
batch_padding=True,
) )
] ]

View File

@ -27,5 +27,6 @@ models = [
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=4, num_procs=1), run_cfg=dict(num_gpus=4, num_procs=1),
end_str='[INST]', end_str='[INST]',
batch_padding=True,
) )
] ]

View File

@ -27,5 +27,6 @@ models = [
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='[INST]', end_str='[INST]',
batch_padding=True,
) )
] ]

View File

@ -7,7 +7,6 @@ _meta_template = dict(
dict(role="HUMAN", begin='[INST] ', end=' [/INST]'), dict(role="HUMAN", begin='[INST] ', end=' [/INST]'),
dict(role="BOT", begin="", end='</s> ', generate=True), dict(role="BOT", begin="", end='</s> ', generate=True),
], ],
eos_token_id=2
) )
models = [ models = [
@ -30,5 +29,6 @@ models = [
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
batch_padding=True,
) )
] ]

View File

@ -7,7 +7,6 @@ _meta_template = dict(
dict(role="HUMAN", begin='[INST] ', end=' [/INST]'), dict(role="HUMAN", begin='[INST] ', end=' [/INST]'),
dict(role="BOT", begin="", end='</s> ', generate=True), dict(role="BOT", begin="", end='</s> ', generate=True),
], ],
eos_token_id=2
) )
models = [ models = [
@ -30,6 +29,6 @@ models = [
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='</s>', batch_padding=True,
) )
] ]

View File

@ -7,7 +7,6 @@ _meta_template = dict(
dict(role="HUMAN", begin='[INST] ', end=' [/INST]'), dict(role="HUMAN", begin='[INST] ', end=' [/INST]'),
dict(role="BOT", begin="", end='</s> ', generate=True), dict(role="BOT", begin="", end='</s> ', generate=True),
], ],
eos_token_id=2
) )
models = [ models = [
@ -30,6 +29,6 @@ models = [
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=2, num_procs=1), run_cfg=dict(num_gpus=2, num_procs=1),
end_str='</s>', batch_padding=True,
) )
] ]

View File

@ -12,7 +12,6 @@ models = [
type=HuggingFace, type=HuggingFace,
abbr='minicpm-2b-dpo-hf', abbr='minicpm-2b-dpo-hf',
path='openbmb/MiniCPM-2B-dpo-fp32', path='openbmb/MiniCPM-2B-dpo-fp32',
tokenizer_path='openbmb/MiniCPM-2B-dpo-fp32',
model_kwargs=dict( model_kwargs=dict(
trust_remote_code=True, trust_remote_code=True,
device_map='auto', device_map='auto',
@ -27,6 +26,6 @@ models = [
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<用户>', batch_padding=True,
) )
] ]

View File

@ -12,7 +12,6 @@ models = [
type=HuggingFace, type=HuggingFace,
abbr='minicpm-2b-sft-hf', abbr='minicpm-2b-sft-hf',
path='openbmb/MiniCPM-2B-sft-fp32', path='openbmb/MiniCPM-2B-sft-fp32',
tokenizer_path='openbmb/MiniCPM-2B-sft-fp32',
model_kwargs=dict( model_kwargs=dict(
trust_remote_code=True, trust_remote_code=True,
device_map='auto', device_map='auto',
@ -27,6 +26,6 @@ models = [
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<用户>', batch_padding=True,
) )
] ]

View File

@ -0,0 +1,25 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='<|START_OF_TURN_TOKEN|><|USER_TOKEN|>', end='<|END_OF_TURN_TOKEN|>'),
dict(role="BOT", begin="<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", end='<|END_OF_TURN_TOKEN|>', generate=True),
],
)
models = [
dict(
type=HuggingFaceCausalLM,
abbr='command-r-plus-hf',
path="CohereForAI/c4ai-command-r-plus",
model_kwargs=dict(device_map='auto', trust_remote_code=True),
tokenizer_kwargs=dict(padding_side='left', truncation_side='left', trust_remote_code=True),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=8, num_procs=1),
end_str='<|END_OF_TURN_TOKEN|>',
batch_padding=True,
)
]

View File

@ -29,7 +29,6 @@ models = [
batch_size=8, batch_size=8,
meta_template=_meta_template, meta_template=_meta_template,
run_cfg=dict(num_gpus=8, num_procs=1), run_cfg=dict(num_gpus=8, num_procs=1),
end_str='<|im_end|>',
batch_padding=True, batch_padding=True,
) )
] ]

View File

@ -5,7 +5,6 @@ _meta_template = dict(
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True), dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
], ],
eos_token_id=151645,
) )
models = [ models = [
@ -24,11 +23,11 @@ models = [
use_fast=False, use_fast=False,
), ),
meta_template=_meta_template, meta_template=_meta_template,
pad_token_id=151645,
max_out_len=100, max_out_len=100,
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=4, num_procs=1),
end_str='<|im_end|>', end_str='<|im_end|>',
batch_padding=True,
) )
] ]

View File

@ -5,7 +5,6 @@ _meta_template = dict(
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True), dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
], ],
eos_token_id=151645,
) )
models = [ models = [
@ -24,11 +23,11 @@ models = [
use_fast=False, use_fast=False,
), ),
meta_template=_meta_template, meta_template=_meta_template,
pad_token_id=151645,
max_out_len=100, max_out_len=100,
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=4, num_procs=1),
end_str='<|im_end|>', end_str='<|im_end|>',
batch_padding=True,
) )
] ]

View File

@ -5,7 +5,6 @@ _meta_template = dict(
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True), dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
], ],
eos_token_id=151645,
) )
models = [ models = [
@ -24,11 +23,11 @@ models = [
use_fast=False, use_fast=False,
), ),
meta_template=_meta_template, meta_template=_meta_template,
pad_token_id=151645,
max_out_len=100, max_out_len=100,
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=4, num_procs=1),
end_str='<|im_end|>', end_str='<|im_end|>',
batch_padding=True,
) )
] ]

View File

@ -0,0 +1,25 @@
from opencompass.models import HuggingFaceCausalLM
models = [
dict(
type=HuggingFaceCausalLM,
abbr='qwen1.5-32b-hf',
path="Qwen/Qwen1.5-32B",
tokenizer_path='Qwen/Qwen1.5-32B',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
pad_token_id=151645,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=2, num_procs=1),
)
]

View File

@ -0,0 +1,33 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
],
)
models = [
dict(
type=HuggingFaceCausalLM,
abbr='qwen1.5-32b-chat-hf',
path="Qwen/Qwen1.5-32B-Chat",
model_kwargs=dict(
device_map='auto',
trust_remote_code=True
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=2, num_procs=1),
end_str='<|im_end|>',
batch_padding=True,
)
]

View File

@ -5,7 +5,6 @@ _meta_template = dict(
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True), dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
], ],
eos_token_id=151645,
) )
models = [ models = [
@ -24,11 +23,11 @@ models = [
use_fast=False, use_fast=False,
), ),
meta_template=_meta_template, meta_template=_meta_template,
pad_token_id=151645,
max_out_len=100, max_out_len=100,
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=4, num_procs=1),
end_str='<|im_end|>', end_str='<|im_end|>',
batch_padding=True,
) )
] ]

View File

@ -5,7 +5,6 @@ _meta_template = dict(
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True), dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
], ],
eos_token_id=151645,
) )
models = [ models = [
@ -24,11 +23,11 @@ models = [
use_fast=False, use_fast=False,
), ),
meta_template=_meta_template, meta_template=_meta_template,
pad_token_id=151645,
max_out_len=100, max_out_len=100,
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=4, num_procs=1), run_cfg=dict(num_gpus=4, num_procs=1),
end_str='<|im_end|>', end_str='<|im_end|>',
batch_padding=True,
) )
] ]

View File

@ -5,7 +5,6 @@ _meta_template = dict(
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'), dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True), dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
], ],
eos_token_id=151645,
) )
models = [ models = [
@ -24,11 +23,11 @@ models = [
use_fast=False, use_fast=False,
), ),
meta_template=_meta_template, meta_template=_meta_template,
pad_token_id=151645,
max_out_len=100, max_out_len=100,
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=4, num_procs=1),
end_str='<|im_end|>', end_str='<|im_end|>',
batch_padding=True,
) )
] ]

View File

@ -12,7 +12,6 @@ models = [
type=HuggingFace, type=HuggingFace,
abbr='yi-34b-chat-hf', abbr='yi-34b-chat-hf',
path='01-ai/Yi-34B-Chat', path='01-ai/Yi-34B-Chat',
tokenizer_path='01-ai/Yi-34B-Chat',
model_kwargs=dict( model_kwargs=dict(
trust_remote_code=True, trust_remote_code=True,
device_map='auto', device_map='auto',
@ -26,7 +25,8 @@ models = [
max_out_len=100, max_out_len=100,
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=4, num_procs=1), run_cfg=dict(num_gpus=2, num_procs=1),
end_str='<|im_end|>', end_str='<|im_end|>',
batch_padding=True,
) )
] ]

View File

@ -28,5 +28,6 @@ models = [
batch_size=8, batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>', end_str='<|im_end|>',
batch_padding=True,
) )
] ]

View File

@ -133,6 +133,8 @@ context_lengths_128k = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 1
needlebench_128k_summarizer = create_summarizer(context_lengths_128k, depths_list_sparse, "128k") needlebench_128k_summarizer = create_summarizer(context_lengths_128k, depths_list_sparse, "128k")
context_lengths_200k = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000]) context_lengths_200k = list([16000, 48000, 80000, 112000, 128000, 144000, 176000, 200000])
needlebench_200k_summarizer = create_summarizer(context_lengths_200k, depths_list_sparse, "200k") needlebench_200k_summarizer = create_summarizer(context_lengths_200k, depths_list_sparse, "200k")
context_lengths_256k = list([32000, 128000, 256000])
needlebench_256k_summarizer = create_summarizer(context_lengths_256k, depths_list_sparse, "256k")
context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000]) context_lengths_1000k = list([20000, 160000, 300000, 440000, 580000, 720000, 860000, 1000000])
needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k, depths_list_sparse, "1000k") needlebench_1000k_summarizer = create_summarizer(context_lengths_1000k, depths_list_sparse, "1000k")

View File

@ -4,7 +4,7 @@ This tutorial primarily focuses on evaluating a model's coding proficiency, usin
## pass@1 ## pass@1
If you only need to generate a single response to evaluate the pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/mbpp_gen_1e1056.py), referring to the general [quick start tutorial](../get_started/quick_start.md). If you only need to generate a single response to evaluate the pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py), referring to the general [quick start tutorial](../get_started/quick_start.md).
For multilingual evaluation, please refer to the [Multilingual Code Evaluation Tutorial](./code_eval_service.md). For multilingual evaluation, please refer to the [Multilingual Code Evaluation Tutorial](./code_eval_service.md).
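For orientation, a minimal pass@1 entry config might look like the sketch below (the model import path is a placeholder; the dataset paths are the renamed ones from this commit):

    from mmengine.config import read_base

    with read_base():
        from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
        from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
        from .models.hf_internlm.hf_internlm2_chat_7b import models  # hypothetical model config

    datasets = humaneval_datasets + mbpp_datasets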
@ -21,7 +21,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
with read_base(): with read_base():
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
mbpp_datasets[0]['type'] = MBPPDataset_V2 mbpp_datasets[0]['type'] = MBPPDataset_V2
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
@ -63,7 +63,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
with read_base(): with read_base():
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10' humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
humaneval_datasets[0]['num_repeats'] = 10 humaneval_datasets[0]['num_repeats'] = 10

View File

@ -4,7 +4,7 @@
## pass@1 ## pass@1
If you only need to generate a single response to evaluate the pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/mbpp_gen_1e1056.py), referring to the general [quick start tutorial](../get_started/quick_start.md). If you only need to generate a single response to evaluate the pass@1 performance, you can directly use [configs/datasets/humaneval/humaneval_gen_8e312c.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/humaneval/humaneval_gen_8e312c.py) and [configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py](https://github.com/open-compass/opencompass/blob/main/configs/datasets/mbpp/deprecated_mbpp_gen_1e1056.py), referring to the general [quick start tutorial](../get_started/quick_start.md).
For multilingual evaluation, please refer to the [Multilingual Code Evaluation Tutorial](./code_eval_service.md). For multilingual evaluation, please refer to the [Multilingual Code Evaluation Tutorial](./code_eval_service.md).
@ -21,7 +21,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
with read_base(): with read_base():
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
mbpp_datasets[0]['type'] = MBPPDataset_V2 mbpp_datasets[0]['type'] = MBPPDataset_V2
mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator mbpp_datasets[0]['eval_cfg']['evaluator']['type'] = MBPPPassKEvaluator
@ -64,7 +64,7 @@ from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
with read_base(): with read_base():
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from .datasets.mbpp.mbpp_gen_1e1056 import mbpp_datasets from .datasets.mbpp.deprecated_mbpp_gen_1e1056 import mbpp_datasets
humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10' humaneval_datasets[0]['abbr'] = 'openai_humaneval_pass10'
humaneval_datasets[0]['num_repeats'] = 10 humaneval_datasets[0]['num_repeats'] = 10

View File

@ -56,6 +56,12 @@ def parse_args():
'to run', 'to run',
action='store_true', action='store_true',
default=False) default=False)
parser.add_argument(
'--accelerator',
help='Inference accelerator; currently supports vllm and lmdeploy.',
choices=['vllm', 'lmdeploy', 'hg'],
default='hg',
type=str)
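Once merged, the flag would be passed on the command line roughly as follows (a hypothetical invocation; model and dataset names are placeholders):

    # python run.py --models hf_internlm2_chat_7b --datasets mbpp_gen --accelerator lmdeploy
    # The default 'hg' keeps plain HuggingFace inference; 'vllm' and 'lmdeploy'
    # route through the change_accelerator() rewrite added later in this commit.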
parser.add_argument('-m', parser.add_argument('-m',
'--mode', '--mode',
help='Running mode. You can choose "infer" if you ' help='Running mode. You can choose "infer" if you '

View File

@ -27,11 +27,9 @@ except ImportError:
from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils.logging import get_logger
from .base import BaseDataset from .base import BaseDataset
logger = get_logger()
TIMEOUT = 10 TIMEOUT = 10
@ -321,7 +319,7 @@ def timeout_handler(signum, frame):
try: try:
signal.signal(signal.SIGALRM, timeout_handler) signal.signal(signal.SIGALRM, timeout_handler)
except AttributeError: except AttributeError:
logger.warning('signal.SIGALRM is not available on this platform') print('signal.SIGALRM is not available on this platform')
timeout = 4 # seconds timeout = 4 # seconds

View File

@ -134,11 +134,20 @@ class MBPPPlusDataset(BaseDataset):
multiple responses in special cases. multiple responses in special cases.
""" """
def processing_test(example):
example['test_case'] = example['test_list']
example['test_list'] = '\n'.join(example['test_list'])
example['test_list_2'] = example['test_list']
example['test_column'] = dict(test_list_2=example['test_list'],
task_id=example['task_id'])
return example
dataset = [] dataset = []
with open(path, 'r', encoding='utf-8') as f: with open(path, 'r', encoding='utf-8') as f:
for line in f: for line in f:
dataset.extend( example = json.loads(line.strip())
[json.loads(line.strip()) for _ in range(num_repeats)]) example = processing_test(example)
dataset.extend([example for _ in range(num_repeats)])
return Dataset.from_list(dataset) return Dataset.from_list(dataset)
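To make the new preprocessing concrete, here is what processing_test does to one record (a standalone sketch; the sample fields mirror the jsonl schema assumed above):

    example = {
        'task_id': 2,
        'test_list': ['assert add(1, 2) == 3', 'assert add(0, 0) == 0'],
    }
    # after processing_test(example):
    #   example['test_case']   == ['assert add(1, 2) == 3', 'assert add(0, 0) == 0']
    #   example['test_list']   == 'assert add(1, 2) == 3\nassert add(0, 0) == 0'
    #   example['test_list_2'] == example['test_list']
    #   example['test_column'] == dict(test_list_2=example['test_list'], task_id=2)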
@ -211,7 +220,7 @@ class MBPPEvaluator(BaseEvaluator):
predictions)): predictions)):
pred = self._process_answer(pred) pred = self._process_answer(pred)
programs = self._process_test(refer, pred) programs = self._process_test(refer, pred)
future = executor.submit(execution, programs, i, 3) future = executor.submit(execution, programs, i, 10)
futures.append(future) futures.append(future)
details[str(i)] = {} details[str(i)] = {}
details[str(i)]['origin'] = predictions[i] details[str(i)]['origin'] = predictions[i]
@ -262,39 +271,34 @@ class MBPPEvaluator(BaseEvaluator):
return {f'mbpp_plus_{k}': score[k] * 100 for k in score} return {f'mbpp_plus_{k}': score[k] * 100 for k in score}
def _process_answer(self, text): def _process_answer(self, text):
try: patterns = [
# for chatGLM related text r"\[BEGIN\]\s*'(.*)'\s*\[DONE\]",
eval_text = eval(text) r"BEGIN\s*'(.*)'\s*\[DONE\]",
except Exception: r"\[BEGIN\]\s*'(.*)'\s*DONE",
pass r"BEGIN\s*'(.*)'\s*DONE",
else: r"\[BEGIN\]\s*'(.*)\s*\[DONE\]",
if isinstance(eval_text, str): r"BEGIN\s*'(.*)\s*\[DONE\]",
text = eval_text r"\[BEGIN\]\s*'(.*)\s*DONE",
# deal with code block r"BEGIN\s*'(.*)\s*DONE",
if '```' in text: r'\[BEGIN\]\s*(.*)\s*\[DONE\]',
blocks = re.findall(r'```(.*?)```', text, re.DOTALL) r'BEGIN\s*(.*)\s*\[DONE\]',
if len(blocks) == 0: r'\[BEGIN\]\s*(.*)\s*DONE',
text = text.split('```')[1] # fall back to default strategy r'BEGIN\s*(.*)\s*DONE',
else: r'```python\s*(.*)\s*```',
text = blocks[0] # fetch the first code block r'```\s*(.*)\s*```',
if not text.startswith('\n'): # in case starting with ```xxx r'(.*)\s*```.*',
text = text[max(text.find('\n') + 1, 0):] r"\[BEGIN\]\s*'(.*)",
r'\[BEGIN\](.*)',
]
for p in patterns:
match = re.search(p, text, re.DOTALL)
if match:
text = match.group(1)
break
text = text.split('```')[0]
text = re.split(r"'?\s*\[?DONE\]?", text)[0]
text = text.replace('\\_', '_')
text = text.strip() text = text.strip()
match = re.search(r"('\s*|)(\[DONE\]|DONE)", text)
if match:
text = text[:match.start()]
match = re.search(r"(\[BEGIN\]|BEGIN)('\s*|)", text)
if match:
text = text[match.end():]
text = text.strip()
if text.startswith("'"):
text = text[1:]
if text.endswith("'"):
text = text[:-1]
text = text.replace('\\', '')
match = re.search(r'```python(.*)```', text, re.DOTALL)
if match:
text = match.group(1).strip().split('```')[0].strip()
return text return text
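As a sanity check on the pattern-based cleanup above, a condensed standalone version (abbreviated pattern list, not part of the diff) behaves like this:

    import re

    def extract(text):
        patterns = [r"\[BEGIN\]\s*'(.*)'\s*\[DONE\]", r'```python\s*(.*)\s*```']  # abbreviated
        for p in patterns:
            match = re.search(p, text, re.DOTALL)
            if match:
                text = match.group(1)
                break
        text = text.split('```')[0]                     # drop trailing code fences
        text = re.split(r"'?\s*\[?DONE\]?", text)[0]    # drop a leftover DONE marker
        return text.replace('\\_', '_').strip()

    extract("[BEGIN] 'def add(a, b):\n    return a + b' [DONE]")
    # -> "def add(a, b):\n    return a + b"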
def _process_test(self, test_case, pred): def _process_test(self, test_case, pred):
@ -451,7 +455,7 @@ class MBPPPassKEvaluator(MBPPEvaluator):
for pred in preds: for pred in preds:
pred = self._process_answer(pred) pred = self._process_answer(pred)
programs = self._process_test(test_case, pred) programs = self._process_test(test_case, pred)
future = executor.submit(execution, programs, task_id, 3) future = executor.submit(execution, programs, task_id, 10)
futures.append(future) futures.append(future)
from tqdm import tqdm from tqdm import tqdm

View File

@ -27,11 +27,9 @@ except ImportError:
from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils.logging import get_logger
from .base import BaseDataset from .base import BaseDataset
logger = get_logger()
TIMEOUT = 10 TIMEOUT = 10
@ -267,7 +265,7 @@ def timeout_handler(signum, frame):
try: try:
signal.signal(signal.SIGALRM, timeout_handler) signal.signal(signal.SIGALRM, timeout_handler)
except AttributeError: except AttributeError:
logger.warning('signal.SIGALRM is not available on this platform') print('signal.SIGALRM is not available on this platform')
timeout = 4 # seconds timeout = 4 # seconds

View File

@ -84,7 +84,12 @@ class OpenAI(BaseAPIModel):
self.top_logprobs = top_logprobs self.top_logprobs = top_logprobs
if isinstance(key, str): if isinstance(key, str):
self.keys = [os.getenv('OPENAI_API_KEY') if key == 'ENV' else key] if key == 'ENV':
if 'OPENAI_API_KEY' not in os.environ:
raise ValueError('OpenAI API key is not set.')
self.keys = os.getenv('OPENAI_API_KEY').split(',')
else:
self.keys = [key]
else: else:
self.keys = key self.keys = key
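With this change, several keys can be supplied through a single environment variable and rotated across requests (placeholder values):

    # export OPENAI_API_KEY=sk-key-one,sk-key-two
    # -> self.keys == ['sk-key-one', 'sk-key-two']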
@ -101,12 +106,11 @@ class OpenAI(BaseAPIModel):
self.url = openai_api_base self.url = openai_api_base
self.path = path self.path = path
def generate( def generate(self,
self,
inputs: List[PromptType], inputs: List[PromptType],
max_out_len: int = 512, max_out_len: int = 512,
temperature: float = 0.7, temperature: float = 0.7,
) -> List[str]: **kwargs) -> List[str]:
"""Generate results given a list of inputs. """Generate results given a list of inputs.
Args: Args:
@ -412,9 +416,15 @@ class OpenAIAllesAPIN(OpenAI):
} }
for _ in range(self.retry): for _ in range(self.retry):
self.wait() self.wait()
try:
raw_response = requests.post(self.url, raw_response = requests.post(self.url,
headers=self.headers, headers=self.headers,
data=json.dumps(data)) data=json.dumps(data))
except requests.ConnectionError:
self.logger.error('Request error: connection to %s failed', self.url)
time.sleep(1)
continue
try: try:
response = raw_response.json() response = raw_response.json()
except requests.JSONDecodeError: except requests.JSONDecodeError:

View File

@ -161,7 +161,7 @@ class Qwen(BaseAPIModel):
time.sleep(1) time.sleep(1)
continue continue
if response.status_code == 429: if response.status_code == 429:
print('Rate limited') print(response)
time.sleep(2) time.sleep(2)
continue continue
if response.status_code == 400: if response.status_code == 400:

View File

@ -214,6 +214,16 @@ class DLCRunner(BaseRunner):
pod_create_time = None pod_create_time = None
pri_time = None pri_time = None
initial_time = datetime.datetime.now() initial_time = datetime.datetime.now()
url = 'http://pai-console.cb210e3f99cd7403f8de2a630dcc99fc3.cn-wulanchabu.alicontainer.com' # noqa: E501
logger = get_logger()
logger.debug('')
logger.debug('*' * 168)
logger.debug(
f'{url}/index?workspaceId={self.aliyun_cfg["workspace_id"]}#/dlc2/job/{job_id}/detail' # noqa: E501
)
logger.debug('*' * 168)
while True: while True:
# 1. Avoid to request dlc too frequently. # 1. Avoid to request dlc too frequently.
# 2. DLC job may not be ready immediately after creation. # 2. DLC job may not be ready immediately after creation.

View File

@ -188,6 +188,7 @@ class SlurmSequentialRunner(BaseRunner):
tmpl += f' --gres=gpu:{num_gpus}' tmpl += f' --gres=gpu:{num_gpus}'
for extra_cmd in self.extra_command: for extra_cmd in self.extra_command:
tmpl += f' {extra_cmd}' tmpl += f' {extra_cmd}'
tmpl += ' -x HOST-10-140-60-7'
tmpl += f" -N1 -u -J '{task_name[:512]}'" + ' {task_cmd}' tmpl += f" -N1 -u -J '{task_name[:512]}'" + ' {task_cmd}'
get_cmd = partial(task.get_command, get_cmd = partial(task.get_command,
cfg_path=param_file, cfg_path=param_file,

View File

@ -72,7 +72,7 @@ dataset_mapping_dict = {}
needle_counts = ['2', '3', '4', '5'] needle_counts = ['2', '3', '4', '5']
languages = ['en', 'zh'] languages = ['en', 'zh']
sizes = ['4k', '8k', '32k', '200k', '1000k'] sizes = ['4k', '8k', '32k', '200k', '256k', '1000k']
types = ['origin', 'parallel'] types = ['origin', 'parallel']
for needle_count in needle_counts: for needle_count in needle_counts:
@ -190,7 +190,7 @@ def save_results_to_plots(txt_results_save_path):
numbers = [2, 3, 4, 5] numbers = [2, 3, 4, 5]
languages = ['en', 'zh'] languages = ['en', 'zh']
size_exists = [] size_exists = []
sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_1000k'] sizes_origin = ['_4k', '_8k', '_32k', '_128k', '_200k', '_256k', '_1000k']
for size in sizes_origin: for size in sizes_origin:
if size in content: if size in content:
@ -301,6 +301,9 @@ def visualize(df_raw, save_path: str,model_name: str ,dataset_type:str):
markersize=8, markersize=8,
label='Average Depth Score' label='Average Depth Score'
) )
for x_value, y_value in zip(x_data, y_data):
ax2.text(x_value, y_value, f'{y_value:.2f}', ha='center', va='top')
ax2.set_ylim(0, 100) ax2.set_ylim(0, 100)
ax2.set_yticklabels([]) ax2.set_yticklabels([])

View File

@ -1,6 +1,5 @@
# flake8: noqa: E501 # flake8: noqa
import ast # yapf: disable
import csv
import os import os
import os.path as osp import os.path as osp
import re import re
@ -10,7 +9,7 @@ from itertools import product
import mmengine import mmengine
from mmengine import ConfigDict from mmengine import ConfigDict
from prettytable import from_csv from tabulate import tabulate
from opencompass.partitioners.sub_naive import remove_duplicate_pairs from opencompass.partitioners.sub_naive import remove_duplicate_pairs
from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
@ -18,6 +17,12 @@ from opencompass.utils import dataset_abbr_from_cfg, model_abbr_from_cfg
from .utils import get_judgeanswer_and_reference, get_outdir from .utils import get_judgeanswer_and_reference, get_outdir
def model_abbr_from_cfg_used_in_summarizer(model):
if model.get('summarizer_abbr', None):
return model['summarizer_abbr']
else:
return model_abbr_from_cfg(model)
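A sketch of the new helper's effect (config values are placeholders):

    model_cfg = dict(abbr='qwen1.5-14b-chat-hf', summarizer_abbr='Qwen-1.5-14B-Chat')
    model_abbr_from_cfg_used_in_summarizer(model_cfg)  # -> 'Qwen-1.5-14B-Chat'
    model_abbr_from_cfg_used_in_summarizer(dict(abbr='qwen1.5-14b-chat-hf'))  # -> 'qwen1.5-14b-chat-hf'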
def post_process_compass_arena(s): def post_process_compass_arena(s):
if result := re.findall('(?:选择:|Choice: )([ABC])', s): if result := re.findall('(?:选择:|Choice: )([ABC])', s):
return result[0] return result[0]
@ -68,17 +73,90 @@ class CompassArenaSummarizer:
self.base_models = self.cfg['eval']['partitioner']['base_models'] self.base_models = self.cfg['eval']['partitioner']['base_models']
self.compare_models = self.cfg['eval']['partitioner']['compare_models'] self.compare_models = self.cfg['eval']['partitioner']['compare_models']
self.judge_models = self.cfg.get('judge_models', None) self.judge_models = self.cfg.get('judge_models', None)
self.meta_judge_model = self.cfg.eval.partitioner.get( self.meta_judge_model = self.cfg.eval.partitioner.get('meta_judge_model', None)
'meta_judge_model', None)
self.judge_type = judge_type self.judge_type = judge_type
assert self.judge_type in ['general'] assert self.judge_type in ['general']
self.judge_map = { self.judge_map = {'general': post_process_compass_arena}
'general': post_process_compass_arena,
}
self.judge_function = self.judge_map[self.judge_type] self.judge_function = self.judge_map[self.judge_type]
self.check_pos_bias = check_pos_bias self.check_pos_bias = check_pos_bias
self.summary_type = summary_type self.summary_type = summary_type
def get_score(self, time_str):
output_dir, results_folder = get_outdir(self.cfg, time_str)
model_combinations = list(product(self.base_models, self.compare_models))
unique_combinations = remove_duplicate_pairs([combo for combo in model_combinations if combo[0] != combo[1]])
if self.meta_judge_model is not None:
self.judge_models.append(self.meta_judge_model)
scores = {}
for idx, judge_model_cfg in enumerate(self.judge_models):
judge_model = model_abbr_from_cfg(judge_model_cfg)
for dataset in self.cfg['datasets']:
dataset_abbr = dataset_abbr_from_cfg(dataset)
for model_pair in unique_combinations:
model1 = model_pair[0]['abbr']
model2 = model_pair[1]['abbr']
if self.meta_judge_model is not None and idx == len(self.judge_models) - 1:
subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model
else:
subdir = model1 + '_' + model2 + '_judged-by--' + judge_model
subdir_path = os.path.join(results_folder, subdir)
if not os.path.isdir(subdir_path):
print(subdir_path + ' does not exist! Please check!')
continue
judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
if self.check_pos_bias:
bias_num = check_position_bias(judged_answers, references)
else:
bias_num = 0
win_model1 = defaultdict(float)
win_model2 = defaultdict(float)
categories = defaultdict(float)
model1 = references[0]['answer1']
model2 = references[0]['answer2']
for prediction, reference in zip(judged_answers, references):
categories[dataset_abbr] += 1
categories[reference['capability']] += 1
if prediction == 'A':
if reference['answer1'] == model1:
score_1, score_2 = 1, 0
else:
score_1, score_2 = 0, 1
elif prediction == 'B':
if reference['answer1'] == model1:
score_1, score_2 = 0, 1
else:
score_1, score_2 = 1, 0
elif prediction == 'C':
if self.summary_type == 'half_add':
score_1, score_2 = 0.5, 0.5
else:
score_1, score_2 = 0, 0
win_model1[reference['capability']] += score_1
win_model1[dataset_abbr] += score_1
win_model2[reference['capability']] += score_2
win_model2[dataset_abbr] += score_2
for capability in categories:
win_model1[capability] = win_model1[capability] / categories[capability] * 100
win_model1[capability] = round(win_model1[capability], 2)
win_model2[capability] = win_model2[capability] / categories[capability] * 100
win_model2[capability] = round(win_model2[capability], 2)
win_model1['position_bias'] = bias_num
win_model2['position_bias'] = bias_num
if judge_model not in scores:
scores[judge_model] = {}
if dataset_abbr not in scores[judge_model]:
scores[judge_model][dataset_abbr] = {}
scores[judge_model][dataset_abbr][model2] = win_model2
return scores
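A worked instance of the arithmetic above (hypothetical counts): with summary_type='half_add' and 10 judged items in one capability where model2 wins 6, model1 wins 2, and 2 are ties ('C'), each tie contributes 0.5 to both sides, so win_model2 = (6 + 2 * 0.5) / 10 * 100 = 70.00 and win_model1 = (2 + 2 * 0.5) / 10 * 100 = 30.00.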
def summarize( def summarize(
self, self,
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'), time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S'),
@ -91,143 +169,72 @@ class CompassArenaSummarizer:
Returns: Returns:
pd.DataFrame: The summary results. pd.DataFrame: The summary results.
""" """
dataset_cfgs = self.cfg['datasets']
output_dir, results_folder = get_outdir(self.cfg, time_str)
model_combinations = list(
product(self.base_models, self.compare_models))
unique_combinations = remove_duplicate_pairs(
[combo for combo in model_combinations if combo[0] != combo[1]])
fout_list = []
pre_len = len(self.judge_models) scores = self.get_score(time_str)
if self.meta_judge_model is not None: # scores['win_' + model1] = win_model1
self.judge_models.append(self.meta_judge_model) output_dir, results_folder = get_outdir(self.cfg, time_str)
meta_judge_model_abbr = model_abbr_from_cfg(self.meta_judge_model)
else:
meta_judge_model_abbr = None
for idx, judge_model in enumerate(self.judge_models): for idx, judge_model in enumerate(self.judge_models):
judge_model = model_abbr_from_cfg(judge_model) judge_abbr = model_abbr_from_cfg(judge_model)
for dataset in dataset_cfgs: for dataset in self.cfg['datasets']:
dataset_abbr = dataset_abbr_from_cfg(dataset) dataset_abbr = dataset_abbr_from_cfg(dataset)
if idx == pre_len: summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
fout = osp.join( one_column = list(scores[judge_abbr][dataset_abbr].values())[0]
output_dir, 'summarized-by--' + judge_model + '-' + row_headers = [i for i in one_column.keys() if i not in [dataset_abbr, 'position_bias']]
dataset_abbr + '-report.csv') row_headers = [dataset_abbr, 'position_bias'] + row_headers
headers = [''] + summarizer_model_abbrs
table = []
for row_header in row_headers:
row = [row_header]
for model_cfg in self.compare_models:
model_abbr = model_abbr_from_cfg(model_cfg)
s = scores[judge_abbr][dataset_abbr][model_abbr].get(row_header, '')
if isinstance(s, float):
s = f'{s:.2f}'
if isinstance(s, int):
s = str(s)
row.append(s)
table.append(row)
txt = tabulate(table, headers=headers)
print(txt)
if self.meta_judge_model is not None and idx == len(self.judge_models) - 1:
output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
else: else:
fout = osp.join( output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-' + dataset_abbr + '-report.csv')
output_dir, 'judged-by--' + judge_model + '-' +
dataset_abbr + '-report.csv') with open(output_filename, 'w') as f:
fout_list.append(fout) f.write(','.join(headers) + '\n')
for model_pair in unique_combinations: for line in table:
model1, model2, = model_pair[0]['abbr'], model_pair[1][ f.write(','.join(line) + '\n')
'abbr'], print(output_filename)
if idx == pre_len:
subdir = model1 + '_' + model2 + '_summarized-by--' + judge_model table = []
summarizer_model_abbrs = [model_abbr_from_cfg_used_in_summarizer(i) for i in self.compare_models]
headers = [''] + summarizer_model_abbrs
for dataset in self.cfg['datasets']:
dataset_abbr = dataset_abbr_from_cfg(dataset)
row = [dataset_abbr]
for model_cfg in self.compare_models:
model_abbr = model_abbr_from_cfg(model_cfg)
s = scores[judge_abbr][dataset_abbr][model_abbr].get(dataset_abbr, '')
if isinstance(s, float):
s = f'{s:.2f}'
if isinstance(s, int):
s = str(s)
row.append(s)
table.append(row)
txt = tabulate(table, headers=headers)
print(txt)
if self.meta_judge_model is not None and idx == len(self.judge_models) - 1:
output_filename = osp.join(output_dir, 'summarized-by--' + judge_abbr + '-overall-report.csv')
else: else:
subdir = model1 + '_' + model2 + '_judged-by--' + judge_model output_filename = osp.join(output_dir, 'judged-by--' + judge_abbr + '-overall-report.csv')
subdir_path = os.path.join(results_folder, subdir) with open(output_filename, 'w') as f:
if os.path.isdir(subdir_path): f.write(','.join(headers) + '\n')
judged_answers, references = get_judgeanswer_and_reference( for line in table:
dataset, f.write(','.join(line) + '\n')
subdir_path, print(output_filename)
self.judge_function,
)
if self.check_pos_bias:
bias_num = check_position_bias(
judged_answers, references)
else:
bias_num = 0
win_model1, win_model2, categories = defaultdict(
float), defaultdict(float), defaultdict(float)
model1, model2 = references[0]['answer1'], references[
0]['answer2']
for prediction, reference in zip(
judged_answers, references):
if self.summary_type == 'single':
if prediction == 'A':
categories['total'] += 1
categories[reference['capability']] += 1
if reference['answer1'] == model1:
win_model1[
reference['capability']] += 1
win_model1['total'] += 1
else:
win_model2[
reference['capability']] += 1
win_model2['total'] += 1
elif prediction == 'B':
categories['total'] += 1
categories[reference['capability']] += 1
if reference['answer1'] == model1:
win_model2[
reference['capability']] += 1
win_model2['total'] += 1
else:
win_model1[
reference['capability']] += 1
win_model1['total'] += 1
elif self.summary_type == 'half_add':
categories['total'] += 1
categories[reference['capability']] += 1
if prediction == 'A':
if reference['answer1'] == model1:
win_model1[
reference['capability']] += 1
win_model1['total'] += 1
else:
win_model2[
reference['capability']] += 1
win_model2['total'] += 1
elif prediction == 'B':
if reference['answer1'] == model1:
win_model2[
reference['capability']] += 1
win_model2['total'] += 1
else:
win_model1[
reference['capability']] += 1
win_model1['total'] += 1
elif prediction == 'C':
win_model1[reference['capability']] += 0.5
win_model1['total'] += 0.5
win_model2[reference['capability']] += 0.5
win_model2['total'] += 0.5
for capability in categories:
if capability not in win_model1:
win_model1[capability] = 0.0
else:
win_model1[capability] = round(
(win_model1[capability] /
categories[capability]) * 100, 2)
if capability not in win_model2:
win_model2[capability] = 0.0
else:
win_model2[capability] = round(
(win_model2[capability] /
categories[capability]) * 100, 2)
win_model1['position_bias'] = bias_num
win_model2['position_bias'] = bias_num
scores = {
'win_' + model1: win_model1,
'win_' + model2: win_model2
}
rows = list(scores.keys())
columns = list(scores[rows[0]].keys())
columns.insert(0, columns.pop(columns.index('total')))
columns.insert(
1, columns.pop(columns.index('position_bias')))
with open(fout, 'a+', newline='') as csvfile:
writer = csv.writer(csvfile)
writer.writerow([model1 + '_vs_' + model2] +
columns)
for row in rows:
writer.writerow([row] + [
scores[row][column] for column in columns
])
else:
print(subdir_path + ' is not exist! please check!')
for fout in fout_list:
with open(fout, 'r') as f:
x = from_csv(f)
print(fout)
print(x)

View File

@ -1,4 +1,5 @@
# flake8: noqa: E501 # flake8: noqa
# yapf: disable
import csv import csv
import os import os
import os.path as osp import os.path as osp
@ -8,11 +9,7 @@ from datetime import datetime
import numpy as np import numpy as np
from mmengine import ConfigDict from mmengine import ConfigDict
from tabulate import tabulate
try:
from prettytable import from_csv
except ImportError:
from_csv = None
from opencompass.utils import model_abbr_from_cfg from opencompass.utils import model_abbr_from_cfg
@ -20,6 +17,12 @@ from .compass_arena import CompassArenaSummarizer
from .utils import get_judgeanswer_and_reference, get_outdir from .utils import get_judgeanswer_and_reference, get_outdir
def model_abbr_from_cfg_used_in_summarizer(model):
if model.get('summarizer_abbr', None):
return model['summarizer_abbr']
else:
return model_abbr_from_cfg(model)
def post_process_mtbench_pair(judgement: str): def post_process_mtbench_pair(judgement: str):
"""Input a string like below: """Input a string like below:
@ -52,7 +55,7 @@ def get_capability_results(
references, references,
fout, fout,
fout_flag, fout_flag,
model, model_abbr,
): ):
capability_ratings = defaultdict(int) capability_ratings = defaultdict(int)
capability_counts = defaultdict(int) capability_counts = defaultdict(int)
@ -70,12 +73,12 @@ def get_capability_results(
capability_avg_ratings[capability] = s capability_avg_ratings[capability] = s
columns = list(capability_avg_ratings.keys()) columns = list(capability_avg_ratings.keys())
columns.insert(0, columns.pop(columns.index('total'))) columns.insert(0, columns.pop(columns.index('total')))
with open(fout, 'a+', newline='') as csvfile: with open(fout, 'a+', newline='') as csvfile:
writer = csv.writer(csvfile) writer = csv.writer(csvfile)
if fout_flag == 0: if fout_flag == 0:
writer.writerow(['model'] + columns) writer.writerow(['model'] + columns)
writer.writerow([model] + writer.writerow([model_abbr] + [capability_avg_ratings[column] for column in columns])
[capability_avg_ratings[column] for column in columns])
class MTBenchSummarizer(CompassArenaSummarizer): class MTBenchSummarizer(CompassArenaSummarizer):
@ -92,13 +95,9 @@ class MTBenchSummarizer(CompassArenaSummarizer):
self.cfg = config self.cfg = config
if self.judge_type == 'single': if self.judge_type == 'single':
self.eval_model_cfgs = self.cfg['eval']['partitioner']['models'] self.eval_model_cfgs = self.cfg['eval']['partitioner']['models']
self.eval_model_abbrs = [
model_abbr_from_cfg(model) for model in self.eval_model_cfgs
]
elif self.judge_type == 'pair': elif self.judge_type == 'pair':
self.base_models = self.cfg['eval']['partitioner']['base_models'] self.base_models = self.cfg['eval']['partitioner']['base_models']
self.compare_models = self.cfg['eval']['partitioner'][ self.compare_models = self.cfg['eval']['partitioner']['compare_models']
'compare_models']
self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0]) self.judge_abbr = model_abbr_from_cfg(self.cfg['judge_models'][0])
self.judge_map = { self.judge_map = {
'single': post_process_mtbench_single, 'single': post_process_mtbench_single,
@ -106,8 +105,7 @@ class MTBenchSummarizer(CompassArenaSummarizer):
} }
self.judge_function = self.judge_map[self.judge_type] self.judge_function = self.judge_map[self.judge_type]
def summarize(self, def summarize(self, time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
"""Summarize the subjectivity analysis based on evaluation results. """Summarize the subjectivity analysis based on evaluation results.
Args: Args:
@ -116,33 +114,40 @@ class MTBenchSummarizer(CompassArenaSummarizer):
Returns: Returns:
pd.DataFrame: The summary results. pd.DataFrame: The summary results.
""" """
if self.judge_type == 'single': if self.judge_type == 'pair':
return super().summarize()
# self.judge_type == 'single'
dataset_cfgs = self.cfg['datasets'] dataset_cfgs = self.cfg['datasets']
output_dir, results_folder = get_outdir(self.cfg, time_str) output_dir, results_folder = get_outdir(self.cfg, time_str)
fout_flag = 0 fout_flag = 0
for eval_model_abbr in self.eval_model_abbrs: for eval_model_cfg in self.eval_model_cfgs:
subdir = eval_model_abbr + '_judged-by--' + self.judge_abbr eval_model_abbr = model_abbr_from_cfg(eval_model_cfg)
subdir_path = os.path.join(results_folder, subdir) show_model_abbr = model_abbr_from_cfg_used_in_summarizer(eval_model_cfg)
subdir_path = os.path.join(results_folder, eval_model_abbr + '_judged-by--' + self.judge_abbr)
if os.path.isdir(subdir_path): if os.path.isdir(subdir_path):
model, judge_model = eval_model_abbr, self.judge_abbr fout = osp.join(output_dir, 'judged-by--' + self.judge_abbr + '-capability.csv')
fout = osp.join(
output_dir,
'judged-by--' + judge_model + '-capability.csv')
overall_judged_answers, overall_references = [], [] overall_judged_answers, overall_references = [], []
for dataset in dataset_cfgs: for dataset in dataset_cfgs:
judged_answers, references = get_judgeanswer_and_reference( judged_answers, references = get_judgeanswer_and_reference(dataset, subdir_path, self.judge_function)
dataset, subdir_path, self.judge_function)
overall_judged_answers += judged_answers overall_judged_answers += judged_answers
overall_references += references overall_references += references
get_capability_results(overall_judged_answers, get_capability_results(overall_judged_answers, overall_references, fout, fout_flag, show_model_abbr)
overall_references, fout, fout_flag,
model)
fout_flag += 1 fout_flag += 1
else: else:
print(subdir_path + ' does not exist! Please check!') print(subdir_path + ' does not exist! Please check!')
with open(fout, 'r') as f: with open(fout, 'r') as f:
x = from_csv(f) csv_reader = csv.reader(f)
print(x) header = next(csv_reader)
table = [line for line in csv_reader]
new_header = [''] + [line[0] for line in table]
new_table = [[h] + [line[i] for line in table] for i, h in enumerate(header[1:], start=1)]
t = tabulate(new_table, headers=new_header)
with open(fout, 'w') as f:
f.write(','.join(new_header) + '\n')
for line in new_table:
f.write(','.join(map(str, line)) + '\n')
print(t)
print(fout) print(fout)
elif self.judge_type == 'pair':
super().summarize()
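The read-transpose-rewrite above turns the per-model rows into per-capability rows; for instance (hypothetical scores):

    # before:                      after:
    # model,total,reasoning        ,chat-a,chat-b
    # chat-a,7.5,8.0         ->    total,7.5,6.0
    # chat-b,6.0,7.0               reasoning,8.0,7.0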

View File

@ -3,6 +3,7 @@ import copy
import fnmatch import fnmatch
import math import math
import os.path as osp import os.path as osp
import re
import statistics import statistics
import time import time
from collections import Counter from collections import Counter
@ -38,12 +39,12 @@ def extract_role_pred(s: str, begin_str: Optional[str],
start = 0 start = 0
end = len(s) end = len(s)
if begin_str: if begin_str and re.fullmatch(r'\s*', begin_str) is None:
begin_idx = s.find(begin_str) begin_idx = s.find(begin_str)
if begin_idx != -1: if begin_idx != -1:
start = begin_idx + len(begin_str) start = begin_idx + len(begin_str)
if end_str: if end_str and re.fullmatch(r'\s*', end_str) is None:
# TODO: Support calling tokenizer for the accurate eos token # TODO: Support calling tokenizer for the accurate eos token
# and avoid such hardcode # and avoid such hardcode
end_idx = s.find(end_str, start) end_idx = s.find(end_str, start)
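A minimal sketch of the guarded extraction (assuming the fullmatch fix above):

    # extract_role_pred('<|im_start|>assistant\nfoo<|im_end|>\n',
    #                   begin_str='<|im_start|>assistant\n',
    #                   end_str='<|im_end|>')  # -> 'foo'
    # Empty or whitespace-only begin_str/end_str are now skipped instead of
    # truncating the prediction.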

View File

@ -1,6 +1,7 @@
# flake8: noqa: E501 # flake8: noqa: E501
import copy import copy
import json import json
import os
import os.path as osp import os.path as osp
import mmengine import mmengine
@ -123,6 +124,10 @@ class AlpacaEvalTask(BaseTask):
command = '' command = ''
if api_key is not None: if api_key is not None:
command += f'export OPENAI_API_KEY={api_key}; ' command += f'export OPENAI_API_KEY={api_key}; '
else:
api_key = os.environ.get('OPENAI_API_KEY', '').split(',')[0]
if api_key:
command += f'export OPENAI_API_KEY={api_key}; '
command += f'alpaca_eval --model_outputs {filename} --annotators_config {alpaca_cfg} --output_path {output_path}' command += f'alpaca_eval --model_outputs {filename} --annotators_config {alpaca_cfg} --output_path {output_path}'
return template.format(task_cmd=command) return template.format(task_cmd=command)

View File

@ -5,6 +5,7 @@ import tabulate
from mmengine.config import Config from mmengine.config import Config
from opencompass.datasets.custom import make_custom_dataset_config from opencompass.datasets.custom import make_custom_dataset_config
from opencompass.models import VLLM, HuggingFaceCausalLM, TurboMindModel
from opencompass.partitioners import NaivePartitioner, SizePartitioner from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner from opencompass.runners import DLCRunner, LocalRunner, SlurmRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
@ -72,6 +73,10 @@ def get_config_from_arg(args) -> Config:
if args.config: if args.config:
config = Config.fromfile(args.config, format_python_code=False) config = Config.fromfile(args.config, format_python_code=False)
config = try_fill_in_custom_cfgs(config) config = try_fill_in_custom_cfgs(config)
# set infer accelerator if needed
if args.accelerator in ['vllm', 'lmdeploy']:
config['models'] = change_accelerator(config['models'],
args.accelerator)
return config return config
# parse dataset args # parse dataset args
if not args.datasets and not args.custom_dataset_path: if not args.datasets and not args.custom_dataset_path:
@ -137,6 +142,9 @@ def get_config_from_arg(args) -> Config:
pad_token_id=args.pad_token_id, pad_token_id=args.pad_token_id,
run_cfg=dict(num_gpus=args.num_gpus)) run_cfg=dict(num_gpus=args.num_gpus))
models.append(model) models.append(model)
# set infer accelerator if needed
if args.accelerator in ['vllm', 'lmdeploy']:
models = change_accelerator(models, args.accelerator)
# parse summarizer args # parse summarizer args
summarizer_arg = args.summarizer if args.summarizer is not None \ summarizer_arg = args.summarizer if args.summarizer is not None \
else 'example' else 'example'
@ -164,6 +172,93 @@ def get_config_from_arg(args) -> Config:
format_python_code=False) format_python_code=False)
def change_accelerator(models, accelerator):
models = models.copy()
model_accels = []
for model in models:
get_logger().info(f'Transforming {model["abbr"]} to {accelerator}')
# change HuggingFace model to VLLM or TurboMindModel
if model['type'] is HuggingFaceCausalLM:
gen_args = dict()
if model.get('generation_kwargs') is not None:
generation_kwargs = model['generation_kwargs'].copy()
gen_args['temperature'] = 0.001 if generation_kwargs.get(
'temperature'
) is None else generation_kwargs['temperature']
gen_args['top_k'] = 1 if generation_kwargs.get(
'top_k') is None else generation_kwargs['top_k']
gen_args['top_p'] = 0.9 if generation_kwargs.get(
'top_p') is None else generation_kwargs['top_p']
gen_args['stop_token_ids'] = None if generation_kwargs.get(
'eos_token_id'
) is None else generation_kwargs['eos_token_id']
generation_kwargs[
'stop_token_ids'] = None if generation_kwargs.get(
'eos_token_id'
) is None else generation_kwargs['eos_token_id']
generation_kwargs.pop('eos_token_id')
else:
# if generation_kwargs is not provided, set default values
generation_kwargs = dict()
gen_args['temperature'] = 0.0
gen_args['top_k'] = 1
gen_args['top_p'] = 0.9
gen_args['stop_token_ids'] = None
if accelerator == 'lmdeploy':
get_logger().info(
f'Transforming {model["abbr"]} to {accelerator}')
model = dict(
type= # noqa E251
f'{TurboMindModel.__module__}.{TurboMindModel.__name__}',
abbr=model['abbr'].replace('hf', 'lmdeploy')
if '-hf' in model['abbr'] else model['abbr'] + '-lmdeploy',
path=model['path'],
engine_config=dict(session_len=model['max_seq_len'],
max_batch_size=model['batch_size'],
tp=model['run_cfg']['num_gpus']),
gen_config=dict(top_k=gen_args['top_k'],
temperature=gen_args['temperature'],
top_p=gen_args['top_p'],
max_new_tokens=model['max_out_len'],
stop_words=gen_args['stop_token_ids']),
max_out_len=model['max_out_len'],
max_seq_len=model['max_seq_len'],
batch_size=model['batch_size'],
concurrency=model['batch_size'],
run_cfg=model['run_cfg'],
)
for item in ['meta_template']:
if model.get(item) is not None:
model.update(item, model[item])
elif accelerator == 'vllm':
get_logger().info(
f'Transforming {model["abbr"]} to {accelerator}')
model = dict(
type=f'{VLLM.__module__}.{VLLM.__name__}',
abbr=model['abbr'].replace('hf', 'vllm')
if '-hf' in model['abbr'] else model['abbr'] + '-vllm',
path=model['path'],
model_kwargs=dict(
tensor_parallel_size=model['run_cfg']['num_gpus']),
max_out_len=model['max_out_len'],
max_seq_len=model['max_seq_len'],
batch_size=model['batch_size'],
generation_kwargs=generation_kwargs,
run_cfg=model['run_cfg'],
)
for item in ['meta_template', 'end_str']:
if model.get(item) is not None:
model.update(item, model[item])
generation_kwargs.update(
dict(temperature=gen_args['temperature']))
else:
raise ValueError(f'Unsupported accelerator {accelerator}')
model_accels.append(model)
return model_accels
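For reference, the lmdeploy branch rewrites a HuggingFace entry roughly as follows (abbr and path are placeholders; defaults apply when no generation_kwargs are set):

    # input:
    #   dict(type=HuggingFaceCausalLM, abbr='internlm2-7b-hf', path='internlm/internlm2-7b',
    #        max_out_len=100, max_seq_len=2048, batch_size=8, run_cfg=dict(num_gpus=1))
    # output:
    #   dict(type='opencompass.models.turbomind.TurboMindModel', abbr='internlm2-7b-lmdeploy',
    #        path='internlm/internlm2-7b',
    #        engine_config=dict(session_len=2048, max_batch_size=8, tp=1),
    #        gen_config=dict(top_k=1, temperature=0.0, top_p=0.9, max_new_tokens=100, stop_words=None),
    #        max_out_len=100, max_seq_len=2048, batch_size=8, concurrency=8, run_cfg=dict(num_gpus=1))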
def exec_mm_infer_runner(tasks, args, cfg): def exec_mm_infer_runner(tasks, args, cfg):
"""execute multimodal infer runner according to args.""" """execute multimodal infer runner according to args."""
if args.slurm: if args.slurm: