[Sync] Add InternLM2 Keyset Evaluation Demo (#807)

Co-authored-by: zhangyifan1 <zhangyifan1@pjlab.org.cn>
Fengzhe Zhou 2024-01-17 13:48:12 +08:00 committed by GitHub
parent acae560911
commit b4afe3e7c1
54 changed files with 1554 additions and 106 deletions

View File

@@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .CIBench_gen_8ab0dc import ci_datasets # noqa: F401, F403

View File

@@ -0,0 +1,36 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import CIBenchDataset, CIBenchEvaluator
cibench_reader_cfg = dict(
input_columns=["questions"],
output_column="references",
train_split='test',
test_split='test')
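# Both reader splits point at 'test': CIBench is evaluation-only, so no separate train split is read.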
cibench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{questions}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer, infer_mode='every'),
)
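# infer_mode='every' above makes the AgentInferencer run and score every interaction turn of a session, not just the final one.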
libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")
cibench_datasets = [
dict(
abbr=f"cibench_generation_{lib}",
type=CIBenchDataset,
path=f"./data/cibench/{lib}",
internet_check=False,
reader_cfg=cibench_reader_cfg,
infer_cfg=cibench_infer_cfg,
eval_cfg=cibench_eval_cfg,
) for lib in libs
]

View File

@@ -0,0 +1,40 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import CIBenchTemplateDataset, CIBenchEvaluator
cibench_reader_cfg = dict(
input_columns=["questions"],
output_column="references",
train_split='test',
test_split='test')
cibench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{questions}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer, infer_mode='every'),
)
# no tensorboard
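# Each entry below is a path suffix: '/lib' selects the English split and '_chinese/lib' its Chinese counterpart.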
libs = ['/lightgbm', '/matplotlib', '/nltk', '/opencv', '/pandas', '/pytorch',
'/scipy', '/seaborn', '/sklearn', '/tensorflow',
'_chinese/lightgbm', '_chinese/matplotlib', '_chinese/nltk',
'_chinese/opencv', '_chinese/pandas', '_chinese/pytorch',
'_chinese/scipy', '_chinese/seaborn', '_chinese/sklearn', '_chinese/tensorflow']
cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")
cibench_datasets = [
dict(
abbr=f"cibench_template{lib}",
type=CIBenchTemplateDataset,
path=f"./data/cibench_dataset/cibench_template{lib}",
internet_check=False,
reader_cfg=cibench_reader_cfg,
infer_cfg=cibench_infer_cfg,
eval_cfg=cibench_eval_cfg,
) for lib in libs
]

View File

@@ -0,0 +1,128 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
PROMPT_EN = {
"FEWSHOT_INSTRUCTION_CLOZE" : [
dict(role='HUMAN', prompt='Mark\'s basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What\'s the total number of points scored by both teams added together?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result\n```'),
dict(role='SYSTEM', prompt='Response:201'),
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 201'),
dict(role='HUMAN', prompt='Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n marbles = 60\n num_increased_marbles = marbles * 2 / 5\n num_total_marbles = marbles + num_increased_marbles\n frisbees = marbles / 2\n num_increased_frisbees = frisbees * 2 / 5\n num_total_frisbees = frisbees + num_increased_frisbees\n deck_cards = frisbees - 20\n num_increased_deck_cards = deck_cards * 2 / 5\n num_total_deck_cards = deck_cards + num_increased_deck_cards\n num_total = num_total_marbles + num_total_frisbees + num_total_deck_cards\n result = num_total\n return result\n```'),
dict(role='SYSTEM', prompt='Response:140'),
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 140'),
dict(role='HUMAN', prompt='A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?'),
dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n num_fruits_per_first_three_basket = 9 + 15 + 14\n num_fruits_first_three_basket = num_fruits_per_first_three_basket * 3\n num_apple_fourth_basket = 9 - 2\n num_orange_fourth_basket = 15 - 2\n num_banana_fourth_basket = 14 - 2\n num_fruits_fourth_basket = num_apple_fourth_basket + num_orange_fourth_basket + num_banana_fourth_basket\n num_fruits_total = num_fruits_first_three_basket + num_fruits_fourth_basket\n result = num_fruits_total\n return result\n```"""),
dict(role='SYSTEM', prompt='Response:146'),
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 146'),
dict(role='HUMAN', prompt='{question}'),
],
"FEWSHOT_INSTRUCTION_CHOICE" : [
dict(role='HUMAN', prompt='Given point P(-1,4) lies on the graph of the inverse proportionality function $y=\\frac{{k}}{{x}}$ (k≠0), what is the value of k? A. $-\\frac{{1}}{{4}}$ B. $\\frac{{1}}{{4}}$ C. $4$ D. $-4$'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\nfrom sympy import solve, symbols, Ne, Eq\ndef solution():\n k = symbols(\'k\')\n result = solve([Eq(4, k / (-1)), Ne(k, 0)], k, dict=True)\n return result\n```'),
dict(role='SYSTEM', prompt='Response:{{Eq(k, -4)}}'),
dict(role='BOT', prompt='Thought: Substituting point P into the function yields the value of k as $-4$\nFinalAnswer: D'),
dict(role='HUMAN', prompt='The graph of the power function $y=f(x)$ passes through the point $(2, \\dfrac {{1}}{{4}})$, what is the value of $f(-3)$? A. $\\frac{{1}}{{9}}$ B. $\\frac{{1}}{{8}}$ C. $\\frac{{2}}{{9}}$ D. $\\frac{{1}}{{4}}$'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\nfrom sympy import *\ndef solution():\n x, y, k = symbols(\'x y k\')\n eq1 = Eq(2**k, Rational(1, 4))\n k_value = solve(eq1, k)[0]\n y = x**k_value\n result = y.subs(x, -3)\n return result\n```'),
dict(role='SYSTEM', prompt='Response:1/9'),
dict(role='BOT', prompt='Thought: The functional expression of the power function is $y=x^{{-2}}$. Substituting $x=-3$ yields $y=\\frac{{1}}{{9}}$\nFinalAnswer: A'),
dict(role='HUMAN', prompt='If $3 x-y=12$, what is the value of $\\frac{8^{x}}{2^{y}} ?$\nA. The value cannot be determined from the information given.\nB. $2^{12}$\nC. 4\nD. $8^{2}$'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\nfrom sympy import symbols, Eq, solve\n\ndef solution():\n x, y = symbols(\'x y\')\n equation = Eq(3*x - y, 12)\n\n y_in_terms_of_x = solve(equation, y)[0]\n expression = 8**x / 2**y_in_terms_of_x\n result = expression.simplify()\n return result\n```'),
dict(role='SYSTEM', prompt='Response:2**12'),
dict(role='BOT', prompt='Thought: The value of $\\frac{8^{x}}{2^{y}}$ is $2^{12}$\nFinalAnswer: B'),
dict(role='HUMAN', prompt='{question}'),
]
}
PROMPT_CN = {
"FEWSHOT_INSTRUCTION_CLOZE" : [
dict(role='HUMAN', prompt='Mark的篮球队得到25个2分球、8个3分球和10个罚球。他们的对手得到两倍的2分球,但3分球和罚球只有一半。两队得分的总和是多少?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result\n```'),
dict(role='SYSTEM', prompt='Response:201'),
dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 201'),
dict(role='HUMAN', prompt='Bella的弹珠数量是飞盘的两倍,飞盘又比卡片多20个。如果她现在有60颗弹珠,并且每种物品都多买2/5,她总共会有多少件物品?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n marbles = 60\n num_increased_marbles = marbles * 2 / 5\n num_total_marbles = marbles + num_increased_marbles\n frisbees = marbles / 2\n num_increased_frisbees = frisbees * 2 / 5\n num_total_frisbees = frisbees + num_increased_frisbees\n deck_cards = frisbees - 20\n num_increased_deck_cards = deck_cards * 2 / 5\n num_total_deck_cards = deck_cards + num_increased_deck_cards\n num_total = num_total_marbles + num_total_frisbees + num_total_deck_cards\n result = num_total\n return result\n```'),
dict(role='SYSTEM', prompt='Response:140'),
dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 140'),
dict(role='HUMAN', prompt='一组4个水果篮子,前三个篮子里各有9个苹果、15个橙子和14个香蕉,第四个篮子里每种水果都少2个。总共有多少个水果?'),
dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n num_fruits_per_first_three_basket = 9 + 15 + 14\n num_fruits_first_three_basket = num_fruits_per_first_three_basket * 3\n num_apple_fourth_basket = 9 - 2\n num_orange_fourth_basket = 15 - 2\n num_banana_fourth_basket = 14 - 2\n num_fruits_fourth_basket = num_apple_fourth_basket + num_orange_fourth_basket + num_banana_fourth_basket\n num_fruits_total = num_fruits_first_three_basket + num_fruits_fourth_basket\n result = num_fruits_total\n return result\n```"""),
dict(role='SYSTEM', prompt='Response:146'),
dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 146'),
dict(role='HUMAN', prompt='{question}'),
],
"FEWSHOT_INSTRUCTION_CHOICE" : [
dict(role='HUMAN', prompt='已知点P(-1,4)在反比例函数$y=\\frac{{k}}{{x}}$ (k≠0)的图象上,则k的值是____'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\nfrom sympy import solve, symbols, Ne, Eq\ndef solution():\n k = symbols(\'k\')\n result = solve([Eq(4, k / (-1)), Ne(k, 0)], k, dict=True)\n return result\n```'),
dict(role='SYSTEM', prompt='Response:{{Eq(k, -4)}}'),
dict(role='BOT', prompt='Thought: 将点 P 代入函数,解出 k 的值为 $-4$\nFinalAnswer: D'),
dict(role='HUMAN', prompt='幂函数$ y=f(x) $的图象经过点$ (2, \\dfrac {{1}}{{4}}) $,则$ f(-3) $的值为 ______ '),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\nfrom sympy import *\ndef solution():\n x, y, k = symbols(\'x y k\')\n eq1 = Eq(2**k, Rational(1, 4))\n k_value = solve(eq1, k)[0]\n y = x**k_value\n result = y.subs(x, -3)\n return result\n```'),
dict(role='SYSTEM', prompt='Response:1/9'),
dict(role='BOT', prompt='Thought: 求出幂函数的函数表达式为 $y=x^{{-2}}$,代入 $x=-3$ 得到 $y=\\frac{{1}}{{9}}$\nFinalAnswer: A'),
dict(role='HUMAN', prompt='如果$3 x-y=12$,则$\\frac{8^{x}}{2^{y}}$的值是多少?\nA. 无法从给定的信息中确定值。\nB. $2^{12}$\nC. 4\nD. $8^{2}$'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\nfrom sympy import symbols, Eq, solve\n\ndef solution():\n x, y = symbols(\'x y\')\n equation = Eq(3*x - y, 12)\n\n y_in_terms_of_x = solve(equation, y)[0]\n expression = 8**x / 2**y_in_terms_of_x\n result = expression.simplify()\n return result\n```'),
dict(role='SYSTEM', prompt='Response:2**12'),
dict(role='BOT', prompt='Thought: $\\frac{8^{x}}{2^{y}}$的值是$2^{12}$\nFinalAnswer: B'),
dict(role='HUMAN', prompt='{question}'),
]
}
mathbench_sets = {
'college': ['single_choice_cn', 'cloze_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn'],
'primary': ['cloze_cn'],
'primary_refine': ['refine_cloze_cn']
}
# Use circular evaluation or not
with_circular_eval = True
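# Circular evaluation re-poses each multiple-choice question under rotated option orderings (ABCD, BCDA, ...) to reduce option-position bias.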
mathbench_agent_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
prompt_example = PROMPT_CN if '_cn' in _name else PROMPT_EN
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate,
template=dict(
round = prompt_example["FEWSHOT_INSTRUCTION_CLOZE"] if 'cloze' in _name else prompt_example["FEWSHOT_INSTRUCTION_CHOICE"])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer)
)
mathbench_eval_cfg = dict(
evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
mathbench_agent_datasets.append(
dict(
abbr="mathbench-" + _split + '-' + _name + '-agent',
type=MathBenchDataset,
path=f"./data/mathbench/{_split}",
name=_name,
with_circular=with_circular_eval,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"
),
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
))

View File

@@ -0,0 +1,130 @@
# GONNA BE DEPRECATED, DON'T USE IT
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
PROMPT_EN = {
"FEWSHOT_INSTRUCTION_CLOZE" : [
dict(role='HUMAN', prompt='Mark\'s basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What\'s the total number of points scored by both teams added together?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result'),
dict(role='SYSTEM', prompt='Response:201'),
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 201'),
dict(role='HUMAN', prompt='Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n marbles = 60\n num_increased_marbles = marbles * 2 / 5\n num_total_marbles = marbles + num_increased_marbles\n frisbees = marbles / 2\n num_increased_frisbees = frisbees * 2 / 5\n num_total_frisbees = frisbees + num_increased_frisbees\n deck_cards = frisbees - 20\n num_increased_deck_cards = deck_cards * 2 / 5\n num_total_deck_cards = deck_cards + num_increased_deck_cards\n num_total = num_total_marbles + num_total_frisbees + num_total_deck_cards\n result = num_total\n return result'),
dict(role='SYSTEM', prompt='Response:140'),
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 140'),
dict(role='HUMAN', prompt='A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?'),
dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():\n num_fruits_per_first_three_basket = 9 + 15 + 14\n num_fruits_first_three_basket = num_fruits_per_first_three_basket * 3\n num_apple_fourth_basket = 9 - 2\n num_orange_fourth_basket = 15 - 2\n num_banana_fourth_basket = 14 - 2\n num_fruits_fourth_basket = num_apple_fourth_basket + num_orange_fourth_basket + num_banana_fourth_basket\n num_fruits_total = num_fruits_first_three_basket + num_fruits_fourth_basket\n result = num_fruits_total\n return result"""),
dict(role='SYSTEM', prompt='Response:146'),
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 146'),
dict(role='HUMAN', prompt='{question}'),
],
"FEWSHOT_INSTRUCTION_CHOICE" : [
dict(role='HUMAN', prompt='Given point P(-1,4) lies on the graph of the inverse proportionality function $y=\\frac{{k}}{{x}}$ (k≠0), what is the value of k? A. $-\\frac{{1}}{{4}}$ B. $\\frac{{1}}{{4}}$ C. $4$ D. $-4$'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import solve, symbols, Ne, Eq\ndef solution():\n k = symbols(\'k\')\n result = solve([Eq(4, k / (-1)), Ne(k, 0)], k, dict=True)\n return result'),
dict(role='SYSTEM', prompt='Response:{{Eq(k, -4)}}'),
dict(role='BOT', prompt='Thought: Substituting point P into the function yields the value of k as $-4$\nFinalAnswer: D'),
dict(role='HUMAN', prompt='The graph of the power function $y=f(x)$ passes through the point $(2, \\dfrac {{1}}{{4}})$, what is the value of $f(-3)$? A. $\\frac{{1}}{{9}}$ B. $\\frac{{1}}{{8}}$ C. $\\frac{{2}}{{9}}$ D. $\\frac{{1}}{{4}}$'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import *\ndef solution():\n x, y, k = symbols(\'x y k\')\n eq1 = Eq(2**k, Rational(1, 4))\n k_value = solve(eq1, k)[0]\n y = x**k_value\n result = y.subs(x, -3)\n return result'),
dict(role='SYSTEM', prompt='Response:1/9'),
dict(role='BOT', prompt='Thought: The functional expression of the power function is $y=x^{{-2}}$. Substituting $x=-3$ yields $y=\\frac{{1}}{{9}}$\nFinalAnswer: A'),
dict(role='HUMAN', prompt='If $3 x-y=12$, what is the value of $\\frac{8^{x}}{2^{y}} ?$\nA. The value cannot be determined from the information given.\nB. $2^{12}$\nC. 4\nD. $8^{2}$'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import symbols, Eq, solve\n\ndef solution():\n x, y = symbols(\'x y\')\n equation = Eq(3*x - y, 12)\n\n y_in_terms_of_x = solve(equation, y)[0]\n expression = 8**x / 2**y_in_terms_of_x\n result = expression.simplify()\n return result'),
dict(role='SYSTEM', prompt='Response:2**12'),
dict(role='BOT', prompt='Thought: The value of $\\frac{8^{x}}{2^{y}}$ is $2^{12}$\nFinalAnswer: B'),
dict(role='HUMAN', prompt='{question}'),
]
}
PROMPT_CN = {
"FEWSHOT_INSTRUCTION_CLOZE" : [
dict(role='HUMAN', prompt='Mark的篮球队得到25个2分球、8个3分球和10个罚球。他们的对手得到两倍的2分球,但3分球和罚球只有一半。两队得分的总和是多少?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result'),
dict(role='SYSTEM', prompt='Response:201'),
dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 201'),
dict(role='HUMAN', prompt='Bella的弹珠数量是飞盘的两倍,飞盘又比卡片多20个。如果她现在有60颗弹珠,并且每种物品都多买2/5,她总共会有多少件物品?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n marbles = 60\n num_increased_marbles = marbles * 2 / 5\n num_total_marbles = marbles + num_increased_marbles\n frisbees = marbles / 2\n num_increased_frisbees = frisbees * 2 / 5\n num_total_frisbees = frisbees + num_increased_frisbees\n deck_cards = frisbees - 20\n num_increased_deck_cards = deck_cards * 2 / 5\n num_total_deck_cards = deck_cards + num_increased_deck_cards\n num_total = num_total_marbles + num_total_frisbees + num_total_deck_cards\n result = num_total\n return result'),
dict(role='SYSTEM', prompt='Response:140'),
dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 140'),
dict(role='HUMAN', prompt='一组4个水果篮子,前三个篮子里各有9个苹果、15个橙子和14个香蕉,第四个篮子里每种水果都少2个。总共有多少个水果?'),
dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():\n num_fruits_per_first_three_basket = 9 + 15 + 14\n num_fruits_first_three_basket = num_fruits_per_first_three_basket * 3\n num_apple_fourth_basket = 9 - 2\n num_orange_fourth_basket = 15 - 2\n num_banana_fourth_basket = 14 - 2\n num_fruits_fourth_basket = num_apple_fourth_basket + num_orange_fourth_basket + num_banana_fourth_basket\n num_fruits_total = num_fruits_first_three_basket + num_fruits_fourth_basket\n result = num_fruits_total\n return result"""),
dict(role='SYSTEM', prompt='Response:146'),
dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 146'),
dict(role='HUMAN', prompt='{question}'),
],
"FEWSHOT_INSTRUCTION_CHOICE" : [
dict(role='HUMAN', prompt='已知点P(-1,4)在反比例函数$y=\\frac{{k}}{{x}}$ (k≠0)的图象上,则k的值是____'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import solve, symbols, Ne, Eq\ndef solution():\n k = symbols(\'k\')\n result = solve([Eq(4, k / (-1)), Ne(k, 0)], k, dict=True)\n return result'),
dict(role='SYSTEM', prompt='Response:{{Eq(k, -4)}}'),
dict(role='BOT', prompt='Thought: 将点 P 代入函数,解出 k 的值为 $-4$\nFinalAnswer: D'),
dict(role='HUMAN', prompt='幂函数$ y=f(x) $的图象经过点$ (2, \\dfrac {{1}}{{4}}) $,则$ f(-3) $的值为 ______ '),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import *\ndef solution():\n x, y, k = symbols(\'x y k\')\n eq1 = Eq(2**k, Rational(1, 4))\n k_value = solve(eq1, k)[0]\n y = x**k_value\n result = y.subs(x, -3)\n return result'),
dict(role='SYSTEM', prompt='Response:1/9'),
dict(role='BOT', prompt='Thought: 求出幂函数的函数表达式为 $y=x^{{-2}}$,代入 $x=-3$ 得到 $y=\\frac{{1}}{{9}}$\nFinalAnswer: A'),
dict(role='HUMAN', prompt='如果$3 x-y=12$,则$\\frac{8^{x}}{2^{y}}$的值是多少?\nA. 无法从给定的信息中确定值。\nB. $2^{12}$\nC. 4\nD. $8^{2}$'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import symbols, Eq, solve\n\ndef solution():\n x, y = symbols(\'x y\')\n equation = Eq(3*x - y, 12)\n\n y_in_terms_of_x = solve(equation, y)[0]\n expression = 8**x / 2**y_in_terms_of_x\n result = expression.simplify()\n return result'),
dict(role='SYSTEM', prompt='Response:2**12'),
dict(role='BOT', prompt='Thought: $\\frac{8^{x}}{2^{y}}$的值是$2^{12}$\nFinalAnswer: B'),
dict(role='HUMAN', prompt='{question}'),
]
}
mathbench_sets = {
'college': ['single_choice_cn', 'cloze_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn'],
'primary': ['cloze_cn'],
'primary_refine': ['refine_cloze_cn']
}
# Use circular evaluation or not
with_circular_eval = True
mathbench_agent_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
prompt_example = PROMPT_CN if '_cn' in _name else PROMPT_EN
mathbench_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate,
template=dict(
round = prompt_example["FEWSHOT_INSTRUCTION_CLOZE"] if 'cloze' in _name else prompt_example["FEWSHOT_INSTRUCTION_CHOICE"])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer)
)
mathbench_eval_cfg = dict(
evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
mathbench_agent_datasets.append(
dict(
abbr="mathbench-" + _split + '-' + _name + '-agent',
type=MathBenchDataset,
path=f"./data/mathbench/{_split}",
name=_name,
with_circular=with_circular_eval,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"
),
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
))

View File

@@ -0,0 +1,58 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
cloze_prompts ={
"cloze_arith_en": [
dict(role='HUMAN', prompt='Q: Calculate (341/11)/(9/(-6)*(-2)/3).'),
dict(role='BOT', prompt='A: First, (9/(-6)*(-2)/3) can be simplified step by step: 9/(-6) = -1.5, -1.5 * (-2) = 3, 3 / 3 = 1. So, (9/(-6)*(-2)/3) is equal to 1. Now, we have `(341/11)/1` equals `341/11`. Finally, calculate `341/11 = 31`. The answer is 31.\n'),
dict(role='HUMAN', prompt='Q: In base 14, what is 5 - 638d8d?'),
dict(role='BOT', prompt='A: 5 - 638d8d = -638d88. The answer is -638d88.\n'),
dict(role='HUMAN', prompt='Q: What is -491354 times -0.34?'),
dict(role='BOT', prompt='A: The product of -491354 and -0.34 is 167060.36. The answer is 167060.36.\n'),
dict(role='HUMAN', prompt='Q: What is the value of (-55)/(6930/(-382)) + (0 - 3)?'),
dict(role='BOT', prompt='A: First, (-55)/(6930/(-382)) = (-55)/(-(6930/382)) = 55*382/6930 = 21010/6930 = 2101/693. Then, 2101/693 + (0 - 3) = 2101/693 - 3 = 2101/693 - 3*693/693 = (2101-2079)/693 = 22/693 = 2/63. The answer is 2/63.\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}\n'),
]
}
mathbench_sets = {
'arithmetic': ['cloze_arith_en'],
}
mathbench_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
mathbench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=cloze_prompts[_name],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
mathbench_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=mathbench_postprocess, name=_name))
mathbench_datasets.append(
dict(
type=MathBenchDataset,
path=f"./data/mathbench/{_split}",
name=_name,
with_circular=False,
abbr="mathbench-arithmetic" + _split + '-' + _name,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"
),
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
))

View File

@@ -0,0 +1,110 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
"single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题请你一步一步推理并在最后用“所以答案为选项X”给出答案其中“X”为选项ABCD中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n",
"single_choice_cn": "以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:",
"single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from ABCD. Here is the question you need to answer:\n{question}\nLet's think step by step:",
"single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:",
}
cloze_prompts = {
"cloze_cn": [
dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后将有21棵树。林务工人员今天种植了多少棵树?'),
dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?'),
dict(role='BOT', prompt='A: 黎恩有32块巧克力,她的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在杰森只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?'),
dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,她从她的爸爸和妈妈那里各得到了两个玩具。现在她有多少个玩具?'),
dict(role='BOT', prompt='A: 她有5个玩具。她从妈妈那里得到了2个,所以之后她有5 + 2 = 7个玩具。然后她从爸爸那里得到了2个,所以总共她有7 + 2 = 9个玩具。答案是 9\n'),
dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四,每天增加五台电脑。现在服务器房里有多少台电脑?'),
dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29\n'),
dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二他丢失了23个高尔夫球。星期三他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}'),
],
"cloze_en": [
dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}\n'),
]}
cloze_prompts['refine_cloze_cn'] = cloze_prompts['cloze_cn']
mathbench_sets = {
'college': ['single_choice_cn', 'cloze_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn'],
'primary': ['cloze_cn'],
'primary_refine': ['refine_cloze_cn']
}
# Generate reasoning path or not, only for single choice
with_reasoning = True
# Use circular evaluation or not
with_circular_eval = True
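# With circular evaluation, CircularEvaluator additionally scores choice questions across rotated option orderings (e.g. the perf_4 metric requires all four rotations to be answered correctly).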
mathbench_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
mathbench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt=single_choice_prompts[_name + "_with_reasoning"] if with_reasoning else single_choice_prompts[_name],
),
dict(role="BOT", prompt="{answer}")] if 'choice' in _name else cloze_prompts[_name],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
mathbench_eval_cfg = dict(
evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
mathbench_datasets.append(
dict(
abbr="mathbench-" + _split + '-' + _name,
type=MathBenchDataset,
path=f"./data/mathbench/{_split}",
name=_name,
with_circular=with_circular_eval,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"
),
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
))

View File

@@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .mathbench_gen_7b734b import mathbench_datasets # noqa: F401, F403

View File

@@ -0,0 +1,110 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
"single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题请你一步一步推理并在最后用“所以答案为选项X”给出答案其中“X”为选项ABCD中你认为正确的选项。下面是你要回答的问题\n{question}\n让我们一步一步思考:\n",
"single_choice_cn": "以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:",
"single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please reason through it step by step, and at the end, provide your answer option with 'Therefore, the correct answer is option X', Where 'X' is the correct option you think from ABCD. Here is the question you need to answer:\n{question}\nLet's think step by step:",
"single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:",
}
cloze_prompts = {
"cloze_cn": [
dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后将有21棵树。林务工人员今天种植了多少棵树?'),
dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
dict(role='HUMAN', prompt='Q: 如果停车场有3辆车,又有2辆车进来,停车场里有多少辆车?'),
dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
dict(role='HUMAN', prompt='Q: 黎恩有32块巧克力,她的妹妹有42块。如果他们吃了35块,他们总共剩下多少块?'),
dict(role='BOT', prompt='A: 黎恩有32块巧克力,她的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
dict(role='HUMAN', prompt='Q: 杰森有20个棒棒糖。他给丹妮一些棒棒糖。现在杰森只剩下12个棒棒糖。杰森给丹妮多少个棒棒糖?'),
dict(role='BOT', prompt='A: 杰森有20个棒棒糖。因为他现在只剩下12个,所以他必须把剩下的都给了丹妮。他给丹妮的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
dict(role='HUMAN', prompt='Q: 莎莎有五个玩具。在圣诞节,她从她的爸爸和妈妈那里各得到了两个玩具。现在她有多少个玩具?'),
dict(role='BOT', prompt='A: 她有5个玩具。她从妈妈那里得到了2个,所以之后她有5 + 2 = 7个玩具。然后她从爸爸那里得到了2个,所以总共她有7 + 2 = 9个玩具。答案是 9\n'),
dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四,每天增加五台电脑。现在服务器房里有多少台电脑?'),
dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑,所以现在有9 + 20 = 29台电脑。答案是 29\n'),
dict(role='HUMAN', prompt='Q: 迈克尔有58个高尔夫球。星期二他丢失了23个高尔夫球。星期三他又丢失了2个。星期三结束时他还剩下多少个高尔夫球?'),
dict(role='BOT', prompt='A: 迈克尔一开始有58个球。星期二他丢失了23个,所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个,所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
dict(role='HUMAN', prompt='Q: 奥利弗有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱?'),
dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元,所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}'),
],
"cloze_en": [
dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}\n'),
]}
cloze_prompts['refine_cloze_cn'] = cloze_prompts['cloze_cn']
mathbench_sets = {
'college': ['single_choice_cn', 'cloze_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn'],
'primary': ['cloze_cn'],
'primary_refine': ['refine_cloze_cn']
}
# Generate reasoning path or not, only for single choice
with_reasoning = False
# Use circular evaluation or not
with_circular_eval = True
mathbench_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
mathbench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt=single_choice_prompts[_name + "_with_reasoning"] if with_reasoning else single_choice_prompts[_name],
),
dict(role="BOT", prompt="{answer}")] if 'choice' in _name else cloze_prompts[_name],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
mathbench_eval_cfg = dict(
evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
mathbench_datasets.append(
dict(
abbr="mathbench-" + _split + '-' + _name,
type=MathBenchDataset,
path=f"./data/mathbench/{_split}",
name=_name,
with_circular=with_circular_eval,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"
),
infer_cfg=mathbench_infer_cfg,
eval_cfg=mathbench_eval_cfg,
))

View File

@@ -4,7 +4,7 @@ from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator
humanevalx_reader_cfg = dict(
- input_columns=['prompt'], output_column='task_id', train_split='test')
+ input_columns=['prompt'], output_column='declaration', train_split='test')
humanevalx_infer_cfg = dict(
prompt_template=dict(

View File

@@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.custom import OptionSimAccEvaluator
from opencompass.datasets import siqaDataset_V3
siqa_reader_cfg = dict(
input_columns=["context", "question", "A", "B", "C"],
output_column="answer",
test_split="validation")
siqa_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt=
"{context}\nQuestion: {question}\nA. {A}\nB. {B}\nC. {C}\nAnswer:"
)
], ),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
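# OptionSimAccEvaluator maps each free-form generation to the closest of the listed options (A/B/C) by string similarity before computing accuracy.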
siqa_eval_cfg = dict(
evaluator=dict(type=OptionSimAccEvaluator, options='ABC'),
pred_role="BOT",
)
siqa_datasets = [
dict(
abbr="siqa",
type=siqaDataset_V3,
path='./data/siqa',
reader_cfg=siqa_reader_cfg,
infer_cfg=siqa_infer_cfg,
eval_cfg=siqa_eval_cfg)
]

View File

@@ -0,0 +1,37 @@
from copy import deepcopy
from mmengine.config import read_base
with read_base():
from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
from .datasets.agieval.agieval_gen_64afd3 import agieval_datasets
from .datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from .datasets.math.math_evaluatorv2_gen_265cce import math_datasets
from .datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
from .datasets.mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
from .models.hf_internlm.hf_internlm2_chat_7b import models as hf_internlm2_chat_7b_model
from .models.hf_internlm.hf_internlm2_chat_20b import models as hf_internlm2_chat_20b_model
from .summarizers.internlm2_keyset import summarizer
work_dir = './outputs/internlm2-chat-keyset/'
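# Collect every imported *_datasets / *_model list from the read_base() block above.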
_origin_datasets = sum([v for k, v in locals().items() if k.endswith("_datasets")], [])
_origin_models = sum([v for k, v in locals().items() if k.endswith("_model")], [])
_vanilla_datasets = [deepcopy(d) for d in _origin_datasets]
_vanilla_models = []
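# Strip any SYSTEM round from the chat meta templates so all models are prompted without a system turn.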
for m in _origin_models:
m = deepcopy(m)
if 'meta_template' in m and 'round' in m['meta_template']:
round = m['meta_template']['round']
if any(r['role'] == 'SYSTEM' for r in round):
new_round = [r for r in round if r['role'] != 'SYSTEM']
print(f'WARNING: remove SYSTEM round in meta_template for {m.get("abbr", None)}')
m['meta_template']['round'] = new_round
_vanilla_models.append(m)
datasets = _vanilla_datasets
models = _vanilla_models
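# Typical launch (assuming this file lives at configs/eval_internlm2_chat_keyset.py):
#   python run.py configs/eval_internlm2_chat_keyset.py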

View File

@@ -0,0 +1,20 @@
from mmengine.config import read_base
with read_base():
from .datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
from .datasets.agieval.agieval_mixed_2f14ad import agieval_datasets
from .datasets.bbh.bbh_gen_5b92b0 import bbh_datasets
from .datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
from .datasets.math.math_gen_265cce import math_datasets
from .datasets.humaneval.humaneval_gen_a82cae import humaneval_datasets
from .datasets.mbpp.sanitized_mbpp_gen_1e1056 import sanitized_mbpp_datasets
from .models.hf_internlm.hf_internlm2_7b import models as hf_internlm2_7b_model
from .models.hf_internlm.hf_internlm2_20b import models as hf_internlm2_20b_model
from .summarizers.internlm2_keyset import summarizer
work_dir = './outputs/internlm2-keyset/'
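# As in the chat variant, flatten all imported *_datasets / *_model lists into the run configuration.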
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets")], [])
models = sum([v for k, v in locals().items() if k.endswith("_model")], [])

View File

@@ -0,0 +1,24 @@
from opencompass.models import HuggingFaceCausalLM
models = [
dict(
type=HuggingFaceCausalLM,
abbr='deepseek-moe-16b-base-hf',
path="deepseek-ai/deepseek-moe-16b-base",
tokenizer_path='deepseek-ai/deepseek-moe-16b-base',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_out_len=100,
min_out_len=3,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=2, num_procs=1),
)
]

View File

@@ -0,0 +1,32 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='User: ', end='\n\n'),
dict(role="BOT", begin="Assistant: ", end='<end▁of▁sentence>', generate=True),
],
)
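# '<|end▁of▁sentence|>' is DeepSeek's end-of-sequence marker; end_str below truncates generations at it.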
models = [
dict(
type=HuggingFaceCausalLM,
abbr='deepseek-moe-16b-chat-hf',
path="deepseek-ai/deepseek-moe-16b-chat",
tokenizer_path='deepseek-ai/deepseek-moe-16b-chat',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=2, num_procs=1),
end_str='<|end▁of▁sentence|>',
)
]

View File

@@ -0,0 +1,25 @@
from opencompass.models import HuggingFaceCausalLM
models = [
dict(
type=HuggingFaceCausalLM,
abbr='internlm2-20b-hf',
path="internlm/internlm2-20b",
tokenizer_path='internlm/internlm2-20b',
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
trust_remote_code=True,
),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=2, num_procs=1),
)
]

View File

@@ -0,0 +1,26 @@
from opencompass.models import HuggingFaceCausalLM
models = [
dict(
type=HuggingFaceCausalLM,
abbr='internlm2-7b-hf',
path="internlm/internlm2-7b",
tokenizer_path='internlm/internlm2-7b',
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
trust_remote_code=True,
),
max_out_len=100,
min_out_len=3,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@@ -0,0 +1,36 @@
from opencompass.models import HuggingFaceCausalLM
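# InternLM2 chat markup: '[UNUSED_TOKEN_146]' opens a role turn and '[UNUSED_TOKEN_145]' (token id 92542) closes it.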
_meta_template = dict(
round=[
dict(role='HUMAN', begin='[UNUSED_TOKEN_146]user\n', end='[UNUSED_TOKEN_145]\n'),
dict(role='SYSTEM', begin='[UNUSED_TOKEN_146]system\n', end='[UNUSED_TOKEN_145]\n'),
dict(role='BOT', begin='[UNUSED_TOKEN_146]assistant\n', end='[UNUSED_TOKEN_145]\n', generate=True),
],
eos_token_id=92542
)
models = [
dict(
type=HuggingFaceCausalLM,
abbr='internlm2-chat-20b-hf',
path="internlm/internlm2-chat-20b",
tokenizer_path='internlm/internlm2-chat-20b',
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
trust_remote_code=True,
),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=2, num_procs=1),
end_str='[UNUSED_TOKEN_145]',
)
]

View File

@@ -0,0 +1,36 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role='HUMAN', begin='[UNUSED_TOKEN_146]user\n', end='[UNUSED_TOKEN_145]\n'),
dict(role='SYSTEM', begin='[UNUSED_TOKEN_146]system\n', end='[UNUSED_TOKEN_145]\n'),
dict(role='BOT', begin='[UNUSED_TOKEN_146]assistant\n', end='[UNUSED_TOKEN_145]\n', generate=True),
],
eos_token_id=92542
)
models = [
dict(
type=HuggingFaceCausalLM,
abbr='internlm2-chat-7b-hf',
path="internlm/internlm2-chat-7b",
tokenizer_path='internlm/internlm2-chat-7b',
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
trust_remote_code=True,
),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='[UNUSED_TOKEN_145]',
)
]

View File

@@ -0,0 +1,31 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='Question:\n', end='\n'),
dict(role="BOT", begin="Answer:\n", end='\n', generate=True),
],
)
models = [
dict(
abbr='abel-7b-001',
type=HuggingFaceCausalLM,
path='GAIR/Abel-7B-001',
tokenizer_path='GAIR/Abel-7B-001',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@@ -0,0 +1,31 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='Question:\n', end='\n'),
dict(role="BOT", begin="Answer:\n", end='\n', generate=True),
],
)
models = [
dict(
abbr='abel-7b-002',
type=HuggingFaceCausalLM,
path='GAIR/Abel-7B-002',
tokenizer_path='GAIR/Abel-7B-002',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@@ -0,0 +1,33 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
begin='',
round=[
dict(role="HUMAN", begin='Question: ', end='\n\n'),
dict(role="BOT", begin="Answer: ", end='\n\n', generate=True),
],
)
models = [
dict(
abbr='arithmo-mistral-7b-hf',
type=HuggingFaceCausalLM,
path='akjindal53244/Arithmo-Mistral-7B',
tokenizer_path='akjindal53244/Arithmo-Mistral-7B',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@@ -0,0 +1,33 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
begin='Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n',
round=[
dict(role="HUMAN", begin='### Instruction:\n', end='\n\n'),
dict(role="BOT", begin="### Response:", end='\n\n', generate=True),
],
)
models = [
dict(
abbr='gsm8k-rft-llama7b2-u13b',
type=HuggingFaceCausalLM,
path='OFA-Sys/gsm8k-rft-llama7b2-u13b',
tokenizer_path='OFA-Sys/gsm8k-rft-llama7b2-u13b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@@ -0,0 +1,33 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
begin="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
round=[
dict(role="HUMAN", begin='### Instruction:\n', end='\n\n'),
dict(role="BOT", begin="### Response: ", end='\n\n', generate=True),
],
)
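# Alpaca-style instruction/response template, matching the format the MetaMath models were fine-tuned with.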
models = [
dict(
abbr='metamath-7b-v1.0-hf',
type=HuggingFaceCausalLM,
path='meta-math/MetaMath-7B-V1.0',
tokenizer_path='meta-math/MetaMath-7B-V1.0',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@@ -0,0 +1,33 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
begin="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
round=[
dict(role="HUMAN", begin='### Instruction:\n', end='\n\n'),
dict(role="BOT", begin="### Response: ", end='\n\n', generate=True),
],
)
models = [
dict(
abbr='metamath-llemma-7b-hf',
type=HuggingFaceCausalLM,
path='meta-math/MetaMath-Llemma-7B',
tokenizer_path='meta-math/MetaMath-Llemma-7B',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@@ -0,0 +1,33 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
begin="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
round=[
dict(role="HUMAN", begin='### Instruction:\n', end='\n\n'),
dict(role="BOT", begin="### Response: ", end='\n\n', generate=True),
],
)
models = [
dict(
abbr='metamath-mistral-7b-hf',
type=HuggingFaceCausalLM,
path='meta-math/MetaMath-Mistral-7B',
tokenizer_path='meta-math/MetaMath-Mistral-7B',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@@ -0,0 +1,24 @@
from opencompass.models import HuggingFaceCausalLM
models = [
dict(
type=HuggingFaceCausalLM,
abbr='phi-2-hf',
path='microsoft/phi-2',
tokenizer_path='microsoft/phi-2',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_out_len=100,
min_out_len=3,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@@ -0,0 +1,34 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='<_user>'),
dict(role="BOT", begin="<_bot>", end='<_end>', generate=True),
],
eos_token_id=160133
)
models = [
dict(
abbr='telechat-7b-hf',
type=HuggingFaceCausalLM,
path='Tele-AI/telechat-7B',
tokenizer_path='Tele-AI/telechat-7B',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<_end>',
)
]

View File

@@ -0,0 +1,25 @@
from opencompass.models import HuggingFaceCausalLM
models = [
dict(
abbr='yayi2-30b-hf',
type=HuggingFaceCausalLM,
path='wenge-research/yayi2-30b',
tokenizer_path='wenge-research/yayi2-30b',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_out_len=100,
min_out_len=3,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=4, num_procs=1),
)
]

View File

@@ -0,0 +1,33 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", end='\n\n'),
dict(role="BOT", begin="### Response:", end='</s>', generate=True),
],
)
models = [
dict(
type=HuggingFaceCausalLM,
abbr='wizardmath-7b-v1.0-hf',
path='WizardLM/WizardMath-7B-V1.0',
tokenizer_path='WizardLM/WizardMath-7B-V1.0',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='</s>',
)
]

View File

@ -0,0 +1,33 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", end='\n\n'),
dict(role="BOT", begin="### Response:", end='</s>', generate=True),
],
)
models = [
dict(
type=HuggingFaceCausalLM,
abbr='wizardmath-7b-v1.1-hf',
path='WizardLM/WizardMath-7B-V1.1',
tokenizer_path='WizardLM/WizardMath-7B-V1.1',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='</s>',
)
]

View File

@ -16,7 +16,7 @@ models = [
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=32,
batch_size=1,
generation_kwargs=dict(temperature=0),
end_str='</s>',
run_cfg=dict(num_gpus=1, num_procs=1),

View File

@ -11,8 +11,8 @@ agent_summary_groups = [
dict(name='math_perf_4_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]),
dict(
name='agent',
subsets=['math_perf_4_and_fill_in_blank-agent', 'cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim', 'cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim', 'plugin_eval-p10'],
weights={'math_perf_4_and_fill_in_blank-agent': 1, 'cibench_template_wo_nltk:executable': 0.5, 'cibench_template_wo_nltk:numeric_correct': 0.25, 'cibench_template_wo_nltk:vis_sim': 0.25, 'cibench_template_cn_wo_nltk:executable': 0.5, 'cibench_template_cn_wo_nltk:numeric_correct': 0.25, 'cibench_template_cn_wo_nltk:vis_sim': 0.25, 'plugin_eval-p10': 1}
subsets=['math_perf_4_and_fill_in_blank-agent', 'cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim', 'cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim', 'plugin_eval-p10', 'plugin_eval-p10_zh'],
weights={'math_perf_4_and_fill_in_blank-agent': 1, 'cibench_template_wo_nltk:executable': 0.5, 'cibench_template_wo_nltk:numeric_correct': 0.25, 'cibench_template_wo_nltk:vis_sim': 0.25, 'cibench_template_cn_wo_nltk:executable': 0.5, 'cibench_template_cn_wo_nltk:numeric_correct': 0.25, 'cibench_template_cn_wo_nltk:vis_sim': 0.25, 'plugin_eval-p10': 1, 'plugin_eval-p10_zh': 1}
)
]
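
The weights above suggest the 'agent' score is a weighted mean of its subset scores. A toy illustration with invented numbers; the aggregation rule is an assumption about the summarizer, not taken from this diff:

# Toy weighted mean with invented scores; assumes weighted summary groups
# are aggregated as sum(score * weight) / sum(weights).
scores = {'math_perf_4_and_fill_in_blank-agent': 40.0,
          'plugin_eval-p10': 60.0, 'plugin_eval-p10_zh': 50.0}
weights = {'math_perf_4_and_fill_in_blank-agent': 1,
           'plugin_eval-p10': 1, 'plugin_eval-p10_zh': 1}
avg = sum(scores[s] * weights[s] for s in scores) / sum(weights.values())
print(avg)  # 50.0
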
@ -48,13 +48,26 @@ summarizer = dict(
['plugin_eval-p10-instruct_v1', 'args_em_metric'],
['plugin_eval-p10-plan_str_v1', 'f1_score'],
['plugin_eval-p10-plan_json_v1', 'f1_score'],
['plugin_eval-p10-reason_str_v2', 'thought'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'],
['plugin_eval-p10-retrieve_str_v2', 'name'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'],
['plugin_eval-p10-understand_str_v2', 'args'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'],
['plugin_eval-p10-review_str_v6', 'review_quality'],
['plugin_eval-p10-reason_str_v1', 'thought'],
['plugin_eval-p10-reason_retrieve_understand_json_v1', 'thought'],
['plugin_eval-p10-retrieve_str_v1', 'name'],
['plugin_eval-p10-reason_retrieve_understand_json_v1', 'name'],
['plugin_eval-p10-understand_str_v1', 'args'],
['plugin_eval-p10-reason_retrieve_understand_json_v1', 'args'],
['plugin_eval-p10-review_str_v1', 'review_quality'],
['plugin_eval-p10_zh', 'naive_average'],
['plugin_eval-p10-instruct_v1_zh', 'format_metric'],
['plugin_eval-p10-instruct_v1_zh', 'args_em_metric'],
['plugin_eval-p10-plan_str_v1_zh', 'f1_score'],
['plugin_eval-p10-plan_json_v1_zh', 'f1_score'],
['plugin_eval-p10-reason_str_v1_zh', 'thought'],
['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'thought'],
['plugin_eval-p10-retrieve_str_v1_zh', 'name'],
['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'name'],
['plugin_eval-p10-understand_str_v1_zh', 'args'],
['plugin_eval-p10-reason_retrieve_understand_json_v1_zh', 'args'],
['plugin_eval-p10-review_str_v1_zh', 'review_quality'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])

View File

@ -0,0 +1,3 @@
leval_summary_groups = [
{"name": "leval", "subsets": ["LEval_coursera", "LEval_gsm100", "LEval_quality", "LEval_tpo", "LEval_topic_retrieval", "LEval_financialqa", "LEval_gov_report_summ", "LEval_legal_contract_qa", "LEval_meeting_summ", "LEval_multidocqa", "LEval_narrativeqa", "LEval_nq", "LEval_news_summ", "LEval_paper_assistant", "LEval_patent_summ", "LEval_review_summ", "LEval_scientificqa", "LEval_tvshow_summ"]},
]

View File

@ -0,0 +1,10 @@
longbench_summary_groups = [
{'name': 'longbench_single-document-qa', 'subsets': ['LongBench_narrativeqa', 'LongBench_qasper', 'LongBench_multifieldqa_en', 'LongBench_multifieldqa_zh']},
{'name': 'longbench_multi-document-qa', 'subsets': ['LongBench_hotpotqa', 'LongBench_2wikimqa', 'LongBench_musique', 'LongBench_dureader']},
{'name': 'longbench_summarization', 'subsets': ['LongBench_gov_report', 'LongBench_qmsum', 'LongBench_multi_news', 'LongBench_vcsum']},
{'name': 'longbench_few-shot-learning', 'subsets': ['LongBench_trec', 'LongBench_triviaqa', 'LongBench_samsum', 'LongBench_lsht']},
{'name': 'longbench_synthetic-tasks', 'subsets': ['LongBench_passage_count', 'LongBench_passage_retrieval_en', 'LongBench_passage_retrieval_zh']},
{'name': 'longbench_code-completion', 'subsets': ['LongBench_lcc', 'LongBench_repobench-p']},
    {'name': 'longbench', 'subsets': ['longbench_single-document-qa', 'longbench_multi-document-qa', 'longbench_summarization', 'longbench_few-shot-learning', 'longbench_synthetic-tasks', 'longbench_code-completion']},
]

View File

@ -66,9 +66,9 @@ naive_mathbench_summary_groups = [
{
'name': 'mathbench-circular-and-cloze',
'subsets': [
'mathbench-college-circular',
'mathbench-high-circular',
'mathbench-middle-circular',
'mathbench-circular',
'mathbench-college-cloze_en',
'mathbench-primary-cloze_cn',
],

View File

@ -65,9 +65,9 @@ mathbench_agent_summary_groups = [
{
'name': 'mathbench-circular-and-cloze-agent',
'subsets': [
'mathbench-college-circular-agent',
'mathbench-high-circular-agent',
'mathbench-middle-circular-agent',
'mathbench-circular-agent',
'mathbench-college-cloze_en-agent',
'mathbench-primary-cloze_cn-agent',
],

View File

@ -1,4 +1,6 @@
plugineval_summary_groups = [
from copy import deepcopy
_base_summary_groups = [
{
'name': 'plugin_eval-instruct_v1',
'metric': 'format_metric',
@ -22,47 +24,41 @@ plugineval_summary_groups = [
['plugin_eval-instruct_v1', 'args_em_metric'],
['plugin_eval-plan_str_v1', 'f1_score'],
['plugin_eval-plan_json_v1', 'f1_score'],
['plugin_eval-reason_str_v2', 'thought'],
['plugin_eval-reason_retrieve_understand_json_v2', 'thought'],
['plugin_eval-retrieve_str_v2', 'name'],
['plugin_eval-reason_retrieve_understand_json_v2', 'name'],
['plugin_eval-understand_str_v2', 'args'],
['plugin_eval-reason_retrieve_understand_json_v2', 'args'],
['plugin_eval-review_str_v6', 'review_quality'],
]
},
# special treatment for first 10% data points
{
'name': 'plugin_eval-p10-instruct_v1',
'metric': 'format_metric',
'subsets': [
['plugin_eval-p10-instruct_v1', 'string_format_metric'],
['plugin_eval-p10-instruct_v1', 'json_format_metric'],
]
},
{
'name': 'plugin_eval-p10-instruct_v1',
'metric': 'args_em_metric',
'subsets': [
['plugin_eval-p10-instruct_v1', 'string_args_em_metric'],
['plugin_eval-p10-instruct_v1', 'json_args_em_metric'],
]
},
{
'name': 'plugin_eval-p10',
'subsets': [
['plugin_eval-p10-instruct_v1', 'format_metric'],
['plugin_eval-p10-instruct_v1', 'args_em_metric'],
['plugin_eval-p10-plan_str_v1', 'f1_score'],
['plugin_eval-p10-plan_json_v1', 'f1_score'],
['plugin_eval-p10-reason_str_v2', 'thought'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'],
['plugin_eval-p10-retrieve_str_v2', 'name'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'],
['plugin_eval-p10-understand_str_v2', 'args'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'],
['plugin_eval-p10-review_str_v6', 'review_quality'],
['plugin_eval-reason_str_v1', 'thought'],
['plugin_eval-reason_retrieve_understand_json_v1', 'thought'],
['plugin_eval-retrieve_str_v1', 'name'],
['plugin_eval-reason_retrieve_understand_json_v1', 'name'],
['plugin_eval-understand_str_v1', 'args'],
['plugin_eval-reason_retrieve_understand_json_v1', 'args'],
['plugin_eval-review_str_v1', 'review_quality'],
]
},
]
plugineval_summary_groups = []
# base
for group in _base_summary_groups:
group = deepcopy(group)
plugineval_summary_groups.append(group)
# base _zh
for group in _base_summary_groups:
group = deepcopy(group)
group['name'] = group['name'] + '_zh'
group['subsets'] = [[subset[0] + '_zh', subset[1]] for subset in group['subsets']]
plugineval_summary_groups.append(group)
# base -p10-
for group in _base_summary_groups:
group = deepcopy(group)
group['name'] = group['name'].replace('plugin_eval', 'plugin_eval-p10')
group['subsets'] = [[subset[0].replace('plugin_eval', 'plugin_eval-p10'), subset[1]] for subset in group['subsets']]
plugineval_summary_groups.append(group)
# base -p10- _zh
for group in _base_summary_groups:
group = deepcopy(group)
group['name'] = group['name'].replace('plugin_eval', 'plugin_eval-p10') + '_zh'
group['subsets'] = [[subset[0].replace('plugin_eval', 'plugin_eval-p10') + '_zh', subset[1]] for subset in group['subsets']]
plugineval_summary_groups.append(group)
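
A quick self-contained check of what the four loops above generate for a single base group:

from copy import deepcopy

base = {'name': 'plugin_eval-instruct_v1',
        'subsets': [['plugin_eval-instruct_v1', 'string_format_metric']]}
groups = [deepcopy(base)]
for suffix, prefix in [('_zh', ''), ('', '-p10'), ('_zh', '-p10')]:
    g = deepcopy(base)
    g['name'] = g['name'].replace('plugin_eval', 'plugin_eval' + prefix) + suffix
    g['subsets'] = [[s.replace('plugin_eval', 'plugin_eval' + prefix) + suffix, m]
                    for s, m in g['subsets']]
    groups.append(g)
print([g['name'] for g in groups])
# ['plugin_eval-instruct_v1', 'plugin_eval-instruct_v1_zh',
#  'plugin_eval-p10-instruct_v1', 'plugin_eval-p10-instruct_v1_zh']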

View File

@ -0,0 +1,20 @@
from mmengine.config import read_base
with read_base():
from .groups.agieval import agieval_summary_groups
from .groups.mmlu import mmlu_summary_groups
from .groups.bbh import bbh_summary_groups
summarizer = dict(
dataset_abbrs=[
['mmlu', 'naive_average'],
['agieval', 'naive_average'],
['bbh', 'naive_average'],
['gsm8k', 'accuracy'],
['math', 'accuracy'],
['openai_humaneval', 'humaneval_pass@1'],
['sanitized_mbpp', 'score'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], []),
)
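
The closing lines of this config rely on a module-level idiom: every imported *_summary_groups list is picked up from locals() and flattened into one list. A tiny standalone demonstration:

# Run as a standalone script: only names ending in '_summary_groups' are kept.
agieval_summary_groups = [{'name': 'agieval'}]
mmlu_summary_groups = [{'name': 'mmlu'}]
unrelated = 'ignored'
merged = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
print([g['name'] for g in merged])  # ['agieval', 'mmlu']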

View File

@ -1,10 +1,10 @@
summarizer = dict(
dataset_abbrs = [
'--------- LEval Exact Match (Acc) ---------', # category
"LEval_coursera",
'LEval_coursera',
'LEval_gsm100',
'LEval_quality',
"LEval_tpo",
'LEval_tpo',
'LEval_topic_retrieval',
'--------- LEval Gen (ROUGE) ---------', # category
'LEval_financialqa',
@ -21,5 +21,5 @@ summarizer = dict(
'LEval_scientificqa',
'LEval_tvshow_summ'
],
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)

View File

@ -1,10 +1,10 @@
summarizer = dict(
dataset_abbrs = [
'--------- LongBench Single-Document QA ---------', # category
"LongBench_narrativeqa",
'LongBench_narrativeqa',
'LongBench_qasper',
'LongBench_multifieldqa_en',
"LongBench_multifieldqa_zh",
'LongBench_multifieldqa_zh',
'--------- LongBench Multi-Document QA ---------', # category
'LongBench_hotpotqa',
'LongBench_2wikimqa',
@ -28,5 +28,5 @@ summarizer = dict(
'LongBench_lcc',
'LongBench_repobench-p',
],
summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)

View File

@ -13,7 +13,9 @@ from .commonsenseqa import commonsenseqaDataset
from .hellaswag import hellaswagDataset_V2
from .mmlu import MMLUDataset
from .obqa import OBQADataset
from .piqa import piqaDataset_V2
from .race import RaceDataset
from .siqa import siqaDataset_V3
from .xiezhi import XiezhiDataset
@ -273,6 +275,24 @@ class CircularXiezhiDataset(XiezhiDataset, metaclass=CircularDatasetMeta):
default_answer_key = 'answer'
class CircularsiqaDataset(siqaDataset_V3, metaclass=CircularDatasetMeta):
dataset_class = siqaDataset_V3
default_circular_splits = ['validation']
default_option_keys = ['A', 'B', 'C']
default_answer_key = 'answer'
class CircularpiqaDataset(piqaDataset_V2, metaclass=CircularDatasetMeta):
dataset_class = piqaDataset_V2
default_circular_splits = ['validation']
default_option_keys = ['sol1', 'sol2']
def default_answer_key_switch_method(item, circular_pattern):
circular_pattern = tuple(int(i[-1]) - 1 for i in circular_pattern)
item['answer'] = 'AB'[circular_pattern['AB'.index(item['answer'])]]
return item
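
A standalone restatement of the answer-switch logic above, showing how the gold label moves when the two PIQA options are presented in swapped order:

def switch_answer(answer, circular_pattern):
    # ('sol2', 'sol1') -> (1, 0): sol2 is presented as choice A, sol1 as B
    idx = tuple(int(i[-1]) - 1 for i in circular_pattern)
    return 'AB'[idx['AB'.index(answer)]]

print(switch_answer('A', ('sol1', 'sol2')))  # 'A' (original order)
print(switch_answer('A', ('sol2', 'sol1')))  # 'B' (options swapped)
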
class CircularEvaluator(BaseEvaluator):
"""This Evaluator assesses datasets post-Circular processing, generating
the following evaluation metrics:

View File

@ -378,6 +378,8 @@ class DS1000ServiceEvaluator(BaseEvaluator):
processed_predictions = {}
assert len(predictions) == len(references)
for i, (pred, gold) in enumerate(zip(predictions, references)):
if len(pred) > 10000:
    # drop overlong predictions before sending them for evaluation
    pred = ''
processed_predictions[str(i)] = {'prediction': pred, 'gold': gold}
with tempfile.TemporaryDirectory() as tmp_dir:

View File

@ -155,6 +155,11 @@ def humaneval_postprocess(text: str) -> str:
def humaneval_postprocess_v2(text: str) -> str:
    """An advanced version of the previous postprocess that handles more
    situations; prefer using this one."""
try:
# for chatGLM raw text
text = eval(text)
except Exception:
pass
text = text.lstrip('\n')
if '```' in text:
blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
@ -173,11 +178,11 @@ def humaneval_postprocess_v2(text: str) -> str:
text = text.lstrip('\n')
if text.strip().startswith('def'):
text = '\n'.join(text.split('\n')[1:])
if not text.startswith(' '):
if text.startswith(' '):
text = ' ' + text.lstrip()
else:
text = '\n'.join([' ' + line for line in text.split('\n')])
# deal with the indentation error
if text.startswith(' '):
text = ' ' + text.lstrip()
else:
text = '\n'.join([' ' + line for line in text.split('\n')])
text = text.split('\n')
# If number of leading space reduces, we assume that the code block ends.
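
A standalone restatement of the corrected indentation branch above: a first line carrying some (possibly wrong) indent has that indent normalized to four spaces, while a completely flush body is shifted right one level:

def fix_indent(text: str) -> str:
    if text.startswith(' '):
        # some indent is present: normalize the leading whitespace
        return '    ' + text.lstrip()
    # no indent at all: indent every line by one level
    return '\n'.join('    ' + line for line in text.split('\n'))

print(fix_indent('  return x + 1'))       # '    return x + 1'
print(fix_indent('y = x + 1\nreturn y'))  # every line gains four spaces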

View File

@ -14,6 +14,7 @@ from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from .base import BaseDataset
from .humaneval import humaneval_postprocess_v2
_LANGUAGE_NAME_DICT = {
'cpp': 'CPP',
@ -89,9 +90,11 @@ class HumanevalXEvaluator(BaseEvaluator):
def score(self, predictions, references):
predictions = [{
'task_id': f'{_LANGUAGE_NAME_DICT[self.language]}/{i}',
'generation': _clean_up_code(pred, self.language),
} for i, pred in enumerate(predictions)]
'task_id':
f'{_LANGUAGE_NAME_DICT[self.language]}/{i}',
'generation':
_clean_up_code(pred, self.language, refer),
} for i, (pred, refer) in enumerate(zip(predictions, references))]
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_out_path = osp.join(tmp_dir,
f'humanevalx_{self.language}.json')
@ -161,15 +164,28 @@ class HumanevalXEvaluator(BaseEvaluator):
return False, err
def _clean_up_code(text: str, language_type: str) -> str:
def _clean_up_code(text: str, language_type: str, reference) -> str:
"""Cleans up the generated code."""
try:
# for chatGLM related text
text = eval(text)
except Exception:
pass
# extract code from code block
text = text.lstrip('\n')
if '```' in text:
blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
if len(blocks) == 0:
text = text.split('```')[1] # fall back to default strategy
else:
text = blocks[0] # fetch the first code block
if not text.startswith('\n'): # in case starting with ```xxx
text = text[max(text.find('\n') + 1, 0):]
if language_type.lower() == 'python':
text = humaneval_postprocess_v2(text)
# we need to take care of the first line
# append extra space for first line for correct indentation
for c_index, c in enumerate(text[:5]):
if c != ' ':
text = ' ' * (4 - c_index) + text
break
text = ' ' + text.lstrip()
text_splits = text.split('\n')
is_empty_line = False
@ -189,7 +205,13 @@ def _clean_up_code(text: str, language_type: str) -> str:
for w in end_words:
if w in text:
text = text[:text.rfind(w)]
elif language_type.lower() == 'java':
# strip the function head for all other languages
func_name = reference.strip().split('\n')[-1]
if func_name:
func_name = func_name.strip().strip('{')
if func_name in text:
text = '\n'.join(text[text.find(func_name):].split('\n')[1:])
if language_type.lower() == 'java':
main_pos = text.find('public static void main')
if main_pos != -1:
text = text[:main_pos] + '}'
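
The signature-stripping step above assumes, as the code does, that the last line of the reference carries the function head; if the model repeats that head, everything up to and including it is dropped. A standalone restatement:

def strip_func_head(text: str, reference: str) -> str:
    func_name = reference.strip().split('\n')[-1]
    if func_name:
        func_name = func_name.strip().strip('{')
        if func_name in text:
            # keep only the lines after the repeated signature
            text = '\n'.join(text[text.find(func_name):].split('\n')[1:])
    return text

ref = 'int add(int a, int b) {'
gen = 'Here is my solution:\nint add(int a, int b) {\n    return a + b;\n}'
print(strip_func_head(gen, ref))  # '    return a + b;\n}'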

View File

@ -200,30 +200,28 @@ class MBPPEvaluator(BaseEvaluator):
def score(self, predictions, references):
assert len(predictions) == len(references)
predictions = [self._process_answer(pred) for pred in predictions]
if self.metric == 'MBPP':
result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
details = {}
for index, (test_case,
pred) in enumerate(zip(references, predictions)):
programs = self._process_test(test_case, pred)
try:
# Add exec globals to prevent the exec to raise
# unnecessary NameError for correct answer
exec_globals = {}
with swallow_io():
with time_limit(2):
exec(programs, exec_globals)
r = 'pass'
except TimeOutException:
r = 'timeout'
except AssertionError:
r = 'wrong_answer'
except BaseException:
r = 'failed'
result[r] += 1
details[str(index)] = {'programs': programs, 'result': r}
# use a thread pool so that blocked instances can be killed more reliably
with ThreadPoolExecutor() as executor:
futures = []
for i, (refer, pred) in enumerate(zip(references,
predictions)):
pred = self._process_answer(pred)
programs = self._process_test(refer, pred)
future = executor.submit(execution, programs, i, 3)
futures.append(future)
from tqdm import tqdm
for future in tqdm(as_completed(futures), total=len(futures)):
index, key = future.result()
result[key] += 1
details[str(index)] = {
'programs': predictions[index],
'result': key
}
result['score'] = result['pass'] / len(predictions) * 100
result['details'] = details
@ -263,6 +261,20 @@ class MBPPEvaluator(BaseEvaluator):
return {f'mbpp_plus_{k}': score[k] * 100 for k in score}
def _process_answer(self, text):
try:
# for chatGLM related text
text = eval(text)
except Exception:
pass
# deal with code block
if '```' in text:
blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
if len(blocks) == 0:
text = text.split('```')[1] # fall back to default strategy
else:
text = blocks[0] # fetch the first code block
if not text.startswith('\n'): # in case starting with ```xxx
text = text[max(text.find('\n') + 1, 0):]
text = text.strip()
match = re.search(r"('\s*|)(\[DONE\]|DONE)", text)
if match:
@ -275,6 +287,10 @@ class MBPPEvaluator(BaseEvaluator):
text = text[1:]
if text.endswith("'"):
text = text[:-1]
text = text.replace('\\', '')
match = re.search(r'```python(.*)```', text, re.DOTALL)
if match:
text = match.group(1).strip().split('```')[0].strip()
return text
def _process_test(self, test_case, pred):
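
A minimal sketch of the submit/as_completed pattern introduced above. run_one is a simplified stand-in for the real execution worker: it execs the program and classifies the outcome, but omits the hard time limit the real worker enforces:

from concurrent.futures import ThreadPoolExecutor, as_completed

def run_one(program: str, index: int, timeout: int):
    # Simplified: no timeout handling here, unlike the actual worker.
    try:
        exec(program, {})  # fresh globals, as in the evaluator
        return index, 'pass'
    except AssertionError:
        return index, 'wrong_answer'
    except BaseException:
        return index, 'failed'

programs = ['assert 1 + 1 == 2', 'assert 1 + 1 == 3', 'raise RuntimeError']
result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(run_one, p, i, 3)
               for i, p in enumerate(programs)]
    for future in as_completed(futures):
        index, key = future.result()
        result[key] += 1
print(result)  # {'pass': 1, 'timeout': 0, 'failed': 1, 'wrong_answer': 1}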

View File

@ -78,3 +78,37 @@ class siqaDataset_V2(BaseDataset):
val_dataset = siqaDataset_V2.load_single(path, 'dev.jsonl',
'dev-labels.lst')
return DatasetDict({'train': train_dataset, 'validation': val_dataset})
@LOAD_DATASET.register_module()
class siqaDataset_V3(BaseDataset):
    """Load SIQA from local files rather than from the HuggingFace hub."""
@staticmethod
def load_single(path, data_filename, label_filename):
data_path = os.path.join(path, data_filename)
label_path = os.path.join(path, label_filename)
dataset = []
with open(data_path, 'r', encoding='utf-8') as f:
data_lines = f.readlines()
with open(label_path, 'r', encoding='utf-8') as f:
label_lines = f.readlines()
assert len(data_lines) == len(label_lines)
for data, label in zip(data_lines, label_lines):
i = json.loads(data.strip())
i['A'] = i.pop('answerA')
i['B'] = i.pop('answerB')
i['C'] = i.pop('answerC')
i['answer'] = 'ABC'[int(label.strip()) - 1]
dataset.append(i)
return Dataset.from_list(dataset)
@staticmethod
def load(path):
train_dataset = siqaDataset_V3.load_single(path, 'train.jsonl',
'train-labels.lst')
val_dataset = siqaDataset_V3.load_single(path, 'dev.jsonl',
'dev-labels.lst')
return DatasetDict({'train': train_dataset, 'validation': val_dataset})
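
For clarity, the loader above pairs each data line with a 1-indexed label line and renames the answer fields. A worked example on one hypothetical record; the texts below are invented for illustration:

import json

data_line = json.dumps({'context': 'Remy fed the cat.', 'question': 'Why?',
                        'answerA': 'It was hungry.',
                        'answerB': 'It was asleep.',
                        'answerC': 'No reason.'})
label_line = '2\n'

i = json.loads(data_line.strip())
i['A'] = i.pop('answerA')
i['B'] = i.pop('answerB')
i['C'] = i.pop('answerC')
i['answer'] = 'ABC'[int(label_line.strip()) - 1]
print(i['answer'])  # 'B'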

View File

@ -57,6 +57,8 @@ class IPythonInterpreter(BaseAction):
user_data_dir (str): Specified the user data directory for files
loading. If set to `ENV`, use `USER_DATA_DIR` environment variable.
Defaults to `ENV`.
force_user_data (bool): Whether to require a user data directory to be
    set. Defaults to True.
"""
_KERNEL_CLIENTS = {}
@ -68,7 +70,8 @@ class IPythonInterpreter(BaseAction):
disable_description: Optional[str] = None,
timeout: int = 20,
trim_output: Optional[int] = 1024,
user_data_dir: str = 'ENV') -> None:
user_data_dir: str = 'ENV',
force_user_data: bool = True) -> None:
super().__init__(description, name, enable, disable_description)
self.timeout = timeout
@ -82,6 +85,11 @@ class IPythonInterpreter(BaseAction):
f'{user_data_dir} does not exist.'
user_data_dir = os.path.abspath(user_data_dir)
user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
else:
if force_user_data:
raise ValueError('user_data_dir is not set. Please '
                 'set force_user_data to False if '
                 'no extra data is needed.')
self.user_data_dir = user_data_dir
self._initialized = False
self.trim_output = trim_output

View File

@ -225,6 +225,7 @@ class HuggingFace(BaseModel):
def generate(self,
inputs: List[str],
max_out_len: int,
min_out_len: Optional[int] = None,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
"""Generate results given a list of inputs.
@ -232,6 +233,7 @@ class HuggingFace(BaseModel):
Args:
inputs (List[str]): A list of strings.
max_out_len (int): The maximum length of the output.
min_out_len (Optional[int]): The minimum length of the output.
Returns:
List[str]: A list of generated strings.
@ -241,12 +243,14 @@ class HuggingFace(BaseModel):
if self.batch_padding and len(inputs) > 1:
return self._batch_generate(inputs=inputs,
max_out_len=max_out_len,
min_out_len=min_out_len,
stopping_criteria=stopping_criteria,
**generation_kwargs)
else:
return sum(
(self._single_generate(inputs=[input_],
max_out_len=max_out_len,
min_out_len=min_out_len,
stopping_criteria=stopping_criteria,
**generation_kwargs)
for input_ in inputs), [])
@ -254,6 +258,7 @@ class HuggingFace(BaseModel):
def _batch_generate(self,
inputs: List[str],
max_out_len: int,
min_out_len: Optional[int] = None,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
"""Support for batch prompts inference.
@ -308,6 +313,9 @@ class HuggingFace(BaseModel):
])
kwargs['stopping_criteria'] = stopping_criteria
if min_out_len is not None:
kwargs['min_new_tokens'] = min_out_len
# step-2: conduct model forward to generate output
outputs = self.model.generate(**tokens,
max_new_tokens=max_out_len,
@ -331,6 +339,7 @@ class HuggingFace(BaseModel):
def _single_generate(self,
inputs: List[str],
max_out_len: int,
min_out_len: Optional[int] = None,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
"""Support for single prompt inference.
@ -390,6 +399,9 @@ class HuggingFace(BaseModel):
])
kwargs['stopping_criteria'] = stopping_criteria
if min_out_len is not None:
kwargs['min_new_tokens'] = min_out_len
# To accommodate the PeftModel, parameters should be passed in
# key-value format for generate.
outputs = self.model.generate(input_ids=input_ids,
@ -502,7 +514,7 @@ class HuggingFace(BaseModel):
self.tokenizer.pad_token_id).sum(-1).cpu().numpy()
if mask_length is not None:
lens -= np.array(mask_length)
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
ce_loss = loss.float().sum(-1).cpu().detach().numpy() / lens
return ce_loss
def get_loglikelihood(
@ -554,7 +566,6 @@ class HuggingFace(BaseModel):
input_ids = input_tokenizer_out['input_ids'][:, :self.max_seq_len]
input_length = input_tokenizer_out['length']
attention_mask = input_tokenizer_out['attention_mask']
context_ids = [
self.tokenizer(inputs[i].replace(conts[i], ''),
padding=False,
@ -563,7 +574,7 @@ class HuggingFace(BaseModel):
for i in range(len(inputs))
]
# forward
outputs = self.model(input_ids, attention_mask)['logits']
outputs = self.model(input_ids)['logits']
outputs = torch.nn.functional.log_softmax(outputs, dim=-1)
# calculate loglikelihood
answer = np.zeros(len(inputs))
@ -609,9 +620,10 @@ class HuggingFace(BaseModel):
self.tokenizer.pad_token_id).sum(-1).cpu().numpy()
mink_percent = []
for nloss, nlen in zip(loss, lens):
nlen = max(int(nlen) * k // 100, 1)
nloss = torch.topk(loss, nlen, dim=-1)[0]
nloss = -nloss.mean().cpu().detach().numpy()
nlen = int(nlen)
minklen = max(nlen * k // 100, 1)
nloss = torch.topk(nloss[-nlen:], minklen, dim=-1)[0]
nloss = -nloss.float().mean().cpu().detach().numpy()
mink_percent.append(nloss)
return np.array(mink_percent)
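
A toy restatement of the per-sequence min-k% computation above: take each sequence's valid token losses, keep the top k% largest (the least likely tokens, at least one), and report their negated float32 mean:

import numpy as np
import torch

k = 20
loss = torch.tensor([[0.0, 0.0, 1.2, 0.3, 2.0, 0.1]])  # one left-padded row
lens = [4]  # the last four positions are real tokens

mink_percent = []
for nloss, nlen in zip(loss, lens):
    nlen = int(nlen)
    minklen = max(nlen * k // 100, 1)  # at least one token
    top = torch.topk(nloss[-nlen:], minklen, dim=-1)[0]
    mink_percent.append(-top.float().mean().item())
print(np.array(mink_percent))  # [-2.]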

View File

@ -29,6 +29,8 @@ class GenInferencer(BaseInferencer):
model (:obj:`BaseModelWrapper`, optional): The module to inference.
max_seq_len (:obj:`int`, optional): Maximum number of tokenized words
allowed by the LM.
min_out_len (:obj:`int`, optional): Minimum number of tokens the LM
    should generate.
batch_size (:obj:`int`, optional): Batch size for the
:obj:`DataLoader`.
output_json_filepath (:obj:`str`, optional): File path for output
@ -49,6 +51,7 @@ class GenInferencer(BaseInferencer):
max_out_len: int,
stopping_criteria: List[str] = [],
max_seq_len: Optional[int] = None,
min_out_len: Optional[int] = None,
batch_size: Optional[int] = 1,
gen_field_replace_token: Optional[str] = '',
output_json_filepath: Optional[str] = './icl_inference_output',
@ -66,6 +69,7 @@ class GenInferencer(BaseInferencer):
self.gen_field_replace_token = gen_field_replace_token
self.max_out_len = max_out_len
self.min_out_len = min_out_len
self.stopping_criteria = stopping_criteria
if self.model.is_api and save_every is None:
@ -135,6 +139,8 @@ class GenInferencer(BaseInferencer):
sig = inspect.signature(self.model.generate)
if 'stopping_criteria' in sig.parameters:
extra_gen_kwargs['stopping_criteria'] = self.stopping_criteria
if 'min_out_len' in sig.parameters:
extra_gen_kwargs['min_out_len'] = self.min_out_len
with torch.no_grad():
parsed_entries = self.model.parse_template(entry, mode='gen')
results = self.model.generate_from_template(

View File

@ -116,7 +116,7 @@ class DLCRunner(BaseRunner):
' --worker_count 1'
f' --worker_cpu {max(num_gpus * 6, 8)}'
f' --worker_gpu {num_gpus}'
f' --worker_memory {max(num_gpus * 32, 48)}'
f' --worker_memory {max(num_gpus * 64, 48)}'
f" --worker_image {self.aliyun_cfg['worker_image']}"
' --interactive')
get_cmd = partial(task.get_command,

View File

@ -61,6 +61,7 @@ class OpenICLInferTask(BaseTask):
for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):
self.max_out_len = model_cfg.get('max_out_len', None)
self.batch_size = model_cfg.get('batch_size', None)
self.min_out_len = model_cfg.get('min_out_len', None)
self.model = build_model_from_cfg(model_cfg)
for dataset_cfg in dataset_cfgs:
@ -102,6 +103,8 @@ class OpenICLInferTask(BaseTask):
inferencer_cfg['model'] = self.model
self._set_default_value(inferencer_cfg, 'max_out_len',
self.max_out_len)
self._set_default_value(inferencer_cfg, 'min_out_len',
self.min_out_len)
self._set_default_value(inferencer_cfg, 'batch_size', self.batch_size)
inferencer_cfg['max_seq_len'] = self.model_cfg.get('max_seq_len')
inferencer = ICL_INFERENCERS.build(inferencer_cfg)

View File

@ -21,4 +21,5 @@ def build_model_from_cfg(model_cfg: ConfigDict):
model_cfg.pop('abbr', None)
model_cfg.pop('summarizer_abbr', None)
model_cfg.pop('pred_postprocessor', None)
model_cfg.pop('min_out_len', None)
return MODELS.build(model_cfg)

View File

@ -5,7 +5,8 @@ from typing import Dict
from mmengine.config import Config, ConfigDict
from opencompass.openicl.icl_inferencer import (CLPInferencer, GenInferencer,
PPLInferencer)
PPLInferencer,
PPLOnlyInferencer)
from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS
from opencompass.utils import (Menu, build_dataset_from_cfg,
build_model_from_cfg, dataset_abbr_from_cfg,
@ -77,7 +78,8 @@ def print_prompts(model_cfg, dataset_cfg, count=1):
ice_idx_list = retriever.retrieve()
assert infer_cfg.inferencer.type in [PPLInferencer, GenInferencer], \
assert infer_cfg.inferencer.type in [
PPLInferencer, GenInferencer, CLPInferencer, PPLOnlyInferencer], \
'Only PPLInferencer, GenInferencer, CLPInferencer and PPLOnlyInferencer are supported'
for idx in range(min(count, len(ice_idx_list))):
@ -127,7 +129,9 @@ def print_prompts(model_cfg, dataset_cfg, count=1):
print('-' * 100)
print(prompt)
print('-' * 100)
elif infer_cfg.inferencer.type in [GenInferencer, CLPInferencer]:
elif infer_cfg.inferencer.type in [
GenInferencer, CLPInferencer, PPLOnlyInferencer
]:
ice_idx = ice_idx_list[idx]
ice = retriever.generate_ice(ice_idx, ice_template=ice_template)
prompt = retriever.generate_prompt_for_generate_task(