[Sync] Sync with internal codes 2023.01.08 (#777)

This commit is contained in:
Fengzhe Zhou 2024-01-08 22:07:24 +08:00 committed by GitHub
parent 8194199d79
commit 32f40a8f83
121 changed files with 3557 additions and 955 deletions

.gitignore
View File

@ -91,10 +91,32 @@ docs/zh_cn/_build/
# sft config ignore list
configs/sft_cfg/*B_*
configs/sft_cfg/7B/*
configs/sft_cfg/20B/*
configs/cky/
# in case llama clone in the opencompass
llama/
# in case ilagent clone in the opencompass
ilagent/
# ignore the config file for criticbench evaluation
configs/sft_cfg/criticbench_eval/*
# path of turbomind's model after running `lmdeploy.serve.turbomind.deploy`
turbomind/
# cibench output
*.db
*.pth
*.pt
*.onnx
*.gz
*.gz.*
*.png
*.txt
*.jpg
*.json
*.csv
*.npy
*.c

View File

@ -1,4 +0,0 @@
from mmengine.config import read_base
with read_base():
from .CIBench_gen_8ab0dc import ci_datasets # noqa: F401, F403

View File

@ -1,35 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import CIBenchDataset, CIBenchEvaluator
cibench_reader_cfg = dict(
input_columns=["questions"],
output_column="references",
train_split='test',
test_split='test')
cibench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template="""{questions}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer, infer_mode='every'),
)
libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")
cibench_datasets = [
dict(
abbr=f"cibench_generation_{lib}",
type=CIBenchDataset,
path=f"./data/cibench/{lib}",
reader_cfg=cibench_reader_cfg,
infer_cfg=cibench_infer_cfg,
eval_cfg=cibench_eval_cfg,
) for lib in libs
]

View File

@ -1,128 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
PROMPT_EN = {
"FEWSHOT_INSTRUCTION_CLOZE" : [
dict(role='HUMAN', prompt='Mark\'s basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What\'s the total number of points scored by both teams added together?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result'),
dict(role='SYSTEM', prompt='Response:210'),
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 210'),
dict(role='HUMAN', prompt='Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n marbles = 60\n num_increased_marbles = marbles * 2 / 5\n num_total_marbles = marbles + num_increased_marbles\n frisbees = marbles / 2\n num_increased_frisbees = frisbees * 2 / 5\n num_total_frisbees = frisbees + num_increased_frisbees\n deck_cards = frisbees - 20\n num_increased_deck_cards = deck_cards * 2 / 5\n num_total_deck_cards = deck_cards + num_increased_deck_cards\n num_total = num_total_marbles + num_total_frisbees + num_total_deck_cards\n result = num_total\n return result'),
dict(role='SYSTEM', prompt='Response:140'),
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 140'),
dict(role='HUMAN', prompt='A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?'),
dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():\n num_fruits_per_first_three_basket = 9 + 15 + 14\n num_fruits_first_three_basket = num_fruits_per_first_three_basket * 3\n num_apple_fourth_basket = 9 - 2\n num_orange_fourth_basket = 15 - 2\n num_banana_fourth_basket = 14 - 2\n num_fruits_fourth_basket = num_apple_fourth_basket + num_orange_fourth_basket + num_banana_fourth_basket\n num_fruits_total = num_fruits_first_three_basket + num_fruits_fourth_basket\n result = num_fruits_total\n return result"""),
dict(role='SYSTEM', prompt='Response:146'),
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 146'),
dict(role='HUMAN', prompt='{question}'),
],
"FEWSHOT_INSTRUCTION_CHOICE" : [
dict(role='HUMAN', prompt='Given point P(-1,4) lies on the graph of the inverse proportionality function $y=\\frac{{k}}{{x}}$ (k≠0), what is the value of k? A. $-\\frac{{1}}{{4}}$ B. $\\frac{{1}}{{4}}$ C. $4$ D. $-4$'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import solve, symbols, Ne, Eq\ndef solution():\n k = symbols(\'k\')\n result = solve([Eq(4, k / (-1)), Ne(k, 0)], k, dict=True)\n return result'),
dict(role='SYSTEM', prompt='Response:{{Eq(k, -4)}}'),
dict(role='BOT', prompt='Thought: Substituting point P into the function yields the value of k as $-4$\nFinalAnswer: D'),
dict(role='HUMAN', prompt='The graph of the power function $y=(x)$ passes through the point$ (2, \\dfrac {{1}}{{4}}) $, what is the value of $f(-3)$? A. $\\frac{{1}}{{9}}$ B. $\\frac{{1}}{{8}}$ C. $\\frac{{2}}{{9}}$ D. $\\frac{{1}}{{4}}$'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import *\ndef solution():\n x, y, k = symbols(\'x y k\')\n eq1 = Eq(2**k, Rational(1, 4))\n k_value = solve(eq1, k)[0]\n y = x**k_value\n result = y.subs(x, -3)\n return result'),
dict(role='SYSTEM', prompt='Response:1/9'),
dict(role='BOT', prompt='Thought: The functional expression of the power function is $y=x^{{-2}}$. Substituting $x=-3$ yields $y=\\frac{{1}}{{9}}$\nFinalAnswer: A'),
dict(role='HUMAN', prompt='If $3 x-y=12$, what is the value of $\\frac{8^{x}}{2^{y}} ?$\nA. The value cannot be determined from the information given.\nB. $2^{12}$\nC. 4\nD. $8^{2}$'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import symbols, Eq, solve\n\ndef solution():\n x, y = symbols(\'x y\')\n equation = Eq(3*x - y, 12)\n\n y_in_terms_of_x = solve(equation, y)[0]\n expression = 8**x / 2**y_in_terms_of_x\n result = expression.simplify()\n return result'),
dict(role='SYSTEM', prompt='Response:2**12'),
dict(role='BOT', prompt='Thought: The value of $\\frac{8^{x}}{2^{y}}$ is $2^{12}$\nFinalAnswer: B'),
dict(role='HUMAN', prompt='{question}'),
]
}
PROMPT_CN = {
"FEWSHOT_INSTRUCTION_CLOZE" : [
dict(role='HUMAN', prompt='Mark的篮球队得到25个2分球8个3分球和10个罚球。他们的对手得到2分球的两倍但3分球和罚球的一半。两队得分的总和是多少'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result'),
dict(role='SYSTEM', prompt='Response:210'),
dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 210'),
dict(role='HUMAN', prompt='Bella有两倍于飞盘的弹珠。她还比卡片多20个飞盘。如果她买每种物品多2/5她会有多少总数的物品如果她现在有60颗弹珠'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n marbles = 60\n num_increased_marbles = marbles * 2 / 5\n num_total_marbles = marbles + num_increased_marbles\n frisbees = marbles / 2\n num_increased_frisbees = frisbees * 2 / 5\n num_total_frisbees = frisbees + num_increased_frisbees\n deck_cards = frisbees - 20\n num_increased_deck_cards = deck_cards * 2 / 5\n num_total_deck_cards = deck_cards + num_increased_deck_cards\n num_total = num_total_marbles + num_total_frisbees + num_total_deck_cards\n result = num_total\n return result'),
dict(role='SYSTEM', prompt='Response:140'),
dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 140'),
dict(role='HUMAN', prompt='一个有4个水果篮子前三个篮子里有9个苹果、15个橙子和14个香蕉第四个篮子里每种水果都少2个。总共有多少水果'),
dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():\n num_fruits_per_first_three_basket = 9 + 15 + 14\n num_fruits_first_three_basket = num_fruits_per_first_three_basket * 3\n num_apple_fourth_basket = 9 - 2\n num_orange_fourth_basket = 15 - 2\n num_banana_fourth_basket = 14 - 2\n num_fruits_fourth_basket = num_apple_fourth_basket + num_orange_fourth_basket + num_banana_fourth_basket\n num_fruits_total = num_fruits_first_three_basket + num_fruits_fourth_basket\n result = num_fruits_total\n return result"""),
dict(role='SYSTEM', prompt='Response:146'),
dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 146'),
dict(role='HUMAN', prompt='{question}'),
],
"FEWSHOT_INSTRUCTION_CHOICE" : [
dict(role='HUMAN', prompt='已知点P(-1,4)在反比例函数$y=\\frac{{k}}{{x}}$ (k≠0)的图象上,则k的值是____'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import solve, symbols, Ne, Eq\ndef solution():\n k = symbols(\'k\')\n result = solve([Eq(4, k / (-1)), Ne(k, 0)], k, dict=True)\n return result'),
dict(role='SYSTEM', prompt='Response:{{Eq(k, -4)}}'),
dict(role='BOT', prompt='Thought: 将点 P 带入函数解出 k 的值为 $-4$\nFinalAnswer: D'),
dict(role='HUMAN', prompt='幂函数$ y=(x) $的图象经过点$ (2, \\dfrac {{1}}{{4}}) $,则$ f(-3) $的值为 ______ '),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import *\ndef solution():\n x, y, k = symbols(\'x y k\')\n eq1 = Eq(2**k, Rational(1, 4))\n k_value = solve(eq1, k)[0]\n y = x**k_value\n result = y.subs(x, -3)\n return result'),
dict(role='SYSTEM', prompt='Response:1/9'),
dict(role='BOT', prompt='Thought: 求出幂函数的函数表达式为 $y=x^{{-2}}$,代入 $x=-3$ 得到 $y=\\frac{{1}}{{9}}$\nFinalAnswer: A'),
dict(role='HUMAN', prompt='如果$3 x-y=12$,则$\\frac{8^{x}}{2^{y}}$的值是多少?\nA. 无法从给定的信息中确定值。\nB. $2^{12}$\nC. 4\nD. $8^{2}$'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import symbols, Eq, solve\n\ndef solution():\n x, y = symbols(\'x y\')\n equation = Eq(3*x - y, 12)\n\n y_in_terms_of_x = solve(equation, y)[0]\n expression = 8**x / 2**y_in_terms_of_x\n result = expression.simplify()\n return result'),
dict(role='SYSTEM', prompt='Response:2**12'),
dict(role='BOT', prompt='Thought: $\\frac{8^{x}}{2^{y}}$的值是$2^{12}$\nFinalAnswer: B'),
dict(role='HUMAN', prompt='{question}'),
]
}
mathbench_sets = {
'college': ['single_choice_cn', 'cloze_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn'],
'primary': ['cloze_cn']
}
# Use circular evaluation or not
with_circular_eval = True
mathbench_agent_datasets = []
for _split in list(mathbench_sets.keys()):
    for _name in mathbench_sets[_split]:
        prompt_example = PROMPT_CN if '_cn' in _name else PROMPT_EN
        mathbench_infer_cfg = dict(
            prompt_template=dict(type=PromptTemplate,
                template=dict(
                    round = prompt_example["FEWSHOT_INSTRUCTION_CLOZE"] if 'cloze' in _name else prompt_example["FEWSHOT_INSTRUCTION_CHOICE"])),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=AgentInferencer)
        )
        mathbench_eval_cfg = dict(
            evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
            pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
        mathbench_agent_datasets.append(
            dict(
                abbr="mathbench-" + _split + '-' + _name + '-agent',
                type=MathBenchDataset,
                path=f"./data/mathbench/{_split}",
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=dict(
                    input_columns=["question"],
                    output_column="answer"
                ),
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            ))

View File

@ -1,58 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
cloze_prompts ={
"cloze_arith_en": [
dict(role='HUMAN', prompt='Q: Calculate (341/11)/(9/(-6)*(-2)/3).'),
dict(role='BOT', prompt='A: First, (9/(-6)*(-2)/3) can be simplified by : 9/(-6) = -1.5, -1.5 * (-2) = 3, 3 / 3 = 1. So, (9/(-6)*(-2)/3) is equal to 1. Now, we have `(341/11)/1` equals `341/11`. Finally, calculate `341/11 = 31`. The answer is 31.\n'),
dict(role='HUMAN', prompt='Q: In base 14, what is 5 - 638d8d?'),
dict(role='BOT', prompt='A: 5 - 638d8d = -638d88. The answer is -638d88.\n'),
dict(role='HUMAN', prompt='Q: What is -491354 times -0.34?'),
dict(role='BOT', prompt='A: The product of -491354 and -0.34 is 167060.36. The answer is 167060.36.\n'),
dict(role='HUMAN', prompt='Q: What is the value of (-55)/(6930/(-382)) + (0 - 3)?.'),
dict(role='BOT', prompt='A: First, (-55)/(6930/(-382)) = (-55)/(-(6930/382)) = 55*382/6930 = 21010/6930 = 2101/693. Then, 2101/693 + (0 - 3) = 2101/693 - 3 = 2101/693 - 3*693/693 = (2101-2079)/693 = 22/693 = 2/63. The answer is 2/63.\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}\n'),
]
}
mathbench_sets = {
'arithmetic': ['cloze_arith_en'],
}
mathbench_datasets = []
for _split in list(mathbench_sets.keys()):
    for _name in mathbench_sets[_split]:
        mathbench_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=cloze_prompts[_name],
                ),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=512),
        )
        mathbench_eval_cfg = dict(
            evaluator=dict(type=AccEvaluator),
            pred_postprocessor=dict(type=mathbench_postprocess, name=_name))
        mathbench_datasets.append(
            dict(
                type=MathBenchDataset,
                path=f"./data/mathbench/{_split}",
                name=_name,
                with_circular=False,
                abbr="mathbench-arithmetic" + _split + '-' + _name,
                reader_cfg=dict(
                    input_columns=["question"],
                    output_column="answer"
                ),
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            ))

View File

@ -1,4 +0,0 @@
from mmengine.config import read_base
with read_base():
from .mathbench_gen_ad37c1 import mathbench_datasets # noqa: F401, F403

View File

@ -1,108 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import CircularEvaluator, AccEvaluator
from opencompass.datasets import MathBenchDataset, mathbench_postprocess
from opencompass.utils.text_postprocessors import first_option_postprocess
single_choice_prompts = {
"single_choice_cn_with_reasoning": "以下是一道关于数学的单项选择题,请你一步一步推理并得到最终的答案选项。回答格式为如下:\n答案选项A、B、C、D中你认为正确的一个选项\n计算过程:根据题目得到选项答案的一步步过程\n请严格按照上面的格式回答问题,下面是你要回答的题目:\n{question}\n答案选项:",
"single_choice_cn": "以下是一道关于数学的单项选择题,请你直接回答正确答案的选项序号。\n下面是你要回答的题目:\n{question}\n答案选项:",
"single_choice_en_with_reasoning": "Here is a multiple-choice question about mathematics. Please provide the final answer option by step-by-step reasoning. Please answer in the following format:\nAnswer option: A, B, C, or D (the option you believe is correct)\nCalculation process: Step-by-step process to derive the answer option based on the question\nPlease strictly follow the above format to answer the question. Here is the question you need to answer:\n{question}\nAnswer option:",
"single_choice_en": "Here is a multiple-choice question about mathematics. Please provide the correct answer option directly.\nHere is the question you need to answer:\n{question}\nAnswer option:",
}
cloze_prompts = {
"cloze_cn": [
dict(role='HUMAN', prompt='Q: 林中有15棵树。林务工人员今天将在林中种植树木。完成后将有21棵树。林务工人员今天种植了多少棵树'),
dict(role='BOT', prompt='A: 我们从15棵树开始。后来有21棵树。差值必定是他们种植的树木数量。所以他们必须种植了21 - 15 = 6棵树。答案是 6\n'),
dict(role='HUMAN', prompt='Q: 如果停车场有3辆车又有2辆车进来停车场里有多少辆车'),
dict(role='BOT', prompt='A: 停车场已经有3辆车。又进来了2辆车。现在有3 + 2 = 5辆车。答案是 5\n'),
dict(role='HUMAN', prompt='Q: Leah有32块巧克力她的妹妹有42块。如果他们吃了35块他们总共剩下多少块'),
dict(role='BOT', prompt='A: Leah有32块巧克力Leah的妹妹有42块。这意味着原本有32 + 42 = 74块巧克力。被吃掉了35块。所以他们总共还剩下74 - 35 = 39块巧克力。答案是 39\n'),
dict(role='HUMAN', prompt='Q: Jason有20个棒棒糖。他给Denny一些棒棒糖。现在Jason只剩下12个棒棒糖。Jason给Denny多少个棒棒糖'),
dict(role='BOT', prompt='A: Jason有20个棒棒糖。因为他现在只剩下12个所以他必须把剩下的都给了Denny。他给Denny的棒棒糖数量必定是20 - 12 = 8个。答案是 8\n'),
dict(role='HUMAN', prompt='Q: Shawn有五个玩具。在圣诞节他从他的爸爸和妈妈那里各得到了两个玩具。现在他有多少个玩具'),
dict(role='BOT', prompt='A: 他有5个玩具。他从妈妈那里得到了2个所以之后他有5 + 2 = 7个玩具。然后他从爸爸那里得到了2个所以总共他有7 + 2 = 9个玩具。答案是 9\n'),
dict(role='HUMAN', prompt='Q: 服务器房间里有九台电脑。从周一到周四每天增加五台电脑。现在服务器房里有多少台电脑?'),
dict(role='BOT', prompt='A: 从周一到周四有4天。每天增加5台电脑。这意味着总共增加了4 * 5 = 20台电脑。一开始有9台电脑所以现在有9 + 20 = 29台电脑。答案是 29\n'),
dict(role='HUMAN', prompt='Q: Michael有58个高尔夫球。星期二他丢失了23个高尔夫球。星期三他又丢失了2个。星期三结束时他还剩下多少个高尔夫球'),
dict(role='BOT', prompt='A: Michael一开始有58个球。星期二他丢失了23个所以之后他还剩下58 - 23 = 35个球。星期三他又丢失了2个所以现在他还剩下35 - 2 = 33个球。答案是 33\n'),
dict(role='HUMAN', prompt='Q: Olivia有23美元。她用每个3美元的价格买了五个百吉饼。她还剩下多少钱'),
dict(role='BOT', prompt='A: 她以每个3美元的价格买了5个百吉饼。这意味着她在百吉饼上花费了5 * 3 = 15美元。她一开始有23美元所以现在她还剩下23 - 15 = 8美元。答案是 8\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}'),
],
"cloze_en": [
dict(role='HUMAN', prompt='Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?'),
dict(role='BOT', prompt='A: We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\n'),
dict(role='HUMAN', prompt='Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?'),
dict(role='BOT', prompt='A: There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\n'),
dict(role='HUMAN', prompt='Q: Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?'),
dict(role='BOT', prompt="A: Leah had 32 chocolates and Leah's sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\n"),
dict(role='HUMAN', prompt='Q: Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?'),
dict(role='BOT', prompt='A: Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?'),
dict(role='BOT', prompt='A: He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\n'),
dict(role='HUMAN', prompt='Q: There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?'),
dict(role='BOT', prompt='A: There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\n'),
dict(role='HUMAN', prompt='Q: Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?'),
dict(role='BOT', prompt='A: Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\n'),
dict(role='HUMAN', prompt='Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?'),
dict(role='BOT', prompt='A: She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in beginning, so now she has $23 - $15 = $8. The answer is 8.\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}\n'),
]}
mathbench_sets = {
'college': ['single_choice_cn', 'cloze_en'],
'high': ['single_choice_cn', 'single_choice_en'],
'middle': ['single_choice_cn'],
'primary': ['cloze_cn']
}
# Generate reasoning path if set True or just generate the final answer
with_reasoning = False
# Use circular evaluation or not
with_circular_eval = True
mathbench_datasets = []
for _split in list(mathbench_sets.keys()):
    for _name in mathbench_sets[_split]:
        mathbench_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(
                            role="HUMAN",
                            prompt=single_choice_prompts[_name + "_with_reasoning"] if with_reasoning else single_choice_prompts[_name],
                        ),
                        dict(role="BOT", prompt="{answer}")] if 'choice' in _name else cloze_prompts[_name],
                ),
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=512),
        )
        mathbench_eval_cfg = dict(
            evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
            pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
        mathbench_datasets.append(
            dict(
                abbr="mathbench-" + _split + '-' + _name,
                type=MathBenchDataset,
                path=f"./data/mathbench/{_split}",
                name=_name,
                with_circular=with_circular_eval,
                reader_cfg=dict(
                    input_columns=["question"],
                    output_column="answer"
                ),
                infer_cfg=mathbench_infer_cfg,
                eval_cfg=mathbench_eval_cfg,
            ))

View File

@ -1,4 +0,0 @@
from mmengine.config import read_base
with read_base():
from .reasonbench_gen_d15233 import reasonbench_datasets

View File

@ -1,140 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.utils.text_postprocessors import first_capital_postprocess
from opencompass.datasets.reasonbench import ReasonBenchDataset
reasonbench_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_postprocessor=dict(type=first_capital_postprocess)
)
reader_cfgs = []
for i in range(2, 5):
    choices = ["A", "B", "C", "D"][:i]
    reader_cfgs.append(dict(
        input_columns=["prompt_ppl"],
        output_column="label_ppl")
    )
infer_cfg=dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
begin="</E>",
round=[
dict(
role="HUMAN",
prompt="</E>{prompt_ppl}"
),
dict(role="BOT", prompt="Answer: {label_ppl}"),
]),
ice_token="</E>",
),
retriever=dict(type=FixKRetriever, fix_id_list=[]),
inferencer=dict(type=GenInferencer)
)
CausalReasoningDataset = [
dict(
abbr="reasonbench-causal",
type=ReasonBenchDataset,
path="data/reasonbench/causal.jsonl",
reader_cfg=reader_cfgs[0],
infer_cfg=infer_cfg,
eval_cfg=reasonbench_eval_cfg),
]
CommonsenseReasoningDataset = [
dict(
abbr="reasonbench-commonsense",
type=ReasonBenchDataset,
path="data/reasonbench/commonsense.jsonl",
reader_cfg=reader_cfgs[1],
infer_cfg=infer_cfg,
eval_cfg=reasonbench_eval_cfg),
]
AbductiveReasoningDataset = [
dict(
abbr="reasonbench-abductive",
type=ReasonBenchDataset,
path="data/reasonbench/abductive.jsonl",
reader_cfg=reader_cfgs[0],
infer_cfg=infer_cfg,
eval_cfg=reasonbench_eval_cfg),
]
DeductiveReasoningDataset = [
dict(
abbr="reasonbench-deductive",
type=ReasonBenchDataset,
path="data/reasonbench/deductive.jsonl",
reader_cfg=reader_cfgs[1],
infer_cfg=infer_cfg,
eval_cfg=reasonbench_eval_cfg),
]
InductiveReasoningDataset = [
dict(
abbr="reasonbench-inductive",
type=ReasonBenchDataset,
path="data/reasonbench/inductive.jsonl",
reader_cfg=reader_cfgs[0],
infer_cfg=infer_cfg,
eval_cfg=reasonbench_eval_cfg),
]
SymbolicReasoningDataset = [
dict(
abbr="reasonbench-symbolic",
type=ReasonBenchDataset,
path="data/reasonbench/symbolic.jsonl",
reader_cfg=reader_cfgs[2],
infer_cfg=infer_cfg,
eval_cfg=reasonbench_eval_cfg),
]
CLEVA_CommonsenseReasoningDataset = [
dict(
abbr="reasonbench-cleva_commonsense",
type=ReasonBenchDataset,
path="data/reasonbench/cleva_commonsense.jsonl",
reader_cfg=reader_cfgs[1],
infer_cfg=infer_cfg,
eval_cfg=reasonbench_eval_cfg),
]
CLEVA_DeductiveReasoningDataset = [
dict(
abbr="reasonbench-cleva_deductive",
type=ReasonBenchDataset,
path="data/reasonbench/cleva_deductive.jsonl",
reader_cfg=reader_cfgs[1],
infer_cfg=infer_cfg,
eval_cfg=reasonbench_eval_cfg),
]
CLEVA_InductiveReasoningDataset = [
dict(
abbr="reasonbench-cleva_inductive",
type=ReasonBenchDataset,
path="data/reasonbench/cleva_inductive.jsonl",
reader_cfg=reader_cfgs[0],
infer_cfg=infer_cfg,
eval_cfg=reasonbench_eval_cfg),
]
reasonbench_datasets = \
CLEVA_CommonsenseReasoningDataset + \
CLEVA_DeductiveReasoningDataset + \
CLEVA_InductiveReasoningDataset + \
CausalReasoningDataset + \
CommonsenseReasoningDataset + \
AbductiveReasoningDataset + \
DeductiveReasoningDataset + \
InductiveReasoningDataset + \
SymbolicReasoningDataset

View File

@ -1,4 +0,0 @@
from mmengine.config import read_base
with read_base():
from .reasonbench_ppl_b4a005 import reasonbench_datasets

View File

@ -1,136 +0,0 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets.reasonbench import ReasonBenchDataset
reasonbench_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role="BOT",
)
reader_cfgs, infer_cfgs = [], []
for i in range(2, 5):
    choices = ["A", "B", "C", "D"][:i]
    reader_cfgs.append(dict(
        input_columns=["prompt_ppl"] + choices + ["choices"],
        output_column="label")
    )
    infer_cfgs.append(dict(
        prompt_template=dict(
            type=PromptTemplate,
            template={
                str(id):
                dict(
                    round=[
                        dict(role="HUMAN", prompt="{prompt_ppl}Answer:"),
                        dict(role="BOT", prompt=f"{choice}")
                    ], )
                for id, choice in enumerate(choices)
            }),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=PPLInferencer)
    ))
CausalReasoningDataset = [
dict(
abbr="reasonbench-causal",
type=ReasonBenchDataset,
path="data/reasonbench/causal.jsonl",
reader_cfg=reader_cfgs[0],
infer_cfg=infer_cfgs[0],
eval_cfg=reasonbench_eval_cfg),
]
CommonsenseReasoningDataset = [
dict(
abbr="reasonbench-commonsense",
type=ReasonBenchDataset,
path="data/reasonbench/commonsense.jsonl",
reader_cfg=reader_cfgs[1],
infer_cfg=infer_cfgs[1],
eval_cfg=reasonbench_eval_cfg),
]
AbductiveReasoningDataset = [
dict(
abbr="reasonbench-abductive",
type=ReasonBenchDataset,
path="data/reasonbench/abductive.jsonl",
reader_cfg=reader_cfgs[0],
infer_cfg=infer_cfgs[0],
eval_cfg=reasonbench_eval_cfg),
]
DeductiveReasoningDataset = [
dict(
abbr="reasonbench-deductive",
type=ReasonBenchDataset,
path="data/reasonbench/deductive.jsonl",
reader_cfg=reader_cfgs[1],
infer_cfg=infer_cfgs[1],
eval_cfg=reasonbench_eval_cfg),
]
InductiveReasoningDataset = [
dict(
abbr="reasonbench-inductive",
type=ReasonBenchDataset,
path="data/reasonbench/inductive.jsonl",
reader_cfg=reader_cfgs[0],
infer_cfg=infer_cfgs[0],
eval_cfg=reasonbench_eval_cfg),
]
SymbolicReasoningDataset = [
dict(
abbr="reasonbench-symbolic",
type=ReasonBenchDataset,
path="data/reasonbench/symbolic.jsonl",
reader_cfg=reader_cfgs[2],
infer_cfg=infer_cfgs[2],
eval_cfg=reasonbench_eval_cfg),
]
CLEVA_CommonsenseReasoningDataset = [
dict(
abbr="reasonbench-cleva_commonsense",
type=ReasonBenchDataset,
path="data/reasonbench/cleva_commonsense.jsonl",
reader_cfg=reader_cfgs[1],
infer_cfg=infer_cfgs[1],
eval_cfg=reasonbench_eval_cfg),
]
CLEVA_DeductiveReasoningDataset = [
dict(
abbr="reasonbench-cleva_deductive",
type=ReasonBenchDataset,
path="data/reasonbench/cleva_deductive.jsonl",
reader_cfg=reader_cfgs[1],
infer_cfg=infer_cfgs[1],
eval_cfg=reasonbench_eval_cfg),
]
CLEVA_InductiveReasoningDataset = [
dict(
abbr="reasonbench-cleva_inductive",
type=ReasonBenchDataset,
path="data/reasonbench/cleva_inductive.jsonl",
reader_cfg=reader_cfgs[0],
infer_cfg=infer_cfgs[0],
eval_cfg=reasonbench_eval_cfg),
]
reasonbench_datasets = \
CLEVA_CommonsenseReasoningDataset + \
CLEVA_DeductiveReasoningDataset + \
CLEVA_InductiveReasoningDataset + \
CausalReasoningDataset + \
CommonsenseReasoningDataset + \
AbductiveReasoningDataset + \
DeductiveReasoningDataset + \
InductiveReasoningDataset + \
SymbolicReasoningDataset

View File

@ -47,7 +47,7 @@ with read_base():
from ..piqa.piqa_ppl_1cf9f0 import piqa_datasets
from ..siqa.siqa_ppl_ced5f6 import siqa_datasets
from ..strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
from ..winogrande.winogrande_ppl_55a66e import winogrande_datasets
from ..winogrande.winogrande_ll_c5cf57 import winogrande_datasets
from ..obqa.obqa_ppl_c7c154 import obqa_datasets
from ..nq.nq_gen_c788f6 import nq_datasets
from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets

View File

@ -47,7 +47,7 @@ with read_base():
from ..piqa.piqa_ppl_0cfff2 import piqa_datasets
from ..siqa.siqa_ppl_e8d8c5 import siqa_datasets
from ..strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
from ..winogrande.winogrande_ppl_55a66e import winogrande_datasets
from ..winogrande.winogrande_ll_c5cf57 import winogrande_datasets
from ..obqa.obqa_ppl_6aac9e import obqa_datasets
from ..nq.nq_gen_0356ec import nq_datasets
from ..triviaqa.triviaqa_gen_0356ec import triviaqa_datasets

View File

@ -30,7 +30,7 @@ with read_base():
from ..summedits.summedits_ppl_1fbeb6 import summedits_datasets
from ..hellaswag.hellaswag_ppl_47bff9 import hellaswag_datasets
from ..piqa.piqa_ppl_1cf9f0 import piqa_datasets
from ..winogrande.winogrande_ppl_55a66e import winogrande_datasets
from ..winogrande.winogrande_ll_c5cf57 import winogrande_datasets
from ..obqa.obqa_ppl_c7c154 import obqa_datasets
from ..nq.nq_gen_c788f6 import nq_datasets
from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets

View File

@ -0,0 +1,55 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import (
GSM8KDataset,
gsm8k_postprocess,
gsm8k_dataset_postprocess,
Gsm8kAgentEvaluator,
)
gsm8k_reader_cfg = dict(input_columns=["question"], output_column="answer")
gsm8k_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
# # ################################### NEW SHOT ###################################
dict(role='HUMAN', prompt='Mark\'s basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What\'s the total number of points scored by both teams added together?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result\n```'),
dict(role='SYSTEM', prompt='Response:210'),
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 210'),
dict(role='HUMAN', prompt='Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?'),
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n marbles = 60\n num_increased_marbles = marbles * 2 / 5\n num_total_marbles = marbles + num_increased_marbles\n frisbees = marbles / 2\n num_increased_frisbees = frisbees * 2 / 5\n num_total_frisbees = frisbees + num_increased_frisbees\n deck_cards = frisbees - 20\n num_increased_deck_cards = deck_cards * 2 / 5\n num_total_deck_cards = deck_cards + num_increased_deck_cards\n num_total = num_total_marbles + num_total_frisbees + num_total_deck_cards\n result = num_total\n return result\n```'),
dict(role='SYSTEM', prompt='Response:140'),
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 140'),
dict(role='HUMAN', prompt='A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?'),
dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:```python\ndef solution():\n num_fruits_per_first_three_basket = 9 + 15 + 14\n num_fruits_first_three_basket = num_fruits_per_first_three_basket * 3\n num_apple_fourth_basket = 9 - 2\n num_orange_fourth_basket = 15 - 2\n num_banana_fourth_basket = 14 - 2\n num_fruits_fourth_basket = num_apple_fourth_basket + num_orange_fourth_basket + num_banana_fourth_basket\n num_fruits_total = num_fruits_first_three_basket + num_fruits_fourth_basket\n result = num_fruits_total\n return result\n```"""),
dict(role='SYSTEM', prompt='Response:146'),
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 146'),
dict(role='HUMAN', prompt='{question}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer),
)
gsm8k_eval_cfg = dict(
evaluator=dict(type=Gsm8kAgentEvaluator),
pred_postprocessor=dict(type=gsm8k_postprocess),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess),
)
gsm8k_datasets = [
dict(
abbr='gsm8k-agent',
type=GSM8KDataset,
path='./data/gsm8k',
reader_cfg=gsm8k_reader_cfg,
infer_cfg=gsm8k_infer_cfg,
eval_cfg=gsm8k_eval_cfg,
)
]
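Note: like the other `*_datasets` lists touched in this commit, `gsm8k_datasets` is meant to be pulled into a top-level evaluation config via `read_base()`, mirroring the aggregator files elsewhere in this diff. A minimal sketch follows; the import path is a hypothetical file location, not one stated in this diff.

# Hedged sketch: referencing the gsm8k-agent dataset from an eval config.
# The module path below assumes this file lives at
# configs/datasets/gsm8k/gsm8k_agent_gen.py in an OpenCompass checkout.
from mmengine.config import read_base

with read_base():
    from .datasets.gsm8k.gsm8k_agent_gen import gsm8k_datasets

datasets = [*gsm8k_datasets]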

View File

@ -33,12 +33,12 @@ Create a {lang} script for this problem:
humanevalx_eval_cfg_dict = {
lang: dict(
evaluator=dict(
type=HumanevalXEvaluator,
type=HumanevalXEvaluator,
language=lang,
ip_address=
"localhost", # replace to your code_eval_server ip_address, port
port=5000
), # refer to https://github.com/Ezra-Yu/code-evaluator to launch a server
port=5001
), # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server
pred_role='BOT')
for lang in ['python', 'cpp', 'go', 'java', 'js'
] # do not support rust now

View File

@ -15,12 +15,13 @@ humanevalx_infer_cfg = dict(
humanevalx_eval_cfg_dict = {
lang : dict(
evaluator=dict(
type=HumanevalXEvaluator,
language=lang,
ip_address="localhost", # replace to your code_eval_server ip_address, port
port=5000), # refer to https://github.com/Ezra-Yu/code-evaluator to launch a server
pred_role='BOT')
evaluator=dict(
type=HumanevalXEvaluator,
language=lang,
ip_address=
"localhost", # replace to your code_eval_server ip_address, port
port=5001), # refer to https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html to launch a server
pred_role='BOT')
for lang in ['python', 'cpp', 'go', 'java', 'js'] # do not support rust now
}
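Note: both HumanevalX hunks point the evaluator at an external code-eval server and bump the default port from 5000 to 5001. If the server runs on another host, the address can be overridden after the dict comprehension; a hedged sketch, with placeholder values rather than anything taken from this diff:

# Hypothetical override: point every language's evaluator at a remote
# code_eval_server instead of localhost (address and port are placeholders).
for lang, cfg in humanevalx_eval_cfg_dict.items():
    cfg['evaluator']['ip_address'] = '192.0.2.10'  # assumed server address
    cfg['evaluator']['port'] = 5001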

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .hungarian_exam_gen_8a1435 import hungarianmath_datasets # noqa: F401, F403

View File

@ -0,0 +1,91 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import HungarianExamMathDataset
hungarianmath_reader_cfg = dict(input_columns=['question'], output_column=None)
template = """Problem:
Find the domain of the expression $\frac{\sqrt{x-2}}{\sqrt{5-x}}$.
Solution:
To determine the domain, we must ensure that:
1. The expressions inside each square root are non-negative.
2. The denominator is not equal to zero.
For the numerator, $x-2 \ge 0$ gives $x \ge 2$.
For the denominator, $5-x \ge 0$ gives $x \le 5$. And since the denominator cannot be zero, $5-x > 0$ which further narrows it to $x < 5$.
Combining these results, the domain of the expression is $[2,5)$.
Final Answer: The final answer is $[2,5)$.
Problem:
If $\det \mathbf{A} = 2$ and $\det \mathbf{B} = 12$, then find $\det (\mathbf{A} \mathbf{B})$.
Solution:
Using the property of determinants, we can say that:
$\det (\mathbf{A} \mathbf{B}) = (\det \mathbf{A})(\det \mathbf{B})$.
Plugging in the given values:
$\det (\mathbf{A} \mathbf{B}) = 2 \times 12 = 24$.
Final Answer: The final answer is $24$.
Problem:
Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?
Solution:
First, calculate the total weight Terrell lifts with the 20-pound weights:
$2 \times 12 \times 20 = 480$ pounds.
If he uses 15-pound weights and lifts them $n$ times:
$2 \times 15 \times n = 30n$ pounds.
To find $n$, set these two equal:
\begin{align*}
30n &= 480 \\
n &= \frac{480}{30} \\
n &= 16
\end{align*}
Final Answer: The final answer is $16$.
Problem:
If the system of equations
\begin{align*}
6x-4y &= a, \\
6y-9x &= b.
\end{align*}
has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\frac{a}{b}$, assuming $b$ is nonzero.
Solution:
Multiply the first equation by $-\frac{3}{2}$ to obtain:
$6y-9x = -\frac{3}{2}a$.
Since we also know that $6y-9x = b$, equating them gives:
$-\frac{3}{2}a = b$ which implies $\frac{a}{b} = -\frac{2}{3}$.
Final Answer: The final answer is $-\frac{2}{3}$."""
hungarianmath_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt=template+"\n\nProblem:\n{question}\n\nSolution:\n"),
],
)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024))
# Attention: this math dataset needs human to evaluate the generated answer, so the AccEvaluator is just a placeholder.
hungarianmath_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
hungarianmath_datasets = [
dict(
abbr='HungarianExamMath',
type=HungarianExamMathDataset,
path='./data/HungarianExamMath/test.csv',
reader_cfg=hungarianmath_reader_cfg,
infer_cfg=hungarianmath_infer_cfg,
eval_cfg=hungarianmath_eval_cfg)
]
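Note: since `AccEvaluator` is only a placeholder here, the generated Hungarian-exam solutions still have to be graded by hand. A hedged sketch of collecting them for review follows; the output-directory layout is an assumption about where a run stores its predictions, not something stated in this diff.

# Hypothetical helper: load raw predictions of a HungarianExamMath run for
# manual grading. The path pattern is an assumption; adjust it to the actual
# work directory of your run.
import json
from pathlib import Path

def load_predictions(workdir: str, model_abbr: str):
    pred_file = Path(workdir) / 'predictions' / model_abbr / 'HungarianExamMath.json'
    with open(pred_file, encoding='utf-8') as f:
        return json.load(f)  # mapping of sample index -> prediction record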

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .longbench_multi_news_gen_f6e3fb import LongBench_multi_news_datasets # noqa: F401, F403
from .longbench_multi_news_gen_6f9da9 import LongBench_multi_news_datasets # noqa: F401, F403

View File

@ -15,7 +15,7 @@ LongBench_multi_news_infer_cfg = dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:'),
dict(role='HUMAN', prompt='You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{context}\n\nNow, write a one-page summary of all the news.\n\nSummary:\n'),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512)

View File

@ -0,0 +1,99 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import (
MATHDataset, MATHAgentEvaluator, math_postprocess_v2
)
# use pal format but not perform well
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
# # ################################### NEW SHOT ###################################
dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.'),
dict(role='BOT', prompt="""Tool:PythonInterpreter
Tool Input:```python
from sympy import symbols, simplify
def solution():
    x = symbols('x')
    expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2)
    simplified_expr = simplify(expr)
    x3_coefficient = simplified_expr.as_coefficients_dict()[x**3]
    result = x3_coefficient
    return result
```"""),
dict(role='SYSTEM', prompt='Response:26'),
dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'),
dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'),
dict(role='BOT', prompt="""Tool:PythonInterpreter
Tool Input:```python
import math
def solution():
    radius = 6
    # Surface area of the hemisphere
    hemisphere_area = 2 * math.pi * radius**2
    # Area of the circular base
    base_area = math.pi * radius**2
    # Total surface area
    total_surface_area = hemisphere_area + base_area
    # Formatting the result in LaTeX
    result = r'{}\pi'.format(total_surface_area / math.pi)
    return result
```"""),
dict(role='SYSTEM', prompt='Response:108.0\\pi'),
dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'),
dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'),
dict(role='BOT', prompt="""Tool:PythonInterpreter
Tool Input:```python
def solution():
    # Probabilities of each outcome
    prime_prob = 1 / 6
    composite_prob = 1 / 3
    otherwise_prob = 1 / 6
    # Expected value of each outcome
    prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob)
    composite_expected_value = 0 * composite_prob
    otherwise_expected_value = -3 * otherwise_prob
    # Total expected value
    total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value
    # Dollar value to the nearest cent
    result = "{:.2f}".format(total_expected_value)
    return result
```"""),
dict(role='SYSTEM', prompt='Response:1.17'),
dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'),
dict(role='HUMAN', prompt='{problem}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer),
)
math_eval_cfg = dict(
evaluator=dict(
type=MATHAgentEvaluator,
version='v2'),
pred_postprocessor=dict(type=math_postprocess_v2))
math_datasets = [
dict(
abbr='math-agent',
type=MATHDataset,
path='./data/math/math.json',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]

View File

@ -0,0 +1,90 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import (
MATHDataset, MATHAgentEvaluator, math_postprocess_v2
)
# use pal format but not perform well
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
# # ################################### NEW SHOT ###################################
dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.'),
dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:from sympy import symbols, simplify
def solution():
    x = symbols('x')
    expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2)
    simplified_expr = simplify(expr)
    x3_coefficient = simplified_expr.as_coefficients_dict()[x**3]
    result = x3_coefficient
    return result"""),
dict(role='SYSTEM', prompt='Response:26'),
dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'),
dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'),
dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:import math
def solution():
    radius = 6
    # Surface area of the hemisphere
    hemisphere_area = 2 * math.pi * radius**2
    # Area of the circular base
    base_area = math.pi * radius**2
    # Total surface area
    total_surface_area = hemisphere_area + base_area
    # Formatting the result in LaTeX
    result = r'{}\pi'.format(total_surface_area / math.pi)
    return result"""),
dict(role='SYSTEM', prompt='Response:108.0\\pi'),
dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'),
dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'),
dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():
    # Probabilities of each outcome
    prime_prob = 1 / 6
    composite_prob = 1 / 3
    otherwise_prob = 1 / 6
    # Expected value of each outcome
    prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob)
    composite_expected_value = 0 * composite_prob
    otherwise_expected_value = -3 * otherwise_prob
    # Total expected value
    total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value
    # Dollar value to the nearest cent
    result = "{:.2f}".format(total_expected_value)
    return result"""),
dict(role='SYSTEM', prompt='Response:1.17'),
dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'),
dict(role='HUMAN', prompt='{problem}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer),
)
math_eval_cfg = dict(
evaluator=dict(
type=MATHAgentEvaluator,
version='v2'),
pred_postprocessor=dict(type=math_postprocess_v2))
math_datasets = [
dict(
abbr='math-agent',
type=MATHDataset,
path='./data/math/math.json',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]

View File

@ -0,0 +1,98 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import (
MATHDataset, MATHAgentEvaluator, math_postprocess
)
# use pal format but not perform well
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
# # ################################### NEW SHOT ###################################
dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.'),
dict(role='BOT', prompt="""Tool:PythonInterpreter
Tool Input:```python
from sympy import symbols, simplify
def solution():
    x = symbols('x')
    expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2)
    simplified_expr = simplify(expr)
    x3_coefficient = simplified_expr.as_coefficients_dict()[x**3]
    result = x3_coefficient
    return result
```"""),
dict(role='SYSTEM', prompt='Response:26'),
dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'),
dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'),
dict(role='BOT', prompt="""Tool:PythonInterpreter
Tool Input:```python
import math
def solution():
    radius = 6
    # Surface area of the hemisphere
    hemisphere_area = 2 * math.pi * radius**2
    # Area of the circular base
    base_area = math.pi * radius**2
    # Total surface area
    total_surface_area = hemisphere_area + base_area
    # Formatting the result in LaTeX
    result = r'{}\pi'.format(total_surface_area / math.pi)
    return result
```"""),
dict(role='SYSTEM', prompt='Response:108.0\\pi'),
dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'),
dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'),
dict(role='BOT', prompt="""Tool:PythonInterpreter
Tool Input:```python
def solution():
    # Probabilities of each outcome
    prime_prob = 1 / 6
    composite_prob = 1 / 3
    otherwise_prob = 1 / 6
    # Expected value of each outcome
    prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob)
    composite_expected_value = 0 * composite_prob
    otherwise_expected_value = -3 * otherwise_prob
    # Total expected value
    total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value
    # Dollar value to the nearest cent
    result = "{:.2f}".format(total_expected_value)
    return result
```"""),
dict(role='SYSTEM', prompt='Response:1.17'),
dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'),
dict(role='HUMAN', prompt='{problem}'),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer),
)
math_eval_cfg = dict(
evaluator=dict(type=MATHAgentEvaluator),
pred_postprocessor=dict(type=math_postprocess),
)
math_datasets = [
dict(
abbr='math-agent',
type=MATHDataset,
path='./data/math/math.json',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg,
)
]

View File

@ -0,0 +1,72 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')
math_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN",
prompt=
"Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.}}\nSolution:"
),
dict(
role="BOT",
prompt=
"The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n"
),
dict(
role="HUMAN",
prompt=
"Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:"
),
dict(
role="BOT",
prompt=
"We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n"
),
dict(
role="HUMAN",
prompt=
"Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:"
),
dict(
role="BOT",
prompt=
"If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n"
),
dict(
role="HUMAN",
prompt=
"Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:"
),
dict(
role="BOT",
prompt=
"If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n"
),
dict(role="HUMAN", prompt="Problem:\n{problem}\nSolution:\n"),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
# postprocess v2
math_eval_cfg = dict(
evaluator=dict(
type=MATHEvaluator,
version='v2'),
pred_postprocessor=dict(type=math_postprocess_v2))
math_datasets = [
dict(
type=MATHDataset,
abbr='math',
path='./data/math/math.json',
reader_cfg=math_reader_cfg,
infer_cfg=math_infer_cfg,
eval_cfg=math_eval_cfg)
]
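Note: the few-shot answers above all end in `\boxed{...}`, which is what `math_postprocess_v2` is expected to pull out before scoring. As a rough illustration only (not the actual implementation of `math_postprocess_v2`), extracting the last boxed expression could look like this:

# Conceptual sketch of boxed-answer extraction; the real math_postprocess_v2
# in OpenCompass may apply additional normalization.
def extract_last_boxed(text: str) -> str:
    marker = r'\boxed{'
    start = text.rfind(marker)
    if start == -1:
        return text.strip()
    i = start + len(marker)
    depth = 1
    out = []
    while i < len(text) and depth:
        ch = text[i]
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                break
        out.append(ch)
        i += 1
    return ''.join(out)

print(extract_last_boxed(r'... the domain is $\boxed{[2,5)}$.'))  # -> [2,5)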

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .math401_gen_ab5f39 import math401_datasets # noqa: F401, F403

View File

@ -0,0 +1,47 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MathBenchDataset, Math401Evaluator, mathbench_postprocess
cloze_prompt = [
dict(role='HUMAN', prompt='Q: Calculate 2.9-0.11.'),
dict(role='BOT', prompt='A: Let\'s think step by step, 2.9 - 0.11 equals 2.7900. The answer is 2.7900.\n'),
dict(role='HUMAN', prompt='Q: Calculate 0.15-0.032.'),
dict(role='BOT', prompt='A: Let\'s think step by step, 0.15 - 0.032 equals 0.1180. The answer is 0.1180.\n'),
dict(role='HUMAN', prompt='Q: Calculate 78*64.'),
dict(role='BOT', prompt='A: Let\'s think step by step, 78 multiplied by 64 equals 4992. The answer is 4992.\n'),
dict(role='HUMAN', prompt='Q: Calculate 62×42.'),
dict(role='BOT', prompt='A: Let\'s think step by step, 62 multiplied by 42 equals 2604. The answer is 2604.\n'),
dict(role='HUMAN', prompt='Q: Calculate {question}'),
dict(role='BOT', prompt='A: {answer}\n')]
math401_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=cloze_prompt,
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
math401_eval_cfg = dict(
evaluator=dict(type=Math401Evaluator),
pred_postprocessor=dict(type=mathbench_postprocess, name='en'))
math401_datasets = [
dict(
abbr="math401",
type=MathBenchDataset,
path=f"./data/math401/",
with_circular=False,
name="cloze_en",
reader_cfg=dict(
input_columns=["question"],
output_column="answer"
),
infer_cfg=math401_infer_cfg,
eval_cfg=math401_eval_cfg,
)]
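Note: `Math401Evaluator` scores free-form arithmetic answers, so correctness comes down to numeric comparison rather than exact string match. A conceptual sketch only; the evaluator's actual tolerance rules are not shown in this diff.

# Illustration of numeric matching with a small relative tolerance;
# the real Math401Evaluator may use different rules.
import math

def numerically_equal(pred: str, ref: str, rel_tol: float = 1e-3) -> bool:
    try:
        return math.isclose(float(pred), float(ref), rel_tol=rel_tol)
    except ValueError:
        return pred.strip() == ref.strip()

assert numerically_equal('2.7900', '2.79')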

View File

@ -57,7 +57,7 @@ sanitized_mbpp_datasets = [
dict(
type=SanitizedMBPPDataset,
abbr='sanitized_mbpp',
path='./sanitized-mbpp.jsonl',
path='./data/mbpp/sanitized-mbpp.jsonl',
reader_cfg=sanitized_mbpp_reader_cfg,
infer_cfg=sanitized_mbpp_infer_cfg,
eval_cfg=sanitized_mbpp_eval_cfg)

View File

@ -57,7 +57,7 @@ sanitized_mbpp_datasets = [
dict(
type=SanitizedMBPPDataset,
abbr='sanitized_mbpp_passk',
path='./sanitized-mbpp.jsonl',
path='./data/mbpp/sanitized-mbpp.jsonl',
reader_cfg=sanitized_mbpp_reader_cfg,
infer_cfg=sanitized_mbpp_infer_cfg,
eval_cfg=sanitized_mbpp_eval_cfg)

View File

@ -57,7 +57,7 @@ sanitized_mbpp_datasets = [
dict(
type=SanitizedMBPPDataset,
abbr='sanitized_mbpp_repeat10',
path='./sanitized-mbpp.jsonl',
path='./data/mbpp/sanitized-mbpp.jsonl',
num_repeats=10,
reader_cfg=sanitized_mbpp_reader_cfg,
infer_cfg=sanitized_mbpp_infer_cfg,

View File

@ -0,0 +1,61 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever, RandomRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import NQOpenDataset, NQEvaluator
nq_datasets = []
for k in [0, 1, 5, 25]:
nq_reader_cfg = dict(
input_columns=['question'], output_column='answer', train_split='train', test_split='validation')
if k == 0:
nq_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='Q: {question}?'),
dict(role='BOT', prompt='A:'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=50)
)
else:
nq_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='Q: {question}?'),
dict(role='BOT', prompt='A: {answer}.\n'),
]
),
),
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin="</E>",
round=[
dict(role='HUMAN', prompt='Q: {question}?'),
dict(role='BOT', prompt='A:'),
]
),
ice_token="</E>",
),
retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))),
inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]),
)
nq_eval_cfg = dict(evaluator=dict(type=NQEvaluator), pred_role="BOT")
nq_datasets.append(
dict(
type=NQOpenDataset,
abbr=f'nq_open_{k}shot',
path='./data/nq-open/',
reader_cfg=nq_reader_cfg,
infer_cfg=nq_infer_cfg,
eval_cfg=nq_eval_cfg)
)

View File

@ -0,0 +1,62 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import TriviaQADataset_V2, TriviaQAEvaluator
triviaqa_datasets = []
for k in [0, 1, 5, 25]:
triviaqa_reader_cfg = dict(
input_columns=['question'], output_column='answer', train_split='train', test_split='validation')
if k == 0:
triviaqa_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A:'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=50)
)
else:
triviaqa_infer_cfg = dict(
ice_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}.\n'),
]
),
),
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin="</E>",
round=[
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A:'),
]
),
ice_token="</E>",
),
retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))),
inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]),
)
triviaqa_eval_cfg = dict(evaluator=dict(type=TriviaQAEvaluator), pred_role="BOT")
triviaqa_datasets.append(
dict(
type=TriviaQADataset_V2,
abbr=f'triviaqa_wiki_{k}shot',
path='./data/triviaqa',
reader_cfg=triviaqa_reader_cfg,
infer_cfg=triviaqa_infer_cfg,
eval_cfg=triviaqa_eval_cfg)
)

View File

@ -0,0 +1,4 @@
from mmengine.config import read_base
with read_base():
from .winogrande_ll_c5cf57 import winogrande_datasets # noqa: F401, F403

View File

@ -1,6 +1,6 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import LoglikelihoodInferencer
from opencompass.openicl.icl_inferencer import LLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset
@ -18,7 +18,7 @@ winogrande_infer_cfg = dict(
}
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=LoglikelihoodInferencer))
inferencer=dict(type=LLInferencer))
winogrande_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

View File

@ -1,4 +0,0 @@
from mmengine.config import read_base
with read_base():
from .winogrande_ppl_8be6c3 import winogrande_datasets # noqa: F401, F403

View File

@ -6,7 +6,7 @@ from opencompass.datasets import winograndeDataset
# WARNING: This config cannot reproduce results in the paper.
# e.g. LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config)
# Please try winogrande_ppl_8be6c3
# Please try winogrande_ll_c5cf57
winogrande_reader_cfg = dict(
input_columns=['opt1', 'opt2'],

View File

@ -6,7 +6,7 @@ from opencompass.datasets import winograndeDataset
# WARNING: This config cannot reproduce results in the paper.
# e.g. LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config)
# Please try winogrande_ppl_8be6c3
# Please try winogrande_ll_c5cf57
winogrande_reader_cfg = dict(
input_columns=['opt1', 'opt2'],

18
configs/eval_hf_llama2.py Normal file
View File

@ -0,0 +1,18 @@
from mmengine.config import read_base
with read_base():
from .datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
from .datasets.triviaqa.triviaqa_wiki_gen_d18bf4 import triviaqa_datasets
from .datasets.nq.nq_open_gen_e93f8a import nq_datasets
from .datasets.gsm8k.gsm8k_gen_3309bd import gsm8k_datasets
from .datasets.humaneval.humaneval_gen_a82cae import humaneval_datasets
from .datasets.agieval.agieval_mixed_2f14ad import agieval_datasets
from .datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
from .datasets.hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets
from .datasets.obqa.obqa_ppl_6aac9e import obqa_datasets
from .datasets.winogrande.winogrande_ll_c5cf57 import winogrande_datasets
from .models.hf_llama.hf_llama2_7b import models
from .summarizers.example import summarizer
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
work_dir = './outputs/llama2/'
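The final `datasets = sum(...)` line gathers every imported `*_datasets` list into one flat list via `locals()`. A minimal, self-contained sketch of that gathering idiom (the variable names here are hypothetical, not from the config):
# Minimal sketch of the locals()-gathering idiom used above (hypothetical names).
foo_datasets = [dict(abbr='foo')]
bar_datasets = [dict(abbr='bar')]
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
print([d['abbr'] for d in datasets])  # ['foo', 'bar']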

View File

@ -0,0 +1,31 @@
from opencompass.models import HuggingFaceChatGLM3
api_meta_template = dict(
round=[
dict(role='HUMAN', api_role='HUMAN'),
dict(role='BOT', api_role='BOT', generate=True),
]
)
models = [
dict(
type=HuggingFaceChatGLM3,
abbr='chatglm3-6b-32k-hf',
path='THUDM/chatglm3-6b-32k',
tokenizer_path='THUDM/chatglm3-6b-32k',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=api_meta_template,
max_out_len=100,
max_seq_len=4096,
batch_size=1,
run_cfg=dict(num_gpus=1, num_procs=1)
)
]

View File

@ -0,0 +1,14 @@
from opencompass.models import VLLM
models = [
dict(
type=VLLM,
abbr='chatglm3-6b-32k-vllm',
path='THUDM/chatglm3-6b-32k',
max_out_len=100,
max_seq_len=4096,
batch_size=32,
generation_kwargs=dict(temperature=0),
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@ -28,5 +28,6 @@ models = [
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=4, num_procs=1),
end_str='<end▁of▁sentence>',
)
]

View File

@ -0,0 +1,31 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin=' [INST] ', end=' [/INST] '),
dict(role="BOT", begin='', end='', generate=True),
],
)
models = [
dict(
type=HuggingFaceCausalLM,
abbr='llama-2-13b-chat-hf',
path="meta-llama/Llama-2-13b-chat-hf",
tokenizer_path='meta-llama/Llama-2-13b-chat-hf',
model_kwargs=dict(
device_map='auto'
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=2, num_procs=1),
end_str='[INST]',
)
]
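For orientation, `_meta_template` wraps each dialogue turn with the begin/end strings declared above before the prompt reaches the model. A hand-rolled sketch of that rendering (illustrative only; the actual assembly is done by OpenCompass's prompt builder):
# Illustrative rendering of one HUMAN turn plus an empty BOT turn to be generated.
meta = {'HUMAN': (' [INST] ', ' [/INST] '), 'BOT': ('', '')}
dialogue = [('HUMAN', 'What is 2 + 2?'), ('BOT', '')]
prompt = ''.join(meta[role][0] + text + meta[role][1] for role, text in dialogue)
print(repr(prompt))  # ' [INST] What is 2 + 2? [/INST] '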

View File

@ -16,6 +16,6 @@ models = [
batch_size=8,
model_kwargs=dict(device_map='auto'),
batch_padding=False, # if False, run inference in a for-loop without batch padding
run_cfg=dict(num_gpus=8, num_procs=1),
run_cfg=dict(num_gpus=4, num_procs=1),
)
]

View File

@ -0,0 +1,31 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin=' [INST] ', end=' [/INST] '),
dict(role="BOT", begin='', end='', generate=True),
],
)
models = [
dict(
type=HuggingFaceCausalLM,
abbr='llama-2-70b-chat-hf',
path="meta-llama/Llama-2-70b-chat-hf",
tokenizer_path='meta-llama/Llama-2-70b-chat-hf',
model_kwargs=dict(
device_map='auto'
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=4, num_procs=1),
end_str='[INST]',
)
]

View File

@ -0,0 +1,31 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin=' [INST] ', end=' [/INST] '),
dict(role="BOT", begin='', end='', generate=True),
],
)
models = [
dict(
type=HuggingFaceCausalLM,
abbr='llama-2-7b-chat-hf',
path="meta-llama/Llama-2-7b-chat-hf",
tokenizer_path='meta-llama/Llama-2-7b-chat-hf',
model_kwargs=dict(
device_map='auto'
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
use_fast=False,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='[INST]',
)
]

View File

@ -16,6 +16,6 @@ models = [
batch_size=8,
model_kwargs=dict(device_map='auto'),
batch_padding=False, # if False, run inference in a for-loop without batch padding
run_cfg=dict(num_gpus=8, num_procs=1),
run_cfg=dict(num_gpus=4, num_procs=1),
)
]

View File

@ -0,0 +1,30 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'),
dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True),
],
)
models = [
dict(
type=HuggingFaceCausalLM,
abbr='lemur-70b-chat-v1',
path="OpenLemur/lemur-70b-chat-v1",
tokenizer_path='OpenLemur/lemur-70b-chat-v1',
# tokenizer_kwargs=dict(
# padding_side='left',
# truncation_side='left',
# trust_remote_code=True,
# use_fast=False,),
# pad_token_id=151643,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=_meta_template,
model_kwargs=dict(device_map='auto', trust_remote_code=True),
run_cfg=dict(num_gpus=4, num_procs=1),
)
]

View File

@ -30,5 +30,6 @@ models = [
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='</s>',
)
]

View File

@ -0,0 +1,26 @@
from opencompass.models import VLLM
_meta_template = dict(
begin="<s>",
round=[
dict(role="HUMAN", begin='[INST]', end='[/INST]'),
dict(role="BOT", begin="", end='</s>', generate=True),
],
eos_token_id=2
)
models = [
dict(
type=VLLM,
abbr='mistral-7b-instruct-v0.2-vllm',
path='mistralai/Mistral-7B-Instruct-v0.2',
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
end_str='</s>',
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@ -29,6 +29,7 @@ models = [
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
run_cfg=dict(num_gpus=2, num_procs=1),
end_str='</s>',
)
]

View File

@ -19,6 +19,6 @@ models = [
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
run_cfg=dict(num_gpus=2, num_procs=1),
)
]

View File

@ -0,0 +1,27 @@
from opencompass.models import VLLM
_meta_template = dict(
begin="<s>",
round=[
dict(role="HUMAN", begin='[INST]', end='[/INST]'),
dict(role="BOT", begin="", end='</s>', generate=True),
],
eos_token_id=2
)
models = [
dict(
type=VLLM,
abbr='mixtral-8x7b-instruct-v0.1-vllm',
path='mistralai/Mixtral-8x7B-Instruct-v0.1',
model_kwargs=dict(tensor_parallel_size=2),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
end_str='</s>',
run_cfg=dict(num_gpus=2, num_procs=1),
)
]

View File

@ -30,5 +30,6 @@ models = [
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=4, num_procs=1),
end_str='<|endoftext|>',
)
]

View File

@ -1,5 +1,6 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'),
@ -28,5 +29,6 @@ models = [
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=4, num_procs=1),
end_str='<|im_end|>',
)
]

View File

@ -0,0 +1,25 @@
from opencompass.models import VLLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'),
dict(role="BOT", begin="\n<|im_start|>assistant\n", end='<|im_end|>', generate=True),
],
)
models = [
dict(
type=VLLM,
abbr='qwen-72b-chat-vllm',
path="Qwen/Qwen-72B-Chat",
model_kwargs=dict(tensor_parallel_size=4),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
end_str='<|im_end|>',
run_cfg=dict(num_gpus=4, num_procs=1),
)
]

View File

@ -1,5 +1,11 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='USER: '),
dict(role="BOT", begin=" ASSISTANT:", end='</s>', generate=True),
],
)
models = [
dict(
@ -12,12 +18,13 @@ models = [
truncation_side='left',
use_fast=False,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=8192,
batch_size=8,
model_kwargs=dict(device_map='auto'),
batch_padding=False, # if False, run inference in a for-loop without batch padding
use_fastchat_template=True,
run_cfg=dict(num_gpus=2, num_procs=1)
run_cfg=dict(num_gpus=2, num_procs=1),
end_str='</s>',
)
]

View File

@ -1,5 +1,11 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='USER: '),
dict(role="BOT", begin=" ASSISTANT:", end='</s>', generate=True),
],
)
models = [
dict(
@ -12,12 +18,13 @@ models = [
truncation_side='left',
use_fast=False,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=8192,
batch_size=8,
model_kwargs=dict(device_map='auto'),
batch_padding=False, # if False, run inference in a for-loop without batch padding
use_fastchat_template=True,
run_cfg=dict(num_gpus=1, num_procs=1)
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='</s>',
)
]

View File

@ -0,0 +1,23 @@
from opencompass.models import VLLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='USER: '),
dict(role="BOT", begin=" ASSISTANT:", end='</s>', generate=True),
],
)
models = [
dict(
type=VLLM,
abbr='vicuna-13b-v1.5-16k-vllm',
path="lmsys/vicuna-13b-v1.5-16k",
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
end_str='</s>',
run_cfg=dict(num_gpus=2, num_procs=1),
)
]

View File

@ -0,0 +1,23 @@
from opencompass.models import VLLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='USER: '),
dict(role="BOT", begin=" ASSISTANT:", end='</s>', generate=True),
],
)
models = [
dict(
type=VLLM,
abbr='vicuna-7b-v1.5-16k-vllm',
path="lmsys/vicuna-7b-v1.5-16k",
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
end_str='</s>',
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@ -0,0 +1,33 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='USER: ', end=' '),
dict(role="BOT", begin="ASSISTANT: ", end='</s>', generate=True),
],
)
models = [
dict(
type=HuggingFaceCausalLM,
abbr='wizardlm-13b-v1.2-hf',
path='WizardLM/WizardLM-13B-V1.2',
tokenizer_path='WizardLM/WizardLM-13B-V1.2',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=2, num_procs=1),
end_str='</s>',
)
]

View File

@ -0,0 +1,33 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='USER: ', end=' '),
dict(role="BOT", begin="ASSISTANT: ", end='</s>', generate=True),
],
)
models = [
dict(
type=HuggingFaceCausalLM,
abbr='wizardlm-70b-v1.0-hf',
path='WizardLM/WizardLM-70B-V1.0',
tokenizer_path='WizardLM/WizardLM-70B-V1.0',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=4, num_procs=1),
end_str='</s>',
)
]

View File

@ -0,0 +1,33 @@
from opencompass.models import HuggingFaceCausalLM
_meta_template = dict(
round=[
dict(role="HUMAN", end='\n\n'),
dict(role="BOT", begin="### Response:", end='</s>', generate=True),
],
)
models = [
dict(
type=HuggingFaceCausalLM,
abbr='wizardlm-7b-v1.0-hf',
path='WizardLM/WizardLM-7B-V1.0',
tokenizer_path='WizardLM/WizardLM-7B-V1.0',
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='</s>',
)
]

View File

@ -0,0 +1,24 @@
from opencompass.models import VLLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='USER: ', end=' '),
dict(role="BOT", begin="ASSISTANT: ", end='</s>', generate=True),
],
)
models = [
dict(
type=VLLM,
abbr='wizardlm-13b-v1.2-vllm',
path='WizardLM/WizardLM-13B-V1.2',
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
end_str='</s>',
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@ -0,0 +1,25 @@
from opencompass.models import VLLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='USER: ', end=' '),
dict(role="BOT", begin="ASSISTANT: ", end='</s>', generate=True),
],
)
models = [
dict(
type=VLLM,
abbr='wizardlm-70b-v1.0-vllm',
path='WizardLM/WizardLM-70B-V1.0',
model_kwargs=dict(tensor_parallel_size=4),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
end_str='</s>',
run_cfg=dict(num_gpus=4, num_procs=1),
)
]

View File

@ -0,0 +1,24 @@
from opencompass.models import VLLM
_meta_template = dict(
round=[
dict(role="HUMAN", end='\n\n'),
dict(role="BOT", begin="### Response:", end='</s>', generate=True),
],
)
models = [
dict(
type=VLLM,
abbr='wizardlm-7b-v1.0-vllm',
path='WizardLM/WizardLM-7B-V1.0',
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
end_str='</s>',
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@ -1,12 +1,16 @@
from opencompass.models import HuggingFaceCausalLM
from opencompass.models import HuggingFace
models = [
dict(
type=HuggingFaceCausalLM,
abbr='wizardlm-7b-hf',
path='TheBloke/wizardLM-7B-HF',
tokenizer_path='TheBloke/wizardLM-7B-HF',
type=HuggingFace,
abbr='yi-34b-200k-hf',
path='01-ai/Yi-34B-200K',
tokenizer_path='01-ai/Yi-34B-200K',
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
@ -15,10 +19,6 @@ models = [
max_out_len=100,
max_seq_len=2048,
batch_size=8,
model_kwargs=dict(
device_map='auto',
trust_remote_code=True,
),
run_cfg=dict(num_gpus=1, num_procs=1),
run_cfg=dict(num_gpus=4, num_procs=1),
)
]

View File

@ -0,0 +1,32 @@
from opencompass.models import HuggingFace
_meta_template = dict(
round=[
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
],
)
models = [
dict(
type=HuggingFace,
abbr='yi-34b-chat-hf',
path='01-ai/Yi-34B-Chat',
tokenizer_path='01-ai/Yi-34B-Chat',
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=4, num_procs=1),
end_str='<|im_end|>',
)
]

View File

@ -0,0 +1,33 @@
from opencompass.models import HuggingFace
_meta_template = dict(
round=[
dict(role="HUMAN", end='\n\n'),
dict(role="BOT", begin="### Response:", end='</s>', generate=True),
],
)
models = [
dict(
type=HuggingFace,
abbr='yi-6b-200k-hf',
path='01-ai/Yi-6B-200K',
tokenizer_path='01-ai/Yi-6B-200K',
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='</s>',
)
]

View File

@ -0,0 +1,32 @@
from opencompass.models import HuggingFace
_meta_template = dict(
round=[
dict(role="HUMAN", begin='<|im_start|>user\n', end='<|im_end|>\n'),
dict(role="BOT", begin="<|im_start|>assistant\n", end='<|im_end|>\n', generate=True),
],
)
models = [
dict(
type=HuggingFace,
abbr='yi-6b-chat-hf',
path='01-ai/Yi-6B-Chat',
tokenizer_path='01-ai/Yi-6B-Chat',
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>',
)
]

View File

@ -0,0 +1,32 @@
from opencompass.models import HuggingFace
_meta_template = dict(
round=[
dict(role="HUMAN", begin='<|user|>\n', end='</s>'),
dict(role="BOT", begin="<|assistant|>\n", end='</s>', generate=True),
],
)
models = [
dict(
type=HuggingFace,
abbr='zephyr-7b-beta-hf',
path='HuggingFaceH4/zephyr-7b-beta',
tokenizer_path='HuggingFaceH4/zephyr-7b-beta',
model_kwargs=dict(
trust_remote_code=True,
device_map='auto',
),
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='</s>',
)
]

View File

@ -0,0 +1,23 @@
from opencompass.models import VLLM
_meta_template = dict(
round=[
dict(role="HUMAN", begin='<|user|>\n', end='</s>'),
dict(role="BOT", begin="<|assistant|>\n", end='</s>', generate=True),
],
)
models = [
dict(
type=VLLM,
abbr='zephyr-7b-beta-vllm',
path='HuggingFaceH4/zephyr-7b-beta',
meta_template=_meta_template,
max_out_len=100,
max_seq_len=2048,
batch_size=32,
generation_kwargs=dict(temperature=0),
end_str='</s>',
run_cfg=dict(num_gpus=1, num_procs=1),
)
]

View File

@ -0,0 +1,61 @@
from mmengine.config import read_base
with read_base():
from .groups.cibench import cibench_summary_groups
from .groups.plugineval import plugineval_summary_groups
agent_summary_groups = [
dict(name='math_acc_1_and_fill_in_blank-native', subsets=[['compassbench_v1_math-high-single_choice_cn-native', 'acc_1'], ['compassbench_v1_math-high-single_choice_en-native', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn-native', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en-native', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-native', 'accuracy']]),
dict(name='math_perf_4_and_fill_in_blank-native', subsets=[['compassbench_v1_math-high-single_choice_cn-native', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-native', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-native', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-native', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-native', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-native', 'accuracy']]),
dict(name='math_acc_1_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'acc_1'], ['compassbench_v1_math-high-single_choice_en-agent', 'acc_1'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'acc_1'], ['compassbench_v1_math-middle-single_choice_en-agent', 'acc_1'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]),
dict(name='math_perf_4_and_fill_in_blank-agent', subsets=[['compassbench_v1_math-high-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-high-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_cn-agent', 'perf_4'], ['compassbench_v1_math-middle-single_choice_en-agent', 'perf_4'], ['compassbench_v1_math-primary-cloze_cn-agent', 'accuracy'], ['compassbench_v1_math-primary-cloze_en-agent', 'accuracy']]),
dict(
name='agent',
subsets=['math_perf_4_and_fill_in_blank-agent', 'cibench_template_wo_nltk:executable', 'cibench_template_wo_nltk:numeric_correct', 'cibench_template_wo_nltk:vis_sim', 'cibench_template_cn_wo_nltk:executable', 'cibench_template_cn_wo_nltk:numeric_correct', 'cibench_template_cn_wo_nltk:vis_sim', 'plugin_eval-p10'],
weights={'math_perf_4_and_fill_in_blank-agent': 1, 'cibench_template_wo_nltk:executable': 0.5, 'cibench_template_wo_nltk:numeric_correct': 0.25, 'cibench_template_wo_nltk:vis_sim': 0.25, 'cibench_template_cn_wo_nltk:executable': 0.5, 'cibench_template_cn_wo_nltk:numeric_correct': 0.25, 'cibench_template_cn_wo_nltk:vis_sim': 0.25, 'plugin_eval-p10': 1}
)
]
summarizer = dict(
dataset_abbrs=[
'agent',
'math_acc_1_and_fill_in_blank-native',
'math_perf_4_and_fill_in_blank-native',
# '######## MathBench-Agent Accuracy ########', # category
'math_acc_1_and_fill_in_blank-agent',
'math_perf_4_and_fill_in_blank-agent',
# '######## CIBench Template ########', # category
'cibench_template:executable',
'cibench_template:numeric_correct',
'cibench_template:text_score',
'cibench_template:vis_sim',
# '######## CIBench Template Chinese ########', # category
'cibench_template_cn:executable',
'cibench_template_cn:numeric_correct',
'cibench_template_cn:text_score',
'cibench_template_cn:vis_sim',
# '######## CIBench Template w/o NLTK ########', # category; no text score because it is only for nltk
'cibench_template_wo_nltk:executable',
'cibench_template_wo_nltk:numeric_correct',
'cibench_template_wo_nltk:vis_sim',
# '######## CIBench Template Chinese w/o NLTK ########', # category
'cibench_template_cn_wo_nltk:executable',
'cibench_template_cn_wo_nltk:numeric_correct',
'cibench_template_cn_wo_nltk:vis_sim',
# '######## T-Eval ########', # category
['plugin_eval-p10', 'naive_average'],
['plugin_eval-p10-instruct_v1', 'format_metric'],
['plugin_eval-p10-instruct_v1', 'args_em_metric'],
['plugin_eval-p10-plan_str_v1', 'f1_score'],
['plugin_eval-p10-plan_json_v1', 'f1_score'],
['plugin_eval-p10-reason_str_v2', 'thought'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'],
['plugin_eval-p10-retrieve_str_v2', 'name'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'],
['plugin_eval-p10-understand_str_v2', 'args'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'],
['plugin_eval-p10-review_str_v6', 'review_quality'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)

View File

@ -0,0 +1,33 @@
from mmengine.config import read_base
with read_base():
from .groups.cibench import cibench_summary_groups
summarizer = dict(
dataset_abbrs=[
'######## CIBench Generation ########', # category
['cibench', 'executable'],
['cibench', 'general_correct'],
['cibench', 'vis_sim'],
'######## CIBench Template ########', # category
'cibench_template:executable',
'cibench_template:numeric_correct',
'cibench_template:text_score',
'cibench_template:vis_sim',
'######## CIBench Template Chinese ########', # category
'cibench_template_cn:executable',
'cibench_template_cn:numeric_correct',
'cibench_template_cn:text_score',
'cibench_template_cn:vis_sim',
'######## CIBench Template w/o NLTK ########', # category; no text score because it is only for nltk
'cibench_template_wo_nltk:executable',
'cibench_template_wo_nltk:numeric_correct',
'cibench_template_wo_nltk:vis_sim',
'######## CIBench Template Chinese w/o NLTK ########', # category
'cibench_template_cn_wo_nltk:executable',
'cibench_template_cn_wo_nltk:numeric_correct',
'cibench_template_cn_wo_nltk:vis_sim',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)

View File

@ -0,0 +1,51 @@
code_passk_summary_groups = [
# rename
{'name': 'humaneval_pass@1(greedy)', 'subsets': [['openai_humaneval', 'humaneval_pass@1']]},
{'name': 'humaneval_pass@10', 'subsets': [['openai_humaneval_passk', 'humaneval_pass@10']]},
{'name': 'humaneval_pass@10', 'subsets': [['openai_humaneval_repeat10', 'humaneval_pass@10']]},
{'name': 'humaneval_cn_pass@1(greedy)', 'subsets': [['openai_humaneval_cn', 'humaneval_pass@1']]},
{'name': 'humaneval_cn_pass@10', 'subsets': [['openai_humaneval_cn_passk', 'humaneval_pass@10']]},
{'name': 'humaneval_cn_pass@10', 'subsets': [['openai_humaneval_cn_repeat10', 'humaneval_pass@10']]},
{'name': 'humaneval_plus_pass@1(greedy)', 'subsets': [['humaneval_plus', 'humaneval_plus_pass@1']]},
{'name': 'humaneval_plus_pass@10', 'subsets': [['humaneval_plus_passk', 'humaneval_plus_pass@10']]},
{'name': 'humaneval_plus_pass@10', 'subsets': [['humaneval_plus_repeat10', 'humaneval_plus_pass@10']]},
{'name': 'mbpp_pass@1(greedy)', 'subsets': [['mbpp', 'score']]},
{'name': 'mbpp_pass@10', 'subsets': [['mbpp_passk', 'pass@10']]},
{'name': 'mbpp_pass@10', 'subsets': [['mbpp_repeat10', 'pass@10']]},
{'name': 'mbpp_cn_pass@1(greedy)', 'subsets': [['mbpp_cn', 'score']]},
{'name': 'mbpp_cn_pass@10', 'subsets': [['mbpp_cn_passk', 'pass@10']]},
{'name': 'mbpp_cn_pass@10', 'subsets': [['mbpp_cn_repeat10', 'pass@10']]},
{'name': 'sanitized_mbpp_pass@1(greedy)', 'subsets': [['sanitized_mbpp', 'score']]},
{'name': 'sanitized_mbpp_pass@10', 'subsets': [['sanitized_mbpp_passk', 'pass@10']]},
{'name': 'sanitized_mbpp_pass@10', 'subsets': [['sanitized_mbpp_repeat10', 'pass@10']]},
# real add
{'name': 'humanevalx', 'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-go', 'humanevalx-java', 'humanevalx-js']},
{'name': 'code', 'subsets': ['humaneval_plus_pass@1(greedy)', 'sanitized_mbpp_pass@1(greedy)', 'humaneval_cn_pass@1(greedy)', 'mbpp_cn_pass@1(greedy)', 'humanevalx']}
]
summarizer = dict(
dataset_abbrs=[
'code',
'humaneval_pass@1(greedy)',
'humaneval_pass@10',
'humaneval_cn_pass@1(greedy)',
'humaneval_cn_pass@10',
'humaneval_plus_pass@1(greedy)',
'humaneval_plus_pass@10',
'mbpp_pass@1(greedy)',
'mbpp_pass@10',
'mbpp_cn_pass@1(greedy)',
'mbpp_cn_pass@10',
'sanitized_mbpp_pass@1(greedy)',
'sanitized_mbpp_pass@10',
'humanevalx',
'humanevalx-python',
'humanevalx-cpp',
'humanevalx-go',
'humanevalx-java',
'humanevalx-js',
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)

View File

@ -0,0 +1,38 @@
# This summarizer is used for `./datasets/compassbench_v1_knowledge/compassbench_v1_knowledge_gen`
compassbench_v1_knowledge_names = [
'compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular',
'compassbench_v1_knowledge-engineering-single_choice_cn_circular',
'compassbench_v1_knowledge-humanity-single_choice_cn_circular',
'compassbench_v1_knowledge-natural_science-single_choice_cn_circular',
'compassbench_v1_knowledge-social_science-single_choice_cn_circular',
]
compassbench_v1_knowledge_groups = [
{'name': 'knowledge_cn', 'subsets': compassbench_v1_knowledge_names},
{'name': 'knowledge_acc_1_and_cloze', 'subsets': [['knowledge_cn', 'acc_1'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
{'name': 'knowledge_perf_4_and_cloze', 'subsets': [['knowledge_cn', 'perf_4'], ['compassbench_v1_knowledge-mixed-cloze_en', 'score']]},
]
summarizer = dict(
dataset_abbrs=[
'knowledge_acc_1_and_cloze',
['knowledge_cn', 'acc_1'],
['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'acc_1'],
['compassbench_v1_knowledge-engineering-single_choice_cn_circular', 'acc_1'],
['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'acc_1'],
['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'acc_1'],
['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'acc_1'],
'compassbench_v1_knowledge-mixed-cloze_en',
'knowledge_perf_4_and_cloze',
['knowledge_cn', 'perf_4'],
['compassbench_v1_knowledge-common_knowledge-single_choice_cn_circular', 'perf_4'],
['compassbench_v1_knowledge-engineering-single_choice_cn_circular', 'perf_4'],
['compassbench_v1_knowledge-humanity-single_choice_cn_circular', 'perf_4'],
['compassbench_v1_knowledge-natural_science-single_choice_cn_circular', 'perf_4'],
['compassbench_v1_knowledge-social_science-single_choice_cn_circular', 'perf_4'],
'compassbench_v1_knowledge-mixed-cloze_en',
],
summary_groups=compassbench_v1_knowledge_groups
)

View File

@ -0,0 +1,42 @@
# This summarizer is used for `./datasets/compassbench_v1_math/compassbench_v1_math_gen`
compassbench_v1_math_groups = [
{'name': 'math_acc_1_and_fill_in_blank', 'subsets': [
['compassbench_v1_math-high-single_choice_cn', 'acc_1'],
['compassbench_v1_math-high-single_choice_en', 'acc_1'],
['compassbench_v1_math-middle-single_choice_cn', 'acc_1'],
['compassbench_v1_math-middle-single_choice_en', 'acc_1'],
['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
['compassbench_v1_math-primary-cloze_en', 'accuracy'],
]},
{'name': 'math_perf_4_and_fill_in_blank', 'subsets': [
['compassbench_v1_math-high-single_choice_cn', 'perf_4'],
['compassbench_v1_math-high-single_choice_en', 'perf_4'],
['compassbench_v1_math-middle-single_choice_cn', 'perf_4'],
['compassbench_v1_math-middle-single_choice_en', 'perf_4'],
['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
['compassbench_v1_math-primary-cloze_en', 'accuracy'],
]},
]
summarizer = dict(
dataset_abbrs=[
'math_acc_1_and_fill_in_blank',
['compassbench_v1_math-high-single_choice_cn', 'acc_1'],
['compassbench_v1_math-high-single_choice_en', 'acc_1'],
['compassbench_v1_math-middle-single_choice_cn', 'acc_1'],
['compassbench_v1_math-middle-single_choice_en', 'acc_1'],
['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
['compassbench_v1_math-primary-cloze_en', 'accuracy'],
'math_perf_4_and_fill_in_blank',
['compassbench_v1_math-high-single_choice_cn', 'perf_4'],
['compassbench_v1_math-high-single_choice_en', 'perf_4'],
['compassbench_v1_math-middle-single_choice_cn', 'perf_4'],
['compassbench_v1_math-middle-single_choice_en', 'perf_4'],
['compassbench_v1_math-primary-cloze_cn', 'accuracy'],
['compassbench_v1_math-primary-cloze_en', 'accuracy'],
],
summary_groups=compassbench_v1_math_groups,
)

View File

@ -0,0 +1,72 @@
compassbench_v1_language_names = [
# ['information_retrieval_en', 'score'],
# ['information_retrieval_zh', 'score'],
['intention_recognition_en_circular', 'acc_origin'],
['intention_recognition_en_circular', 'perf_circular'],
['intention_recognition_zh_circular', 'acc_origin'],
['intention_recognition_zh_circular', 'perf_circular'],
['sentiment_analysis_en_circular', 'acc_origin'],
['sentiment_analysis_en_circular', 'perf_circular'],
['sentiment_analysis_zh_circular', 'acc_origin'],
['sentiment_analysis_zh_circular', 'perf_circular'],
['translation', 'score'],
['content_critic_en_circular', 'acc_origin'],
['content_critic_en_circular', 'perf_circular'],
['content_critic_zh_circular', 'acc_origin'],
['content_critic_zh_circular', 'perf_circular'],
['content_summarization_en', 'rouge1'],
['content_summarization_zh', 'rouge1'],
['traditional_cultural_understanding_zh_circular', 'acc_origin'],
['traditional_cultural_understanding_zh_circular', 'perf_circular'],
['chinese_semantic_understanding_zh_circular', 'acc_origin'],
['chinese_semantic_understanding_zh_circular', 'perf_circular'],
]
compassbench_v1_language_groups = [
{'name': 'language_zh_acc_1_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'perf_circular']},
{'name': 'language_en_acc_1_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'perf_circular']},
{'name': 'language_acc_1_and_non_mcq', 'subsets': ['language_zh_acc_1_and_non_mcq', 'language_en_acc_1_and_non_mcq']},
{'name': 'language_zh_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_zh' in name and metric != 'acc_origin']},
{'name': 'language_en_perf_4_and_non_mcq', 'subsets': [[name, metric] for name, metric in compassbench_v1_language_names if '_en' in name and metric != 'acc_origin']},
{'name': 'language_perf_4_and_non_mcq', 'subsets': ['language_zh_perf_4_and_non_mcq', 'language_en_perf_4_and_non_mcq']},
]
summarizer = dict(
dataset_abbrs=[
'language_acc_1_and_non_mcq',
'language_en_acc_1_and_non_mcq',
'language_zh_acc_1_and_non_mcq',
['information_retrieval_en', 'score'],
['information_retrieval_zh', 'score'],
['intention_recognition_en_circular', 'acc_origin'],
['intention_recognition_zh_circular', 'acc_origin'],
['sentiment_analysis_en_circular', 'acc_origin'],
['sentiment_analysis_zh_circular', 'acc_origin'],
['translation', 'score'],
['content_critic_en_circular', 'acc_origin'],
['content_critic_zh_circular', 'acc_origin'],
['content_summarization_en', 'rouge1'],
['content_summarization_zh', 'rouge1'],
['traditional_cultural_understanding_zh_circular', 'acc_origin'],
['chinese_semantic_understanding_zh_circular', 'acc_origin'],
'language_perf_4_and_non_mcq',
'language_en_perf_4_and_non_mcq',
'language_zh_perf_4_and_non_mcq',
['information_retrieval_en', 'score'],
['information_retrieval_zh', 'score'],
['intention_recognition_en_circular', 'perf_circular'],
['intention_recognition_zh_circular', 'perf_circular'],
['sentiment_analysis_en_circular', 'perf_circular'],
['sentiment_analysis_zh_circular', 'perf_circular'],
['translation', 'score'],
['content_critic_en_circular', 'perf_circular'],
['content_critic_zh_circular', 'perf_circular'],
['content_summarization_en', 'rouge1'],
['content_summarization_zh', 'rouge1'],
['traditional_cultural_understanding_zh_circular', 'perf_circular'],
['chinese_semantic_understanding_zh_circular', 'perf_circular'],
],
summary_groups=compassbench_v1_language_groups,
)

View File

@ -0,0 +1,44 @@
compassbench_v1_reason_groups = [
{'name': 'reasonbench_cn_logic_circular', 'subsets': ['reasonbench_cn_abductive_alphanlg_translated_circular', 'reasonbench_cn_deductive_bbh3obj_translated_circular', 'reasonbench_cn_deductive_logiqa_zh_circular', 'reasonbench_cn_inductive_deer_translated_circular', 'reasonbench_cn_inductive_selfgenerated_circular']},
{'name': 'reasonbench_en_logic_circular', 'subsets': ['reasonbench_en_abductive_alphanlg_circular', 'reasonbench_en_deductive_bbh7obj_circular', 'reasonbench_en_deductive_logiqa_zh_translated_circular', 'reasonbench_en_deductive_ocnli_translated_circular', 'reasonbench_en_inductive_deer_circular', 'reasonbench_en_inductive_selfgenerated_circular']},
{'name': 'reasonbench', 'subsets': ['reasonbench_cn_commonsense_circular', 'reasonbench_cn_logic_circular', 'reasonbench_en_commonsense_circular', 'reasonbench_en_logic_circular']},
]
summarizer = dict(
dataset_abbrs=[
['reasonbench', 'acc_origin'],
['reasonbench_cn_commonsense_circular', 'acc_origin'],
['reasonbench_en_commonsense_circular', 'acc_origin'],
['reasonbench_cn_logic_circular', 'acc_origin'],
['reasonbench_en_logic_circular', 'acc_origin'],
['reasonbench_cn_abductive_alphanlg_translated_circular', 'acc_origin'],
['reasonbench_cn_deductive_bbh3obj_translated_circular', 'acc_origin'],
['reasonbench_cn_deductive_logiqa_zh_circular', 'acc_origin'],
['reasonbench_cn_inductive_deer_translated_circular', 'acc_origin'],
['reasonbench_cn_inductive_selfgenerated_circular', 'acc_origin'],
['reasonbench_en_abductive_alphanlg_circular', 'acc_origin'],
['reasonbench_en_deductive_bbh7obj_circular', 'acc_origin'],
['reasonbench_en_deductive_logiqa_zh_translated_circular', 'acc_origin'],
['reasonbench_en_deductive_ocnli_translated_circular', 'acc_origin'],
['reasonbench_en_inductive_deer_circular', 'acc_origin'],
['reasonbench_en_inductive_selfgenerated_circular', 'acc_origin'],
['reasonbench', 'perf_circular'],
['reasonbench_cn_commonsense_circular', 'perf_circular'],
['reasonbench_en_commonsense_circular', 'perf_circular'],
['reasonbench_cn_logic_circular', 'perf_circular'],
['reasonbench_en_logic_circular', 'perf_circular'],
['reasonbench_cn_abductive_alphanlg_translated_circular', 'perf_circular'],
['reasonbench_cn_deductive_bbh3obj_translated_circular', 'perf_circular'],
['reasonbench_cn_deductive_logiqa_zh_circular', 'perf_circular'],
['reasonbench_cn_inductive_deer_translated_circular', 'perf_circular'],
['reasonbench_cn_inductive_selfgenerated_circular', 'perf_circular'],
['reasonbench_en_abductive_alphanlg_circular', 'perf_circular'],
['reasonbench_en_deductive_bbh7obj_circular', 'perf_circular'],
['reasonbench_en_deductive_logiqa_zh_translated_circular', 'perf_circular'],
['reasonbench_en_deductive_ocnli_translated_circular', 'perf_circular'],
['reasonbench_en_inductive_deer_circular', 'perf_circular'],
['reasonbench_en_inductive_selfgenerated_circular', 'perf_circular'],
],
summary_groups=compassbench_v1_reason_groups,
)

View File

@ -1,4 +1,109 @@
_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
_cibench = ['cibench_generation_' + i for i in _cibench]
cibench_summary_groups = [{'name': 'cibench_generation', 'subsets': _cibench}]
_cibench = ['cibench_' + i for i in _cibench]
cibench_summary_groups = [{'name': 'cibench', 'subsets': _cibench}]
_cibench_template = ['lightgbm', 'matplotlib', 'nltk', 'opencv', 'pandas', 'pytorch',
'scipy', 'seaborn', 'sklearn', 'tensorflow']
_cibench_template = ['cibench_template/' + i for i in _cibench_template]
# number of questions per metric in each module: [executable, numeric_correct, text_score, vis_sim]
_cibench_template_weight = {
'lightgbm': [30, 15, 0, 0],
'matplotlib': [42, 0, 0, 36],
'nltk': [70, 30, 20, 10],
'opencv': [60, 10, 0, 40],
'pandas': [60, 40, 0, 10],
'pytorch': [28, 0, 0, 0],
'scipy': [60, 40, 0, 0],
'seaborn': [42, 0, 0, 35],
'sklearn': [42, 6, 0, 18],
'tensorflow': [36, 6, 0, 12],
}
cibench_summary_groups.extend([
{
'name': 'cibench_template:executable',
'subsets': [[i, 'executable'] for i in _cibench_template],
'weights': {'cibench_template/' + k : v[0] for k,v in _cibench_template_weight.items()},
},
{
'name': 'cibench_template:numeric_correct',
'subsets': [[i, 'numeric_correct'] for i in _cibench_template],
'weights': {'cibench_template/' + k : v[1] for k,v in _cibench_template_weight.items()},
},
{
'name': 'cibench_template:text_score',
'subsets': [[i, 'text_score'] for i in _cibench_template],
'weights': {'cibench_template/' + k : v[2] for k,v in _cibench_template_weight.items()},
},
{
'name': 'cibench_template:vis_sim',
'subsets': [[i, 'vis_sim'] for i in _cibench_template],
'weights': {'cibench_template/' + k : v[3] for k,v in _cibench_template_weight.items()},
},
])
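The `weights` mapping above ties each `cibench_template/<lib>` subset to its question count per metric, so the grouped score is a weighted rather than plain mean. A small, illustrative sketch of that weighted aggregation (the scores are made up; the real computation happens inside the summarizer):
# Illustrative weighted mean over two subsets (made-up scores).
scores = {'cibench_template/lightgbm': 80.0, 'cibench_template/pandas': 65.0}
weights = {'cibench_template/lightgbm': 30, 'cibench_template/pandas': 60}
weighted_avg = sum(scores[k] * weights[k] for k in scores) / sum(weights[k] for k in scores)
print(round(weighted_avg, 2))  # 70.0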
## chinese
_cibench_template_cn = ['lightgbm', 'matplotlib', 'nltk', 'opencv', 'pandas', 'pytorch',
'scipy', 'seaborn', 'sklearn', 'tensorflow']
_cibench_template_cn = ['cibench_template_chinese/' + i for i in _cibench_template_cn]
cibench_summary_groups.extend([
{
'name': 'cibench_template_cn:executable',
'subsets': [[i, 'executable'] for i in _cibench_template_cn],
'weights': {'cibench_template_chinese/' + k : v[0] for k,v in _cibench_template_weight.items()},
},
{
'name': 'cibench_template_cn:numeric_correct',
'subsets': [[i, 'numeric_correct'] for i in _cibench_template_cn],
'weights': {'cibench_template_chinese/' + k : v[1] for k,v in _cibench_template_weight.items()},
},
{
'name': 'cibench_template_cn:text_score',
'subsets': [[i, 'text_score'] for i in _cibench_template_cn],
'weights': {'cibench_template_chinese/' + k : v[2] for k,v in _cibench_template_weight.items()},
},
{
'name': 'cibench_template_cn:vis_sim',
'subsets': [[i, 'vis_sim'] for i in _cibench_template_cn],
'weights': {'cibench_template_chinese/' + k : v[3] for k,v in _cibench_template_weight.items()},
},
])
## add more without nltk
cibench_summary_groups.extend([
{
'name': 'cibench_template_wo_nltk:executable',
'subsets': [[i, 'executable'] for i in _cibench_template if 'nltk' not in i],
'weights': {'cibench_template/' + k : v[0] for k,v in _cibench_template_weight.items() if 'nltk' not in k},
},
{
'name': 'cibench_template_wo_nltk:numeric_correct',
'subsets': [[i, 'numeric_correct'] for i in _cibench_template if 'nltk' not in i],
'weights': {'cibench_template/' + k : v[1] for k,v in _cibench_template_weight.items() if 'nltk' not in k},
},
{
'name': 'cibench_template_wo_nltk:vis_sim',
'subsets': [[i, 'vis_sim'] for i in _cibench_template if 'nltk' not in i],
'weights': {'cibench_template/' + k : v[3] for k,v in _cibench_template_weight.items() if 'nltk' not in k},
},
])
cibench_summary_groups.extend([
{
'name': 'cibench_template_cn_wo_nltk:executable',
'subsets': [[i, 'executable'] for i in _cibench_template_cn if 'nltk' not in i],
'weights': {'cibench_template_chinese/' + k : v[0] for k,v in _cibench_template_weight.items() if 'nltk' not in k},
},
{
'name': 'cibench_template_cn_wo_nltk:numeric_correct',
'subsets': [[i, 'numeric_correct'] for i in _cibench_template_cn if 'nltk' not in i],
'weights': {'cibench_template_chinese/' + k : v[1] for k,v in _cibench_template_weight.items() if 'nltk' not in k},
},
{
'name': 'cibench_template_cn_wo_nltk:vis_sim',
'subsets': [[i, 'vis_sim'] for i in _cibench_template_cn if 'nltk' not in i],
'weights': {'cibench_template_chinese/' + k : v[3] for k,v in _cibench_template_weight.items() if 'nltk' not in k},
},
])

View File

@ -31,4 +31,38 @@ plugineval_summary_groups = [
['plugin_eval-review_str_v6', 'review_quality'],
]
},
# special treatment for the first 10% of data points
{
'name': 'plugin_eval-p10-instruct_v1',
'metric': 'format_metric',
'subsets': [
['plugin_eval-p10-instruct_v1', 'string_format_metric'],
['plugin_eval-p10-instruct_v1', 'json_format_metric'],
]
},
{
'name': 'plugin_eval-p10-instruct_v1',
'metric': 'args_em_metric',
'subsets': [
['plugin_eval-p10-instruct_v1', 'string_args_em_metric'],
['plugin_eval-p10-instruct_v1', 'json_args_em_metric'],
]
},
{
'name': 'plugin_eval-p10',
'subsets': [
['plugin_eval-p10-instruct_v1', 'format_metric'],
['plugin_eval-p10-instruct_v1', 'args_em_metric'],
['plugin_eval-p10-plan_str_v1', 'f1_score'],
['plugin_eval-p10-plan_json_v1', 'f1_score'],
['plugin_eval-p10-reason_str_v2', 'thought'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'thought'],
['plugin_eval-p10-retrieve_str_v2', 'name'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'name'],
['plugin_eval-p10-understand_str_v2', 'args'],
['plugin_eval-p10-reason_retrieve_understand_json_v2', 'args'],
['plugin_eval-p10-review_str_v6', 'review_quality'],
]
},
]

View File

@ -0,0 +1,61 @@
_longeval_2k = ['classification_en_2k', 'lines_2k', 'qa_en_2k', 'qa_zh_2k', 'stackselect_2k', 'summarization_en_2k', 'textsort_2k']
_longeval_4k = ['classification_en_4k', 'lines_4k', 'qa_en_4k', 'qa_zh_4k', 'stackselect_4k', 'summarization_en_4k', 'textsort_4k']
_longeval_8k = ['classification_en_8k', 'lines_8k', 'qa_en_8k', 'qa_zh_8k', 'stackselect_8k', 'summarization_en_8k', 'textsort_8k']
_longeval_15k = ['classification_en_15k', 'lines_15k', 'qa_en_15k', 'qa_zh_15k', 'stackselect_15k', 'summarization_en_15k', 'textsort_15k']
_longeval_30k = ['classification_en_30k', 'lines_30k', 'qa_en_30k', 'qa_zh_30k', 'stackselect_30k', 'summarization_en_30k', 'textsort_30k']
longeval_summary_groups = [
{'name': 'longeval_v2_2k', 'subsets': _longeval_2k},
{'name': 'longeval_v2_4k', 'subsets': _longeval_4k},
{'name': 'longeval_v2_8k', 'subsets': _longeval_8k},
{'name': 'longeval_v2_15k', 'subsets': _longeval_15k},
{'name': 'longeval_v2_30k', 'subsets': _longeval_30k},
{'name': 'longeval_v2', 'subsets': _longeval_2k + _longeval_4k + _longeval_8k + _longeval_15k + _longeval_30k}
]
summarizer = dict(
dataset_abbrs = [
'longeval_v2',
'longeval_v2_2k',
'longeval_v2_4k',
'longeval_v2_8k',
'longeval_v2_15k',
'longeval_v2_30k',
'classification_en_2k',
'classification_en_4k',
'classification_en_8k',
'classification_en_15k',
'classification_en_30k',
'lines_2k',
'lines_4k',
'lines_8k',
'lines_15k',
'lines_30k',
'qa_en_2k',
'qa_en_4k',
'qa_en_8k',
'qa_en_15k',
'qa_en_30k',
'qa_zh_2k',
'qa_zh_4k',
'qa_zh_8k',
'qa_zh_15k',
'qa_zh_30k',
'stackselect_2k',
'stackselect_4k',
'stackselect_8k',
'stackselect_15k',
'stackselect_30k',
'summarization_en_2k',
'summarization_en_4k',
'summarization_en_8k',
'summarization_en_15k',
'summarization_en_30k',
'textsort_2k',
'textsort_4k',
'textsort_8k',
'textsort_15k',
'textsort_30k',
],
summary_groups=longeval_summary_groups,
)

View File

@ -46,9 +46,11 @@ from .hellaswag import * # noqa: F401, F403
from .huggingface import * # noqa: F401, F403
from .humaneval import * # noqa: F401, F403
from .humanevalx import * # noqa: F401, F403
from .hungarian_math import * # noqa: F401, F403
from .infinitebench import * # noqa: F401, F403
from .iwslt2017 import * # noqa: F401, F403
from .jigsawmultilingual import * # noqa: F401, F403
from .jsonl import JsonlDataset # noqa: F401, F403
from .kaoshi import KaoshiDataset, KaoshiEvaluator # noqa: F401, F403
from .lambada import * # noqa: F401, F403
from .lawbench import * # noqa: F401, F403
@ -57,6 +59,7 @@ from .leval import * # noqa: F401, F403
from .longbench import * # noqa: F401, F403
from .mastermath2024v1 import * # noqa: F401, F403
from .math import * # noqa: F401, F403
from .math401 import * # noqa: F401, F403
from .mathbench import * # noqa: F401, F403
from .mbpp import * # noqa: F401, F403
from .medbench import * # noqa: F401, F403

View File

@ -69,13 +69,105 @@ def load_experiment(file: str) -> dict:
)
def load_experiment_template(file: str) -> dict:
"""Load single experiment file with solutions for template experiment."""
with open(file, 'r') as f:
notebook = json.load(f)
example = notebook['cells']
metadata = notebook['metadata']
modules = metadata.get('modules', [])
if modules:
# these two annotation lists should have the same length
assert len(modules) == len(metadata.get('step_types'))
# reformat annotations
modules = [[_m.strip() for _m in _modules.split('&')]
for _modules in modules]
questions = []
source_codes = []
outputs = []
tags = []
for cell in example:
if cell['cell_type'] == 'markdown':
text = ''.join(cell['source']).strip()
if modules:
_modules = modules.pop(0)
if 'chinese' not in file:
text += f"Please use {' and '.join(_modules)} modules."
else:
text += f"请用 {''.join(_modules)} 模块."
text = text.strip() + '\n'
# append the formatted text
questions.append(text)
elif cell['cell_type'] == 'code':
source_codes.append(''.join(cell['source']))
output_flag = False
if cell['outputs']:
for _output in cell['outputs']:
if _output['output_type'] == 'display_data':
assert not output_flag
output_flag = True
tags.append('vis')
outputs.append(_output['data']['image/png'])
for _output in cell['outputs']:
if output_flag:
break
if _output['output_type'] == 'stream' and _output[
'name'] == 'stdout':
assert not output_flag
output_flag = True
tags.append('general')
outputs.append(''.join(_output['text']))
elif _output['output_type'] == 'execute_result':
assert not output_flag
output_flag = True
tags.append('general')
outputs.append(''.join(
_output['data']['text/plain']))
if not output_flag:
# no output fallback to exec
tags.append('exec')
outputs.append(None)
return dict(
experiment=file,
questions=sum(([
dict(role='user', content=question),
dict(role='assistant', content=source_code)
] for question, source_code in zip(questions, source_codes)), []),
references=dict(outputs=outputs,
tags=tags,
metadata=metadata,
experiment=file),
)
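A hedged usage sketch of `load_experiment_template` (the notebook path below is hypothetical): given one annotated notebook, it returns alternating user/assistant messages plus the per-step references built above.
# Hypothetical call; the notebook path is illustrative only.
record = load_experiment_template('./data/cibench_dataset/pandas/example.ipynb')
print(record['questions'][0]['role'])   # 'user'
print(record['references']['tags'])     # e.g. ['exec', 'general', 'vis', ...]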
def check_internet():
"""A tricky way to check internet."""
import socket
import nltk
socket.setdefaulttimeout(10)
ret = nltk.download('stopwords', quiet=True)
socket.setdefaulttimeout(None)
if not ret:
raise ConnectionError('CIBench needs internet to get response. Please '
'check your internet and proxy.')
@LOAD_DATASET.register_module()
class CIBenchDataset(BaseDataset):
"""Code Interpreter dataset."""
@staticmethod
def load(path: str):
"""Load whole dataset."""
def load(path: str, internet_check: bool = False):
"""Load whole dataset.
Args:
path(str): Path of cibench dataset.
internet_check(bool): Whether to check internet.
Defaults to False.
"""
if internet_check:
check_internet()
assert os.path.exists(path), f'Path {path} does not exist.'
data_list = []
for cwd, dirs, files in os.walk(path):
@ -83,11 +175,36 @@ class CIBenchDataset(BaseDataset):
files.sort()
for f in files:
if '.ipynb' in f:
try:
data = load_experiment(os.path.join(cwd, f))
except Exception:
print(f'Error with file {os.path.join(cwd, f)}')
continue
data = load_experiment(os.path.join(cwd, f))
data_list.append(data)
dataset = Dataset.from_list(data_list)
return dataset
@LOAD_DATASET.register_module()
class CIBenchTemplateDataset(BaseDataset):
"""Code Interpreter dataset for template dataset."""
@staticmethod
def load(path: str, internet_check: bool = False):
"""Load whole dataset.
Args:
path(str): Path of cibench dataset.
internet_check(bool): Whether to check internet.
Defaults to False.
"""
if internet_check:
check_internet()
assert os.path.exists(path), f'Path {path} does not exist.'
data_list = []
for cwd, dirs, files in os.walk(path):
dirs.sort()
files.sort()
for f in files:
if '.ipynb' in f:
data = load_experiment_template(os.path.join(cwd, f))
data_list.append(data)
dataset = Dataset.from_list(data_list)
@ -138,7 +255,8 @@ class CIBenchEvaluator(BaseEvaluator):
def check_user_data_dir(self, user_data_dir):
if user_data_dir == 'ENV':
user_data_dir = os.environ.get('USER_DATA_DIR', '')
default_path = osp.abspath('./data/cibench_dataset/datasources')
user_data_dir = os.environ.get('USER_DATA_DIR', default_path)
user_data_dir = user_data_dir.rstrip('/')
basename = osp.basename(user_data_dir)
if basename and basename != 'data':
@ -172,10 +290,11 @@ class CIBenchEvaluator(BaseEvaluator):
if action['result']:
try:
pred = action['result']['text']
match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
match = re.search('execute_result:\n\n```\n(.*?)\n```',
pred, re.DOTALL)
if match:
out = match.group(1)
return out == target or out in target
return out.strip() == target.strip()
except Exception:
return False
# Fall back to False
@ -313,23 +432,23 @@ class CIBenchEvaluator(BaseEvaluator):
# numeric_correct: numerical correct
# text_score: text score
# vis_sim: visual similarity
result = defaultdict(list)
for tag, step, output in zip(tags, steps, outputs):
# check whether this step is valid
result['executable'].append(self.valid_step(step))
if tag != 'exec':
key, func = self.TAG_MAPPING[tag]
result[key].append(func(step, output))
# add missing metric for better analyse if not exists
# create empty results
result = dict()
if hard_tags:
check_tags = ['exec', 'num', 'text', 'vis']
else:
check_tags = ['exec', 'general', 'vis']
for tag in check_tags:
key = self.TAG_MAPPING[tag][0]
if key not in result:
result[key] = []
result[key] = []
for tag, step, output in zip(tags, steps, outputs):
# check whether this step is valid
result['executable'].append(self.valid_step(step))
if tag != 'exec':
key, func = self.TAG_MAPPING[tag]
result[key].append(func(step, output))
return result

View File

@ -183,8 +183,13 @@ class CircularDatasetMeta(type):
def load(cls, circular_patterns='circular', *args, **kwargs):
circular_splits = getattr(cls, 'default_circular_splits', None)
option_keys = cls.default_option_keys
option_keys = getattr(cls, 'default_option_keys', None)
if 'option_keys' in kwargs:
option_keys = kwargs.pop('option_keys')
assert option_keys is not None, 'option_keys cannot be None'
answer_key = getattr(cls, 'default_answer_key', None)
if 'answer_key' in kwargs:
answer_key = kwargs.pop('answer_key')
answer_key_switch_method = getattr(
cls, 'default_answer_key_switch_method', None)
dataset = cls.dataset_class.load(*args, **kwargs)
@ -311,11 +316,11 @@ class CircularEvaluator(BaseEvaluator):
tmp_metrics.update({f'correct_{k}': 0 for k in circular_patterns})
tmp_metrics.update({f'count_{k}': 0 for k in circular_patterns})
# calculate the original accuracy
for pred, ref, origin_item in zip(predictions, references, test_set):
for pred, refr, origin_item in zip(predictions, references, test_set):
circular_pattern = origin_item['circular_pattern']
for k in circular_patterns:
if tuple(circular_pattern) in circular_patterns[k]:
tmp_metrics[f'correct_{k}'] += 1 if pred == ref else 0
tmp_metrics[f'correct_{k}'] += 1 if pred == refr else 0
tmp_metrics[f'count_{k}'] += 1
for k in circular_patterns:
@ -324,13 +329,13 @@ class CircularEvaluator(BaseEvaluator):
# calculate the circular accuracy
_details = {k: {} for k in circular_patterns}
for pred, ref, origin_item in zip(predictions, references, test_set):
for pred, refr, origin_item in zip(predictions, references, test_set):
index = origin_item['qid']
circular_pattern = origin_item['circular_pattern']
for k in circular_patterns:
if tuple(circular_pattern) in circular_patterns[k]:
_details[k].setdefault(
index, []).append(True if pred == ref else False)
index, []).append(True if pred == refr else False)
for k in _details:
_details[k] = {
index: sum(_details[k][index])

View File

@ -1,29 +1,175 @@
import copy
import csv
import json
import os
from typing import List
from datasets import Dataset
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets.circular import (CircularDatasetMeta,
CircularEvaluator)
from opencompass.openicl.icl_evaluator import AccEvaluator, BaseEvaluator
from opencompass.openicl.icl_inferencer import GenInferencer, PPLInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.registry import LOAD_DATASET
from opencompass.utils.text_postprocessors import first_option_postprocess
from .base import BaseDataset
class OptionSimAccEvaluator(BaseEvaluator):
def __init__(self, options) -> None:
super().__init__()
if not all((isinstance(i, str) and i.isupper() and len(i) == 1)
for i in options):
raise ValueError(
f'Each option should be a single uppercase letter, got {options}')
self.options = options
def match_any_label(self, pred, test_item):
from rapidfuzz.distance import Levenshtein as L
from opencompass.utils.text_postprocessors import \
first_option_postprocess
pred = pred.strip()
if any([pred == i for i in self.options]):
parsed = pred
else:
parsed = ''
if parsed == '':
parsed = first_option_postprocess(pred,
''.join(self.options),
cushion=False)
if parsed == '':
possible_options = []
for opt in self.options:
opt_str = test_item[opt]
if opt_str is not None and opt_str.lower() in pred.lower():
possible_options.append(opt)
if len(possible_options) == 1:
parsed = possible_options[0]
if parsed == '':
dists = []
for opt in self.options:
opt_str = test_item[opt]
if opt_str is None:
continue
cands = [opt, opt_str, opt + '. ' + opt_str]
d = min(L.distance(pred, cand) for cand in cands)
dists.append((d, opt))
if len(dists) > 0:
parsed = min(dists)[1]
return parsed
def score(self, predictions: List, references: List, test_set) -> dict:
assert len(predictions) == len(references)
num_correct, num_total = 0, 0
details = {}
for index in range(len(predictions)):
pred = predictions[index]
refr = references[index]
parsed = self.match_any_label(pred, test_set[index])
num_correct += 1 if parsed == refr else 0
num_total += 1
details[str(index)] = {}
details[str(index)]['pred'] = pred
details[str(index)]['parsed'] = parsed
details[str(index)]['refr'] = refr
details[str(index)]['correct'] = parsed == refr
return {'accuracy': num_correct / num_total * 100, 'details': details}
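A small usage sketch of the fuzzy matching above (options and predictions are made up): exact letters are accepted first, then `first_option_postprocess`, then a unique option-text match, and finally the closest option by Levenshtein distance.

evaluator = OptionSimAccEvaluator(options=['A', 'B', 'C', 'D'])
test_item = {'A': 'Paris', 'B': 'London', 'C': 'Berlin', 'D': 'Rome'}
evaluator.match_any_label('B', test_item)                     # exact letter -> 'B'
evaluator.match_any_label('The answer is Paris.', test_item)  # option text  -> 'A'
evaluator.match_any_label('Pari', test_item)                  # Levenshtein  -> 'A'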
# TODO: DO NOT COPY YOURSELF!!!
class CircularOptionSimAccEvaluator(OptionSimAccEvaluator):
def __init__(self, options, circular_pattern='circular'):
super().__init__(options)
self.circular_pattern = circular_pattern
def score(self, predictions, references, test_set):
from opencompass.datasets.circular import (get_all_possible_patterns,
get_circular_patterns,
get_origin_patterns)
circular_patterns = {}
circular_patterns['origin'] = get_origin_patterns(
test_set[0]['circular_pattern'])
circular_patterns['circular'] = get_circular_patterns(
test_set[0]['circular_pattern'])
if self.circular_pattern == 'all_possible':
circular_patterns['all_possible'] = get_all_possible_patterns(
test_set[0]['circular_pattern'])
metrics = {}
tmp_metrics = {}
tmp_metrics.update({f'correct_{k}': 0 for k in circular_patterns})
tmp_metrics.update({f'count_{k}': 0 for k in circular_patterns})
# calculate the original accuracy
for pred, refr, origin_item in zip(predictions, references, test_set):
parsed = self.match_any_label(pred, origin_item)
circular_pattern = origin_item['circular_pattern']
for k in circular_patterns:
if tuple(circular_pattern) in circular_patterns[k]:
tmp_metrics[f'correct_{k}'] += (1 if parsed == refr else 0)
tmp_metrics[f'count_{k}'] += 1
for k in circular_patterns:
metrics[f'acc_{k}'] = (tmp_metrics[f'correct_{k}'] /
tmp_metrics[f'count_{k}'] * 100)
# calculate the circular accuracy
_details = {k: {} for k in circular_patterns}
for pred, refr, origin_item in zip(predictions, references, test_set):
index = origin_item['qid']
parsed = self.match_any_label(pred, origin_item)
circular_pattern = origin_item['circular_pattern']
for k in circular_patterns:
if tuple(circular_pattern) in circular_patterns[k]:
_details[k].setdefault(
index, []).append(True if parsed == refr else False)
for k in _details:
_details[k] = {
index: sum(_details[k][index])
for index in _details[k]
}
for k in _details:
for j in range(1, len(circular_patterns[k]) + 1):
count = sum([_details[k][index] >= j for index in _details[k]])
total = len(_details[k])
if j != len(circular_patterns[k]):
metrics[f'more_{j}_{k}'] = count / total * 100
else:
metrics[f'perf_{k}'] = count / total * 100
# make details
details = {}
for index in range(len(predictions)):
parsed = self.match_any_label(predictions[index], test_set[index])
details[str(index)] = {}
if 'question' in test_set[index]:
details[str(index)]['question'] = test_set[index]['question']
details[str(index)]['pred'] = predictions[index]
details[str(index)]['parsed'] = parsed
details[str(index)]['refr'] = references[index]
details[str(index)]['correct'] = parsed == references[index]
metrics['details'] = details
return metrics
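For orientation, a sketch of the metric keys produced by the circular scorer above, assuming `circular_pattern='circular'` and four options (i.e. four rotations per question); the variable names are placeholders:

metrics = CircularOptionSimAccEvaluator(['A', 'B', 'C', 'D']).score(
    predictions, references, test_set)
# metrics['acc_origin']       accuracy on the original option order
# metrics['acc_circular']     accuracy over all rotations
# metrics['more_1_circular']  ... ['more_3_circular']
#                             % of questions correct in at least j rotations
# metrics['perf_circular']    % of questions correct in every rotation
# metrics['details']          per-sample parsed label and correctness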
@LOAD_DATASET.register_module()
class CustomDataset(BaseDataset):
@staticmethod
def load(path):
if path.endswith('.jsonl'):
with open(path, 'r', encoding='utf-8') as f:
with open(path, 'r', encoding='utf-8-sig') as f:
data = [json.loads(line) for line in f]
elif path.endswith('.csv'):
with open(path, 'r', encoding='utf-8') as f:
with open(path, 'r', encoding='utf-8-sig') as f:
reader = csv.reader(f)
header = next(reader)
data = [dict(zip(header, row)) for row in reader]
@ -33,6 +179,10 @@ class CustomDataset(BaseDataset):
return Dataset.from_list(data)
class CircularCustomDataset(CustomDataset, metaclass=CircularDatasetMeta):
dataset_class = CustomDataset
def stringfy_types(obj):
for k, v in obj.items():
if k == 'type':
@ -69,12 +219,12 @@ def make_mcq_gen_config(meta):
inferencer=dict(type=GenInferencer),
)
eval_cfg = dict(evaluator=dict(type=AccEvaluator),
pred_role='BOT',
pred_postprocessor=dict(
type=first_option_postprocess,
options=''.join(meta['options']),
))
eval_cfg = dict(
evaluator=dict(type=meta.get('evaluator', OptionSimAccEvaluator),
**meta.get('evaluator_kwargs',
{'options': meta['options']})),
pred_role='BOT',
)
dataset = dict(
abbr=meta['abbr'],
@ -87,6 +237,54 @@ def make_mcq_gen_config(meta):
return dataset
def make_circular_mcq_gen_config(meta):
if meta.get('template', None) is None:
_human_prompt = 'Question: {question}' + ''.join(
[f'\n{item}. {{{item}}}' for item in meta['options']])
human_prompt = meta.get('human_prompt', _human_prompt)
_bot_prompt = f'Answer: {{{meta["output_column"]}}}'
bot_prompt = meta.get('bot_prompt', _bot_prompt)
template = dict(round=[
dict(role='HUMAN', prompt=human_prompt),
dict(role='BOT', prompt=bot_prompt),
])
else:
template = meta['template']
reader_cfg = dict(
input_columns=meta['input_columns'],
output_column=meta['output_column'],
)
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=template,
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
eval_cfg = dict(
evaluator=dict(type=meta.get('evaluator',
CircularOptionSimAccEvaluator),
**meta.get('evaluator_kwargs',
{'options': meta['options']})),
pred_role='BOT',
)
dataset = dict(
abbr=meta['abbr'],
type=CircularCustomDataset,
option_keys=meta['options'],
answer_key=meta['output_column'],
path=meta['path'],
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
return dataset
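A sketch of the meta dict that drives `make_circular_mcq_gen_config`; the field values are illustrative and only keys actually read by the function are shown:

meta = dict(
    abbr='my_mcq_circular',
    path='data/my_mcq.csv',
    input_columns=['question', 'A', 'B', 'C', 'D'],
    output_column='answer',
    options=['A', 'B', 'C', 'D'],
)
dataset_cfg = make_circular_mcq_gen_config(meta)
# dataset_cfg wraps a CircularCustomDataset with a GenInferencer and the
# CircularOptionSimAccEvaluator defined above.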
def make_qa_gen_config(meta):
if meta.get('template', None) is None:
human_prompt = meta.get('human_prompt', '{question}')
@ -102,7 +300,6 @@ def make_qa_gen_config(meta):
])
else:
template = meta['template']
reader_cfg = dict(
input_columns=meta['input_columns'],
output_column=meta['output_column'],
@ -117,7 +314,8 @@ def make_qa_gen_config(meta):
)
eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
evaluator=dict(type=meta.get('evaluator', AccEvaluator),
**meta.get('evaluator_kwargs', {})),
pred_role='BOT',
)
@ -164,7 +362,8 @@ def make_mcq_ppl_config(meta):
inferencer=dict(type=PPLInferencer),
)
eval_cfg = dict(evaluator=dict(type=AccEvaluator))
eval_cfg = dict(evaluator=dict(type=meta.get('evaluator', AccEvaluator),
**meta.get('evaluator_kwargs', {})))
dataset = dict(
abbr=meta['abbr'],
@ -177,17 +376,61 @@ def make_mcq_ppl_config(meta):
return dataset
def parse_example_dataset(config):
# try to read meta json
path = config['path']
meta_path = config.get('meta_path', path + '.meta.json')
if os.path.exists(meta_path):
with open(meta_path, 'r', encoding='utf-8') as f:
meta = json.load(f)
def make_circular_mcq_ppl_config(meta):
if meta.get('template', None) is None:
_human_prompt = 'Question: {question}' + ''.join(
[f'\n{item}. {{{item}}}' for item in meta['options']])
human_prompt = meta.get('human_prompt', _human_prompt)
_bot_prompt = f'Answer: {{{meta["output_column"]}}}'
bot_prompt = meta.get('bot_prompt', _bot_prompt)
template = {
answer: dict(round=[
dict(role='HUMAN', prompt=human_prompt),
dict(role='BOT',
prompt=bot_prompt.format(
**{meta['output_column']: answer})),
], )
for answer in meta['options']
}
else:
meta = {}
template = meta['template']
# load sample
reader_cfg = dict(
input_columns=meta['input_columns'],
output_column=meta['output_column'],
)
infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=template,
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
eval_cfg = dict(
evaluator=dict(type=meta.get('evaluator', CircularEvaluator),
**meta.get('evaluator_kwargs', {})))
dataset = dict(
abbr=meta['abbr'],
type=CircularCustomDataset,
option_keys=meta['options'],
answer_key=meta['output_column'],
path=meta['path'],
reader_cfg=reader_cfg,
infer_cfg=infer_cfg,
eval_cfg=eval_cfg,
)
return dataset
def parse_example_dataset(config):
# config -> .meta.jsonl -> parsed_results
path = config['path']
# load sample and get parsed_meta
parsed_meta = {}
if path.endswith('.jsonl'):
with open(path, 'r', encoding='utf-8') as f:
data_item = json.loads(f.readline())
@ -200,11 +443,11 @@ def parse_example_dataset(config):
else:
raise ValueError(f'Unsupported ext: {path}, .jsonl or .csv required')
meta['path'] = path
parsed_meta['path'] = path
input_columns = [i for i in data_item.keys() if i != 'answer']
meta.setdefault('input_columns', input_columns)
parsed_meta['input_columns'] = input_columns
output_column = 'answer' if 'answer' in data_item else None
meta.setdefault('output_column', output_column)
parsed_meta['output_column'] = output_column
options = []
for i in range(26):
i = chr(ord('A') + i)
@ -212,19 +455,28 @@ def parse_example_dataset(config):
options.append(i)
else:
break
meta.setdefault('options', options)
parsed_meta['options'] = options
abbr = os.path.basename(path).split('.')[0]
meta.setdefault('abbr', abbr)
parsed_meta['abbr'] = abbr
parsed_meta['data_type'] = 'mcq' if len(options) > 1 else 'qa'
parsed_meta['infer_method'] = 'gen'
if 'data_type' in config:
meta.setdefault('data_type', config['data_type'])
# try to read meta json
meta_path = config.get('meta_path', path + '.meta.json')
if os.path.exists(meta_path):
with open(meta_path, 'r', encoding='utf-8') as f:
read_from_file_meta = json.load(f)
else:
data_type = 'mcq' if len(options) > 1 else 'qa'
meta.setdefault('data_type', data_type)
if 'infer_method' in config:
meta.setdefault('infer_method', config['infer_method'])
else:
meta.setdefault('infer_method', 'gen')
read_from_file_meta = {}
# get config meta
config_meta = copy.deepcopy(config)
# merge meta
meta = {}
meta.update(parsed_meta)
meta.update(read_from_file_meta)
meta.update(config_meta)
return meta
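The merge order above means settings are resolved as: values parsed from the data file < values from `<path>.meta.json` < values from the config dict. A quick sketch (paths and fields are illustrative):

config = dict(path='data/my_mcq.csv',
              data_type='circular-mcq',   # overrides the parsed 'mcq'
              infer_method='gen')
meta = parse_example_dataset(config)
# meta['data_type'] == 'circular-mcq' even though the parser inferred 'mcq',
# because the config dict is merged last.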
@ -236,6 +488,8 @@ def make_custom_dataset_config(config):
('mcq', 'gen'): make_mcq_gen_config,
('mcq', 'ppl'): make_mcq_ppl_config,
('qa', 'gen'): make_qa_gen_config,
('circular-mcq', 'gen'): make_circular_mcq_gen_config,
('circular-mcq', 'ppl'): make_circular_mcq_ppl_config,
}.get((meta['data_type'], meta['infer_method']), None)
if make_config_func is None:
raise ValueError(f'Unsupported dataset data_type: {meta["data_type"]}'

View File

@ -365,7 +365,7 @@ class DS1000ServiceEvaluator(BaseEvaluator):
lib: str,
ip_address='localhost',
port=5000,
timeout=180) -> None:
timeout=600) -> None:
assert lib in _LIBRARY_NAME_LIST, (
f' lib must be in {_LIBRARY_NAME_LIST}')
self.lib = lib

View File

@ -5,6 +5,7 @@ import os.path as osp
import re
import subprocess
import tempfile
import time
from shutil import copyfile
from typing import Dict, Iterable
@ -73,7 +74,8 @@ class HumanevalXEvaluator(BaseEvaluator):
language,
ip_address='localhost',
port=5000,
timeout=180) -> None:
retry=2,
timeout=600) -> None:
assert language in _LANGUAGE_NAME_DICT.keys(), (
f'language must be in {list(_LANGUAGE_NAME_DICT.keys())}')
if language == 'rust':
@ -81,6 +83,7 @@ class HumanevalXEvaluator(BaseEvaluator):
self.language = language
self.ip_address = ip_address
self.port = port
self.retry = retry
self.timeout = timeout
super().__init__()
@ -96,7 +99,17 @@ class HumanevalXEvaluator(BaseEvaluator):
for pred in predictions:
f.write(json.dumps(pred) + '\n')
succeed, output = self._code_eval_service(file_path=tmp_out_path)
num_retry = 0
while num_retry < self.retry:
succeed, output = self._code_eval_service(
file_path=tmp_out_path)
if not succeed and '(56) Recv failure' in output:
# only retry when connection failed
num_retry += 1
# wait a minute in case the service load is too high
time.sleep(60)
else:
break
if succeed:
if isinstance(output, str):
@ -104,9 +117,15 @@ class HumanevalXEvaluator(BaseEvaluator):
elif isinstance(output, dict):
return output
ref_url = 'https://github.com/Ezra-Yu/code-evaluator'
result_file_path = os.path.join(
'outputs', f'humanevalx_{self.language}.json')
ref_url = 'https://opencompass.readthedocs.io/en/latest/advanced_guides/code_eval_service.html' # noqa
if hasattr(self, '_out_dir'):
result_file_path = re.sub('results', 'mid_results',
self._out_dir) + '.json' # noqa
if not osp.exists(osp.dirname(result_file_path)):
os.makedirs(osp.dirname(result_file_path))
else:
result_file_path = os.path.join(
'outputs', f'humanevalx_{self.language}.json')
copyfile(tmp_out_path, result_file_path)
raise Exception(
f'Call CodeEvalService Error in `HumanevalXEvaluator`, The '

View File

@ -0,0 +1,20 @@
import pandas as pd
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class HungarianExamMathDataset(BaseDataset):
@staticmethod
def load(path):
df = pd.read_csv(path)
df.columns = ['question']
outputs = [{
'question': question
} for question in df['question'].tolist()]
dataset = Dataset.from_list(outputs)
return dataset

View File

@ -0,0 +1,20 @@
import json
from datasets import Dataset
from opencompass.registry import LOAD_DATASET
from .base import BaseDataset
@LOAD_DATASET.register_module()
class JsonlDataset(BaseDataset):
@staticmethod
def load(path):
data = []
with open(path, 'r', encoding='utf-8') as f:
for line in f:
data.append(json.loads(line))
return Dataset.from_list(data)

View File

@ -1,4 +1,5 @@
import json
import re
from datasets import Dataset, DatasetDict
@ -9,65 +10,61 @@ from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
from .base import BaseDataset
@LOAD_DATASET.register_module()
class MATHDataset(BaseDataset):
def last_boxed_only_string(string):
idx = string.rfind('\\boxed')
if idx < 0:
idx = string.rfind('\\fbox')
if idx < 0:
return None
@staticmethod
def load(path: str):
i = idx
right_brace_idx = None
num_left_braces_open = 0
while i < len(string):
if string[i] == '{':
num_left_braces_open += 1
if string[i] == '}':
num_left_braces_open -= 1
if num_left_braces_open == 0:
right_brace_idx = i
break
i += 1
def remove_boxed(s):
left = '\\boxed{'
try:
assert s[:len(left)] == left
assert s[-1] == '}'
return s[len(left):-1]
except Exception:
return None
if right_brace_idx is None:
retval = None
else:
retval = string[idx:right_brace_idx + 1]
def last_boxed_only_string(string):
idx = string.rfind('\\boxed')
if idx < 0:
idx = string.rfind('\\fbox')
if idx < 0:
return None
i = idx
right_brace_idx = None
num_left_braces_open = 0
while i < len(string):
if string[i] == '{':
num_left_braces_open += 1
if string[i] == '}':
num_left_braces_open -= 1
if num_left_braces_open == 0:
right_brace_idx = i
break
i += 1
if right_brace_idx is None:
retval = None
else:
retval = string[idx:right_brace_idx + 1]
return retval
dataset = DatasetDict()
data = json.load(open(path))
raw_data = []
for i in data.keys():
raw_data.append({
'problem':
data[i]['problem'],
'solution':
remove_boxed(last_boxed_only_string(data[i]['solution']))
})
dataset['test'] = Dataset.from_list(raw_data)
dataset['train'] = Dataset.from_list(raw_data)
return dataset
return retval
@TEXT_POSTPROCESSORS.register_module('math_postprocess')
def math_postprocess(text: str) -> str:
def remove_boxed(s):
left = '\\boxed{'
try:
assert s[:len(left)] == left
assert s[-1] == '}'
return s[len(left):-1]
except Exception:
return None
def extract_boxed_answer(pred_str, strip_double_curly_brace=False):
boxed_str = last_boxed_only_string(pred_str)
if boxed_str is None:
return None
answer = remove_boxed(boxed_str)
if answer is None:
return None
if strip_double_curly_brace:
match = re.match('^\{(.*)\}$', answer) # noqa: W605
if match:
answer = match.group(1)
return answer
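A short sketch of what `extract_boxed_answer` returns; the inputs are made-up solutions:

extract_boxed_answer('Thus the result is $\\boxed{\\frac{1}{2}}$.')
# -> '\\frac{1}{2}'
extract_boxed_answer('$\\boxed{{5}}$', strip_double_curly_brace=True)
# -> '5'  (the outer {...} pair is stripped)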
def normalize_final_answer(final_answer: str) -> str:
"""Normalize a final answer to a quantitative reasoning question."""
# final_answer = final_answer.split('=')[-1]
SUBSTITUTIONS = [('an ', ''), ('a ', ''), ('.$', '$'), ('\\$', ''),
(r'\ ', ''), (' ', ''), ('mbox', 'text'),
(',\\text{and}', ','), ('\\text{and}', ','),
@ -81,53 +78,74 @@ def math_postprocess(text: str) -> str:
'\\text{}', r'\mathrm{th}', r'^\circ', r'^{\circ}', r'\;', r',\!',
'{,}', '"', '\\dots', '\n', '\r', '\f'
]
import re
for before, after in SUBSTITUTIONS:
final_answer = final_answer.replace(before, after)
for expr in REMOVED_EXPRESSIONS:
final_answer = final_answer.replace(expr, '')
def normalize_final_answer(final_answer: str) -> str:
"""Normalize a final answer to a quantitative reasoning question."""
# final_answer = final_answer.split('=')[-1]
for before, after in SUBSTITUTIONS:
final_answer = final_answer.replace(before, after)
for expr in REMOVED_EXPRESSIONS:
final_answer = final_answer.replace(expr, '')
# Extract answer that is in LaTeX math, is bold,
# is surrounded by a box, etc.
final_answer = re.sub(r'(\\text\{)(.*?)(\})', '\\2', final_answer)
final_answer = re.sub(r'(\\textbf\{)(.*?)(\})', '\\2', final_answer)
final_answer = re.sub(r'(\\overline\{)(.*?)(\})', '\\2', final_answer)
final_answer = re.sub(r'(\\boxed\{)(.*)(\})', '\\2', final_answer)
assert '\n' not in final_answer
assert '\r' not in final_answer
assert '\f' not in final_answer
if len(re.findall(r'finalansweris(.*)', final_answer)) > 0:
final_answer = re.findall(r'finalansweris(.*)', final_answer)[-1]
# Extract answer that is in LaTeX math, is bold,
# is surrounded by a box, etc.
final_answer = re.sub(r'(\\text\{)(.*?)(\})', '\\2', final_answer)
final_answer = re.sub(r'(\\textbf\{)(.*?)(\})', '\\2', final_answer)
final_answer = re.sub(r'(\\overline\{)(.*?)(\})', '\\2', final_answer)
final_answer = re.sub(r'(\\boxed\{)(.*)(\})', '\\2', final_answer)
assert '\n' not in final_answer
assert '\r' not in final_answer
assert '\f' not in final_answer
if len(re.findall(r'finalansweris(.*)', final_answer)) > 0:
final_answer = re.findall(r'finalansweris(.*)', final_answer)[-1]
if len(re.findall(r'answer?is:?(.*)', final_answer)) > 0:
final_answer = re.findall(r'answer?is:?(.*)', final_answer)[-1]
if len(re.findall(r'oxed\{(.*?)\}', final_answer)) > 0:
final_answer = re.findall(r'oxed\{(.*?)\}', final_answer)[-1]
if len(re.findall(r'oxed\{(.*?)\}', final_answer)) > 0:
final_answer = re.findall(r'oxed\{(.*?)\}', final_answer)[-1]
if len(re.findall(r'\$(.*?)\$', final_answer)) > 0:
final_answer = re.findall(r'\$(.*?)\$', final_answer)[-1]
final_answer = final_answer.strip()
if 'rac' in final_answer and '\\frac' not in final_answer:
final_answer = final_answer.replace('rac', '\\frac')
if len(re.findall(r'\$(.*?)\$', final_answer)) > 0:
final_answer = re.findall(r'\$(.*?)\$', final_answer)[-1]
final_answer = final_answer.strip()
if 'rac' in final_answer and '\\frac' not in final_answer:
final_answer = final_answer.replace('rac', '\\frac')
# Normalize shorthand TeX:
# \fracab -> \frac{a}{b}
# \frac{abc}{bef} -> \frac{abc}{bef}
# \fracabc -> \frac{a}{b}c
# \sqrta -> \sqrt{a}
# \sqrtab -> sqrt{a}b
final_answer = re.sub(r'(frac)([^{])(.)', 'frac{\\2}{\\3}',
final_answer)
final_answer = re.sub(r'(sqrt)([^{])', 'sqrt{\\2}', final_answer)
final_answer = final_answer.replace('$', '')
# Normalize shorthand TeX:
# \fracab -> \frac{a}{b}
# \frac{abc}{bef} -> \frac{abc}{bef}
# \fracabc -> \frac{a}{b}c
# \sqrta -> \sqrt{a}
# \sqrtab -> sqrt{a}b
final_answer = re.sub(r'(frac)([^{])(.)', 'frac{\\2}{\\3}', final_answer)
final_answer = re.sub(r'(sqrt)([^{])', 'sqrt{\\2}', final_answer)
final_answer = final_answer.replace('$', '')
# Normalize 100,000 -> 100000
if final_answer.replace(',', '').isdigit():
final_answer = final_answer.replace(',', '')
# Normalize 100,000 -> 100000
if final_answer.replace(',', '').isdigit():
final_answer = final_answer.replace(',', '')
return final_answer
return final_answer
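For example, `normalize_final_answer` reduces a typical model sign-off to a comparable form; the inputs below are assumptions:

normalize_final_answer('The final answer is $\\frac{1}{2}$')
# -> '\\frac{1}{2}'  (spaces removed, "finalansweris" prefix and $...$ stripped)
normalize_final_answer('100,000')
# -> '100000'        (thousands separator dropped for pure digits)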
@LOAD_DATASET.register_module()
class MATHDataset(BaseDataset):
@staticmethod
def load(path: str):
dataset = DatasetDict()
data = json.load(open(path))
raw_data = []
for i in data.keys():
raw_data.append({
'problem':
data[i]['problem'],
'solution':
extract_boxed_answer(data[i]['solution'])
})
dataset['test'] = Dataset.from_list(raw_data)
dataset['train'] = Dataset.from_list(raw_data)
return dataset
@TEXT_POSTPROCESSORS.register_module('math_postprocess')
def math_postprocess(text: str) -> str:
for maybe_ans in text.split('.'):
if 'final answer' in maybe_ans.lower():
@ -137,9 +155,27 @@ def math_postprocess(text: str) -> str:
# text.split('Final Answer: ', 1)[-1].split('\n\n')[0])
@TEXT_POSTPROCESSORS.register_module('math_postprocess_v2')
def math_postprocess_v2(text: str) -> str:
cand_ans = extract_boxed_answer(text, strip_double_curly_brace=True)
if cand_ans:
return cand_ans
for maybe_ans in text.split('.'):
# if 'final answer' in maybe_ans.lower():
if re.search('final answer|answer is', maybe_ans.lower()):
return normalize_final_answer(maybe_ans)
return normalize_final_answer(text.split('.')[0])
@ICL_EVALUATORS.register_module()
class MATHEvaluator(BaseEvaluator):
def __init__(self, version='v1'):
assert version in ['v1', 'v2']
self.version = version
def score(self, predictions, references):
if len(predictions) != len(references):
return {
@ -166,7 +202,7 @@ class MATHEvaluator(BaseEvaluator):
substrs = substrs[1:]
for substr in substrs:
new_str += '\\frac'
if substr[0] == '{':
if len(substr) > 0 and substr[0] == '{':
new_str += substr
else:
try:
@ -228,6 +264,10 @@ class MATHEvaluator(BaseEvaluator):
new_string += new_substr
return new_string
def _fix_sqrt_v2(self, string):
_string = re.sub(r'\\sqrt(\w+)', r'\\sqrt{\1}', string)
return _string
def _strip_string(self, string):
# linebreaks
string = string.replace('\n', '')
@ -295,6 +335,109 @@ class MATHEvaluator(BaseEvaluator):
return string
def _strip_string_v2(self, string):
string = str(string).strip()
# linebreaks
string = string.replace('\n', '')
# right "."
string = string.rstrip('.')
# remove inverse spaces
string = string.replace('\\!', '')
string = string.replace('\\ ', '')
# replace \\ with \
string = string.replace('\\\\', '\\')
string = string.replace('\\\\', '\\')
# replace tfrac and dfrac with frac
string = string.replace('tfrac', 'frac')
string = string.replace('dfrac', 'frac')
# remove \left and \right
string = string.replace('\\left', '')
string = string.replace('\\right', '')
# Remove unit: miles, dollars if after is not none
_string = re.sub(r'\\text{.*?}$', '', string).strip()
if _string != '' and _string != string:
string = _string
# Remove circ (degrees)
string = string.replace('^{\\circ}', '')
string = string.replace('^\\circ', '')
# remove dollar signs
string = string.replace('\\$', '')
string = string.replace('$', '')
string = string.replace('\\text', '')
string = string.replace('x\\in', '')
# remove percentage
string = string.replace('\\%', '')
string = string.replace('\%', '') # noqa: W605
string = string.replace('%', '')
# " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively,
# add "0" if "." is the start of the string
string = string.replace(' .', ' 0.')
string = string.replace('{.', '{0.')
# cdot
string = string.replace('\\cdot', '')
# inf
string = string.replace('infinity', '\\infty')
if '\\infty' not in string:
string = string.replace('inf', '\\infty')
string = string.replace('+\\inity', '\\infty')
# and
string = string.replace('and', '')
string = string.replace('\\mathbf', '')
# use regex to remove \mbox{...}
string = re.sub(r'\\mbox{.*?}', '', string)
# quote
string = string.replace("'", '')
string = string.replace('"', '')
# i, j
if 'j' in string and 'i' not in string:
string = string.replace('j', 'i')
# replace a.000b where b is not number or b is end, with ab, use regex
string = re.sub(r'(\d+)\.0+([^\d])', r'\1\2', string)
string = re.sub(r'(\d+)\.0+$', r'\1', string)
# if empty, return empty string
if len(string) == 0:
return string
if string[0] == '.':
string = '0' + string
# to consider: get rid of e.g. "k = " or "q = " at beginning
if len(string.split('=')) == 2:
if len(string.split('=')[0]) <= 2:
string = string.split('=')[1]
string = self._fix_sqrt_v2(string)
string = string.replace(' ', '')
# \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc.
# Even works with \frac1{72} (but not \frac{72}1).
# Also does a/b --> \\frac{a}{b}
string = self._fix_fracs(string)
# NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple
# cases fix in case the model output is X/Y
string = self._fix_a_slash_b(string)
return string
def is_equiv(self, str1, str2, verbose=False):
if str1 is None and str2 is None:
print('WARNING: Both None')
@ -302,16 +445,24 @@ class MATHEvaluator(BaseEvaluator):
if str1 is None or str2 is None:
return False
if self.version == 'v1':
strip_string_func = self._strip_string
elif self.version == 'v2':
strip_string_func = self._strip_string_v2
else:
raise NotImplementedError
try:
ss1 = self._strip_string(str1)
ss2 = self._strip_string(str2)
ss1 = strip_string_func(str1)
ss2 = strip_string_func(str2)
if verbose:
print(ss1, ss2)
return ss1 == ss2
except: # noqa
except Exception:
return str1 == str2
@ICL_EVALUATORS.register_module()
class MATHAgentEvaluator(MATHEvaluator):
"""math agent evaluator for soft condition.
@ -320,8 +471,9 @@ class MATHAgentEvaluator(MATHEvaluator):
Defaults to `PythonInterpreter`.
"""
def __init__(self, action: str = 'PythonInterpreter'):
def __init__(self, action: str = 'PythonInterpreter', version='v1'):
self.action = action
super().__init__(version=version)
def soft_equal(self, pred, refer, step):
try:

View File

@ -0,0 +1,30 @@
from opencompass.openicl import BaseEvaluator
def check(a, b):
return abs(float(a) - float(b)) < 1e-3
class Math401Evaluator(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
correct = 0
count = 0
details = []
for i, j in zip(predictions, references):
detail = {'pred': i, 'answer': j, 'correct': False}
count += 1
try:
if check(i, j):
correct += 1
detail['correct'] = True
except Exception:
pass
details.append(detail)
result = {'accuracy': 100 * correct / count, 'details': details}
return result

View File

@ -1,4 +1,5 @@
import csv
import json
import os.path as osp
from datasets import Dataset, DatasetDict
@ -18,7 +19,7 @@ class NaturalQuestionDataset(BaseDataset):
dataset = DatasetDict()
for split in ['dev', 'test']:
filename = osp.join(path, f'nq-{split}.qa.csv')
with open(filename) as f:
with open(filename, 'r', encoding='utf-8') as f:
reader = csv.reader(f, delimiter='\t')
raw_data = []
for row in reader:
@ -33,6 +34,26 @@ class NaturalQuestionDataset(BaseDataset):
return dataset
@LOAD_DATASET.register_module()
class NQOpenDataset(BaseDataset):
@staticmethod
def load(path: str):
dataset = DatasetDict()
for split in ['validation', 'train']:
filename = osp.join(path, f'nq-open-{split}.jsonl')
raw_data = []
with open(filename, 'r', encoding='utf-8') as f:
for doc in f:
doc = json.loads(doc)
if split == 'train':
doc['answer'] = doc['answer'][0]
raw_data.append(doc)
dataset[split] = Dataset.from_list(raw_data)
return dataset
@ICL_EVALUATORS.register_module()
class NQEvaluator(BaseEvaluator):

View File

@ -16,13 +16,13 @@ class ReasonBenchDataset(BaseDataset):
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
prompt = line['prompt']
prompt_ppl = line['prompt_ppl']
label = line['label']
label_ppl = line['label_ppl']
choices = line['choices']
tag = line['tag']
source = line['source']
prompt = line.get('prompt', '')
prompt_ppl = line.get('prompt_ppl', '')
label = line.get('label', '')
label_ppl = line.get('label_ppl', '')
choices = line.get('choices', '')
tag = line.get('tag', '')
source = line.get('source', '')
option_content = {choice: line[choice] for choice in choices}
data = {
'prompt': prompt,

View File

@ -1,4 +1,5 @@
import csv
import json
import os.path as osp
from datasets import Dataset, DatasetDict
@ -18,7 +19,7 @@ class TriviaQADataset(BaseDataset):
dataset = DatasetDict()
for split in ['dev', 'test']:
filename = osp.join(path, f'trivia-{split}.qa.csv')
with open(filename) as f:
with open(filename, 'r', encoding='utf-8') as f:
reader = csv.reader(f, delimiter='\t')
raw_data = []
for row in reader:
@ -32,20 +33,49 @@ class TriviaQADataset(BaseDataset):
return dataset
@LOAD_DATASET.register_module()
class TriviaQADataset_V2(BaseDataset):
@staticmethod
def load(path: str):
dataset = DatasetDict()
for split in ['validation', 'train']:
filename = osp.join(path, f'triviaqa-{split}.jsonl')
raw_data = []
with open(filename, 'r', encoding='utf-8') as f:
for doc in f:
doc = json.loads(doc)
raw_data.append(doc)
dataset[split] = Dataset.from_list(raw_data)
return dataset
@LOAD_DATASET.register_module()
class TriviaQADataset_V3(BaseDataset):
@staticmethod
def load(path: str):
data_list = []
with open(path, 'r', encoding='utf-8') as f:
for doc in f:
data_list.append(json.loads(doc))
return Dataset.from_list(data_list)
@ICL_EVALUATORS.register_module()
class TriviaQAEvaluator(BaseEvaluator):
def score(self, predictions, references):
if len(predictions) != len(references):
return {
'error': 'predictions and references have different '
'length'
}
return {'error': 'preds and refrs have different length'}
processed_predictions = []
for prediction in predictions:
prediction = prediction.strip().split('\n')[0].lower()
if 'answer is' in prediction:
prediction = prediction.split('answer is')[-1]
prediction = prediction.split('answer is')[-1]
prediction = prediction.split('a:')[-1]
prediction = prediction.split('answer:')[-1]
prediction = prediction.strip()
prediction = general_postprocess(prediction)
processed_predictions.append(prediction)
processed_answers = [[general_postprocess(j).lower() for j in i]

View File

@ -16,11 +16,14 @@ from jupyter_client import KernelManager
from lagent.actions.base_action import BaseAction
from lagent.schema import ActionReturn, ActionStatusCode
WORK_DIR = os.getenv('CODE_INTERPRETER_WORK_DIR', '/tmp/workspace')
WORK_DIR = os.getenv('CODE_INTERPRETER_WORK_DIR',
f"{os.path.abspath('./output_images')}")
DEFAULT_DESCRIPTION = """Start a Jupyter kernel for executing Python code."""
START_CODE = """
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
def input(*args, **kwargs):
raise NotImplementedError('Python input() function is disabled.')
@ -74,6 +77,10 @@ class IPythonInterpreter(BaseAction):
if user_data_dir:
# user_data_dir = os.path.dirname(user_data_dir)
# in case change of dirs
assert os.path.exists(user_data_dir), \
f'{user_data_dir} does not exist.'
user_data_dir = os.path.abspath(user_data_dir)
user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
self.user_data_dir = user_data_dir
self._initialized = False

View File

@ -24,5 +24,6 @@ from .qwen_api import Qwen # noqa: F401
from .sensetime_api import SenseTime # noqa: F401
from .turbomind import TurboMindModel # noqa: F401
from .turbomind_tis import TurboMindTisModel # noqa: F401
from .vllm import VLLM # noqa: F401
from .xunfei_api import XunFei # noqa: F401
from .zhipuai_api import ZhiPuAI # noqa: F401

View File

@ -2,6 +2,9 @@ from abc import abstractmethod
from copy import deepcopy
from typing import Dict, List, Optional, Tuple, Union
import torch
from mmengine import dist
from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
@ -21,6 +24,9 @@ class BaseModel:
wrapping of any meta instructions.
generation_kwargs (Dict, optional): The generation kwargs for the
model. Defaults to dict().
sync_rank (bool): Whether to sync inputs between ranks. Do not use this
if you are not familiar with this behavior. Check `sync_inputs`
function for more details. Defaults to False.
"""
is_api: bool = False
@ -30,7 +36,8 @@ class BaseModel:
max_seq_len: int = 2048,
tokenizer_only: bool = False,
meta_template: Optional[Dict] = None,
generation_kwargs: Optional[Dict] = dict()):
generation_kwargs: Optional[Dict] = dict(),
sync_rank: bool = False):
self.path = path
self.max_seq_len = max_seq_len
self.tokenizer_only = tokenizer_only
@ -40,6 +47,7 @@ class BaseModel:
if meta_template and 'eos_token_id' in meta_template:
self.eos_token_id = meta_template['eos_token_id']
self.generation_kwargs = generation_kwargs
self.sync_rank = sync_rank
@abstractmethod
def generate(self, inputs: List[str], max_out_len: int) -> List[str]:
@ -77,6 +85,34 @@ class BaseModel:
' ppl-based evaluation yet, try gen-based '
'instead.')
@abstractmethod
def encode(self, prompt: str) -> torch.Tensor:
"""Encode prompt to tokens. Not necessary for most cases.
Args:
prompt (str): Input string.
Returns:
torch.Tensor: Encoded tokens.
"""
raise NotImplementedError(
f'{self.__class__.__name__} does not implement '
'`encode` method.')
@abstractmethod
def decode(self, tokens: torch.Tensor) -> str:
"""Decode tokens to text. Not necessary for most cases.
Args:
tokens (torch.Tensor): Input tokens.
Returns:
str: Decoded text.
"""
raise NotImplementedError(
f'{self.__class__.__name__} does not implement '
'`decode` method.')
@abstractmethod
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized strings.
@ -115,20 +151,6 @@ class BaseModel:
inputs = self.parse_template(templates, mode='ppl')
return self.get_ppl(inputs, mask_length)
def get_loglikelihood_from_template(self,
templates: List[PromptType],
conts: List[str],
mask_length=None):
"""Get perplexity given a list of templates.
Args:
templates (List[PromptType]): A list of templates.
mask_length (List[int]): A list of mask lengths. If provided, the
perplexity will be calculated only on the unmasked tokens.
"""
inputs = self.parse_template(templates, mode='ppl')
return self.get_loglikelihood(inputs, conts, mask_length)
def generate_from_template(self, templates: List[PromptType],
max_out_len: int, **kwargs):
"""Generate completion from a list of templates.
@ -138,6 +160,8 @@ class BaseModel:
max_out_len (int): The maximum length of the output.
"""
inputs = self.parse_template(templates, mode='gen')
if hasattr(self, 'sync_rank') and self.sync_rank:
inputs = self.sync_inputs(inputs)
return self.generate(inputs, max_out_len=max_out_len, **kwargs)
def get_token_len_from_template(
@ -165,6 +189,39 @@ class BaseModel:
token_lens = [self.get_token_len(prompt) for prompt in prompts]
return token_lens[0] if not is_batched else token_lens
def sync_inputs(self, inputs: str) -> str:
"""For some case, when it involves multiprocessing with multiple gpus,
the inputs may differ across ranks; we therefore broadcast the inputs
from rank 0 to keep them consistent.
Args:
inputs (str): Inputs for each rank.
"""
rank = dist.get_rank()
if rank == 0:
tokens = self.encode(inputs)
length = self.get_token_len(inputs)
if length > 2048:
from opencompass.utils import get_logger
get_logger().info(f'Large tokens nums: {length}')
size = torch.tensor([tokens.shape], dtype=torch.long)
else:
tokens = None
size = torch.empty(2, dtype=torch.long)
# broadcast data size
dist.broadcast(size, src=0)
if rank != 0:
tokens = torch.empty(size.tolist(), dtype=torch.long)
# broadcast tokens
dist.broadcast(tokens, src=0)
# the final input might be different from original input
# due to the max sequence limitation
return self.decode(tokens)
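`sync_rank` is opt-in: when set, `generate_from_template` routes prompts through `sync_inputs`, so rank 0's re-tokenized prompt is broadcast to all other ranks before generation. A hedged sketch of enabling it in a model config; the model class and path are assumptions, and the kwarg only takes effect if the subclass forwards it to `BaseModel.__init__`:

models = [
    dict(
        type=HuggingFaceCausalLM,   # assumed subclass of BaseModel
        path='internlm/internlm-7b',
        max_seq_len=2048,
        sync_rank=True,             # enable cross-rank input syncing
        run_cfg=dict(num_gpus=2, num_procs=2),
    ),
]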
def to(self, device):
self.model.to(device)

Some files were not shown because too many files have changed in this diff