Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Sync] minor test (#683)
parent: dd4318f6ab · commit: e78857ac36
.gitignore (vendored) · 1 line changed
@@ -11,6 +11,7 @@ configs/eval_debug*.py
configs/viz_*.py
data
work_dirs
models
configs/internal/
# Byte-compiled / optimized / DLL files
__pycache__/
@@ -1,4 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .CIBench_gen_eb42f9 import ci_datasets  # noqa: F401, F403
    from .CIBench_gen_8ab0dc import ci_datasets  # noqa: F401, F403
@@ -16,28 +16,20 @@ cibench_infer_cfg = dict(
        template="""{questions}""",
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer),
    inferencer=dict(type=AgentInferencer, infer_mode='every'),
)


libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
cibench_eval_cfg = {
    lib: dict(
        evaluator=dict(
            type=CIBenchEvaluator,
            output_dir=f'output_data/cibench/{lib}'),
        pred_role="BOT",
    )
    for lib in libs
}
cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")

cibench_datasets = [
    dict(
        abbr=f"cibench_{lib}",
        abbr=f"cibench_generation_{lib}",
        type=CIBenchDataset,
        path=f"./data/cibench/{lib}",
        reader_cfg=cibench_reader_cfg,
        infer_cfg=cibench_infer_cfg,
        eval_cfg=cibench_eval_cfg[lib],
        eval_cfg=cibench_eval_cfg,
    ) for lib in libs
]
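For orientation, a minimal sketch of what the updated comprehension yields for one library, using only names that appear in the hunk above (the quoted 'CIBenchEvaluator' stands in for the imported class; the expansion itself is an illustration, not part of the commit):

# Illustration: one expanded entry of `cibench_datasets` after this change.
libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
cibench_eval_cfg = dict(evaluator=dict(type='CIBenchEvaluator'), pred_role="BOT")
example_entry = dict(
    abbr='cibench_generation_Pandas',  # f"cibench_generation_{lib}" with lib == 'Pandas'
    path='./data/cibench/Pandas',      # f"./data/cibench/{lib}"
    eval_cfg=cibench_eval_cfg,         # shared config, no longer cibench_eval_cfg[lib]
)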
@ -12,15 +12,15 @@ PROMPT_EN = {
|
||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result'),
|
||||
dict(role='SYSTEM', prompt='Response:210'),
|
||||
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 210'),
|
||||
|
||||
|
||||
dict(role='HUMAN', prompt='Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?'),
|
||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n marbles = 60\n num_increased_marbles = marbles * 2 / 5\n num_total_marbles = marbles + num_increased_marbles\n frisbees = marbles / 2\n num_increased_frisbees = frisbees * 2 / 5\n num_total_frisbees = frisbees + num_increased_frisbees\n deck_cards = frisbees - 20\n num_increased_deck_cards = deck_cards * 2 / 5\n num_total_deck_cards = deck_cards + num_increased_deck_cards\n num_total = num_total_marbles + num_total_frisbees + num_total_deck_cards\n result = num_total\n return result'),
|
||||
dict(role='SYSTEM', prompt='Response:140'),
|
||||
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 140'),
|
||||
|
||||
|
||||
dict(role='HUMAN', prompt='A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?'),
|
||||
dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():\n num_fruits_per_first_three_basket = 9 + 15 + 14\n num_fruits_first_three_basket = num_fruits_per_first_three_basket * 3\n num_apple_fourth_basket = 9 - 2\n num_orange_fourth_basket = 15 - 2\n num_banana_fourth_basket = 14 - 2\n num_fruits_fourth_basket = num_apple_fourth_basket + num_orange_fourth_basket + num_banana_fourth_basket\n num_fruits_total = num_fruits_first_three_basket + num_fruits_fourth_basket\n result = num_fruits_total\n return result"""),
|
||||
dict(role='SYSTEM', prompt='Response:146'),
|
||||
dict(role='SYSTEM', prompt='Response:146'),
|
||||
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 146'),
|
||||
|
||||
dict(role='HUMAN', prompt='{question}'),
|
||||
@ -35,14 +35,14 @@ PROMPT_EN = {
|
||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import *\ndef solution():\n x, y, k = symbols(\'x y k\')\n eq1 = Eq(2**k, Rational(1, 4))\n k_value = solve(eq1, k)[0]\n y = x**k_value\n result = y.subs(x, -3)\n return result'),
|
||||
dict(role='SYSTEM', prompt='Response:1/9'),
|
||||
dict(role='BOT', prompt='Thought: The functional expression of the power function is $y=x^{{-2}}$. Substituting $x=-3$ yields $y=$\\frac{{1}}{{9}}$\nFinalAnswer: A'),
|
||||
|
||||
|
||||
dict(role='HUMAN', prompt='If $3 x-y=12$, what is the value of $\\frac{8^{x}}{2^{y}} ?$\nA. The value cannot be determined from the information given.\nB. $2^{12}$\nC. 4\nD. $8^{2}$'),
|
||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import symbols, Eq, solve\n\ndef sloution():\n x, y = symbols(\'x y\')\n equation = Eq(3*x - y, 12)\n\n y_in_terms_of_x = solve(equation, y)[0]\n expression = 8**x / 2**y_in_terms_of_x\n result = expression.simplify()\n return result'),
|
||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import symbols, Eq, solve\n\ndef sloution():\n x, y = symbols(\'x y\')\n equation = Eq(3*x - y, 12)\n\n y_in_terms_of_x = solve(equation, y)[0]\n expression = 8**x / 2**y_in_terms_of_x\n result = expression.simplify()\n return result'),
|
||||
dict(role='SYSTEM', prompt='Response:2**12'),
|
||||
dict(role='BOT', prompt='Thought: The value of $\\frac{8^{x}}{2^{y}}$ is $2^{12}$\nFinalAnswer: B'),
|
||||
|
||||
|
||||
dict(role='HUMAN', prompt='{question}'),
|
||||
]
|
||||
]
|
||||
}
|
||||
|
||||
PROMPT_CN = {
|
||||
@ -51,17 +51,17 @@ PROMPT_CN = {
|
||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result'),
|
||||
dict(role='SYSTEM', prompt='Response:210'),
|
||||
dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 210'),
|
||||
|
||||
|
||||
dict(role='HUMAN', prompt='Bella有两倍于飞盘的弹珠。她还比卡片多20个飞盘。如果她买每种物品多2/5,她会有多少总数的物品,如果她现在有60颗弹珠?'),
|
||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n marbles = 60\n num_increased_marbles = marbles * 2 / 5\n num_total_marbles = marbles + num_increased_marbles\n frisbees = marbles / 2\n num_increased_frisbees = frisbees * 2 / 5\n num_total_frisbees = frisbees + num_increased_frisbees\n deck_cards = frisbees - 20\n num_increased_deck_cards = deck_cards * 2 / 5\n num_total_deck_cards = deck_cards + num_increased_deck_cards\n num_total = num_total_marbles + num_total_frisbees + num_total_deck_cards\n result = num_total\n return result'),
|
||||
dict(role='SYSTEM', prompt='Response:140'),
|
||||
dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 140'),
|
||||
|
||||
|
||||
dict(role='HUMAN', prompt='一个有4个水果篮子,前三个篮子里有9个苹果、15个橙子和14个香蕉,第四个篮子里每种水果都少2个。总共有多少水果?'),
|
||||
dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():\n num_fruits_per_first_three_basket = 9 + 15 + 14\n num_fruits_first_three_basket = num_fruits_per_first_three_basket * 3\n num_apple_fourth_basket = 9 - 2\n num_orange_fourth_basket = 15 - 2\n num_banana_fourth_basket = 14 - 2\n num_fruits_fourth_basket = num_apple_fourth_basket + num_orange_fourth_basket + num_banana_fourth_basket\n num_fruits_total = num_fruits_first_three_basket + num_fruits_fourth_basket\n result = num_fruits_total\n return result"""),
|
||||
dict(role='SYSTEM', prompt='Response:146'),
|
||||
dict(role='BOT', prompt='Thought: 根据回答,我得到了答案\nFinalAnswer: 146'),
|
||||
|
||||
|
||||
dict(role='HUMAN', prompt='{question}'),
|
||||
],
|
||||
"FEWSHOT_INSTRUCTION_CHOICE" : [
|
||||
@ -74,12 +74,12 @@ PROMPT_CN = {
|
||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import *\ndef solution():\n x, y, k = symbols(\'x y k\')\n eq1 = Eq(2**k, Rational(1, 4))\n k_value = solve(eq1, k)[0]\n y = x**k_value\n result = y.subs(x, -3)\n return result'),
|
||||
dict(role='SYSTEM', prompt='Response:1/9'),
|
||||
dict(role='BOT', prompt='Thought: 求出幂函数的函数表达式为 $y=x^{{-2}}$,代入 $x=-3$ 得到 $y=$\\frac{{1}}{{9}}$\nFinalAnswer: A'),
|
||||
|
||||
|
||||
dict(role='HUMAN', prompt='如果$3 x-y=12$,则$\\frac{8^{x}}{2^{y}}$的值是多少?\nA. 无法从给定的信息中确定值。\nB. $2^{12}$\nC. 4\nD. $8^{2}$'),
|
||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:from sympy import symbols, Eq, solve\n\ndef sloution():\n x, y = symbols(\'x y\')\n equation = Eq(3*x - y, 12)\n\n y_in_terms_of_x = solve(equation, y)[0]\n expression = 8**x / 2**y_in_terms_of_x\n result = expression.simplify()\n return result'),
|
||||
dict(role='SYSTEM', prompt='Response:2**12'),
|
||||
dict(role='BOT', prompt='Thought: $\\frac{8^{x}}{2^{y}}$的值是$2^{12}$\nFinalAnswer: B'),
|
||||
|
||||
|
||||
dict(role='HUMAN', prompt='{question}'),
|
||||
]
|
||||
}
|
||||
@ -95,13 +95,13 @@ mathbench_sets = {
|
||||
# Use circular evaluation or not
|
||||
with_circular_eval = True
|
||||
|
||||
mathbench_code_datasets = []
|
||||
mathbench_agent_datasets = []
|
||||
|
||||
for _split in list(mathbench_sets.keys()):
|
||||
for _name in mathbench_sets[_split]:
|
||||
prompt_example = PROMPT_CN if '_cn' in _name else PROMPT_EN
|
||||
mathbench_infer_cfg = dict(
|
||||
prompt_template=dict(type=PromptTemplate,
|
||||
prompt_template=dict(type=PromptTemplate,
|
||||
template=dict(
|
||||
round = prompt_example["FEWSHOT_INSTRUCTION_CLOZE"] if 'cloze' in _name else prompt_example["FEWSHOT_INSTRUCTION_CHOICE"])),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
@ -112,13 +112,13 @@ for _split in list(mathbench_sets.keys()):
|
||||
evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
|
||||
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
|
||||
|
||||
mathbench_code_datasets.append(
|
||||
mathbench_agent_datasets.append(
|
||||
dict(
|
||||
abbr="mathbench-" + _split + '-' + _name + '-agent',
|
||||
type=MathBenchDataset,
|
||||
path=f"./data/mathbench/{_split}",
|
||||
name=_name,
|
||||
with_circular=with_circular_eval,
|
||||
abbr="mathbench-interpreter-" + _split + '-' + _name,
|
||||
reader_cfg=dict(
|
||||
input_columns=["question"],
|
||||
output_column="answer"
|
@ -6,17 +6,17 @@ from opencompass.datasets import MathBenchDataset, mathbench_postprocess
|
||||
|
||||
cloze_prompts ={
|
||||
"cloze_arith_en": [
|
||||
dict(role='HUMAN', prompt='Q: Calculate (341/11)/(9/(-6)*(-2)/3).'),
|
||||
dict(role='BOT', prompt='A: First, (9/(-6)*(-2)/3) can be simplified by : 9/(-6) = -1.5, -1.5 * (-2) = 3, 3 / 3 = 1. So, (9/(-6)*(-2)/3) is equal to 1. Now, we have `(341/11)/1` equals `341/11`. Finally, calculate `341/11 = 31`. The answer is 31.\n'),
|
||||
dict(role='HUMAN', prompt='Q: In base 14, what is 5 - 638d8d?'),
|
||||
dict(role='BOT', prompt='A: 5 - 638d8d = -638d88. The answer is -638d88.\n'),
|
||||
dict(role='HUMAN', prompt='Q: What is -491354 times -0.34?'),
|
||||
dict(role='BOT', prompt='A: The product of -491354 and -0.34 is 167060.36. The answer is 167060.36.\n'),
|
||||
dict(role='HUMAN', prompt='Q: What is the value of (-55)/(6930/(-382)) + (0 - 3)?.'),
|
||||
dict(role='BOT', prompt='A: First, (-55)/(6930/(-382)) = (-55)/(-(6930/382)) = 55*382/6930 = 21010/6930 = 2101/693. Then, 2101/693 + (0 - 3) = 2101/693 - 3 = 2101/693 - 3*693/693 = (2101-2079)/693 = 22/693 = 2/63. The answer is 2/63.\n'),
|
||||
dict(role='HUMAN', prompt='Q: {question}'),
|
||||
dict(role='BOT', prompt='A: {answer}\n'),
|
||||
]
|
||||
dict(role='HUMAN', prompt='Q: Calculate (341/11)/(9/(-6)*(-2)/3).'),
|
||||
dict(role='BOT', prompt='A: First, (9/(-6)*(-2)/3) can be simplified by : 9/(-6) = -1.5, -1.5 * (-2) = 3, 3 / 3 = 1. So, (9/(-6)*(-2)/3) is equal to 1. Now, we have `(341/11)/1` equals `341/11`. Finally, calculate `341/11 = 31`. The answer is 31.\n'),
|
||||
dict(role='HUMAN', prompt='Q: In base 14, what is 5 - 638d8d?'),
|
||||
dict(role='BOT', prompt='A: 5 - 638d8d = -638d88. The answer is -638d88.\n'),
|
||||
dict(role='HUMAN', prompt='Q: What is -491354 times -0.34?'),
|
||||
dict(role='BOT', prompt='A: The product of -491354 and -0.34 is 167060.36. The answer is 167060.36.\n'),
|
||||
dict(role='HUMAN', prompt='Q: What is the value of (-55)/(6930/(-382)) + (0 - 3)?.'),
|
||||
dict(role='BOT', prompt='A: First, (-55)/(6930/(-382)) = (-55)/(-(6930/382)) = 55*382/6930 = 21010/6930 = 2101/693. Then, 2101/693 + (0 - 3) = 2101/693 - 3 = 2101/693 - 3*693/693 = (2101-2079)/693 = 22/693 = 2/63. The answer is 2/63.\n'),
|
||||
dict(role='HUMAN', prompt='Q: {question}'),
|
||||
dict(role='BOT', prompt='A: {answer}\n'),
|
||||
]
|
||||
}
|
||||
|
||||
mathbench_sets = {
|
@ -94,11 +94,11 @@ for _split in list(mathbench_sets.keys()):
|
||||
|
||||
mathbench_datasets.append(
|
||||
dict(
|
||||
abbr="mathbench-" + _split + '-' + _name,
|
||||
type=MathBenchDataset,
|
||||
path=f"./data/mathbench/{_split}",
|
||||
name=_name,
|
||||
with_circular=with_circular_eval,
|
||||
abbr="mathbench-" + _split + '-' + _name,
|
||||
reader_cfg=dict(
|
||||
input_columns=["question"],
|
||||
output_column="answer"
|
||||
|
configs/datasets/ds1000/ds1000_compl_gen_cbc84f.py · 69 lines (new file)
@@ -0,0 +1,69 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (DS1000Dataset, ds1000_completion_postprocess,
                                  ds1000_matplotlib_postprocess,
                                  DS1000Evaluator)

ds1000_reader_cfg = dict(
    input_columns=["prompt"],
    output_column="test_column",
    train_split='test',
    test_split='test')

ds1000_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="{prompt}",
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ds1000_eval_cfg = dict(
    evaluator=dict(type=DS1000Evaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=ds1000_completion_postprocess),
)

# The DS-1000 dataset can be downloaded from
# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
ds1000_datasets = [
    dict(
        abbr=f"ds1000_{lib}",
        type=DS1000Dataset,
        path="./data/ds1000_data/",
        libs=f"{lib}",
        mode="Completion",
        reader_cfg=ds1000_reader_cfg,
        infer_cfg=ds1000_infer_cfg,
        eval_cfg=ds1000_eval_cfg,
    ) for lib in [
        'Pandas',
        'Numpy',
        'Tensorflow',
        'Scipy',
        'Sklearn',
        'Pytorch',
    ]
]
ds1000_datasets.append(
    dict(
        abbr="ds1000_Matplotlib",
        type=DS1000Dataset,
        path="./data/ds1000_data/",
        libs="Matplotlib",
        mode="Completion",
        reader_cfg=ds1000_reader_cfg,
        infer_cfg=ds1000_infer_cfg,
        eval_cfg=dict(
            evaluator=dict(type=DS1000Evaluator),
            pred_role="BOT",
            pred_postprocessor=dict(type=ds1000_matplotlib_postprocess),
        ),
    ))
@@ -0,0 +1,68 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import DS1000Dataset, DS1000ServiceEvaluator

ds1000_reader_cfg = dict(
    input_columns=["prompt"],
    output_column="test_column",
    train_split='test',
    test_split='test')

ds1000_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt="{prompt}",
            ),
        ]),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

ds1000_eval_cfg_dict = {
    lib: dict(
        evaluator=dict(
            type=DS1000ServiceEvaluator,
            lib=lib,
            ip_address=
            "localhost",  # replace to your code_eval_server ip_address, port
            port=5000
        ),
        pred_role="BOT")
    for lib in [
        'Pandas',
        'Numpy',
        'Tensorflow',
        'Scipy',
        'Sklearn',
        'Pytorch',
        'Matplotlib',
    ]
}

# The DS-1000 dataset can be downloaded from
# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
ds1000_datasets = [
    dict(
        abbr=f"ds1000_{lib}",
        type=DS1000Dataset,
        path="./data/ds1000_data/",
        libs=f"{lib}",
        mode="Completion",
        reader_cfg=ds1000_reader_cfg,
        infer_cfg=ds1000_infer_cfg,
        eval_cfg=ds1000_eval_cfg_dict[lib],
    ) for lib in [
        'Pandas',
        'Numpy',
        'Tensorflow',
        'Scipy',
        'Sklearn',
        'Pytorch',
        'Matplotlib',
    ]
]
@ -20,15 +20,15 @@ gsm8k_infer_cfg = dict(
|
||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n mark_pointers_2 = 25 * 2\n mark_pointers_3 = 8 * 3\n mark_free_throws = 10 * 1\n mark_points_scored = mark_pointers_2 + mark_pointers_3 + mark_free_throws\n opponents_pointers_2 = mark_pointers_2 * 2\n opponents_pointers_3 = mark_pointers_3 / 2\n opponents_free_throws = mark_free_throws / 2\n opponents_points_scored = opponents_pointers_2 + opponents_pointers_3 + opponents_free_throws\n total_points_scored = mark_points_scored + opponents_points_scored\n result = total_points_scored\n return result'),
|
||||
dict(role='SYSTEM', prompt='Response:210'),
|
||||
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 210'),
|
||||
|
||||
|
||||
dict(role='HUMAN', prompt='Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?'),
|
||||
dict(role='BOT', prompt='Tool:PythonInterpreter\nTool Input:def solution():\n marbles = 60\n num_increased_marbles = marbles * 2 / 5\n num_total_marbles = marbles + num_increased_marbles\n frisbees = marbles / 2\n num_increased_frisbees = frisbees * 2 / 5\n num_total_frisbees = frisbees + num_increased_frisbees\n deck_cards = frisbees - 20\n num_increased_deck_cards = deck_cards * 2 / 5\n num_total_deck_cards = deck_cards + num_increased_deck_cards\n num_total = num_total_marbles + num_total_frisbees + num_total_deck_cards\n result = num_total\n return result'),
|
||||
dict(role='SYSTEM', prompt='Response:140'),
|
||||
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 140'),
|
||||
|
||||
|
||||
dict(role='HUMAN', prompt='A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?'),
|
||||
dict(role='BOT', prompt="""Tool:PythonInterpreter\nTool Input:def solution():\n num_fruits_per_first_three_basket = 9 + 15 + 14\n num_fruits_first_three_basket = num_fruits_per_first_three_basket * 3\n num_apple_fourth_basket = 9 - 2\n num_orange_fourth_basket = 15 - 2\n num_banana_fourth_basket = 14 - 2\n num_fruits_fourth_basket = num_apple_fourth_basket + num_orange_fourth_basket + num_banana_fourth_basket\n num_fruits_total = num_fruits_first_three_basket + num_fruits_fourth_basket\n result = num_fruits_total\n return result"""),
|
||||
dict(role='SYSTEM', prompt='Response:146'),
|
||||
dict(role='SYSTEM', prompt='Response:146'),
|
||||
dict(role='BOT', prompt='Thought: According to the response, I got the answer\nFinalAnswer: 146'),
|
||||
|
||||
dict(role='HUMAN', prompt='{question}'),
|
||||
@ -45,7 +45,7 @@ gsm8k_eval_cfg = dict(
|
||||
|
||||
gsm8k_datasets = [
|
||||
dict(
|
||||
abbr='gsm8k',
|
||||
abbr='gsm8k-agent',
|
||||
type=GSM8KDataset,
|
||||
path='./data/gsm8k',
|
||||
reader_cfg=gsm8k_reader_cfg,
|
||||
|
configs/datasets/gsm8k/gsm8k_gen_3309bd.py · 39 lines (new file)
@ -0,0 +1,39 @@
|
||||
from opencompass.openicl.icl_prompt_template import PromptTemplate
|
||||
from opencompass.openicl.icl_retriever import ZeroRetriever
|
||||
from opencompass.openicl.icl_inferencer import GenInferencer
|
||||
from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
|
||||
|
||||
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
|
||||
|
||||
gsm8k_infer_cfg = dict(
|
||||
prompt_template=dict(
|
||||
type=PromptTemplate,
|
||||
template=dict(
|
||||
round=[
|
||||
dict(role='HUMAN', prompt="Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAnswer:"),
|
||||
dict(role='BOT', prompt="Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n"),
|
||||
dict(role='HUMAN', prompt="Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\nAnswer:"),
|
||||
dict(role='BOT', prompt="Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n"),
|
||||
dict(role='HUMAN', prompt="Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\nAnswer:"),
|
||||
dict(role='BOT', prompt="When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n"),
|
||||
dict(role='HUMAN', prompt="Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\nAnswer:"),
|
||||
dict(role='BOT', prompt="For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n"),
|
||||
dict(role='HUMAN', prompt="Question: {question}\nLet's think step by step\nAnswer:"),
|
||||
],
|
||||
)),
|
||||
retriever=dict(type=ZeroRetriever),
|
||||
inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=[":", "Question:", "Question"]))
|
||||
|
||||
gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
|
||||
pred_postprocessor=dict(type=gsm8k_postprocess),
|
||||
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))
|
||||
|
||||
gsm8k_datasets = [
|
||||
dict(
|
||||
abbr='gsm8k',
|
||||
type=GSM8KDataset,
|
||||
path='./data/gsm8k',
|
||||
reader_cfg=gsm8k_reader_cfg,
|
||||
infer_cfg=gsm8k_infer_cfg,
|
||||
eval_cfg=gsm8k_eval_cfg)
|
||||
]
|
@@ -0,0 +1,57 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLOnlyInferencer
from opencompass.openicl.icl_evaluator import AveragePPLEvaluator
from opencompass.datasets import GSM8KDataset, GSM8KReferenceSkywork

gsm8k_datasets = []

gsm8k_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template="{question} {answer}"),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLOnlyInferencer),
)

gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))

for split in ['train', 'test']:
    gsm8k_reader_cfg = dict(
        input_columns=['question', 'answer'],
        output_column=None,
        train_split=split,
        test_split=split,
    )
    gsm8k_datasets.append(
        dict(
            abbr=f'gsm8k-{split}-ppl',
            type=GSM8KDataset,
            path='./data/gsm8k',
            reader_cfg=gsm8k_reader_cfg,
            infer_cfg=gsm8k_infer_cfg,
            eval_cfg=gsm8k_eval_cfg)
    )


gsm8k_infer_cfg = dict(
    prompt_template=dict(type=PromptTemplate, template="{text}"),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=PPLOnlyInferencer),
)

gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))

gsm8k_reader_cfg = dict(
    input_columns=['text'],
    output_column=None,
)

gsm8k_datasets.append(
    dict(
        abbr=f'gsm8k-ref-ppl',
        type=GSM8KReferenceSkywork,
        path='./data/gsm8k-extra/mock_gsm8k_test.jsonl',
        reader_cfg=gsm8k_reader_cfg,
        infer_cfg=gsm8k_infer_cfg,
        eval_cfg=gsm8k_eval_cfg
    )
)
@ -79,7 +79,7 @@ math_eval_cfg = dict(
|
||||
|
||||
math_datasets = [
|
||||
dict(
|
||||
abbr='math',
|
||||
abbr='math-agent',
|
||||
type=MATHDataset,
|
||||
path='./data/math/math.json',
|
||||
reader_cfg=math_reader_cfg,
|
||||
|
@ -1,4 +1,4 @@
|
||||
from mmengine.config import read_base
|
||||
|
||||
with read_base():
|
||||
from .winogrande_ppl_55a66e import winogrande_datasets # noqa: F401, F403
|
||||
from .winogrande_ppl_8be6c3 import winogrande_datasets # noqa: F401, F403
|
||||
|
@ -4,6 +4,10 @@ from opencompass.openicl.icl_inferencer import PPLInferencer
|
||||
from opencompass.openicl.icl_evaluator import AccEvaluator
|
||||
from opencompass.datasets import winograndeDataset
|
||||
|
||||
# WARNING: This config cannot reproduce results in the paper.
|
||||
# e.g. LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config)
|
||||
# Please try winogrande_ppl_8be6c3
|
||||
|
||||
winogrande_reader_cfg = dict(
|
||||
input_columns=['opt1', 'opt2'],
|
||||
output_column='answer',
|
||||
|
configs/datasets/winogrande/winogrande_ppl_8be6c3.py · 33 lines (new file)
@@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import LoglikelihoodInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset

winogrande_reader_cfg = dict(
    input_columns=['opt1', 'opt2'],
    output_column='answer',
)

winogrande_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template={
            1: "{opt1}",
            2: "{opt2}",
        }
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=LoglikelihoodInferencer))

winogrande_eval_cfg = dict(evaluator=dict(type=AccEvaluator))

winogrande_datasets = [
    dict(
        abbr='winogrande',
        type=winograndeDataset,
        path='./data/winogrande',
        reader_cfg=winogrande_reader_cfg,
        infer_cfg=winogrande_infer_cfg,
        eval_cfg=winogrande_eval_cfg)
]
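Conceptually, this new config scores each Winogrande option as a complete sentence and keeps the more likely one; the snippet below is only a sketch of that idea, not the LoglikelihoodInferencer implementation.

# Conceptual sketch, not OpenCompass internals: with template keys 1 and 2 mapped to
# "{opt1}" and "{opt2}", the option whose filled-in sentence receives the higher
# log-likelihood is taken as the prediction, which AccEvaluator compares to 'answer'.
def pick_winogrande_option(loglik_opt1: float, loglik_opt2: float) -> str:
    return '1' if loglik_opt1 >= loglik_opt2 else '2'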
@ -4,6 +4,10 @@ from opencompass.openicl.icl_inferencer import PPLInferencer
|
||||
from opencompass.openicl.icl_evaluator import AccEvaluator
|
||||
from opencompass.datasets import winograndeDataset
|
||||
|
||||
# WARNING: This config cannot reproduce results in the paper.
|
||||
# e.g. LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config)
|
||||
# Please try winogrande_ppl_8be6c3
|
||||
|
||||
winogrande_reader_cfg = dict(
|
||||
input_columns=['opt1', 'opt2'],
|
||||
output_column='answer',
|
||||
|
@ -4,11 +4,20 @@ from opencompass.partitioners import SizePartitioner
|
||||
from opencompass.runners import LocalRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
from opencompass.models.lagent import LagentAgent
|
||||
from lagent import PythonInterpreter, ReAct
|
||||
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
|
||||
from lagent import ReAct
|
||||
from lagent.agents.react import ReActProtocol
|
||||
|
||||
with read_base():
|
||||
from .datasets.gsm8k.gsm8k_agent_gen_3ac57d import gsm8k_datasets as datasets
|
||||
from .datasets.gsm8k.gsm8k_agent_gen_3ac57d import gsm8k_datasets
|
||||
from .datasets.math.math_agent_gen_861b4f import math_datasets
|
||||
from .datasets.MathBench.mathbench_agent_gen_568903 import mathbench_agent_datasets
|
||||
from .summarizers.math_agent import summarizer
|
||||
|
||||
datasets = []
|
||||
datasets += gsm8k_datasets
|
||||
datasets += math_datasets
|
||||
datasets += mathbench_agent_datasets
|
||||
|
||||
system_prompt = """You are a helpful assistant which use tools to solve mathematical reasoning questions. The code must be a function, and the function name must be 'solution'. For mathematics, please use code tool to calculate. The example format is as follows:
|
||||
```
|
||||
@ -52,4 +61,4 @@ infer = dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=16,
|
||||
task=dict(type=OpenICLInferTask)),
|
||||
)
|
||||
)
|
@ -10,7 +10,7 @@ from opencompass.runners import LocalRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
|
||||
with read_base():
|
||||
from .datasets.CIBench.CIBench_gen_eb42f9 import \
|
||||
from .datasets.CIBench.CIBench_gen_8ab0dc import \
|
||||
cibench_datasets as datasets
|
||||
|
||||
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
|
||||
@ -36,7 +36,21 @@ Also please follow the guidelines:
|
||||
3. The generated codes will be executed in an ipython manner and the results will be cached.
|
||||
4. Your responded code should always be simple and only solves the problem in current step.
|
||||
|
||||
Begin!
|
||||
For example:
|
||||
|
||||
File url: `xxxx`
|
||||
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
|
||||
|
||||
{thought} We should use `pandas` to solve this step.
|
||||
{action} IPythonInterpreter
|
||||
{action_input} ```python
|
||||
import pandas as pd
|
||||
url = "xxxx"
|
||||
data = pd.read_csv(url)
|
||||
```
|
||||
{response} The code is succeed without any outputs.
|
||||
|
||||
Let us begin from here!
|
||||
"""
|
||||
|
||||
IPYTHON_INTERPRETER_DESCRIPTION = '''\
|
||||
@ -69,9 +83,6 @@ models = [
|
||||
),
|
||||
]
|
||||
|
||||
for dataset in datasets:
|
||||
# Evaluate on every assistant response
|
||||
dataset['infer_cfg']['inferencer']['infer_mode'] = 'every'
|
||||
|
||||
infer = dict(
|
||||
partitioner=dict(type=SizePartitioner, max_task_size=1000),
|
||||
@ -79,4 +90,4 @@ infer = dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=16,
|
||||
task=dict(type=OpenICLInferTask)),
|
||||
)
|
||||
)
|
@ -1,56 +0,0 @@
|
||||
from mmengine.config import read_base
|
||||
from opencompass.models.openai_api import OpenAI
|
||||
from opencompass.partitioners import SizePartitioner
|
||||
from opencompass.runners import LocalRunner
|
||||
from opencompass.tasks import OpenICLInferTask
|
||||
from opencompass.models.lagent import LagentAgent
|
||||
from lagent import PythonInterpreter, ReAct
|
||||
from lagent.agents.react import ReActProtocol
|
||||
|
||||
system_prompt = """You are a helpful assistant which use tools to solve mathematical reasoning questions. The code must be a function, and the function name must be 'solution'. For mathematics, please use code tool to calculate. The example format is as follows:
|
||||
```
|
||||
def solution():
|
||||
variable_names_with_real_meaning = func(variable)
|
||||
return variable_names_with_real_meaning
|
||||
```"""
|
||||
|
||||
protocol = dict(
|
||||
type=ReActProtocol,
|
||||
action=dict(role="ACTION", begin="Tool:", end="\n"),
|
||||
action_input=dict(role="ARGS", begin="Tool Input:", end="\n"),
|
||||
finish=dict(role="FINISH", begin="FinalAnswer:", end="\n"),
|
||||
call_protocol=system_prompt,
|
||||
)
|
||||
|
||||
with read_base():
|
||||
from .datasets.MathBench.mathbench_code_gen_568903 import mathbench_code_datasets as datasets
|
||||
from .summarizers.mathbench import summarizer
|
||||
|
||||
models = [
|
||||
dict(
|
||||
abbr='gpt-3.5-react',
|
||||
type=LagentAgent,
|
||||
agent_type=ReAct,
|
||||
max_turn=3,
|
||||
llm=dict(
|
||||
type=OpenAI,
|
||||
path='gpt-3.5-turbo',
|
||||
key='ENV',
|
||||
query_per_second=1,
|
||||
max_seq_len=4096,
|
||||
),
|
||||
actions=[
|
||||
dict(type=PythonInterpreter),
|
||||
],
|
||||
protocol=protocol,
|
||||
batch_size=1,
|
||||
),
|
||||
]
|
||||
|
||||
infer = dict(
|
||||
partitioner=dict(type=SizePartitioner, max_task_size=1000),
|
||||
runner=dict(
|
||||
type=LocalRunner,
|
||||
max_num_workers=16,
|
||||
task=dict(type=OpenICLInferTask)),
|
||||
)
|
configs/eval_with_model_dataset_combinations.py · 43 lines (new file)
@@ -0,0 +1,43 @@
from mmengine.config import read_base

with read_base():
    from .models.qwen.hf_qwen_7b import models as hf_qwen_7b_base_models
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat_models

    from .datasets.ceval.ceval_ppl_578f8d import ceval_datasets as base_ceval_datasets
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets as chat_ceval_datasets

    from .internal.clusters.slurm import infer, eval
    # from .clusters.slurm import infer_split as infer, eval
    # from .clusters.slurm import infer_size as infer, eval
    # from .clusters.slurm import infer_size_split as infer, eval

base_ceval_datasets = base_ceval_datasets[:1]
chat_ceval_datasets = chat_ceval_datasets[-1:]

# If you do not want to run all the combinations of models and datasets, you
# can specify the combinations you want to run here. This is useful when you
# deliberately want to skip some subset of the combinations.
# Models and datasets in different combinations are recommended to be disjoint
# (different `abbr` in model & dataset configs), as we haven't tested this case
# thoroughly.
model_dataset_combinations = [
    dict(models=hf_qwen_7b_base_models, datasets=base_ceval_datasets),
    dict(models=hf_qwen_7b_chat_models, datasets=chat_ceval_datasets),
    # dict(models=[model_cfg1, ...], datasets=[dataset_cfg1, ...]),
]

# The union of models and datasets in model_dataset_combinations should be
# stored in the `models` and `datasets` variables below. Otherwise, modules
# like the summarizer will miss out on some information.
models = [*hf_qwen_7b_base_models, *hf_qwen_7b_chat_models]
datasets = [*base_ceval_datasets, *chat_ceval_datasets]

work_dir = './outputs/default/mdcomb/'

"""
dataset                 version    metric    mode    qwen-7b-hf    qwen-7b-chat-hf
----------------------  ---------  --------  ------  ------------  -----------------
ceval-computer_network  9b9417     accuracy  ppl     52.63         -
ceval-physician         6e277d     accuracy  gen     -             59.18
"""
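As a rough mental model (illustration only, not OpenCompass's actual scheduling code), the paired combinations restrict evaluation to each model list's own dataset list rather than the full cross product of `models` and `datasets`:

# Illustration: tasks are drawn per paired combination, so the base model list
# only meets base_ceval_datasets and the chat model list only meets
# chat_ceval_datasets.
for combo in model_dataset_combinations:
    for model_cfg in combo['models']:
        for dataset_cfg in combo['datasets']:
            pass  # one (model, dataset) evaluation task per pairing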
@ -29,5 +29,6 @@ models = [
|
||||
batch_size=8,
|
||||
meta_template=_meta_template,
|
||||
run_cfg=dict(num_gpus=2, num_procs=1),
|
||||
end_str='<eoa>',
|
||||
)
|
||||
]
|
||||
|
@ -29,5 +29,6 @@ models = [
|
||||
batch_size=8,
|
||||
meta_template=_meta_template,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
end_str='<eoa>',
|
||||
)
|
||||
]
|
||||
|
@ -29,5 +29,6 @@ models = [
|
||||
batch_size=8,
|
||||
meta_template=_meta_template,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
end_str='<eoa>',
|
||||
)
|
||||
]
|
||||
|
@ -22,12 +22,14 @@ models = [
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True,
|
||||
use_fast=False,),
|
||||
use_fast=False,
|
||||
),
|
||||
pad_token_id=151643,
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
meta_template=_meta_template,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
end_str='<|im_end|>',
|
||||
)
|
||||
]
|
||||
|
@ -22,12 +22,14 @@ models = [
|
||||
padding_side='left',
|
||||
truncation_side='left',
|
||||
trust_remote_code=True,
|
||||
use_fast=False,),
|
||||
use_fast=False,
|
||||
),
|
||||
pad_token_id=151643,
|
||||
max_out_len=100,
|
||||
max_seq_len=2048,
|
||||
batch_size=8,
|
||||
meta_template=_meta_template,
|
||||
run_cfg=dict(num_gpus=1, num_procs=1),
|
||||
end_str='<|im_end|>',
|
||||
)
|
||||
]
|
||||
|
configs/summarizers/groups/cibench.py · 4 lines (new file)
@@ -0,0 +1,4 @@

_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
_cibench = ['cibench_' + i for i in _cibench]
cibench_summary_groups = [{'name': 'cibench', 'subsets': _cibench}]
configs/summarizers/groups/mathbench.py · 75 lines (new file)
@ -0,0 +1,75 @@
|
||||
|
||||
mathbench_summary_groups = [
|
||||
{
|
||||
'name': 'mathbench-college',
|
||||
'subsets': [
|
||||
['mathbench-college-single_choice_cn', 'acc_1'],
|
||||
['mathbench-college-cloze_en', 'accuracy'],
|
||||
]
|
||||
},
|
||||
{
|
||||
'name': 'mathbench-high',
|
||||
'subsets': [
|
||||
['mathbench-high-single_choice_cn', 'acc_1'],
|
||||
['mathbench-high-single_choice_en', 'acc_1'],
|
||||
]
|
||||
},
|
||||
{
|
||||
'name': 'mathbench-middle',
|
||||
'subsets': [
|
||||
['mathbench-middle-single_choice_cn', 'acc_1'],
|
||||
]
|
||||
},
|
||||
{
|
||||
'name': 'mathbench-primary',
|
||||
'subsets': [
|
||||
['mathbench-primary-cloze_cn', 'accuracy'],
|
||||
]
|
||||
},
|
||||
{
|
||||
'name': 'mathbench',
|
||||
'subsets': [
|
||||
'mathbench-college',
|
||||
'mathbench-high',
|
||||
'mathbench-middle',
|
||||
'mathbench-primary',
|
||||
],
|
||||
},
|
||||
{
|
||||
'name': 'mathbench-college-circular',
|
||||
'subsets': [
|
||||
['mathbench-college-single_choice_cn', 'perf_4'],
|
||||
]
|
||||
},
|
||||
{
|
||||
'name': 'mathbench-high-circular',
|
||||
'subsets': [
|
||||
['mathbench-high-single_choice_cn', 'perf_4'],
|
||||
['mathbench-high-single_choice_en', 'perf_4'],
|
||||
]
|
||||
},
|
||||
{
|
||||
'name': 'mathbench-middle-circular',
|
||||
'subsets': [
|
||||
['mathbench-middle-single_choice_cn', 'perf_4'],
|
||||
]
|
||||
},
|
||||
{
|
||||
'name': 'mathbench-circular',
|
||||
'subsets': [
|
||||
'mathbench-college-circular',
|
||||
'mathbench-high-circular',
|
||||
'mathbench-middle-circular',
|
||||
],
|
||||
},
|
||||
{
|
||||
'name': 'mathbench-circular-and-cloze',
|
||||
'subsets': [
|
||||
'mathbench-high-circular',
|
||||
'mathbench-middle-circular',
|
||||
'mathbench-circular',
|
||||
'mathbench-college-cloze_en',
|
||||
'mathbench-primary-cloze_cn',
|
||||
],
|
||||
}
|
||||
]
|
configs/summarizers/math_agent.py · 28 lines (new file)
@@ -0,0 +1,28 @@
summarizer = dict(
    dataset_abbrs=[
        '######## GSM8K-Agent Accuracy ########',  # category
        ['gsm8k-agent', 'follow_acc'],
        ['gsm8k-agent', 'reasoning_acc'],
        ['gsm8k-agent', 'code_acc'],
        ['gsm8k-agent', 'action_pct'],
        '######## MATH-Agent Accuracy ########',  # category
        ['math-agent', 'follow_acc'],
        ['math-agent', 'reasoning_acc'],
        ['math-agent', 'code_acc'],
        ['math-agent', 'action_pct'],
        '######## MathBench-Agent Accuracy ########',  # category
        ['mathbench-college-single_choice_cn-agent', 'acc_1'],
        ['mathbench-college-cloze_en-agent', 'accuracy'],
        ['mathbench-high-single_choice_cn-agent', 'acc_1'],
        ['mathbench-high-single_choice_en-agent', 'acc_1'],
        ['mathbench-middle-single_choice_cn-agent', 'acc_1'],
        ['mathbench-primary-cloze_cn-agent', 'accuracy'],
        '######## MathBench-Agent CircularEval ########',  # category
        ['mathbench-college-single_choice_cn-agent', 'perf_4'],
        ['mathbench-high-single_choice_cn-agent', 'perf_4'],
        ['mathbench-high-single_choice_en-agent', 'perf_4'],
        ['mathbench-middle-single_choice_cn-agent', 'perf_4'],
    ],
    summary_groups=sum(
        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
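The summary_groups line relies on a small locals() idiom: any list bound to a name ending in _summary_groups that is present in this config's namespace (for example, a mathbench_summary_groups list, if it is imported here) gets concatenated into one flat list. A standalone sketch of the same idiom, with made-up group names for illustration:

# Hypothetical module-level names, only to show how the idiom behaves.
foo_summary_groups = [{'name': 'foo', 'subsets': ['foo-a', 'foo-b']}]
bar_summary_groups = [{'name': 'bar', 'subsets': ['bar-a']}]
merged = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
# merged == foo_summary_groups + bar_summary_groups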
@ -2,13 +2,15 @@ import json
|
||||
import os
|
||||
import os.path as osp
|
||||
import re
|
||||
import subprocess
|
||||
from collections import defaultdict
|
||||
from typing import List, Optional
|
||||
|
||||
import numpy as np
|
||||
from datasets import Dataset
|
||||
|
||||
from opencompass.openicl.icl_evaluator import BaseEvaluator
|
||||
from opencompass.registry import LOAD_DATASET
|
||||
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
|
||||
|
||||
from .base import BaseDataset
|
||||
|
||||
@ -18,16 +20,29 @@ def load_experiment(file: str) -> dict:
|
||||
with open(file, 'r') as f:
|
||||
notebook = json.load(f)
|
||||
example = notebook['cells']
|
||||
|
||||
metadata = notebook['metadata']
|
||||
modules = metadata.get('modules', [])
|
||||
if modules:
|
||||
# these two annotations should be the same
|
||||
assert len(modules) == len(metadata.get('step_types'))
|
||||
# reformat annotations
|
||||
modules = [[_m.strip() for _m in _modules.split('&')]
|
||||
for _modules in modules]
|
||||
questions = []
|
||||
source_codes = []
|
||||
outputs = []
|
||||
tags = []
|
||||
for cell in example:
|
||||
if cell['cell_type'] == 'markdown':
|
||||
text = ''.join(cell['source'])
|
||||
text = ''.join(cell['source']).strip()
|
||||
if modules:
|
||||
_modules = modules.pop(0)
|
||||
text += f"Please use {' and '.join(_modules)} modules."
|
||||
text = text.strip() + '\n'
|
||||
# append the formatted text
|
||||
questions.append(text)
|
||||
elif cell['cell_type'] == 'code':
|
||||
source_codes.append(''.join(cell['source']))
|
||||
if cell['outputs'] and 'data' in cell['outputs'][-1]:
|
||||
if 'image/png' in cell['outputs'][-1]['data']:
|
||||
# skip vis temporarily due to lack of evaluation
|
||||
@ -39,15 +54,18 @@ def load_experiment(file: str) -> dict:
|
||||
outputs.append(''.join(
|
||||
cell['outputs'][-1]['data']['text/plain']))
|
||||
else:
|
||||
tags.append('executable')
|
||||
tags.append('exec')
|
||||
outputs.append(None)
|
||||
return dict(
|
||||
experiment=file,
|
||||
questions=sum(([
|
||||
dict(role='user', content=question),
|
||||
dict(role='assistant', content=output)
|
||||
] for question, output in zip(questions, outputs)), []),
|
||||
references=dict(outputs=outputs, tags=tags, experiment=file),
|
||||
dict(role='assistant', content=source_code)
|
||||
] for question, source_code in zip(questions, source_codes)), []),
|
||||
references=dict(outputs=outputs,
|
||||
tags=tags,
|
||||
metadata=metadata,
|
||||
experiment=file),
|
||||
)
|
||||
|
||||
|
||||
@ -58,6 +76,7 @@ class CIBenchDataset(BaseDataset):
|
||||
@staticmethod
|
||||
def load(path: str):
|
||||
"""Load whole dataset."""
|
||||
assert os.path.exists(path), f'Path {path} does not exist.'
|
||||
data_list = []
|
||||
for cwd, dirs, files in os.walk(path):
|
||||
dirs.sort()
|
||||
@ -79,21 +98,57 @@ class CIBenchEvaluator(BaseEvaluator):
|
||||
"""Evaluator for CI dataset.
|
||||
|
||||
Args:
|
||||
text_evaluator (optional, dict): The text evaluator for text result
comparison. Defaults to None, which uses Rouge as the default.
Please notice that an extra key `metric_name` should be set
to get the exact metric result, such as `rouge1`.
|
||||
output_dir (optional, str): The directory to save experiment
|
||||
files in a markdown or notebook format.
|
||||
with_ipynb (bool): Generate ipynb correspondingly.
|
||||
Defaults to False.
|
||||
user_data_dir (str): The directory to load local files.
|
||||
Defaults to 'ENV', which means use environment variable
|
||||
`USER_DATA_DIR` to get the data dir.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
text_evaluator: Optional[dict] = None,
|
||||
output_dir: Optional[str] = None,
|
||||
with_ipynb: bool = False,
|
||||
user_data_dir: str = 'ENV') -> None:
|
||||
if text_evaluator is None:
|
||||
from opencompass.openicl.icl_evaluator import RougeEvaluator
|
||||
self.text_evaluator = ICL_EVALUATORS.build(
|
||||
dict(type=RougeEvaluator))
|
||||
self.text_eval_metric = 'rouge1'
|
||||
else:
|
||||
self.text_eval_metric = text_evaluator.pop('metric_name')
|
||||
self.text_evaluator = ICL_EVALUATORS.build(text_evaluator)
|
||||
# TODO: should use work dir for this task.
|
||||
self.output_dir = output_dir
|
||||
self.user_data_dir = self.check_user_data_dir(user_data_dir)
|
||||
self.with_ipynb = with_ipynb
|
||||
self.TAG_MAPPING = {
|
||||
'exec': ('executable', self.valid_step),
|
||||
'general': ('general_correct', self.correct_step),
|
||||
'num': ('numeric_correct', self.correct_step),
|
||||
'text': ('text_score', self.text_step),
|
||||
'vis': ('vis_sim', self.vis_similarity_step),
|
||||
}
|
||||
|
||||
def check_user_data_dir(self, user_data_dir):
|
||||
if user_data_dir == 'ENV':
|
||||
user_data_dir = os.environ.get('USER_DATA_DIR', '')
|
||||
self.user_data_dir = user_data_dir
|
||||
user_data_dir = user_data_dir.rstrip('/')
|
||||
basename = osp.basename(user_data_dir)
|
||||
if basename and basename != 'data':
|
||||
user_data_dir = osp.join(user_data_dir, 'data')
|
||||
assert osp.exists(user_data_dir), \
|
||||
f'a subfolder named `data` should exist under {user_data_dir}.'
|
||||
elif basename:
|
||||
assert osp.exists(user_data_dir), \
|
||||
f'{user_data_dir} does not exist.'
|
||||
return user_data_dir
|
||||
|
||||
@staticmethod
|
||||
def valid_step(step):
|
||||
@ -126,6 +181,24 @@ class CIBenchEvaluator(BaseEvaluator):
|
||||
# Fall back to False
|
||||
return False
|
||||
|
||||
def text_step(self, step, target):
|
||||
"""Whether the step output is correct."""
|
||||
# Found the latest code interpreter to determine correct
|
||||
for action in step[::-1]:
|
||||
if action['type'] == 'IPythonInterpreter':
|
||||
if action['result']:
|
||||
try:
|
||||
pred = action['result']['text']
|
||||
match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
|
||||
if match:
|
||||
out = match.group(1)
|
||||
score = self.text_evaluator.score([out], [target])
|
||||
return score[self.text_eval_metric] / 100
|
||||
except Exception:
|
||||
return False
|
||||
# Fall back to False
|
||||
return False
|
||||
|
||||
@staticmethod
|
||||
def vis_similarity_step(step, target):
|
||||
"""Whether the step output image has the same structure similarity with
|
||||
@ -174,6 +247,7 @@ class CIBenchEvaluator(BaseEvaluator):
|
||||
'the conversion processes.')
|
||||
|
||||
check_jupytext()
|
||||
p_list = []
|
||||
from opencompass.lagent.actions.ipython_interpreter import extract_code
|
||||
for idx, (example_origin_prompt,
|
||||
example_steps) in enumerate(zip(origin_prompt, steps)):
|
||||
@ -198,20 +272,25 @@ class CIBenchEvaluator(BaseEvaluator):
|
||||
f.writelines(markdown_lines)
|
||||
|
||||
# TODO: be careful for this
|
||||
# The result might be different with infer process
|
||||
# please check carefully
|
||||
# convert markdown to ipynb and execute with error tolerance
|
||||
# subprocess.Popen(
|
||||
# "jupytext --to ipynb --pipe-fmt ipynb "
|
||||
# "--pipe 'jupyter nbconvert --to ipynb --execute "
|
||||
# f"--allow-errors --stdin --stdout' {md_file}",
|
||||
# shell=True)
|
||||
if self.with_ipynb:
|
||||
p = subprocess.Popen(
|
||||
'jupytext --to ipynb --pipe-fmt ipynb '
|
||||
"--pipe 'jupyter nbconvert --to ipynb --execute "
|
||||
f"--allow-errors --stdin --stdout' {md_file}",
|
||||
shell=True)
|
||||
p_list.append(p)
|
||||
# TODO: async wait
|
||||
for p in p_list:
|
||||
p.wait()
|
||||
|
||||
def set_data_dir(self, work_dir):
|
||||
"""Set work directory and link data files for save notebook results."""
|
||||
if self.user_data_dir:
|
||||
if self.user_data_dir.endswith('/'):
|
||||
basename = osp.basename(osp.split(self.user_data_dir)[0])
|
||||
else:
|
||||
basename = osp.basename(self.user_data_dir)
|
||||
basename = osp.basename(self.user_data_dir)
|
||||
|
||||
if not osp.exists(osp.join(self.output_dir, basename)):
|
||||
os.symlink(self.user_data_dir,
|
||||
osp.join(self.output_dir, basename))
|
||||
@ -221,10 +300,54 @@ class CIBenchEvaluator(BaseEvaluator):
|
||||
"""Change work directory and keep the symlink."""
|
||||
os.chdir(work_dir)
|
||||
|
||||
def single_exp(self, gold, steps):
|
||||
tags = gold['tags']
|
||||
outputs = gold['outputs']
|
||||
metadata = gold['metadata']
|
||||
hard_tags = metadata.get('step_types', [])
|
||||
if hard_tags:
|
||||
tags = hard_tags
|
||||
|
||||
# executable: exec succeed
|
||||
# general_correct: general correct
|
||||
# numeric_correct: numerical correct
|
||||
# text_score: text score
|
||||
# vis_sim: visual similarity
|
||||
result = defaultdict(list)
|
||||
for tag, step, output in zip(tags, steps, outputs):
|
||||
# check whether this step is valid
result['executable'].append(self.valid_step(step))
if tag != 'exec':
key, func = self.TAG_MAPPING[tag]
result[key].append(func(step, output))

# add missing metric for better analyse if not exists
if hard_tags:
check_tags = ['exec', 'num', 'text', 'vis']
else:
check_tags = ['exec', 'general', 'vis']
for tag in check_tags:
key = self.TAG_MAPPING[tag][0]
if key not in result:
result[key] = []

return result

def get_output_dir(self):
"""Get output dir from eval task.

Notice: output dir should be in format xxx/data.
All the needed files should be
"""
# hard hack for get output dir from eval task
if hasattr(self, '_out_dir') and self.output_dir is None:
self.output_dir = self._out_dir

def score(self, predictions: List, references: List, steps: List,
origin_prompt: List):
"""Calculate accuracy."""
cwd = os.getcwd()
self.get_output_dir()
if self.output_dir:
if not osp.exists(self.output_dir):
os.makedirs(self.output_dir)
@ -232,56 +355,20 @@ class CIBenchEvaluator(BaseEvaluator):
self.save_results(origin_prompt, steps)
self.unset_data_dir(cwd)

num_cells_list = []
num_general_list = []
passed_list = []
correct_list = []
vis_list = []
total_results = defaultdict(float)
total_scores = defaultdict(float)
total_nums = defaultdict(int)
for gold, single_steps in zip(references, steps):
tags = gold['tags']
outputs = gold['outputs']
num_cells = len(tags)
num_general = sum([tag == 'general' for tag in tags])
result = self.single_exp(gold, single_steps)

passed = sum([self.valid_step(step) for step in single_steps])
correct = 0
vis_sim = []
for tag, step, output in zip(tags, single_steps, outputs):
if tag == 'general':
correct += self.correct_step(step, output)
elif tag == 'vis':
vis_sim.append(self.vis_similarity_step(step, output))
for k, v in result.items():
total_scores[k] += sum(v)
total_nums[k] += len(v)

num_cells_list.append(num_cells)
num_general_list.append(num_general)
passed_list.append(passed)
correct_list.append(correct)
if vis_sim:
vis_list.append(sum(vis_sim) / len(vis_sim))
for k, v in total_scores.items():
if total_nums[k] > 0:
total_results[k] = total_scores[k] / total_nums[k] * 100
else:
vis_list.append(-1)
total_results[k] = -1

if len([v for v in vis_list if v >= 0]) > 0:
visualize_similarity = sum([v for v in vis_list if v >= 0]) / len(
[v for v in vis_list if v >= 0])
else:
# not valid
visualize_similarity = -1

if sum(num_general_list) > 0:
general_accuracy = sum(correct_list) / sum(num_general_list)
else:
# not valid
general_accuracy = -1

result = dict(
executable_rate=sum(passed_list) / sum(num_cells_list) * 100,
general_accuracy=general_accuracy * 100,
visualize_similarity=visualize_similarity * 100,
num_cells_list=num_cells_list,
num_general_list=num_general_list,
passed_list=passed_list,
correct_list=correct_list,
vis_list=vis_list,
)
return result
return total_results
@ -16,6 +16,8 @@ class cmnliDataset(BaseDataset):
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
if line['label'] == '-':
continue
data.append(line)
return Dataset.from_list(data)
@ -143,6 +143,17 @@ def ds1000_postprocess(text: str) -> str:
return text


@TEXT_POSTPROCESSORS.register_module('ds1000_completion')
def ds1000_completion_postprocess(text: str) -> str:
text += '</code>'

match = re.search('(.*?)</code>', text, re.DOTALL)
if match:
text = match.group(1)

return text


@TEXT_POSTPROCESSORS.register_module('ds1000_matplotlib')
def ds1000_matplotlib_postprocess(text: str) -> str:
text = ds1000_postprocess(text)
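For reference, a minimal self-contained sketch of what the new `ds1000_completion` postprocessor does to a raw completion (the sample strings are illustrative, not taken from DS-1000):

# Mirrors ds1000_completion_postprocess above: close the tag, keep the text before it.
import re

def ds1000_completion_demo(text: str) -> str:
    text += '</code>'
    match = re.search('(.*?)</code>', text, re.DOTALL)
    if match:
        text = match.group(1)
    return text

assert ds1000_completion_demo('result = df.mean()</code> extra') == 'result = df.mean()'
assert ds1000_completion_demo('result = df.mean()') == 'result = df.mean()'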
@ -142,6 +142,6 @@ class Gsm8kAgentEvaluator(BaseEvaluator):
reasoning_acc=100 *
(reasoning_scope + final_scope + row_reasoning_scope) / total,
code_acc=100 * (code_scope + final_scope) / total,
action_acc=100 * (action_scope + final_scope) / total,
action_pct=100 * (action_scope + final_scope) / total,
)
return result
@ -25,7 +25,7 @@ class WikiBenchDataset(BaseDataset):
circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']

data = []
with open(path, 'r') as infile:
with open(path, 'r', encoding='utf-8') as infile:
for id, line in enumerate(infile):
entry = json.loads(line)
if 'cloze' in name:
@ -20,14 +20,14 @@ class winograndeDataset(BaseDataset):
for line in f:
line = json.loads(line)
prompt = line['sentence']
dataset_list.append({
'opt1':
prompt.replace('_', line['option1']),
'opt2':
prompt.replace('_', line['option2']),
'answer':
line['answer']
})
continue_prompt = prompt.split('_')
data_item = {
'opt1': prompt.replace('_', line['option1']),
'opt2': prompt.replace('_', line['option2']),
'answer': line['answer'],
'cont': continue_prompt[1]
}
dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list)
return dataset_list

@ -46,13 +46,11 @@ class winograndeDataset_V2(BaseDataset):
prompt = line['sentence']
answer = line['answer']
answer = ' AB'[int(answer)] if answer != '' else 'NULL'
dataset_list.append({
'opt1':
prompt.replace('_', line['option1']),
'opt2':
prompt.replace('_', line['option2']),
'answer':
answer
})
data_item = {
'opt1': prompt.replace('_', line['option1']),
'opt2': prompt.replace('_', line['option2']),
'answer': answer,
}
dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list)
return dataset_list
@ -47,6 +47,10 @@ class IPythonInterpreter(BaseAction):
it is disabled. Defaults to None.
timeout (int): Upper bound of waiting time for Python script execution.
Defaults to 20.
trim_output (int, optional): Max characters restriction of ipython
outputs. If None, do not perform any trim.
TODO: Notice that, this is not token len. Anf trim strategies
might be added later. Defaults to 1024.
user_data_dir (str): Specified the user data directory for files
loading. If set to `ENV`, use `USER_DATA_DIR` environment variable.
Defaults to `ENV`.
@ -60,6 +64,7 @@ class IPythonInterpreter(BaseAction):
enable: bool = True,
disable_description: Optional[str] = None,
timeout: int = 20,
trim_output: Optional[int] = 1024,
user_data_dir: str = 'ENV') -> None:
super().__init__(description, name, enable, disable_description)

@ -68,10 +73,11 @@ class IPythonInterpreter(BaseAction):
user_data_dir = os.environ.get('USER_DATA_DIR', '')

if user_data_dir:
user_data_dir = os.path.dirname(user_data_dir)
# user_data_dir = os.path.dirname(user_data_dir)
user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
self.user_data_dir = user_data_dir
self._initialized = False
self.trim_output = trim_output
if not os.path.exists(WORK_DIR):
os.mkdir(WORK_DIR)

@ -178,6 +184,12 @@ class IPythonInterpreter(BaseAction):
if image:
result += f'\n\n{image}'
if finished:
# in case output text too long
# might need better design later
if self.trim_output and len(result) > self.trim_output:
ellip = '......'
half_len = int((self.trim_output - len(ellip)) / 2)
result = result[:half_len] + ellip + result[-half_len:]
return succeed, result

try:
@ -204,13 +216,20 @@ class IPythonInterpreter(BaseAction):
command: str,
timeout: Optional[int] = None) -> ActionReturn:
tool_return = ActionReturn(url=None, args=None, type=self.name)
tool_return.args = dict(text=command)
succeed, result = self._call(command, timeout)
if succeed:
tool_return.result = dict(text=result)
tool_return.state = ActionStatusCode.SUCCESS
extracted_command = extract_code(command)
tool_return.args = dict(text=command, extract_code=extracted_command)
if extracted_command:
succeed, result = self._call(extracted_command, timeout)
if succeed:
if not result:
result = 'The code is succeed without any outputs.'
tool_return.result = dict(text=result)
tool_return.state = ActionStatusCode.SUCCESS
else:
tool_return.errmsg = repr(result)
tool_return.state = ActionStatusCode.API_ERROR
else:
tool_return.errmsg = repr(result)
tool_return.errmsg = 'The input code is empty. Please follow the format.'  # noqa
tool_return.state = ActionStatusCode.API_ERROR
return tool_return
@ -115,6 +115,20 @@ class BaseModel:
inputs = self.parse_template(templates, mode='ppl')
return self.get_ppl(inputs, mask_length)

def get_loglikelihood_from_template(self,
templates: List[PromptType],
conts: List[str],
mask_length=None):
"""Get perplexity given a list of templates.

Args:
templates (List[PromptType]): A list of templates.
mask_length (List[int]): A list of mask lengths. If provided, the
perplexity will be calculated only on the unmasked tokens.
"""
inputs = self.parse_template(templates, mode='ppl')
return self.get_loglikelihood(inputs, conts, mask_length)

def generate_from_template(self, templates: List[PromptType],
max_out_len: int, **kwargs):
"""Generate completion from a list of templates.
@ -1,9 +1,11 @@
import re
import sys
import threading
import time
import warnings
from abc import abstractmethod
from copy import deepcopy
from queue import Queue
from time import sleep
from typing import Dict, List, Optional, Tuple, Union

@ -37,6 +39,7 @@ class BaseAPIModel(BaseModel):
def __init__(self,
path: str,
query_per_second: int = 1,
rpm_verbose: bool = False,
retry: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
@ -46,7 +49,7 @@ class BaseAPIModel(BaseModel):
self.meta_template = meta_template
self.retry = retry
self.query_per_second = query_per_second
self.token_bucket = TokenBucket(query_per_second)
self.token_bucket = TokenBucket(query_per_second, rpm_verbose)
self.template_parser = APITemplateParser(meta_template)
self.logger = get_logger()
self.generation_kwargs = generation_kwargs
@ -422,10 +425,13 @@ class TokenBucket:
query_per_second (float): The rate of the token bucket.
"""

def __init__(self, rate):
def __init__(self, rate, verbose=False):
self._rate = rate
self._tokens = threading.Semaphore(0)
self.started = False
self._request_queue = Queue()
self.logger = get_logger()
self.verbose = verbose

def _add_tokens(self):
"""Add tokens to the bucket."""
@ -440,3 +446,12 @@ class TokenBucket:
self.started = True
threading.Thread(target=self._add_tokens, daemon=True).start()
self._tokens.acquire()
if self.verbose:
cur_time = time.time()
while not self._request_queue.empty():
if cur_time - self._request_queue.queue[0] > 60:
self._request_queue.get()
else:
break
self._request_queue.put(cur_time)
self.logger.info(f'Current RPM {self._request_queue.qsize()}.')
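For reference, a minimal standalone sketch of the sliding-window RPM bookkeeping that the verbose branch of `TokenBucket.get_token` performs; the `RpmCounter` name is illustrative, not part of this commit:

import time
from queue import Queue

class RpmCounter:
    # Keep only the timestamps from the last 60 seconds, as in the verbose branch above.
    def __init__(self):
        self._request_queue = Queue()

    def record(self) -> int:
        cur_time = time.time()
        # Drop requests older than one minute, then log the current one.
        while not self._request_queue.empty():
            if cur_time - self._request_queue.queue[0] > 60:
                self._request_queue.get()
            else:
                break
        self._request_queue.put(cur_time)
        return self._request_queue.qsize()  # requests seen in the last minute

counter = RpmCounter()
print(f'Current RPM {counter.record()}.')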
@ -3,6 +3,7 @@ from typing import Dict, List, Optional, Union

import numpy as np
import torch
import transformers

from opencompass.models.base import BaseModel
from opencompass.models.base_api import APITemplateParser
@ -13,6 +14,33 @@ from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]


class MultiTokenEOSCriteria(transformers.StoppingCriteria):
"""Criteria to stop on the specified multi-token sequence."""

def __init__(
self,
sequence: str,
tokenizer: transformers.PreTrainedTokenizer,
batch_size: int,
):
self.done_tracker = [False] * batch_size
self.sequence = sequence
self.sequence_ids = tokenizer.encode(sequence,
add_special_tokens=False)
self.sequence_id_len = len(self.sequence_ids)
self.tokenizer = tokenizer

def __call__(self, input_ids, scores, **kwargs) -> bool:
# compare the last len(stop) tokens
lookback_ids_batch = input_ids[:, -self.sequence_id_len:]
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
for i, done in enumerate(self.done_tracker):
if done:
continue
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
return False not in self.done_tracker


@MODELS.register_module()
class HuggingFace(BaseModel):
"""Model wrapper around HuggingFace models.
@ -194,7 +222,10 @@ class HuggingFace(BaseModel):
self.model.config.eos_token_id = 2
self.model.config.pad_token_id = self.tokenizer.pad_token_id

def generate(self, inputs: List[str], max_out_len: int,
def generate(self,
inputs: List[str],
max_out_len: int,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
"""Generate results given a list of inputs.

@ -212,9 +243,12 @@ class HuggingFace(BaseModel):
max_out_len=max_out_len,
**generation_kwargs)
else:
return sum((self._single_generate(
inputs=[input_], max_out_len=max_out_len, **generation_kwargs)
for input_ in inputs), [])
return sum(
(self._single_generate(inputs=[input_],
max_out_len=max_out_len,
stopping_criteria=stopping_criteria,
**generation_kwargs)
for input_ in inputs), [])

def _batch_generate(self, inputs: List[str], max_out_len: int,
**kwargs) -> List[str]:
@ -275,7 +309,10 @@ class HuggingFace(BaseModel):
decodeds = [token.split(self.end_str)[0] for token in decodeds]
return decodeds

def _single_generate(self, inputs: List[str], max_out_len: int,
def _single_generate(self,
inputs: List[str],
max_out_len: int,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
"""Support for single prompt inference.

@ -319,6 +356,19 @@ class HuggingFace(BaseModel):
max_length=self.max_seq_len -
max_out_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device)

if stopping_criteria:
# Construct huggingface stopping criteria
stopping_criteria = stopping_criteria + [self.tokenizer.eos_token]
stopping_criteria = transformers.StoppingCriteriaList([
*[
MultiTokenEOSCriteria(sequence, self.tokenizer,
input_ids.shape[0])
for sequence in stopping_criteria
],
])
kwargs['stopping_criteria'] = stopping_criteria

# To accommodate the PeftModel, parameters should be passed in
# key-value format for generate.
outputs = self.model.generate(input_ids=input_ids,
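A hedged sketch of how the stop-word plumbing above can be driven end to end; `gpt2` and the stop strings are illustrative, and `MultiTokenEOSCriteria` is the class defined in this diff (in OpenCompass the wiring happens inside `_single_generate`):

import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')  # illustrative model
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')

stop_words = ['Observation:', tokenizer.eos_token]  # stop strings plus EOS
input_ids = tokenizer('Thought:', return_tensors='pt')['input_ids']
criteria = transformers.StoppingCriteriaList([
    MultiTokenEOSCriteria(seq, tokenizer, input_ids.shape[0])
    for seq in stop_words
])
outputs = model.generate(input_ids=input_ids, max_new_tokens=32,
                         stopping_criteria=criteria)
print(tokenizer.decode(outputs[0]))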
@ -434,6 +484,71 @@ class HuggingFace(BaseModel):
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
return ce_loss

def get_loglikelihood(
self,
inputs: List[str],
conts: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get loglikelihood scores given a list of inputs.

Args:
inputs (List[str]): A list of strings.
conts (List[str]): A list of strings: slices after the space.
NOT SUPPORT mask_length YET!
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if advanced features in PPLInfernecer is
not needed.

Returns:
List[float]: A list of loglikelihood scores.
"""
assert mask_length is None, 'Not support mask_length yet.'
if self.batch_padding and len(inputs) > 1:
raise NotImplementedError('Batch padding is not supported yet.')
# assert self.tokenizer.pad_token
# return self._get_loglikelihood(inputs, mask_length=mask_length)
return np.array([
self._get_loglikelihood(inputs=inputs[idx], conts=conts[idx])
for idx in range(len(inputs))
])

def _get_loglikelihood(self, inputs: str, conts: str) -> float:
"""Get loglikelihood scores given input string and continuation string.

Args:
inputs (str): string.
conts (str): strings: slices after the space.
Returns:
float: loglikelihood scores.
"""

input_ids = self.tokenizer(inputs,
padding=False,
truncation=True,
max_length=self.max_seq_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device)
context_ids = self.tokenizer(inputs.replace(conts, ''),
padding=False,
truncation=True,
max_length=self.max_seq_len)['input_ids']
cont_ids = input_ids[len(context_ids):]

output = self.model(input_ids.unsqueeze(0))
logits = output['logits'][:, :-1]
logits = torch.nn.functional.log_softmax(logits, dim=-1)
contlen = cont_ids.shape[0]
logits = logits[:, -contlen:, :]
# Reducing the dimension will lead to a wrong outcome
logits_gather = torch.gather(
logits, 2,
cont_ids.unsqueeze(0).unsqueeze(-1))  # [1, seq]

# Answer: sum the likelihood of each token in continuation
answer = float(logits_gather.detach().cpu().sum())
return answer

def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized strings.
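For reference, a small self-contained sketch of the `gather`-based continuation scoring used in `_get_loglikelihood` above (random logits stand in for a real model):

import torch

# Toy shapes: batch of 1, 5 positions, vocabulary of 10 tokens.
log_probs = torch.log_softmax(torch.randn(1, 5, 10), dim=-1)
cont_ids = torch.tensor([3, 7])  # token ids of the continuation
cont_log_probs = log_probs[:, -len(cont_ids):, :]
score = torch.gather(cont_log_probs, 2,
                     cont_ids.unsqueeze(0).unsqueeze(-1)).sum()
print(float(score))  # summed loglikelihood of the continuation tokens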
@ -554,8 +669,8 @@ class HuggingFaceChatGLM3(HuggingFace):
'role': {
'HUMAN': 'user',
'BOT': 'assistant',
'SYSTEM': 'system'
}[item['role']]
'SYSTEM': 'system',
}[item['role'].upper()]
}
history.append(msg)
user_content = history[-1]['content']
@ -578,6 +693,9 @@ class HuggingFaceChatGLM3(HuggingFace):
response, history = self.model.chat(self.tokenizer,
user_content,
history=history)
# response will be dict sometime
if isinstance(response, dict):
response = response.get('content', '')
responses.append(response)
except Exception:
responses.append('')
@ -52,7 +52,7 @@ class LagentAgent:

def chat(self,
user_input: str,
history: List[dict] = None) -> Tuple[str, List[dict]]:
history: List[dict] = None) -> Tuple[str, List[dict], List[dict]]:
"""Chat with agent."""
if history:
self.agent._session_history = history
@ -60,6 +60,7 @@ class LagentAgent:
from lagent.schema import ActionReturn, AgentReturn
generation: AgentReturn = self.agent.chat(user_input)

inner_steps = generation.inner_steps
answer = generation.response
steps = []

@ -76,7 +77,7 @@ class LagentAgent:
valid=int(step.valid),
))

return answer, steps
return answer, steps, inner_steps


FORCE_STOP_PROMPT_EN = (
@ -179,12 +179,14 @@ class Llama2Chat(BaseModel):
dialog = []
for item in input:
msg = {'content': item['prompt']}
if item['role'] == 'HUMAN':
if item['role'].upper() == 'HUMAN':
msg['role'] = 'user'
elif item['role'] == 'BOT':
elif item['role'].upper() == 'BOT':
msg['role'] = 'assistant'
elif item['role'] == 'SYSTEM':
elif item['role'].upper() == 'SYSTEM':
msg['role'] = 'system'
else:
raise ValueError(f'Unknown role: {item["role"]}')
dialog.append(msg)
dialogs.append(dialog)
@ -58,6 +58,7 @@ class OpenAI(BaseAPIModel):
path: str = 'gpt-3.5-turbo',
max_seq_len: int = 4096,
query_per_second: int = 1,
rpm_verbose: bool = False,
retry: int = 2,
key: Union[str, List[str]] = 'ENV',
org: Optional[Union[str, List[str]]] = None,
@ -70,6 +71,7 @@ class OpenAI(BaseAPIModel):
max_seq_len=max_seq_len,
meta_template=meta_template,
query_per_second=query_per_second,
rpm_verbose=rpm_verbose,
retry=retry)
import tiktoken
self.tiktoken = tiktoken
@ -5,5 +5,6 @@ from .icl_circular_evaluator import CircularEvaluator  # noqa
from .icl_em_evaluator import EMEvaluator  # noqa
from .icl_hf_evaluator import *  # noqa
from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator  # noqa
from .icl_misc_evaluator import AveragePPLEvaluator  # noqa
from .icl_toxic_evaluator import ToxicEvaluator  # noqa
from .lm_evaluator import LMEvaluator  # noqa

11
opencompass/openicl/icl_evaluator/icl_misc_evaluator.py
Normal file
@ -0,0 +1,11 @@
from opencompass.registry import ICL_EVALUATORS

from .icl_base_evaluator import BaseEvaluator


@ICL_EVALUATORS.register_module()
class AveragePPLEvaluator(BaseEvaluator):

def score(self, ppl):
average_ppl = sum(ppl) / len(ppl)
return {'average_ppl': average_ppl}
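A hedged sketch of how this evaluator is typically paired with `PPLOnlyInferencer` in a dataset config; the `example_*` names are illustrative placeholders, not part of this commit:

from opencompass.openicl.icl_evaluator import AveragePPLEvaluator
from opencompass.openicl.icl_inferencer import PPLOnlyInferencer

# PPLOnlyInferencer supports output_column=None only; the evaluator then
# receives the raw list of per-sample PPLs and reports their mean.
example_infer_cfg = dict(inferencer=dict(type=PPLOnlyInferencer))
example_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))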
@ -4,6 +4,8 @@ from .icl_base_inferencer import BaseInferencer  # noqa
from .icl_chat_inferencer import ChatInferencer  # noqa
from .icl_clp_inferencer import CLPInferencer  # noqa
from .icl_gen_inferencer import GenInferencer  # noqa
from .icl_loglikelihood_inferencer import LoglikelihoodInferencer  # noqa
from .icl_ppl_inferencer import PPLInferencer  # noqa
from .icl_ppl_only_inferencer import PPLOnlyInferencer  # noqa
from .icl_sc_inferencer import SCInferencer  # noqa
from .icl_tot_inferencer import ToTInferencer  # noqa
@ -89,7 +89,7 @@ class AgentInferencer(ChatInferencer):

user_idx = assistant_indices[-1] - 1
self.model.set_history(chat[:user_idx])
answer, steps = self.model.chat(chat[user_idx]['content'])
answer, steps, _ = self.model.chat(chat[user_idx]['content'])
output_handler.save_results(
origin_prompt=chat[user_idx]['content'],
prediction=answer,
@ -104,10 +104,11 @@ class AgentInferencer(ChatInferencer):
i for i, item in enumerate(chat) if item['role'] == 'assistant'
]

self.model.set_history(chat[:assistant_indices[0] - 1])

history = chat[:assistant_indices[0] - 1]
for i in assistant_indices:
answer, steps = self.model.chat(chat[i - 1]['content'])
answer, steps, inner_steps = self.model.chat(
chat[i - 1]['content'], history)
history += inner_steps
output_handler.save_multiround_results(
origin_prompt=chat[i - 1]['content'],
prediction=answer,
@ -125,7 +126,7 @@ class AgentInferencer(ChatInferencer):

for i in assistant_indices:
self.model.set_history(chat[:i - 1])
answer, steps = self.model.chat(chat[i - 1]['content'])
answer, steps, _ = self.model.chat(chat[i - 1]['content'])
output_handler.save_multiround_results(
origin_prompt=chat[i - 1]['content'],
prediction=answer,
@ -68,11 +68,11 @@ class LMTemplateParser:
prompt = ''
if self.roles:
for dialog in chat:
role_cfg = self.roles.get(dialog['role'])
prompt += role_cfg['begin']
role_cfg = self.roles.get(dialog['role'], {})
prompt += (role_cfg.get('begin') or '')
prompt += (dialog.get('content') or '')
prompt += role_cfg['end']
prompt += self.roles['assistant']['begin']
prompt += (role_cfg.get('end') or '')
prompt += (self.roles['assistant'].get('begin') or '')
else:
# in case the model does not have any meta template
last_sep = ''
@ -227,9 +227,13 @@ class ChatInferencer(BaseInferencer):
'tmp_' + output_json_filename)
if osp.exists(tmp_json_filepath):
# TODO: move resume to output handler
tmp_result_dict = mmengine.load(tmp_json_filepath)
output_handler.results_dict = tmp_result_dict
index = len(tmp_result_dict)
try:
tmp_result_dict = mmengine.load(tmp_json_filepath)
except Exception:
pass
else:
output_handler.results_dict = tmp_result_dict
index = len(tmp_result_dict)

# 4. Wrap prompts with Dataloader
dataloader = self.get_dataloader(chat_list[index:], batch_size=1)
@ -1,5 +1,6 @@
"""Direct Generation Inferencer."""

import inspect
import os
import os.path as osp
from typing import List, Optional
@ -46,6 +47,7 @@ class GenInferencer(BaseInferencer):
self,
model: BaseModel,
max_out_len: int,
stopping_criteria: List[str] = [],
max_seq_len: Optional[int] = None,
batch_size: Optional[int] = 1,
gen_field_replace_token: Optional[str] = '',
@ -64,6 +66,7 @@ class GenInferencer(BaseInferencer):

self.gen_field_replace_token = gen_field_replace_token
self.max_out_len = max_out_len
self.stopping_criteria = stopping_criteria

if self.model.is_api and save_every is None:
save_every = 1
@ -128,10 +131,14 @@ class GenInferencer(BaseInferencer):
entry = datum
golds = [None for _ in range(len(entry))]
# 5-1. Inference with local model
extra_gen_kwargs = {}
sig = inspect.signature(self.model.generate)
if 'stopping_criteria' in sig.parameters:
extra_gen_kwargs['stopping_criteria'] = self.stopping_criteria
with torch.no_grad():
parsed_entries = self.model.parse_template(entry, mode='gen')
results = self.model.generate_from_template(
entry, max_out_len=self.max_out_len)
entry, max_out_len=self.max_out_len, **extra_gen_kwargs)
generated = results

num_return_sequences = getattr(self.model, 'generation_kwargs',
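For reference, a minimal standalone sketch of the `inspect.signature` capability check used above, which forwards `stopping_criteria` only to models whose `generate` accepts it (the function names are illustrative):

import inspect

def generate_old(inputs, max_out_len):  # model without stop-word support
    return ['...']

def generate_new(inputs, max_out_len, stopping_criteria=[]):  # model with support
    return ['...']

def call_generate(generate_fn, inputs, stop_words):
    extra = {}
    if 'stopping_criteria' in inspect.signature(generate_fn).parameters:
        extra['stopping_criteria'] = stop_words  # only forwarded when supported
    return generate_fn(inputs, max_out_len=16, **extra)

call_generate(generate_old, ['hi'], ['Observation:'])
call_generate(generate_new, ['hi'], ['Observation:'])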
@ -0,0 +1,215 @@
"""PPL Inferencer."""

import os
from typing import List, Optional

import torch
from tqdm import trange

from opencompass.models.base import BaseModel
from opencompass.registry import ICL_INFERENCERS

from ..icl_prompt_template import PromptTemplate
from ..icl_retriever import BaseRetriever
from ..utils import get_logger
from .icl_base_inferencer import BaseInferencer, dump_results_dict

logger = get_logger(__name__)


@ICL_INFERENCERS.register_module()
class LoglikelihoodInferencer(BaseInferencer):
"""Loglikelihood Inferencer class to evaluate by loglikelihood.

Attributes:
model (:obj:`BaseModel`, optional): The module to inference.
max_seq_len (:obj:`int`): Maximum number of tokenized words allowed by
the LM.
batch_size (:obj:`int`, optional): Batch size for the :obj:`DataLoader`
output_json_filepath (:obj:`str`, optional): File path for output
`JSON` file.
output_json_filename (:obj:`str`, optional): File name for output
`JSON` file.
labels (:obj:`List`, optional): A list of labels for all classes.
"""

def __init__(
self,
model: BaseModel,
max_seq_len: Optional[int] = None,
batch_size: Optional[int] = 1,
output_json_filepath: Optional[str] = './icl_inference_output',
output_json_filename: Optional[str] = 'predictions',
labels: Optional[List] = None,
**kwargs) -> None:
super().__init__(
model=model,
max_seq_len=max_seq_len,
batch_size=batch_size,
output_json_filename=output_json_filename,
output_json_filepath=output_json_filepath,
**kwargs,
)

self.labels = labels

def inference(self,
retriever: BaseRetriever,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None,
output_json_filepath: Optional[str] = None,
output_json_filename: Optional[str] = None) -> List:
# 1. Preparation for output logs
output_handler = LoglikelihoodInferencerOutputHandler()

sub_predictions = []
ppl = []
ice = []

if output_json_filepath is None:
output_json_filepath = self.output_json_filepath
if output_json_filename is None:
output_json_filename = self.output_json_filename

# 2. Get results of retrieval process
ice_idx_list = retriever.retrieve()

# 3. Get labels of all the classes
if self.labels is None:
labels = retriever.get_labels(ice_template=ice_template,
prompt_template=prompt_template)
else:
labels = self.labels

# 4. Generate in-context examples for testing inputs
for idx in range(len(ice_idx_list)):
ice.append(
retriever.generate_ice(ice_idx_list[idx],
ice_template=ice_template))
output_handler.save_ice(self.model.parse_template(ice, mode='ppl'))

# 5. Calculating loglikelihood for prompts in each label's class
for label in labels:
index = 0
prompt_list = []
sub_ppl_list = []
token_num_list = []
cont_list = []

# 5.1 Generate prompts of current label and truncate
# TODO: Refactor
for idx in range(len(ice_idx_list)):
prompt = retriever.generate_label_prompt(
idx,
ice[idx],
label,
ice_template=ice_template,
prompt_template=prompt_template)
if self.max_seq_len is not None:
prompt_token_num = self.model.get_token_len_from_template(
prompt, mode='ppl')
while len(ice_idx_list[idx]
) > 0 and prompt_token_num > self.max_seq_len:
ice_idx_list[idx] = ice_idx_list[idx][:-1]
ice[idx] = retriever.generate_ice(
ice_idx_list[idx], ice_template=ice_template)
prompt = retriever.generate_label_prompt(
idx,
ice[idx],
label,
ice_template=ice_template,
prompt_template=prompt_template)
prompt_token_num = self.model.get_token_len_from_template(  # noqa
prompt, mode='ppl')  # noqa

prompt_list.append(prompt)
token_num_list.append(prompt_token_num)
cont_list.append(retriever.test_ds[idx]['cont'])

# 5.2 Get PPL
logger.info(f"Calculating PPL for prompts labeled '{label}'")
for idx in trange(0,
len(prompt_list),
self.batch_size,
disable=not self.is_main_process):
sub_prompt_list = prompt_list[idx:idx + self.batch_size]
sub_cont_list = cont_list[idx:idx + self.batch_size]

with torch.no_grad():
# mainly modify compared to PPLInferencer
sub_res = self.model.get_loglikelihood_from_template(
sub_prompt_list, sub_cont_list).tolist()
for res, prompt in zip(
sub_res,
self.model.parse_template(sub_prompt_list,
mode='ppl')):
sub_ppl_list.append(res)
ice_str = self.model.parse_template(ice[idx], mode='ppl')
output_handler.save_prompt_and_loglikelihood(
label, prompt.replace(ice_str, ''), prompt, res, index)
index = index + 1
ppl.append(sub_ppl_list)

# 6. Get lowest PPL class as predictions
ppl = list(zip(*ppl))
for single_ppl in ppl:
sub_predictions.append(labels[single_ppl.index(max(single_ppl))])
output_handler.save_predictions(sub_predictions)

# 7. Fetch gold answers if exist
ds_reader = retriever.dataset_reader
if ds_reader.output_column:
golds = ds_reader.dataset['test'][ds_reader.output_column]
output_handler.save_golds(golds)

# 8. Output
if self.is_main_process:
os.makedirs(output_json_filepath, exist_ok=True)
output_handler.write_to_json(output_json_filepath,
output_json_filename)

return [
sample['prediction']
for sample in output_handler.results_dict.values()
]


class LoglikelihoodInferencerOutputHandler:
results_dict = {}

def __init__(self) -> None:
self.results_dict = {}

def write_to_json(self, save_dir: str, filename: str):
"""Dump the result to a json file."""
dump_results_dict(self.results_dict, os.path.join(save_dir, filename))

def save_ice(self, ice):
for idx, example in enumerate(ice):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['in-context examples'] = example

def save_predictions(self, predictions):
for idx, prediction in enumerate(predictions):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['prediction'] = prediction

def save_prompt_and_loglikelihood(self, label, input, prompt,
loglikelihood, idx):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
if 'label: ' + str(label) not in self.results_dict[str(idx)].keys():
self.results_dict[str(idx)]['label: ' + str(label)] = {}
self.results_dict[str(idx)]['label: ' +
str(label)]['testing input'] = input
self.results_dict[str(idx)]['label: ' + str(label)]['prompt'] = prompt
self.results_dict[str(idx)][
'label: ' + str(label)]['Loglikelihood'] = loglikelihood

def save_golds(self, golds):
for idx, gold in enumerate(golds):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['gold'] = gold
188
opencompass/openicl/icl_inferencer/icl_ppl_only_inferencer.py
Normal file
@ -0,0 +1,188 @@
"""PPL Inferencer."""

import os
from typing import List, Optional

import mmengine
import torch
from tqdm import tqdm

from opencompass.models.base import BaseModel
from opencompass.registry import ICL_INFERENCERS

from ..icl_prompt_template import PromptTemplate
from ..icl_retriever import BaseRetriever
from ..utils import get_logger
from .icl_base_inferencer import BaseInferencer, dump_results_dict

logger = get_logger(__name__)


@ICL_INFERENCERS.register_module()
class PPLOnlyInferencer(BaseInferencer):
"""PPLOnlyInferencer class to calculate PPL and PPL only, no choice is
made. This Inferencer is usually used along with AveragePPLEvaluator.

Attributes:
model (:obj:`BaseModel`, optional): The module to inference.
max_seq_len (:obj:`int`): Maximum number of tokenized words allowed by
the LM.
batch_size (:obj:`int`, optional): Batch size for the :obj:`DataLoader`
output_json_filepath (:obj:`str`, optional): File path for output
`JSON` file.
output_json_filename (:obj:`str`, optional): File name for output
`JSON` file.
save_every (:obj:`int`, optional): Save intermediate results every
"""

def __init__(
self,
model: BaseModel,
max_seq_len: Optional[int] = None,
batch_size: Optional[int] = 1,
output_json_filepath: Optional[str] = './icl_inference_output',
output_json_filename: Optional[str] = 'predictions',
save_every: Optional[int] = 1,
**kwargs) -> None:
super().__init__(
model=model,
max_seq_len=max_seq_len,
batch_size=batch_size,
output_json_filename=output_json_filename,
output_json_filepath=output_json_filepath,
**kwargs,
)

self.save_every = save_every

def inference(self,
retriever: BaseRetriever,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None,
output_json_filepath: Optional[str] = None,
output_json_filename: Optional[str] = None) -> List:
# 1. Preparation for output logs
output_handler = PPLOnlyInferencerOutputHandler()

if output_json_filepath is None:
output_json_filepath = self.output_json_filepath
if output_json_filename is None:
output_json_filename = self.output_json_filename

# 2. Get results of retrieval process
ice_idx_list = retriever.retrieve()

# 3. Generate prompts for testing input
prompt_list = self.get_generation_prompt_list_from_retriever_indices(
ice_idx_list,
retriever,
max_seq_len=self.max_seq_len,
ice_template=ice_template,
prompt_template=prompt_template)

# 3.1 Fetch and zip prompt & gold answer if output column exists
ds_reader = retriever.dataset_reader

assert ds_reader.output_column is None, (
'PPLOnlyInferencer supports `output_column=None` only.')

# Create tmp json file for saving intermediate results and future
# resuming
index = 0
tmp_json_filepath = os.path.join(output_json_filepath,
'tmp_' + output_json_filename)
if os.path.exists(tmp_json_filepath):
# TODO: move resume to output handler
try:
tmp_result_dict = mmengine.load(tmp_json_filepath)
except Exception:
pass
else:
output_handler.results_dict = tmp_result_dict
index = len(tmp_result_dict)

# 4. Wrap prompts with Dataloader
dataloader = self.get_dataloader(prompt_list[index:], self.batch_size)

# 5. Inference for prompts in each batch
logger.info('Starting inference process...')
for datum in tqdm(dataloader, disable=not self.is_main_process):
entry = datum
# 5-1. Inference with local model
with torch.no_grad():
ppls = self.model.get_ppl_from_template(entry).tolist()

parsed_entries = self.model.parse_template(entry, mode='gen')
# 5-3. Save current output
for prompt, ppl, in zip(parsed_entries, ppls):
output_handler.save_results(prompt, ppl, index)
index = index + 1

# 5-4. Save intermediate results
if (self.save_every is not None and index % self.save_every == 0
and self.is_main_process):
output_handler.write_to_json(output_json_filepath,
'tmp_' + output_json_filename)

# 6. Output
if self.is_main_process:
os.makedirs(output_json_filepath, exist_ok=True)
output_handler.write_to_json(output_json_filepath,
output_json_filename)
if os.path.exists(tmp_json_filepath):
os.remove(tmp_json_filepath)

return [
sample['ppl'] for sample in output_handler.results_dict.values()
]

def get_generation_prompt_list_from_retriever_indices(
self,
ice_idx_list: List[List[int]],
retriever: BaseRetriever,
max_seq_len: Optional[int] = None,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None):
prompt_list = []
for idx, ice_idx in enumerate(ice_idx_list):
ice = retriever.generate_ice(ice_idx, ice_template=ice_template)
prompt = retriever.generate_prompt_for_generate_task(
idx,
ice,
ice_template=ice_template,
prompt_template=prompt_template)
if max_seq_len is not None:
prompt_token_num = self.model.get_token_len_from_template(
prompt, mode='gen')
while len(ice_idx) > 0 and prompt_token_num > max_seq_len:
ice_idx = ice_idx[:-1]
ice = retriever.generate_ice(ice_idx,
ice_template=ice_template)
prompt = retriever.generate_prompt_for_generate_task(
idx,
ice,
ice_template=ice_template,
prompt_template=prompt_template)
prompt_token_num = self.model.get_token_len_from_template(
prompt, mode='gen')
prompt_list.append(prompt)
return prompt_list


class PPLOnlyInferencerOutputHandler:
origin_prompt_dict = {}
output_dict = {}
results_dict = {}

def __init__(self) -> None:
self.results_dict = {}

def write_to_json(self, save_dir: str, filename: str):
"""Dump the result to a json file."""
dump_results_dict(self.results_dict, os.path.join(save_dir, filename))

def save_results(self, origin_prompt, ppl, idx):
self.results_dict[str(idx)] = {
'origin_prompt': origin_prompt,
'ppl': ppl,
}
@ -1,10 +1,12 @@
import inspect
from abc import abstractmethod
from copy import deepcopy
from typing import Dict, List, Optional

from mmengine.config import ConfigDict

from opencompass.utils import get_logger, task_abbr_from_cfg
from opencompass.utils import (dataset_abbr_from_cfg, get_logger,
model_abbr_from_cfg, task_abbr_from_cfg)


class BasePartitioner:
@ -54,8 +56,7 @@ class BasePartitioner:
List[Dict]: A list of tasks.
"""
cfg = deepcopy(cfg)
models = cfg['models']
datasets = cfg['datasets']

work_dir = cfg['work_dir']

add_cfg = {}
@ -74,10 +75,11 @@ class BasePartitioner:
self.logger.debug(f'Key {k} not found in config, ignored.')
self.logger.debug(f'Additional config: {add_cfg}')

tasks = self.partition(models,
datasets,
work_dir,
self.out_dir,
model_and_dataset_args = self.parse_model_dataset_args(cfg)

tasks = self.partition(**model_and_dataset_args,
work_dir=work_dir,
out_dir=self.out_dir,
add_cfg=add_cfg)

self.logger.info(f'Partitioned into {len(tasks)} tasks.')
@ -86,6 +88,41 @@ class BasePartitioner:

return tasks

def parse_model_dataset_args(self, cfg: ConfigDict):
models = cfg['models']
datasets = cfg['datasets']

sig = inspect.signature(self.partition)
if 'model_dataset_combinations' in sig.parameters:
combs = cfg.get('model_dataset_combinations', None)
if combs is None:
combs = [{'models': models, 'datasets': datasets}]
else:
# sanity check
model_abbrs = [model_abbr_from_cfg(model) for model in models]
dataset_abbrs = [
dataset_abbr_from_cfg(dataset) for dataset in datasets
]
for comb in combs:
for model in comb['models']:
if model_abbr_from_cfg(model) not in model_abbrs:
raise ValueError(
f'Model {model_abbr_from_cfg(model)} '
'not found in config.')
for dataset in comb['datasets']:
if dataset_abbr_from_cfg(dataset) not in dataset_abbrs:
raise ValueError(
f'Dataset {dataset_abbr_from_cfg(dataset)} '
'not found in config.')
used_kwargs = {'model_dataset_combinations': combs}
else:
if cfg.get('model_dataset_combinations', None) is not None:
self.logger.warning(
'model_dataset_combinations is not supported by '
f'{self.__class__.__name__}. Ignored.')
used_kwargs = {'models': models, 'datasets': datasets}
return used_kwargs

@abstractmethod
def partition(self,
models: List[ConfigDict],
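A hedged sketch of the `model_dataset_combinations` entry that `parse_model_dataset_args` consumes; the dummy configs below are illustrative stand-ins for model and dataset configs already defined in a real config file:

# Restrict which model runs on which datasets; every entry must reuse configs
# that also appear in `models` / `datasets`, otherwise a ValueError is raised.
model_a = dict(abbr='model_a')
model_b = dict(abbr='model_b')
gsm8k_datasets = [dict(abbr='gsm8k')]
math_datasets = [dict(abbr='math')]

model_dataset_combinations = [
    dict(models=[model_a], datasets=[*gsm8k_datasets]),
    dict(models=[model_b], datasets=[*gsm8k_datasets, *math_datasets]),
]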
@ -29,8 +29,8 @@ class NaivePartitioner(BasePartitioner):
self.n = n

def partition(self,
models: List[ConfigDict],
datasets: List[ConfigDict],
model_dataset_combinations: List[Dict[str,
List[ConfigDict]]],
work_dir: str,
out_dir: str,
add_cfg: Dict = {}) -> List[Dict]:
@ -48,8 +48,9 @@ class NaivePartitioner(BasePartitioner):
}

Args:
models (List[ConfigDict]): A list of model configs.
datasets (List[ConfigDict]): A list of dataset configs.
model_dataset_combinations (List[Dict]): List of
`{models: [...], datasets: [...]}` dicts. Each dict contains
a list of model configs and a list of dataset configs.
work_dir (str): The work dir for the task.
out_dir (str): The full output path for the task, intended for
Partitioners to check whether the task is finished via the
@ -60,20 +61,21 @@ class NaivePartitioner(BasePartitioner):
"""

tasks = []
for model in models:
chunks = []
for dataset in datasets:
filename = get_infer_output_path(model, dataset, out_dir)
if osp.exists(filename):
continue
chunks.append(dataset)
for comb in model_dataset_combinations:
for model in comb['models']:
chunks = []
for dataset in comb['datasets']:
filename = get_infer_output_path(model, dataset, out_dir)
if osp.exists(filename):
continue
chunks.append(dataset)

for i in range(0, len(chunks), self.n):
task = Config({
'models': [model],
'datasets': [chunks[i:i + self.n]],
'work_dir': work_dir,
**add_cfg
})
tasks.append(task)
for i in range(0, len(chunks), self.n):
task = Config({
'models': [model],
'datasets': [chunks[i:i + self.n]],
'work_dir': work_dir,
**add_cfg
})
tasks.append(task)
return tasks
@ -51,8 +51,8 @@ class SizePartitioner(BasePartitioner):
self.strategy = strategy

def partition(self,
models: List[ConfigDict],
datasets: List[ConfigDict],
model_dataset_combinations: List[Dict[str,
List[ConfigDict]]],
work_dir: str,
out_dir: str,
add_cfg: Dict = {}) -> List[ConfigDict]:
@ -71,8 +71,9 @@ class SizePartitioner(BasePartitioner):
}

Args:
models (List[ConfigDict]): A list of model configs.
datasets (List[ConfigDict]): A list of dataset configs.
model_dataset_combinations (List[Dict]): List of
`{models: [...], datasets: [...]}` dicts. Each dict contains
a list of model configs and a list of dataset configs.
work_dir (str): The work dir for the task.
out_dir (str): The full output path for the task, intended for
Partitioners to check whether the task is finished via the
@ -84,52 +85,54 @@ class SizePartitioner(BasePartitioner):
List[ConfigDict]: A list of tasks.
"""

datasets = sorted(datasets,
key=lambda x: self.get_cost(x),
reverse=True)
tasks = []
for model in models:
chunks = []  # elements: tuple(size, dataset_chunk)
for dataset in datasets:
filename = get_infer_output_path(model, dataset, out_dir)
# skip the task if the task output exists
if osp.exists(filename):
continue
dataset_size = self.get_cost(dataset)
if dataset_size > self.max_task_size:
root, ext = osp.splitext(filename)
dataset_splits = self.split_dataset(dataset)
for i, dataset_split in enumerate(dataset_splits):
if not osp.exists(f'{root}_{i}{ext}'):
chunks.append((self.max_task_size, dataset_split))
else:
chunks.append((dataset_size, dataset))
for comb in model_dataset_combinations:
comb['datasets'] = sorted(comb['datasets'],
key=lambda x: self.get_cost(x),
reverse=True)
for model in comb['models']:
chunks = []  # elements: tuple(size, dataset_chunk)
for dataset in comb['datasets']:
filename = get_infer_output_path(model, dataset, out_dir)
# skip the task if the task output exists
if osp.exists(filename):
continue
dataset_size = self.get_cost(dataset)
if dataset_size > self.max_task_size:
root, ext = osp.splitext(filename)
dataset_splits = self.split_dataset(dataset)
for i, dataset_split in enumerate(dataset_splits):
if not osp.exists(f'{root}_{i}{ext}'):
chunks.append(
(self.max_task_size, dataset_split))
else:
chunks.append((dataset_size, dataset))

if self.strategy == 'heuristic':
chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
current_size, current_chunks = 0, []
for index in range(len(chunks)):
current_size += chunks[index][0]
current_chunks.append(chunks[index][1])
if index == len(chunks) - 1 or current_size + chunks[
index + 1][0] > self.max_task_size:
if self.strategy == 'heuristic':
chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
current_size, current_chunks = 0, []
for index in range(len(chunks)):
current_size += chunks[index][0]
current_chunks.append(chunks[index][1])
if index == len(chunks) - 1 or current_size + chunks[
index + 1][0] > self.max_task_size:
tasks.append(
Config({
'models': [model],
'datasets': [current_chunks],
'work_dir': work_dir,
**add_cfg
}))
current_size, current_chunks = 0, []
elif self.strategy == 'split':
for _, dataset in chunks:
tasks.append(
Config({
'models': [model],
'datasets': [current_chunks],
'datasets': [[dataset]],
'work_dir': work_dir,
**add_cfg
}))
current_size, current_chunks = 0, []
elif self.strategy == 'split':
for _, dataset in chunks:
tasks.append(
Config({
'models': [model],
'datasets': [[dataset]],
'work_dir': work_dir,
**add_cfg
}))
return tasks

@property
@ -13,7 +13,7 @@ from mmengine.config import ConfigDict
from tqdm import tqdm

from opencompass.registry import RUNNERS, TASKS
from opencompass.utils import get_logger
from opencompass.utils import batched, get_logger

from .base import BaseRunner

@ -131,15 +131,22 @@ class SlurmSequentialRunner(BaseRunner):
break
parent_conn.close()

for job_id in tqdm(job_ids, desc='clear sruns'):
if job_id is None:
continue
cmd = f'scancel {job_id}'
p = subprocess.Popen(cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
p.wait()
tbar = tqdm(total=len(job_ids), desc='clear sruns')
for batched_job_ids in batched(job_ids, 4):
ps = []
for job_id in batched_job_ids:
tbar.update()
if job_id is None:
continue
cmd = f'scancel {job_id}'
p = subprocess.Popen(cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
ps.append(p)
for p in ps:
p.wait()
tbar.close()

def _launch(self, cfg: ConfigDict, child_conn: Pipe = None):
logger = get_logger()
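`batched` is imported from `opencompass.utils` but its definition is not part of this diff; a minimal sketch of the batching behaviour it is assumed to provide (grouping job ids four at a time so their `scancel` calls can run concurrently):

from itertools import islice

def batched(iterable, n):
    # Assumed behaviour: yield successive groups of at most n items.
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk

assert list(batched(range(7), 4)) == [[0, 1, 2, 3], [4, 5, 6]]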
@ -121,8 +121,9 @@ class OpenICLEvalTask(BaseTask):
pred_dicts = copy.deepcopy(preds)
preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}

pred_strs = preds.pop('prediction')
pred_list_flag = isinstance(pred_strs[0], list)
pred_strs = preds.pop('prediction', None)
pred_list_flag = pred_strs is not None and isinstance(
pred_strs[0], list)
if ('pred_role' in self.eval_cfg
and 'meta_template' in self.model_cfg
and not MODELS.get(self.model_cfg['type']).is_api):
@ -166,6 +167,12 @@ class OpenICLEvalTask(BaseTask):
]

icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator'])
# need results dir to save other files
out_path = get_infer_output_path(
self.model_cfg, self.dataset_cfg,
osp.join(self.work_dir, 'results'))
icl_evaluator._out_dir = osp.splitext(out_path)[
0]  # strip extension

preds['predictions'] = pred_strs
preds['references'] = (test_set[self.output_column]
@ -49,6 +49,14 @@ def first_capital_postprocess(text: str) -> str:
return ''


@TEXT_POSTPROCESSORS.register_module('last-capital')
def last_capital_postprocess(text: str) -> str:
for t in text[::-1]:
if t.isupper():
return t
return ''


def first_option_postprocess(text: str, options: str) -> str:
"""Find first valid option for text."""
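For reference, the new `last-capital` postprocessor scans the text backwards and returns the final uppercase letter; a self-contained illustration:

def last_capital_demo(text: str) -> str:
    # Mirrors last_capital_postprocess above.
    for t in text[::-1]:
        if t.isupper():
            return t
    return ''

assert last_capital_demo('The answer is: choice B') == 'B'
assert last_capital_demo('no uppercase at all') == ''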
7
requirements/agent.txt
Normal file
@ -0,0 +1,7 @@
json5
jupyter
jupyter_client
jupytext
lagent
scikit-image
sympy
@ -1,4 +1 @@
faiss_gpu==1.7.2
jupyter
lagent
scikit-image