mirror of https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00

[Sync] minor test (#683)

This commit is contained in:
parent dd4318f6ab
commit e78857ac36
.gitignore (vendored, 1 change)
@@ -11,6 +11,7 @@ configs/eval_debug*.py
 configs/viz_*.py
 data
 work_dirs
+models
 configs/internal/
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .CIBench_gen_eb42f9 import ci_datasets  # noqa: F401, F403
+    from .CIBench_gen_8ab0dc import ci_datasets  # noqa: F401, F403
@@ -16,28 +16,20 @@ cibench_infer_cfg = dict(
         template="""{questions}""",
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=AgentInferencer),
+    inferencer=dict(type=AgentInferencer, infer_mode='every'),
 )


 libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
-cibench_eval_cfg = {
-    lib: dict(
-        evaluator=dict(
-            type=CIBenchEvaluator,
-            output_dir=f'output_data/cibench/{lib}'),
-        pred_role="BOT",
-    )
-    for lib in libs
-}
+cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")

 cibench_datasets = [
     dict(
-        abbr=f"cibench_{lib}",
+        abbr=f"cibench_generation_{lib}",
         type=CIBenchDataset,
         path=f"./data/cibench/{lib}",
         reader_cfg=cibench_reader_cfg,
         infer_cfg=cibench_infer_cfg,
-        eval_cfg=cibench_eval_cfg[lib],
+        eval_cfg=cibench_eval_cfg,
     ) for lib in libs
 ]
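For reference, a minimal sketch (not part of this commit) of what a single entry of `cibench_datasets` expands to under the new shared `cibench_eval_cfg`, assuming the reader/infer configs defined above:

# Hypothetical illustration only: the entry generated for lib == 'Pandas'.
dict(
    abbr="cibench_generation_Pandas",
    type=CIBenchDataset,
    path="./data/cibench/Pandas",
    reader_cfg=cibench_reader_cfg,
    infer_cfg=cibench_infer_cfg,   # inferencer now runs with infer_mode='every'
    eval_cfg=cibench_eval_cfg,     # one shared evaluator, no per-lib output_dir
)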
@@ -95,7 +95,7 @@ mathbench_sets = {
 # Use circular evaluation or not
 with_circular_eval = True

-mathbench_code_datasets = []
+mathbench_agent_datasets = []

 for _split in list(mathbench_sets.keys()):
     for _name in mathbench_sets[_split]:
@@ -112,13 +112,13 @@ for _split in list(mathbench_sets.keys()):
             evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
             pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))

-        mathbench_code_datasets.append(
+        mathbench_agent_datasets.append(
             dict(
+                abbr="mathbench-" + _split + '-' + _name + '-agent',
                 type=MathBenchDataset,
                 path=f"./data/mathbench/{_split}",
                 name=_name,
                 with_circular=with_circular_eval,
-                abbr="mathbench-interpreter-" + _split + '-' + _name,
                 reader_cfg=dict(
                     input_columns=["question"],
                     output_column="answer"
@@ -6,17 +6,17 @@ from opencompass.datasets import MathBenchDataset, mathbench_postprocess

 cloze_prompts ={
     "cloze_arith_en": [
         dict(role='HUMAN', prompt='Q: Calculate (341/11)/(9/(-6)*(-2)/3).'),
         dict(role='BOT', prompt='A: First, (9/(-6)*(-2)/3) can be simplified by : 9/(-6) = -1.5, -1.5 * (-2) = 3, 3 / 3 = 1. So, (9/(-6)*(-2)/3) is equal to 1. Now, we have `(341/11)/1` equals `341/11`. Finally, calculate `341/11 = 31`. The answer is 31.\n'),
         dict(role='HUMAN', prompt='Q: In base 14, what is 5 - 638d8d?'),
         dict(role='BOT', prompt='A: 5 - 638d8d = -638d88. The answer is -638d88.\n'),
         dict(role='HUMAN', prompt='Q: What is -491354 times -0.34?'),
         dict(role='BOT', prompt='A: The product of -491354 and -0.34 is 167060.36. The answer is 167060.36.\n'),
         dict(role='HUMAN', prompt='Q: What is the value of (-55)/(6930/(-382)) + (0 - 3)?.'),
         dict(role='BOT', prompt='A: First, (-55)/(6930/(-382)) = (-55)/(-(6930/382)) = 55*382/6930 = 21010/6930 = 2101/693. Then, 2101/693 + (0 - 3) = 2101/693 - 3 = 2101/693 - 3*693/693 = (2101-2079)/693 = 22/693 = 2/63. The answer is 2/63.\n'),
         dict(role='HUMAN', prompt='Q: {question}'),
         dict(role='BOT', prompt='A: {answer}\n'),
     ]
 }

 mathbench_sets = {
@@ -94,11 +94,11 @@ for _split in list(mathbench_sets.keys()):

         mathbench_datasets.append(
             dict(
+                abbr="mathbench-" + _split + '-' + _name,
                 type=MathBenchDataset,
                 path=f"./data/mathbench/{_split}",
                 name=_name,
                 with_circular=with_circular_eval,
-                abbr="mathbench-" + _split + '-' + _name,
                 reader_cfg=dict(
                     input_columns=["question"],
                     output_column="answer"
configs/datasets/ds1000/ds1000_compl_gen_cbc84f.py (new file, 69 lines)
@@ -0,0 +1,69 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import (DS1000Dataset, ds1000_completion_postprocess,
+                                  ds1000_matplotlib_postprocess,
+                                  DS1000Evaluator)
+
+ds1000_reader_cfg = dict(
+    input_columns=["prompt"],
+    output_column="test_column",
+    train_split='test',
+    test_split='test')
+
+ds1000_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role="HUMAN",
+                prompt="{prompt}",
+            ),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+ds1000_eval_cfg = dict(
+    evaluator=dict(type=DS1000Evaluator),
+    pred_role="BOT",
+    pred_postprocessor=dict(type=ds1000_completion_postprocess),
+)
+
+# The DS-1000 dataset can be downloaded from
+# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
+ds1000_datasets = [
+    dict(
+        abbr=f"ds1000_{lib}",
+        type=DS1000Dataset,
+        path="./data/ds1000_data/",
+        libs=f"{lib}",
+        mode="Completion",
+        reader_cfg=ds1000_reader_cfg,
+        infer_cfg=ds1000_infer_cfg,
+        eval_cfg=ds1000_eval_cfg,
+    ) for lib in [
+        'Pandas',
+        'Numpy',
+        'Tensorflow',
+        'Scipy',
+        'Sklearn',
+        'Pytorch',
+    ]
+]
+ds1000_datasets.append(
+    dict(
+        abbr="ds1000_Matplotlib",
+        type=DS1000Dataset,
+        path="./data/ds1000_data/",
+        libs="Matplotlib",
+        mode="Completion",
+        reader_cfg=ds1000_reader_cfg,
+        infer_cfg=ds1000_infer_cfg,
+        eval_cfg=dict(
+            evaluator=dict(type=DS1000Evaluator),
+            pred_role="BOT",
+            pred_postprocessor=dict(type=ds1000_matplotlib_postprocess),
+        ),
+    ))
@@ -0,0 +1,68 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import DS1000Dataset, DS1000ServiceEvaluator
+
+ds1000_reader_cfg = dict(
+    input_columns=["prompt"],
+    output_column="test_column",
+    train_split='test',
+    test_split='test')
+
+ds1000_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(round=[
+            dict(
+                role="HUMAN",
+                prompt="{prompt}",
+            ),
+        ]),
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer),
+)
+
+ds1000_eval_cfg_dict = {
+    lib: dict(
+        evaluator=dict(
+            type=DS1000ServiceEvaluator,
+            lib=lib,
+            ip_address=
+            "localhost",  # replace to your code_eval_server ip_address, port
+            port=5000
+        ),
+        pred_role="BOT")
+    for lib in [
+        'Pandas',
+        'Numpy',
+        'Tensorflow',
+        'Scipy',
+        'Sklearn',
+        'Pytorch',
+        'Matplotlib',
+    ]
+}
+
+# The DS-1000 dataset can be downloaded from
+# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
+ds1000_datasets = [
+    dict(
+        abbr=f"ds1000_{lib}",
+        type=DS1000Dataset,
+        path="./data/ds1000_data/",
+        libs=f"{lib}",
+        mode="Completion",
+        reader_cfg=ds1000_reader_cfg,
+        infer_cfg=ds1000_infer_cfg,
+        eval_cfg=ds1000_eval_cfg_dict[lib],
+    ) for lib in [
+        'Pandas',
+        'Numpy',
+        'Tensorflow',
+        'Scipy',
+        'Sklearn',
+        'Pytorch',
+        'Matplotlib',
+    ]
+]
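The `DS1000ServiceEvaluator` entries above expect a running code evaluation service; a minimal sketch (not part of this commit, the host address is a placeholder) of pointing every per-library evaluator at a remote code_eval_server:

# Hypothetical override: redirect the evaluators to a deployed service.
for _cfg in ds1000_eval_cfg_dict.values():
    _cfg['evaluator']['ip_address'] = '10.0.0.5'  # placeholder: your code_eval_server host
    _cfg['evaluator']['port'] = 5000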
@@ -45,7 +45,7 @@ gsm8k_eval_cfg = dict(

 gsm8k_datasets = [
     dict(
-        abbr='gsm8k',
+        abbr='gsm8k-agent',
         type=GSM8KDataset,
         path='./data/gsm8k',
         reader_cfg=gsm8k_reader_cfg,
configs/datasets/gsm8k/gsm8k_gen_3309bd.py (new file, 39 lines)
@@ -0,0 +1,39 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import GenInferencer
+from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
+
+gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
+
+gsm8k_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template=dict(
+            round=[
+                dict(role='HUMAN', prompt="Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAnswer:"),
+                dict(role='BOT', prompt="Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n"),
+                dict(role='HUMAN', prompt="Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\nAnswer:"),
+                dict(role='BOT', prompt="Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n"),
+                dict(role='HUMAN', prompt="Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\nAnswer:"),
+                dict(role='BOT', prompt="When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n"),
+                dict(role='HUMAN', prompt="Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\nAnswer:"),
+                dict(role='BOT', prompt="For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n"),
+                dict(role='HUMAN', prompt="Question: {question}\nLet's think step by step\nAnswer:"),
+            ],
+        )),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=[":", "Question:", "Question"]))
+
+gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
+                      pred_postprocessor=dict(type=gsm8k_postprocess),
+                      dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))
+
+gsm8k_datasets = [
+    dict(
+        abbr='gsm8k',
+        type=GSM8KDataset,
+        path='./data/gsm8k',
+        reader_cfg=gsm8k_reader_cfg,
+        infer_cfg=gsm8k_infer_cfg,
+        eval_cfg=gsm8k_eval_cfg)
+]
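Like the other dataset configs in this repository, the new file is meant to be pulled in through `read_base()`; a minimal usage sketch (not part of this commit):

from mmengine.config import read_base

with read_base():
    # module path follows the new file's location under configs/
    from .datasets.gsm8k.gsm8k_gen_3309bd import gsm8k_datasets

datasets = [*gsm8k_datasets]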
@@ -0,0 +1,57 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import PPLOnlyInferencer
+from opencompass.openicl.icl_evaluator import AveragePPLEvaluator
+from opencompass.datasets import GSM8KDataset, GSM8KReferenceSkywork
+
+gsm8k_datasets = []
+
+gsm8k_infer_cfg = dict(
+    prompt_template=dict(type=PromptTemplate, template="{question} {answer}"),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLOnlyInferencer),
+)
+
+gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))
+
+for split in ['train', 'test']:
+    gsm8k_reader_cfg = dict(
+        input_columns=['question', 'answer'],
+        output_column=None,
+        train_split=split,
+        test_split=split,
+    )
+    gsm8k_datasets.append(
+        dict(
+            abbr=f'gsm8k-{split}-ppl',
+            type=GSM8KDataset,
+            path='./data/gsm8k',
+            reader_cfg=gsm8k_reader_cfg,
+            infer_cfg=gsm8k_infer_cfg,
+            eval_cfg=gsm8k_eval_cfg)
+    )
+
+
+gsm8k_infer_cfg = dict(
+    prompt_template=dict(type=PromptTemplate, template="{text}"),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=PPLOnlyInferencer),
+)
+
+gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))
+
+gsm8k_reader_cfg = dict(
+    input_columns=['text'],
+    output_column=None,
+)
+
+gsm8k_datasets.append(
+    dict(
+        abbr=f'gsm8k-ref-ppl',
+        type=GSM8KReferenceSkywork,
+        path='./data/gsm8k-extra/mock_gsm8k_test.jsonl',
+        reader_cfg=gsm8k_reader_cfg,
+        infer_cfg=gsm8k_infer_cfg,
+        eval_cfg=gsm8k_eval_cfg
+    )
+)
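The `PPLOnlyInferencer` / `AveragePPLEvaluator` pair reports a perplexity-style number rather than accuracy; a rough sketch of the quantity being averaged (an assumed definition for illustration, not code from this commit):

import math

# Assumed metric: mean perplexity over samples, each computed from per-token NLLs.
def average_ppl(nlls_per_sample):
    ppls = [math.exp(sum(nll) / len(nll)) for nll in nlls_per_sample]
    return sum(ppls) / len(ppls)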
@@ -79,7 +79,7 @@ math_eval_cfg = dict(

 math_datasets = [
     dict(
-        abbr='math',
+        abbr='math-agent',
         type=MATHDataset,
         path='./data/math/math.json',
         reader_cfg=math_reader_cfg,
@@ -1,4 +1,4 @@
 from mmengine.config import read_base

 with read_base():
-    from .winogrande_ppl_55a66e import winogrande_datasets  # noqa: F401, F403
+    from .winogrande_ppl_8be6c3 import winogrande_datasets  # noqa: F401, F403
@@ -4,6 +4,10 @@ from opencompass.openicl.icl_inferencer import PPLInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import winograndeDataset

+# WARNING: This config cannot reproduce results in the paper.
+# e.g. LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config)
+# Please try winogrande_ppl_8be6c3
+
 winogrande_reader_cfg = dict(
     input_columns=['opt1', 'opt2'],
     output_column='answer',
configs/datasets/winogrande/winogrande_ppl_8be6c3.py (new file, 33 lines)
@@ -0,0 +1,33 @@
+from opencompass.openicl.icl_prompt_template import PromptTemplate
+from opencompass.openicl.icl_retriever import ZeroRetriever
+from opencompass.openicl.icl_inferencer import LoglikelihoodInferencer
+from opencompass.openicl.icl_evaluator import AccEvaluator
+from opencompass.datasets import winograndeDataset
+
+winogrande_reader_cfg = dict(
+    input_columns=['opt1', 'opt2'],
+    output_column='answer',
+)
+
+winogrande_infer_cfg = dict(
+    prompt_template=dict(
+        type=PromptTemplate,
+        template={
+            1: "{opt1}",
+            2: "{opt2}",
+        }
+    ),
+    retriever=dict(type=ZeroRetriever),
+    inferencer=dict(type=LoglikelihoodInferencer))
+
+winogrande_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
+
+winogrande_datasets = [
+    dict(
+        abbr='winogrande',
+        type=winograndeDataset,
+        path='./data/winogrande',
+        reader_cfg=winogrande_reader_cfg,
+        infer_cfg=winogrande_infer_cfg,
+        eval_cfg=winogrande_eval_cfg)
+]
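The replacement config scores each Winogrande item by comparing the likelihood the model assigns to the two filled-in sentences (`opt1` vs `opt2`); a toy sketch of that comparison (illustration only, not the inferencer's actual implementation):

# Toy illustration: pick the option whose completed sentence the model finds more likely.
def pick_option(loglik_opt1: float, loglik_opt2: float) -> str:
    return '1' if loglik_opt1 >= loglik_opt2 else '2'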
@@ -4,6 +4,10 @@ from opencompass.openicl.icl_inferencer import PPLInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import winograndeDataset

+# WARNING: This config cannot reproduce results in the paper.
+# e.g. LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config)
+# Please try winogrande_ppl_8be6c3
+
 winogrande_reader_cfg = dict(
     input_columns=['opt1', 'opt2'],
     output_column='answer',
@@ -4,11 +4,20 @@ from opencompass.partitioners import SizePartitioner
 from opencompass.runners import LocalRunner
 from opencompass.tasks import OpenICLInferTask
 from opencompass.models.lagent import LagentAgent
-from lagent import PythonInterpreter, ReAct
+from opencompass.lagent.actions.python_interpreter import PythonInterpreter
+from lagent import ReAct
 from lagent.agents.react import ReActProtocol

 with read_base():
-    from .datasets.gsm8k.gsm8k_agent_gen_3ac57d import gsm8k_datasets as datasets
+    from .datasets.gsm8k.gsm8k_agent_gen_3ac57d import gsm8k_datasets
+    from .datasets.math.math_agent_gen_861b4f import math_datasets
+    from .datasets.MathBench.mathbench_agent_gen_568903 import mathbench_agent_datasets
+    from .summarizers.math_agent import summarizer
+
+datasets = []
+datasets += gsm8k_datasets
+datasets += math_datasets
+datasets += mathbench_agent_datasets

 system_prompt = """You are a helpful assistant which use tools to solve mathematical reasoning questions. The code must be a function, and the function name must be 'solution'. For mathematics, please use code tool to calculate. The example format is as follows:
 ```
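Because each dataset config is a plain dict with an `abbr` key, the merged `datasets` list built above can be filtered before a run; a minimal sketch (not part of this commit) of restricting a smoke test to the GSM8K agent split:

# Hypothetical smoke-test filter applied on top of the merged agent datasets.
datasets = [d for d in datasets if d['abbr'].startswith('gsm8k')]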
@@ -10,7 +10,7 @@ from opencompass.runners import LocalRunner
 from opencompass.tasks import OpenICLInferTask

 with read_base():
-    from .datasets.CIBench.CIBench_gen_eb42f9 import \
+    from .datasets.CIBench.CIBench_gen_8ab0dc import \
         cibench_datasets as datasets

 FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
@@ -36,7 +36,21 @@ Also please follow the guidelines:
 3. The generated codes will be executed in an ipython manner and the results will be cached.
 4. Your responded code should always be simple and only solves the problem in current step.

-Begin!
+For example:
+
+File url: `xxxx`
+### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
+
+{thought} We should use `pandas` to solve this step.
+{action} IPythonInterpreter
+{action_input} ```python
+import pandas as pd
+url = "xxxx"
+data = pd.read_csv(url)
+```
+{response} The code is succeed without any outputs.
+
+Let us begin from here!
 """

 IPYTHON_INTERPRETER_DESCRIPTION = '''\
@@ -69,9 +83,6 @@ models = [
     ),
 ]

-for dataset in datasets:
-    # Evaluate on every assistant response
-    dataset['infer_cfg']['inferencer']['infer_mode'] = 'every'

 infer = dict(
     partitioner=dict(type=SizePartitioner, max_task_size=1000),
@@ -1,56 +0,0 @@
-from mmengine.config import read_base
-from opencompass.models.openai_api import OpenAI
-from opencompass.partitioners import SizePartitioner
-from opencompass.runners import LocalRunner
-from opencompass.tasks import OpenICLInferTask
-from opencompass.models.lagent import LagentAgent
-from lagent import PythonInterpreter, ReAct
-from lagent.agents.react import ReActProtocol
-
-system_prompt = """You are a helpful assistant which use tools to solve mathematical reasoning questions. The code must be a function, and the function name must be 'solution'. For mathematics, please use code tool to calculate. The example format is as follows:
-```
-def solution():
-    variable_names_with_real_meaning = func(variable)
-    return variable_names_with_real_meaning
-```"""
-
-protocol = dict(
-    type=ReActProtocol,
-    action=dict(role="ACTION", begin="Tool:", end="\n"),
-    action_input=dict(role="ARGS", begin="Tool Input:", end="\n"),
-    finish=dict(role="FINISH", begin="FinalAnswer:", end="\n"),
-    call_protocol=system_prompt,
-)
-
-with read_base():
-    from .datasets.MathBench.mathbench_code_gen_568903 import mathbench_code_datasets as datasets
-    from .summarizers.mathbench import summarizer
-
-models = [
-    dict(
-        abbr='gpt-3.5-react',
-        type=LagentAgent,
-        agent_type=ReAct,
-        max_turn=3,
-        llm=dict(
-            type=OpenAI,
-            path='gpt-3.5-turbo',
-            key='ENV',
-            query_per_second=1,
-            max_seq_len=4096,
-        ),
-        actions=[
-            dict(type=PythonInterpreter),
-        ],
-        protocol=protocol,
-        batch_size=1,
-    ),
-]
-
-infer = dict(
-    partitioner=dict(type=SizePartitioner, max_task_size=1000),
-    runner=dict(
-        type=LocalRunner,
-        max_num_workers=16,
-        task=dict(type=OpenICLInferTask)),
-)
configs/eval_with_model_dataset_combinations.py (new file, 43 lines)
@@ -0,0 +1,43 @@
+from mmengine.config import read_base
+
+with read_base():
+    from .models.qwen.hf_qwen_7b import models as hf_qwen_7b_base_models
+    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat_models
+
+    from .datasets.ceval.ceval_ppl_578f8d import ceval_datasets as base_ceval_datasets
+    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets as chat_ceval_datasets
+
+    from .internal.clusters.slurm import infer, eval
+    # from .clusters.slurm import infer_split as infer, eval
+    # from .clusters.slurm import infer_size as infer, eval
+    # from .clusters.slurm import infer_size_split as infer, eval
+
+base_ceval_datasets = base_ceval_datasets[:1]
+chat_ceval_datasets = chat_ceval_datasets[-1:]
+
+# If you do not want to run all the combinations of models and datasets, you
+# can specify the combinations you want to run here. This is useful when you
+# deleberately want to skip some subset of the combinations.
+# Models and datasets in different combinations are recommended to be disjoint
+# (different `abbr` in model & dataset configs), as we haven't tested this case
+# throughly.
+model_dataset_combinations = [
+    dict(models=hf_qwen_7b_base_models, datasets=base_ceval_datasets),
+    dict(models=hf_qwen_7b_chat_models, datasets=chat_ceval_datasets),
+    # dict(models=[model_cfg1, ...], datasets=[dataset_cfg1, ...]),
+]
+
+# This union of models and datasets in model_dataset_combinations should be
+# stored in the `models` and `datasets` variables below. Otherwise, modules
+# like summarizer will miss out some information.
+models = [*hf_qwen_7b_base_models, *hf_qwen_7b_chat_models]
+datasets = [*base_ceval_datasets, *chat_ceval_datasets]
+
+work_dir = './outputs/default/mdcomb/'
+
+"""
+dataset                  version    metric    mode    qwen-7b-hf    qwen-7b-chat-hf
+----------------------  ---------  --------  ------  ------------  -----------------
+ceval-computer_network  9b9417     accuracy  ppl     52.63         -
+ceval-physician          6e277d     accuracy  gen     -             59.18
+"""
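A minimal sketch (not part of this commit) of adding a third combination; it reuses the already-imported model and dataset configs so the `models`/`datasets` unions declared above stay consistent, as the comments in the file require:

# Hypothetical extra combination: run the chat model on the base (ppl) ceval subset too.
model_dataset_combinations.append(
    dict(models=hf_qwen_7b_chat_models, datasets=base_ceval_datasets))
# models and datasets already contain these configs, so no union update is needed.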
@@ -29,5 +29,6 @@ models = [
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=2, num_procs=1),
+        end_str='<eoa>',
     )
 ]
@@ -29,5 +29,6 @@ models = [
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<eoa>',
     )
 ]
@@ -29,5 +29,6 @@ models = [
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<eoa>',
     )
 ]
@@ -22,12 +22,14 @@ models = [
             padding_side='left',
             truncation_side='left',
             trust_remote_code=True,
-            use_fast=False,),
+            use_fast=False,
+        ),
         pad_token_id=151643,
         max_out_len=100,
         max_seq_len=2048,
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<|im_end|>',
     )
 ]
@@ -22,12 +22,14 @@ models = [
             padding_side='left',
             truncation_side='left',
             trust_remote_code=True,
-            use_fast=False,),
+            use_fast=False,
+        ),
         pad_token_id=151643,
         max_out_len=100,
         max_seq_len=2048,
         batch_size=8,
         meta_template=_meta_template,
         run_cfg=dict(num_gpus=1, num_procs=1),
+        end_str='<|im_end|>',
     )
 ]
configs/summarizers/groups/cibench.py (new file, 4 lines)
@@ -0,0 +1,4 @@
+
+_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
+_cibench = ['cibench_' + i for i in _cibench]
+cibench_summary_groups = [{'name': 'cibench', 'subsets': _cibench}]
configs/summarizers/groups/mathbench.py (new file, 75 lines)
@@ -0,0 +1,75 @@
+
+mathbench_summary_groups = [
+    {
+        'name': 'mathbench-college',
+        'subsets': [
+            ['mathbench-college-single_choice_cn', 'acc_1'],
+            ['mathbench-college-cloze_en', 'accuracy'],
+        ]
+    },
+    {
+        'name': 'mathbench-high',
+        'subsets': [
+            ['mathbench-high-single_choice_cn', 'acc_1'],
+            ['mathbench-high-single_choice_en', 'acc_1'],
+        ]
+    },
+    {
+        'name': 'mathbench-middle',
+        'subsets': [
+            ['mathbench-middle-single_choice_cn', 'acc_1'],
+        ]
+    },
+    {
+        'name': 'mathbench-primary',
+        'subsets': [
+            ['mathbench-primary-cloze_cn', 'accuracy'],
+        ]
+    },
+    {
+        'name': 'mathbench',
+        'subsets': [
+            'mathbench-college',
+            'mathbench-high',
+            'mathbench-middle',
+            'mathbench-primary',
+        ],
+    },
+    {
+        'name': 'mathbench-college-circular',
+        'subsets': [
+            ['mathbench-college-single_choice_cn', 'perf_4'],
+        ]
+    },
+    {
+        'name': 'mathbench-high-circular',
+        'subsets': [
+            ['mathbench-high-single_choice_cn', 'perf_4'],
+            ['mathbench-high-single_choice_en', 'perf_4'],
+        ]
+    },
+    {
+        'name': 'mathbench-middle-circular',
+        'subsets': [
+            ['mathbench-middle-single_choice_cn', 'perf_4'],
+        ]
+    },
+    {
+        'name': 'mathbench-circular',
+        'subsets': [
+            'mathbench-college-circular',
+            'mathbench-high-circular',
+            'mathbench-middle-circular',
+        ],
+    },
+    {
+        'name': 'mathbench-circular-and-cloze',
+        'subsets': [
+            'mathbench-high-circular',
+            'mathbench-middle-circular',
+            'mathbench-circular',
+            'mathbench-college-cloze_en',
+            'mathbench-primary-cloze_cn',
+        ],
+    }
+]
configs/summarizers/math_agent.py (new file, 28 lines)
@@ -0,0 +1,28 @@
+summarizer = dict(
+    dataset_abbrs=[
+        '######## GSM8K-Agent Accuracy ########', # category
+        ['gsm8k-agent', 'follow_acc'],
+        ['gsm8k-agent', 'reasoning_acc'],
+        ['gsm8k-agent', 'code_acc'],
+        ['gsm8k-agent', 'action_pct'],
+        '######## MATH-Agent Accuracy ########', # category
+        ['math-agent', 'follow_acc'],
+        ['math-agent', 'reasoning_acc'],
+        ['math-agent', 'code_acc'],
+        ['math-agent', 'action_pct'],
+        '######## MathBench-Agent Accuracy ########', # category
+        ['mathbench-college-single_choice_cn-agent', 'acc_1'],
+        ['mathbench-college-cloze_en-agent', 'accuracy'],
+        ['mathbench-high-single_choice_cn-agent', 'acc_1'],
+        ['mathbench-high-single_choice_en-agent', 'acc_1'],
+        ['mathbench-middle-single_choice_cn-agent', 'acc_1'],
+        ['mathbench-primary-cloze_cn-agent', 'accuracy'],
+        '######## MathBench-Agent CircularEval ########', # category
+        ['mathbench-college-single_choice_cn-agent', 'perf_4'],
+        ['mathbench-high-single_choice_cn-agent', 'perf_4'],
+        ['mathbench-high-single_choice_en-agent', 'perf_4'],
+        ['mathbench-middle-single_choice_cn-agent', 'perf_4'],
+    ],
+    summary_groups=sum(
+        [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
+)
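The `summary_groups=sum([...])` expression simply concatenates every `*_summary_groups` list present in the config module's namespace (for example the new cibench and mathbench group files once they are imported via `read_base()`); a small sketch of the same idiom in isolation (not part of this commit):

# Toy module-level namespace: two group lists plus an unrelated variable.
mathbench_summary_groups = [{'name': 'mathbench', 'subsets': ['mathbench-college']}]
cibench_summary_groups = [{'name': 'cibench', 'subsets': ['cibench_Pandas']}]
other_var = 42

summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
# -> the two group dicts concatenated into one flat list; other_var is ignored.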
@@ -2,13 +2,15 @@ import json
 import os
 import os.path as osp
 import re
+import subprocess
+from collections import defaultdict
 from typing import List, Optional

 import numpy as np
 from datasets import Dataset

 from opencompass.openicl.icl_evaluator import BaseEvaluator
-from opencompass.registry import LOAD_DATASET
+from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET

 from .base import BaseDataset

|
|||||||
with open(file, 'r') as f:
|
with open(file, 'r') as f:
|
||||||
notebook = json.load(f)
|
notebook = json.load(f)
|
||||||
example = notebook['cells']
|
example = notebook['cells']
|
||||||
|
metadata = notebook['metadata']
|
||||||
|
modules = metadata.get('modules', [])
|
||||||
|
if modules:
|
||||||
|
# these two annotations should be the same
|
||||||
|
assert len(modules) == len(metadata.get('step_types'))
|
||||||
|
# reformat annotations
|
||||||
|
modules = [[_m.strip() for _m in _modules.split('&')]
|
||||||
|
for _modules in modules]
|
||||||
questions = []
|
questions = []
|
||||||
|
source_codes = []
|
||||||
outputs = []
|
outputs = []
|
||||||
tags = []
|
tags = []
|
||||||
for cell in example:
|
for cell in example:
|
||||||
if cell['cell_type'] == 'markdown':
|
if cell['cell_type'] == 'markdown':
|
||||||
text = ''.join(cell['source'])
|
text = ''.join(cell['source']).strip()
|
||||||
|
if modules:
|
||||||
|
_modules = modules.pop(0)
|
||||||
|
text += f"Please use {' and '.join(_modules)} modules."
|
||||||
|
text = text.strip() + '\n'
|
||||||
# append the formatted text
|
# append the formatted text
|
||||||
questions.append(text)
|
questions.append(text)
|
||||||
elif cell['cell_type'] == 'code':
|
elif cell['cell_type'] == 'code':
|
||||||
|
source_codes.append(''.join(cell['source']))
|
||||||
if cell['outputs'] and 'data' in cell['outputs'][-1]:
|
if cell['outputs'] and 'data' in cell['outputs'][-1]:
|
||||||
if 'image/png' in cell['outputs'][-1]['data']:
|
if 'image/png' in cell['outputs'][-1]['data']:
|
||||||
# skip vis temporarily due to lack of evaluation
|
# skip vis temporarily due to lack of evaluation
|
||||||
@@ -39,15 +54,18 @@ def load_experiment(file: str) -> dict:
                 outputs.append(''.join(
                     cell['outputs'][-1]['data']['text/plain']))
             else:
-                tags.append('executable')
+                tags.append('exec')
                 outputs.append(None)
     return dict(
         experiment=file,
         questions=sum(([
             dict(role='user', content=question),
-            dict(role='assistant', content=output)
-        ] for question, output in zip(questions, outputs)), []),
-        references=dict(outputs=outputs, tags=tags, experiment=file),
+            dict(role='assistant', content=source_code)
+        ] for question, source_code in zip(questions, source_codes)), []),
+        references=dict(outputs=outputs,
+                        tags=tags,
+                        metadata=metadata,
+                        experiment=file),
     )

@@ -58,6 +76,7 @@ class CIBenchDataset(BaseDataset):
     @staticmethod
     def load(path: str):
         """Load whole dataset."""
+        assert os.path.exists(path), f'Path {path} does not exist.'
         data_list = []
         for cwd, dirs, files in os.walk(path):
             dirs.sort()
|
|||||||
"""Evaluator for CI dataset.
|
"""Evaluator for CI dataset.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
text_evaluator (optional, dict): The text evaluator for text result
|
||||||
|
comparison[]. Defaults to None, which use Rouge as defaults.
|
||||||
|
Please notice that a extra key for `metric_name` should be set
|
||||||
|
to get the exact metric result, such as `rouge1`.
|
||||||
output_dir (optional, str): The directory to save experiment
|
output_dir (optional, str): The directory to save experiment
|
||||||
files in a markdown or notebook format.
|
files in a markdown or notebook format.
|
||||||
|
with_ipynb (bool): Generate ipynb correspondingly.
|
||||||
|
Defaults to False.
|
||||||
user_data_dir (str): The directory to load local files.
|
user_data_dir (str): The directory to load local files.
|
||||||
Defaults to 'ENV', which means use environment variable
|
Defaults to 'ENV', which means use environment variable
|
||||||
`USER_DATA_DIR` to get the data dir.
|
`USER_DATA_DIR` to get the data dir.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
|
text_evaluator: Optional[dict] = None,
|
||||||
output_dir: Optional[str] = None,
|
output_dir: Optional[str] = None,
|
||||||
|
with_ipynb: bool = False,
|
||||||
user_data_dir: str = 'ENV') -> None:
|
user_data_dir: str = 'ENV') -> None:
|
||||||
|
if text_evaluator is None:
|
||||||
|
from opencompass.openicl.icl_evaluator import RougeEvaluator
|
||||||
|
self.text_evaluator = ICL_EVALUATORS.build(
|
||||||
|
dict(type=RougeEvaluator))
|
||||||
|
self.text_eval_metric = 'rouge1'
|
||||||
|
else:
|
||||||
|
self.text_eval_metric = text_evaluator.pop('metric_name')
|
||||||
|
self.text_evaluator = ICL_EVALUATORS.build(text_evaluator)
|
||||||
# TODO: should use work dir for this task.
|
# TODO: should use work dir for this task.
|
||||||
self.output_dir = output_dir
|
self.output_dir = output_dir
|
||||||
|
self.user_data_dir = self.check_user_data_dir(user_data_dir)
|
||||||
|
self.with_ipynb = with_ipynb
|
||||||
|
self.TAG_MAPPING = {
|
||||||
|
'exec': ('executable', self.valid_step),
|
||||||
|
'general': ('general_correct', self.correct_step),
|
||||||
|
'num': ('numeric_correct', self.correct_step),
|
||||||
|
'text': ('text_score', self.text_step),
|
||||||
|
'vis': ('vis_sim', self.vis_similarity_step),
|
||||||
|
}
|
||||||
|
|
||||||
|
def check_user_data_dir(self, user_data_dir):
|
||||||
if user_data_dir == 'ENV':
|
if user_data_dir == 'ENV':
|
||||||
user_data_dir = os.environ.get('USER_DATA_DIR', '')
|
user_data_dir = os.environ.get('USER_DATA_DIR', '')
|
||||||
self.user_data_dir = user_data_dir
|
user_data_dir = user_data_dir.rstrip('/')
|
||||||
|
basename = osp.basename(user_data_dir)
|
||||||
|
if basename and basename != 'data':
|
||||||
|
user_data_dir = osp.join(user_data_dir, 'data')
|
||||||
|
assert osp.exists(user_data_dir), \
|
||||||
|
f'a subfolder named `data` should exist under {user_data_dir}.'
|
||||||
|
elif basename:
|
||||||
|
assert osp.exists(user_data_dir), \
|
||||||
|
f'{user_data_dir} does not exist.'
|
||||||
|
return user_data_dir
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def valid_step(step):
|
def valid_step(step):
|
||||||
@@ -126,6 +181,24 @@ class CIBenchEvaluator(BaseEvaluator):
             # Fall back to False
             return False

+    def text_step(self, step, target):
+        """Whether the step output is correct."""
+        # Found the latest code interpreter to determine correct
+        for action in step[::-1]:
+            if action['type'] == 'IPythonInterpreter':
+                if action['result']:
+                    try:
+                        pred = action['result']['text']
+                        match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
+                        if match:
+                            out = match.group(1)
+                            score = self.text_evaluator.score([out], [target])
+                            return score[self.text_eval_metric] / 100
+                    except Exception:
+                        return False
+        # Fall back to False
+        return False
+
     @staticmethod
     def vis_similarity_step(step, target):
         """Whether the step output image has the same structure similarity with
@@ -174,6 +247,7 @@ class CIBenchEvaluator(BaseEvaluator):
                 'the conversion processes.')

         check_jupytext()
+        p_list = []
         from opencompass.lagent.actions.ipython_interpreter import extract_code
         for idx, (example_origin_prompt,
                   example_steps) in enumerate(zip(origin_prompt, steps)):
@@ -198,20 +272,25 @@ class CIBenchEvaluator(BaseEvaluator):
                 f.writelines(markdown_lines)

             # TODO: be careful for this
+            # The result might be different with infer process
+            # please check carefully
             # convert markdown to ipynb and exectue with error tolerance
-            # subprocess.Popen(
-            #     "jupytext --to ipynb --pipe-fmt ipynb "
-            #     "--pipe 'jupyter nbconvert --to ipynb --execute "
-            #     f"--allow-errors --stdin --stdout' {md_file}",
-            #     shell=True)
+            if self.with_ipynb:
+                p = subprocess.Popen(
+                    'jupytext --to ipynb --pipe-fmt ipynb '
+                    "--pipe 'jupyter nbconvert --to ipynb --execute "
+                    f"--allow-errors --stdin --stdout' {md_file}",
+                    shell=True)
+                p_list.append(p)
+        # TODO: async wait
+        for p in p_list:
+            p.wait()

     def set_data_dir(self, work_dir):
         """Set work directory and link data files for save notebook results."""
         if self.user_data_dir:
-            if self.user_data_dir.endswith('/'):
-                basename = osp.basename(osp.split(self.user_data_dir)[0])
-            else:
-                basename = osp.basename(self.user_data_dir)
+            basename = osp.basename(self.user_data_dir)
             if not osp.exists(osp.join(self.output_dir, basename)):
                 os.symlink(self.user_data_dir,
                            osp.join(self.output_dir, basename))
@@ -221,10 +300,54 @@ class CIBenchEvaluator(BaseEvaluator):
         """Change work directory and keep the symlink."""
         os.chdir(work_dir)

+    def single_exp(self, gold, steps):
+        tags = gold['tags']
+        outputs = gold['outputs']
+        metadata = gold['metadata']
+        hard_tags = metadata.get('step_types', [])
+        if hard_tags:
+            tags = hard_tags
+
+        # executable: exec succeed
+        # general_correct: general correct
+        # numeric_correct: numerical correct
+        # text_score: text score
+        # vis_sim: visual similarity
+        result = defaultdict(list)
+        for tag, step, output in zip(tags, steps, outputs):
+            # check whether this step is valid
+            result['executable'].append(self.valid_step(step))
+            if tag != 'exec':
+                key, func = self.TAG_MAPPING[tag]
+                result[key].append(func(step, output))
+
+        # add missing metric for better analyse if not exists
+        if hard_tags:
+            check_tags = ['exec', 'num', 'text', 'vis']
+        else:
+            check_tags = ['exec', 'general', 'vis']
+        for tag in check_tags:
+            key = self.TAG_MAPPING[tag][0]
+            if key not in result:
+                result[key] = []
+
+        return result
+
+    def get_output_dir(self):
+        """Get output dir from eval task.
+
+        Notice: output dir should be in format xxx/data.
+        All the needed files should be
+        """
+        # hard hack for get output dir from eval task
+        if hasattr(self, '_out_dir') and self.output_dir is None:
+            self.output_dir = self._out_dir
+
     def score(self, predictions: List, references: List, steps: List,
               origin_prompt: List):
         """Calculate accuracy."""
         cwd = os.getcwd()
+        self.get_output_dir()
         if self.output_dir:
             if not osp.exists(self.output_dir):
                 os.makedirs(self.output_dir)
@@ -232,56 +355,20 @@ class CIBenchEvaluator(BaseEvaluator):
             self.save_results(origin_prompt, steps)
             self.unset_data_dir(cwd)

-        num_cells_list = []
-        num_general_list = []
-        passed_list = []
-        correct_list = []
-        vis_list = []
+        total_results = defaultdict(float)
+        total_scores = defaultdict(float)
+        total_nums = defaultdict(int)
         for gold, single_steps in zip(references, steps):
-            tags = gold['tags']
-            outputs = gold['outputs']
-            num_cells = len(tags)
-            num_general = sum([tag == 'general' for tag in tags])
-
-            passed = sum([self.valid_step(step) for step in single_steps])
-            correct = 0
-            vis_sim = []
-            for tag, step, output in zip(tags, single_steps, outputs):
-                if tag == 'general':
-                    correct += self.correct_step(step, output)
-                elif tag == 'vis':
-                    vis_sim.append(self.vis_similarity_step(step, output))
-
-            num_cells_list.append(num_cells)
-            num_general_list.append(num_general)
-            passed_list.append(passed)
-            correct_list.append(correct)
-            if vis_sim:
-                vis_list.append(sum(vis_sim) / len(vis_sim))
-            else:
-                vis_list.append(-1)
-
-        if len([v for v in vis_list if v >= 0]) > 0:
-            visualize_similarity = sum([v for v in vis_list if v >= 0]) / len(
-                [v for v in vis_list if v >= 0])
-        else:
-            # not valid
-            visualize_similarity = -1
-
-        if sum(num_general_list) > 0:
-            general_accuracy = sum(correct_list) / sum(num_general_list)
-        else:
-            # not valid
-            general_accuracy = -1
-
-        result = dict(
-            executable_rate=sum(passed_list) / sum(num_cells_list) * 100,
-            general_accuracy=general_accuracy * 100,
-            visualize_similarity=visualize_similarity * 100,
-            num_cells_list=num_cells_list,
-            num_general_list=num_general_list,
-            passed_list=passed_list,
-            correct_list=correct_list,
-            vis_list=vis_list,
-        )
-        return result
+            result = self.single_exp(gold, single_steps)
+
+            for k, v in result.items():
+                total_scores[k] += sum(v)
+                total_nums[k] += len(v)
+
+        for k, v in total_scores.items():
+            if total_nums[k] > 0:
+                total_results[k] = total_scores[k] / total_nums[k] * 100
+            else:
+                total_results[k] = -1
+
+        return total_results
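With the rewrite above, `score()` just averages, per metric key, the boolean/float lists that `single_exp()` collects for each experiment; a small worked sketch of that aggregation (illustration only, using made-up step results):

from collections import defaultdict

# Two pretend experiments: per-step scores keyed by metric name.
results = [
    {'executable': [True, True, False], 'general_correct': [True]},
    {'executable': [True], 'general_correct': [], 'vis_sim': [0.8]},
]

total_scores, total_nums = defaultdict(float), defaultdict(int)
for result in results:
    for k, v in result.items():
        total_scores[k] += sum(v)
        total_nums[k] += len(v)

total_results = {k: (total_scores[k] / total_nums[k] * 100 if total_nums[k] else -1)
                 for k in total_scores}
# e.g. executable -> 75.0, general_correct -> 100.0, vis_sim -> 80.0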
@@ -16,6 +16,8 @@ class cmnliDataset(BaseDataset):
         with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 line = json.loads(line)
+                if line['label'] == '-':
+                    continue
                 data.append(line)
         return Dataset.from_list(data)

@@ -143,6 +143,17 @@ def ds1000_postprocess(text: str) -> str:
     return text


+@TEXT_POSTPROCESSORS.register_module('ds1000_completion')
+def ds1000_completion_postprocess(text: str) -> str:
+    text += '</code>'
+
+    match = re.search('(.*?)</code>', text, re.DOTALL)
+    if match:
+        text = match.group(1)
+
+    return text
+
+
 @TEXT_POSTPROCESSORS.register_module('ds1000_matplotlib')
 def ds1000_matplotlib_postprocess(text: str) -> str:
     text = ds1000_postprocess(text)
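A short usage sketch (illustration only, with a made-up completion string) of the new postprocessor: it appends a closing tag and keeps everything before the first `</code>`:

raw = "df = df.dropna()\n</code>\nEND SOLUTION\n<code>\nprint(df)"
clean = ds1000_completion_postprocess(raw)
# clean == "df = df.dropna()\n"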
@@ -142,6 +142,6 @@ class Gsm8kAgentEvaluator(BaseEvaluator):
             reasoning_acc=100 *
             (reasoning_scope + final_scope + row_reasoning_scope) / total,
             code_acc=100 * (code_scope + final_scope) / total,
-            action_acc=100 * (action_scope + final_scope) / total,
+            action_pct=100 * (action_scope + final_scope) / total,
         )
         return result
@ -25,7 +25,7 @@ class WikiBenchDataset(BaseDataset):
|
|||||||
circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']
|
circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']
|
||||||
|
|
||||||
data = []
|
data = []
|
||||||
with open(path, 'r') as infile:
|
with open(path, 'r', encoding='utf-8') as infile:
|
||||||
for id, line in enumerate(infile):
|
for id, line in enumerate(infile):
|
||||||
entry = json.loads(line)
|
entry = json.loads(line)
|
||||||
if 'cloze' in name:
|
if 'cloze' in name:
|
||||||
|
@ -20,14 +20,14 @@ class winograndeDataset(BaseDataset):
|
|||||||
for line in f:
|
for line in f:
|
||||||
line = json.loads(line)
|
line = json.loads(line)
|
||||||
prompt = line['sentence']
|
prompt = line['sentence']
|
||||||
-                dataset_list.append({
-                    'opt1':
-                    prompt.replace('_', line['option1']),
-                    'opt2':
-                    prompt.replace('_', line['option2']),
-                    'answer':
-                    line['answer']
-                })
+                continue_prompt = prompt.split('_')
+                data_item = {
+                    'opt1': prompt.replace('_', line['option1']),
+                    'opt2': prompt.replace('_', line['option2']),
+                    'answer': line['answer'],
+                    'cont': continue_prompt[1]
+                }
+                dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list)
|
dataset_list = Dataset.from_list(dataset_list)
|
||||||
return dataset_list
|
return dataset_list
|
||||||
|
|
||||||
@ -46,13 +46,11 @@ class winograndeDataset_V2(BaseDataset):
|
|||||||
prompt = line['sentence']
|
prompt = line['sentence']
|
||||||
answer = line['answer']
|
answer = line['answer']
|
||||||
answer = ' AB'[int(answer)] if answer != '' else 'NULL'
|
answer = ' AB'[int(answer)] if answer != '' else 'NULL'
|
||||||
dataset_list.append({
|
data_item = {
|
||||||
'opt1':
|
'opt1': prompt.replace('_', line['option1']),
|
||||||
prompt.replace('_', line['option1']),
|
'opt2': prompt.replace('_', line['option2']),
|
||||||
'opt2':
|
'answer': answer,
|
||||||
prompt.replace('_', line['option2']),
|
}
|
||||||
'answer':
|
dataset_list.append(data_item)
|
||||||
answer
|
|
||||||
})
|
|
||||||
dataset_list = Dataset.from_list(dataset_list)
|
dataset_list = Dataset.from_list(dataset_list)
|
||||||
return dataset_list
|
return dataset_list
|
||||||
|
@ -47,6 +47,10 @@ class IPythonInterpreter(BaseAction):
|
|||||||
it is disabled. Defaults to None.
|
it is disabled. Defaults to None.
|
||||||
timeout (int): Upper bound of waiting time for Python script execution.
|
timeout (int): Upper bound of waiting time for Python script execution.
|
||||||
Defaults to 20.
|
Defaults to 20.
|
||||||
|
trim_output (int, optional): Max characters restriction of ipython
|
||||||
|
outputs. If None, do not perform any trim.
|
||||||
|
TODO: Notice that this is not token length. More trim strategies
|
||||||
|
might be added later. Defaults to 1024.
|
||||||
user_data_dir (str): Specified the user data directory for files
|
user_data_dir (str): Specified the user data directory for files
|
||||||
loading. If set to `ENV`, use `USER_DATA_DIR` environment variable.
|
loading. If set to `ENV`, use `USER_DATA_DIR` environment variable.
|
||||||
Defaults to `ENV`.
|
Defaults to `ENV`.
|
||||||
@ -60,6 +64,7 @@ class IPythonInterpreter(BaseAction):
|
|||||||
enable: bool = True,
|
enable: bool = True,
|
||||||
disable_description: Optional[str] = None,
|
disable_description: Optional[str] = None,
|
||||||
timeout: int = 20,
|
timeout: int = 20,
|
||||||
|
trim_output: Optional[int] = 1024,
|
||||||
user_data_dir: str = 'ENV') -> None:
|
user_data_dir: str = 'ENV') -> None:
|
||||||
super().__init__(description, name, enable, disable_description)
|
super().__init__(description, name, enable, disable_description)
|
||||||
|
|
||||||
@ -68,10 +73,11 @@ class IPythonInterpreter(BaseAction):
|
|||||||
user_data_dir = os.environ.get('USER_DATA_DIR', '')
|
user_data_dir = os.environ.get('USER_DATA_DIR', '')
|
||||||
|
|
||||||
if user_data_dir:
|
if user_data_dir:
|
||||||
user_data_dir = os.path.dirname(user_data_dir)
|
# user_data_dir = os.path.dirname(user_data_dir)
|
||||||
user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
|
user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
|
||||||
self.user_data_dir = user_data_dir
|
self.user_data_dir = user_data_dir
|
||||||
self._initialized = False
|
self._initialized = False
|
||||||
|
self.trim_output = trim_output
|
||||||
if not os.path.exists(WORK_DIR):
|
if not os.path.exists(WORK_DIR):
|
||||||
os.mkdir(WORK_DIR)
|
os.mkdir(WORK_DIR)
|
||||||
|
|
||||||
@ -178,6 +184,12 @@ class IPythonInterpreter(BaseAction):
|
|||||||
if image:
|
if image:
|
||||||
result += f'\n\n{image}'
|
result += f'\n\n{image}'
|
||||||
if finished:
|
if finished:
|
||||||
|
# in case output text too long
|
||||||
|
# might need better design later
|
||||||
|
if self.trim_output and len(result) > self.trim_output:
|
||||||
|
ellip = '......'
|
||||||
|
half_len = int((self.trim_output - len(ellip)) / 2)
|
||||||
|
result = result[:half_len] + ellip + result[-half_len:]
|
||||||
return succeed, result
|
return succeed, result
|
||||||
|
|
||||||
try:
|
try:
|
||||||
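The trimming added above keeps the head and tail of an over-long output and drops the middle; a self-contained sketch of the same idea:

def trim_middle(result: str, trim_output: int = 1024) -> str:
    # keep the first and last halves and elide the middle, as in the hunk above
    if trim_output and len(result) > trim_output:
        ellip = '......'
        half_len = int((trim_output - len(ellip)) / 2)
        result = result[:half_len] + ellip + result[-half_len:]
    return result

print(trim_middle('x' * 50, trim_output=20))  # 7 chars + '......' + 7 chars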
@ -204,13 +216,20 @@ class IPythonInterpreter(BaseAction):
|
|||||||
command: str,
|
command: str,
|
||||||
timeout: Optional[int] = None) -> ActionReturn:
|
timeout: Optional[int] = None) -> ActionReturn:
|
||||||
tool_return = ActionReturn(url=None, args=None, type=self.name)
|
tool_return = ActionReturn(url=None, args=None, type=self.name)
|
||||||
tool_return.args = dict(text=command)
|
extracted_command = extract_code(command)
|
||||||
succeed, result = self._call(command, timeout)
|
tool_return.args = dict(text=command, extract_code=extracted_command)
|
||||||
if succeed:
|
if extracted_command:
|
||||||
tool_return.result = dict(text=result)
|
succeed, result = self._call(extracted_command, timeout)
|
||||||
tool_return.state = ActionStatusCode.SUCCESS
|
if succeed:
|
||||||
|
if not result:
|
||||||
|
result = 'The code ran successfully without any outputs.'
|
||||||
|
tool_return.result = dict(text=result)
|
||||||
|
tool_return.state = ActionStatusCode.SUCCESS
|
||||||
|
else:
|
||||||
|
tool_return.errmsg = repr(result)
|
||||||
|
tool_return.state = ActionStatusCode.API_ERROR
|
||||||
else:
|
else:
|
||||||
tool_return.errmsg = repr(result)
|
tool_return.errmsg = 'The input code is empty. Please follow the format.' # noqa
|
||||||
tool_return.state = ActionStatusCode.API_ERROR
|
tool_return.state = ActionStatusCode.API_ERROR
|
||||||
return tool_return
|
return tool_return
|
||||||
|
|
||||||
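`extract_code` is imported elsewhere in this file; a plausible stand-in is sketched below so the control flow above is easier to follow. The regex and fallback behaviour here are assumptions, not the actual implementation.

import re

def extract_code(text: str) -> str:
    # return the body of the first fenced code block, or '' when none is found
    match = re.search(r'```(?:python)?\n(.*?)```', text, re.DOTALL)
    return match.group(1).strip() if match else ''

print(extract_code('Here is the code:\n```python\nprint(1 + 1)\n```'))
# -> 'print(1 + 1)'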
|
@ -115,6 +115,20 @@ class BaseModel:
|
|||||||
inputs = self.parse_template(templates, mode='ppl')
|
inputs = self.parse_template(templates, mode='ppl')
|
||||||
return self.get_ppl(inputs, mask_length)
|
return self.get_ppl(inputs, mask_length)
|
||||||
|
|
||||||
|
def get_loglikelihood_from_template(self,
|
||||||
|
templates: List[PromptType],
|
||||||
|
conts: List[str],
|
||||||
|
mask_length=None):
|
||||||
|
"""Get perplexity given a list of templates.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
templates (List[PromptType]): A list of templates.
|
||||||
|
mask_length (List[int]): A list of mask lengths. If provided, the
|
||||||
|
perplexity will be calculated only on the unmasked tokens.
|
||||||
|
"""
|
||||||
|
inputs = self.parse_template(templates, mode='ppl')
|
||||||
|
return self.get_loglikelihood(inputs, conts, mask_length)
|
||||||
|
|
||||||
def generate_from_template(self, templates: List[PromptType],
|
def generate_from_template(self, templates: List[PromptType],
|
||||||
max_out_len: int, **kwargs):
|
max_out_len: int, **kwargs):
|
||||||
"""Generate completion from a list of templates.
|
"""Generate completion from a list of templates.
|
||||||
|
@ -1,9 +1,11 @@
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import threading
|
import threading
|
||||||
|
import time
|
||||||
import warnings
|
import warnings
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
|
from queue import Queue
|
||||||
from time import sleep
|
from time import sleep
|
||||||
from typing import Dict, List, Optional, Tuple, Union
|
from typing import Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
@ -37,6 +39,7 @@ class BaseAPIModel(BaseModel):
|
|||||||
def __init__(self,
|
def __init__(self,
|
||||||
path: str,
|
path: str,
|
||||||
query_per_second: int = 1,
|
query_per_second: int = 1,
|
||||||
|
rpm_verbose: bool = False,
|
||||||
retry: int = 2,
|
retry: int = 2,
|
||||||
max_seq_len: int = 2048,
|
max_seq_len: int = 2048,
|
||||||
meta_template: Optional[Dict] = None,
|
meta_template: Optional[Dict] = None,
|
||||||
@ -46,7 +49,7 @@ class BaseAPIModel(BaseModel):
|
|||||||
self.meta_template = meta_template
|
self.meta_template = meta_template
|
||||||
self.retry = retry
|
self.retry = retry
|
||||||
self.query_per_second = query_per_second
|
self.query_per_second = query_per_second
|
||||||
self.token_bucket = TokenBucket(query_per_second)
|
self.token_bucket = TokenBucket(query_per_second, rpm_verbose)
|
||||||
self.template_parser = APITemplateParser(meta_template)
|
self.template_parser = APITemplateParser(meta_template)
|
||||||
self.logger = get_logger()
|
self.logger = get_logger()
|
||||||
self.generation_kwargs = generation_kwargs
|
self.generation_kwargs = generation_kwargs
|
||||||
@ -422,10 +425,13 @@ class TokenBucket:
|
|||||||
query_per_second (float): The rate of the token bucket.
|
query_per_second (float): The rate of the token bucket.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, rate):
|
def __init__(self, rate, verbose=False):
|
||||||
self._rate = rate
|
self._rate = rate
|
||||||
self._tokens = threading.Semaphore(0)
|
self._tokens = threading.Semaphore(0)
|
||||||
self.started = False
|
self.started = False
|
||||||
|
self._request_queue = Queue()
|
||||||
|
self.logger = get_logger()
|
||||||
|
self.verbose = verbose
|
||||||
|
|
||||||
def _add_tokens(self):
|
def _add_tokens(self):
|
||||||
"""Add tokens to the bucket."""
|
"""Add tokens to the bucket."""
|
||||||
@ -440,3 +446,12 @@ class TokenBucket:
|
|||||||
self.started = True
|
self.started = True
|
||||||
threading.Thread(target=self._add_tokens, daemon=True).start()
|
threading.Thread(target=self._add_tokens, daemon=True).start()
|
||||||
self._tokens.acquire()
|
self._tokens.acquire()
|
||||||
|
if self.verbose:
|
||||||
|
cur_time = time.time()
|
||||||
|
while not self._request_queue.empty():
|
||||||
|
if cur_time - self._request_queue.queue[0] > 60:
|
||||||
|
self._request_queue.get()
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
self._request_queue.put(cur_time)
|
||||||
|
self.logger.info(f'Current RPM {self._request_queue.qsize()}.')
|
||||||
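The verbose branch above keeps a 60-second sliding window of request timestamps to report requests per minute; the same bookkeeping in isolation (simplified, no threading):

import time
from queue import Queue

request_queue = Queue()

def record_request_and_report_rpm() -> int:
    cur_time = time.time()
    # drop timestamps older than one minute, then record the new request
    while not request_queue.empty():
        if cur_time - request_queue.queue[0] > 60:
            request_queue.get()
        else:
            break
    request_queue.put(cur_time)
    return request_queue.qsize()  # current requests-per-minute estimate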
|
@ -3,6 +3,7 @@ from typing import Dict, List, Optional, Union
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
import transformers
|
||||||
|
|
||||||
from opencompass.models.base import BaseModel
|
from opencompass.models.base import BaseModel
|
||||||
from opencompass.models.base_api import APITemplateParser
|
from opencompass.models.base_api import APITemplateParser
|
||||||
@ -13,6 +14,33 @@ from opencompass.utils.prompt import PromptList
|
|||||||
PromptType = Union[PromptList, str]
|
PromptType = Union[PromptList, str]
|
||||||
|
|
||||||
|
|
||||||
|
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
|
||||||
|
"""Criteria to stop on the specified multi-token sequence."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
sequence: str,
|
||||||
|
tokenizer: transformers.PreTrainedTokenizer,
|
||||||
|
batch_size: int,
|
||||||
|
):
|
||||||
|
self.done_tracker = [False] * batch_size
|
||||||
|
self.sequence = sequence
|
||||||
|
self.sequence_ids = tokenizer.encode(sequence,
|
||||||
|
add_special_tokens=False)
|
||||||
|
self.sequence_id_len = len(self.sequence_ids)
|
||||||
|
self.tokenizer = tokenizer
|
||||||
|
|
||||||
|
def __call__(self, input_ids, scores, **kwargs) -> bool:
|
||||||
|
# compare the last len(stop) tokens
|
||||||
|
lookback_ids_batch = input_ids[:, -self.sequence_id_len:]
|
||||||
|
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
|
||||||
|
for i, done in enumerate(self.done_tracker):
|
||||||
|
if done:
|
||||||
|
continue
|
||||||
|
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
|
||||||
|
return False not in self.done_tracker
|
||||||
|
|
||||||
|
|
||||||
@MODELS.register_module()
|
@MODELS.register_module()
|
||||||
class HuggingFace(BaseModel):
|
class HuggingFace(BaseModel):
|
||||||
"""Model wrapper around HuggingFace models.
|
"""Model wrapper around HuggingFace models.
|
||||||
@ -194,7 +222,10 @@ class HuggingFace(BaseModel):
|
|||||||
self.model.config.eos_token_id = 2
|
self.model.config.eos_token_id = 2
|
||||||
self.model.config.pad_token_id = self.tokenizer.pad_token_id
|
self.model.config.pad_token_id = self.tokenizer.pad_token_id
|
||||||
|
|
||||||
def generate(self, inputs: List[str], max_out_len: int,
|
def generate(self,
|
||||||
|
inputs: List[str],
|
||||||
|
max_out_len: int,
|
||||||
|
stopping_criteria: List[str] = [],
|
||||||
**kwargs) -> List[str]:
|
**kwargs) -> List[str]:
|
||||||
"""Generate results given a list of inputs.
|
"""Generate results given a list of inputs.
|
||||||
|
|
||||||
@ -212,9 +243,12 @@ class HuggingFace(BaseModel):
|
|||||||
max_out_len=max_out_len,
|
max_out_len=max_out_len,
|
||||||
**generation_kwargs)
|
**generation_kwargs)
|
||||||
else:
|
else:
|
||||||
return sum((self._single_generate(
|
return sum(
|
||||||
inputs=[input_], max_out_len=max_out_len, **generation_kwargs)
|
(self._single_generate(inputs=[input_],
|
||||||
for input_ in inputs), [])
|
max_out_len=max_out_len,
|
||||||
|
stopping_criteria=stopping_criteria,
|
||||||
|
**generation_kwargs)
|
||||||
|
for input_ in inputs), [])
|
||||||
|
|
||||||
def _batch_generate(self, inputs: List[str], max_out_len: int,
|
def _batch_generate(self, inputs: List[str], max_out_len: int,
|
||||||
**kwargs) -> List[str]:
|
**kwargs) -> List[str]:
|
||||||
@ -275,7 +309,10 @@ class HuggingFace(BaseModel):
|
|||||||
decodeds = [token.split(self.end_str)[0] for token in decodeds]
|
decodeds = [token.split(self.end_str)[0] for token in decodeds]
|
||||||
return decodeds
|
return decodeds
|
||||||
|
|
||||||
def _single_generate(self, inputs: List[str], max_out_len: int,
|
def _single_generate(self,
|
||||||
|
inputs: List[str],
|
||||||
|
max_out_len: int,
|
||||||
|
stopping_criteria: List[str] = [],
|
||||||
**kwargs) -> List[str]:
|
**kwargs) -> List[str]:
|
||||||
"""Support for single prompt inference.
|
"""Support for single prompt inference.
|
||||||
|
|
||||||
@ -319,6 +356,19 @@ class HuggingFace(BaseModel):
|
|||||||
max_length=self.max_seq_len -
|
max_length=self.max_seq_len -
|
||||||
max_out_len)['input_ids']
|
max_out_len)['input_ids']
|
||||||
input_ids = torch.tensor(input_ids, device=self.model.device)
|
input_ids = torch.tensor(input_ids, device=self.model.device)
|
||||||
|
|
||||||
|
if stopping_criteria:
|
||||||
|
# Construct huggingface stopping criteria
|
||||||
|
stopping_criteria = stopping_criteria + [self.tokenizer.eos_token]
|
||||||
|
stopping_criteria = transformers.StoppingCriteriaList([
|
||||||
|
*[
|
||||||
|
MultiTokenEOSCriteria(sequence, self.tokenizer,
|
||||||
|
input_ids.shape[0])
|
||||||
|
for sequence in stopping_criteria
|
||||||
|
],
|
||||||
|
])
|
||||||
|
kwargs['stopping_criteria'] = stopping_criteria
|
||||||
|
|
||||||
# To accommodate the PeftModel, parameters should be passed in
|
# To accommodate the PeftModel, parameters should be passed in
|
||||||
# key-value format for generate.
|
# key-value format for generate.
|
||||||
outputs = self.model.generate(input_ids=input_ids,
|
outputs = self.model.generate(input_ids=input_ids,
|
||||||
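A sketch of how the new stopping-criteria plumbing is meant to be exercised from the outside. The model name and stop words are only examples; `MultiTokenEOSCriteria` is the class added earlier in this file.

import transformers

tok = transformers.AutoTokenizer.from_pretrained('gpt2')
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')

input_ids = tok('Question: 1+1=?\nAnswer:', return_tensors='pt').input_ids
stop_words = ['Question:', tok.eos_token]  # stop as soon as a new question starts
criteria = transformers.StoppingCriteriaList([
    MultiTokenEOSCriteria(seq, tok, batch_size=input_ids.shape[0])
    for seq in stop_words
])
out = model.generate(input_ids=input_ids, max_new_tokens=32,
                     stopping_criteria=criteria)
print(tok.decode(out[0][input_ids.shape[1]:]))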
@ -434,6 +484,71 @@ class HuggingFace(BaseModel):
|
|||||||
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
|
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
|
||||||
return ce_loss
|
return ce_loss
|
||||||
|
|
||||||
|
def get_loglikelihood(
|
||||||
|
self,
|
||||||
|
inputs: List[str],
|
||||||
|
conts: List[str],
|
||||||
|
mask_length: Optional[List[int]] = None) -> List[float]:
|
||||||
|
"""Get loglikelihood scores given a list of inputs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs (List[str]): A list of strings.
|
||||||
|
conts (List[str]): The continuation strings, i.e. the trailing slice of each input that should be scored.
|
||||||
|
NOT SUPPORT mask_length YET!
|
||||||
|
mask_length (Optional[List[int]]): A list of mask lengths. If
|
||||||
|
provided, the perplexity scores will be calculated with the
|
||||||
|
first mask_length[i] tokens masked out. It's okay to skip
|
||||||
|
its implementation if advanced features in PPLInferencer are
|
||||||
|
not needed.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List[float]: A list of loglikelihood scores.
|
||||||
|
"""
|
||||||
|
assert mask_length is None, 'Not support mask_length yet.'
|
||||||
|
if self.batch_padding and len(inputs) > 1:
|
||||||
|
raise NotImplementedError('Batch padding is not supported yet.')
|
||||||
|
# assert self.tokenizer.pad_token
|
||||||
|
# return self._get_loglikelihood(inputs, mask_length=mask_length)
|
||||||
|
return np.array([
|
||||||
|
self._get_loglikelihood(inputs=inputs[idx], conts=conts[idx])
|
||||||
|
for idx in range(len(inputs))
|
||||||
|
])
|
||||||
|
|
||||||
|
def _get_loglikelihood(self, inputs: str, conts: str) -> float:
|
||||||
|
"""Get loglikelihood scores given input string and continuation string.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
inputs (str): string.
|
||||||
|
conts (str): strings: slices after the space.
|
||||||
|
Returns:
|
||||||
|
float: loglikelihood scores.
|
||||||
|
"""
|
||||||
|
|
||||||
|
input_ids = self.tokenizer(inputs,
|
||||||
|
padding=False,
|
||||||
|
truncation=True,
|
||||||
|
max_length=self.max_seq_len)['input_ids']
|
||||||
|
input_ids = torch.tensor(input_ids, device=self.model.device)
|
||||||
|
context_ids = self.tokenizer(inputs.replace(conts, ''),
|
||||||
|
padding=False,
|
||||||
|
truncation=True,
|
||||||
|
max_length=self.max_seq_len)['input_ids']
|
||||||
|
cont_ids = input_ids[len(context_ids):]
|
||||||
|
|
||||||
|
output = self.model(input_ids.unsqueeze(0))
|
||||||
|
logits = output['logits'][:, :-1]
|
||||||
|
logits = torch.nn.functional.log_softmax(logits, dim=-1)
|
||||||
|
contlen = cont_ids.shape[0]
|
||||||
|
logits = logits[:, -contlen:, :]
|
||||||
|
# Reducing the dimension will lead to a wrong outcome
|
||||||
|
logits_gather = torch.gather(
|
||||||
|
logits, 2,
|
||||||
|
cont_ids.unsqueeze(0).unsqueeze(-1)) # [1, seq]
|
||||||
|
|
||||||
|
# Answer: sum the likelihood of each token in continuation
|
||||||
|
answer = float(logits_gather.detach().cpu().sum())
|
||||||
|
return answer
|
||||||
|
|
||||||
def get_token_len(self, prompt: str) -> int:
|
def get_token_len(self, prompt: str) -> int:
|
||||||
"""Get lengths of the tokenized strings.
|
"""Get lengths of the tokenized strings.
|
||||||
|
|
||||||
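The gather step above scores only the continuation tokens; a minimal numeric sketch of the same operation with invented shapes:

import torch

# fake logits over a 5-token vocabulary for a 4-token sequence
logits = torch.randn(1, 4, 5)
logprobs = torch.nn.functional.log_softmax(logits, dim=-1)

cont_ids = torch.tensor([2, 4])          # last two tokens are the continuation
contlen = cont_ids.shape[0]
logprobs = logprobs[:, -contlen:, :]     # keep positions predicting the continuation

# pick the log-prob assigned to each gold continuation token and sum them
gathered = torch.gather(logprobs, 2, cont_ids.unsqueeze(0).unsqueeze(-1))
loglikelihood = float(gathered.sum())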
@ -554,8 +669,8 @@ class HuggingFaceChatGLM3(HuggingFace):
|
|||||||
'role': {
|
'role': {
|
||||||
'HUMAN': 'user',
|
'HUMAN': 'user',
|
||||||
'BOT': 'assistant',
|
'BOT': 'assistant',
|
||||||
'SYSTEM': 'system'
|
'SYSTEM': 'system',
|
||||||
}[item['role']]
|
}[item['role'].upper()]
|
||||||
}
|
}
|
||||||
history.append(msg)
|
history.append(msg)
|
||||||
user_content = history[-1]['content']
|
user_content = history[-1]['content']
|
||||||
@ -578,6 +693,9 @@ class HuggingFaceChatGLM3(HuggingFace):
|
|||||||
response, history = self.model.chat(self.tokenizer,
|
response, history = self.model.chat(self.tokenizer,
|
||||||
user_content,
|
user_content,
|
||||||
history=history)
|
history=history)
|
||||||
|
# response will sometimes be a dict
|
||||||
|
if isinstance(response, dict):
|
||||||
|
response = response.get('content', '')
|
||||||
responses.append(response)
|
responses.append(response)
|
||||||
except Exception:
|
except Exception:
|
||||||
responses.append('')
|
responses.append('')
|
||||||
|
@ -52,7 +52,7 @@ class LagentAgent:
|
|||||||
|
|
||||||
def chat(self,
|
def chat(self,
|
||||||
user_input: str,
|
user_input: str,
|
||||||
history: List[dict] = None) -> Tuple[str, List[dict]]:
|
history: List[dict] = None) -> Tuple[str, List[dict], List[dict]]:
|
||||||
"""Chat with agent."""
|
"""Chat with agent."""
|
||||||
if history:
|
if history:
|
||||||
self.agent._session_history = history
|
self.agent._session_history = history
|
||||||
@ -60,6 +60,7 @@ class LagentAgent:
|
|||||||
from lagent.schema import ActionReturn, AgentReturn
|
from lagent.schema import ActionReturn, AgentReturn
|
||||||
generation: AgentReturn = self.agent.chat(user_input)
|
generation: AgentReturn = self.agent.chat(user_input)
|
||||||
|
|
||||||
|
inner_steps = generation.inner_steps
|
||||||
answer = generation.response
|
answer = generation.response
|
||||||
steps = []
|
steps = []
|
||||||
|
|
||||||
@ -76,7 +77,7 @@ class LagentAgent:
|
|||||||
valid=int(step.valid),
|
valid=int(step.valid),
|
||||||
))
|
))
|
||||||
|
|
||||||
return answer, steps
|
return answer, steps, inner_steps
|
||||||
|
|
||||||
|
|
||||||
FORCE_STOP_PROMPT_EN = (
|
FORCE_STOP_PROMPT_EN = (
|
||||||
|
@ -179,12 +179,14 @@ class Llama2Chat(BaseModel):
|
|||||||
dialog = []
|
dialog = []
|
||||||
for item in input:
|
for item in input:
|
||||||
msg = {'content': item['prompt']}
|
msg = {'content': item['prompt']}
|
||||||
if item['role'] == 'HUMAN':
|
if item['role'].upper() == 'HUMAN':
|
||||||
msg['role'] = 'user'
|
msg['role'] = 'user'
|
||||||
elif item['role'] == 'BOT':
|
elif item['role'].upper() == 'BOT':
|
||||||
msg['role'] = 'assistant'
|
msg['role'] = 'assistant'
|
||||||
elif item['role'] == 'SYSTEM':
|
elif item['role'].upper() == 'SYSTEM':
|
||||||
msg['role'] = 'system'
|
msg['role'] = 'system'
|
||||||
|
else:
|
||||||
|
raise ValueError(f'Unknown role: {item["role"]}')
|
||||||
dialog.append(msg)
|
dialog.append(msg)
|
||||||
dialogs.append(dialog)
|
dialogs.append(dialog)
|
||||||
|
|
||||||
|
@ -58,6 +58,7 @@ class OpenAI(BaseAPIModel):
|
|||||||
path: str = 'gpt-3.5-turbo',
|
path: str = 'gpt-3.5-turbo',
|
||||||
max_seq_len: int = 4096,
|
max_seq_len: int = 4096,
|
||||||
query_per_second: int = 1,
|
query_per_second: int = 1,
|
||||||
|
rpm_verbose: bool = False,
|
||||||
retry: int = 2,
|
retry: int = 2,
|
||||||
key: Union[str, List[str]] = 'ENV',
|
key: Union[str, List[str]] = 'ENV',
|
||||||
org: Optional[Union[str, List[str]]] = None,
|
org: Optional[Union[str, List[str]]] = None,
|
||||||
@ -70,6 +71,7 @@ class OpenAI(BaseAPIModel):
|
|||||||
max_seq_len=max_seq_len,
|
max_seq_len=max_seq_len,
|
||||||
meta_template=meta_template,
|
meta_template=meta_template,
|
||||||
query_per_second=query_per_second,
|
query_per_second=query_per_second,
|
||||||
|
rpm_verbose=rpm_verbose,
|
||||||
retry=retry)
|
retry=retry)
|
||||||
import tiktoken
|
import tiktoken
|
||||||
self.tiktoken = tiktoken
|
self.tiktoken = tiktoken
|
||||||
|
@ -5,5 +5,6 @@ from .icl_circular_evaluator import CircularEvaluator # noqa
|
|||||||
from .icl_em_evaluator import EMEvaluator # noqa
|
from .icl_em_evaluator import EMEvaluator # noqa
|
||||||
from .icl_hf_evaluator import * # noqa
|
from .icl_hf_evaluator import * # noqa
|
||||||
from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator # noqa
|
from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator # noqa
|
||||||
|
from .icl_misc_evaluator import AveragePPLEvaluator # noqa
|
||||||
from .icl_toxic_evaluator import ToxicEvaluator # noqa
|
from .icl_toxic_evaluator import ToxicEvaluator # noqa
|
||||||
from .lm_evaluator import LMEvaluator # noqa
|
from .lm_evaluator import LMEvaluator # noqa
|
||||||
|
11  opencompass/openicl/icl_evaluator/icl_misc_evaluator.py  Normal file
@ -0,0 +1,11 @@
|
|||||||
|
from opencompass.registry import ICL_EVALUATORS
|
||||||
|
|
||||||
|
from .icl_base_evaluator import BaseEvaluator
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_EVALUATORS.register_module()
|
||||||
|
class AveragePPLEvaluator(BaseEvaluator):
|
||||||
|
|
||||||
|
def score(self, ppl):
|
||||||
|
average_ppl = sum(ppl) / len(ppl)
|
||||||
|
return {'average_ppl': average_ppl}
|
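Usage of the new evaluator is as simple as it looks; a tiny example with made-up perplexities (the import path follows the `__init__.py` change below):

from opencompass.openicl.icl_evaluator import AveragePPLEvaluator

evaluator = AveragePPLEvaluator()
print(evaluator.score(ppl=[12.3, 9.8, 15.1]))  # {'average_ppl': 12.4}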
@ -4,6 +4,8 @@ from .icl_base_inferencer import BaseInferencer # noqa
|
|||||||
from .icl_chat_inferencer import ChatInferencer # noqa
|
from .icl_chat_inferencer import ChatInferencer # noqa
|
||||||
from .icl_clp_inferencer import CLPInferencer # noqa
|
from .icl_clp_inferencer import CLPInferencer # noqa
|
||||||
from .icl_gen_inferencer import GenInferencer # noqa
|
from .icl_gen_inferencer import GenInferencer # noqa
|
||||||
|
from .icl_loglikelihood_inferencer import LoglikelihoodInferencer # noqa
|
||||||
from .icl_ppl_inferencer import PPLInferencer # noqa
|
from .icl_ppl_inferencer import PPLInferencer # noqa
|
||||||
|
from .icl_ppl_only_inferencer import PPLOnlyInferencer # noqa
|
||||||
from .icl_sc_inferencer import SCInferencer # noqa
|
from .icl_sc_inferencer import SCInferencer # noqa
|
||||||
from .icl_tot_inferencer import ToTInferencer # noqa
|
from .icl_tot_inferencer import ToTInferencer # noqa
|
||||||
|
@ -89,7 +89,7 @@ class AgentInferencer(ChatInferencer):
|
|||||||
|
|
||||||
user_idx = assistant_indices[-1] - 1
|
user_idx = assistant_indices[-1] - 1
|
||||||
self.model.set_history(chat[:user_idx])
|
self.model.set_history(chat[:user_idx])
|
||||||
answer, steps = self.model.chat(chat[user_idx]['content'])
|
answer, steps, _ = self.model.chat(chat[user_idx]['content'])
|
||||||
output_handler.save_results(
|
output_handler.save_results(
|
||||||
origin_prompt=chat[user_idx]['content'],
|
origin_prompt=chat[user_idx]['content'],
|
||||||
prediction=answer,
|
prediction=answer,
|
||||||
@ -104,10 +104,11 @@ class AgentInferencer(ChatInferencer):
|
|||||||
i for i, item in enumerate(chat) if item['role'] == 'assistant'
|
i for i, item in enumerate(chat) if item['role'] == 'assistant'
|
||||||
]
|
]
|
||||||
|
|
||||||
self.model.set_history(chat[:assistant_indices[0] - 1])
|
history = chat[:assistant_indices[0] - 1]
|
||||||
|
|
||||||
for i in assistant_indices:
|
for i in assistant_indices:
|
||||||
answer, steps = self.model.chat(chat[i - 1]['content'])
|
answer, steps, inner_steps = self.model.chat(
|
||||||
|
chat[i - 1]['content'], history)
|
||||||
|
history += inner_steps
|
||||||
output_handler.save_multiround_results(
|
output_handler.save_multiround_results(
|
||||||
origin_prompt=chat[i - 1]['content'],
|
origin_prompt=chat[i - 1]['content'],
|
||||||
prediction=answer,
|
prediction=answer,
|
||||||
@ -125,7 +126,7 @@ class AgentInferencer(ChatInferencer):
|
|||||||
|
|
||||||
for i in assistant_indices:
|
for i in assistant_indices:
|
||||||
self.model.set_history(chat[:i - 1])
|
self.model.set_history(chat[:i - 1])
|
||||||
answer, steps = self.model.chat(chat[i - 1]['content'])
|
answer, steps, _ = self.model.chat(chat[i - 1]['content'])
|
||||||
output_handler.save_multiround_results(
|
output_handler.save_multiround_results(
|
||||||
origin_prompt=chat[i - 1]['content'],
|
origin_prompt=chat[i - 1]['content'],
|
||||||
prediction=answer,
|
prediction=answer,
|
||||||
|
@ -68,11 +68,11 @@ class LMTemplateParser:
|
|||||||
prompt = ''
|
prompt = ''
|
||||||
if self.roles:
|
if self.roles:
|
||||||
for dialog in chat:
|
for dialog in chat:
|
||||||
role_cfg = self.roles.get(dialog['role'])
|
role_cfg = self.roles.get(dialog['role'], {})
|
||||||
prompt += role_cfg['begin']
|
prompt += (role_cfg.get('begin') or '')
|
||||||
prompt += (dialog.get('content') or '')
|
prompt += (dialog.get('content') or '')
|
||||||
prompt += role_cfg['end']
|
prompt += (role_cfg.get('end') or '')
|
||||||
prompt += self.roles['assistant']['begin']
|
prompt += (self.roles['assistant'].get('begin') or '')
|
||||||
else:
|
else:
|
||||||
# in case the model does not have any meta template
|
# in case the model does not have any meta template
|
||||||
last_sep = ''
|
last_sep = ''
|
||||||
@ -227,9 +227,13 @@ class ChatInferencer(BaseInferencer):
|
|||||||
'tmp_' + output_json_filename)
|
'tmp_' + output_json_filename)
|
||||||
if osp.exists(tmp_json_filepath):
|
if osp.exists(tmp_json_filepath):
|
||||||
# TODO: move resume to output handler
|
# TODO: move resume to output handler
|
||||||
tmp_result_dict = mmengine.load(tmp_json_filepath)
|
try:
|
||||||
output_handler.results_dict = tmp_result_dict
|
tmp_result_dict = mmengine.load(tmp_json_filepath)
|
||||||
index = len(tmp_result_dict)
|
except Exception:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
output_handler.results_dict = tmp_result_dict
|
||||||
|
index = len(tmp_result_dict)
|
||||||
|
|
||||||
# 4. Wrap prompts with Dataloader
|
# 4. Wrap prompts with Dataloader
|
||||||
dataloader = self.get_dataloader(chat_list[index:], batch_size=1)
|
dataloader = self.get_dataloader(chat_list[index:], batch_size=1)
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
"""Direct Generation Inferencer."""
|
"""Direct Generation Inferencer."""
|
||||||
|
|
||||||
|
import inspect
|
||||||
import os
|
import os
|
||||||
import os.path as osp
|
import os.path as osp
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
@ -46,6 +47,7 @@ class GenInferencer(BaseInferencer):
|
|||||||
self,
|
self,
|
||||||
model: BaseModel,
|
model: BaseModel,
|
||||||
max_out_len: int,
|
max_out_len: int,
|
||||||
|
stopping_criteria: List[str] = [],
|
||||||
max_seq_len: Optional[int] = None,
|
max_seq_len: Optional[int] = None,
|
||||||
batch_size: Optional[int] = 1,
|
batch_size: Optional[int] = 1,
|
||||||
gen_field_replace_token: Optional[str] = '',
|
gen_field_replace_token: Optional[str] = '',
|
||||||
@ -64,6 +66,7 @@ class GenInferencer(BaseInferencer):
|
|||||||
|
|
||||||
self.gen_field_replace_token = gen_field_replace_token
|
self.gen_field_replace_token = gen_field_replace_token
|
||||||
self.max_out_len = max_out_len
|
self.max_out_len = max_out_len
|
||||||
|
self.stopping_criteria = stopping_criteria
|
||||||
|
|
||||||
if self.model.is_api and save_every is None:
|
if self.model.is_api and save_every is None:
|
||||||
save_every = 1
|
save_every = 1
|
||||||
@ -128,10 +131,14 @@ class GenInferencer(BaseInferencer):
|
|||||||
entry = datum
|
entry = datum
|
||||||
golds = [None for _ in range(len(entry))]
|
golds = [None for _ in range(len(entry))]
|
||||||
# 5-1. Inference with local model
|
# 5-1. Inference with local model
|
||||||
|
extra_gen_kwargs = {}
|
||||||
|
sig = inspect.signature(self.model.generate)
|
||||||
|
if 'stopping_criteria' in sig.parameters:
|
||||||
|
extra_gen_kwargs['stopping_criteria'] = self.stopping_criteria
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
parsed_entries = self.model.parse_template(entry, mode='gen')
|
parsed_entries = self.model.parse_template(entry, mode='gen')
|
||||||
results = self.model.generate_from_template(
|
results = self.model.generate_from_template(
|
||||||
entry, max_out_len=self.max_out_len)
|
entry, max_out_len=self.max_out_len, **extra_gen_kwargs)
|
||||||
generated = results
|
generated = results
|
||||||
|
|
||||||
num_return_sequences = getattr(self.model, 'generation_kwargs',
|
num_return_sequences = getattr(self.model, 'generation_kwargs',
|
||||||
|
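The signature check above lets GenInferencer forward stopping_criteria only to models whose generate() accepts it; the pattern in isolation with toy functions (names assumed):

import inspect

def generate_old(inputs, max_out_len):            # model without the new argument
    ...

def generate_new(inputs, max_out_len, stopping_criteria=[]):
    ...

for fn in (generate_old, generate_new):
    extra = {}
    if 'stopping_criteria' in inspect.signature(fn).parameters:
        extra['stopping_criteria'] = ['Question:']
    print(fn.__name__, extra)
# generate_old {}   /   generate_new {'stopping_criteria': ['Question:']}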
@ -0,0 +1,215 @@
|
|||||||
|
"""PPL Inferencer."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from tqdm import trange
|
||||||
|
|
||||||
|
from opencompass.models.base import BaseModel
|
||||||
|
from opencompass.registry import ICL_INFERENCERS
|
||||||
|
|
||||||
|
from ..icl_prompt_template import PromptTemplate
|
||||||
|
from ..icl_retriever import BaseRetriever
|
||||||
|
from ..utils import get_logger
|
||||||
|
from .icl_base_inferencer import BaseInferencer, dump_results_dict
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_INFERENCERS.register_module()
|
||||||
|
class LoglikelihoodInferencer(BaseInferencer):
|
||||||
|
"""Loglikelihood Inferencer class to evaluate by loglikelihood.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
model (:obj:`BaseModel`, optional): The module to inference.
|
||||||
|
max_seq_len (:obj:`int`): Maximum number of tokenized words allowed by
|
||||||
|
the LM.
|
||||||
|
batch_size (:obj:`int`, optional): Batch size for the :obj:`DataLoader`
|
||||||
|
output_json_filepath (:obj:`str`, optional): File path for output
|
||||||
|
`JSON` file.
|
||||||
|
output_json_filename (:obj:`str`, optional): File name for output
|
||||||
|
`JSON` file.
|
||||||
|
labels (:obj:`List`, optional): A list of labels for all classes.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
model: BaseModel,
|
||||||
|
max_seq_len: Optional[int] = None,
|
||||||
|
batch_size: Optional[int] = 1,
|
||||||
|
output_json_filepath: Optional[str] = './icl_inference_output',
|
||||||
|
output_json_filename: Optional[str] = 'predictions',
|
||||||
|
labels: Optional[List] = None,
|
||||||
|
**kwargs) -> None:
|
||||||
|
super().__init__(
|
||||||
|
model=model,
|
||||||
|
max_seq_len=max_seq_len,
|
||||||
|
batch_size=batch_size,
|
||||||
|
output_json_filename=output_json_filename,
|
||||||
|
output_json_filepath=output_json_filepath,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.labels = labels
|
||||||
|
|
||||||
|
def inference(self,
|
||||||
|
retriever: BaseRetriever,
|
||||||
|
ice_template: Optional[PromptTemplate] = None,
|
||||||
|
prompt_template: Optional[PromptTemplate] = None,
|
||||||
|
output_json_filepath: Optional[str] = None,
|
||||||
|
output_json_filename: Optional[str] = None) -> List:
|
||||||
|
# 1. Preparation for output logs
|
||||||
|
output_handler = LoglikelihoodInferencerOutputHandler()
|
||||||
|
|
||||||
|
sub_predictions = []
|
||||||
|
ppl = []
|
||||||
|
ice = []
|
||||||
|
|
||||||
|
if output_json_filepath is None:
|
||||||
|
output_json_filepath = self.output_json_filepath
|
||||||
|
if output_json_filename is None:
|
||||||
|
output_json_filename = self.output_json_filename
|
||||||
|
|
||||||
|
# 2. Get results of retrieval process
|
||||||
|
ice_idx_list = retriever.retrieve()
|
||||||
|
|
||||||
|
# 3. Get labels of all the classes
|
||||||
|
if self.labels is None:
|
||||||
|
labels = retriever.get_labels(ice_template=ice_template,
|
||||||
|
prompt_template=prompt_template)
|
||||||
|
else:
|
||||||
|
labels = self.labels
|
||||||
|
|
||||||
|
# 4. Generate in-context examples for testing inputs
|
||||||
|
for idx in range(len(ice_idx_list)):
|
||||||
|
ice.append(
|
||||||
|
retriever.generate_ice(ice_idx_list[idx],
|
||||||
|
ice_template=ice_template))
|
||||||
|
output_handler.save_ice(self.model.parse_template(ice, mode='ppl'))
|
||||||
|
|
||||||
|
# 5. Calculating loglikelihood for prompts in each label's class
|
||||||
|
for label in labels:
|
||||||
|
index = 0
|
||||||
|
prompt_list = []
|
||||||
|
sub_ppl_list = []
|
||||||
|
token_num_list = []
|
||||||
|
cont_list = []
|
||||||
|
|
||||||
|
# 5.1 Generate prompts of current label and truncate
|
||||||
|
# TODO: Refactor
|
||||||
|
for idx in range(len(ice_idx_list)):
|
||||||
|
prompt = retriever.generate_label_prompt(
|
||||||
|
idx,
|
||||||
|
ice[idx],
|
||||||
|
label,
|
||||||
|
ice_template=ice_template,
|
||||||
|
prompt_template=prompt_template)
|
||||||
|
if self.max_seq_len is not None:
|
||||||
|
prompt_token_num = self.model.get_token_len_from_template(
|
||||||
|
prompt, mode='ppl')
|
||||||
|
while len(ice_idx_list[idx]
|
||||||
|
) > 0 and prompt_token_num > self.max_seq_len:
|
||||||
|
ice_idx_list[idx] = ice_idx_list[idx][:-1]
|
||||||
|
ice[idx] = retriever.generate_ice(
|
||||||
|
ice_idx_list[idx], ice_template=ice_template)
|
||||||
|
prompt = retriever.generate_label_prompt(
|
||||||
|
idx,
|
||||||
|
ice[idx],
|
||||||
|
label,
|
||||||
|
ice_template=ice_template,
|
||||||
|
prompt_template=prompt_template)
|
||||||
|
prompt_token_num = self.model.get_token_len_from_template( # noqa
|
||||||
|
prompt, mode='ppl') # noqa
|
||||||
|
|
||||||
|
prompt_list.append(prompt)
|
||||||
|
token_num_list.append(prompt_token_num)
|
||||||
|
cont_list.append(retriever.test_ds[idx]['cont'])
|
||||||
|
|
||||||
|
# 5.2 Get PPL
|
||||||
|
logger.info(f"Calculating PPL for prompts labeled '{label}'")
|
||||||
|
for idx in trange(0,
|
||||||
|
len(prompt_list),
|
||||||
|
self.batch_size,
|
||||||
|
disable=not self.is_main_process):
|
||||||
|
sub_prompt_list = prompt_list[idx:idx + self.batch_size]
|
||||||
|
sub_cont_list = cont_list[idx:idx + self.batch_size]
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
# mainly modify compared to PPLInferencer
|
||||||
|
sub_res = self.model.get_loglikelihood_from_template(
|
||||||
|
sub_prompt_list, sub_cont_list).tolist()
|
||||||
|
for res, prompt in zip(
|
||||||
|
sub_res,
|
||||||
|
self.model.parse_template(sub_prompt_list,
|
||||||
|
mode='ppl')):
|
||||||
|
sub_ppl_list.append(res)
|
||||||
|
ice_str = self.model.parse_template(ice[idx], mode='ppl')
|
||||||
|
output_handler.save_prompt_and_loglikelihood(
|
||||||
|
label, prompt.replace(ice_str, ''), prompt, res, index)
|
||||||
|
index = index + 1
|
||||||
|
ppl.append(sub_ppl_list)
|
||||||
|
|
||||||
|
# 6. Take the class with the highest loglikelihood as the prediction
|
||||||
|
ppl = list(zip(*ppl))
|
||||||
|
for single_ppl in ppl:
|
||||||
|
sub_predictions.append(labels[single_ppl.index(max(single_ppl))])
|
||||||
|
output_handler.save_predictions(sub_predictions)
|
||||||
|
|
||||||
|
# 7. Fetch gold answers if exist
|
||||||
|
ds_reader = retriever.dataset_reader
|
||||||
|
if ds_reader.output_column:
|
||||||
|
golds = ds_reader.dataset['test'][ds_reader.output_column]
|
||||||
|
output_handler.save_golds(golds)
|
||||||
|
|
||||||
|
# 8. Output
|
||||||
|
if self.is_main_process:
|
||||||
|
os.makedirs(output_json_filepath, exist_ok=True)
|
||||||
|
output_handler.write_to_json(output_json_filepath,
|
||||||
|
output_json_filename)
|
||||||
|
|
||||||
|
return [
|
||||||
|
sample['prediction']
|
||||||
|
for sample in output_handler.results_dict.values()
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class LoglikelihoodInferencerOutputHandler:
|
||||||
|
results_dict = {}
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self.results_dict = {}
|
||||||
|
|
||||||
|
def write_to_json(self, save_dir: str, filename: str):
|
||||||
|
"""Dump the result to a json file."""
|
||||||
|
dump_results_dict(self.results_dict, os.path.join(save_dir, filename))
|
||||||
|
|
||||||
|
def save_ice(self, ice):
|
||||||
|
for idx, example in enumerate(ice):
|
||||||
|
if str(idx) not in self.results_dict.keys():
|
||||||
|
self.results_dict[str(idx)] = {}
|
||||||
|
self.results_dict[str(idx)]['in-context examples'] = example
|
||||||
|
|
||||||
|
def save_predictions(self, predictions):
|
||||||
|
for idx, prediction in enumerate(predictions):
|
||||||
|
if str(idx) not in self.results_dict.keys():
|
||||||
|
self.results_dict[str(idx)] = {}
|
||||||
|
self.results_dict[str(idx)]['prediction'] = prediction
|
||||||
|
|
||||||
|
def save_prompt_and_loglikelihood(self, label, input, prompt,
|
||||||
|
loglikelihood, idx):
|
||||||
|
if str(idx) not in self.results_dict.keys():
|
||||||
|
self.results_dict[str(idx)] = {}
|
||||||
|
if 'label: ' + str(label) not in self.results_dict[str(idx)].keys():
|
||||||
|
self.results_dict[str(idx)]['label: ' + str(label)] = {}
|
||||||
|
self.results_dict[str(idx)]['label: ' +
|
||||||
|
str(label)]['testing input'] = input
|
||||||
|
self.results_dict[str(idx)]['label: ' + str(label)]['prompt'] = prompt
|
||||||
|
self.results_dict[str(idx)][
|
||||||
|
'label: ' + str(label)]['Loglikelihood'] = loglikelihood
|
||||||
|
|
||||||
|
def save_golds(self, golds):
|
||||||
|
for idx, gold in enumerate(golds):
|
||||||
|
if str(idx) not in self.results_dict.keys():
|
||||||
|
self.results_dict[str(idx)] = {}
|
||||||
|
self.results_dict[str(idx)]['gold'] = gold
|
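To make the selection rule at step 6 concrete, a toy example of picking the label whose continuation scored the highest loglikelihood (the numbers are invented; the variable is still called ppl to match the code above):

labels = ['A', 'B']
# one loglikelihood list per label, one entry per test example
ppl = [[-3.2, -7.1], [-5.0, -2.4]]

predictions = []
for scores in zip(*ppl):                      # iterate per example
    predictions.append(labels[scores.index(max(scores))])
print(predictions)  # ['A', 'B']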
188  opencompass/openicl/icl_inferencer/icl_ppl_only_inferencer.py  Normal file
@ -0,0 +1,188 @@
|
|||||||
|
"""PPL Inferencer."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import mmengine
|
||||||
|
import torch
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from opencompass.models.base import BaseModel
|
||||||
|
from opencompass.registry import ICL_INFERENCERS
|
||||||
|
|
||||||
|
from ..icl_prompt_template import PromptTemplate
|
||||||
|
from ..icl_retriever import BaseRetriever
|
||||||
|
from ..utils import get_logger
|
||||||
|
from .icl_base_inferencer import BaseInferencer, dump_results_dict
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@ICL_INFERENCERS.register_module()
|
||||||
|
class PPLOnlyInferencer(BaseInferencer):
|
||||||
|
"""PPLOnlyInferencer class to calculate PPL and PPL only, no choice is
|
||||||
|
made. This Inferencer is usually used along with AveragePPLEvaluator.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
model (:obj:`BaseModel`, optional): The module to inference.
|
||||||
|
max_seq_len (:obj:`int`): Maximum number of tokenized words allowed by
|
||||||
|
the LM.
|
||||||
|
batch_size (:obj:`int`, optional): Batch size for the :obj:`DataLoader`
|
||||||
|
output_json_filepath (:obj:`str`, optional): File path for output
|
||||||
|
`JSON` file.
|
||||||
|
output_json_filename (:obj:`str`, optional): File name for output
|
||||||
|
`JSON` file.
|
||||||
|
save_every (:obj:`int`, optional): Save intermediate results every
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
model: BaseModel,
|
||||||
|
max_seq_len: Optional[int] = None,
|
||||||
|
batch_size: Optional[int] = 1,
|
||||||
|
output_json_filepath: Optional[str] = './icl_inference_output',
|
||||||
|
output_json_filename: Optional[str] = 'predictions',
|
||||||
|
save_every: Optional[int] = 1,
|
||||||
|
**kwargs) -> None:
|
||||||
|
super().__init__(
|
||||||
|
model=model,
|
||||||
|
max_seq_len=max_seq_len,
|
||||||
|
batch_size=batch_size,
|
||||||
|
output_json_filename=output_json_filename,
|
||||||
|
output_json_filepath=output_json_filepath,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.save_every = save_every
|
||||||
|
|
||||||
|
def inference(self,
|
||||||
|
retriever: BaseRetriever,
|
||||||
|
ice_template: Optional[PromptTemplate] = None,
|
||||||
|
prompt_template: Optional[PromptTemplate] = None,
|
||||||
|
output_json_filepath: Optional[str] = None,
|
||||||
|
output_json_filename: Optional[str] = None) -> List:
|
||||||
|
# 1. Preparation for output logs
|
||||||
|
output_handler = PPLOnlyInferencerOutputHandler()
|
||||||
|
|
||||||
|
if output_json_filepath is None:
|
||||||
|
output_json_filepath = self.output_json_filepath
|
||||||
|
if output_json_filename is None:
|
||||||
|
output_json_filename = self.output_json_filename
|
||||||
|
|
||||||
|
# 2. Get results of retrieval process
|
||||||
|
ice_idx_list = retriever.retrieve()
|
||||||
|
|
||||||
|
# 3. Generate prompts for testing input
|
||||||
|
prompt_list = self.get_generation_prompt_list_from_retriever_indices(
|
||||||
|
ice_idx_list,
|
||||||
|
retriever,
|
||||||
|
max_seq_len=self.max_seq_len,
|
||||||
|
ice_template=ice_template,
|
||||||
|
prompt_template=prompt_template)
|
||||||
|
|
||||||
|
# 3.1 Fetch and zip prompt & gold answer if output column exists
|
||||||
|
ds_reader = retriever.dataset_reader
|
||||||
|
|
||||||
|
assert ds_reader.output_column is None, (
|
||||||
|
'PPLOnlyInferencer supports `output_column=None` only.')
|
||||||
|
|
||||||
|
# Create tmp json file for saving intermediate results and future
|
||||||
|
# resuming
|
||||||
|
index = 0
|
||||||
|
tmp_json_filepath = os.path.join(output_json_filepath,
|
||||||
|
'tmp_' + output_json_filename)
|
||||||
|
if os.path.exists(tmp_json_filepath):
|
||||||
|
# TODO: move resume to output handler
|
||||||
|
try:
|
||||||
|
tmp_result_dict = mmengine.load(tmp_json_filepath)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
output_handler.results_dict = tmp_result_dict
|
||||||
|
index = len(tmp_result_dict)
|
||||||
|
|
||||||
|
# 4. Wrap prompts with Dataloader
|
||||||
|
dataloader = self.get_dataloader(prompt_list[index:], self.batch_size)
|
||||||
|
|
||||||
|
# 5. Inference for prompts in each batch
|
||||||
|
logger.info('Starting inference process...')
|
||||||
|
for datum in tqdm(dataloader, disable=not self.is_main_process):
|
||||||
|
entry = datum
|
||||||
|
# 5-1. Inference with local model
|
||||||
|
with torch.no_grad():
|
||||||
|
ppls = self.model.get_ppl_from_template(entry).tolist()
|
||||||
|
|
||||||
|
parsed_entries = self.model.parse_template(entry, mode='gen')
|
||||||
|
# 5-3. Save current output
|
||||||
|
for prompt, ppl, in zip(parsed_entries, ppls):
|
||||||
|
output_handler.save_results(prompt, ppl, index)
|
||||||
|
index = index + 1
|
||||||
|
|
||||||
|
# 5-4. Save intermediate results
|
||||||
|
if (self.save_every is not None and index % self.save_every == 0
|
||||||
|
and self.is_main_process):
|
||||||
|
output_handler.write_to_json(output_json_filepath,
|
||||||
|
'tmp_' + output_json_filename)
|
||||||
|
|
||||||
|
# 6. Output
|
||||||
|
if self.is_main_process:
|
||||||
|
os.makedirs(output_json_filepath, exist_ok=True)
|
||||||
|
output_handler.write_to_json(output_json_filepath,
|
||||||
|
output_json_filename)
|
||||||
|
if os.path.exists(tmp_json_filepath):
|
||||||
|
os.remove(tmp_json_filepath)
|
||||||
|
|
||||||
|
return [
|
||||||
|
sample['ppl'] for sample in output_handler.results_dict.values()
|
||||||
|
]
|
||||||
|
|
||||||
|
def get_generation_prompt_list_from_retriever_indices(
|
||||||
|
self,
|
||||||
|
ice_idx_list: List[List[int]],
|
||||||
|
retriever: BaseRetriever,
|
||||||
|
max_seq_len: Optional[int] = None,
|
||||||
|
ice_template: Optional[PromptTemplate] = None,
|
||||||
|
prompt_template: Optional[PromptTemplate] = None):
|
||||||
|
prompt_list = []
|
||||||
|
for idx, ice_idx in enumerate(ice_idx_list):
|
||||||
|
ice = retriever.generate_ice(ice_idx, ice_template=ice_template)
|
||||||
|
prompt = retriever.generate_prompt_for_generate_task(
|
||||||
|
idx,
|
||||||
|
ice,
|
||||||
|
ice_template=ice_template,
|
||||||
|
prompt_template=prompt_template)
|
||||||
|
if max_seq_len is not None:
|
||||||
|
prompt_token_num = self.model.get_token_len_from_template(
|
||||||
|
prompt, mode='gen')
|
||||||
|
while len(ice_idx) > 0 and prompt_token_num > max_seq_len:
|
||||||
|
ice_idx = ice_idx[:-1]
|
||||||
|
ice = retriever.generate_ice(ice_idx,
|
||||||
|
ice_template=ice_template)
|
||||||
|
prompt = retriever.generate_prompt_for_generate_task(
|
||||||
|
idx,
|
||||||
|
ice,
|
||||||
|
ice_template=ice_template,
|
||||||
|
prompt_template=prompt_template)
|
||||||
|
prompt_token_num = self.model.get_token_len_from_template(
|
||||||
|
prompt, mode='gen')
|
||||||
|
prompt_list.append(prompt)
|
||||||
|
return prompt_list
|
||||||
|
|
||||||
|
|
||||||
|
class PPLOnlyInferencerOutputHandler:
|
||||||
|
origin_prompt_dict = {}
|
||||||
|
output_dict = {}
|
||||||
|
results_dict = {}
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self.results_dict = {}
|
||||||
|
|
||||||
|
def write_to_json(self, save_dir: str, filename: str):
|
||||||
|
"""Dump the result to a json file."""
|
||||||
|
dump_results_dict(self.results_dict, os.path.join(save_dir, filename))
|
||||||
|
|
||||||
|
def save_results(self, origin_prompt, ppl, idx):
|
||||||
|
self.results_dict[str(idx)] = {
|
||||||
|
'origin_prompt': origin_prompt,
|
||||||
|
'ppl': ppl,
|
||||||
|
}
|
@ -1,10 +1,12 @@
|
|||||||
|
import inspect
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
from typing import Dict, List, Optional
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
from mmengine.config import ConfigDict
|
from mmengine.config import ConfigDict
|
||||||
|
|
||||||
from opencompass.utils import get_logger, task_abbr_from_cfg
|
from opencompass.utils import (dataset_abbr_from_cfg, get_logger,
|
||||||
|
model_abbr_from_cfg, task_abbr_from_cfg)
|
||||||
|
|
||||||
|
|
||||||
class BasePartitioner:
|
class BasePartitioner:
|
||||||
@ -54,8 +56,7 @@ class BasePartitioner:
|
|||||||
List[Dict]: A list of tasks.
|
List[Dict]: A list of tasks.
|
||||||
"""
|
"""
|
||||||
cfg = deepcopy(cfg)
|
cfg = deepcopy(cfg)
|
||||||
models = cfg['models']
|
|
||||||
datasets = cfg['datasets']
|
|
||||||
work_dir = cfg['work_dir']
|
work_dir = cfg['work_dir']
|
||||||
|
|
||||||
add_cfg = {}
|
add_cfg = {}
|
||||||
@ -74,10 +75,11 @@ class BasePartitioner:
|
|||||||
self.logger.debug(f'Key {k} not found in config, ignored.')
|
self.logger.debug(f'Key {k} not found in config, ignored.')
|
||||||
self.logger.debug(f'Additional config: {add_cfg}')
|
self.logger.debug(f'Additional config: {add_cfg}')
|
||||||
|
|
||||||
tasks = self.partition(models,
|
model_and_dataset_args = self.parse_model_dataset_args(cfg)
|
||||||
datasets,
|
|
||||||
work_dir,
|
tasks = self.partition(**model_and_dataset_args,
|
||||||
self.out_dir,
|
work_dir=work_dir,
|
||||||
|
out_dir=self.out_dir,
|
||||||
add_cfg=add_cfg)
|
add_cfg=add_cfg)
|
||||||
|
|
||||||
self.logger.info(f'Partitioned into {len(tasks)} tasks.')
|
self.logger.info(f'Partitioned into {len(tasks)} tasks.')
|
||||||
@ -86,6 +88,41 @@ class BasePartitioner:
|
|||||||
|
|
||||||
return tasks
|
return tasks
|
||||||
|
|
||||||
|
def parse_model_dataset_args(self, cfg: ConfigDict):
|
||||||
|
models = cfg['models']
|
||||||
|
datasets = cfg['datasets']
|
||||||
|
|
||||||
|
sig = inspect.signature(self.partition)
|
||||||
|
if 'model_dataset_combinations' in sig.parameters:
|
||||||
|
combs = cfg.get('model_dataset_combinations', None)
|
||||||
|
if combs is None:
|
||||||
|
combs = [{'models': models, 'datasets': datasets}]
|
||||||
|
else:
|
||||||
|
# sanity check
|
||||||
|
model_abbrs = [model_abbr_from_cfg(model) for model in models]
|
||||||
|
dataset_abbrs = [
|
||||||
|
dataset_abbr_from_cfg(dataset) for dataset in datasets
|
||||||
|
]
|
||||||
|
for comb in combs:
|
||||||
|
for model in comb['models']:
|
||||||
|
if model_abbr_from_cfg(model) not in model_abbrs:
|
||||||
|
raise ValueError(
|
||||||
|
f'Model {model_abbr_from_cfg(model)} '
|
||||||
|
'not found in config.')
|
||||||
|
for dataset in comb['datasets']:
|
||||||
|
if dataset_abbr_from_cfg(dataset) not in dataset_abbrs:
|
||||||
|
raise ValueError(
|
||||||
|
f'Dataset {dataset_abbr_from_cfg(dataset)} '
|
||||||
|
'not found in config.')
|
||||||
|
used_kwargs = {'model_dataset_combinations': combs}
|
||||||
|
else:
|
||||||
|
if cfg.get('model_dataset_combinations', None) is not None:
|
||||||
|
self.logger.warning(
|
||||||
|
'model_dataset_combinations is not supported by '
|
||||||
|
f'{self.__class__.__name__}. Ignored.')
|
||||||
|
used_kwargs = {'models': models, 'datasets': datasets}
|
||||||
|
return used_kwargs
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def partition(self,
|
def partition(self,
|
||||||
models: List[ConfigDict],
|
models: List[ConfigDict],
|
||||||
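A hedged sketch of what a model_dataset_combinations entry in an eval config might look like under the new partitioner logic; the model and dataset abbreviations below are placeholders, not configs from this repo.

model_dataset_combinations = [
    # run the chat model only on the agent-style benchmarks
    dict(models=[chatglm3_6b_model], datasets=[*agent_datasets]),
    # run every model on the plain generation benchmarks
    dict(models=[chatglm3_6b_model, llama2_7b_chat_model], datasets=[*gen_datasets]),
]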
|
@ -29,8 +29,8 @@ class NaivePartitioner(BasePartitioner):
|
|||||||
self.n = n
|
self.n = n
|
||||||
|
|
||||||
def partition(self,
|
def partition(self,
|
||||||
models: List[ConfigDict],
|
model_dataset_combinations: List[Dict[str,
|
||||||
datasets: List[ConfigDict],
|
List[ConfigDict]]],
|
||||||
work_dir: str,
|
work_dir: str,
|
||||||
out_dir: str,
|
out_dir: str,
|
||||||
add_cfg: Dict = {}) -> List[Dict]:
|
add_cfg: Dict = {}) -> List[Dict]:
|
||||||
@ -48,8 +48,9 @@ class NaivePartitioner(BasePartitioner):
|
|||||||
}
|
}
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
models (List[ConfigDict]): A list of model configs.
|
model_dataset_combinations (List[Dict]): List of
|
||||||
datasets (List[ConfigDict]): A list of dataset configs.
|
`{models: [...], datasets: [...]}` dicts. Each dict contains
|
||||||
|
a list of model configs and a list of dataset configs.
|
||||||
work_dir (str): The work dir for the task.
|
work_dir (str): The work dir for the task.
|
||||||
out_dir (str): The full output path for the task, intended for
|
out_dir (str): The full output path for the task, intended for
|
||||||
Partitioners to check whether the task is finished via the
|
Partitioners to check whether the task is finished via the
|
||||||
@ -60,20 +61,21 @@ class NaivePartitioner(BasePartitioner):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
tasks = []
|
tasks = []
|
||||||
for model in models:
|
for comb in model_dataset_combinations:
|
||||||
chunks = []
|
for model in comb['models']:
|
||||||
for dataset in datasets:
|
chunks = []
|
||||||
filename = get_infer_output_path(model, dataset, out_dir)
|
for dataset in comb['datasets']:
|
||||||
if osp.exists(filename):
|
filename = get_infer_output_path(model, dataset, out_dir)
|
||||||
continue
|
if osp.exists(filename):
|
||||||
chunks.append(dataset)
|
continue
|
||||||
|
chunks.append(dataset)
|
||||||
|
|
||||||
for i in range(0, len(chunks), self.n):
|
for i in range(0, len(chunks), self.n):
|
||||||
task = Config({
|
task = Config({
|
||||||
'models': [model],
|
'models': [model],
|
||||||
'datasets': [chunks[i:i + self.n]],
|
'datasets': [chunks[i:i + self.n]],
|
||||||
'work_dir': work_dir,
|
'work_dir': work_dir,
|
||||||
**add_cfg
|
**add_cfg
|
||||||
})
|
})
|
||||||
tasks.append(task)
|
tasks.append(task)
|
||||||
return tasks
|
return tasks
|
||||||
|
@ -51,8 +51,8 @@ class SizePartitioner(BasePartitioner):
|
|||||||
self.strategy = strategy
|
self.strategy = strategy
|
||||||
|
|
||||||
def partition(self,
|
def partition(self,
|
||||||
models: List[ConfigDict],
|
model_dataset_combinations: List[Dict[str,
|
||||||
datasets: List[ConfigDict],
|
List[ConfigDict]]],
|
||||||
work_dir: str,
|
work_dir: str,
|
||||||
out_dir: str,
|
out_dir: str,
|
||||||
add_cfg: Dict = {}) -> List[ConfigDict]:
|
add_cfg: Dict = {}) -> List[ConfigDict]:
|
||||||
@ -71,8 +71,9 @@ class SizePartitioner(BasePartitioner):
|
|||||||
}
|
}
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
models (List[ConfigDict]): A list of model configs.
|
model_dataset_combinations (List[Dict]): List of
|
||||||
datasets (List[ConfigDict]): A list of dataset configs.
|
`{models: [...], datasets: [...]}` dicts. Each dict contains
|
||||||
|
a list of model configs and a list of dataset configs.
|
||||||
work_dir (str): The work dir for the task.
|
work_dir (str): The work dir for the task.
|
||||||
out_dir (str): The full output path for the task, intended for
|
out_dir (str): The full output path for the task, intended for
|
||||||
Partitioners to check whether the task is finished via the
|
Partitioners to check whether the task is finished via the
|
||||||
@@ -84,52 +85,54 @@ class SizePartitioner(BasePartitioner):
             List[ConfigDict]: A list of tasks.
         """
-        datasets = sorted(datasets,
-                          key=lambda x: self.get_cost(x),
-                          reverse=True)
         tasks = []
-        for model in models:
-            chunks = []  # elements: tuple(size, dataset_chunk)
-            for dataset in datasets:
-                filename = get_infer_output_path(model, dataset, out_dir)
-                # skip the task if the task output exists
-                if osp.exists(filename):
-                    continue
-                dataset_size = self.get_cost(dataset)
-                if dataset_size > self.max_task_size:
-                    root, ext = osp.splitext(filename)
-                    dataset_splits = self.split_dataset(dataset)
-                    for i, dataset_split in enumerate(dataset_splits):
-                        if not osp.exists(f'{root}_{i}{ext}'):
-                            chunks.append((self.max_task_size, dataset_split))
-                else:
-                    chunks.append((dataset_size, dataset))
-
-            if self.strategy == 'heuristic':
-                chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
-                current_size, current_chunks = 0, []
-                for index in range(len(chunks)):
-                    current_size += chunks[index][0]
-                    current_chunks.append(chunks[index][1])
-                    if index == len(chunks) - 1 or current_size + chunks[
-                            index + 1][0] > self.max_task_size:
-                        tasks.append(
-                            Config({
-                                'models': [model],
-                                'datasets': [current_chunks],
-                                'work_dir': work_dir,
-                                **add_cfg
-                            }))
-                        current_size, current_chunks = 0, []
-            elif self.strategy == 'split':
-                for _, dataset in chunks:
-                    tasks.append(
-                        Config({
-                            'models': [model],
-                            'datasets': [[dataset]],
-                            'work_dir': work_dir,
-                            **add_cfg
-                        }))
+        for comb in model_dataset_combinations:
+            comb['datasets'] = sorted(comb['datasets'],
+                                      key=lambda x: self.get_cost(x),
+                                      reverse=True)
+            for model in comb['models']:
+                chunks = []  # elements: tuple(size, dataset_chunk)
+                for dataset in comb['datasets']:
+                    filename = get_infer_output_path(model, dataset, out_dir)
+                    # skip the task if the task output exists
+                    if osp.exists(filename):
+                        continue
+                    dataset_size = self.get_cost(dataset)
+                    if dataset_size > self.max_task_size:
+                        root, ext = osp.splitext(filename)
+                        dataset_splits = self.split_dataset(dataset)
+                        for i, dataset_split in enumerate(dataset_splits):
+                            if not osp.exists(f'{root}_{i}{ext}'):
+                                chunks.append(
+                                    (self.max_task_size, dataset_split))
+                    else:
+                        chunks.append((dataset_size, dataset))
+
+                if self.strategy == 'heuristic':
+                    chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
+                    current_size, current_chunks = 0, []
+                    for index in range(len(chunks)):
+                        current_size += chunks[index][0]
+                        current_chunks.append(chunks[index][1])
+                        if index == len(chunks) - 1 or current_size + chunks[
+                                index + 1][0] > self.max_task_size:
+                            tasks.append(
+                                Config({
+                                    'models': [model],
+                                    'datasets': [current_chunks],
+                                    'work_dir': work_dir,
+                                    **add_cfg
+                                }))
+                            current_size, current_chunks = 0, []
+                elif self.strategy == 'split':
+                    for _, dataset in chunks:
+                        tasks.append(
+                            Config({
+                                'models': [model],
+                                'datasets': [[dataset]],
+                                'work_dir': work_dir,
+                                **add_cfg
+                            }))
         return tasks
 
     @property
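For intuition, the 'heuristic' strategy above is a greedy packing over per-dataset costs: chunks are sorted by size, then accumulated until adding the next one would exceed max_task_size, whereas 'split' emits one task per dataset chunk. A self-contained sketch of the greedy branch over plain numbers (illustrative sizes, not real get_cost values):

# Greedy packing sketch: group (size, name) chunks so each group's total
# stays within max_task_size, mirroring the 'heuristic' branch above.
max_task_size = 100
chunks = [(80, 'mmlu'), (60, 'math'), (30, 'gsm8k'), (20, 'bbh')]

chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
groups, current_size, current = [], 0, []
for index in range(len(chunks)):
    current_size += chunks[index][0]
    current.append(chunks[index][1])
    if index == len(chunks) - 1 or \
            current_size + chunks[index + 1][0] > max_task_size:
        groups.append(current)
        current_size, current = 0, []

print(groups)  # [['mmlu'], ['math', 'gsm8k'], ['bbh']]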
@@ -13,7 +13,7 @@ from mmengine.config import ConfigDict
 from tqdm import tqdm
 
 from opencompass.registry import RUNNERS, TASKS
-from opencompass.utils import get_logger
+from opencompass.utils import batched, get_logger
 
 from .base import BaseRunner
 
@@ -131,15 +131,22 @@ class SlurmSequentialRunner(BaseRunner):
                     break
             parent_conn.close()
 
-        for job_id in tqdm(job_ids, desc='clear sruns'):
-            if job_id is None:
-                continue
-            cmd = f'scancel {job_id}'
-            p = subprocess.Popen(cmd,
-                                 shell=True,
-                                 stdout=subprocess.PIPE,
-                                 stderr=subprocess.STDOUT)
-            p.wait()
+        tbar = tqdm(total=len(job_ids), desc='clear sruns')
+        for batched_job_ids in batched(job_ids, 4):
+            ps = []
+            for job_id in batched_job_ids:
+                tbar.update()
+                if job_id is None:
+                    continue
+                cmd = f'scancel {job_id}'
+                p = subprocess.Popen(cmd,
+                                     shell=True,
+                                     stdout=subprocess.PIPE,
+                                     stderr=subprocess.STDOUT)
+                ps.append(p)
+            for p in ps:
+                p.wait()
+        tbar.close()
 
     def _launch(self, cfg: ConfigDict, child_conn: Pipe = None):
         logger = get_logger()
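The `batched` helper imported above is not shown in this diff; a common implementation of such a utility (an assumption here, not necessarily OpenCompass's exact code) groups an iterable into fixed-size tuples:

from itertools import islice
from typing import Iterable, Iterator, Tuple, TypeVar

T = TypeVar('T')


def batched(iterable: Iterable[T], n: int) -> Iterator[Tuple[T, ...]]:
    """Yield successive tuples of at most n items (sketch of assumed behaviour)."""
    it = iter(iterable)
    while batch := tuple(islice(it, n)):
        yield batch


print(list(batched(['a', 'b', 'c', 'd', 'e'], 2)))
# [('a', 'b'), ('c', 'd'), ('e',)]

With that behaviour, the runner launches up to four scancel processes at a time and waits on the whole batch, instead of cancelling jobs strictly one by one.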
@@ -121,8 +121,9 @@ class OpenICLEvalTask(BaseTask):
         pred_dicts = copy.deepcopy(preds)
         preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}
 
-        pred_strs = preds.pop('prediction')
-        pred_list_flag = isinstance(pred_strs[0], list)
+        pred_strs = preds.pop('prediction', None)
+        pred_list_flag = pred_strs is not None and isinstance(
+            pred_strs[0], list)
         if ('pred_role' in self.eval_cfg
                 and 'meta_template' in self.model_cfg
                 and not MODELS.get(self.model_cfg['type']).is_api):
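The change makes the missing-key case explicit: when no 'prediction' field is present, pred_strs becomes None and the short-circuit keeps the isinstance check from raising. A tiny illustration with a hypothetical preds dict:

# Hypothetical preds dict without a 'prediction' key.
preds = {'origin_prompt': ['...']}

pred_strs = preds.pop('prediction', None)      # None instead of KeyError
pred_list_flag = pred_strs is not None and isinstance(pred_strs[0], list)

print(pred_strs, pred_list_flag)  # None False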
@@ -166,6 +167,12 @@ class OpenICLEvalTask(BaseTask):
             ]
 
         icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator'])
+        # need results dir to save other files
+        out_path = get_infer_output_path(
+            self.model_cfg, self.dataset_cfg,
+            osp.join(self.work_dir, 'results'))
+        icl_evaluator._out_dir = osp.splitext(out_path)[
+            0]  # strip extension
         preds['predictions'] = pred_strs
         preds['references'] = (test_set[self.output_column]
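The added lines hand the evaluator an extension-free prefix under the results directory so it can write auxiliary files next to its scores. A small sketch of the path handling, assuming (not confirmed by this diff) that get_infer_output_path lays results out as <results_dir>/<model_abbr>/<dataset_abbr>.json:

import os.path as osp

# Hypothetical paths; the exact layout produced by get_infer_output_path
# is an assumption here.
work_dir = 'outputs/demo'
out_path = osp.join(work_dir, 'results', 'llama-7b', 'gsm8k.json')

out_dir = osp.splitext(out_path)[0]   # strip the .json extension
print(out_dir)  # outputs/demo/results/llama-7b/gsm8k

# An evaluator needing extra artifacts could then write them under this
# prefix, e.g. f'{out_dir}_details.jsonl' (illustrative name only).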
@@ -49,6 +49,14 @@ def first_capital_postprocess(text: str) -> str:
     return ''
 
 
+@TEXT_POSTPROCESSORS.register_module('last-capital')
+def last_capital_postprocess(text: str) -> str:
+    for t in text[::-1]:
+        if t.isupper():
+            return t
+    return ''
+
+
 def first_option_postprocess(text: str, options: str) -> str:
     """Find first valid option for text."""
 
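As a quick check of the new postprocessor, scanning from the end returns the last uppercase letter in the prediction (run here standalone, without the registry decorator):

def last_capital_postprocess(text: str) -> str:
    for t in text[::-1]:
        if t.isupper():
            return t
    return ''


print(last_capital_postprocess('the answer should be: B'))  # B
print(last_capital_postprocess('no capitals here'))         # '' (empty string)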
7	requirements/agent.txt	Normal file
@@ -0,0 +1,7 @@
+json5
+jupyter
+jupyter_client
+jupytext
+lagent
+scikit-image
+sympy
@@ -1,4 +1 @@
 faiss_gpu==1.7.2
-jupyter
-lagent
-scikit-image