[Sync] minor test (#683)

Hubert 2023-12-11 17:42:53 +08:00 committed by GitHub
parent dd4318f6ab
commit e78857ac36
57 changed files with 1468 additions and 314 deletions

.gitignore (vendored)
View File

@ -11,6 +11,7 @@ configs/eval_debug*.py
configs/viz_*.py
data
work_dirs
models
configs/internal/
# Byte-compiled / optimized / DLL files
__pycache__/

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .CIBench_gen_eb42f9 import ci_datasets # noqa: F401, F403
from .CIBench_gen_8ab0dc import ci_datasets # noqa: F401, F403

View File

@ -16,28 +16,20 @@ cibench_infer_cfg = dict(
template="""{questions}""",
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=AgentInferencer),
inferencer=dict(type=AgentInferencer, infer_mode='every'),
)
libs = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
cibench_eval_cfg = {
lib: dict(
evaluator=dict(
type=CIBenchEvaluator,
output_dir=f'output_data/cibench/{lib}'),
pred_role="BOT",
)
for lib in libs
}
cibench_eval_cfg = dict(evaluator=dict(type=CIBenchEvaluator), pred_role="BOT")
cibench_datasets = [
dict(
abbr=f"cibench_{lib}",
abbr=f"cibench_generation_{lib}",
type=CIBenchDataset,
path=f"./data/cibench/{lib}",
reader_cfg=cibench_reader_cfg,
infer_cfg=cibench_infer_cfg,
eval_cfg=cibench_eval_cfg[lib],
eval_cfg=cibench_eval_cfg,
) for lib in libs
]

View File

@ -95,7 +95,7 @@ mathbench_sets = {
# Use circular evaluation or not
with_circular_eval = True
mathbench_code_datasets = []
mathbench_agent_datasets = []
for _split in list(mathbench_sets.keys()):
for _name in mathbench_sets[_split]:
@ -112,13 +112,13 @@ for _split in list(mathbench_sets.keys()):
evaluator=dict(type=CircularEvaluator if 'choice' in _name and with_circular_eval else AccEvaluator),
pred_postprocessor=dict(type=first_option_postprocess, options='ABCD') if 'single_choice' in _name else dict(type=mathbench_postprocess, name=_name))
mathbench_code_datasets.append(
mathbench_agent_datasets.append(
dict(
abbr="mathbench-" + _split + '-' + _name + '-agent',
type=MathBenchDataset,
path=f"./data/mathbench/{_split}",
name=_name,
with_circular=with_circular_eval,
abbr="mathbench-interpreter-" + _split + '-' + _name,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"

View File

@ -6,17 +6,17 @@ from opencompass.datasets import MathBenchDataset, mathbench_postprocess
cloze_prompts ={
"cloze_arith_en": [
dict(role='HUMAN', prompt='Q: Calculate (341/11)/(9/(-6)*(-2)/3).'),
dict(role='BOT', prompt='A: First, (9/(-6)*(-2)/3) can be simplified by : 9/(-6) = -1.5, -1.5 * (-2) = 3, 3 / 3 = 1. So, (9/(-6)*(-2)/3) is equal to 1. Now, we have `(341/11)/1` equals `341/11`. Finally, calculate `341/11 = 31`. The answer is 31.\n'),
dict(role='HUMAN', prompt='Q: In base 14, what is 5 - 638d8d?'),
dict(role='BOT', prompt='A: 5 - 638d8d = -638d88. The answer is -638d88.\n'),
dict(role='HUMAN', prompt='Q: What is -491354 times -0.34?'),
dict(role='BOT', prompt='A: The product of -491354 and -0.34 is 167060.36. The answer is 167060.36.\n'),
dict(role='HUMAN', prompt='Q: What is the value of (-55)/(6930/(-382)) + (0 - 3)?.'),
dict(role='BOT', prompt='A: First, (-55)/(6930/(-382)) = (-55)/(-(6930/382)) = 55*382/6930 = 21010/6930 = 2101/693. Then, 2101/693 + (0 - 3) = 2101/693 - 3 = 2101/693 - 3*693/693 = (2101-2079)/693 = 22/693 = 2/63. The answer is 2/63.\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}\n'),
]
dict(role='HUMAN', prompt='Q: Calculate (341/11)/(9/(-6)*(-2)/3).'),
dict(role='BOT', prompt='A: First, (9/(-6)*(-2)/3) can be simplified by : 9/(-6) = -1.5, -1.5 * (-2) = 3, 3 / 3 = 1. So, (9/(-6)*(-2)/3) is equal to 1. Now, we have `(341/11)/1` equals `341/11`. Finally, calculate `341/11 = 31`. The answer is 31.\n'),
dict(role='HUMAN', prompt='Q: In base 14, what is 5 - 638d8d?'),
dict(role='BOT', prompt='A: 5 - 638d8d = -638d88. The answer is -638d88.\n'),
dict(role='HUMAN', prompt='Q: What is -491354 times -0.34?'),
dict(role='BOT', prompt='A: The product of -491354 and -0.34 is 167060.36. The answer is 167060.36.\n'),
dict(role='HUMAN', prompt='Q: What is the value of (-55)/(6930/(-382)) + (0 - 3)?.'),
dict(role='BOT', prompt='A: First, (-55)/(6930/(-382)) = (-55)/(-(6930/382)) = 55*382/6930 = 21010/6930 = 2101/693. Then, 2101/693 + (0 - 3) = 2101/693 - 3 = 2101/693 - 3*693/693 = (2101-2079)/693 = 22/693 = 2/63. The answer is 2/63.\n'),
dict(role='HUMAN', prompt='Q: {question}'),
dict(role='BOT', prompt='A: {answer}\n'),
]
}
mathbench_sets = {

View File

@ -94,11 +94,11 @@ for _split in list(mathbench_sets.keys()):
mathbench_datasets.append(
dict(
abbr="mathbench-" + _split + '-' + _name,
type=MathBenchDataset,
path=f"./data/mathbench/{_split}",
name=_name,
with_circular=with_circular_eval,
abbr="mathbench-" + _split + '-' + _name,
reader_cfg=dict(
input_columns=["question"],
output_column="answer"

View File

@ -0,0 +1,69 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import (DS1000Dataset, ds1000_completion_postprocess,
ds1000_matplotlib_postprocess,
DS1000Evaluator)
ds1000_reader_cfg = dict(
input_columns=["prompt"],
output_column="test_column",
train_split='test',
test_split='test')
ds1000_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN",
prompt="{prompt}",
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
ds1000_eval_cfg = dict(
evaluator=dict(type=DS1000Evaluator),
pred_role="BOT",
pred_postprocessor=dict(type=ds1000_completion_postprocess),
)
# The DS-1000 dataset can be downloaded from
# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
ds1000_datasets = [
dict(
abbr=f"ds1000_{lib}",
type=DS1000Dataset,
path="./data/ds1000_data/",
libs=f"{lib}",
mode="Completion",
reader_cfg=ds1000_reader_cfg,
infer_cfg=ds1000_infer_cfg,
eval_cfg=ds1000_eval_cfg,
) for lib in [
'Pandas',
'Numpy',
'Tensorflow',
'Scipy',
'Sklearn',
'Pytorch',
]
]
ds1000_datasets.append(
dict(
abbr="ds1000_Matplotlib",
type=DS1000Dataset,
path="./data/ds1000_data/",
libs="Matplotlib",
mode="Completion",
reader_cfg=ds1000_reader_cfg,
infer_cfg=ds1000_infer_cfg,
eval_cfg=dict(
evaluator=dict(type=DS1000Evaluator),
pred_role="BOT",
pred_postprocessor=dict(type=ds1000_matplotlib_postprocess),
),
))
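As a convenience, a minimal download sketch for the DS-1000 data referenced in the comment above; the raw-file URL and the archive layout (unpacking to `ds1000_data/`) are assumptions, and downloading the zip manually works just as well:
```python
# Hypothetical helper, not part of this commit: fetch and unpack DS-1000 so
# that ./data/ds1000_data/ exists, as the config above expects.
import urllib.request
import zipfile

url = "https://github.com/HKUNLP/DS-1000/raw/main/ds1000_data.zip"  # assumed raw link
urllib.request.urlretrieve(url, "ds1000_data.zip")
with zipfile.ZipFile("ds1000_data.zip") as zf:
    zf.extractall("./data/")  # assumes the archive contains a ds1000_data/ folder
```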

View File

@ -0,0 +1,68 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import DS1000Dataset, DS1000ServiceEvaluator
ds1000_reader_cfg = dict(
input_columns=["prompt"],
output_column="test_column",
train_split='test',
test_split='test')
ds1000_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role="HUMAN",
prompt="{prompt}",
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
ds1000_eval_cfg_dict = {
lib: dict(
evaluator=dict(
type=DS1000ServiceEvaluator,
lib=lib,
ip_address="localhost",  # replace with your code_eval_server ip address and port
port=5000
),
pred_role="BOT")
for lib in [
'Pandas',
'Numpy',
'Tensorflow',
'Scipy',
'Sklearn',
'Pytorch',
'Matplotlib',
]
}
# The DS-1000 dataset can be downloaded from
# https://github.com/HKUNLP/DS-1000/blob/main/ds1000_data.zip
ds1000_datasets = [
dict(
abbr=f"ds1000_{lib}",
type=DS1000Dataset,
path="./data/ds1000_data/",
libs=f"{lib}",
mode="Completion",
reader_cfg=ds1000_reader_cfg,
infer_cfg=ds1000_infer_cfg,
eval_cfg=ds1000_eval_cfg_dict[lib],
) for lib in [
'Pandas',
'Numpy',
'Tensorflow',
'Scipy',
'Sklearn',
'Pytorch',
'Matplotlib',
]
]

View File

@ -45,7 +45,7 @@ gsm8k_eval_cfg = dict(
gsm8k_datasets = [
dict(
abbr='gsm8k',
abbr='gsm8k-agent',
type=GSM8KDataset,
path='./data/gsm8k',
reader_cfg=gsm8k_reader_cfg,

View File

@ -0,0 +1,39 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import GSM8KDataset, gsm8k_postprocess, gsm8k_dataset_postprocess, Gsm8kEvaluator
gsm8k_reader_cfg = dict(input_columns=['question'], output_column='answer')
gsm8k_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt="Question: Angelo and Melanie want to plan how many hours over the next week they should study together for their test next week. They have 2 chapters of their textbook to study and 4 worksheets to memorize. They figure out that they should dedicate 3 hours to each chapter of their textbook and 1.5 hours for each worksheet. If they plan to study no more than 4 hours each day, how many days should they plan to study total over the next week if they take a 10-minute break every hour, include 3 10-minute snack breaks each day, and 30 minutes for lunch each day?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt="Angelo and Melanie think they should dedicate 3 hours to each of the 2 chapters, 3 hours x 2 chapters = 6 hours total.\nFor the worksheets they plan to dedicate 1.5 hours for each worksheet, 1.5 hours x 4 worksheets = 6 hours total.\nAngelo and Melanie need to start with planning 12 hours to study, at 4 hours a day, 12 / 4 = 3 days.\nHowever, they need to include time for breaks and lunch. Every hour they want to include a 10-minute break, so 12 total hours x 10 minutes = 120 extra minutes for breaks.\nThey also want to include 3 10-minute snack breaks, 3 x 10 minutes = 30 minutes.\nAnd they want to include 30 minutes for lunch each day, so 120 minutes for breaks + 30 minutes for snack breaks + 30 minutes for lunch = 180 minutes, or 180 / 60 minutes per hour = 3 extra hours.\nSo Angelo and Melanie want to plan 12 hours to study + 3 hours of breaks = 15 hours total.\nThey want to study no more than 4 hours each day, 15 hours / 4 hours each day = 3.75\nThey will need to plan to study 4 days to allow for all the time they need.\nThe answer is 4\n"),
dict(role='HUMAN', prompt="Question: Mark's basketball team scores 25 2 pointers, 8 3 pointers and 10 free throws. Their opponents score double the 2 pointers but half the 3 pointers and free throws. What's the total number of points scored by both teams added together?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt="Mark's team scores 25 2 pointers, meaning they scored 25*2= 50 points in 2 pointers.\nHis team also scores 6 3 pointers, meaning they scored 8*3= 24 points in 3 pointers\nThey scored 10 free throws, and free throws count as one point so they scored 10*1=10 points in free throws.\nAll together his team scored 50+24+10= 84 points\nMark's opponents scored double his team's number of 2 pointers, meaning they scored 50*2=100 points in 2 pointers.\nHis opponents scored half his team's number of 3 pointers, meaning they scored 24/2= 12 points in 3 pointers.\nThey also scored half Mark's team's points in free throws, meaning they scored 10/2=5 points in free throws.\nAll together Mark's opponents scored 100+12+5=117 points\nThe total score for the game is both team's scores added together, so it is 84+117=201 points\nThe answer is 201\n"),
dict(role='HUMAN', prompt="Question: Bella has two times as many marbles as frisbees. She also has 20 more frisbees than deck cards. If she buys 2/5 times more of each item, what would be the total number of the items she will have if she currently has 60 marbles?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt="When Bella buys 2/5 times more marbles, she'll have increased the number of marbles by 2/5*60 = 24\nThe total number of marbles she'll have is 60+24 = 84\nIf Bella currently has 60 marbles, and she has two times as many marbles as frisbees, she has 60/2 = 30 frisbees.\nIf Bella buys 2/5 times more frisbees, she'll have 2/5*30 = 12 more frisbees.\nThe total number of frisbees she'll have will increase to 30+12 = 42\nBella also has 20 more frisbees than deck cards, meaning she has 30-20 = 10 deck cards\nIf she buys 2/5 times more deck cards, she'll have 2/5*10 = 4 more deck cards.\nThe total number of deck cards she'll have is 10+4 = 14\nTogether, Bella will have a total of 14+42+84 = 140 items\nThe answer is 140\n"),
dict(role='HUMAN', prompt="Question: A group of 4 fruit baskets contains 9 apples, 15 oranges, and 14 bananas in the first three baskets and 2 less of each fruit in the fourth basket. How many fruits are there?\nLet's think step by step\nAnswer:"),
dict(role='BOT', prompt="For the first three baskets, the number of apples and oranges in one basket is 9+15=24\nIn total, together with bananas, the number of fruits in one basket is 24+14=38 for the first three baskets.\nSince there are three baskets each having 38 fruits, there are 3*38=114 fruits in the first three baskets.\nThe number of apples in the fourth basket is 9-2=7\nThere are also 15-2=13 oranges in the fourth basket\nThe combined number of oranges and apples in the fourth basket is 13+7=20\nThe fourth basket also contains 14-2=12 bananas.\nIn total, the fourth basket has 20+12=32 fruits.\nThe four baskets together have 32+114=146 fruits.\nThe answer is 146\n"),
dict(role='HUMAN', prompt="Question: {question}\nLet's think step by step\nAnswer:"),
],
)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512, stopping_criteria=[":", "Question:", "Question"]))
gsm8k_eval_cfg = dict(evaluator=dict(type=Gsm8kEvaluator),
pred_postprocessor=dict(type=gsm8k_postprocess),
dataset_postprocessor=dict(type=gsm8k_dataset_postprocess))
gsm8k_datasets = [
dict(
abbr='gsm8k',
type=GSM8KDataset,
path='./data/gsm8k',
reader_cfg=gsm8k_reader_cfg,
infer_cfg=gsm8k_infer_cfg,
eval_cfg=gsm8k_eval_cfg)
]

View File

@ -0,0 +1,57 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLOnlyInferencer
from opencompass.openicl.icl_evaluator import AveragePPLEvaluator
from opencompass.datasets import GSM8KDataset, GSM8KReferenceSkywork
gsm8k_datasets = []
gsm8k_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template="{question} {answer}"),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLOnlyInferencer),
)
gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))
for split in ['train', 'test']:
gsm8k_reader_cfg = dict(
input_columns=['question', 'answer'],
output_column=None,
train_split=split,
test_split=split,
)
gsm8k_datasets.append(
dict(
abbr=f'gsm8k-{split}-ppl',
type=GSM8KDataset,
path='./data/gsm8k',
reader_cfg=gsm8k_reader_cfg,
infer_cfg=gsm8k_infer_cfg,
eval_cfg=gsm8k_eval_cfg)
)
gsm8k_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template="{text}"),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLOnlyInferencer),
)
gsm8k_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))
gsm8k_reader_cfg = dict(
input_columns=['text'],
output_column=None,
)
gsm8k_datasets.append(
dict(
abbr=f'gsm8k-ref-ppl',
type=GSM8KReferenceSkywork,
path='./data/gsm8k-extra/mock_gsm8k_test.jsonl',
reader_cfg=gsm8k_reader_cfg,
infer_cfg=gsm8k_infer_cfg,
eval_cfg=gsm8k_eval_cfg
)
)

View File

@ -79,7 +79,7 @@ math_eval_cfg = dict(
math_datasets = [
dict(
abbr='math',
abbr='math-agent',
type=MATHDataset,
path='./data/math/math.json',
reader_cfg=math_reader_cfg,

View File

@ -1,4 +1,4 @@
from mmengine.config import read_base
with read_base():
from .winogrande_ppl_55a66e import winogrande_datasets # noqa: F401, F403
from .winogrande_ppl_8be6c3 import winogrande_datasets # noqa: F401, F403

View File

@ -4,6 +4,10 @@ from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset
# WARNING: This config cannot reproduce results in the paper.
# e.g. LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config)
# Please try winogrande_ppl_8be6c3
winogrande_reader_cfg = dict(
input_columns=['opt1', 'opt2'],
output_column='answer',

View File

@ -0,0 +1,33 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import LoglikelihoodInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset
winogrande_reader_cfg = dict(
input_columns=['opt1', 'opt2'],
output_column='answer',
)
winogrande_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
1: "{opt1}",
2: "{opt2}",
}
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=LoglikelihoodInferencer))
winogrande_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
winogrande_datasets = [
dict(
abbr='winogrande',
type=winograndeDataset,
path='./data/winogrande',
reader_cfg=winogrande_reader_cfg,
infer_cfg=winogrande_infer_cfg,
eval_cfg=winogrande_eval_cfg)
]

View File

@ -4,6 +4,10 @@ from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset
# WARNING: This config cannot reproduce results in the paper.
# e.g. LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config)
# Please try winogrande_ppl_8be6c3
winogrande_reader_cfg = dict(
input_columns=['opt1', 'opt2'],
output_column='answer',

View File

@ -4,11 +4,20 @@ from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.models.lagent import LagentAgent
from lagent import PythonInterpreter, ReAct
from opencompass.lagent.actions.python_interpreter import PythonInterpreter
from lagent import ReAct
from lagent.agents.react import ReActProtocol
with read_base():
from .datasets.gsm8k.gsm8k_agent_gen_3ac57d import gsm8k_datasets as datasets
from .datasets.gsm8k.gsm8k_agent_gen_3ac57d import gsm8k_datasets
from .datasets.math.math_agent_gen_861b4f import math_datasets
from .datasets.MathBench.mathbench_agent_gen_568903 import mathbench_agent_datasets
from .summarizers.math_agent import summarizer
datasets = []
datasets += gsm8k_datasets
datasets += math_datasets
datasets += mathbench_agent_datasets
system_prompt = """You are a helpful assistant which use tools to solve mathematical reasoning questions. The code must be a function, and the function name must be 'solution'. For mathematics, please use code tool to calculate. The example format is as follows:
```

View File

@ -10,7 +10,7 @@ from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
with read_base():
from .datasets.CIBench.CIBench_gen_eb42f9 import \
from .datasets.CIBench.CIBench_gen_8ab0dc import \
cibench_datasets as datasets
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
@ -36,7 +36,21 @@ Also please follow the guidelines:
3. The generated codes will be executed in an ipython manner and the results will be cached.
4. Your responded code should always be simple and only solves the problem in current step.
Begin!
For example:
File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
data = pd.read_csv(url)
```
{response} The code succeeded without any outputs.
Let us begin from here!
"""
IPYTHON_INTERPRETER_DESCRIPTION = '''\
@ -69,9 +83,6 @@ models = [
),
]
for dataset in datasets:
# Evaluate on every assistant response
dataset['infer_cfg']['inferencer']['infer_mode'] = 'every'
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),

View File

@ -1,56 +0,0 @@
from mmengine.config import read_base
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.models.lagent import LagentAgent
from lagent import PythonInterpreter, ReAct
from lagent.agents.react import ReActProtocol
system_prompt = """You are a helpful assistant which use tools to solve mathematical reasoning questions. The code must be a function, and the function name must be 'solution'. For mathematics, please use code tool to calculate. The example format is as follows:
```
def solution():
variable_names_with_real_meaning = func(variable)
return variable_names_with_real_meaning
```"""
protocol = dict(
type=ReActProtocol,
action=dict(role="ACTION", begin="Tool:", end="\n"),
action_input=dict(role="ARGS", begin="Tool Input:", end="\n"),
finish=dict(role="FINISH", begin="FinalAnswer:", end="\n"),
call_protocol=system_prompt,
)
with read_base():
from .datasets.MathBench.mathbench_code_gen_568903 import mathbench_code_datasets as datasets
from .summarizers.mathbench import summarizer
models = [
dict(
abbr='gpt-3.5-react',
type=LagentAgent,
agent_type=ReAct,
max_turn=3,
llm=dict(
type=OpenAI,
path='gpt-3.5-turbo',
key='ENV',
query_per_second=1,
max_seq_len=4096,
),
actions=[
dict(type=PythonInterpreter),
],
protocol=protocol,
batch_size=1,
),
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=1000),
runner=dict(
type=LocalRunner,
max_num_workers=16,
task=dict(type=OpenICLInferTask)),
)

View File

@ -0,0 +1,43 @@
from mmengine.config import read_base
with read_base():
from .models.qwen.hf_qwen_7b import models as hf_qwen_7b_base_models
from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat_models
from .datasets.ceval.ceval_ppl_578f8d import ceval_datasets as base_ceval_datasets
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets as chat_ceval_datasets
from .internal.clusters.slurm import infer, eval
# from .clusters.slurm import infer_split as infer, eval
# from .clusters.slurm import infer_size as infer, eval
# from .clusters.slurm import infer_size_split as infer, eval
base_ceval_datasets = base_ceval_datasets[:1]
chat_ceval_datasets = chat_ceval_datasets[-1:]
# If you do not want to run all the combinations of models and datasets, you
# can specify the combinations you want to run here. This is useful when you
# deliberately want to skip some subset of the combinations.
# Models and datasets in different combinations are recommended to be disjoint
# (different `abbr` in model & dataset configs), as we haven't tested this case
# thoroughly.
model_dataset_combinations = [
dict(models=hf_qwen_7b_base_models, datasets=base_ceval_datasets),
dict(models=hf_qwen_7b_chat_models, datasets=chat_ceval_datasets),
# dict(models=[model_cfg1, ...], datasets=[dataset_cfg1, ...]),
]
# The union of the models and datasets in model_dataset_combinations should be
# stored in the `models` and `datasets` variables below. Otherwise, modules
# like the summarizer will miss some information.
models = [*hf_qwen_7b_base_models, *hf_qwen_7b_chat_models]
datasets = [*base_ceval_datasets, *chat_ceval_datasets]
work_dir = './outputs/default/mdcomb/'
"""
dataset version metric mode qwen-7b-hf qwen-7b-chat-hf
---------------------- --------- -------- ------ ------------ -----------------
ceval-computer_network 9b9417 accuracy ppl 52.63 -
ceval-physician 6e277d accuracy gen - 59.18
"""

View File

@ -29,5 +29,6 @@ models = [
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=2, num_procs=1),
end_str='<eoa>',
)
]

View File

@ -29,5 +29,6 @@ models = [
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)
]

View File

@ -29,5 +29,6 @@ models = [
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
)
]

View File

@ -22,12 +22,14 @@ models = [
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,),
use_fast=False,
),
pad_token_id=151643,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>',
)
]

View File

@ -22,12 +22,14 @@ models = [
padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,),
use_fast=False,
),
pad_token_id=151643,
max_out_len=100,
max_seq_len=2048,
batch_size=8,
meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>',
)
]

View File

@ -0,0 +1,4 @@
_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
_cibench = ['cibench_' + i for i in _cibench]
cibench_summary_groups = [{'name': 'cibench', 'subsets': _cibench}]

View File

@ -0,0 +1,75 @@
mathbench_summary_groups = [
{
'name': 'mathbench-college',
'subsets': [
['mathbench-college-single_choice_cn', 'acc_1'],
['mathbench-college-cloze_en', 'accuracy'],
]
},
{
'name': 'mathbench-high',
'subsets': [
['mathbench-high-single_choice_cn', 'acc_1'],
['mathbench-high-single_choice_en', 'acc_1'],
]
},
{
'name': 'mathbench-middle',
'subsets': [
['mathbench-middle-single_choice_cn', 'acc_1'],
]
},
{
'name': 'mathbench-primary',
'subsets': [
['mathbench-primary-cloze_cn', 'accuracy'],
]
},
{
'name': 'mathbench',
'subsets': [
'mathbench-college',
'mathbench-high',
'mathbench-middle',
'mathbench-primary',
],
},
{
'name': 'mathbench-college-circular',
'subsets': [
['mathbench-college-single_choice_cn', 'perf_4'],
]
},
{
'name': 'mathbench-high-circular',
'subsets': [
['mathbench-high-single_choice_cn', 'perf_4'],
['mathbench-high-single_choice_en', 'perf_4'],
]
},
{
'name': 'mathbench-middle-circular',
'subsets': [
['mathbench-middle-single_choice_cn', 'perf_4'],
]
},
{
'name': 'mathbench-circular',
'subsets': [
'mathbench-college-circular',
'mathbench-high-circular',
'mathbench-middle-circular',
],
},
{
'name': 'mathbench-circular-and-cloze',
'subsets': [
'mathbench-high-circular',
'mathbench-middle-circular',
'mathbench-circular',
'mathbench-college-cloze_en',
'mathbench-primary-cloze_cn',
],
}
]

View File

@ -0,0 +1,28 @@
summarizer = dict(
dataset_abbrs=[
'######## GSM8K-Agent Accuracy ########', # category
['gsm8k-agent', 'follow_acc'],
['gsm8k-agent', 'reasoning_acc'],
['gsm8k-agent', 'code_acc'],
['gsm8k-agent', 'action_pct'],
'######## MATH-Agent Accuracy ########', # category
['math-agent', 'follow_acc'],
['math-agent', 'reasoning_acc'],
['math-agent', 'code_acc'],
['math-agent', 'action_pct'],
'######## MathBench-Agent Accuracy ########', # category
['mathbench-college-single_choice_cn-agent', 'acc_1'],
['mathbench-college-cloze_en-agent', 'accuracy'],
['mathbench-high-single_choice_cn-agent', 'acc_1'],
['mathbench-high-single_choice_en-agent', 'acc_1'],
['mathbench-middle-single_choice_cn-agent', 'acc_1'],
['mathbench-primary-cloze_cn-agent', 'accuracy'],
'######## MathBench-Agent CircularEval ########', # category
['mathbench-college-single_choice_cn-agent', 'perf_4'],
['mathbench-high-single_choice_cn-agent', 'perf_4'],
['mathbench-high-single_choice_en-agent', 'perf_4'],
['mathbench-middle-single_choice_cn-agent', 'perf_4'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
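The `summary_groups` line above concatenates every `*_summary_groups` list that is in scope, such as the cibench and mathbench groups added in the files above; a small standalone sketch of the pattern (the group contents are made up):
```python
# Standalone illustration of the locals()-collection pattern used above.
cibench_summary_groups = [{'name': 'cibench', 'subsets': ['cibench_Pandas']}]
mathbench_summary_groups = [{'name': 'mathbench', 'subsets': ['mathbench-college']}]

summary_groups = sum(
    [v for k, v in locals().items() if k.endswith("_summary_groups")], [])
print(len(summary_groups))  # 2: both lists are flattened into one
```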

View File

@ -2,13 +2,15 @@ import json
import os
import os.path as osp
import re
import subprocess
from collections import defaultdict
from typing import List, Optional
import numpy as np
from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from .base import BaseDataset
@ -18,16 +20,29 @@ def load_experiment(file: str) -> dict:
with open(file, 'r') as f:
notebook = json.load(f)
example = notebook['cells']
metadata = notebook['metadata']
modules = metadata.get('modules', [])
if modules:
# the two annotation lists should have the same length
assert len(modules) == len(metadata.get('step_types'))
# reformat annotations
modules = [[_m.strip() for _m in _modules.split('&')]
for _modules in modules]
questions = []
source_codes = []
outputs = []
tags = []
for cell in example:
if cell['cell_type'] == 'markdown':
text = ''.join(cell['source'])
text = ''.join(cell['source']).strip()
if modules:
_modules = modules.pop(0)
text += f"Please use {' and '.join(_modules)} modules."
text = text.strip() + '\n'
# append the formatted text
questions.append(text)
elif cell['cell_type'] == 'code':
source_codes.append(''.join(cell['source']))
if cell['outputs'] and 'data' in cell['outputs'][-1]:
if 'image/png' in cell['outputs'][-1]['data']:
# skip vis temporarily due to lack of evaluation
@ -39,15 +54,18 @@ def load_experiment(file: str) -> dict:
outputs.append(''.join(
cell['outputs'][-1]['data']['text/plain']))
else:
tags.append('executable')
tags.append('exec')
outputs.append(None)
return dict(
experiment=file,
questions=sum(([
dict(role='user', content=question),
dict(role='assistant', content=output)
] for question, output in zip(questions, outputs)), []),
references=dict(outputs=outputs, tags=tags, experiment=file),
dict(role='assistant', content=source_code)
] for question, source_code in zip(questions, source_codes)), []),
references=dict(outputs=outputs,
tags=tags,
metadata=metadata,
experiment=file),
)
@ -58,6 +76,7 @@ class CIBenchDataset(BaseDataset):
@staticmethod
def load(path: str):
"""Load whole dataset."""
assert os.path.exists(path), f'Path {path} does not exist.'
data_list = []
for cwd, dirs, files in os.walk(path):
dirs.sort()
@ -79,21 +98,57 @@ class CIBenchEvaluator(BaseEvaluator):
"""Evaluator for CI dataset.
Args:
text_evaluator (optional, dict): The text evaluator for text result
    comparison. Defaults to None, which uses Rouge as the default.
    Please note that an extra key `metric_name` should be set
    to get the exact metric result, such as `rouge1`.
output_dir (optional, str): The directory to save experiment
files in a markdown or notebook format.
with_ipynb (bool): Generate ipynb correspondingly.
Defaults to False.
user_data_dir (str): The directory to load local files.
Defaults to 'ENV', which means use environment variable
`USER_DATA_DIR` to get the data dir.
"""
def __init__(self,
text_evaluator: Optional[dict] = None,
output_dir: Optional[str] = None,
with_ipynb: bool = False,
user_data_dir: str = 'ENV') -> None:
if text_evaluator is None:
from opencompass.openicl.icl_evaluator import RougeEvaluator
self.text_evaluator = ICL_EVALUATORS.build(
dict(type=RougeEvaluator))
self.text_eval_metric = 'rouge1'
else:
self.text_eval_metric = text_evaluator.pop('metric_name')
self.text_evaluator = ICL_EVALUATORS.build(text_evaluator)
# TODO: should use work dir for this task.
self.output_dir = output_dir
self.user_data_dir = self.check_user_data_dir(user_data_dir)
self.with_ipynb = with_ipynb
self.TAG_MAPPING = {
'exec': ('executable', self.valid_step),
'general': ('general_correct', self.correct_step),
'num': ('numeric_correct', self.correct_step),
'text': ('text_score', self.text_step),
'vis': ('vis_sim', self.vis_similarity_step),
}
def check_user_data_dir(self, user_data_dir):
if user_data_dir == 'ENV':
user_data_dir = os.environ.get('USER_DATA_DIR', '')
self.user_data_dir = user_data_dir
user_data_dir = user_data_dir.rstrip('/')
basename = osp.basename(user_data_dir)
if basename and basename != 'data':
user_data_dir = osp.join(user_data_dir, 'data')
assert osp.exists(user_data_dir), \
f'a subfolder named `data` should exist under {user_data_dir}.'
elif basename:
assert osp.exists(user_data_dir), \
f'{user_data_dir} does not exist.'
return user_data_dir
@staticmethod
def valid_step(step):
@ -126,6 +181,24 @@ class CIBenchEvaluator(BaseEvaluator):
# Fall back to False
return False
def text_step(self, step, target):
"""Whether the step output is correct."""
# find the latest IPythonInterpreter action to determine correctness
for action in step[::-1]:
if action['type'] == 'IPythonInterpreter':
if action['result']:
try:
pred = action['result']['text']
match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
if match:
out = match.group(1)
score = self.text_evaluator.score([out], [target])
return score[self.text_eval_metric] / 100
except Exception:
return False
# Fall back to False
return False
@staticmethod
def vis_similarity_step(step, target):
"""Whether the step output image has the same structure similarity with
@ -174,6 +247,7 @@ class CIBenchEvaluator(BaseEvaluator):
'the conversion processes.')
check_jupytext()
p_list = []
from opencompass.lagent.actions.ipython_interpreter import extract_code
for idx, (example_origin_prompt,
example_steps) in enumerate(zip(origin_prompt, steps)):
@ -198,20 +272,25 @@ class CIBenchEvaluator(BaseEvaluator):
f.writelines(markdown_lines)
# TODO: be careful with this; the result might differ from the infer
# process, please check carefully
# convert markdown to ipynb and execute with error tolerance
# subprocess.Popen(
# "jupytext --to ipynb --pipe-fmt ipynb "
# "--pipe 'jupyter nbconvert --to ipynb --execute "
# f"--allow-errors --stdin --stdout' {md_file}",
# shell=True)
if self.with_ipynb:
p = subprocess.Popen(
'jupytext --to ipynb --pipe-fmt ipynb '
"--pipe 'jupyter nbconvert --to ipynb --execute "
f"--allow-errors --stdin --stdout' {md_file}",
shell=True)
p_list.append(p)
# TODO: async wait
for p in p_list:
p.wait()
def set_data_dir(self, work_dir):
    """Set the work directory and link data files for saving notebook results."""
if self.user_data_dir:
if self.user_data_dir.endswith('/'):
basename = osp.basename(osp.split(self.user_data_dir)[0])
else:
basename = osp.basename(self.user_data_dir)
basename = osp.basename(self.user_data_dir)
if not osp.exists(osp.join(self.output_dir, basename)):
os.symlink(self.user_data_dir,
osp.join(self.output_dir, basename))
@ -221,10 +300,54 @@ class CIBenchEvaluator(BaseEvaluator):
"""Change work directory and keep the symlink."""
os.chdir(work_dir)
def single_exp(self, gold, steps):
tags = gold['tags']
outputs = gold['outputs']
metadata = gold['metadata']
hard_tags = metadata.get('step_types', [])
if hard_tags:
tags = hard_tags
# executable: exec succeed
# general_correct: general correct
# numeric_correct: numerical correct
# text_score: text score
# vis_sim: visual similarity
result = defaultdict(list)
for tag, step, output in zip(tags, steps, outputs):
# check whether this step is valid
result['executable'].append(self.valid_step(step))
if tag != 'exec':
key, func = self.TAG_MAPPING[tag]
result[key].append(func(step, output))
# add missing metric keys as empty lists for easier analysis
if hard_tags:
check_tags = ['exec', 'num', 'text', 'vis']
else:
check_tags = ['exec', 'general', 'vis']
for tag in check_tags:
key = self.TAG_MAPPING[tag][0]
if key not in result:
result[key] = []
return result
def get_output_dir(self):
"""Get output dir from eval task.
Notice: output dir should be in format xxx/data.
All the needed files should be
"""
# hard hack for get output dir from eval task
if hasattr(self, '_out_dir') and self.output_dir is None:
self.output_dir = self._out_dir
def score(self, predictions: List, references: List, steps: List,
origin_prompt: List):
"""Calculate accuracy."""
cwd = os.getcwd()
self.get_output_dir()
if self.output_dir:
if not osp.exists(self.output_dir):
os.makedirs(self.output_dir)
@ -232,56 +355,20 @@ class CIBenchEvaluator(BaseEvaluator):
self.save_results(origin_prompt, steps)
self.unset_data_dir(cwd)
num_cells_list = []
num_general_list = []
passed_list = []
correct_list = []
vis_list = []
total_results = defaultdict(float)
total_scores = defaultdict(float)
total_nums = defaultdict(int)
for gold, single_steps in zip(references, steps):
tags = gold['tags']
outputs = gold['outputs']
num_cells = len(tags)
num_general = sum([tag == 'general' for tag in tags])
result = self.single_exp(gold, single_steps)
passed = sum([self.valid_step(step) for step in single_steps])
correct = 0
vis_sim = []
for tag, step, output in zip(tags, single_steps, outputs):
if tag == 'general':
correct += self.correct_step(step, output)
elif tag == 'vis':
vis_sim.append(self.vis_similarity_step(step, output))
for k, v in result.items():
total_scores[k] += sum(v)
total_nums[k] += len(v)
num_cells_list.append(num_cells)
num_general_list.append(num_general)
passed_list.append(passed)
correct_list.append(correct)
if vis_sim:
vis_list.append(sum(vis_sim) / len(vis_sim))
for k, v in total_scores.items():
if total_nums[k] > 0:
total_results[k] = total_scores[k] / total_nums[k] * 100
else:
vis_list.append(-1)
total_results[k] = -1
if len([v for v in vis_list if v >= 0]) > 0:
visualize_similarity = sum([v for v in vis_list if v >= 0]) / len(
[v for v in vis_list if v >= 0])
else:
# not valid
visualize_similarity = -1
if sum(num_general_list) > 0:
general_accuracy = sum(correct_list) / sum(num_general_list)
else:
# not valid
general_accuracy = -1
result = dict(
executable_rate=sum(passed_list) / sum(num_cells_list) * 100,
general_accuracy=general_accuracy * 100,
visualize_similarity=visualize_similarity * 100,
num_cells_list=num_cells_list,
num_general_list=num_general_list,
passed_list=passed_list,
correct_list=correct_list,
vis_list=vis_list,
)
return result
return total_results
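The rewritten `score` method above averages every tag-specific list collected by `single_exp`; a toy, self-contained sketch of that aggregation (the per-experiment results are made up):
```python
# Toy illustration of the aggregation in CIBenchEvaluator.score (made-up data).
from collections import defaultdict

per_experiment = [
    {'executable': [True, True, False], 'numeric_correct': [True]},
    {'executable': [True], 'text_score': [0.42]},
]
total_scores, total_nums = defaultdict(float), defaultdict(int)
total_results = defaultdict(float)
for result in per_experiment:
    for k, v in result.items():
        total_scores[k] += sum(v)
        total_nums[k] += len(v)
for k, v in total_scores.items():
    total_results[k] = v / total_nums[k] * 100 if total_nums[k] > 0 else -1
print(dict(total_results))
# {'executable': 75.0, 'numeric_correct': 100.0, 'text_score': 42.0}
```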

View File

@ -16,6 +16,8 @@ class cmnliDataset(BaseDataset):
with open(path, 'r', encoding='utf-8') as f:
for line in f:
line = json.loads(line)
if line['label'] == '-':
continue
data.append(line)
return Dataset.from_list(data)

View File

@ -143,6 +143,17 @@ def ds1000_postprocess(text: str) -> str:
return text
@TEXT_POSTPROCESSORS.register_module('ds1000_completion')
def ds1000_completion_postprocess(text: str) -> str:
text += '</code>'
match = re.search('(.*?)</code>', text, re.DOTALL)
if match:
text = match.group(1)
return text
@TEXT_POSTPROCESSORS.register_module('ds1000_matplotlib')
def ds1000_matplotlib_postprocess(text: str) -> str:
text = ds1000_postprocess(text)
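For reference, a tiny usage sketch of the completion postprocessor added above, with made-up prediction strings:
```python
# The postprocessor keeps everything up to the first </code> (appending one
# first, so inputs without the tag pass through unchanged).
print(ds1000_completion_postprocess("df = df.dropna()\n</code>\nBEGIN SOLUTION"))
# -> "df = df.dropna()\n"
print(ds1000_completion_postprocess("result = data.mean()"))
# -> "result = data.mean()"
```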

View File

@ -142,6 +142,6 @@ class Gsm8kAgentEvaluator(BaseEvaluator):
reasoning_acc=100 *
(reasoning_scope + final_scope + row_reasoning_scope) / total,
code_acc=100 * (code_scope + final_scope) / total,
action_acc=100 * (action_scope + final_scope) / total,
action_pct=100 * (action_scope + final_scope) / total,
)
return result

View File

@ -25,7 +25,7 @@ class WikiBenchDataset(BaseDataset):
circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']
data = []
with open(path, 'r') as infile:
with open(path, 'r', encoding='utf-8') as infile:
for id, line in enumerate(infile):
entry = json.loads(line)
if 'cloze' in name:

View File

@ -20,14 +20,14 @@ class winograndeDataset(BaseDataset):
for line in f:
line = json.loads(line)
prompt = line['sentence']
dataset_list.append({
'opt1':
prompt.replace('_', line['option1']),
'opt2':
prompt.replace('_', line['option2']),
'answer':
line['answer']
})
continue_prompt = prompt.split('_')
data_item = {
'opt1': prompt.replace('_', line['option1']),
'opt2': prompt.replace('_', line['option2']),
'answer': line['answer'],
'cont': continue_prompt[1]
}
dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list)
return dataset_list
@ -46,13 +46,11 @@ class winograndeDataset_V2(BaseDataset):
prompt = line['sentence']
answer = line['answer']
answer = ' AB'[int(answer)] if answer != '' else 'NULL'
dataset_list.append({
'opt1':
prompt.replace('_', line['option1']),
'opt2':
prompt.replace('_', line['option2']),
'answer':
answer
})
data_item = {
'opt1': prompt.replace('_', line['option1']),
'opt2': prompt.replace('_', line['option2']),
'answer': answer,
}
dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list)
return dataset_list

View File

@ -47,6 +47,10 @@ class IPythonInterpreter(BaseAction):
it is disabled. Defaults to None.
timeout (int): Upper bound of waiting time for Python script execution.
Defaults to 20.
trim_output (int, optional): Maximum number of characters kept in the
    ipython output. If None, no trimming is performed.
    TODO: Note that this is a character limit, not a token length; more
    trim strategies might be added later. Defaults to 1024.
user_data_dir (str): Specified the user data directory for files
loading. If set to `ENV`, use `USER_DATA_DIR` environment variable.
Defaults to `ENV`.
@ -60,6 +64,7 @@ class IPythonInterpreter(BaseAction):
enable: bool = True,
disable_description: Optional[str] = None,
timeout: int = 20,
trim_output: Optional[int] = 1024,
user_data_dir: str = 'ENV') -> None:
super().__init__(description, name, enable, disable_description)
@ -68,10 +73,11 @@ class IPythonInterpreter(BaseAction):
user_data_dir = os.environ.get('USER_DATA_DIR', '')
if user_data_dir:
user_data_dir = os.path.dirname(user_data_dir)
# user_data_dir = os.path.dirname(user_data_dir)
user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
self.user_data_dir = user_data_dir
self._initialized = False
self.trim_output = trim_output
if not os.path.exists(WORK_DIR):
os.mkdir(WORK_DIR)
@ -178,6 +184,12 @@ class IPythonInterpreter(BaseAction):
if image:
result += f'\n\n{image}'
if finished:
# in case output text too long
# might need better design later
if self.trim_output and len(result) > self.trim_output:
ellip = '......'
half_len = int((self.trim_output - len(ellip)) / 2)
result = result[:half_len] + ellip + result[-half_len:]
return succeed, result
try:
@ -204,13 +216,20 @@ class IPythonInterpreter(BaseAction):
command: str,
timeout: Optional[int] = None) -> ActionReturn:
tool_return = ActionReturn(url=None, args=None, type=self.name)
tool_return.args = dict(text=command)
succeed, result = self._call(command, timeout)
if succeed:
tool_return.result = dict(text=result)
tool_return.state = ActionStatusCode.SUCCESS
extracted_command = extract_code(command)
tool_return.args = dict(text=command, extract_code=extracted_command)
if extracted_command:
succeed, result = self._call(extracted_command, timeout)
if succeed:
if not result:
result = 'The code succeeded without any outputs.'
tool_return.result = dict(text=result)
tool_return.state = ActionStatusCode.SUCCESS
else:
tool_return.errmsg = repr(result)
tool_return.state = ActionStatusCode.API_ERROR
else:
tool_return.errmsg = repr(result)
tool_return.errmsg = 'The input code is empty. Please follow the format.' # noqa
tool_return.state = ActionStatusCode.API_ERROR
return tool_return
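A standalone sketch of the output-trimming rule added earlier in this file, with a small `trim_output` chosen only for illustration:
```python
# Keeps the head and tail of an over-long output, with an ellipsis in between.
result = "".join(str(i % 10) for i in range(100))
trim_output = 20
ellip = '......'
half_len = int((trim_output - len(ellip)) / 2)
trimmed = result[:half_len] + ellip + result[-half_len:]
print(trimmed)       # 0123456......3456789
print(len(trimmed))  # 20
```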

View File

@ -115,6 +115,20 @@ class BaseModel:
inputs = self.parse_template(templates, mode='ppl')
return self.get_ppl(inputs, mask_length)
def get_loglikelihood_from_template(self,
                                    templates: List[PromptType],
                                    conts: List[str],
                                    mask_length=None):
    """Get loglikelihood scores given a list of templates.
    Args:
        templates (List[PromptType]): A list of templates.
        conts (List[str]): A list of continuation strings.
        mask_length (List[int]): A list of mask lengths. If provided, the
            loglikelihood will be calculated only on the unmasked tokens.
    """
inputs = self.parse_template(templates, mode='ppl')
return self.get_loglikelihood(inputs, conts, mask_length)
def generate_from_template(self, templates: List[PromptType],
max_out_len: int, **kwargs):
"""Generate completion from a list of templates.

View File

@ -1,9 +1,11 @@
import re
import sys
import threading
import time
import warnings
from abc import abstractmethod
from copy import deepcopy
from queue import Queue
from time import sleep
from typing import Dict, List, Optional, Tuple, Union
@ -37,6 +39,7 @@ class BaseAPIModel(BaseModel):
def __init__(self,
path: str,
query_per_second: int = 1,
rpm_verbose: bool = False,
retry: int = 2,
max_seq_len: int = 2048,
meta_template: Optional[Dict] = None,
@ -46,7 +49,7 @@ class BaseAPIModel(BaseModel):
self.meta_template = meta_template
self.retry = retry
self.query_per_second = query_per_second
self.token_bucket = TokenBucket(query_per_second)
self.token_bucket = TokenBucket(query_per_second, rpm_verbose)
self.template_parser = APITemplateParser(meta_template)
self.logger = get_logger()
self.generation_kwargs = generation_kwargs
@ -422,10 +425,13 @@ class TokenBucket:
query_per_second (float): The rate of the token bucket.
"""
def __init__(self, rate):
def __init__(self, rate, verbose=False):
self._rate = rate
self._tokens = threading.Semaphore(0)
self.started = False
self._request_queue = Queue()
self.logger = get_logger()
self.verbose = verbose
def _add_tokens(self):
"""Add tokens to the bucket."""
@ -440,3 +446,12 @@ class TokenBucket:
self.started = True
threading.Thread(target=self._add_tokens, daemon=True).start()
self._tokens.acquire()
if self.verbose:
cur_time = time.time()
while not self._request_queue.empty():
if cur_time - self._request_queue.queue[0] > 60:
self._request_queue.get()
else:
break
self._request_queue.put(cur_time)
self.logger.info(f'Current RPM {self._request_queue.qsize()}.')

View File

@ -3,6 +3,7 @@ from typing import Dict, List, Optional, Union
import numpy as np
import torch
import transformers
from opencompass.models.base import BaseModel
from opencompass.models.base_api import APITemplateParser
@ -13,6 +14,33 @@ from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str]
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
"""Criteria to stop on the specified multi-token sequence."""
def __init__(
self,
sequence: str,
tokenizer: transformers.PreTrainedTokenizer,
batch_size: int,
):
self.done_tracker = [False] * batch_size
self.sequence = sequence
self.sequence_ids = tokenizer.encode(sequence,
add_special_tokens=False)
self.sequence_id_len = len(self.sequence_ids)
self.tokenizer = tokenizer
def __call__(self, input_ids, scores, **kwargs) -> bool:
# compare the last len(stop) tokens
lookback_ids_batch = input_ids[:, -self.sequence_id_len:]
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
for i, done in enumerate(self.done_tracker):
if done:
continue
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
return False not in self.done_tracker
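A minimal usage sketch of the stopping-criteria class above; the checkpoint name is a placeholder and not part of this commit:
```python
# Hypothetical usage; "gpt2" is only a placeholder checkpoint.
import transformers

tok = transformers.AutoTokenizer.from_pretrained("gpt2")
lm = transformers.AutoModelForCausalLM.from_pretrained("gpt2")

stop_words = ["Question:", tok.eos_token]
criteria = transformers.StoppingCriteriaList(
    [MultiTokenEOSCriteria(s, tok, batch_size=1) for s in stop_words])

ids = tok("Answer: 2 + 2 =", return_tensors="pt").input_ids
out = lm.generate(input_ids=ids, max_new_tokens=64, stopping_criteria=criteria)
print(tok.decode(out[0]))  # generation halts once "Question:" (or EOS) is produced
```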
@MODELS.register_module()
class HuggingFace(BaseModel):
"""Model wrapper around HuggingFace models.
@ -194,7 +222,10 @@ class HuggingFace(BaseModel):
self.model.config.eos_token_id = 2
self.model.config.pad_token_id = self.tokenizer.pad_token_id
def generate(self, inputs: List[str], max_out_len: int,
def generate(self,
inputs: List[str],
max_out_len: int,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
"""Generate results given a list of inputs.
@ -212,9 +243,12 @@ class HuggingFace(BaseModel):
max_out_len=max_out_len,
**generation_kwargs)
else:
return sum((self._single_generate(
inputs=[input_], max_out_len=max_out_len, **generation_kwargs)
for input_ in inputs), [])
return sum(
(self._single_generate(inputs=[input_],
max_out_len=max_out_len,
stopping_criteria=stopping_criteria,
**generation_kwargs)
for input_ in inputs), [])
def _batch_generate(self, inputs: List[str], max_out_len: int,
**kwargs) -> List[str]:
@ -275,7 +309,10 @@ class HuggingFace(BaseModel):
decodeds = [token.split(self.end_str)[0] for token in decodeds]
return decodeds
def _single_generate(self, inputs: List[str], max_out_len: int,
def _single_generate(self,
inputs: List[str],
max_out_len: int,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
"""Support for single prompt inference.
@ -319,6 +356,19 @@ class HuggingFace(BaseModel):
max_length=self.max_seq_len -
max_out_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device)
if stopping_criteria:
# Construct huggingface stopping criteria
stopping_criteria = stopping_criteria + [self.tokenizer.eos_token]
stopping_criteria = transformers.StoppingCriteriaList([
*[
MultiTokenEOSCriteria(sequence, self.tokenizer,
input_ids.shape[0])
for sequence in stopping_criteria
],
])
kwargs['stopping_criteria'] = stopping_criteria
# To accommodate the PeftModel, parameters should be passed in
# key-value format for generate.
outputs = self.model.generate(input_ids=input_ids,
@ -434,6 +484,71 @@ class HuggingFace(BaseModel):
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
return ce_loss
def get_loglikelihood(
self,
inputs: List[str],
conts: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get loglikelihood scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
conts (List[str]): A list of continuation strings (the slices
    after the space). mask_length is NOT supported yet.
mask_length (Optional[List[int]]): A list of mask lengths. If
    provided, the perplexity scores will be calculated with the
    first mask_length[i] tokens masked out. It's okay to skip
    its implementation if advanced features in PPLInferencer are
    not needed.
Returns:
List[float]: A list of loglikelihood scores.
"""
assert mask_length is None, 'Not support mask_length yet.'
if self.batch_padding and len(inputs) > 1:
raise NotImplementedError('Batch padding is not supported yet.')
# assert self.tokenizer.pad_token
# return self._get_loglikelihood(inputs, mask_length=mask_length)
return np.array([
self._get_loglikelihood(inputs=inputs[idx], conts=conts[idx])
for idx in range(len(inputs))
])
def _get_loglikelihood(self, inputs: str, conts: str) -> float:
"""Get loglikelihood scores given input string and continuation string.
Args:
inputs (str): The input string.
conts (str): The continuation string (the slice after the space).
Returns:
float: loglikelihood scores.
"""
input_ids = self.tokenizer(inputs,
padding=False,
truncation=True,
max_length=self.max_seq_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device)
context_ids = self.tokenizer(inputs.replace(conts, ''),
padding=False,
truncation=True,
max_length=self.max_seq_len)['input_ids']
cont_ids = input_ids[len(context_ids):]
output = self.model(input_ids.unsqueeze(0))
logits = output['logits'][:, :-1]
logits = torch.nn.functional.log_softmax(logits, dim=-1)
contlen = cont_ids.shape[0]
logits = logits[:, -contlen:, :]
# Reducing the dimension will lead to a wrong outcome
logits_gather = torch.gather(
logits, 2,
cont_ids.unsqueeze(0).unsqueeze(-1)) # [1, seq]
# Answer: sum the likelihood of each token in continuation
answer = float(logits_gather.detach().cpu().sum())
return answer
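A small numeric sketch of the gather-and-sum step used in `_get_loglikelihood` above, with random logits for illustration:
```python
# Purely illustrative: score two continuation tokens under random logits.
# (The real method also shifts the logits by one position to align
# predictions with targets before slicing the continuation.)
import torch

logits = torch.log_softmax(torch.randn(1, 5, 32), dim=-1)  # [batch, seq, vocab]
cont_ids = torch.tensor([3, 7])                 # ids of the continuation tokens
cont_logits = logits[:, -len(cont_ids):, :]
gathered = torch.gather(cont_logits, 2, cont_ids.view(1, -1, 1))  # [1, 2, 1]
loglikelihood = float(gathered.sum())           # sum of per-token log-probs
print(loglikelihood)
```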
def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized strings.
@ -554,8 +669,8 @@ class HuggingFaceChatGLM3(HuggingFace):
'role': {
'HUMAN': 'user',
'BOT': 'assistant',
'SYSTEM': 'system'
}[item['role']]
'SYSTEM': 'system',
}[item['role'].upper()]
}
history.append(msg)
user_content = history[-1]['content']
@ -578,6 +693,9 @@ class HuggingFaceChatGLM3(HuggingFace):
response, history = self.model.chat(self.tokenizer,
user_content,
history=history)
# the response may sometimes be a dict
if isinstance(response, dict):
response = response.get('content', '')
responses.append(response)
except Exception:
responses.append('')

View File

@ -52,7 +52,7 @@ class LagentAgent:
def chat(self,
user_input: str,
history: List[dict] = None) -> Tuple[str, List[dict]]:
history: List[dict] = None) -> Tuple[str, List[dict], List[dict]]:
"""Chat with agent."""
if history:
self.agent._session_history = history
@ -60,6 +60,7 @@ class LagentAgent:
from lagent.schema import ActionReturn, AgentReturn
generation: AgentReturn = self.agent.chat(user_input)
inner_steps = generation.inner_steps
answer = generation.response
steps = []
@ -76,7 +77,7 @@ class LagentAgent:
valid=int(step.valid),
))
return answer, steps
return answer, steps, inner_steps
FORCE_STOP_PROMPT_EN = (

View File

@ -179,12 +179,14 @@ class Llama2Chat(BaseModel):
dialog = []
for item in input:
msg = {'content': item['prompt']}
if item['role'] == 'HUMAN':
if item['role'].upper() == 'HUMAN':
msg['role'] = 'user'
elif item['role'] == 'BOT':
elif item['role'].upper() == 'BOT':
msg['role'] = 'assistant'
elif item['role'] == 'SYSTEM':
elif item['role'].upper() == 'SYSTEM':
msg['role'] = 'system'
else:
raise ValueError(f'Unknown role: {item["role"]}')
dialog.append(msg)
dialogs.append(dialog)

View File

@ -58,6 +58,7 @@ class OpenAI(BaseAPIModel):
path: str = 'gpt-3.5-turbo',
max_seq_len: int = 4096,
query_per_second: int = 1,
rpm_verbose: bool = False,
retry: int = 2,
key: Union[str, List[str]] = 'ENV',
org: Optional[Union[str, List[str]]] = None,
@ -70,6 +71,7 @@ class OpenAI(BaseAPIModel):
max_seq_len=max_seq_len,
meta_template=meta_template,
query_per_second=query_per_second,
rpm_verbose=rpm_verbose,
retry=retry)
import tiktoken
self.tiktoken = tiktoken
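The new `rpm_verbose` flag is forwarded to the token bucket so the current requests-per-minute count gets logged; a hypothetical model config enabling it (all other values are placeholders):
```python
# Hypothetical config entry; only rpm_verbose is the new field from this commit.
models = [
    dict(
        abbr='gpt-3.5-turbo',
        type=OpenAI,
        path='gpt-3.5-turbo',
        key='ENV',
        query_per_second=1,
        rpm_verbose=True,   # log the current RPM after each acquired token
        max_seq_len=4096,
        batch_size=1,
    ),
]
```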

View File

@ -5,5 +5,6 @@ from .icl_circular_evaluator import CircularEvaluator # noqa
from .icl_em_evaluator import EMEvaluator # noqa
from .icl_hf_evaluator import * # noqa
from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator # noqa
from .icl_misc_evaluator import AveragePPLEvaluator # noqa
from .icl_toxic_evaluator import ToxicEvaluator # noqa
from .lm_evaluator import LMEvaluator # noqa

View File

@ -0,0 +1,11 @@
from opencompass.registry import ICL_EVALUATORS
from .icl_base_evaluator import BaseEvaluator
@ICL_EVALUATORS.register_module()
class AveragePPLEvaluator(BaseEvaluator):
def score(self, ppl):
average_ppl = sum(ppl) / len(ppl)
return {'average_ppl': average_ppl}

View File

@ -4,6 +4,8 @@ from .icl_base_inferencer import BaseInferencer # noqa
from .icl_chat_inferencer import ChatInferencer # noqa
from .icl_clp_inferencer import CLPInferencer # noqa
from .icl_gen_inferencer import GenInferencer # noqa
from .icl_loglikelihood_inferencer import LoglikelihoodInferencer # noqa
from .icl_ppl_inferencer import PPLInferencer # noqa
from .icl_ppl_only_inferencer import PPLOnlyInferencer # noqa
from .icl_sc_inferencer import SCInferencer # noqa
from .icl_tot_inferencer import ToTInferencer # noqa

View File

@ -89,7 +89,7 @@ class AgentInferencer(ChatInferencer):
user_idx = assistant_indices[-1] - 1
self.model.set_history(chat[:user_idx])
answer, steps = self.model.chat(chat[user_idx]['content'])
answer, steps, _ = self.model.chat(chat[user_idx]['content'])
output_handler.save_results(
origin_prompt=chat[user_idx]['content'],
prediction=answer,
@ -104,10 +104,11 @@ class AgentInferencer(ChatInferencer):
i for i, item in enumerate(chat) if item['role'] == 'assistant'
]
self.model.set_history(chat[:assistant_indices[0] - 1])
history = chat[:assistant_indices[0] - 1]
for i in assistant_indices:
answer, steps = self.model.chat(chat[i - 1]['content'])
answer, steps, inner_steps = self.model.chat(
chat[i - 1]['content'], history)
history += inner_steps
output_handler.save_multiround_results(
origin_prompt=chat[i - 1]['content'],
prediction=answer,
@ -125,7 +126,7 @@ class AgentInferencer(ChatInferencer):
for i in assistant_indices:
self.model.set_history(chat[:i - 1])
answer, steps = self.model.chat(chat[i - 1]['content'])
answer, steps, _ = self.model.chat(chat[i - 1]['content'])
output_handler.save_multiround_results(
origin_prompt=chat[i - 1]['content'],
prediction=answer,

View File

@ -68,11 +68,11 @@ class LMTemplateParser:
prompt = ''
if self.roles:
for dialog in chat:
role_cfg = self.roles.get(dialog['role'])
prompt += role_cfg['begin']
role_cfg = self.roles.get(dialog['role'], {})
prompt += (role_cfg.get('begin') or '')
prompt += (dialog.get('content') or '')
prompt += role_cfg['end']
prompt += self.roles['assistant']['begin']
prompt += (role_cfg.get('end') or '')
prompt += (self.roles['assistant'].get('begin') or '')
else:
# in case the model does not have any meta template
last_sep = ''
@ -227,9 +227,13 @@ class ChatInferencer(BaseInferencer):
'tmp_' + output_json_filename)
if osp.exists(tmp_json_filepath):
# TODO: move resume to output handler
tmp_result_dict = mmengine.load(tmp_json_filepath)
output_handler.results_dict = tmp_result_dict
index = len(tmp_result_dict)
try:
tmp_result_dict = mmengine.load(tmp_json_filepath)
except Exception:
pass
else:
output_handler.results_dict = tmp_result_dict
index = len(tmp_result_dict)
# 4. Wrap prompts with Dataloader
dataloader = self.get_dataloader(chat_list[index:], batch_size=1)

View File

@ -1,5 +1,6 @@
"""Direct Generation Inferencer."""
import inspect
import os
import os.path as osp
from typing import List, Optional
@ -46,6 +47,7 @@ class GenInferencer(BaseInferencer):
self,
model: BaseModel,
max_out_len: int,
stopping_criteria: List[str] = [],
max_seq_len: Optional[int] = None,
batch_size: Optional[int] = 1,
gen_field_replace_token: Optional[str] = '',
@ -64,6 +66,7 @@ class GenInferencer(BaseInferencer):
self.gen_field_replace_token = gen_field_replace_token
self.max_out_len = max_out_len
self.stopping_criteria = stopping_criteria
if self.model.is_api and save_every is None:
save_every = 1
@ -128,10 +131,14 @@ class GenInferencer(BaseInferencer):
entry = datum
golds = [None for _ in range(len(entry))]
# 5-1. Inference with local model
extra_gen_kwargs = {}
sig = inspect.signature(self.model.generate)
if 'stopping_criteria' in sig.parameters:
extra_gen_kwargs['stopping_criteria'] = self.stopping_criteria
with torch.no_grad():
parsed_entries = self.model.parse_template(entry, mode='gen')
results = self.model.generate_from_template(
entry, max_out_len=self.max_out_len)
entry, max_out_len=self.max_out_len, **extra_gen_kwargs)
generated = results
num_return_sequences = getattr(self.model, 'generation_kwargs',
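The inspect.signature check above forwards stopping_criteria only to models whose generate accepts it, so older model wrappers keep working unchanged. A self-contained sketch of that gate (both generate functions below are hypothetical stand-ins):

    import inspect

    def generate_old(prompts, max_out_len):
        return [p + ' ...' for p in prompts]

    def generate_new(prompts, max_out_len, stopping_criteria=[]):
        return [p + ' [stopped]' for p in prompts]

    for generate in (generate_old, generate_new):
        extra_gen_kwargs = {}
        # only pass the kwarg if the callee declares it
        if 'stopping_criteria' in inspect.signature(generate).parameters:
            extra_gen_kwargs['stopping_criteria'] = ['\nQ:']
        print(generate(['2 + 2 ='], max_out_len=16, **extra_gen_kwargs))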

View File

@ -0,0 +1,215 @@
"""PPL Inferencer."""
import os
from typing import List, Optional
import torch
from tqdm import trange
from opencompass.models.base import BaseModel
from opencompass.registry import ICL_INFERENCERS
from ..icl_prompt_template import PromptTemplate
from ..icl_retriever import BaseRetriever
from ..utils import get_logger
from .icl_base_inferencer import BaseInferencer, dump_results_dict
logger = get_logger(__name__)
@ICL_INFERENCERS.register_module()
class LoglikelihoodInferencer(BaseInferencer):
"""Loglikelihood Inferencer class to evaluate by loglikelihood.
Attributes:
model (:obj:`BaseModel`, optional): The module to inference.
max_seq_len (:obj:`int`): Maximum number of tokenized words allowed by
the LM.
batch_size (:obj:`int`, optional): Batch size for the :obj:`DataLoader`
output_json_filepath (:obj:`str`, optional): File path for output
`JSON` file.
output_json_filename (:obj:`str`, optional): File name for output
`JSON` file.
labels (:obj:`List`, optional): A list of labels for all classes.
"""
def __init__(
self,
model: BaseModel,
max_seq_len: Optional[int] = None,
batch_size: Optional[int] = 1,
output_json_filepath: Optional[str] = './icl_inference_output',
output_json_filename: Optional[str] = 'predictions',
labels: Optional[List] = None,
**kwargs) -> None:
super().__init__(
model=model,
max_seq_len=max_seq_len,
batch_size=batch_size,
output_json_filename=output_json_filename,
output_json_filepath=output_json_filepath,
**kwargs,
)
self.labels = labels
def inference(self,
retriever: BaseRetriever,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None,
output_json_filepath: Optional[str] = None,
output_json_filename: Optional[str] = None) -> List:
# 1. Preparation for output logs
output_handler = LoglikelihoodInferencerOutputHandler()
sub_predictions = []
ppl = []
ice = []
if output_json_filepath is None:
output_json_filepath = self.output_json_filepath
if output_json_filename is None:
output_json_filename = self.output_json_filename
# 2. Get results of retrieval process
ice_idx_list = retriever.retrieve()
# 3. Get labels of all the classes
if self.labels is None:
labels = retriever.get_labels(ice_template=ice_template,
prompt_template=prompt_template)
else:
labels = self.labels
# 4. Generate in-context examples for testing inputs
for idx in range(len(ice_idx_list)):
ice.append(
retriever.generate_ice(ice_idx_list[idx],
ice_template=ice_template))
output_handler.save_ice(self.model.parse_template(ice, mode='ppl'))
# 5. Calculating loglikelihood for prompts in each label's class
for label in labels:
index = 0
prompt_list = []
sub_ppl_list = []
token_num_list = []
cont_list = []
# 5.1 Generate prompts of current label and truncate
# TODO: Refactor
for idx in range(len(ice_idx_list)):
prompt = retriever.generate_label_prompt(
idx,
ice[idx],
label,
ice_template=ice_template,
prompt_template=prompt_template)
if self.max_seq_len is not None:
prompt_token_num = self.model.get_token_len_from_template(
prompt, mode='ppl')
while len(ice_idx_list[idx]
) > 0 and prompt_token_num > self.max_seq_len:
ice_idx_list[idx] = ice_idx_list[idx][:-1]
ice[idx] = retriever.generate_ice(
ice_idx_list[idx], ice_template=ice_template)
prompt = retriever.generate_label_prompt(
idx,
ice[idx],
label,
ice_template=ice_template,
prompt_template=prompt_template)
prompt_token_num = self.model.get_token_len_from_template( # noqa
prompt, mode='ppl') # noqa
prompt_list.append(prompt)
token_num_list.append(prompt_token_num)
cont_list.append(retriever.test_ds[idx]['cont'])
# 5.2 Get loglikelihood
logger.info(f"Calculating loglikelihood for prompts labeled '{label}'")
for idx in trange(0,
len(prompt_list),
self.batch_size,
disable=not self.is_main_process):
sub_prompt_list = prompt_list[idx:idx + self.batch_size]
sub_cont_list = cont_list[idx:idx + self.batch_size]
with torch.no_grad():
# main modification compared to PPLInferencer: score the
# continuation's loglikelihood instead of the full-prompt PPL
sub_res = self.model.get_loglikelihood_from_template(
sub_prompt_list, sub_cont_list).tolist()
for res, prompt in zip(
sub_res,
self.model.parse_template(sub_prompt_list,
mode='ppl')):
sub_ppl_list.append(res)
ice_str = self.model.parse_template(ice[idx], mode='ppl')
output_handler.save_prompt_and_loglikelihood(
label, prompt.replace(ice_str, ''), prompt, res, index)
index = index + 1
ppl.append(sub_ppl_list)
# 6. Pick the class with the highest loglikelihood as the prediction
ppl = list(zip(*ppl))
for single_ppl in ppl:
sub_predictions.append(labels[single_ppl.index(max(single_ppl))])
output_handler.save_predictions(sub_predictions)
# 7. Fetch gold answers if exist
ds_reader = retriever.dataset_reader
if ds_reader.output_column:
golds = ds_reader.dataset['test'][ds_reader.output_column]
output_handler.save_golds(golds)
# 8. Output
if self.is_main_process:
os.makedirs(output_json_filepath, exist_ok=True)
output_handler.write_to_json(output_json_filepath,
output_json_filename)
return [
sample['prediction']
for sample in output_handler.results_dict.values()
]
class LoglikelihoodInferencerOutputHandler:
results_dict = {}
def __init__(self) -> None:
self.results_dict = {}
def write_to_json(self, save_dir: str, filename: str):
"""Dump the result to a json file."""
dump_results_dict(self.results_dict, os.path.join(save_dir, filename))
def save_ice(self, ice):
for idx, example in enumerate(ice):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['in-context examples'] = example
def save_predictions(self, predictions):
for idx, prediction in enumerate(predictions):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['prediction'] = prediction
def save_prompt_and_loglikelihood(self, label, input, prompt,
loglikelihood, idx):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
if 'label: ' + str(label) not in self.results_dict[str(idx)].keys():
self.results_dict[str(idx)]['label: ' + str(label)] = {}
self.results_dict[str(idx)]['label: ' +
str(label)]['testing input'] = input
self.results_dict[str(idx)]['label: ' + str(label)]['prompt'] = prompt
self.results_dict[str(idx)][
'label: ' + str(label)]['Loglikelihood'] = loglikelihood
def save_golds(self, golds):
for idx, gold in enumerate(golds):
if str(idx) not in self.results_dict.keys():
self.results_dict[str(idx)] = {}
self.results_dict[str(idx)]['gold'] = gold
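A worked toy example of the selection in step 6: the per-label score lists are transposed so each sample's scores across labels sit together, and the label with the highest loglikelihood wins (the values below are invented):

    labels = ['A', 'B']
    scores = [
        [-1.2, -4.0, -0.3],  # loglikelihood of each sample under label 'A'
        [-2.5, -1.1, -0.9],  # loglikelihood of each sample under label 'B'
    ]
    predictions = [labels[per_sample.index(max(per_sample))]
                   for per_sample in zip(*scores)]
    print(predictions)  # ['A', 'B', 'A']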

View File

@ -0,0 +1,188 @@
"""PPL Inferencer."""
import os
from typing import List, Optional
import mmengine
import torch
from tqdm import tqdm
from opencompass.models.base import BaseModel
from opencompass.registry import ICL_INFERENCERS
from ..icl_prompt_template import PromptTemplate
from ..icl_retriever import BaseRetriever
from ..utils import get_logger
from .icl_base_inferencer import BaseInferencer, dump_results_dict
logger = get_logger(__name__)
@ICL_INFERENCERS.register_module()
class PPLOnlyInferencer(BaseInferencer):
"""PPLOnlyInferencer class to calculate PPL and PPL only, no choice is
made. This Inferencer is usually used along with AveragePPLEvaluator.
Attributes:
model (:obj:`BaseModel`, optional): The module to inference.
max_seq_len (:obj:`int`): Maximum number of tokenized words allowed by
the LM.
batch_size (:obj:`int`, optional): Batch size for the :obj:`DataLoader`
output_json_filepath (:obj:`str`, optional): File path for output
`JSON` file.
output_json_filename (:obj:`str`, optional): File name for output
`JSON` file.
save_every (:obj:`int`, optional): Save intermediate results every
`save_every` iterations. Defaults to 1.
"""
def __init__(
self,
model: BaseModel,
max_seq_len: Optional[int] = None,
batch_size: Optional[int] = 1,
output_json_filepath: Optional[str] = './icl_inference_output',
output_json_filename: Optional[str] = 'predictions',
save_every: Optional[int] = 1,
**kwargs) -> None:
super().__init__(
model=model,
max_seq_len=max_seq_len,
batch_size=batch_size,
output_json_filename=output_json_filename,
output_json_filepath=output_json_filepath,
**kwargs,
)
self.save_every = save_every
def inference(self,
retriever: BaseRetriever,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None,
output_json_filepath: Optional[str] = None,
output_json_filename: Optional[str] = None) -> List:
# 1. Preparation for output logs
output_handler = PPLOnlyInferencerOutputHandler()
if output_json_filepath is None:
output_json_filepath = self.output_json_filepath
if output_json_filename is None:
output_json_filename = self.output_json_filename
# 2. Get results of retrieval process
ice_idx_list = retriever.retrieve()
# 3. Generate prompts for testing input
prompt_list = self.get_generation_prompt_list_from_retriever_indices(
ice_idx_list,
retriever,
max_seq_len=self.max_seq_len,
ice_template=ice_template,
prompt_template=prompt_template)
# 3.1 Fetch and zip prompt & gold answer if output column exists
ds_reader = retriever.dataset_reader
assert ds_reader.output_column is None, (
'PPLOnlyInferencer supports `output_column=None` only.')
# Create tmp json file for saving intermediate results and future
# resuming
index = 0
tmp_json_filepath = os.path.join(output_json_filepath,
'tmp_' + output_json_filename)
if os.path.exists(tmp_json_filepath):
# TODO: move resume to output handler
try:
tmp_result_dict = mmengine.load(tmp_json_filepath)
except Exception:
pass
else:
output_handler.results_dict = tmp_result_dict
index = len(tmp_result_dict)
# 4. Wrap prompts with Dataloader
dataloader = self.get_dataloader(prompt_list[index:], self.batch_size)
# 5. Inference for prompts in each batch
logger.info('Starting inference process...')
for datum in tqdm(dataloader, disable=not self.is_main_process):
entry = datum
# 5-1. Inference with local model
with torch.no_grad():
ppls = self.model.get_ppl_from_template(entry).tolist()
parsed_entries = self.model.parse_template(entry, mode='gen')
# 5-3. Save current output
for prompt, ppl in zip(parsed_entries, ppls):
output_handler.save_results(prompt, ppl, index)
index = index + 1
# 5-4. Save intermediate results
if (self.save_every is not None and index % self.save_every == 0
and self.is_main_process):
output_handler.write_to_json(output_json_filepath,
'tmp_' + output_json_filename)
# 6. Output
if self.is_main_process:
os.makedirs(output_json_filepath, exist_ok=True)
output_handler.write_to_json(output_json_filepath,
output_json_filename)
if os.path.exists(tmp_json_filepath):
os.remove(tmp_json_filepath)
return [
sample['ppl'] for sample in output_handler.results_dict.values()
]
def get_generation_prompt_list_from_retriever_indices(
self,
ice_idx_list: List[List[int]],
retriever: BaseRetriever,
max_seq_len: Optional[int] = None,
ice_template: Optional[PromptTemplate] = None,
prompt_template: Optional[PromptTemplate] = None):
prompt_list = []
for idx, ice_idx in enumerate(ice_idx_list):
ice = retriever.generate_ice(ice_idx, ice_template=ice_template)
prompt = retriever.generate_prompt_for_generate_task(
idx,
ice,
ice_template=ice_template,
prompt_template=prompt_template)
if max_seq_len is not None:
prompt_token_num = self.model.get_token_len_from_template(
prompt, mode='gen')
while len(ice_idx) > 0 and prompt_token_num > max_seq_len:
ice_idx = ice_idx[:-1]
ice = retriever.generate_ice(ice_idx,
ice_template=ice_template)
prompt = retriever.generate_prompt_for_generate_task(
idx,
ice,
ice_template=ice_template,
prompt_template=prompt_template)
prompt_token_num = self.model.get_token_len_from_template(
prompt, mode='gen')
prompt_list.append(prompt)
return prompt_list
class PPLOnlyInferencerOutputHandler:
origin_prompt_dict = {}
output_dict = {}
results_dict = {}
def __init__(self) -> None:
self.results_dict = {}
def write_to_json(self, save_dir: str, filename: str):
"""Dump the result to a json file."""
dump_results_dict(self.results_dict, os.path.join(save_dir, filename))
def save_results(self, origin_prompt, ppl, idx):
self.results_dict[str(idx)] = {
'origin_prompt': origin_prompt,
'ppl': ppl,
}
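A minimal, hypothetical config fragment using this inferencer; the dataset fields and import paths are assumptions following the repo's usual config layout, not taken from this commit. Note the hard requirement above that the reader's output_column be None:

    from opencompass.openicl.icl_prompt_template import PromptTemplate
    from opencompass.openicl.icl_retriever import ZeroRetriever
    from opencompass.openicl.icl_inferencer import PPLOnlyInferencer
    from opencompass.openicl.icl_evaluator import AveragePPLEvaluator

    demo_reader_cfg = dict(input_columns=['text'], output_column=None)

    demo_infer_cfg = dict(
        prompt_template=dict(type=PromptTemplate, template='{text}'),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=PPLOnlyInferencer),
    )

    demo_eval_cfg = dict(evaluator=dict(type=AveragePPLEvaluator))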

View File

@ -1,10 +1,12 @@
import inspect
from abc import abstractmethod
from copy import deepcopy
from typing import Dict, List, Optional
from mmengine.config import ConfigDict
from opencompass.utils import get_logger, task_abbr_from_cfg
from opencompass.utils import (dataset_abbr_from_cfg, get_logger,
model_abbr_from_cfg, task_abbr_from_cfg)
class BasePartitioner:
@ -54,8 +56,7 @@ class BasePartitioner:
List[Dict]: A list of tasks.
"""
cfg = deepcopy(cfg)
models = cfg['models']
datasets = cfg['datasets']
work_dir = cfg['work_dir']
add_cfg = {}
@ -74,10 +75,11 @@ class BasePartitioner:
self.logger.debug(f'Key {k} not found in config, ignored.')
self.logger.debug(f'Additional config: {add_cfg}')
tasks = self.partition(models,
datasets,
work_dir,
self.out_dir,
model_and_dataset_args = self.parse_model_dataset_args(cfg)
tasks = self.partition(**model_and_dataset_args,
work_dir=work_dir,
out_dir=self.out_dir,
add_cfg=add_cfg)
self.logger.info(f'Partitioned into {len(tasks)} tasks.')
@ -86,6 +88,41 @@ class BasePartitioner:
return tasks
def parse_model_dataset_args(self, cfg: ConfigDict):
models = cfg['models']
datasets = cfg['datasets']
sig = inspect.signature(self.partition)
if 'model_dataset_combinations' in sig.parameters:
combs = cfg.get('model_dataset_combinations', None)
if combs is None:
combs = [{'models': models, 'datasets': datasets}]
else:
# sanity check
model_abbrs = [model_abbr_from_cfg(model) for model in models]
dataset_abbrs = [
dataset_abbr_from_cfg(dataset) for dataset in datasets
]
for comb in combs:
for model in comb['models']:
if model_abbr_from_cfg(model) not in model_abbrs:
raise ValueError(
f'Model {model_abbr_from_cfg(model)} '
'not found in config.')
for dataset in comb['datasets']:
if dataset_abbr_from_cfg(dataset) not in dataset_abbrs:
raise ValueError(
f'Dataset {dataset_abbr_from_cfg(dataset)} '
'not found in config.')
used_kwargs = {'model_dataset_combinations': combs}
else:
if cfg.get('model_dataset_combinations', None) is not None:
self.logger.warning(
'model_dataset_combinations is not supported by '
f'{self.__class__.__name__}. Ignored.')
used_kwargs = {'models': models, 'datasets': datasets}
return used_kwargs
@abstractmethod
def partition(self,
models: List[ConfigDict],
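parse_model_dataset_args above lets a run config pair specific models with specific datasets. A hypothetical fragment with the expected shape (all abbr/type values are invented); every model and dataset referenced in a combination must also appear in the top-level models/datasets lists, otherwise the sanity check raises:

    model_a = dict(abbr='model-a', type='HuggingFaceCausalLM')
    model_b = dict(abbr='model-b', type='HuggingFaceCausalLM')
    dataset_x = dict(abbr='dataset-x', type='DemoDataset')
    dataset_y = dict(abbr='dataset-y', type='DemoDataset')

    models = [model_a, model_b]
    datasets = [dataset_x, dataset_y]
    # model_a runs on both datasets, model_b only on dataset_x
    model_dataset_combinations = [
        dict(models=[model_a], datasets=[dataset_x, dataset_y]),
        dict(models=[model_b], datasets=[dataset_x]),
    ]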

View File

@ -29,8 +29,8 @@ class NaivePartitioner(BasePartitioner):
self.n = n
def partition(self,
models: List[ConfigDict],
datasets: List[ConfigDict],
model_dataset_combinations: List[Dict[str,
List[ConfigDict]]],
work_dir: str,
out_dir: str,
add_cfg: Dict = {}) -> List[Dict]:
@ -48,8 +48,9 @@ class NaivePartitioner(BasePartitioner):
}
Args:
models (List[ConfigDict]): A list of model configs.
datasets (List[ConfigDict]): A list of dataset configs.
model_dataset_combinations (List[Dict]): List of
`{models: [...], datasets: [...]}` dicts. Each dict contains
a list of model configs and a list of dataset configs.
work_dir (str): The work dir for the task.
out_dir (str): The full output path for the task, intended for
Partitioners to check whether the task is finished via the
@ -60,20 +61,21 @@ class NaivePartitioner(BasePartitioner):
"""
tasks = []
for model in models:
chunks = []
for dataset in datasets:
filename = get_infer_output_path(model, dataset, out_dir)
if osp.exists(filename):
continue
chunks.append(dataset)
for comb in model_dataset_combinations:
for model in comb['models']:
chunks = []
for dataset in comb['datasets']:
filename = get_infer_output_path(model, dataset, out_dir)
if osp.exists(filename):
continue
chunks.append(dataset)
for i in range(0, len(chunks), self.n):
task = Config({
'models': [model],
'datasets': [chunks[i:i + self.n]],
'work_dir': work_dir,
**add_cfg
})
tasks.append(task)
for i in range(0, len(chunks), self.n):
task = Config({
'models': [model],
'datasets': [chunks[i:i + self.n]],
'work_dir': work_dir,
**add_cfg
})
tasks.append(task)
return tasks

View File

@ -51,8 +51,8 @@ class SizePartitioner(BasePartitioner):
self.strategy = strategy
def partition(self,
models: List[ConfigDict],
datasets: List[ConfigDict],
model_dataset_combinations: List[Dict[str,
List[ConfigDict]]],
work_dir: str,
out_dir: str,
add_cfg: Dict = {}) -> List[ConfigDict]:
@ -71,8 +71,9 @@ class SizePartitioner(BasePartitioner):
}
Args:
models (List[ConfigDict]): A list of model configs.
datasets (List[ConfigDict]): A list of dataset configs.
model_dataset_combinations (List[Dict]): List of
`{models: [...], datasets: [...]}` dicts. Each dict contains
a list of model configs and a list of dataset configs.
work_dir (str): The work dir for the task.
out_dir (str): The full output path for the task, intended for
Partitioners to check whether the task is finished via the
@ -84,52 +85,54 @@ class SizePartitioner(BasePartitioner):
List[ConfigDict]: A list of tasks.
"""
datasets = sorted(datasets,
key=lambda x: self.get_cost(x),
reverse=True)
tasks = []
for model in models:
chunks = [] # elements: tuple(size, dataset_chunk)
for dataset in datasets:
filename = get_infer_output_path(model, dataset, out_dir)
# skip the task if the task output exists
if osp.exists(filename):
continue
dataset_size = self.get_cost(dataset)
if dataset_size > self.max_task_size:
root, ext = osp.splitext(filename)
dataset_splits = self.split_dataset(dataset)
for i, dataset_split in enumerate(dataset_splits):
if not osp.exists(f'{root}_{i}{ext}'):
chunks.append((self.max_task_size, dataset_split))
else:
chunks.append((dataset_size, dataset))
for comb in model_dataset_combinations:
comb['datasets'] = sorted(comb['datasets'],
key=lambda x: self.get_cost(x),
reverse=True)
for model in comb['models']:
chunks = [] # elements: tuple(size, dataset_chunk)
for dataset in comb['datasets']:
filename = get_infer_output_path(model, dataset, out_dir)
# skip the task if the task output exists
if osp.exists(filename):
continue
dataset_size = self.get_cost(dataset)
if dataset_size > self.max_task_size:
root, ext = osp.splitext(filename)
dataset_splits = self.split_dataset(dataset)
for i, dataset_split in enumerate(dataset_splits):
if not osp.exists(f'{root}_{i}{ext}'):
chunks.append(
(self.max_task_size, dataset_split))
else:
chunks.append((dataset_size, dataset))
if self.strategy == 'heuristic':
chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
current_size, current_chunks = 0, []
for index in range(len(chunks)):
current_size += chunks[index][0]
current_chunks.append(chunks[index][1])
if index == len(chunks) - 1 or current_size + chunks[
index + 1][0] > self.max_task_size:
if self.strategy == 'heuristic':
chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
current_size, current_chunks = 0, []
for index in range(len(chunks)):
current_size += chunks[index][0]
current_chunks.append(chunks[index][1])
if index == len(chunks) - 1 or current_size + chunks[
index + 1][0] > self.max_task_size:
tasks.append(
Config({
'models': [model],
'datasets': [current_chunks],
'work_dir': work_dir,
**add_cfg
}))
current_size, current_chunks = 0, []
elif self.strategy == 'split':
for _, dataset in chunks:
tasks.append(
Config({
'models': [model],
'datasets': [current_chunks],
'datasets': [[dataset]],
'work_dir': work_dir,
**add_cfg
}))
current_size, current_chunks = 0, []
elif self.strategy == 'split':
for _, dataset in chunks:
tasks.append(
Config({
'models': [model],
'datasets': [[dataset]],
'work_dir': work_dir,
**add_cfg
}))
return tasks
@property
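The 'heuristic' branch above is a greedy packing: chunks are sorted by cost in descending order and accumulated until adding the next one would exceed max_task_size. A standalone toy illustration of that packing logic, without any OpenCompass types:

    max_task_size = 10
    chunks = [(5, 'ds_b'), (7, 'ds_a'), (2, 'ds_d'), (4, 'ds_c')]  # (cost, dataset)

    chunks = sorted(chunks, key=lambda x: x[0], reverse=True)
    tasks, current_size, current_chunks = [], 0, []
    for index in range(len(chunks)):
        current_size += chunks[index][0]
        current_chunks.append(chunks[index][1])
        # flush the group when it is the last chunk or the next one overflows
        if index == len(chunks) - 1 or current_size + chunks[index + 1][0] > max_task_size:
            tasks.append(current_chunks)
            current_size, current_chunks = 0, []
    print(tasks)  # [['ds_a'], ['ds_b', 'ds_c'], ['ds_d']]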

View File

@ -13,7 +13,7 @@ from mmengine.config import ConfigDict
from tqdm import tqdm
from opencompass.registry import RUNNERS, TASKS
from opencompass.utils import get_logger
from opencompass.utils import batched, get_logger
from .base import BaseRunner
@ -131,15 +131,22 @@ class SlurmSequentialRunner(BaseRunner):
break
parent_conn.close()
for job_id in tqdm(job_ids, desc='clear sruns'):
if job_id is None:
continue
cmd = f'scancel {job_id}'
p = subprocess.Popen(cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
p.wait()
tbar = tqdm(total=len(job_ids), desc='clear sruns')
for batched_job_ids in batched(job_ids, 4):
ps = []
for job_id in batched_job_ids:
tbar.update()
if job_id is None:
continue
cmd = f'scancel {job_id}'
p = subprocess.Popen(cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
ps.append(p)
for p in ps:
p.wait()
tbar.close()
def _launch(self, cfg: ConfigDict, child_conn: Pipe = None):
logger = get_logger()
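The batched utility imported above is assumed to group an iterable into fixed-size chunks (like itertools.batched in Python 3.12), so at most four scancel processes run concurrently before the runner waits on them. A self-contained equivalent for illustration:

    from itertools import islice

    def batched(iterable, n):
        # yield successive tuples of at most n items
        it = iter(iterable)
        while chunk := tuple(islice(it, n)):
            yield chunk

    job_ids = ['1001', '1002', None, '1003', '1004']
    for group in batched(job_ids, 4):
        # the runner spawns one scancel per id in the group (skipping None)
        # and waits for the whole group before starting the next one
        print(group)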

View File

@ -121,8 +121,9 @@ class OpenICLEvalTask(BaseTask):
pred_dicts = copy.deepcopy(preds)
preds = {k: [pred.get(k) for pred in preds] for k in preds[0]}
pred_strs = preds.pop('prediction')
pred_list_flag = isinstance(pred_strs[0], list)
pred_strs = preds.pop('prediction', None)
pred_list_flag = pred_strs is not None and isinstance(
pred_strs[0], list)
if ('pred_role' in self.eval_cfg
and 'meta_template' in self.model_cfg
and not MODELS.get(self.model_cfg['type']).is_api):
@ -166,6 +167,12 @@ class OpenICLEvalTask(BaseTask):
]
icl_evaluator = ICL_EVALUATORS.build(self.eval_cfg['evaluator'])
# need results dir to save other files
out_path = get_infer_output_path(
self.model_cfg, self.dataset_cfg,
osp.join(self.work_dir, 'results'))
icl_evaluator._out_dir = osp.splitext(out_path)[
0] # strip extension
preds['predictions'] = pred_strs
preds['references'] = (test_set[self.output_column]

View File

@ -49,6 +49,14 @@ def first_capital_postprocess(text: str) -> str:
return ''
@TEXT_POSTPROCESSORS.register_module('last-capital')
def last_capital_postprocess(text: str) -> str:
for t in text[::-1]:
if t.isupper():
return t
return ''
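Usage example for the new postprocessor (inputs invented): it scans the text from the end and returns the first uppercase character it finds, or an empty string if there is none.

    assert last_capital_postprocess('The answer is (C). I will go with C') == 'C'
    assert last_capital_postprocess('no capital letters here') == ''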
def first_option_postprocess(text: str, options: str) -> str:
"""Find first valid option for text."""

7
requirements/agent.txt Normal file
View File

@ -0,0 +1,7 @@
json5
jupyter
jupyter_client
jupytext
lagent
scikit-image
sympy

View File

@ -1,4 +1 @@
faiss_gpu==1.7.2
jupyter
lagent
scikit-image