Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Feat] minor update agent related (#839)
* [Feat] update cibench
* [Feat] Support CIBench
commit 4aa74565e2
parent 77be07dbb5
configs/eval_chat_cibench_api.py (new file, 96 lines)
```diff
@@ -0,0 +1,96 @@
+from mmengine.config import read_base
+
+from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
+from opencompass.lagent.agents.react import CIReAct, ReActProtocol
+from opencompass.models.lagent import CodeAgent
+from opencompass.models.openai_api import OpenAI
+from opencompass.partitioners import SizePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.tasks import OpenICLInferTask
+
+with read_base():
+    from .datasets.CIBench.CIBench_template_gen_e6b12a import \
+        cibench_datasets as datasets
+
+FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
+
+FEWSHOT_INSTRUCTION = """\
+You are an assistant who can utilize external tools.
+{tool_description}
+To use a tool, please response with the following format:
+```
+{thought} Think what you need to solve, do you need to use tools?
+{action} The tool name, should be one of [{action_names}].
+{action_input} The input to the tool that you want to use.
+```
+The tool will give you response after your response using the following format:
+```
+{response} the results after call the tool.
+```
+Therefore DO NOT generate tool response by yourself.
+
+Also please follow the guidelines:
+1. Always use code interpreter to solve the problem.
+2. The generated codes should always in a markdown code block format.
+3. The generated codes will be executed in an ipython manner and the results will be cached.
+4. Your responded code should always be simple and only solves the problem in current step.
+
+For example:
+
+File url: `xxxx`
+### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
+
+{thought} We should use `pandas` to solve this step.
+{action} IPythonInterpreter
+{action_input} ```python
+import pandas as pd
+url = "xxxx"
+data = pd.read_csv(url)
+```
+{response} The code is succeed without any outputs.
+
+Let us begin from here!
+"""
+
+IPYTHON_INTERPRETER_DESCRIPTION = '''\
+It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.'''
+
+models = [
+    dict(
+        abbr='gpt-3.5-code',
+        type=CodeAgent,
+        agent_type=CIReAct,
+        max_turn=3,
+        llm=dict(
+            type=OpenAI,
+            path='gpt-3.5-turbo',
+            key='ENV',
+            query_per_second=1,
+            max_seq_len=4096,
+        ),
+        actions=[
+            dict(type=IPythonInterpreter,
+                 description=IPYTHON_INTERPRETER_DESCRIPTION,
+                 user_data_dir='./data/cibench_dataset/datasources')
+        ],
+        protocol=dict(
+            type=ReActProtocol,
+            call_protocol=FEWSHOT_INSTRUCTION,
+            force_stop=FORCE_STOP_PROMPT_EN,
+            finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
+        ),
+        batch_size=1,
+        use_system_role=False,  # use `user` role instead of system role
+        first_system_role=False,  # use `user` role of the first instruction prompt
+        merge_adjacent_role=True,  # merge adjacent same user content
+    ),
+]
+
+
+infer = dict(
+    partitioner=dict(type=SizePartitioner, max_task_size=1000),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=16,
+        task=dict(type=OpenICLInferTask)),
+)
```
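As a usage note: a config like this is normally launched through OpenCompass's `run.py` entry point, and `key='ENV'` makes the OpenAI wrapper read the key from the environment rather than from the config. A minimal launch sketch — the entry script and `OPENAI_API_KEY` variable are the standard OpenCompass conventions, not part of this diff:

```python
# Minimal launch sketch for the new config; assumes the standard OpenCompass
# entry point `run.py` and that key='ENV' triggers an environment lookup of
# OPENAI_API_KEY.
import os
import subprocess

os.environ.setdefault('OPENAI_API_KEY', '<your-key>')  # placeholder
subprocess.run(
    ['python', 'run.py', 'configs/eval_chat_cibench_api.py'],
    check=True,
)
```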
```diff
@@ -105,10 +105,11 @@ def load_experiment_template(file: str) -> dict:
             for _output in cell['outputs']:
                 if _output['output_type'] == 'display_data':
                     assert not output_flag
-                    output_flag = True
-                    tags.append('vis')
-                    outputs.append(_output['data']['image/png'])
-            for _output in cell['outputs']:
+                    if 'image/png' in _output['data']:
+                        output_flag = True
+                        tags.append('vis')
+                        outputs.append(_output['data']['image/png'])
+            for _output in cell['outputs'][::-1]:
                 if output_flag:
                     break
                 if _output['output_type'] == 'stream' and _output[
```
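Two behavioural fixes hide in this hunk: a `display_data` output now only counts as a visualization when it actually carries a PNG payload, and the second pass walks the outputs in reverse so the last stream output wins. A small sketch with invented notebook-style outputs:

```python
# Invented nbformat-style cell outputs illustrating the new PNG guard.
cell = {
    'outputs': [
        {'output_type': 'display_data', 'data': {'text/html': '<table/>'}},
        {'output_type': 'display_data', 'data': {'image/png': 'iVBORw0...'}},
    ]
}

tags, outputs, output_flag = [], [], False
for _output in cell['outputs']:
    if _output['output_type'] == 'display_data':
        # Previously the HTML-only display_data would have been appended
        # unconditionally, raising KeyError on 'image/png'.
        if 'image/png' in _output['data']:
            output_flag = True
            tags.append('vis')
            outputs.append(_output['data']['image/png'])

assert tags == ['vis'] and len(outputs) == 1
```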
```diff
@@ -290,11 +291,26 @@ class CIBenchEvaluator(BaseEvaluator):
                 if action['result']:
                     try:
                         pred = action['result']['text']
-                        match = re.search('execute_result:\n\n```\n(.*?)\n```',
-                                          pred, re.DOTALL)
+                        match_exec = re.search(
+                            'execute_result:\n\n```\n(.*?)\n```', pred,
+                            re.DOTALL)
+                        match_stdout = re.search('stdout:\n\n```\n(.*?)\n```',
+                                                 pred, re.DOTALL)
+                        # get pred result from execute_result by default
+                        # else stdout
+                        if match_exec and match_stdout:
+                            match = match_exec
+                        elif match_exec:
+                            match = match_exec
+                        elif match_stdout:
+                            match = match_stdout
+                        else:
+                            match = None
                         if match:
                             out = match.group(1)
-                            return out.strip() == target.strip()
+                            score = (out.strip() == target.strip()
+                                     or target.strip() in out.strip())
+                            return score
                     except Exception:
                         return False
         # Fall back to False
```
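To make the new matching order concrete, here is a standalone sketch on a fabricated result string (the `execute_result`/`stdout` layout mimics how the interpreter output is reported; the values are invented):

```python
import re

# Fabricated interpreter report containing both an execute_result and stdout.
pred = ('execute_result:\n\n```\n42\n```\n\n'
        'stdout:\n\n```\nsome log line\n```')
target = '42'

match_exec = re.search('execute_result:\n\n```\n(.*?)\n```', pred, re.DOTALL)
match_stdout = re.search('stdout:\n\n```\n(.*?)\n```', pred, re.DOTALL)
# The if/elif chain above boils down to: prefer execute_result over stdout.
match = match_exec or match_stdout
out = match.group(1)  # '42'
# Scoring is also relaxed: exact match, or target contained in the output.
score = out.strip() == target.strip() or target.strip() in out.strip()
assert score
```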
```diff
@@ -1,7 +1,136 @@
-from lagent.agents.react import ReAct
+import copy
+from typing import Dict, List
+
+from lagent.actions import ActionExecutor
+from lagent.agents.react import ReAct as _ReAct
+from lagent.agents.react import ReActProtocol as _ReActProtocol
 from lagent.schema import ActionReturn, ActionStatusCode, AgentReturn
+
+
+class ReActProtocol(_ReActProtocol):
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        # defaults to system
+        self.system_role = 'system'
+        self.first_system_role = 'system'
+        self.merge_adjacent_role = False
+
+    def format(self,
+               chat_history: List[Dict],
+               inner_step: List[Dict],
+               action_executor: ActionExecutor,
+               force_stop: bool = False) -> list:
+        """Generate the ReAct format prompt.
+
+        Args:
+            chat_history (List[Dict]): The history log in previous runs.
+            inner_step (List[Dict]): The log in the current run.
+            action_executor (ActionExecutor): the action manager to
+                execute actions.
+            force_stop (boolean): whether force the agent to give responses
+                under pre-defined turns.
+
+        Returns:
+            List[Dict]: ReAct format prompt.
+        """
+
+        call_protocol = self.call_protocol.format(
+            tool_description=action_executor.get_actions_info(),
+            action_names=action_executor.action_names(),
+            thought=self.thought['begin'],
+            action=self.action['begin'],
+            action_input=self.action_input['begin'],
+            response=self.response['begin'],
+            finish=self.finish['begin'],
+        )
+        formatted = []
+        formatted.append(
+            dict(role=self.first_system_role, content=call_protocol))
+        formatted += chat_history
+        formatted += inner_step
+        if force_stop:
+            formatted.append(
+                dict(role=self.system_role, content=self.force_stop))
+
+        if self.merge_adjacent_role and formatted:
+            merged = [formatted[0]]  # Add the first dict
+
+            for d in formatted[1:]:
+                # If the 'role' of current dict matches with the 'role' of the
+                # last dict in merged list,
+                # append its 'content' to the 'content' of the last dict.
+                if d['role'] == merged[-1]['role']:
+                    merged[-1]['content'] += d['content']
+                else:
+                    # If 'role' does not match, add it as a new dict in the
+                    # merged list
+                    merged.append(d)
+
+            return merged
+
+        return formatted
+
+
+class ReAct(_ReAct):
+
+    def __init__(self,
+                 use_system_role: bool = True,
+                 first_system_role: bool = True,
+                 merge_adjacent_role: bool = False,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        if use_system_role:
+            self.system_role = 'system'
+        else:
+            self.system_role = 'user'
+        if use_system_role or first_system_role:
+            first_system_role = 'system'
+        else:
+            first_system_role = 'user'
+        self._protocol.first_system_role = first_system_role
+        self._protocol.system_role = self.system_role
+        self._protocol.merge_adjacent_role = merge_adjacent_role
+
+    def chat(self, message: str) -> AgentReturn:
+        for hist in self._session_history:
+            if hist['role'] == 'system':
+                hist['role'] = self.system_role
+        self._inner_history = []
+        self._inner_history.append(dict(role='user', content=message))
+        agent_return = AgentReturn()
+        default_response = 'Sorry that I cannot answer your question.'
+        for turn in range(self.max_turn):
+            prompt = self._protocol.format(
+                chat_history=self.session_history,
+                inner_step=self._inner_history,
+                action_executor=self._action_executor,
+                force_stop=(turn == self.max_turn - 1))
+            response = self._llm.generate_from_template(prompt, 512)
+            self._inner_history.append(dict(role='assistant',
+                                            content=response))
+            thought, action, action_input = self._protocol.parse(
+                response, self._action_executor)
+            action_return: ActionReturn = self._action_executor(
+                action, action_input)
+            action_return.thought = thought
+            agent_return.actions.append(action_return)
+            if action_return.type == self._action_executor.finish_action.name:
+                agent_return.response = action_return.result['text']
+                break
+            self._inner_history.append(
+                dict(role=self.system_role,
+                     content=self._protocol.format_response(action_return)))
+        else:
+            agent_return.response = default_response
+        agent_return.inner_steps = copy.deepcopy(self._inner_history)
+        # only append the user and final response
+        self._session_history.append(dict(role='user', content=message))
+        self._session_history.append(
+            dict(role='assistant', content=agent_return.response))
+        return agent_return
+
+
 class CIReAct(ReAct):
     """Code Interpreter version of ReAct. The success state is different from
     ReAct.
```
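The `merge_adjacent_role` branch exists because some chat APIs reject consecutive messages with the same role; once system messages are remapped to `user`, adjacent user turns have to be fused. The merging loop, run standalone on invented messages:

```python
# Invented prompt list: after system->user remapping, two user turns touch.
formatted = [
    {'role': 'user', 'content': 'You are an assistant...\n'},
    {'role': 'user', 'content': 'File url: `xxxx`\n'},
    {'role': 'assistant', 'content': 'Thought: use pandas.\n'},
]

merged = [formatted[0]]
for d in formatted[1:]:
    if d['role'] == merged[-1]['role']:
        merged[-1]['content'] += d['content']  # fuse same-role neighbours
    else:
        merged.append(d)

assert [m['role'] for m in merged] == ['user', 'assistant']
```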
```diff
@@ -27,6 +156,9 @@ class CIReAct(ReAct):
         b.reset()

     def chat(self, message: str) -> AgentReturn:
+        for hist in self._session_history:
+            if hist['role'] == 'system':
+                hist['role'] = self.system_role
         self._inner_history = []
         # append the user message for session history
         self._session_history.append(dict(role='user', content=message))
```
```diff
@@ -54,14 +186,14 @@ class CIReAct(ReAct):
                     dict(role='assistant', content=response))
                 self._session_history.append(
                     dict(
-                        role='system',
+                        role=self.system_role,
                         content=self._protocol.format_response(action_return)))
                 agent_return.response = action_return.result['text']
                 return agent_return
             elif action_return.type == self._action_executor.invalid_action.name:  # noqa
                 action_return.errmsg = 'The action is invalid, please check the action name.'  # noqa
                 self._inner_history.append(
-                    dict(role='system',
+                    dict(role=self.system_role,
                          content=self._protocol.format_response(action_return)))
             if turn == self.max_turn - 1:
                 force_stop = True
```
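Both CIReAct hunks serve one goal: when the model is configured with `use_system_role=False`, no stored message may keep the `system` role, including tool responses written in earlier turns. The remap in isolation, on an invented history:

```python
# Invented session history; system_role is 'user' when the backing API
# does not accept system messages (use_system_role=False in the config).
system_role = 'user'
session_history = [
    {'role': 'system', 'content': 'Response: the code ran without output.'},
    {'role': 'user', 'content': 'Next step, please.'},
]
for hist in session_history:
    if hist['role'] == 'system':
        hist['role'] = system_role

assert all(h['role'] != 'system' for h in session_history)
```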
```diff
@@ -42,6 +42,26 @@ class LagentAgent:
     def set_history(self, history):
         self.agent._session_history = deepcopy(history)

+    def gt_response(self, prompt):
+        if 'CIReAct' in str(self.agent.__class__):
+            gold = prompt
+            prompt = f"""{self.agent._protocol.action['begin']} IPythonInterpreter
+{self.agent._protocol.action_input['begin']} ```python\n{gold}\n```\n"""  # noqa
+            action_input = dict(
+                command=f"""```python\n{gold}\n```\n""",
+                timeout=120,
+            )
+            response = self.agent._action_executor('IPythonInterpreter',
+                                                   action_input)
+            gt_response = dict(role='assistant', content=prompt)
+            system_response = dict(
+                role='system',
+                content=self.agent._protocol.format_response(response))
+            return [gt_response, system_response]
+        else:
+            gt_response = dict(role='assistant', content=prompt)
+            return [gt_response]
+
     @property
     def template_parser(self):
         return self.agent._llm.template_parser
```
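`gt_response` replays a gold answer as if the agent had produced it: for `CIReAct` it wraps the code in the protocol's action format and really executes it through the `IPythonInterpreter` action, so the kernel state stays consistent with the gold trajectory; for other agents it returns the gold text as a plain assistant message. A sketch of the prompt it rebuilds, assuming the `Action:`/`Action Input:` markers are the lagent protocol defaults:

```python
# Sketch of the assistant message gt_response fabricates for CIReAct.
gold = 'df = pd.read_csv(url)'
action_begin = 'Action:'              # assumed _protocol.action['begin']
action_input_begin = 'Action Input:'  # assumed _protocol.action_input['begin']
prompt = f"""{action_begin} IPythonInterpreter
{action_input_begin} ```python\n{gold}\n```\n"""
print(prompt)
# Action: IPythonInterpreter
# Action Input: ```python
# df = pd.read_csv(url)
# ```
```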
```diff
@@ -124,8 +124,15 @@ class AgentInferencer(ChatInferencer):
             i for i, item in enumerate(chat) if item['role'] == 'assistant'
         ]

+        history = chat[:assistant_indices[0] - 1]
+        prev_idx = 0
         for i in assistant_indices:
-            self.model.set_history(chat[:i - 1])
+            for j in range(prev_idx, i - 1):
+                if chat[j]['role'] == 'assistant':
+                    history += self.model.gt_response(chat[j]['content'])
+                elif chat[j]['role'] == 'user':
+                    history += [chat[j]]
+            self.model.set_history(history)
             answer, steps, _ = self.model.chat(chat[i - 1]['content'])
             output_handler.save_multiround_results(
                 origin_prompt=chat[i - 1]['content'],
```
```diff
@@ -134,4 +141,6 @@ class AgentInferencer(ChatInferencer):
                 idx=index,
                 gold=chat[i]['content'],
             )
+            history += [chat[i - 1]]
+            prev_idx = i
         self.model.reset()
```
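Taken together, the inferencer no longer seeds each round with the raw gold messages (`chat[:i - 1]`); it rebuilds the history incrementally, replaying every earlier gold assistant turn through `gt_response`. A schematic trace over an invented multi-round sample (assuming the chat opens with an instruction turn, as the `chat[:assistant_indices[0] - 1]` slice implies):

```python
# Invented multi-round chat for illustration.
chat = [
    {'role': 'system', 'content': 'You can use a code interpreter.'},  # 0
    {'role': 'user', 'content': 'Step 1: load the csv into df.'},      # 1
    {'role': 'assistant', 'content': 'df = pd.read_csv(url)'},         # 2 gold
    {'role': 'user', 'content': 'Step 2: plot df.'},                   # 3
    {'role': 'assistant', 'content': 'df.plot()'},                     # 4 gold
]
assistant_indices = [i for i, m in enumerate(chat)
                     if m['role'] == 'assistant']  # [2, 4]
# Round 1 (i=2): history = chat[:1]; the model answers chat[1].
# Round 2 (i=4): the gold answer chat[2] is replayed via gt_response --
# i.e. really executed by the interpreter -- and chat[3] is appended, so
# the model answers Step 2 against the kernel state the gold Step 1 left.
```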
```diff
@@ -6,6 +6,8 @@ jupyter
 jupyter_client
 jupytext
 lagent
+lightgbm==4.1.0
 networkx
 scikit-image
 sympy==1.12
+tensorflow==2.14.0
```