mirror of
https://github.com/open-compass/opencompass.git
synced 2025-05-30 16:03:24 +08:00
[Feat] minor update agent related (#839)
* [Feat] update cibench * [Feat] Support CIBench * [Feat] Support CIBench * [Feat] Support CIBench * [Feat] Support CIBench
This commit is contained in:
parent
77be07dbb5
commit
4aa74565e2
96
configs/eval_chat_cibench_api.py
Normal file
96
configs/eval_chat_cibench_api.py
Normal file
@ -0,0 +1,96 @@
|
||||
# Evaluation config: run CIBench with a GPT-3.5 CodeAgent (CIReAct protocol).
from mmengine.config import read_base

from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.agents.react import CIReAct, ReActProtocol
from opencompass.models.lagent import CodeAgent
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from .datasets.CIBench.CIBench_template_gen_e6b12a import \
        cibench_datasets as datasets

# Prompt used to force a final answer on the last allowed turn.
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""

# Few-shot system instruction for the ReAct-style code-interpreter agent.
# The {placeholders} are filled in by ReActProtocol.format at runtime.
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please response with the following format:
```
{thought} Think what you need to solve, do you need to use tools?
{action} The tool name, should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
The tool will give you response after your response using the following format:
```
{response} the results after call the tool.
```
Therefore DO NOT generate tool response by yourself.

Also please follow the guidelines:
1. Always use code interpreter to solve the problem.
2. The generated codes should always in a markdown code block format.
3. The generated codes will be executed in an ipython manner and the results will be cached.
4. Your responded code should always be simple and only solves the problem in current step.

For example:

File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.

{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
data = pd.read_csv(url)
```
{response} The code is succeed without any outputs.

Let us begin from here!
"""

IPYTHON_INTERPRETER_DESCRIPTION = '''\
It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.'''

models = [
    dict(
        abbr='gpt-3.5-code',
        type=CodeAgent,
        agent_type=CIReAct,
        max_turn=3,
        llm=dict(
            type=OpenAI,
            path='gpt-3.5-turbo',
            key='ENV',  # API key is read from the environment
            query_per_second=1,
            max_seq_len=4096,
        ),
        actions=[
            dict(type=IPythonInterpreter,
                 description=IPYTHON_INTERPRETER_DESCRIPTION,
                 user_data_dir='./data/cibench_dataset/datasources')
        ],
        protocol=dict(
            type=ReActProtocol,
            call_protocol=FEWSHOT_INSTRUCTION,
            force_stop=FORCE_STOP_PROMPT_EN,
            finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
        ),
        batch_size=1,
        use_system_role=False,  # use `user` role instead of system role
        first_system_role=False,  # use `user` role of the first instruction prompt
        merge_adjacent_role=True,  # merge adjacent same user content
    ),
]

infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=1000),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLInferTask)),
)
|
@ -105,10 +105,11 @@ def load_experiment_template(file: str) -> dict:
|
||||
for _output in cell['outputs']:
|
||||
if _output['output_type'] == 'display_data':
|
||||
assert not output_flag
|
||||
output_flag = True
|
||||
tags.append('vis')
|
||||
outputs.append(_output['data']['image/png'])
|
||||
for _output in cell['outputs']:
|
||||
if 'image/png' in _output['data']:
|
||||
output_flag = True
|
||||
tags.append('vis')
|
||||
outputs.append(_output['data']['image/png'])
|
||||
for _output in cell['outputs'][::-1]:
|
||||
if output_flag:
|
||||
break
|
||||
if _output['output_type'] == 'stream' and _output[
|
||||
@ -290,11 +291,26 @@ class CIBenchEvaluator(BaseEvaluator):
|
||||
if action['result']:
|
||||
try:
|
||||
pred = action['result']['text']
|
||||
match = re.search('execute_result:\n\n```\n(.*?)\n```',
|
||||
pred, re.DOTALL)
|
||||
match_exec = re.search(
|
||||
'execute_result:\n\n```\n(.*?)\n```', pred,
|
||||
re.DOTALL)
|
||||
match_stdout = re.search('stdout:\n\n```\n(.*?)\n```',
|
||||
pred, re.DOTALL)
|
||||
# get pred result from execute_result by default
|
||||
# else stdout
|
||||
if match_exec and match_stdout:
|
||||
match = match_exec
|
||||
elif match_exec:
|
||||
match = match_exec
|
||||
elif match_stdout:
|
||||
match = match_stdout
|
||||
else:
|
||||
match = None
|
||||
if match:
|
||||
out = match.group(1)
|
||||
return out.strip() == target.strip()
|
||||
score = (out.strip() == target.strip()
|
||||
or target.strip() in out.strip())
|
||||
return score
|
||||
except Exception:
|
||||
return False
|
||||
# Fall back to False
|
||||
|
@ -1,7 +1,136 @@
|
||||
from lagent.agents.react import ReAct
|
||||
import copy
|
||||
from typing import Dict, List
|
||||
|
||||
from lagent.actions import ActionExecutor
|
||||
from lagent.agents.react import ReAct as _ReAct
|
||||
from lagent.agents.react import ReActProtocol as _ReActProtocol
|
||||
from lagent.schema import ActionReturn, ActionStatusCode, AgentReturn
|
||||
|
||||
|
||||
class ReActProtocol(_ReActProtocol):
    """ReAct prompt protocol with configurable chat roles.

    Extends lagent's ``ReActProtocol`` so that the role used for
    system-style messages can be overridden (some chat APIs reject or
    ignore non-leading ``system`` messages) and adjacent same-role
    messages can optionally be merged into one.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        # Defaults preserve upstream behavior: all instruction/system
        # messages use the `system` role and no merging is performed.
        self.system_role = 'system'
        self.first_system_role = 'system'
        self.merge_adjacent_role = False

    def format(self,
               chat_history: List[Dict],
               inner_step: List[Dict],
               action_executor: ActionExecutor,
               force_stop: bool = False) -> list:
        """Generate the ReAct format prompt.

        Args:
            chat_history (List[Dict]): The history log in previous runs.
            inner_step (List[Dict]): The log in the current run.
            action_executor (ActionExecutor): the action manager to
                execute actions.
            force_stop (boolean): whether force the agent to give responses
                under pre-defined turns.

        Returns:
            List[Dict]: ReAct format prompt.
        """
        # Fill the instruction template with the available tools and the
        # protocol's begin-markers for each ReAct section.
        call_protocol = self.call_protocol.format(
            tool_description=action_executor.get_actions_info(),
            action_names=action_executor.action_names(),
            thought=self.thought['begin'],
            action=self.action['begin'],
            action_input=self.action_input['begin'],
            response=self.response['begin'],
            finish=self.finish['begin'],
        )
        formatted = []
        formatted.append(
            dict(role=self.first_system_role, content=call_protocol))
        formatted += chat_history
        formatted += inner_step
        if force_stop:
            formatted.append(
                dict(role=self.system_role, content=self.force_stop))

        if self.merge_adjacent_role and formatted:
            merged = [formatted[0]]  # Add the first dict
            for d in formatted[1:]:
                # If the 'role' of the current dict matches the 'role' of
                # the last dict in the merged list, append its 'content'
                # to the last dict's 'content'; otherwise start a new
                # entry in the merged list.
                if d['role'] == merged[-1]['role']:
                    merged[-1]['content'] += d['content']
                else:
                    merged.append(d)
            return merged

        return formatted
|
||||
|
||||
|
||||
class ReAct(_ReAct):
    """ReAct agent with configurable roles for system-style messages.

    Args:
        use_system_role (bool): if False, intermediate system-style
            messages are sent with the `user` role instead of `system`.
        first_system_role (bool): if False (and ``use_system_role`` is
            also False), the first instruction prompt uses the `user` role.
        merge_adjacent_role (bool): merge adjacent messages sharing the
            same role into a single message.
    """

    def __init__(self,
                 use_system_role: bool = True,
                 first_system_role: bool = True,
                 merge_adjacent_role: bool = False,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        if use_system_role:
            self.system_role = 'system'
        else:
            self.system_role = 'user'
        # NOTE(review): `use_system_role=True` forces the first prompt to
        # `system` even when `first_system_role=False` — confirm intended.
        if use_system_role or first_system_role:
            first_system_role = 'system'
        else:
            first_system_role = 'user'
        self._protocol.first_system_role = first_system_role
        self._protocol.system_role = self.system_role
        self._protocol.merge_adjacent_role = merge_adjacent_role

    def chat(self, message: str) -> AgentReturn:
        """Run up to ``self.max_turn`` think/act turns for *message*.

        Returns:
            AgentReturn: executed actions, the inner reasoning steps, and
            the final response (a default apology if no turn finished).
        """
        # Re-map stale `system` entries in history to the configured role.
        for hist in self._session_history:
            if hist['role'] == 'system':
                hist['role'] = self.system_role
        self._inner_history = []
        self._inner_history.append(dict(role='user', content=message))
        agent_return = AgentReturn()
        default_response = 'Sorry that I cannot answer your question.'
        for turn in range(self.max_turn):
            prompt = self._protocol.format(
                chat_history=self.session_history,
                inner_step=self._inner_history,
                action_executor=self._action_executor,
                # Force a final answer on the last allowed turn.
                force_stop=(turn == self.max_turn - 1))
            response = self._llm.generate_from_template(prompt, 512)
            self._inner_history.append(dict(role='assistant',
                                            content=response))
            thought, action, action_input = self._protocol.parse(
                response, self._action_executor)
            action_return: ActionReturn = self._action_executor(
                action, action_input)
            action_return.thought = thought
            agent_return.actions.append(action_return)
            if action_return.type == self._action_executor.finish_action.name:
                agent_return.response = action_return.result['text']
                break
            self._inner_history.append(
                dict(role=self.system_role,
                     content=self._protocol.format_response(action_return)))
        else:
            agent_return.response = default_response
        agent_return.inner_steps = copy.deepcopy(self._inner_history)
        # only append the user and final response
        self._session_history.append(dict(role='user', content=message))
        self._session_history.append(
            dict(role='assistant', content=agent_return.response))
        return agent_return
|
||||
|
||||
|
||||
class CIReAct(ReAct):
|
||||
"""Code Interpreter version of ReAct. The success state is different from
|
||||
ReAct.
|
||||
@ -27,6 +156,9 @@ class CIReAct(ReAct):
|
||||
b.reset()
|
||||
|
||||
def chat(self, message: str) -> AgentReturn:
|
||||
for hist in self._session_history:
|
||||
if hist['role'] == 'system':
|
||||
hist['role'] = self.system_role
|
||||
self._inner_history = []
|
||||
# append the user message for session history
|
||||
self._session_history.append(dict(role='user', content=message))
|
||||
@ -54,14 +186,14 @@ class CIReAct(ReAct):
|
||||
dict(role='assistant', content=response))
|
||||
self._session_history.append(
|
||||
dict(
|
||||
role='system',
|
||||
role=self.system_role,
|
||||
content=self._protocol.format_response(action_return)))
|
||||
agent_return.response = action_return.result['text']
|
||||
return agent_return
|
||||
elif action_return.type == self._action_executor.invalid_action.name: # noqa
|
||||
action_return.errmsg = 'The action is invalid, please check the action name.' # noqa
|
||||
self._inner_history.append(
|
||||
dict(role='system',
|
||||
dict(role=self.system_role,
|
||||
content=self._protocol.format_response(action_return)))
|
||||
if turn == self.max_turn - 1:
|
||||
force_stop = True
|
||||
|
@ -42,6 +42,26 @@ class LagentAgent:
|
||||
def set_history(self, history):
    """Replace the wrapped agent's session history with a deep copy.

    A deep copy keeps later in-place mutations of the agent's history
    from leaking back into the caller's ``history`` list.
    """
    self.agent._session_history = deepcopy(history)
|
||||
|
||||
def gt_response(self, prompt):
    """Build ground-truth chat turns for teacher-forcing evaluation.

    For CIReAct-style agents the gold code is actually executed through
    the agent's ``IPythonInterpreter`` action so later steps see real
    interpreter output; otherwise the prompt is returned verbatim as the
    assistant's gold message.

    Returns:
        list[dict]: one assistant message, plus a system message with the
        formatted execution result for CIReAct agents.
    """
    if 'CIReAct' in str(self.agent.__class__):
        gold = prompt
        # Re-wrap the gold code as a ReAct action + action_input turn.
        prompt = f"""{self.agent._protocol.action['begin']} IPythonInterpreter
{self.agent._protocol.action_input['begin']} ```python\n{gold}\n```\n"""  # noqa
        action_input = dict(
            command=f"""```python\n{gold}\n```\n""",
            timeout=120,
        )
        # Execute the gold code so its output is cached in the kernel.
        response = self.agent._action_executor('IPythonInterpreter',
                                               action_input)
        gt_response = dict(role='assistant', content=prompt)
        system_response = dict(
            role='system',
            content=self.agent._protocol.format_response(response))
        return [gt_response, system_response]
    else:
        gt_response = dict(role='assistant', content=prompt)
        return [gt_response]
|
||||
|
||||
@property
def template_parser(self):
    """Expose the underlying LLM's template parser (read-only)."""
    return self.agent._llm.template_parser
|
||||
|
@ -124,8 +124,15 @@ class AgentInferencer(ChatInferencer):
|
||||
i for i, item in enumerate(chat) if item['role'] == 'assistant'
|
||||
]
|
||||
|
||||
history = chat[:assistant_indices[0] - 1]
|
||||
prev_idx = 0
|
||||
for i in assistant_indices:
|
||||
self.model.set_history(chat[:i - 1])
|
||||
for j in range(prev_idx, i - 1):
|
||||
if chat[j]['role'] == 'assistant':
|
||||
history += self.model.gt_response(chat[j]['content'])
|
||||
elif chat[j]['role'] == 'user':
|
||||
history += [chat[j]]
|
||||
self.model.set_history(history)
|
||||
answer, steps, _ = self.model.chat(chat[i - 1]['content'])
|
||||
output_handler.save_multiround_results(
|
||||
origin_prompt=chat[i - 1]['content'],
|
||||
@ -134,4 +141,6 @@ class AgentInferencer(ChatInferencer):
|
||||
idx=index,
|
||||
gold=chat[i]['content'],
|
||||
)
|
||||
history += [chat[i - 1]]
|
||||
prev_idx = i
|
||||
self.model.reset()
|
||||
|
@ -6,6 +6,8 @@ jupyter
|
||||
jupyter_client
|
||||
jupytext
|
||||
lagent
|
||||
lightgbm==4.1.0
|
||||
networkx
|
||||
scikit-image
|
||||
sympy==1.12
|
||||
tensorflow==2.14.0
|
||||
|
Loading…
Reference in New Issue
Block a user