"""OpenCompass config: evaluate a chat API model on CIBench as a code agent.

The model is wrapped in a ``CodeAgent`` driven by ``CIReAct`` and is given an
``IPythonInterpreter`` action. All prompts are sent with the ``user`` role
(see the ``use_system_role`` / ``first_system_role`` flags below).
"""
from mmengine.config import read_base

from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
from opencompass.lagent.agents.react import CIReAct, ReActProtocol
from opencompass.models.lagent import CodeAgent
from opencompass.models.openai_api import OpenAI
from opencompass.partitioners import SizePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask

with read_base():
    from .datasets.CIBench.CIBench_template_gen_e6b12a import \
        cibench_datasets as datasets

# Prompt appended on the last allowed turn to force a direct answer.
FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""

# ReAct call protocol. The `{...}` fields are filled in by
# `ReActProtocol.format`. The worked example must agree with its own
# instruction, so the DataFrame is stored in `df` exactly as Step 1 demands.
FEWSHOT_INSTRUCTION = """\
You are an assistant who can utilize external tools.
{tool_description}
To use a tool, please response with the following format:
```
{thought} Think what you need to solve, do you need to use tools?
{action} The tool name, should be one of [{action_names}].
{action_input} The input to the tool that you want to use.
```
The tool will give you response after your response using the following format:
```
{response} the results after call the tool.
```
Therefore DO NOT generate tool response by yourself.

Also please follow the guidelines:
1. Always use code interpreter to solve the problem.
2. The generated codes should always in a markdown code block format.
3. The generated codes will be executed in an ipython manner and the results will be cached.
4. Your responded code should always be simple and only solves the problem in current step.

For example:

File url: `xxxx`
### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.

{thought} We should use `pandas` to solve this step.
{action} IPythonInterpreter
{action_input} ```python
import pandas as pd
url = "xxxx"
df = pd.read_csv(url)
```
{response} The code is succeed without any outputs.

Let us begin from here!
"""

# Tool description shown to the model for the IPython interpreter action.
IPYTHON_INTERPRETER_DESCRIPTION = '''\
It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.'''

models = [
    dict(
        abbr='gpt-3.5-code',
        type=CodeAgent,
        agent_type=CIReAct,
        max_turn=3,
        llm=dict(
            type=OpenAI,
            path='gpt-3.5-turbo',
            key='ENV',
            query_per_second=1,
            max_seq_len=4096,
        ),
        actions=[
            dict(type=IPythonInterpreter,
                 description=IPYTHON_INTERPRETER_DESCRIPTION,
                 user_data_dir='./data/cibench_dataset/datasources')
        ],
        protocol=dict(
            type=ReActProtocol,
            call_protocol=FEWSHOT_INSTRUCTION,
            force_stop=FORCE_STOP_PROMPT_EN,
            finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
        ),
        batch_size=1,
        use_system_role=False,  # use `user` role instead of system role
        first_system_role=False,  # use `user` role of the first instruction prompt
        merge_adjacent_role=True,  # merge adjacent same user content
    ),
]


infer = dict(
    partitioner=dict(type=SizePartitioner, max_task_size=1000),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLInferTask)),
)
import copy
from typing import Dict, List

from lagent.actions import ActionExecutor
from lagent.agents.react import ReAct as _ReAct
from lagent.agents.react import ReActProtocol as _ReActProtocol
from lagent.schema import ActionReturn, ActionStatusCode, AgentReturn


class ReActProtocol(_ReActProtocol):
    """ReAct prompt protocol whose injected prompts use configurable roles.

    The three attributes set in ``__init__`` are plain defaults; ``ReAct``
    overwrites them on its protocol instance after construction.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        # Role of prompts injected after the first one (the force-stop prompt).
        self.system_role = 'system'
        # Role of the leading instruction prompt.
        self.first_system_role = 'system'
        # Whether consecutive messages with the same role are concatenated.
        self.merge_adjacent_role = False

    def format(self,
               chat_history: List[Dict],
               inner_step: List[Dict],
               action_executor: ActionExecutor,
               force_stop: bool = False) -> list:
        """Generate the ReAct format prompt.

        Args:
            chat_history (List[Dict]): The history log in previous runs.
            inner_step (List[Dict]): The log in the current run.
            action_executor (ActionExecutor): the action manager to
                execute actions.
            force_stop (boolean): whether force the agent to give responses
                under pre-defined turns.

        Returns:
            List[Dict]: ReAct format prompt. When ``merge_adjacent_role`` is
            enabled, consecutive messages sharing a role are concatenated
            into one message; the input lists and their dicts are left
            untouched.
        """
        call_protocol = self.call_protocol.format(
            tool_description=action_executor.get_actions_info(),
            action_names=action_executor.action_names(),
            thought=self.thought['begin'],
            action=self.action['begin'],
            action_input=self.action_input['begin'],
            response=self.response['begin'],
            finish=self.finish['begin'],
        )
        formatted = [dict(role=self.first_system_role, content=call_protocol)]
        formatted += chat_history
        formatted += inner_step
        if force_stop:
            formatted.append(
                dict(role=self.system_role, content=self.force_stop))

        if not (self.merge_adjacent_role and formatted):
            return formatted

        # Merge into shallow copies: the entries of `formatted` are the very
        # dicts stored in the caller's history, so concatenating in place
        # would corrupt that history and duplicate the merged text on every
        # later call.
        merged = [dict(formatted[0])]
        for message in formatted[1:]:
            if message['role'] == merged[-1]['role']:
                merged[-1]['content'] += message['content']
            else:
                merged.append(dict(message))
        return merged


class ReAct(_ReAct):
    """ReAct agent that can send its injected prompts as ``user`` messages.

    Args:
        use_system_role (bool): Use the ``system`` role for injected prompts
            and tool responses; otherwise use ``user``.
        first_system_role (bool): Use the ``system`` role for the leading
            instruction prompt. Only consulted when ``use_system_role`` is
            False, because ``use_system_role=True`` already implies it.
        merge_adjacent_role (bool): Forwarded to the protocol; concatenates
            consecutive messages that share a role.
        **kwargs: Passed through to the parent ``ReAct`` constructor.
    """

    def __init__(self,
                 use_system_role: bool = True,
                 first_system_role: bool = True,
                 merge_adjacent_role: bool = False,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self.system_role = 'system' if use_system_role else 'user'
        leading_role = ('system'
                        if use_system_role or first_system_role else 'user')
        self._protocol.first_system_role = leading_role
        self._protocol.system_role = self.system_role
        self._protocol.merge_adjacent_role = merge_adjacent_role

    def chat(self, message: str) -> AgentReturn:
        """Run the ReAct loop for one user message.

        Args:
            message (str): The user message for this round.

        Returns:
            AgentReturn: The executed actions, the final response (or a
            default apology when no finish action occurs within
            ``max_turn`` turns) and a deep copy of the inner steps.
        """
        # History injected from outside may carry the 'system' role; rewrite
        # it to the role this agent is configured to use.
        for hist in self._session_history:
            if hist['role'] == 'system':
                hist['role'] = self.system_role
        self._inner_history = []
        self._inner_history.append(dict(role='user', content=message))
        agent_return = AgentReturn()
        default_response = 'Sorry that I cannot answer your question.'
        for turn in range(self.max_turn):
            prompt = self._protocol.format(
                chat_history=self.session_history,
                inner_step=self._inner_history,
                action_executor=self._action_executor,
                force_stop=(turn == self.max_turn - 1))
            response = self._llm.generate_from_template(prompt, 512)
            self._inner_history.append(
                dict(role='assistant', content=response))
            thought, action, action_input = self._protocol.parse(
                response, self._action_executor)
            action_return: ActionReturn = self._action_executor(
                action, action_input)
            action_return.thought = thought
            agent_return.actions.append(action_return)
            if action_return.type == self._action_executor.finish_action.name:
                agent_return.response = action_return.result['text']
                break
            self._inner_history.append(
                dict(role=self.system_role,
                     content=self._protocol.format_response(action_return)))
        else:
            # The loop ran out of turns without a finish action.
            agent_return.response = default_response
        agent_return.inner_steps = copy.deepcopy(self._inner_history)
        # only append the user and final response
        self._session_history.append(dict(role='user', content=message))
        self._session_history.append(
            dict(role='assistant', content=agent_return.response))
        return agent_return
def gt_response(self, prompt):
    """Build the history messages for one ground-truth assistant turn.

    For a code-interpreter agent (``CIReAct`` or any subclass of it) the
    gold code in ``prompt`` is executed through the agent's
    ``IPythonInterpreter`` action, and the formatted tool response is
    returned alongside the assistant message. For any other agent the
    prompt is returned unchanged as a single assistant message.

    Args:
        prompt (str): Gold content of the assistant turn; for
            code-interpreter agents this is the gold code.

    Returns:
        list[dict]: ``[assistant_message]`` or, for code-interpreter
        agents, ``[assistant_message, tool_response_message]`` where the
        tool response carries the ``'system'`` role.
    """
    agent = self.agent
    agent_cls = agent.__class__
    # The substring test on the class repr is kept for backward
    # compatibility; the MRO walk additionally recognises subclasses of
    # CIReAct whose own name differs.
    is_code_agent = ('CIReAct' in str(agent_cls) or any(
        base.__name__ == 'CIReAct' for base in agent_cls.__mro__))
    if not is_code_agent:
        return [dict(role='assistant', content=prompt)]

    protocol = agent._protocol
    code_block = f'```python\n{prompt}\n```\n'
    assistant_content = (f"{protocol.action['begin']} IPythonInterpreter\n"
                         f"{protocol.action_input['begin']} {code_block}")
    # Run the gold code so later rounds see the interpreter state it leaves.
    action_return = agent._action_executor(
        'IPythonInterpreter', dict(command=code_block, timeout=120))
    return [
        dict(role='assistant', content=assistant_content),
        dict(role='system',
             content=protocol.format_response(action_return)),
    ]
return self.agent._llm.template_parser diff --git a/opencompass/openicl/icl_inferencer/icl_agent_inferencer.py b/opencompass/openicl/icl_inferencer/icl_agent_inferencer.py index 5cea69ad..56bbce01 100644 --- a/opencompass/openicl/icl_inferencer/icl_agent_inferencer.py +++ b/opencompass/openicl/icl_inferencer/icl_agent_inferencer.py @@ -124,8 +124,15 @@ class AgentInferencer(ChatInferencer): i for i, item in enumerate(chat) if item['role'] == 'assistant' ] + history = chat[:assistant_indices[0] - 1] + prev_idx = 0 for i in assistant_indices: - self.model.set_history(chat[:i - 1]) + for j in range(prev_idx, i - 1): + if chat[j]['role'] == 'assistant': + history += self.model.gt_response(chat[j]['content']) + elif chat[j]['role'] == 'user': + history += [chat[j]] + self.model.set_history(history) answer, steps, _ = self.model.chat(chat[i - 1]['content']) output_handler.save_multiround_results( origin_prompt=chat[i - 1]['content'], @@ -134,4 +141,6 @@ class AgentInferencer(ChatInferencer): idx=index, gold=chat[i]['content'], ) + history += [chat[i - 1]] + prev_idx = i self.model.reset() diff --git a/requirements/agent.txt b/requirements/agent.txt index 23c8b6a4..3fc7630b 100644 --- a/requirements/agent.txt +++ b/requirements/agent.txt @@ -6,6 +6,8 @@ jupyter jupyter_client jupytext lagent +lightgbm==4.1.0 networkx scikit-image sympy==1.12 +tensorflow==2.14.0