Mirror of https://github.com/open-compass/opencompass.git (synced 2025-05-30 16:03:24 +08:00)
[Feat] minor update agent related (#839)
* [Feat] update cibench
* [Feat] Support CIBench
commit 4aa74565e2
parent 77be07dbb5
configs/eval_chat_cibench_api.py (new file, 96 lines)
```diff
@@ -0,0 +1,96 @@
+from mmengine.config import read_base
+
+from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
+from opencompass.lagent.agents.react import CIReAct, ReActProtocol
+from opencompass.models.lagent import CodeAgent
+from opencompass.models.openai_api import OpenAI
+from opencompass.partitioners import SizePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.tasks import OpenICLInferTask
+
+with read_base():
+    from .datasets.CIBench.CIBench_template_gen_e6b12a import \
+        cibench_datasets as datasets
+
+FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
+
+FEWSHOT_INSTRUCTION = """\
+You are an assistant who can utilize external tools.
+{tool_description}
+To use a tool, please response with the following format:
+```
+{thought} Think what you need to solve, do you need to use tools?
+{action} The tool name, should be one of [{action_names}].
+{action_input} The input to the tool that you want to use.
+```
+The tool will give you response after your response using the following format:
+```
+{response} the results after call the tool.
+```
+Therefore DO NOT generate tool response by yourself.
+
+Also please follow the guidelines:
+1. Always use code interpreter to solve the problem.
+2. The generated codes should always in a markdown code block format.
+3. The generated codes will be executed in an ipython manner and the results will be cached.
+4. Your responded code should always be simple and only solves the problem in current step.
+
+For example:
+
+File url: `xxxx`
+### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
+
+{thought} We should use `pandas` to solve this step.
+{action} IPythonInterpreter
+{action_input} ```python
+import pandas as pd
+url = "xxxx"
+data = pd.read_csv(url)
+```
+{response} The code is succeed without any outputs.
+
+Let us begin from here!
+"""
+
+IPYTHON_INTERPRETER_DESCRIPTION = '''\
+It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.'''
+
+models = [
+    dict(
+        abbr='gpt-3.5-code',
+        type=CodeAgent,
+        agent_type=CIReAct,
+        max_turn=3,
+        llm=dict(
+            type=OpenAI,
+            path='gpt-3.5-turbo',
+            key='ENV',
+            query_per_second=1,
+            max_seq_len=4096,
+        ),
+        actions=[
+            dict(type=IPythonInterpreter,
+                 description=IPYTHON_INTERPRETER_DESCRIPTION,
+                 user_data_dir='./data/cibench_dataset/datasources')
+        ],
+        protocol=dict(
+            type=ReActProtocol,
+            call_protocol=FEWSHOT_INSTRUCTION,
+            force_stop=FORCE_STOP_PROMPT_EN,
+            finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
+        ),
+        batch_size=1,
+        use_system_role=False,  # use `user` role instead of system role
+        first_system_role=False,  # use `user` role of the first instruction prompt
+        merge_adjacent_role=True,  # merge adjacent same user content
+    ),
+]
+
+
+infer = dict(
+    partitioner=dict(type=SizePartitioner, max_task_size=1000),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=16,
+        task=dict(type=OpenICLInferTask)),
+)
```
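As a usage note: a config like this is normally launched through OpenCompass's `run.py` entry point, and `key='ENV'` makes the OpenAI wrapper read the key from the environment rather than from the config. A minimal launch sketch — the entry script and `OPENAI_API_KEY` variable are the standard OpenCompass conventions, not part of this diff:

```python
# Minimal launch sketch for the new config; assumes the standard OpenCompass
# entry point `run.py` and that key='ENV' triggers an environment lookup of
# OPENAI_API_KEY.
import os
import subprocess

os.environ.setdefault('OPENAI_API_KEY', '<your-key>')  # placeholder
subprocess.run(
    ['python', 'run.py', 'configs/eval_chat_cibench_api.py'],
    check=True,
)
```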
```diff
@@ -105,10 +105,11 @@ def load_experiment_template(file: str) -> dict:
             for _output in cell['outputs']:
                 if _output['output_type'] == 'display_data':
                     assert not output_flag
-                    output_flag = True
-                    tags.append('vis')
-                    outputs.append(_output['data']['image/png'])
-            for _output in cell['outputs']:
+                    if 'image/png' in _output['data']:
+                        output_flag = True
+                        tags.append('vis')
+                        outputs.append(_output['data']['image/png'])
+            for _output in cell['outputs'][::-1]:
                 if output_flag:
                     break
                 if _output['output_type'] == 'stream' and _output[
```
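Two behavioural fixes hide in this hunk: a `display_data` output now only counts as a visualization when it actually carries a PNG payload, and the second pass walks the outputs in reverse so the last stream output wins. A small sketch with invented notebook-style outputs:

```python
# Invented nbformat-style cell outputs illustrating the new PNG guard.
cell = {
    'outputs': [
        {'output_type': 'display_data', 'data': {'text/html': '<table/>'}},
        {'output_type': 'display_data', 'data': {'image/png': 'iVBORw0...'}},
    ]
}

tags, outputs, output_flag = [], [], False
for _output in cell['outputs']:
    if _output['output_type'] == 'display_data':
        # Previously the HTML-only display_data would have been appended
        # unconditionally, raising KeyError on 'image/png'.
        if 'image/png' in _output['data']:
            output_flag = True
            tags.append('vis')
            outputs.append(_output['data']['image/png'])

assert tags == ['vis'] and len(outputs) == 1
```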
```diff
@@ -290,11 +291,26 @@ class CIBenchEvaluator(BaseEvaluator):
                 if action['result']:
                     try:
                         pred = action['result']['text']
-                        match = re.search('execute_result:\n\n```\n(.*?)\n```',
-                                          pred, re.DOTALL)
+                        match_exec = re.search(
+                            'execute_result:\n\n```\n(.*?)\n```', pred,
+                            re.DOTALL)
+                        match_stdout = re.search('stdout:\n\n```\n(.*?)\n```',
+                                                 pred, re.DOTALL)
+                        # get pred result from execute_result by default
+                        # else stdout
+                        if match_exec and match_stdout:
+                            match = match_exec
+                        elif match_exec:
+                            match = match_exec
+                        elif match_stdout:
+                            match = match_stdout
+                        else:
+                            match = None
                         if match:
                             out = match.group(1)
-                            return out.strip() == target.strip()
+                            score = (out.strip() == target.strip()
+                                     or target.strip() in out.strip())
+                            return score
                     except Exception:
                         return False
         # Fall back to False
```
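To make the new matching order concrete, here is a standalone sketch on a fabricated result string (the `execute_result`/`stdout` layout mimics how the interpreter output is reported; the values are invented):

```python
import re

# Fabricated interpreter report containing both an execute_result and stdout.
pred = ('execute_result:\n\n```\n42\n```\n\n'
        'stdout:\n\n```\nsome log line\n```')
target = '42'

match_exec = re.search('execute_result:\n\n```\n(.*?)\n```', pred, re.DOTALL)
match_stdout = re.search('stdout:\n\n```\n(.*?)\n```', pred, re.DOTALL)
# The if/elif chain above boils down to: prefer execute_result over stdout.
match = match_exec or match_stdout
out = match.group(1)  # '42'
# Scoring is also relaxed: exact match, or target contained in the output.
score = out.strip() == target.strip() or target.strip() in out.strip()
assert score
```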
```diff
@@ -1,7 +1,136 @@
-from lagent.agents.react import ReAct
+import copy
+from typing import Dict, List
+
+from lagent.actions import ActionExecutor
+from lagent.agents.react import ReAct as _ReAct
+from lagent.agents.react import ReActProtocol as _ReActProtocol
 from lagent.schema import ActionReturn, ActionStatusCode, AgentReturn
+
+
+class ReActProtocol(_ReActProtocol):
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        # defaults to system
+        self.system_role = 'system'
+        self.first_system_role = 'system'
+        self.merge_adjacent_role = False
+
+    def format(self,
+               chat_history: List[Dict],
+               inner_step: List[Dict],
+               action_executor: ActionExecutor,
+               force_stop: bool = False) -> list:
+        """Generate the ReAct format prompt.
+
+        Args:
+            chat_history (List[Dict]): The history log in previous runs.
+            inner_step (List[Dict]): The log in the current run.
+            action_executor (ActionExecutor): the action manager to
+                execute actions.
+            force_stop (boolean): whether force the agent to give responses
+                under pre-defined turns.
+
+        Returns:
+            List[Dict]: ReAct format prompt.
+        """
+
+        call_protocol = self.call_protocol.format(
+            tool_description=action_executor.get_actions_info(),
+            action_names=action_executor.action_names(),
+            thought=self.thought['begin'],
+            action=self.action['begin'],
+            action_input=self.action_input['begin'],
+            response=self.response['begin'],
+            finish=self.finish['begin'],
+        )
+        formatted = []
+        formatted.append(
+            dict(role=self.first_system_role, content=call_protocol))
+        formatted += chat_history
+        formatted += inner_step
+        if force_stop:
+            formatted.append(
+                dict(role=self.system_role, content=self.force_stop))
+
+        if self.merge_adjacent_role and formatted:
+            merged = [formatted[0]]  # Add the first dict
+
+            for d in formatted[1:]:
+                # If the 'role' of current dict matches with the 'role' of the
+                # last dict in merged list,
+                # append its 'content' to the 'content' of the last dict.
+                if d['role'] == merged[-1]['role']:
+                    merged[-1]['content'] += d['content']
+                else:
+                    # If 'role' does not match, add it as a new dict in the
+                    # merged list
+                    merged.append(d)
+
+            return merged
+
+        return formatted
+
+
+class ReAct(_ReAct):
+
+    def __init__(self,
+                 use_system_role: bool = True,
+                 first_system_role: bool = True,
+                 merge_adjacent_role: bool = False,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        if use_system_role:
+            self.system_role = 'system'
+        else:
+            self.system_role = 'user'
+        if use_system_role or first_system_role:
+            first_system_role = 'system'
+        else:
+            first_system_role = 'user'
+        self._protocol.first_system_role = first_system_role
+        self._protocol.system_role = self.system_role
+        self._protocol.merge_adjacent_role = merge_adjacent_role
+
+    def chat(self, message: str) -> AgentReturn:
+        for hist in self._session_history:
+            if hist['role'] == 'system':
+                hist['role'] = self.system_role
+        self._inner_history = []
+        self._inner_history.append(dict(role='user', content=message))
+        agent_return = AgentReturn()
+        default_response = 'Sorry that I cannot answer your question.'
+        for turn in range(self.max_turn):
+            prompt = self._protocol.format(
+                chat_history=self.session_history,
+                inner_step=self._inner_history,
+                action_executor=self._action_executor,
+                force_stop=(turn == self.max_turn - 1))
+            response = self._llm.generate_from_template(prompt, 512)
+            self._inner_history.append(dict(role='assistant',
+                                            content=response))
+            thought, action, action_input = self._protocol.parse(
+                response, self._action_executor)
+            action_return: ActionReturn = self._action_executor(
+                action, action_input)
+            action_return.thought = thought
+            agent_return.actions.append(action_return)
+            if action_return.type == self._action_executor.finish_action.name:
+                agent_return.response = action_return.result['text']
+                break
+            self._inner_history.append(
+                dict(role=self.system_role,
+                     content=self._protocol.format_response(action_return)))
+        else:
+            agent_return.response = default_response
+        agent_return.inner_steps = copy.deepcopy(self._inner_history)
+        # only append the user and final response
+        self._session_history.append(dict(role='user', content=message))
+        self._session_history.append(
+            dict(role='assistant', content=agent_return.response))
+        return agent_return
+
+
 class CIReAct(ReAct):
     """Code Interpreter version of ReAct. The success state is different from
     ReAct.
```
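The `merge_adjacent_role` branch exists because some chat APIs reject consecutive messages with the same role; once system messages are remapped to `user`, adjacent user turns have to be fused. The merging loop, run standalone on invented messages:

```python
# Invented prompt list: after system->user remapping, two user turns touch.
formatted = [
    {'role': 'user', 'content': 'You are an assistant...\n'},
    {'role': 'user', 'content': 'File url: `xxxx`\n'},
    {'role': 'assistant', 'content': 'Thought: use pandas.\n'},
]

merged = [formatted[0]]
for d in formatted[1:]:
    if d['role'] == merged[-1]['role']:
        merged[-1]['content'] += d['content']  # fuse same-role neighbours
    else:
        merged.append(d)

assert [m['role'] for m in merged] == ['user', 'assistant']
```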
```diff
@@ -27,6 +156,9 @@ class CIReAct(ReAct):
         b.reset()

     def chat(self, message: str) -> AgentReturn:
+        for hist in self._session_history:
+            if hist['role'] == 'system':
+                hist['role'] = self.system_role
         self._inner_history = []
         # append the user message for session history
         self._session_history.append(dict(role='user', content=message))
```
```diff
@@ -54,14 +186,14 @@ class CIReAct(ReAct):
                     dict(role='assistant', content=response))
                 self._session_history.append(
                     dict(
-                        role='system',
+                        role=self.system_role,
                         content=self._protocol.format_response(action_return)))
                 agent_return.response = action_return.result['text']
                 return agent_return
             elif action_return.type == self._action_executor.invalid_action.name:  # noqa
                 action_return.errmsg = 'The action is invalid, please check the action name.'  # noqa
                 self._inner_history.append(
-                    dict(role='system',
+                    dict(role=self.system_role,
                          content=self._protocol.format_response(action_return)))
             if turn == self.max_turn - 1:
                 force_stop = True
```
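Both CIReAct hunks serve one goal: when the model is configured with `use_system_role=False`, no stored message may keep the `system` role, including tool responses written in earlier turns. The remap in isolation, on an invented history:

```python
# Invented session history; system_role is 'user' when the backing API
# does not accept system messages (use_system_role=False in the config).
system_role = 'user'
session_history = [
    {'role': 'system', 'content': 'Response: the code ran without output.'},
    {'role': 'user', 'content': 'Next step, please.'},
]
for hist in session_history:
    if hist['role'] == 'system':
        hist['role'] = system_role

assert all(h['role'] != 'system' for h in session_history)
```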
```diff
@@ -42,6 +42,26 @@ class LagentAgent:
     def set_history(self, history):
         self.agent._session_history = deepcopy(history)

+    def gt_response(self, prompt):
+        if 'CIReAct' in str(self.agent.__class__):
+            gold = prompt
+            prompt = f"""{self.agent._protocol.action['begin']} IPythonInterpreter
+{self.agent._protocol.action_input['begin']} ```python\n{gold}\n```\n"""  # noqa
+            action_input = dict(
+                command=f"""```python\n{gold}\n```\n""",
+                timeout=120,
+            )
+            response = self.agent._action_executor('IPythonInterpreter',
+                                                   action_input)
+            gt_response = dict(role='assistant', content=prompt)
+            system_response = dict(
+                role='system',
+                content=self.agent._protocol.format_response(response))
+            return [gt_response, system_response]
+        else:
+            gt_response = dict(role='assistant', content=prompt)
+            return [gt_response]
+
     @property
     def template_parser(self):
         return self.agent._llm.template_parser
```
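`gt_response` replays a gold answer as if the agent had produced it: for `CIReAct` it wraps the code in the protocol's action format and really executes it through the `IPythonInterpreter` action, so the kernel state stays consistent with the gold trajectory; for other agents it returns the gold text as a plain assistant message. A sketch of the prompt it rebuilds, assuming the `Action:`/`Action Input:` markers are the lagent protocol defaults:

```python
# Sketch of the assistant message gt_response fabricates for CIReAct.
gold = 'df = pd.read_csv(url)'
action_begin = 'Action:'              # assumed _protocol.action['begin']
action_input_begin = 'Action Input:'  # assumed _protocol.action_input['begin']
prompt = f"""{action_begin} IPythonInterpreter
{action_input_begin} ```python\n{gold}\n```\n"""
print(prompt)
# Action: IPythonInterpreter
# Action Input: ```python
# df = pd.read_csv(url)
# ```
```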
```diff
@@ -124,8 +124,15 @@ class AgentInferencer(ChatInferencer):
             i for i, item in enumerate(chat) if item['role'] == 'assistant'
         ]

+        history = chat[:assistant_indices[0] - 1]
+        prev_idx = 0
         for i in assistant_indices:
-            self.model.set_history(chat[:i - 1])
+            for j in range(prev_idx, i - 1):
+                if chat[j]['role'] == 'assistant':
+                    history += self.model.gt_response(chat[j]['content'])
+                elif chat[j]['role'] == 'user':
+                    history += [chat[j]]
+            self.model.set_history(history)
             answer, steps, _ = self.model.chat(chat[i - 1]['content'])
             output_handler.save_multiround_results(
                 origin_prompt=chat[i - 1]['content'],
```
```diff
@@ -134,4 +141,6 @@ class AgentInferencer(ChatInferencer):
                 idx=index,
                 gold=chat[i]['content'],
             )
+            history += [chat[i - 1]]
+            prev_idx = i
         self.model.reset()
```
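Taken together, the inferencer no longer seeds each round with the raw gold messages (`chat[:i - 1]`); it rebuilds the history incrementally, replaying every earlier gold assistant turn through `gt_response`. A schematic trace over an invented multi-round sample (assuming the chat opens with an instruction turn, as the `chat[:assistant_indices[0] - 1]` slice implies):

```python
# Invented multi-round chat for illustration.
chat = [
    {'role': 'system', 'content': 'You can use a code interpreter.'},  # 0
    {'role': 'user', 'content': 'Step 1: load the csv into df.'},      # 1
    {'role': 'assistant', 'content': 'df = pd.read_csv(url)'},         # 2 gold
    {'role': 'user', 'content': 'Step 2: plot df.'},                   # 3
    {'role': 'assistant', 'content': 'df.plot()'},                     # 4 gold
]
assistant_indices = [i for i, m in enumerate(chat)
                     if m['role'] == 'assistant']  # [2, 4]
# Round 1 (i=2): history = chat[:1]; the model answers chat[1].
# Round 2 (i=4): the gold answer chat[2] is replayed via gt_response --
# i.e. really executed by the interpreter -- and chat[3] is appended, so
# the model answers Step 2 against the kernel state the gold Step 1 left.
```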
```diff
@@ -6,6 +6,8 @@ jupyter
 jupyter_client
 jupytext
 lagent
+lightgbm==4.1.0
 networkx
 scikit-image
 sympy==1.12
+tensorflow==2.14.0
```