diff --git a/configs/eval_chat_cibench_api.py b/configs/eval_chat_cibench_api.py
new file mode 100644
index 00000000..53146633
--- /dev/null
+++ b/configs/eval_chat_cibench_api.py
@@ -0,0 +1,96 @@
+from mmengine.config import read_base
+
+from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter
+from opencompass.lagent.agents.react import CIReAct, ReActProtocol
+from opencompass.models.lagent import CodeAgent
+from opencompass.models.openai_api import OpenAI
+from opencompass.partitioners import SizePartitioner
+from opencompass.runners import LocalRunner
+from opencompass.tasks import OpenICLInferTask
+
+with read_base():
+    from .datasets.CIBench.CIBench_template_gen_e6b12a import \
+        cibench_datasets as datasets
+
+FORCE_STOP_PROMPT_EN = """You should directly give results based on history information."""
+# Instruction prompt; its `{...}` fields are filled by `ReActProtocol.format`.
+FEWSHOT_INSTRUCTION = """\
+You are an assistant who can utilize external tools.
+{tool_description}
+To use a tool, please response with the following format:
+```
+{thought} Think what you need to solve, do you need to use tools?
+{action} The tool name, should be one of [{action_names}].
+{action_input} The input to the tool that you want to use.
+```
+The tool will give you response after your response using the following format:
+```
+{response} the results after call the tool.
+```
+Therefore DO NOT generate tool response by yourself.
+
+Also please follow the guidelines:
+1. Always use code interpreter to solve the problem.
+2. The generated codes should always in a markdown code block format.
+3. The generated codes will be executed in an ipython manner and the results will be cached.
+4. Your responded code should always be simple and only solves the problem in current step.
+
+For example:
+
+File url: `xxxx`
+### Step 1. Load the dataset from the url into a pandas DataFrame named `df`.
+
+{thought} We should use `pandas` to solve this step.
+{action} IPythonInterpreter
+{action_input} ```python
+import pandas as pd
+url = "xxxx"
+df = pd.read_csv(url)
+```
+{response} The code is succeed without any outputs.
+
+Let us begin from here!
+"""  # the example binds `df` because the step asks for a DataFrame named `df`
+# Description handed to the `IPythonInterpreter` action in `models` below.
+IPYTHON_INTERPRETER_DESCRIPTION = '''\
+It can run Python code in a manner as jupyter notebook. The code must be a valid code that contains only python method.'''
+
+models = [  # one entry: gpt-3.5-turbo wrapped in a code-interpreter ReAct agent
+    dict(
+        abbr='gpt-3.5-code',
+        type=CodeAgent,
+        agent_type=CIReAct,
+        max_turn=3,  # agent turns per query; the last turn forces a stop
+        llm=dict(
+            type=OpenAI,
+            path='gpt-3.5-turbo',
+            key='ENV',  # NOTE(review): presumably read from the environment - confirm
+            query_per_second=1,
+            max_seq_len=4096,
+        ),
+        actions=[
+            dict(type=IPythonInterpreter,
+                 description=IPYTHON_INTERPRETER_DESCRIPTION,
+                 user_data_dir='./data/cibench_dataset/datasources')
+        ],
+        protocol=dict(
+            type=ReActProtocol,
+            call_protocol=FEWSHOT_INSTRUCTION,
+            force_stop=FORCE_STOP_PROMPT_EN,
+            finish=dict(role='FINISH', begin='Final Answer:', end='\n'),
+        ),
+        batch_size=1,
+        use_system_role=False,  # tool responses and force-stop prompt use `user`
+        first_system_role=False,  # opening instruction prompt is sent as `user` too
+        merge_adjacent_role=True,  # concatenate consecutive same-role messages
+    ),
+]
+
+
+infer = dict(  # inference-stage settings: task partitioning and execution
+    partitioner=dict(type=SizePartitioner, max_task_size=1000),
+    runner=dict(
+        type=LocalRunner,
+        max_num_workers=16,  # NOTE(review): presumably max parallel tasks - confirm
+        task=dict(type=OpenICLInferTask)),
+)
\ No newline at end of file
diff --git a/opencompass/datasets/cibench.py b/opencompass/datasets/cibench.py
index ae13ced8..be2ce1ce 100644
--- a/opencompass/datasets/cibench.py
+++ b/opencompass/datasets/cibench.py
@@ -105,10 +105,11 @@ def load_experiment_template(file: str) -> dict:
                     for _output in cell['outputs']:
                         if _output['output_type'] == 'display_data':
                             assert not output_flag
-                            output_flag = True
-                            tags.append('vis')
-                            outputs.append(_output['data']['image/png'])
-                    for _output in cell['outputs']:
+                            if 'image/png' in _output['data']:
+                                output_flag = True
+                                tags.append('vis')
+                                outputs.append(_output['data']['image/png'])
+                    for _output in cell['outputs'][::-1]:
                         if output_flag:
                             break
                         if _output['output_type'] == 'stream' and _output[
@@ -290,11 +291,26 @@ class CIBenchEvaluator(BaseEvaluator):
                 if action['result']:
                     try:
                         pred = action['result']['text']
-                        match = re.search('execute_result:\n\n```\n(.*?)\n```',
-                                          pred, re.DOTALL)
+                        match_exec = re.search(
+                            'execute_result:\n\n```\n(.*?)\n```', pred,
+                            re.DOTALL)
+                        match_stdout = re.search('stdout:\n\n```\n(.*?)\n```',
+                                                 pred, re.DOTALL)
+                        # get pred result from execute_result by default
+                        # else stdout
+                        if match_exec and match_stdout:
+                            match = match_exec
+                        elif match_exec:
+                            match = match_exec
+                        elif match_stdout:
+                            match = match_stdout
+                        else:
+                            match = None
                         if match:
                             out = match.group(1)
-                            return out.strip() == target.strip()
+                            score = (out.strip() == target.strip()
+                                     or target.strip() in out.strip())
+                            return score
                     except Exception:
                         return False
         # Fall back to False
diff --git a/opencompass/lagent/agents/react.py b/opencompass/lagent/agents/react.py
index cd55af8b..0232068f 100644
--- a/opencompass/lagent/agents/react.py
+++ b/opencompass/lagent/agents/react.py
@@ -1,7 +1,136 @@
-from lagent.agents.react import ReAct
+import copy
+from typing import Dict, List
+
+from lagent.actions import ActionExecutor
+from lagent.agents.react import ReAct as _ReAct
+from lagent.agents.react import ReActProtocol as _ReActProtocol
 from lagent.schema import ActionReturn, ActionStatusCode, AgentReturn
 
 
+class ReActProtocol(_ReActProtocol):
+    """ReAct prompt protocol with configurable message roles.
+
+    ``system_role``, ``first_system_role`` and ``merge_adjacent_role`` are
+    plain attributes; the ``ReAct`` agent overwrites them after construction.
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        # defaults to system
+        self.system_role = 'system'
+        self.first_system_role = 'system'
+        self.merge_adjacent_role = False
+
+    def format(self,
+               chat_history: List[Dict],
+               inner_step: List[Dict],
+               action_executor: ActionExecutor,
+               force_stop: bool = False) -> list:
+        """Generate the ReAct format prompt.
+
+        Args:
+            chat_history (List[Dict]): The history log in previous runs.
+            inner_step (List[Dict]): The log in the current run.
+            action_executor (ActionExecutor): the action manager to
+                execute actions.
+            force_stop (boolean): whether force the agent to give responses
+                under pre-defined turns.
+
+        Returns:
+            List[Dict]: ReAct format prompt. With ``merge_adjacent_role``
+                set, consecutive messages sharing a role are concatenated;
+                the dicts passed in are never modified.
+        """
+        call_protocol = self.call_protocol.format(
+            tool_description=action_executor.get_actions_info(),
+            action_names=action_executor.action_names(),
+            thought=self.thought['begin'],
+            action=self.action['begin'],
+            action_input=self.action_input['begin'],
+            response=self.response['begin'],
+            finish=self.finish['begin'],
+        )
+        formatted = [dict(role=self.first_system_role, content=call_protocol)]
+        formatted += chat_history
+        formatted += inner_step
+        if force_stop:
+            formatted.append(
+                dict(role=self.system_role, content=self.force_stop))
+        if not self.merge_adjacent_role:
+            return formatted
+
+        # Merge on shallow copies: the entries of ``formatted`` are the very
+        # dicts stored in the caller's history, so appending to their
+        # 'content' in place would corrupt that history and duplicate text
+        # each time the prompt is rebuilt.
+        merged = [dict(formatted[0])]
+        for msg in formatted[1:]:
+            if msg['role'] == merged[-1]['role']:
+                merged[-1]['content'] += msg['content']
+            else:
+                merged.append(dict(msg))
+        return merged
+
+
+class ReAct(_ReAct):
+    """ReAct agent whose tool/instruction prompts may use the `user` role."""
+
+    def __init__(self,
+                 use_system_role: bool = True,
+                 first_system_role: bool = True,
+                 merge_adjacent_role: bool = False,
+                 **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.system_role = 'system' if use_system_role else 'user'
+        # The opening instruction prompt stays a `system` message as soon as
+        # either flag is truthy; only both being falsy demotes it to `user`.
+        keep_system = use_system_role or first_system_role
+        first_system_role = 'system' if keep_system else 'user'
+        self._protocol.first_system_role = first_system_role
+        self._protocol.system_role = self.system_role
+        self._protocol.merge_adjacent_role = merge_adjacent_role
+
+    def chat(self, message: str) -> AgentReturn:
+        """Run the ReAct loop on ``message`` for at most ``max_turn`` turns."""
+        # Re-label `system` entries already in the session history so they
+        # carry the configured role.
+        for past in self._session_history:
+            if past['role'] == 'system':
+                past['role'] = self.system_role
+        self._inner_history = [dict(role='user', content=message)]
+        agent_return = AgentReturn()
+        for turn in range(self.max_turn):
+            is_last_turn = turn == self.max_turn - 1
+            prompt = self._protocol.format(
+                chat_history=self.session_history,
+                inner_step=self._inner_history,
+                action_executor=self._action_executor,
+                force_stop=is_last_turn)
+            reply = self._llm.generate_from_template(prompt, 512)
+            self._inner_history.append(dict(role='assistant', content=reply))
+            thought, action, action_input = self._protocol.parse(
+                reply, self._action_executor)
+            action_return: ActionReturn = self._action_executor(
+                action, action_input)
+            action_return.thought = thought
+            agent_return.actions.append(action_return)
+            if action_return.type == self._action_executor.finish_action.name:
+                agent_return.response = action_return.result['text']
+                break
+            tool_feedback = self._protocol.format_response(action_return)
+            self._inner_history.append(
+                dict(role=self.system_role, content=tool_feedback))
+        else:
+            # No finish action within max_turn turns: use the stock apology.
+            agent_return.response = 'Sorry that I cannot answer your question.'
+        agent_return.inner_steps = copy.deepcopy(self._inner_history)
+        # only append the user and final response
+        self._session_history.extend([
+            dict(role='user', content=message),
+            dict(role='assistant', content=agent_return.response)])
+        return agent_return
+
+
 class CIReAct(ReAct):
     """Code Interpreter version of ReAct. The success state is different from
     ReAct.
@@ -27,6 +156,9 @@ class CIReAct(ReAct):
         b.reset()
 
     def chat(self, message: str) -> AgentReturn:
+        for hist in self._session_history:
+            if hist['role'] == 'system':
+                hist['role'] = self.system_role
         self._inner_history = []
         # append the user message for session history
         self._session_history.append(dict(role='user', content=message))
@@ -54,14 +186,14 @@ class CIReAct(ReAct):
                     dict(role='assistant', content=response))
                 self._session_history.append(
                     dict(
-                        role='system',
+                        role=self.system_role,
                         content=self._protocol.format_response(action_return)))
                 agent_return.response = action_return.result['text']
                 return agent_return
             elif action_return.type == self._action_executor.invalid_action.name:  # noqa
                 action_return.errmsg = 'The action is invalid, please check the action name.'  # noqa
             self._inner_history.append(
-                dict(role='system',
+                dict(role=self.system_role,
                      content=self._protocol.format_response(action_return)))
             if turn == self.max_turn - 1:
                 force_stop = True
diff --git a/opencompass/models/lagent.py b/opencompass/models/lagent.py
index d3991336..e3e971fd 100644
--- a/opencompass/models/lagent.py
+++ b/opencompass/models/lagent.py
@@ -42,6 +42,26 @@ class LagentAgent:
     def set_history(self, history):
         self.agent._session_history = deepcopy(history)
 
+    def gt_response(self, prompt):
+        """Turn a gold assistant message into session-history entries.
+
+        A CIReAct agent also runs the gold code and records the tool output.
+        """
+        if 'CIReAct' not in str(self.agent.__class__):
+            return [dict(role='assistant', content=prompt)]
+        gold = prompt
+        protocol = self.agent._protocol
+        prompt = (f"{protocol.action['begin']} IPythonInterpreter\n"
+                  f"{protocol.action_input['begin']} "
+                  f"```python\n{gold}\n```\n")
+        action_input = dict(command=f"""```python\n{gold}\n```\n""",
+                            timeout=120)
+        response = self.agent._action_executor('IPythonInterpreter',
+                                               action_input)
+        feedback = protocol.format_response(response)
+        return [dict(role='assistant', content=prompt),
+                dict(role='system', content=feedback)]
+
     @property
     def template_parser(self):
         return self.agent._llm.template_parser
diff --git a/opencompass/openicl/icl_inferencer/icl_agent_inferencer.py b/opencompass/openicl/icl_inferencer/icl_agent_inferencer.py
index 5cea69ad..56bbce01 100644
--- a/opencompass/openicl/icl_inferencer/icl_agent_inferencer.py
+++ b/opencompass/openicl/icl_inferencer/icl_agent_inferencer.py
@@ -124,8 +124,15 @@ class AgentInferencer(ChatInferencer):
             i for i, item in enumerate(chat) if item['role'] == 'assistant'
         ]
 
+        history = chat[:assistant_indices[0] - 1]
+        prev_idx = 0
         for i in assistant_indices:
-            self.model.set_history(chat[:i - 1])
+            for j in range(prev_idx, i - 1):
+                if chat[j]['role'] == 'assistant':
+                    history += self.model.gt_response(chat[j]['content'])
+                elif chat[j]['role'] == 'user':
+                    history += [chat[j]]
+            self.model.set_history(history)
             answer, steps, _ = self.model.chat(chat[i - 1]['content'])
             output_handler.save_multiround_results(
                 origin_prompt=chat[i - 1]['content'],
@@ -134,4 +141,6 @@ class AgentInferencer(ChatInferencer):
                 idx=index,
                 gold=chat[i]['content'],
             )
+            history += [chat[i - 1]]
+            prev_idx = i
         self.model.reset()
diff --git a/requirements/agent.txt b/requirements/agent.txt
index 23c8b6a4..3fc7630b 100644
--- a/requirements/agent.txt
+++ b/requirements/agent.txt
@@ -6,6 +6,8 @@ jupyter
 jupyter_client
 jupytext
 lagent
+lightgbm==4.1.0
 networkx
 scikit-image
 sympy==1.12
+tensorflow==2.14.0