From 13609c269ea274e65b369d1b47b58d83c951ca04 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 27 Oct 2025 20:33:41 +0000
Subject: [PATCH 01/76] refactor: standardize Observation base class with
 error/output and status helpers

- Add ObservationStatus enum and result_status property
- Add standardized output and error fields with default to_llm_content
- Keep subclasses backward compatible by using bool(error) in has_error

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/tool/schema.py | 47 +++++++++++++++++++---
 1 file changed, 41 insertions(+), 6 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index 1c041aa659..9206ec7ece 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -1,5 +1,6 @@
-from abc import ABC, abstractmethod
+from abc import ABC
 from collections.abc import Sequence
+from enum import Enum
 from typing import Any, ClassVar, TypeVar
 
 from pydantic import ConfigDict, Field, create_model
@@ -187,20 +188,54 @@ def visualize(self) -> Text:
         return content
 
 
+class ObservationStatus(str, Enum):
+    SUCCESS = "success"
+    ERROR = "error"
+
+
 class Observation(Schema, ABC):
     """Base schema for output observation."""
 
+    # Standardized primary output and error handling
+    output: str = Field(
+        default="", description="Primary text output from the tool operation"
+    )
+    error: str | None = Field(
+        default=None, description="Error message if operation failed"
+    )
+
+    @property
+    def has_error(self) -> bool:
+        # Support both string and boolean-style error flags across subclasses.
+        # Using bool() handles: None/""/False -> False; non-empty str/True -> True.
+        return bool(self.error)
+
+    @property
+    def result_status(self) -> ObservationStatus:
+        return ObservationStatus.ERROR if self.has_error else ObservationStatus.SUCCESS
+
+    def _format_error(self) -> TextContent:
+        return TextContent(text=f"Error: {self.error}")
+
     @property
-    @abstractmethod
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        """Get the observation string to show to the agent."""
+        """Default content formatting prioritizing error then output.
+
+        Subclasses can override to provide richer content (e.g., images, diffs),
+        but should preserve the error-first convention.
+        """
+        if self.error:
+            return [self._format_error()]
+        if self.output:
+            return [TextContent(text=self.output)]
+        return []
 
     @property
     def visualize(self) -> Text:
-        """Return Rich Text representation of this action.
+        """Return Rich Text representation of this observation.
 
-        This method can be overridden by subclasses to customize visualization.
-        The base implementation displays all action fields systematically.
+        Subclasses can override for custom visualization; by default we show the
+        same text that would be sent to the LLM.
         """
         content = Text()
         text_parts = content_to_str(self.to_llm_content)

From 65208088946c54d3c95446fc818d4e93950f5d7d Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 27 Oct 2025 21:11:53 +0000
Subject: [PATCH 02/76] refactor: standardize Observation subclasses and
 maintain backward-compat

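- FinishObservation: rename message->output, add read-only message alias; drop the custom to_llm_content in favor of the base default (custom visualize is kept)
- ThinkObservation: rename content->output, add read-only content alias; drop the custom to_llm_content in favor of the base default (custom visualize is kept)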
- MCPToolObservation: adopt base error semantics while preserving is_error; unify has_error and formatting
- TaskTrackerObservation: rename content->output; use base to_llm_content; keep rich visualize
- FileEditorObservation: standardize error formatting prefix as 'Error: '

This aligns all observation classes with the unified Observation base (output: str, error: str|None, has_error, result_status) and removes duplicated logic where possible.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/mcp/definition.py     | 12 +++++++++---
 openhands-sdk/openhands/sdk/mcp/tool.py           |  2 ++
 .../openhands/sdk/tool/builtins/finish.py         | 11 +++++------
 .../openhands/sdk/tool/builtins/think.py          |  9 ++++-----
 .../openhands/tools/file_editor/definition.py     |  2 +-
 .../openhands/tools/task_tracker/definition.py    | 15 +++++----------
 6 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/mcp/definition.py b/openhands-sdk/openhands/sdk/mcp/definition.py
index 8d4d544880..79c859c34a 100644
--- a/openhands-sdk/openhands/sdk/mcp/definition.py
+++ b/openhands-sdk/openhands/sdk/mcp/definition.py
@@ -61,6 +61,11 @@ class MCPToolObservation(Observation):
     )
     tool_name: str = Field(description="Name of the tool that was called")
 
+    @property
+    def has_error(self) -> bool:  # type: ignore[override]
+        # Consider both base error string and is_error boolean for compatibility
+        return bool(self.error) or bool(self.is_error)
+
     @classmethod
     def from_call_tool_result(
         cls, tool_name: str, result: mcp.types.CallToolResult
@@ -83,7 +88,8 @@ def from_call_tool_result(
                 )
         return cls(
             content=convrted_content,
-            is_error=result.isError,
+            error=("MCP tool error" if result.isError else None),
+            is_error=bool(result.isError),
             tool_name=tool_name,
         )
 
@@ -91,7 +97,7 @@ def from_call_tool_result(
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         """Format the observation for agent display."""
         initial_message = f"[Tool '{self.tool_name}' executed.]\n"
-        if self.is_error:
+        if self.has_error:
             initial_message += "[An error occurred during execution.]\n"
         return [TextContent(text=initial_message)] + self.content
 
@@ -100,7 +106,7 @@ def visualize(self) -> Text:
         """Return Rich Text representation of this observation."""
         content = Text()
         content.append(f"[MCP Tool '{self.tool_name}' Observation]\n", style="bold")
-        if self.is_error:
+        if self.has_error:
             content.append("[Error during execution]\n", style="bold red")
         for block in self.content:
             if isinstance(block, TextContent):
diff --git a/openhands-sdk/openhands/sdk/mcp/tool.py b/openhands-sdk/openhands/sdk/mcp/tool.py
index 71222ad8e3..cf4494b886 100644
--- a/openhands-sdk/openhands/sdk/mcp/tool.py
+++ b/openhands-sdk/openhands/sdk/mcp/tool.py
@@ -69,6 +69,7 @@ async def call_tool(self, action: MCPToolAction) -> MCPToolObservation:
                 logger.error(error_msg, exc_info=True)
                 return MCPToolObservation(
                     content=[TextContent(text=error_msg)],
+                    error=error_msg,
                     is_error=True,
                     tool_name=self.tool_name,
                 )
@@ -149,6 +150,7 @@ def __call__(
             logger.error(error_msg, exc_info=True)
             return MCPToolObservation(
                 content=[TextContent(text=error_msg)],
+                error=error_msg,
                 is_error=True,
                 tool_name=self.name,
             )
diff --git a/openhands-sdk/openhands/sdk/tool/builtins/finish.py b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
index 6d2ac10420..b64a9e41d0 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/finish.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
@@ -1,10 +1,8 @@
-from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
 from pydantic import Field
 from rich.text import Text
 
-from openhands.sdk.llm.message import ImageContent, TextContent
 from openhands.sdk.tool.tool import (
     Action,
     Observation,
@@ -31,11 +29,12 @@ def visualize(self) -> Text:
 
 
 class FinishObservation(Observation):
-    message: str = Field(description="Final message sent to the user.")
+    output: str = Field(default="", description="Final message sent to the user.")
 
+    # Backward compatibility: expose `message` alias for older tests/integrations
     @property
-    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        return [TextContent(text=self.message)]
+    def message(self) -> str:  # pragma: no cover - alias for backward compatibility
+        return self.output
 
     @property
     def visualize(self) -> Text:
@@ -64,7 +63,7 @@ def __call__(
         action: FinishAction,
         conversation: "BaseConversation | None" = None,  # noqa: ARG002
     ) -> FinishObservation:
-        return FinishObservation(message=action.message)
+        return FinishObservation(output=action.message)
 
 
 FinishTool = ToolDefinition(
diff --git a/openhands-sdk/openhands/sdk/tool/builtins/think.py b/openhands-sdk/openhands/sdk/tool/builtins/think.py
index 01d84d6ece..8db101bd10 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/think.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/think.py
@@ -1,10 +1,8 @@
-from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
 from pydantic import Field
 from rich.text import Text
 
-from openhands.sdk.llm.message import ImageContent, TextContent
 from openhands.sdk.tool.tool import (
     Action,
     Observation,
@@ -47,13 +45,14 @@ def visualize(self) -> Text:
 class ThinkObservation(Observation):
     """Observation returned after logging a thought."""
 
-    content: str = Field(
+    output: str = Field(
         default="Your thought has been logged.", description="Confirmation message."
     )
 
+    # Backward compatibility: expose `content` alias for older tests/integrations
     @property
-    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        return [TextContent(text=self.content)]
+    def content(self) -> str:  # pragma: no cover - alias for backward compatibility
+        return self.output
 
     @property
     def visualize(self) -> Text:
diff --git a/openhands-tools/openhands/tools/file_editor/definition.py b/openhands-tools/openhands/tools/file_editor/definition.py
index 571baa5660..f05eb4e765 100644
--- a/openhands-tools/openhands/tools/file_editor/definition.py
+++ b/openhands-tools/openhands/tools/file_editor/definition.py
@@ -91,7 +91,7 @@ class FileEditorObservation(Observation):
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.error:
-            return [TextContent(text=self.error)]
+            return [TextContent(text=f"Error: {self.error}")]
         return [TextContent(text=self.output)]
 
     @property
diff --git a/openhands-tools/openhands/tools/task_tracker/definition.py b/openhands-tools/openhands/tools/task_tracker/definition.py
index d71ab8c36d..8b0113fa71 100644
--- a/openhands-tools/openhands/tools/task_tracker/definition.py
+++ b/openhands-tools/openhands/tools/task_tracker/definition.py
@@ -11,7 +11,6 @@
     from openhands.sdk.conversation.state import ConversationState
 from rich.text import Text
 
-from openhands.sdk import ImageContent, TextContent
 from openhands.sdk.logger import get_logger
 from openhands.sdk.tool import (
     Action,
@@ -70,7 +69,7 @@ def visualize(self) -> Text:
 class TaskTrackerObservation(Observation):
     """This data class represents the result of a task tracking operation."""
 
-    content: str = Field(
+    output: str = Field(
         default="", description="The formatted task list or status message"
     )
     command: str = Field(default="", description="The command that was executed")
@@ -78,10 +77,6 @@ class TaskTrackerObservation(Observation):
         default_factory=list, description="The current task list"
     )
 
-    @property
-    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        return [TextContent(text=self.content)]
-
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation with task list formatting."""
@@ -175,7 +170,7 @@ def __call__(
             if self.save_dir:
                 self._save_tasks()
             return TaskTrackerObservation(
-                content="Task list has been updated with "
+                output="Task list has been updated with "
                 + f"{len(self._task_list)} item(s).",
                 command=action.command,
                 task_list=self._task_list,
@@ -184,17 +179,17 @@ def __call__(
             # Return the current task list
             if not self._task_list:
                 return TaskTrackerObservation(
-                    content='No task list found. Use the "plan" command to create one.',
+                    output='No task list found. Use the "plan" command to create one.',
                     command=action.command,
                     task_list=[],
                 )
             content = self._format_task_list(self._task_list)
             return TaskTrackerObservation(
-                content=content, command=action.command, task_list=self._task_list
+                output=content, command=action.command, task_list=self._task_list
             )
         else:
             return TaskTrackerObservation(
-                content=f"Unknown command: {action.command}. "
+                output=f"Unknown command: {action.command}. "
                 + 'Supported commands are "view" and "plan".',
                 command=action.command,
                 task_list=[],

From a349a7bbb2f51a9e55213e0546c56f56255428a5 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 27 Oct 2025 22:58:08 +0000
Subject: [PATCH 03/76] test: align tests with standardized Observation fields

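- Fix remaining assertion in test_registry to use .output
- Ensure subclass output fields provide defaults to satisfy pyright
- Adjust ExecuteBashObservation output default
- Remove MCPToolObservation.is_error and its has_error override; errors are now signalled only through the base `error` field (tests use has_error)
- Remove the backward-compat `message`/`content` aliases from FinishObservation and ThinkObservation
- Remove ExecuteBashObservation's boolean `error` field in favor of the base `error: str | None`; terminal_session now passes error message strings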

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/mcp/definition.py | 21 +++++++------------
 openhands-sdk/openhands/sdk/mcp/tool.py       |  2 --
 .../openhands/sdk/tool/builtins/finish.py     |  5 -----
 .../openhands/sdk/tool/builtins/think.py      |  5 -----
 .../tools/execute_bash/definition.py          |  6 +-----
 .../execute_bash/terminal/terminal_session.py |  6 +++---
 .../local/test_confirmation_mode.py           | 10 ++++-----
 tests/sdk/mcp/test_mcp_security_risk.py       |  2 +-
 tests/sdk/mcp/test_mcp_tool.py                | 18 ++++++++--------
 tests/sdk/tool/test_registry.py               | 10 ++++-----
 .../execute_bash/test_bash_ps1_metadata.py    | 10 ++++++---
 .../test_observation_truncation.py            | 10 ++++-----
 12 files changed, 43 insertions(+), 62 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/mcp/definition.py b/openhands-sdk/openhands/sdk/mcp/definition.py
index 79c859c34a..b0cff07653 100644
--- a/openhands-sdk/openhands/sdk/mcp/definition.py
+++ b/openhands-sdk/openhands/sdk/mcp/definition.py
@@ -53,19 +53,13 @@ class MCPToolObservation(Observation):
 
     content: list[TextContent | ImageContent] = Field(
         default_factory=list,
-        description="Content returned from the MCP tool converted "
-        "to LLM Ready TextContent or ImageContent",
-    )
-    is_error: bool = Field(
-        default=False, description="Whether the call resulted in an error"
+        description=(
+            "Content returned from the MCP tool converted to LLM Ready TextContent "
+            "or ImageContent"
+        ),
     )
     tool_name: str = Field(description="Name of the tool that was called")
 
-    @property
-    def has_error(self) -> bool:  # type: ignore[override]
-        # Consider both base error string and is_error boolean for compatibility
-        return bool(self.error) or bool(self.is_error)
-
     @classmethod
     def from_call_tool_result(
         cls, tool_name: str, result: mcp.types.CallToolResult
@@ -89,17 +83,16 @@ def from_call_tool_result(
         return cls(
             content=convrted_content,
             error=("MCP tool error" if result.isError else None),
-            is_error=bool(result.isError),
             tool_name=tool_name,
         )
 
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         """Format the observation for agent display."""
-        initial_message = f"[Tool '{self.tool_name}' executed.]\n"
+        initial = f"[Tool '{self.tool_name}' executed.]\n"
         if self.has_error:
-            initial_message += "[An error occurred during execution.]\n"
-        return [TextContent(text=initial_message)] + self.content
+            initial += "[An error occurred during execution.]\n"
+        return [TextContent(text=initial)] + self.content
 
     @property
     def visualize(self) -> Text:
diff --git a/openhands-sdk/openhands/sdk/mcp/tool.py b/openhands-sdk/openhands/sdk/mcp/tool.py
index cf4494b886..e8de3566c0 100644
--- a/openhands-sdk/openhands/sdk/mcp/tool.py
+++ b/openhands-sdk/openhands/sdk/mcp/tool.py
@@ -70,7 +70,6 @@ async def call_tool(self, action: MCPToolAction) -> MCPToolObservation:
                 return MCPToolObservation(
                     content=[TextContent(text=error_msg)],
                     error=error_msg,
-                    is_error=True,
                     tool_name=self.tool_name,
                 )
 
@@ -151,7 +150,6 @@ def __call__(
             return MCPToolObservation(
                 content=[TextContent(text=error_msg)],
                 error=error_msg,
-                is_error=True,
                 tool_name=self.name,
             )
 
diff --git a/openhands-sdk/openhands/sdk/tool/builtins/finish.py b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
index b64a9e41d0..ec17b2960c 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/finish.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
@@ -31,11 +31,6 @@ def visualize(self) -> Text:
 class FinishObservation(Observation):
     output: str = Field(default="", description="Final message sent to the user.")
 
-    # Backward compatibility: expose `message` alias for older tests/integrations
-    @property
-    def message(self) -> str:  # pragma: no cover - alias for backward compatibility
-        return self.output
-
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation - empty since action shows the message."""
diff --git a/openhands-sdk/openhands/sdk/tool/builtins/think.py b/openhands-sdk/openhands/sdk/tool/builtins/think.py
index 8db101bd10..43438e1625 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/think.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/think.py
@@ -49,11 +49,6 @@ class ThinkObservation(Observation):
         default="Your thought has been logged.", description="Confirmation message."
     )
 
-    # Backward compatibility: expose `content` alias for older tests/integrations
-    @property
-    def content(self) -> str:  # pragma: no cover - alias for backward compatibility
-        return self.output
-
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation - empty since action shows the thought."""
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index cb19895436..e3a111847c 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -79,7 +79,7 @@ def visualize(self) -> Text:
 class ExecuteBashObservation(Observation):
     """A ToolResult that can be rendered as a CLI output."""
 
-    output: str = Field(description="The raw output from the tool.")
+    output: str = Field(default="", description="The raw output from the tool.")
     command: str | None = Field(
         default=None,
         description="The bash command that was executed. Can be empty string if the observation is from a previous command that hit soft timeout and is not yet finished.",  # noqa
@@ -88,10 +88,6 @@ class ExecuteBashObservation(Observation):
         default=None,
         description="The exit code of the command. -1 indicates the process hit the soft timeout and is not yet finished.",  # noqa
     )
-    error: bool = Field(
-        default=False,
-        description="Whether there was an error during command execution.",
-    )
     timeout: bool = Field(
         default=False, description="Whether the command execution timed out."
     )
diff --git a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
index 0d46638650..dcc16010b6 100644
--- a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
+++ b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
@@ -315,12 +315,12 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
             if command == "":
                 return ExecuteBashObservation(
                     output="ERROR: No previous running command to retrieve logs from.",
-                    error=True,
+                    error="No previous running command to retrieve logs from.",
                 )
             if is_input:
                 return ExecuteBashObservation(
                     output="ERROR: No previous running command to interact with.",
-                    error=True,
+                    error="No previous running command to interact with.",
                 )
 
         # Check if the command is a single command or multiple commands
@@ -333,7 +333,7 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
                     f"command via && or ;\nProvided commands:\n"
                     f"{'\n'.join(f'({i + 1}) {cmd}' for i, cmd in enumerate(splited_commands))}"  # noqa: E501
                 ),
-                error=True,
+                error="Cannot execute multiple commands at once",
             )
 
         # Get initial state before sending command
diff --git a/tests/sdk/conversation/local/test_confirmation_mode.py b/tests/sdk/conversation/local/test_confirmation_mode.py
index 1c863c61f2..c1daa8dc36 100644
--- a/tests/sdk/conversation/local/test_confirmation_mode.py
+++ b/tests/sdk/conversation/local/test_confirmation_mode.py
@@ -525,7 +525,7 @@ def test_single_finish_action_skips_confirmation_entirely(self):
             e for e in self.conversation.state.events if isinstance(e, ObservationEvent)
         ]
         assert len(obs_events) == 1
-        assert obs_events[0].observation.message == "Task completed successfully!"  # type: ignore[attr-defined]
+        assert obs_events[0].observation.output == "Task completed successfully!"
 
     def test_think_and_finish_action_skips_confirmation_entirely(self):
         """First step: ThinkAction (skips confirmation). Second step: FinishAction."""
@@ -565,12 +565,12 @@ def test_think_and_finish_action_skips_confirmation_entirely(self):
         assert len(obs_events) == 2
 
         # 1) ThinkAction observation
-        assert hasattr(obs_events[0].observation, "content")
-        assert obs_events[0].observation.content == "Your thought has been logged."  # type: ignore[attr-defined]
+        assert hasattr(obs_events[0].observation, "output")
+        assert obs_events[0].observation.output == "Your thought has been logged."
 
         # 2) FinishAction observation
-        assert hasattr(obs_events[1].observation, "message")
-        assert obs_events[1].observation.message == "Analysis complete"  # type: ignore[attr-defined]
+        assert hasattr(obs_events[1].observation, "output")
+        assert obs_events[1].observation.output == "Analysis complete"
 
     def test_pause_during_confirmation_preserves_waiting_status(self):
         """Test that pausing during WAITING_FOR_CONFIRMATION preserves the status.
diff --git a/tests/sdk/mcp/test_mcp_security_risk.py b/tests/sdk/mcp/test_mcp_security_risk.py
index aa0649c411..e014ff9bf7 100644
--- a/tests/sdk/mcp/test_mcp_security_risk.py
+++ b/tests/sdk/mcp/test_mcp_security_risk.py
@@ -180,4 +180,4 @@ def test_mcp_tool_validates_correctly_after_security_risk_pop():
     # 4. Execute the action (this should also work)
     observation = tool(action)
     assert isinstance(observation, MCPToolObservation)
-    assert not observation.is_error
+    assert not observation.has_error
diff --git a/tests/sdk/mcp/test_mcp_tool.py b/tests/sdk/mcp/test_mcp_tool.py
index 3ca4f9c8bf..7fa0fe59c0 100644
--- a/tests/sdk/mcp/test_mcp_tool.py
+++ b/tests/sdk/mcp/test_mcp_tool.py
@@ -40,7 +40,7 @@ def test_from_call_tool_result_success(self):
         assert len(observation.content) == 1
         assert isinstance(observation.content[0], TextContent)
         assert observation.content[0].text == "Operation completed successfully"
-        assert observation.is_error is False
+        assert observation.has_error is False
 
     def test_from_call_tool_result_error(self):
         """Test creating observation from error MCP result."""
@@ -57,7 +57,7 @@ def test_from_call_tool_result_error(self):
         assert len(observation.content) == 1
         assert isinstance(observation.content[0], TextContent)
         assert observation.content[0].text == "Operation failed"
-        assert observation.is_error is True
+        assert observation.has_error is True
 
     def test_from_call_tool_result_with_image(self):
         """Test creating observation from MCP result with image content."""
@@ -81,14 +81,14 @@ def test_from_call_tool_result_with_image(self):
         assert observation.content[0].text == "Here's the image:"
         # Second content should be ImageContent
         assert hasattr(observation.content[1], "image_urls")
-        assert observation.is_error is False
+        assert observation.has_error is False
 
     def test_to_llm_content_success(self):
         """Test agent observation formatting for success."""
         observation = MCPToolObservation(
             tool_name="test_tool",
             content=[TextContent(text="Success result")],
-            is_error=False,
+            error=None,
         )
 
         agent_obs = observation.to_llm_content
@@ -104,7 +104,7 @@ def test_to_llm_content_error(self):
         observation = MCPToolObservation(
             tool_name="test_tool",
             content=[TextContent(text="Error occurred")],
-            is_error=True,
+            error="execution failed",
         )
 
         agent_obs = observation.to_llm_content
@@ -151,7 +151,7 @@ def mock_call_async_from_sync(coro_func, **kwargs):
 
         assert isinstance(observation, MCPToolObservation)
         assert observation.tool_name == "test_tool"
-        assert observation.is_error is False
+        assert observation.has_error is False
 
     def test_call_tool_error(self):
         """Test tool execution with error."""
@@ -178,7 +178,7 @@ def mock_call_async_from_sync(coro_func, **kwargs):
 
         assert isinstance(observation, MCPToolObservation)
         assert observation.tool_name == "test_tool"
-        assert observation.is_error is True
+        assert observation.has_error is True
 
     def test_call_tool_exception(self):
         """Test tool execution with exception."""
@@ -194,7 +194,7 @@ def mock_call_async_from_sync(coro_func, **kwargs):
                         text="Error calling MCP tool test_tool: Connection failed"
                     )
                 ],
-                is_error=True,
+                error="execution failed",
                 tool_name="test_tool",
             )
 
@@ -205,7 +205,7 @@ def mock_call_async_from_sync(coro_func, **kwargs):
         assert isinstance(observation, MCPToolObservation)
         assert isinstance(observation.content[0], TextContent)
         assert observation.tool_name == "test_tool"
-        assert observation.is_error is True
+        assert observation.has_error is True
         assert "Connection failed" in observation.content[0].text
 
 
diff --git a/tests/sdk/tool/test_registry.py b/tests/sdk/tool/test_registry.py
index 5cdb176f73..3524a878fc 100644
--- a/tests/sdk/tool/test_registry.py
+++ b/tests/sdk/tool/test_registry.py
@@ -26,16 +26,16 @@ class _HelloAction(Action):
 
 
 class _HelloObservation(Observation):
-    message: str
+    output: str = ""
 
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        return [TextContent(text=self.message)]
+        return [TextContent(text=self.output)]
 
 
 class _HelloExec(ToolExecutor[_HelloAction, _HelloObservation]):
     def __call__(self, action: _HelloAction, conversation=None) -> _HelloObservation:
-        return _HelloObservation(message=f"Hello, {action.name}!")
+        return _HelloObservation(output=f"Hello, {action.name}!")
 
 
 class _ConfigurableHelloTool(ToolDefinition):
@@ -55,7 +55,7 @@ def __call__(
                 self, action: _HelloAction, conversation=None
             ) -> _HelloObservation:
                 return _HelloObservation(
-                    message=f"{self._greeting}, {action.name}{self._punctuation}"
+                    output=f"{self._greeting}, {action.name}{self._punctuation}"
                 )
 
         return [
@@ -133,4 +133,4 @@ def test_register_tool_type_uses_create_params():
 
     observation = tool(_HelloAction(name="Alice"))
     assert isinstance(observation, _HelloObservation)
-    assert observation.message == "Howdy, Alice?"
+    assert observation.output == "Howdy, Alice?"
diff --git a/tests/tools/execute_bash/test_bash_ps1_metadata.py b/tests/tools/execute_bash/test_bash_ps1_metadata.py
index df5cdc6b74..c5477f605a 100644
--- a/tests/tools/execute_bash/test_bash_ps1_metadata.py
+++ b/tests/tools/execute_bash/test_bash_ps1_metadata.py
@@ -288,15 +288,19 @@ def test_cmd_output_observation_properties():
     # Test with failed command
     metadata = CmdOutputMetadata(exit_code=1, pid=456)
     obs = ExecuteBashObservation(
-        command="invalid", output="error", exit_code=1, error=True, metadata=metadata
+        command="invalid",
+        output="error",
+        exit_code=1,
+        error="Command failed",
+        metadata=metadata,
     )
     assert obs.command_id == 456
     assert obs.exit_code == 1
-    assert obs.error
+    assert obs.has_error
     assert len(obs.to_llm_content) == 1
     assert isinstance(obs.to_llm_content[0], TextContent)
     assert "exit code 1" in obs.to_llm_content[0].text
-    assert obs.error
+    assert obs.has_error
 
 
 def test_ps1_metadata_empty_fields():
diff --git a/tests/tools/execute_bash/test_observation_truncation.py b/tests/tools/execute_bash/test_observation_truncation.py
index 1be00839fc..3f27759c76 100644
--- a/tests/tools/execute_bash/test_observation_truncation.py
+++ b/tests/tools/execute_bash/test_observation_truncation.py
@@ -20,7 +20,7 @@ def test_execute_bash_observation_truncation_under_limit():
     observation = ExecuteBashObservation(
         output="Short output",
         metadata=metadata,
-        error=False,
+        error=None,
     )
 
     result = observation.to_llm_content
@@ -54,7 +54,7 @@ def test_execute_bash_observation_truncation_over_limit():
     observation = ExecuteBashObservation(
         output=long_output,
         metadata=metadata,
-        error=False,
+        error=None,
     )
 
     result = observation.to_llm_content
@@ -91,7 +91,7 @@ def test_execute_bash_observation_truncation_with_error():
     observation = ExecuteBashObservation(
         output=long_output,
         metadata=metadata,
-        error=True,
+        error="Command failed",
     )
 
     result = observation.to_llm_content
@@ -134,7 +134,7 @@ def test_execute_bash_observation_truncation_exact_limit():
     observation = ExecuteBashObservation(
         output=exact_output,
         metadata=metadata,
-        error=False,
+        error=None,
     )
 
     result = observation.to_llm_content
@@ -164,7 +164,7 @@ def test_execute_bash_observation_truncation_with_prefix_suffix():
     observation = ExecuteBashObservation(
         output=long_output,
         metadata=metadata,
-        error=False,
+        error=None,
     )
 
     result = observation.to_llm_content

From 3ff4cb9420746c26f02e5f342ca98693e7fb53ad Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 27 Oct 2025 22:58:41 +0000
Subject: [PATCH 04/76] test(execute_bash): update assertions to use has_error
 per standardized Observation

Co-authored-by: openhands <openhands@all-hands.dev>
---
 tests/tools/execute_bash/test_bash_session.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/tools/execute_bash/test_bash_session.py b/tests/tools/execute_bash/test_bash_session.py
index e9afd067af..9e96d32017 100644
--- a/tests/tools/execute_bash/test_bash_session.py
+++ b/tests/tools/execute_bash/test_bash_session.py
@@ -317,7 +317,7 @@ def test_empty_command_error(terminal_type):
     # Test empty command without previous command
     obs = session.execute(ExecuteBashAction(command=""))
 
-    assert obs.error is True
+    assert obs.has_error is True
     assert obs.output == "ERROR: No previous running command to retrieve logs from."
     assert len(obs.to_llm_content) == 1
     assert isinstance(obs.to_llm_content[0], TextContent)
@@ -715,7 +715,7 @@ def test_multiple_multiline_commands(terminal_type):
 
             # First test that running multiple commands at once fails
             obs = _run_bash_action(session, joined_cmds)
-            assert obs.error is True
+            assert obs.has_error is True
             assert "Cannot execute multiple commands at once" in obs.output
 
             # Now run each command individually and verify they work

From 4c1d809f8b8c7a0a86aed37b4d74daad63eca5ed Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 27 Oct 2025 23:04:03 +0000
Subject: [PATCH 05/76] refactor(delegate): standardize DelegateObservation to
 use output and base to_llm_content; update tests

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../openhands/tools/delegate/definition.py    |  5 ++--
 .../openhands/tools/delegate/impl.py          | 18 +++++++-------
 tests/tools/delegation/test_delegation.py     | 24 +++++++++----------
 3 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/openhands-tools/openhands/tools/delegate/definition.py b/openhands-tools/openhands/tools/delegate/definition.py
index 2f8be795a6..da7c157698 100644
--- a/openhands-tools/openhands/tools/delegate/definition.py
+++ b/openhands-tools/openhands/tools/delegate/definition.py
@@ -47,12 +47,13 @@ class DelegateObservation(Observation):
     command: CommandLiteral = Field(
         description="The command that was executed. Either `spawn` or `delegate`."
     )
-    message: str = Field(description="Result message from the operation")
+    output: str = Field(default="", description="Result message from the operation")
 
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         """Get the observation content to show to the agent."""
-        return [TextContent(text=self.message)]
+        # Use standardized base behavior by prioritizing error then output
+        return super().to_llm_content
 
 
 TOOL_DESCRIPTION = """Delegation tool for spawning sub-agents and delegating tasks to them.
diff --git a/openhands-tools/openhands/tools/delegate/impl.py b/openhands-tools/openhands/tools/delegate/impl.py
index 2b79797c16..59ddc363d7 100644
--- a/openhands-tools/openhands/tools/delegate/impl.py
+++ b/openhands-tools/openhands/tools/delegate/impl.py
@@ -60,7 +60,7 @@ def __call__(  # type: ignore[override]
         else:
             return DelegateObservation(
                 command=action.command,
-                message=f"Unsupported command: {action.command}",
+                output=f"Unsupported command: {action.command}",
             )
 
     def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
@@ -76,13 +76,13 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
         if not action.ids:
             return DelegateObservation(
                 command="spawn",
-                message="Error: at least one ID is required for spawn action",
+                output="Error: at least one ID is required for spawn action",
             )
 
         if len(self._sub_agents) + len(action.ids) > self._max_children:
             return DelegateObservation(
                 command="spawn",
-                message=(
+                output=(
                     f"Cannot spawn {len(action.ids)} agents. "
                     f"Already have {len(self._sub_agents)} agents, "
                     f"maximum is {self._max_children}"
@@ -117,14 +117,14 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
             message = f"Successfully spawned {len(action.ids)} sub-agents: {agent_list}"
             return DelegateObservation(
                 command="spawn",
-                message=message,
+                output=message,
             )
 
         except Exception as e:
             logger.error(f"Error: failed to spawn agents: {e}", exc_info=True)
             return DelegateObservation(
                 command="spawn",
-                message=f"Error: failed to spawn agents: {str(e)}",
+                output=f"Error: failed to spawn agents: {str(e)}",
             )
 
     def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
@@ -141,7 +141,7 @@ def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
         if not action.tasks:
             return DelegateObservation(
                 command="delegate",
-                message="Error: at least one task is required for delegate action",
+                output="Error: at least one task is required for delegate action",
             )
 
         # Check that all requested agent IDs exist
@@ -149,7 +149,7 @@ def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
         if missing_agents:
             return DelegateObservation(
                 command="delegate",
-                message=(
+                output=(
                     f"Error: sub-agents not found: {', '.join(missing_agents)}. "
                     f"Available agents: {', '.join(self._sub_agents.keys())}"
                 ),
@@ -223,12 +223,12 @@ def run_task(agent_id: str, conversation: LocalConversation, task: str):
 
             return DelegateObservation(
                 command="delegate",
-                message=message,
+                output=message,
             )
 
         except Exception as e:
             logger.error(f"Failed to delegate tasks: {e}", exc_info=True)
             return DelegateObservation(
                 command="delegate",
-                message=f"Error: failed to delegate tasks: {str(e)}",
+                output=f"Error: failed to delegate tasks: {str(e)}",
             )
diff --git a/tests/tools/delegation/test_delegation.py b/tests/tools/delegation/test_delegation.py
index d3f9d855b5..aad6754ada 100644
--- a/tests/tools/delegation/test_delegation.py
+++ b/tests/tools/delegation/test_delegation.py
@@ -68,21 +68,21 @@ def test_delegate_observation_creation():
     # Test spawn observation
     spawn_observation = DelegateObservation(
         command="spawn",
-        message="Sub-agents created successfully",
+        output="Sub-agents created successfully",
     )
     assert spawn_observation.command == "spawn"
-    assert spawn_observation.message == "Sub-agents created successfully"
+    assert spawn_observation.output == "Sub-agents created successfully"
     # spawn observation doesn't have results field anymore
 
     # Test delegate observation
     delegate_observation = DelegateObservation(
         command="delegate",
-        message="Tasks completed successfully\n\nResults:\n1. Result 1\n2. Result 2",
+        output="Tasks completed successfully\n\nResults:\n1. Result 1\n2. Result 2",
     )
     assert delegate_observation.command == "delegate"
-    assert "Tasks completed successfully" in delegate_observation.message
-    assert "Result 1" in delegate_observation.message
-    assert "Result 2" in delegate_observation.message
+    assert "Tasks completed successfully" in delegate_observation.output
+    assert "Result 1" in delegate_observation.output
+    assert "Result 2" in delegate_observation.output
 
 
 def test_delegate_executor_delegate():
@@ -92,7 +92,7 @@ def test_delegate_executor_delegate():
     # First spawn some agents
     spawn_action = DelegateAction(command="spawn", ids=["agent1", "agent2"])
     spawn_observation = executor(spawn_action, parent_conversation)
-    assert "Successfully spawned" in spawn_observation.message
+    assert "Successfully spawned" in spawn_observation.output
 
     # Then delegate tasks to them
     delegate_action = DelegateAction(
@@ -103,7 +103,7 @@ def test_delegate_executor_delegate():
     with patch.object(executor, "_delegate_tasks") as mock_delegate:
         mock_observation = DelegateObservation(
             command="delegate",
-            message=(
+            output=(
                 "Tasks completed successfully\n\nResults:\n"
                 "1. Agent agent1: Code analysis complete\n"
                 "2. Agent agent2: Tests written"
@@ -115,8 +115,8 @@ def test_delegate_executor_delegate():
 
     assert isinstance(observation, DelegateObservation)
     assert observation.command == "delegate"
-    assert "Agent agent1: Code analysis complete" in observation.message
-    assert "Agent agent2: Tests written" in observation.message
+    assert "Agent agent1: Code analysis complete" in observation.output
+    assert "Agent agent2: Tests written" in observation.output
 
 
 def test_delegate_executor_missing_task():
@@ -131,8 +131,8 @@ def test_delegate_executor_missing_task():
     assert isinstance(observation, DelegateObservation)
     assert observation.command == "delegate"
     assert (
-        "task is required" in observation.message.lower()
-        or "at least one task" in observation.message.lower()
+        "task is required" in observation.output.lower()
+        or "at least one task" in observation.output.lower()
     )
 
 

From d0ca50ac3264b62c8f13905995c44e80d34f9057 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 28 Oct 2025 11:07:22 +0000
Subject: [PATCH 06/76] refactor: improve observation consistency

- Update _format_error to use 'Tool Execution Error:' prefix
- Remove command field from DelegateObservation and TaskTrackerObservation
- Remove unnecessary to_llm_content override in DelegateObservation
- Update delegate/impl.py to use error field for errors (not output)
- Prefix success messages with command name (spawn:/delegate:/plan:/view:)
- Update TaskTrackerExecutor to handle errors via error field
- Update tests to match new observation structure
- Ensure output is not populated when error is present

These changes make observations more consistent by:
1. Eliminating the redundant command field
2. Ensuring errors and outputs are mutually exclusive
3. Relying on base class to_llm_content when possible
4. Keeping command context in the message text when needed

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/tool/schema.py    |  2 +-
 .../openhands/tools/delegate/definition.py    | 10 ------
 .../openhands/tools/delegate/impl.py          | 34 ++++++++-----------
 .../tools/task_tracker/definition.py          | 33 ++++++++----------
 tests/tools/delegation/test_delegation.py     | 25 +++++++-------
 5 files changed, 42 insertions(+), 62 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index 9206ec7ece..facf722ae3 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -215,7 +215,7 @@ def result_status(self) -> ObservationStatus:
         return ObservationStatus.ERROR if self.has_error else ObservationStatus.SUCCESS
 
     def _format_error(self) -> TextContent:
-        return TextContent(text=f"Error: {self.error}")
+        return TextContent(text=f"Tool Execution Error: {self.error}")
 
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
diff --git a/openhands-tools/openhands/tools/delegate/definition.py b/openhands-tools/openhands/tools/delegate/definition.py
index da7c157698..b5540f5bda 100644
--- a/openhands-tools/openhands/tools/delegate/definition.py
+++ b/openhands-tools/openhands/tools/delegate/definition.py
@@ -5,7 +5,6 @@
 
 from pydantic import Field
 
-from openhands.sdk.llm.message import ImageContent, TextContent
 from openhands.sdk.tool.tool import (
     Action,
     Observation,
@@ -44,17 +43,8 @@ class DelegateAction(Action):
 class DelegateObservation(Observation):
     """Observation from delegation operations."""
 
-    command: CommandLiteral = Field(
-        description="The command that was executed. Either `spawn` or `delegate`."
-    )
     output: str = Field(default="", description="Result message from the operation")
 
-    @property
-    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        """Get the observation content to show to the agent."""
-        # Use standardized base behavior by prioritizing error then output
-        return super().to_llm_content
-
 
 TOOL_DESCRIPTION = """Delegation tool for spawning sub-agents and delegating tasks to them.
 
diff --git a/openhands-tools/openhands/tools/delegate/impl.py b/openhands-tools/openhands/tools/delegate/impl.py
index 59ddc363d7..a87552e099 100644
--- a/openhands-tools/openhands/tools/delegate/impl.py
+++ b/openhands-tools/openhands/tools/delegate/impl.py
@@ -59,8 +59,7 @@ def __call__(  # type: ignore[override]
             return self._delegate_tasks(action)
         else:
             return DelegateObservation(
-                command=action.command,
-                output=f"Unsupported command: {action.command}",
+                error=f"Unsupported command: {action.command}",
             )
 
     def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
@@ -75,15 +74,13 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
         """
         if not action.ids:
             return DelegateObservation(
-                command="spawn",
-                output="Error: at least one ID is required for spawn action",
+                error="spawn: at least one ID is required for spawn action",
             )
 
         if len(self._sub_agents) + len(action.ids) > self._max_children:
             return DelegateObservation(
-                command="spawn",
-                output=(
-                    f"Cannot spawn {len(action.ids)} agents. "
+                error=(
+                    f"spawn: Cannot spawn {len(action.ids)} agents. "
                     f"Already have {len(self._sub_agents)} agents, "
                     f"maximum is {self._max_children}"
                 ),
@@ -114,17 +111,18 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
                 logger.info(f"Spawned sub-agent with ID: {agent_id}")
 
             agent_list = ", ".join(action.ids)
-            message = f"Successfully spawned {len(action.ids)} sub-agents: {agent_list}"
+            message = (
+                f"spawn: Successfully spawned {len(action.ids)} sub-agents: "
+                f"{agent_list}"
+            )
             return DelegateObservation(
-                command="spawn",
                 output=message,
             )
 
         except Exception as e:
             logger.error(f"Error: failed to spawn agents: {e}", exc_info=True)
             return DelegateObservation(
-                command="spawn",
-                output=f"Error: failed to spawn agents: {str(e)}",
+                error=f"spawn: failed to spawn agents: {str(e)}",
             )
 
     def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
@@ -140,17 +138,15 @@ def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
         """
         if not action.tasks:
             return DelegateObservation(
-                command="delegate",
-                output="Error: at least one task is required for delegate action",
+                error="delegate: at least one task is required for delegate action",
             )
 
         # Check that all requested agent IDs exist
         missing_agents = set(action.tasks.keys()) - set(self._sub_agents.keys())
         if missing_agents:
             return DelegateObservation(
-                command="delegate",
-                output=(
-                    f"Error: sub-agents not found: {', '.join(missing_agents)}. "
+                error=(
+                    f"delegate: sub-agents not found: {', '.join(missing_agents)}. "
                     f"Available agents: {', '.join(self._sub_agents.keys())}"
                 ),
             )
@@ -211,7 +207,7 @@ def run_task(agent_id: str, conversation: LocalConversation, task: str):
                     all_results.append(f"Agent {agent_id}: No result")
 
             # Create comprehensive message with results
-            message = f"Completed delegation of {len(action.tasks)} tasks"
+            message = f"delegate: Completed delegation of {len(action.tasks)} tasks"
             if errors:
                 message += f" with {len(errors)} errors"
 
@@ -222,13 +218,11 @@ def run_task(agent_id: str, conversation: LocalConversation, task: str):
                 message += f"\n\nResults:\n{results_text}"
 
             return DelegateObservation(
-                command="delegate",
                 output=message,
             )
 
         except Exception as e:
             logger.error(f"Failed to delegate tasks: {e}", exc_info=True)
             return DelegateObservation(
-                command="delegate",
-                output=f"Error: failed to delegate tasks: {str(e)}",
+                error=f"delegate: failed to delegate tasks: {str(e)}",
             )
diff --git a/openhands-tools/openhands/tools/task_tracker/definition.py b/openhands-tools/openhands/tools/task_tracker/definition.py
index 7ec4c83094..0a2cb23a0f 100644
--- a/openhands-tools/openhands/tools/task_tracker/definition.py
+++ b/openhands-tools/openhands/tools/task_tracker/definition.py
@@ -72,7 +72,6 @@ class TaskTrackerObservation(Observation):
     output: str = Field(
         default="", description="The formatted task list or status message"
     )
-    command: str = Field(default="", description="The command that was executed")
     task_list: list[TaskItem] = Field(
         default_factory=list, description="The current task list"
     )
@@ -91,12 +90,8 @@ def visualize(self) -> Text:
             done_count = sum(1 for task in self.task_list if task.status == "done")
 
             # Show status summary
-            if self.command == "plan":
-                content.append("✅ ", style="green")
-                content.append("Task list updated: ", style="green")
-            else:  # view command
-                content.append("📋 ", style="blue")
-                content.append("Current task list: ", style="blue")
+            content.append("📋 ", style="blue")
+            content.append("Task list: ", style="blue")
 
             # Status counts
             status_parts = []
@@ -170,28 +165,30 @@ def __call__(
             if self.save_dir:
                 self._save_tasks()
             return TaskTrackerObservation(
-                output="Task list has been updated with "
-                + f"{len(self._task_list)} item(s).",
-                command=action.command,
+                output=(
+                    f"plan: Task list has been updated with "
+                    f"{len(self._task_list)} item(s)."
+                ),
                 task_list=self._task_list,
             )
         elif action.command == "view":
             # Return the current task list
             if not self._task_list:
                 return TaskTrackerObservation(
-                    output='No task list found. Use the "plan" command to create one.',
-                    command=action.command,
+                    output=(
+                        'view: No task list found. Use the "plan" command to '
+                        "create one."
+                    ),
                     task_list=[],
                 )
             content = self._format_task_list(self._task_list)
-            return TaskTrackerObservation(
-                output=content, command=action.command, task_list=self._task_list
-            )
+            return TaskTrackerObservation(output=content, task_list=self._task_list)
         else:
             return TaskTrackerObservation(
-                output=f"Unknown command: {action.command}. "
-                + 'Supported commands are "view" and "plan".',
-                command=action.command,
+                error=(
+                    f"Unknown command: {action.command}. "
+                    'Supported commands are "view" and "plan".'
+                ),
                 task_list=[],
             )
 
diff --git a/tests/tools/delegation/test_delegation.py b/tests/tools/delegation/test_delegation.py
index aad6754ada..3e27ddd0b2 100644
--- a/tests/tools/delegation/test_delegation.py
+++ b/tests/tools/delegation/test_delegation.py
@@ -67,19 +67,18 @@ def test_delegate_observation_creation():
     """Test creating DelegateObservation instances."""
     # Test spawn observation
     spawn_observation = DelegateObservation(
-        command="spawn",
-        output="Sub-agents created successfully",
+        output="spawn: Sub-agents created successfully",
     )
-    assert spawn_observation.command == "spawn"
-    assert spawn_observation.output == "Sub-agents created successfully"
+    assert spawn_observation.output == "spawn: Sub-agents created successfully"
     # spawn observation doesn't have results field anymore
 
     # Test delegate observation
     delegate_observation = DelegateObservation(
-        command="delegate",
-        output="Tasks completed successfully\n\nResults:\n1. Result 1\n2. Result 2",
+        output=(
+            "delegate: Tasks completed successfully\n\nResults:\n"
+            "1. Result 1\n2. Result 2"
+        ),
     )
-    assert delegate_observation.command == "delegate"
     assert "Tasks completed successfully" in delegate_observation.output
     assert "Result 1" in delegate_observation.output
     assert "Result 2" in delegate_observation.output
@@ -102,9 +101,8 @@ def test_delegate_executor_delegate():
 
     with patch.object(executor, "_delegate_tasks") as mock_delegate:
         mock_observation = DelegateObservation(
-            command="delegate",
             output=(
-                "Tasks completed successfully\n\nResults:\n"
+                "delegate: Tasks completed successfully\n\nResults:\n"
                 "1. Agent agent1: Code analysis complete\n"
                 "2. Agent agent2: Tests written"
             ),
@@ -114,7 +112,6 @@ def test_delegate_executor_delegate():
         observation = executor(delegate_action, parent_conversation)
 
     assert isinstance(observation, DelegateObservation)
-    assert observation.command == "delegate"
     assert "Agent agent1: Code analysis complete" in observation.output
     assert "Agent agent2: Tests written" in observation.output
 
@@ -129,10 +126,12 @@ def test_delegate_executor_missing_task():
     observation = executor(action, parent_conversation)
 
     assert isinstance(observation, DelegateObservation)
-    assert observation.command == "delegate"
+    # Error message should be in the error field
+    assert observation.has_error
+    assert observation.error is not None
     assert (
-        "task is required" in observation.output.lower()
-        or "at least one task" in observation.output.lower()
+        "task is required" in observation.error.lower()
+        or "at least one task" in observation.error.lower()
     )
 
 

From b7efb0d274689562213b6f4d2eb4d217402e0255 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 28 Oct 2025 11:20:39 +0000
Subject: [PATCH 07/76] refactor: simplify MCPToolObservation to use base
 to_llm_content

- Remove custom to_llm_content override from MCPToolObservation
- Rename convrted_content to converted_content (typo fix)
- Populate error field with full content when MCP result has error
- Populate output field with formatted content for successful results
- Update tests to match new behavior (to_llm_content now yields one
  combined TextContent instead of a header item plus content blocks)
- Error messages now include: header + error notice + content
- Success messages now include: header + content

This makes MCPToolObservation consistent with other observations by
relying on the base class to_llm_content and properly separating
error vs output fields.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/mcp/definition.py | 44 ++++++++++++-------
 tests/sdk/mcp/test_mcp_tool.py                | 18 ++++----
 2 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/mcp/definition.py b/openhands-sdk/openhands/sdk/mcp/definition.py
index b0cff07653..c96e4a2bab 100644
--- a/openhands-sdk/openhands/sdk/mcp/definition.py
+++ b/openhands-sdk/openhands/sdk/mcp/definition.py
@@ -1,7 +1,6 @@
 """MCPTool definition and implementation."""
 
 import json
-from collections.abc import Sequence
 from typing import Any
 
 import mcp.types
@@ -66,12 +65,12 @@ def from_call_tool_result(
     ) -> "MCPToolObservation":
         """Create an MCPToolObservation from a CallToolResult."""
         content: list[mcp.types.ContentBlock] = result.content
-        convrted_content = []
+        converted_content = []
         for block in content:
             if isinstance(block, mcp.types.TextContent):
-                convrted_content.append(TextContent(text=block.text))
+                converted_content.append(TextContent(text=block.text))
             elif isinstance(block, mcp.types.ImageContent):
-                convrted_content.append(
+                converted_content.append(
                     ImageContent(
                         image_urls=[f"data:{block.mimeType};base64,{block.data}"],
                     )
@@ -80,19 +79,32 @@ def from_call_tool_result(
                 logger.warning(
                     f"Unsupported MCP content block type: {type(block)}. Ignoring."
                 )
-        return cls(
-            content=convrted_content,
-            error=("MCP tool error" if result.isError else None),
-            tool_name=tool_name,
-        )
 
-    @property
-    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        """Format the observation for agent display."""
-        initial = f"[Tool '{self.tool_name}' executed.]\n"
-        if self.has_error:
-            initial += "[An error occurred during execution.]\n"
-        return [TextContent(text=initial)] + self.content
+        # Convert content to string for output/error field
+        content_str = ""
+        for block in converted_content:
+            if isinstance(block, TextContent):
+                content_str += block.text + "\n"
+            elif isinstance(block, ImageContent):
+                content_str += f"[Image with {len(block.image_urls)} URLs]\n"
+
+        header = f"[Tool '{tool_name}' executed.]\n"
+
+        # Populate error or output field based on result status
+        if result.isError:
+            error_msg = header + "[An error occurred during execution.]\n" + content_str
+            return cls(
+                content=converted_content,
+                error=error_msg,
+                tool_name=tool_name,
+            )
+        else:
+            output_msg = header + content_str
+            return cls(
+                content=converted_content,
+                output=output_msg,
+                tool_name=tool_name,
+            )
 
     @property
     def visualize(self) -> Text:
diff --git a/tests/sdk/mcp/test_mcp_tool.py b/tests/sdk/mcp/test_mcp_tool.py
index 7fa0fe59c0..5a6cf5cb9b 100644
--- a/tests/sdk/mcp/test_mcp_tool.py
+++ b/tests/sdk/mcp/test_mcp_tool.py
@@ -88,32 +88,34 @@ def test_to_llm_content_success(self):
         observation = MCPToolObservation(
             tool_name="test_tool",
             content=[TextContent(text="Success result")],
-            error=None,
+            output="[Tool 'test_tool' executed.]\nSuccess result\n",
         )
 
         agent_obs = observation.to_llm_content
-        assert len(agent_obs) == 2
+        assert len(agent_obs) == 1
         assert isinstance(agent_obs[0], TextContent)
         assert "[Tool 'test_tool' executed.]" in agent_obs[0].text
+        assert "Success result" in agent_obs[0].text
         assert "[An error occurred during execution.]" not in agent_obs[0].text
-        assert isinstance(agent_obs[1], TextContent)
-        assert agent_obs[1].text == "Success result"
 
     def test_to_llm_content_error(self):
         """Test agent observation formatting for error."""
         observation = MCPToolObservation(
             tool_name="test_tool",
             content=[TextContent(text="Error occurred")],
-            error="execution failed",
+            error=(
+                "[Tool 'test_tool' executed.]\n"
+                "[An error occurred during execution.]\n"
+                "Error occurred\n"
+            ),
         )
 
         agent_obs = observation.to_llm_content
-        assert len(agent_obs) == 2
+        assert len(agent_obs) == 1
         assert isinstance(agent_obs[0], TextContent)
-        assert isinstance(agent_obs[1], TextContent)
         assert "[Tool 'test_tool' executed.]" in agent_obs[0].text
         assert "[An error occurred during execution.]" in agent_obs[0].text
-        assert agent_obs[1].text == "Error occurred"
+        assert "Error occurred" in agent_obs[0].text
 
 
 class TestMCPToolExecutor:

From a93a9e2b9b5b77936832f7edf34a8943c15a3895 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 28 Oct 2025 11:25:13 +0000
Subject: [PATCH 08/76] refactor: use error field for multiple command error in
 ExecuteBash

- Stop populating the output field in the multiple-commands error case
- Move the full error message into the error field instead
- Update test to check error field instead of output field
- Fix line length by extracting commands_list variable

This makes ExecuteBashObservation consistent with the pattern that
errors should use the error field, not the output field.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../tools/execute_bash/terminal/terminal_session.py | 13 +++++++------
 tests/tools/execute_bash/test_bash_session.py       |  3 ++-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
index dcc16010b6..fad4f457b5 100644
--- a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
+++ b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
@@ -326,14 +326,15 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
         # Check if the command is a single command or multiple commands
         splited_commands = split_bash_commands(command)
         if len(splited_commands) > 1:
+            commands_list = "\n".join(
+                f"({i + 1}) {cmd}" for i, cmd in enumerate(splited_commands)
+            )
             return ExecuteBashObservation(
-                output=(
-                    f"ERROR: Cannot execute multiple commands at once.\n"
-                    f"Please run each command separately OR chain them into a single "
-                    f"command via && or ;\nProvided commands:\n"
-                    f"{'\n'.join(f'({i + 1}) {cmd}' for i, cmd in enumerate(splited_commands))}"  # noqa: E501
+                error=(
+                    "ERROR: Cannot execute multiple commands at once.\n"
+                    "Please run each command separately OR chain them into a single "
+                    f"command via && or ;\nProvided commands:\n{commands_list}"
                 ),
-                error="Cannot execute multiple commands at once",
             )
 
         # Get initial state before sending command
diff --git a/tests/tools/execute_bash/test_bash_session.py b/tests/tools/execute_bash/test_bash_session.py
index 9e96d32017..98d8af4861 100644
--- a/tests/tools/execute_bash/test_bash_session.py
+++ b/tests/tools/execute_bash/test_bash_session.py
@@ -716,7 +716,8 @@ def test_multiple_multiline_commands(terminal_type):
             # First test that running multiple commands at once fails
             obs = _run_bash_action(session, joined_cmds)
             assert obs.has_error is True
-            assert "Cannot execute multiple commands at once" in obs.output
+            assert obs.error is not None
+            assert "Cannot execute multiple commands at once" in obs.error
 
             # Now run each command individually and verify they work
             results = []

From bded49d925afbd899e37e0b704250976f26fe2f0 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 28 Oct 2025 11:28:44 +0000
Subject: [PATCH 09/76] refactor: remove redundant to_llm_content from
 FileEditorObservation

The FileEditorObservation.to_llm_content was simply returning error or output,
which is exactly what the base Observation class now provides by default.
Removing this override allows the observation to use the standardized base
implementation.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-tools/openhands/tools/file_editor/definition.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/openhands-tools/openhands/tools/file_editor/definition.py b/openhands-tools/openhands/tools/file_editor/definition.py
index f05eb4e765..19c328a455 100644
--- a/openhands-tools/openhands/tools/file_editor/definition.py
+++ b/openhands-tools/openhands/tools/file_editor/definition.py
@@ -10,7 +10,6 @@
     from openhands.sdk.conversation.state import ConversationState
 from rich.text import Text
 
-from openhands.sdk.llm import ImageContent, TextContent
 from openhands.sdk.tool import (
     Action,
     Observation,
@@ -88,12 +87,6 @@ class FileEditorObservation(Observation):
 
     _diff_cache: Text | None = PrivateAttr(default=None)
 
-    @property
-    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        if self.error:
-            return [TextContent(text=f"Error: {self.error}")]
-        return [TextContent(text=self.output)]
-
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation of this observation.

From 262e2a57532af40ef79cd95ccc933e9bc8262ed4 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 28 Oct 2025 11:36:39 +0000
Subject: [PATCH 10/76] refactor: revert TaskTrackerObservation changes to
 preserve original interface

- Restored 'command' field to TaskTrackerObservation
- Changed 'output' back to 'content' field
- Restored original visualize method that uses command field to determine styling
- Added custom to_llm_content implementation
- Updated executor to populate command field in observations

This preserves the original interface while maintaining consistency with the
base observation standardization changes.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../tools/task_tracker/definition.py          | 27 ++++++++++++++-----
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/openhands-tools/openhands/tools/task_tracker/definition.py b/openhands-tools/openhands/tools/task_tracker/definition.py
index 0a2cb23a0f..93387c63e4 100644
--- a/openhands-tools/openhands/tools/task_tracker/definition.py
+++ b/openhands-tools/openhands/tools/task_tracker/definition.py
@@ -11,6 +11,7 @@
     from openhands.sdk.conversation.state import ConversationState
 from rich.text import Text
 
+from openhands.sdk.llm.message import ImageContent, TextContent
 from openhands.sdk.logger import get_logger
 from openhands.sdk.tool import (
     Action,
@@ -69,13 +70,18 @@ def visualize(self) -> Text:
 class TaskTrackerObservation(Observation):
     """This data class represents the result of a task tracking operation."""
 
-    output: str = Field(
+    content: str = Field(
         default="", description="The formatted task list or status message"
     )
+    command: str = Field(default="", description="The command that was executed")
     task_list: list[TaskItem] = Field(
         default_factory=list, description="The current task list"
     )
 
+    @property
+    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
+        return [TextContent(text=self.content)]
+
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation with task list formatting."""
@@ -90,8 +96,12 @@ def visualize(self) -> Text:
             done_count = sum(1 for task in self.task_list if task.status == "done")
 
             # Show status summary
-            content.append("📋 ", style="blue")
-            content.append("Task list: ", style="blue")
+            if self.command == "plan":
+                content.append("✅ ", style="green")
+                content.append("Task list updated: ", style="green")
+            else:  # view command
+                content.append("📋 ", style="blue")
+                content.append("Current task list: ", style="blue")
 
             # Status counts
             status_parts = []
@@ -165,30 +175,35 @@ def __call__(
             if self.save_dir:
                 self._save_tasks()
             return TaskTrackerObservation(
-                output=(
+                content=(
                     f"plan: Task list has been updated with "
                     f"{len(self._task_list)} item(s)."
                 ),
+                command=action.command,
                 task_list=self._task_list,
             )
         elif action.command == "view":
             # Return the current task list
             if not self._task_list:
                 return TaskTrackerObservation(
-                    output=(
+                    content=(
                         'view: No task list found. Use the "plan" command to '
                         "create one."
                     ),
+                    command=action.command,
                     task_list=[],
                 )
             content = self._format_task_list(self._task_list)
-            return TaskTrackerObservation(output=content, task_list=self._task_list)
+            return TaskTrackerObservation(
+                content=content, command=action.command, task_list=self._task_list
+            )
         else:
             return TaskTrackerObservation(
                 error=(
                     f"Unknown command: {action.command}. "
                     'Supported commands are "view" and "plan".'
                 ),
+                command=action.command,
                 task_list=[],
             )
 

From f0aaea0095423f311db576bb1eee312190eea2d3 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 28 Oct 2025 12:03:00 +0000
Subject: [PATCH 11/76] refactor: add optional command field to base
 Observation and update to_llm_content

- Added command: str | None field to base Observation class
- Updated to_llm_content to prepend 'Command: {command}' when command is present
- Removed to_llm_content overrides from simple observations (Glob, Grep, TaskTracker)
- Updated executors to populate output field instead of defining custom to_llm_content
- Kept complex to_llm_content overrides (Browser, ExecuteBash) that handle images and metadata

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/tool/schema.py    | 13 +++--
 .../openhands/tools/glob/definition.py        | 34 ++-----------
 openhands-tools/openhands/tools/glob/impl.py  | 19 +++++++
 .../openhands/tools/grep/definition.py        | 45 ++---------------
 openhands-tools/openhands/tools/grep/impl.py  | 49 +++++++++++++++++++
 .../tools/task_tracker/definition.py          | 23 ++-------
 6 files changed, 91 insertions(+), 92 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index facf722ae3..816936ab37 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -203,6 +203,9 @@ class Observation(Schema, ABC):
     error: str | None = Field(
         default=None, description="Error message if operation failed"
     )
+    command: str | None = Field(
+        default=None, description="The command that was executed, if applicable"
+    )
 
     @property
     def has_error(self) -> bool:
@@ -224,10 +227,14 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         Subclasses can override to provide richer content (e.g., images, diffs),
         but should preserve the error-first convention.
         """
+        # Prepend command if present
+        command_prefix = f"Command: {self.command}\n\n" if self.command else ""
+
         if self.error:
-            return [self._format_error()]
-        if self.output:
-            return [TextContent(text=self.output)]
+            error_text = self._format_error().text
+            return [TextContent(text=command_prefix + error_text)]
+        elif self.output:
+            return [TextContent(text=command_prefix + self.output)]
         return []
 
     @property
diff --git a/openhands-tools/openhands/tools/glob/definition.py b/openhands-tools/openhands/tools/glob/definition.py
index c18d86425b..218d08ee9a 100644
--- a/openhands-tools/openhands/tools/glob/definition.py
+++ b/openhands-tools/openhands/tools/glob/definition.py
@@ -1,17 +1,17 @@
 """Glob tool implementation for fast file pattern matching."""
 
 import os
-from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
 from pydantic import Field
 
+from openhands.sdk.tool import Action, Observation, ToolAnnotations, ToolDefinition
+
 
 if TYPE_CHECKING:
-    from openhands.sdk.conversation.state import ConversationState
+    from collections.abc import Sequence
 
-from openhands.sdk.llm import ImageContent, TextContent
-from openhands.sdk.tool import Action, Observation, ToolAnnotations, ToolDefinition
+    from openhands.sdk.conversation.state import ConversationState
 
 
 class GlobAction(Action):
@@ -40,32 +40,6 @@ class GlobObservation(Observation):
     truncated: bool = Field(
         default=False, description="Whether results were truncated to 100 files"
     )
-    error: str | None = Field(default=None, description="Error message if any")
-
-    @property
-    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        """Convert observation to LLM content."""
-        if self.error:
-            return [TextContent(text=f"Error: {self.error}")]
-
-        if not self.files:
-            content = (
-                f"No files found matching pattern '{self.pattern}' "
-                f"in directory '{self.search_path}'"
-            )
-        else:
-            file_list = "\n".join(self.files)
-            content = (
-                f"Found {len(self.files)} file(s) matching pattern "
-                f"'{self.pattern}' in '{self.search_path}':\n{file_list}"
-            )
-            if self.truncated:
-                content += (
-                    "\n\n[Results truncated to first 100 files. "
-                    "Consider using a more specific pattern.]"
-                )
-
-        return [TextContent(text=content)]
 
 
 TOOL_DESCRIPTION = """Fast file pattern matching tool.
diff --git a/openhands-tools/openhands/tools/glob/impl.py b/openhands-tools/openhands/tools/glob/impl.py
index 1b566d5d4c..29f6cbbd1d 100644
--- a/openhands-tools/openhands/tools/glob/impl.py
+++ b/openhands-tools/openhands/tools/glob/impl.py
@@ -79,11 +79,30 @@ def __call__(
             else:
                 files, truncated = self._execute_with_glob(pattern, search_path)
 
+            # Format output message
+            if not files:
+                output = (
+                    f"No files found matching pattern '{original_pattern}' "
+                    f"in directory '{search_path}'"
+                )
+            else:
+                file_list = "\n".join(files)
+                output = (
+                    f"Found {len(files)} file(s) matching pattern "
+                    f"'{original_pattern}' in '{search_path}':\n{file_list}"
+                )
+                if truncated:
+                    output += (
+                        "\n\n[Results truncated to first 100 files. "
+                        "Consider using a more specific pattern.]"
+                    )
+
             return GlobObservation(
                 files=files,
                 pattern=original_pattern,
                 search_path=str(search_path),
                 truncated=truncated,
+                output=output,
             )
 
         except Exception as e:
diff --git a/openhands-tools/openhands/tools/grep/definition.py b/openhands-tools/openhands/tools/grep/definition.py
index 4913fde795..a9a59bf042 100644
--- a/openhands-tools/openhands/tools/grep/definition.py
+++ b/openhands-tools/openhands/tools/grep/definition.py
@@ -1,17 +1,17 @@
 """Grep tool implementation for fast content search."""
 
 import os
-from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
 from pydantic import Field
 
+from openhands.sdk.tool import Action, Observation, ToolAnnotations, ToolDefinition
+
 
 if TYPE_CHECKING:
-    from openhands.sdk.conversation.state import ConversationState
+    from collections.abc import Sequence
 
-from openhands.sdk.llm import ImageContent, TextContent
-from openhands.sdk.tool import Action, Observation, ToolAnnotations, ToolDefinition
+    from openhands.sdk.conversation.state import ConversationState
 
 
 class GrepAction(Action):
@@ -46,43 +46,6 @@ class GrepObservation(Observation):
     truncated: bool = Field(
         default=False, description="Whether results were truncated to 100 files"
     )
-    error: str | None = Field(default=None, description="Error message if any")
-
-    @property
-    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        """Convert observation to LLM content."""
-        if self.error:
-            return [TextContent(text=f"Error: {self.error}")]
-
-        if not self.matches:
-            include_info = (
-                f" (filtered by '{self.include_pattern}')"
-                if self.include_pattern
-                else ""
-            )
-            content = (
-                f"No files found containing pattern '{self.pattern}' "
-                f"in directory '{self.search_path}'{include_info}"
-            )
-        else:
-            include_info = (
-                f" (filtered by '{self.include_pattern}')"
-                if self.include_pattern
-                else ""
-            )
-            file_list = "\n".join(self.matches)
-            content = (
-                f"Found {len(self.matches)} file(s) containing pattern "
-                f"'{self.pattern}' in '{self.search_path}'{include_info}:\n"
-                f"{file_list}"
-            )
-            if self.truncated:
-                content += (
-                    "\n\n[Results truncated to first 100 files. "
-                    "Consider using a more specific pattern.]"
-                )
-
-        return [TextContent(text=content)]
 
 
 TOOL_DESCRIPTION = """Fast content search tool.
diff --git a/openhands-tools/openhands/tools/grep/impl.py b/openhands-tools/openhands/tools/grep/impl.py
index ef99434221..4f22e5f3c2 100644
--- a/openhands-tools/openhands/tools/grep/impl.py
+++ b/openhands-tools/openhands/tools/grep/impl.py
@@ -100,6 +100,37 @@ def __call__(
                 error=str(e),
             )
 
+    def _format_output(
+        self,
+        matches: list[str],
+        pattern: str,
+        search_path: str,
+        include_pattern: str | None,
+        truncated: bool,
+    ) -> str:
+        """Format the grep observation output message."""
+        if not matches:
+            include_info = (
+                f" (filtered by '{include_pattern}')" if include_pattern else ""
+            )
+            return (
+                f"No files found containing pattern '{pattern}' "
+                f"in directory '{search_path}'{include_info}"
+            )
+
+        include_info = f" (filtered by '{include_pattern}')" if include_pattern else ""
+        file_list = "\n".join(matches)
+        output = (
+            f"Found {len(matches)} file(s) containing pattern "
+            f"'{pattern}' in '{search_path}'{include_info}:\n{file_list}"
+        )
+        if truncated:
+            output += (
+                "\n\n[Results truncated to first 100 files. "
+                "Consider using a more specific pattern.]"
+            )
+        return output
+
     def _execute_with_ripgrep(
         self, action: GrepAction, search_path: Path
     ) -> GrepObservation:
@@ -135,12 +166,21 @@ def _execute_with_ripgrep(
 
         truncated = len(matches) >= 100
 
+        output = self._format_output(
+            matches=matches,
+            pattern=action.pattern,
+            search_path=str(search_path),
+            include_pattern=action.include,
+            truncated=truncated,
+        )
+
         return GrepObservation(
             matches=matches,
             pattern=action.pattern,
             search_path=str(search_path),
             include_pattern=action.include,
             truncated=truncated,
+            output=output,
         )
 
     def _execute_with_grep(
@@ -189,10 +229,19 @@ def _execute_with_grep(
 
         truncated = len(matches) >= 100
 
+        output = self._format_output(
+            matches=matches,
+            pattern=action.pattern,
+            search_path=str(search_path),
+            include_pattern=action.include,
+            truncated=truncated,
+        )
+
         return GrepObservation(
             matches=matches,
             pattern=action.pattern,
             search_path=str(search_path),
             include_pattern=action.include,
             truncated=truncated,
+            output=output,
         )
diff --git a/openhands-tools/openhands/tools/task_tracker/definition.py b/openhands-tools/openhands/tools/task_tracker/definition.py
index 93387c63e4..d408b3a092 100644
--- a/openhands-tools/openhands/tools/task_tracker/definition.py
+++ b/openhands-tools/openhands/tools/task_tracker/definition.py
@@ -11,7 +11,6 @@
     from openhands.sdk.conversation.state import ConversationState
 from rich.text import Text
 
-from openhands.sdk.llm.message import ImageContent, TextContent
 from openhands.sdk.logger import get_logger
 from openhands.sdk.tool import (
     Action,
@@ -70,18 +69,10 @@ def visualize(self) -> Text:
 class TaskTrackerObservation(Observation):
     """This data class represents the result of a task tracking operation."""
 
-    content: str = Field(
-        default="", description="The formatted task list or status message"
-    )
-    command: str = Field(default="", description="The command that was executed")
     task_list: list[TaskItem] = Field(
         default_factory=list, description="The current task list"
     )
 
-    @property
-    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        return [TextContent(text=self.content)]
-
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation with task list formatting."""
@@ -175,9 +166,8 @@ def __call__(
             if self.save_dir:
                 self._save_tasks()
             return TaskTrackerObservation(
-                content=(
-                    f"plan: Task list has been updated with "
-                    f"{len(self._task_list)} item(s)."
+                output=(
+                    f"Task list has been updated with {len(self._task_list)} item(s)."
                 ),
                 command=action.command,
                 task_list=self._task_list,
@@ -186,16 +176,13 @@ def __call__(
             # Return the current task list
             if not self._task_list:
                 return TaskTrackerObservation(
-                    content=(
-                        'view: No task list found. Use the "plan" command to '
-                        "create one."
-                    ),
+                    output='No task list found. Use the "plan" command to create one.',
                     command=action.command,
                     task_list=[],
                 )
-            content = self._format_task_list(self._task_list)
+            output = self._format_task_list(self._task_list)
             return TaskTrackerObservation(
-                content=content, command=action.command, task_list=self._task_list
+                output=output, command=action.command, task_list=self._task_list
             )
         else:
             return TaskTrackerObservation(

From 11d54958c7cf522848d0b0bebcfd252d36ff5b96 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 28 Oct 2025 12:58:25 +0000
Subject: [PATCH 12/76] refactor: clean up observation subclasses for
 consistency

- Remove 'output', 'error', 'command' field redeclarations from observation subclasses where possible
- Update MCPToolObservation to use output field instead of content field
- Store text content in output, images in separate images field
- Update MCP tests to match new structure
- Keep FileEditorObservation.command field (required for internal logic)
- When an error occurs, populate only the error field, not output

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/mcp/definition.py | 82 +++++++++++--------
 .../openhands/tools/browser_use/definition.py |  2 -
 .../openhands/tools/delegate/definition.py    |  2 +-
 .../tools/execute_bash/definition.py          |  5 --
 .../openhands/tools/file_editor/definition.py | 12 ++-
 tests/sdk/mcp/test_mcp_tool.py                | 43 +++++-----
 6 files changed, 76 insertions(+), 70 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/mcp/definition.py b/openhands-sdk/openhands/sdk/mcp/definition.py
index c96e4a2bab..ff5586c0c9 100644
--- a/openhands-sdk/openhands/sdk/mcp/definition.py
+++ b/openhands-sdk/openhands/sdk/mcp/definition.py
@@ -1,6 +1,7 @@
 """MCPTool definition and implementation."""
 
 import json
+from collections.abc import Sequence
 from typing import Any
 
 import mcp.types
@@ -50,12 +51,9 @@ def to_mcp_arguments(self) -> dict:
 class MCPToolObservation(Observation):
     """Observation from MCP tool execution."""
 
-    content: list[TextContent | ImageContent] = Field(
+    images: list[ImageContent] = Field(
         default_factory=list,
-        description=(
-            "Content returned from the MCP tool converted to LLM Ready TextContent "
-            "or ImageContent"
-        ),
+        description="Image content returned from the MCP tool",
     )
     tool_name: str = Field(description="Name of the tool that was called")
 
@@ -65,12 +63,14 @@ def from_call_tool_result(
     ) -> "MCPToolObservation":
         """Create an MCPToolObservation from a CallToolResult."""
         content: list[mcp.types.ContentBlock] = result.content
-        converted_content = []
+        text_parts = []
+        images = []
+
         for block in content:
             if isinstance(block, mcp.types.TextContent):
-                converted_content.append(TextContent(text=block.text))
+                text_parts.append(block.text)
             elif isinstance(block, mcp.types.ImageContent):
-                converted_content.append(
+                images.append(
                     ImageContent(
                         image_urls=[f"data:{block.mimeType};base64,{block.data}"],
                     )
@@ -80,48 +80,66 @@ def from_call_tool_result(
                     f"Unsupported MCP content block type: {type(block)}. Ignoring."
                 )
 
-        # Convert content to string for output/error field
-        content_str = ""
-        for block in converted_content:
-            if isinstance(block, TextContent):
-                content_str += block.text + "\n"
-            elif isinstance(block, ImageContent):
-                content_str += f"[Image with {len(block.image_urls)} URLs]\n"
-
-        header = f"[Tool '{tool_name}' executed.]\n"
+        header = f"[Tool '{tool_name}' executed.]"
+        text_content = "\n".join(text_parts) if text_parts else ""
 
         # Populate error or output field based on result status
         if result.isError:
-            error_msg = header + "[An error occurred during execution.]\n" + content_str
+            error_msg = (
+                f"{header}\n[An error occurred during execution.]\n{text_content}"
+            )
+            # When there is an error, don't populate output
             return cls(
-                content=converted_content,
                 error=error_msg,
+                images=images,
                 tool_name=tool_name,
             )
         else:
-            output_msg = header + content_str
+            # When success, don't populate error
+            output_msg = f"{header}\n{text_content}" if text_content else header
             return cls(
-                content=converted_content,
                 output=output_msg,
+                images=images,
                 tool_name=tool_name,
             )
 
+    @property
+    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
+        """Return structured content with images for LLM consumption.
+
+        Overrides base to preserve image content alongside text.
+        """
+        result: list[TextContent | ImageContent] = []
+
+        if self.error:
+            result.append(self._format_error())
+        elif self.output:
+            result.append(TextContent(text=self.output))
+
+        # Append images
+        result.extend(self.images)
+        return result
+
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation of this observation."""
         content = Text()
         content.append(f"[MCP Tool '{self.tool_name}' Observation]\n", style="bold")
+
         if self.has_error:
             content.append("[Error during execution]\n", style="bold red")
-        for block in self.content:
-            if isinstance(block, TextContent):
-                # try to see if block.text is a JSON
-                try:
-                    parsed = json.loads(block.text)
-                    content.append(display_dict(parsed))
-                    continue
-                except (json.JSONDecodeError, TypeError):
-                    content.append(block.text + "\n")
-            elif isinstance(block, ImageContent):
-                content.append(f"[Image with {len(block.image_urls)} URLs]\n")
+            if self.error:
+                content.append(self.error + "\n")
+        elif self.output:
+            # Try to parse as JSON for better display
+            try:
+                parsed = json.loads(self.output)
+                content.append(display_dict(parsed))
+            except (json.JSONDecodeError, TypeError):
+                content.append(self.output + "\n")
+
+        # Show images if present
+        for image in self.images:
+            content.append(f"[Image with {len(image.image_urls)} URLs]\n")
+
         return content
diff --git a/openhands-tools/openhands/tools/browser_use/definition.py b/openhands-tools/openhands/tools/browser_use/definition.py
index 90cc8ca55e..60ebeed8d2 100644
--- a/openhands-tools/openhands/tools/browser_use/definition.py
+++ b/openhands-tools/openhands/tools/browser_use/definition.py
@@ -28,8 +28,6 @@
 class BrowserObservation(Observation):
     """Base observation for browser operations."""
 
-    output: str = Field(description="The output message from the browser operation")
-    error: str | None = Field(default=None, description="Error message if any")
     screenshot_data: str | None = Field(
         default=None, description="Base64 screenshot data if available"
     )
diff --git a/openhands-tools/openhands/tools/delegate/definition.py b/openhands-tools/openhands/tools/delegate/definition.py
index b5540f5bda..81eafa06fb 100644
--- a/openhands-tools/openhands/tools/delegate/definition.py
+++ b/openhands-tools/openhands/tools/delegate/definition.py
@@ -43,7 +43,7 @@ class DelegateAction(Action):
 class DelegateObservation(Observation):
     """Observation from delegation operations."""
 
-    output: str = Field(default="", description="Result message from the operation")
+    pass
 
 
 TOOL_DESCRIPTION = """Delegation tool for spawning sub-agents and delegating tasks to them.
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index e3a111847c..782613a3f1 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -79,11 +79,6 @@ def visualize(self) -> Text:
 class ExecuteBashObservation(Observation):
     """A ToolResult that can be rendered as a CLI output."""
 
-    output: str = Field(default="", description="The raw output from the tool.")
-    command: str | None = Field(
-        default=None,
-        description="The bash command that was executed. Can be empty string if the observation is from a previous command that hit soft timeout and is not yet finished.",  # noqa
-    )
     exit_code: int | None = Field(
         default=None,
         description="The exit code of the command. -1 indicates the process hit the soft timeout and is not yet finished.",  # noqa
diff --git a/openhands-tools/openhands/tools/file_editor/definition.py b/openhands-tools/openhands/tools/file_editor/definition.py
index 19c328a455..9331c2e317 100644
--- a/openhands-tools/openhands/tools/file_editor/definition.py
+++ b/openhands-tools/openhands/tools/file_editor/definition.py
@@ -65,12 +65,11 @@ class FileEditorAction(Action):
 class FileEditorObservation(Observation):
     """A ToolResult that can be rendered as a CLI output."""
 
-    command: CommandLiteral = Field(
-        description="The commands to run. Allowed options are: `view`, `create`, "
-        "`str_replace`, `insert`, `undo_edit`."
-    )
-    output: str = Field(
-        default="", description="The output message from the tool for the LLM to see."
+    command: CommandLiteral = Field(  # type: ignore[assignment]
+        description=(
+            "The command that was run: `view`, `create`, `str_replace`, "
+            "`insert`, or `undo_edit`."
+        )
     )
     path: str | None = Field(default=None, description="The file path that was edited.")
     prev_exist: bool = Field(
@@ -83,7 +82,6 @@ class FileEditorObservation(Observation):
     new_content: str | None = Field(
         default=None, description="The content of the file after the edit."
     )
-    error: str | None = Field(default=None, description="Error message if any.")
 
     _diff_cache: Text | None = PrivateAttr(default=None)
 
diff --git a/tests/sdk/mcp/test_mcp_tool.py b/tests/sdk/mcp/test_mcp_tool.py
index 5a6cf5cb9b..3e8bc28813 100644
--- a/tests/sdk/mcp/test_mcp_tool.py
+++ b/tests/sdk/mcp/test_mcp_tool.py
@@ -37,9 +37,10 @@ def test_from_call_tool_result_success(self):
         )
 
         assert observation.tool_name == "test_tool"
-        assert len(observation.content) == 1
-        assert isinstance(observation.content[0], TextContent)
-        assert observation.content[0].text == "Operation completed successfully"
+        assert observation.output is not None
+        assert "[Tool 'test_tool' executed.]" in observation.output
+        assert "Operation completed successfully" in observation.output
+        assert len(observation.images) == 0
         assert observation.has_error is False
 
     def test_from_call_tool_result_error(self):
@@ -54,9 +55,11 @@ def test_from_call_tool_result_error(self):
         )
 
         assert observation.tool_name == "test_tool"
-        assert len(observation.content) == 1
-        assert isinstance(observation.content[0], TextContent)
-        assert observation.content[0].text == "Operation failed"
+        assert observation.error is not None
+        assert "[Tool 'test_tool' executed.]" in observation.error
+        assert "[An error occurred during execution.]" in observation.error
+        assert "Operation failed" in observation.error
+        assert len(observation.images) == 0
         assert observation.has_error is True
 
     def test_from_call_tool_result_with_image(self):
@@ -76,19 +79,18 @@ def test_from_call_tool_result_with_image(self):
         )
 
         assert observation.tool_name == "test_tool"
-        assert len(observation.content) == 2
-        assert isinstance(observation.content[0], TextContent)
-        assert observation.content[0].text == "Here's the image:"
-        # Second content should be ImageContent
-        assert hasattr(observation.content[1], "image_urls")
+        assert observation.output is not None
+        assert "[Tool 'test_tool' executed.]" in observation.output
+        assert "Here's the image:" in observation.output
+        assert len(observation.images) == 1
+        assert hasattr(observation.images[0], "image_urls")
         assert observation.has_error is False
 
     def test_to_llm_content_success(self):
         """Test agent observation formatting for success."""
         observation = MCPToolObservation(
             tool_name="test_tool",
-            content=[TextContent(text="Success result")],
-            output="[Tool 'test_tool' executed.]\nSuccess result\n",
+            output="[Tool 'test_tool' executed.]\nSuccess result",
         )
 
         agent_obs = observation.to_llm_content
@@ -102,17 +104,17 @@ def test_to_llm_content_error(self):
         """Test agent observation formatting for error."""
         observation = MCPToolObservation(
             tool_name="test_tool",
-            content=[TextContent(text="Error occurred")],
             error=(
                 "[Tool 'test_tool' executed.]\n"
                 "[An error occurred during execution.]\n"
-                "Error occurred\n"
+                "Error occurred"
             ),
         )
 
         agent_obs = observation.to_llm_content
         assert len(agent_obs) == 1
         assert isinstance(agent_obs[0], TextContent)
+        assert "Tool Execution Error:" in agent_obs[0].text
         assert "[Tool 'test_tool' executed.]" in agent_obs[0].text
         assert "[An error occurred during execution.]" in agent_obs[0].text
         assert "Error occurred" in agent_obs[0].text
@@ -191,12 +193,7 @@ def test_call_tool_exception(self):
         # Mock call_async_from_sync to return an error observation
         def mock_call_async_from_sync(coro_func, **kwargs):
             return MCPToolObservation(
-                content=[
-                    TextContent(
-                        text="Error calling MCP tool test_tool: Connection failed"
-                    )
-                ],
-                error="execution failed",
+                error="Error calling MCP tool test_tool: Connection failed",
                 tool_name="test_tool",
             )
 
@@ -205,10 +202,10 @@ def mock_call_async_from_sync(coro_func, **kwargs):
         observation = self.executor(mock_action)
 
         assert isinstance(observation, MCPToolObservation)
-        assert isinstance(observation.content[0], TextContent)
         assert observation.tool_name == "test_tool"
         assert observation.has_error is True
-        assert "Connection failed" in observation.content[0].text
+        assert observation.error is not None
+        assert "Connection failed" in observation.error
 
 
 class TestMCPTool:

From b52ce10ff07dcd2d2dcfcdd4b76b191ad518b943 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Tue, 28 Oct 2025 14:19:00 +0100
Subject: [PATCH 13/76] update

---
 .../openhands/sdk/tool/builtins/finish.py     | 16 ++++-----
 .../openhands/sdk/tool/builtins/think.py      | 16 +++------
 .../openhands/tools/delegate/definition.py    |  2 --
 .../openhands/tools/delegate/impl.py          | 36 +++++++++++--------
 4 files changed, 33 insertions(+), 37 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/tool/builtins/finish.py b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
index ec17b2960c..dd0e44f623 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/finish.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
@@ -29,13 +29,11 @@ def visualize(self) -> Text:
 
 
 class FinishObservation(Observation):
-    output: str = Field(default="", description="Final message sent to the user.")
-
-    @property
-    def visualize(self) -> Text:
-        """Return Rich Text representation - empty since action shows the message."""
-        # Don't duplicate the finish message display - action already shows it
-        return Text()
+    """
+    Observation returned after finishing a task.
+    The FinishAction itself contains the message sent to the user so no
+    extra fields are needed here.
+    """
 
 
 TOOL_DESCRIPTION = """Signals the completion of the current task or conversation.
@@ -55,10 +53,10 @@ def visualize(self) -> Text:
 class FinishExecutor(ToolExecutor):
     def __call__(
         self,
-        action: FinishAction,
+        action: FinishAction,  # noqa: ARG002
         conversation: "BaseConversation | None" = None,  # noqa: ARG002
     ) -> FinishObservation:
-        return FinishObservation(output=action.message)
+        return FinishObservation()
 
 
 FinishTool = ToolDefinition(
diff --git a/openhands-sdk/openhands/sdk/tool/builtins/think.py b/openhands-sdk/openhands/sdk/tool/builtins/think.py
index 43438e1625..3d2608fde4 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/think.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/think.py
@@ -43,17 +43,11 @@ def visualize(self) -> Text:
 
 
 class ThinkObservation(Observation):
-    """Observation returned after logging a thought."""
-
-    output: str = Field(
-        default="Your thought has been logged.", description="Confirmation message."
-    )
-
-    @property
-    def visualize(self) -> Text:
-        """Return Rich Text representation - empty since action shows the thought."""
-        # Don't duplicate the thought display - action already shows it
-        return Text()
+    """
+    Observation returned after logging a thought.
+    The ThinkAction itself contains the thought logged so no extra
+    fields are needed here.
+    """
 
 
 THINK_DESCRIPTION = """Use the tool to think about something. It will not obtain new information or make any changes to the repository, but just log the thought. Use it when complex reasoning or brainstorming is needed.
diff --git a/openhands-tools/openhands/tools/delegate/definition.py b/openhands-tools/openhands/tools/delegate/definition.py
index 81eafa06fb..14eb2f0b14 100644
--- a/openhands-tools/openhands/tools/delegate/definition.py
+++ b/openhands-tools/openhands/tools/delegate/definition.py
@@ -43,8 +43,6 @@ class DelegateAction(Action):
 class DelegateObservation(Observation):
     """Observation from delegation operations."""
 
-    pass
-
 
 TOOL_DESCRIPTION = """Delegation tool for spawning sub-agents and delegating tasks to them.
 
diff --git a/openhands-tools/openhands/tools/delegate/impl.py b/openhands-tools/openhands/tools/delegate/impl.py
index a87552e099..f2014ee47f 100644
--- a/openhands-tools/openhands/tools/delegate/impl.py
+++ b/openhands-tools/openhands/tools/delegate/impl.py
@@ -59,7 +59,8 @@ def __call__(  # type: ignore[override]
             return self._delegate_tasks(action)
         else:
             return DelegateObservation(
-                error=f"Unsupported command: {action.command}",
+                command=action.command,
+                error="Unsupported command. Available commands: spawn, delegate",
             )
 
     def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
@@ -74,13 +75,15 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
         """
         if not action.ids:
             return DelegateObservation(
-                error="spawn: at least one ID is required for spawn action",
+                command="spawn",
+                error="At least one ID is required for spawn action",
             )
 
         if len(self._sub_agents) + len(action.ids) > self._max_children:
             return DelegateObservation(
+                command="spawn",
                 error=(
-                    f"spawn: Cannot spawn {len(action.ids)} agents. "
+                    f"Cannot spawn {len(action.ids)} agents. "
                     f"Already have {len(self._sub_agents)} agents, "
                     f"maximum is {self._max_children}"
                 ),
@@ -111,18 +114,17 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
                 logger.info(f"Spawned sub-agent with ID: {agent_id}")
 
             agent_list = ", ".join(action.ids)
-            message = (
-                f"spawn: Successfully spawned {len(action.ids)} sub-agents: "
-                f"{agent_list}"
-            )
+            message = f"Successfully spawned {len(action.ids)} sub-agents: {agent_list}"
             return DelegateObservation(
+                command="spawn",
                 output=message,
             )
 
         except Exception as e:
             logger.error(f"Error: failed to spawn agents: {e}", exc_info=True)
             return DelegateObservation(
-                error=f"spawn: failed to spawn agents: {str(e)}",
+                command="spawn",
+                error=f"failed to spawn agents: {str(e)}",
             )
 
     def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
@@ -138,15 +140,17 @@ def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
         """
         if not action.tasks:
             return DelegateObservation(
-                error="delegate: at least one task is required for delegate action",
+                command="delegate",
+                error="at least one task is required for delegate action",
             )
 
         # Check that all requested agent IDs exist
         missing_agents = set(action.tasks.keys()) - set(self._sub_agents.keys())
         if missing_agents:
             return DelegateObservation(
+                command="delegate",
                 error=(
-                    f"delegate: sub-agents not found: {', '.join(missing_agents)}. "
+                    f"sub-agents not found: {', '.join(missing_agents)}. "
                     f"Available agents: {', '.join(self._sub_agents.keys())}"
                 ),
             )
@@ -207,22 +211,24 @@ def run_task(agent_id: str, conversation: LocalConversation, task: str):
                     all_results.append(f"Agent {agent_id}: No result")
 
             # Create comprehensive message with results
-            message = f"delegate: Completed delegation of {len(action.tasks)} tasks"
+            output = f"Completed delegation of {len(action.tasks)} tasks"
             if errors:
-                message += f" with {len(errors)} errors"
+                output += f" with {len(errors)} errors"
 
             if all_results:
                 results_text = "\n".join(
                     f"{i}. {result}" for i, result in enumerate(all_results, 1)
                 )
-                message += f"\n\nResults:\n{results_text}"
+                output += f"\n\nResults:\n{results_text}"
 
             return DelegateObservation(
-                output=message,
+                command="delegate",
+                error=output,
             )
 
         except Exception as e:
             logger.error(f"Failed to delegate tasks: {e}", exc_info=True)
             return DelegateObservation(
-                error=f"delegate: failed to delegate tasks: {str(e)}",
+                command="delegate",
+                error=f"failed to delegate tasks: {str(e)}",
             )

From 2c218cf98a4ecf67e4443a1768b1ea25e6004faf Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 28 Oct 2025 13:36:42 +0000
Subject: [PATCH 14/76] fix: populate output field in ThinkObservation and
 FinishObservation

- Update ThinkExecutor to set output='Your thought has been logged.'
- Update FinishExecutor to set output=action.message
- Fix test_view_batch_atomicity.py to use output instead of content
- Fix test_mcp_tool_kind_field.py to check output/error instead of content
- Import Sequence at runtime (not under TYPE_CHECKING) in glob and grep

These changes ensure observations consistently use the base output field
rather than custom fields, aligning with the standardization effort.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/tool/builtins/finish.py |  4 ++--
 openhands-sdk/openhands/sdk/tool/builtins/think.py  |  2 +-
 openhands-tools/openhands/tools/glob/definition.py  |  3 +--
 openhands-tools/openhands/tools/grep/definition.py  |  3 +--
 tests/sdk/context/test_view_batch_atomicity.py      |  2 +-
 tests/sdk/mcp/test_mcp_tool_kind_field.py           | 10 +++++++---
 6 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/tool/builtins/finish.py b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
index dd0e44f623..7cb30f4ebf 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/finish.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
@@ -53,10 +53,10 @@ class FinishObservation(Observation):
 class FinishExecutor(ToolExecutor):
     def __call__(
         self,
-        action: FinishAction,  # noqa: ARG002
+        action: FinishAction,
         conversation: "BaseConversation | None" = None,  # noqa: ARG002
     ) -> FinishObservation:
-        return FinishObservation()
+        return FinishObservation(output=action.message)
 
 
 FinishTool = ToolDefinition(
diff --git a/openhands-sdk/openhands/sdk/tool/builtins/think.py b/openhands-sdk/openhands/sdk/tool/builtins/think.py
index 3d2608fde4..a698eff418 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/think.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/think.py
@@ -68,7 +68,7 @@ def __call__(
         _: ThinkAction,
         conversation: "BaseConversation | None" = None,  # noqa: ARG002
     ) -> ThinkObservation:
-        return ThinkObservation()
+        return ThinkObservation(output="Your thought has been logged.")
 
 
 ThinkTool = ToolDefinition(
diff --git a/openhands-tools/openhands/tools/glob/definition.py b/openhands-tools/openhands/tools/glob/definition.py
index 218d08ee9a..efb440eb78 100644
--- a/openhands-tools/openhands/tools/glob/definition.py
+++ b/openhands-tools/openhands/tools/glob/definition.py
@@ -1,6 +1,7 @@
 """Glob tool implementation for fast file pattern matching."""
 
 import os
+from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
 from pydantic import Field
@@ -9,8 +10,6 @@
 
 
 if TYPE_CHECKING:
-    from collections.abc import Sequence
-
     from openhands.sdk.conversation.state import ConversationState
 
 
diff --git a/openhands-tools/openhands/tools/grep/definition.py b/openhands-tools/openhands/tools/grep/definition.py
index a9a59bf042..e1dfb97cd1 100644
--- a/openhands-tools/openhands/tools/grep/definition.py
+++ b/openhands-tools/openhands/tools/grep/definition.py
@@ -1,6 +1,7 @@
 """Grep tool implementation for fast content search."""
 
 import os
+from collections.abc import Sequence
 from typing import TYPE_CHECKING
 
 from pydantic import Field
@@ -9,8 +10,6 @@
 
 
 if TYPE_CHECKING:
-    from collections.abc import Sequence
-
     from openhands.sdk.conversation.state import ConversationState
 
 
diff --git a/tests/sdk/context/test_view_batch_atomicity.py b/tests/sdk/context/test_view_batch_atomicity.py
index dd27f82de9..d4e0b4c8bc 100644
--- a/tests/sdk/context/test_view_batch_atomicity.py
+++ b/tests/sdk/context/test_view_batch_atomicity.py
@@ -57,7 +57,7 @@ def create_observation_event(
 ) -> ObservationEvent:
     """Helper to create an ObservationEvent."""
     observation = MCPToolObservation(
-        content=[TextContent(text=content)],
+        output=content,
         tool_name=tool_name,
     )
     return ObservationEvent(
diff --git a/tests/sdk/mcp/test_mcp_tool_kind_field.py b/tests/sdk/mcp/test_mcp_tool_kind_field.py
index a62444615f..87f20139fd 100644
--- a/tests/sdk/mcp/test_mcp_tool_kind_field.py
+++ b/tests/sdk/mcp/test_mcp_tool_kind_field.py
@@ -85,11 +85,15 @@ def test_real_mcp_tool_execution_without_kind_field(fetch_tool):
     observation = fetch_tool(action)
 
     # Verify we got a valid response (not an error about 'kind')
-    assert observation.content is not None
-    assert len(observation.content) > 0
+    # Check output if no error, otherwise check error message
+    if observation.has_error:
+        assert observation.error is not None
+        content_str = observation.error
+    else:
+        assert observation.output is not None
+        content_str = observation.output
 
     # Check that the response doesn't contain validation error about 'kind'
-    content_str = str(observation.content)
     if "error" in content_str.lower():
         # If there's an error, make sure it's not about 'kind' field
         assert "kind" not in content_str.lower(), (

From b4a29fcc547237a252443c6a9db592e5dcc5587e Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 28 Oct 2025 14:20:32 +0000
Subject: [PATCH 15/76] refactor: improve Observation consistency and error
 handling

- Changed base Observation._format_error to use 'Tool Execution Error:' prefix
- Ensured observations don't populate output when there is an error
- Removed command parameter from ExecuteBashObservation instantiations
- Updated ExecuteBashObservation to override output field as str
- Updated ExecuteBashObservation.to_llm_content to return error without output
- Fixed terminal_session.py to only populate error field in error cases
- Removed 'ERROR:' prefix from error messages (base class adds prefix)
- Updated test_bash_session.py to check error field instead of output
- Updated all observation tests to work with new error handling

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/mcp/definition.py | 59 +++++++------------
 .../openhands/sdk/tool/builtins/finish.py     |  3 +-
 .../openhands/sdk/tool/builtins/think.py      |  5 +-
 openhands-sdk/openhands/sdk/tool/schema.py    | 25 ++++----
 .../openhands/tools/delegate/impl.py          | 23 +++-----
 .../tools/execute_bash/definition.py          |  7 ++-
 .../execute_bash/terminal/terminal_session.py |  9 +--
 .../sdk/context/test_view_batch_atomicity.py  |  2 +-
 .../local/test_confirmation_mode.py           | 15 ++++-
 tests/sdk/mcp/test_mcp_tool.py                | 25 ++++----
 tests/sdk/mcp/test_mcp_tool_kind_field.py     |  8 ++-
 .../browser_use/test_browser_observation.py   | 36 +++++++----
 tests/tools/delegation/test_delegation.py     | 53 +++++++++++------
 tests/tools/execute_bash/test_bash_session.py |  7 ++-
 14 files changed, 148 insertions(+), 129 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/mcp/definition.py b/openhands-sdk/openhands/sdk/mcp/definition.py
index ff5586c0c9..3fd0761032 100644
--- a/openhands-sdk/openhands/sdk/mcp/definition.py
+++ b/openhands-sdk/openhands/sdk/mcp/definition.py
@@ -1,7 +1,6 @@
 """MCPTool definition and implementation."""
 
 import json
-from collections.abc import Sequence
 from typing import Any
 
 import mcp.types
@@ -51,10 +50,6 @@ def to_mcp_arguments(self) -> dict:
 class MCPToolObservation(Observation):
     """Observation from MCP tool execution."""
 
-    images: list[ImageContent] = Field(
-        default_factory=list,
-        description="Image content returned from the MCP tool",
-    )
     tool_name: str = Field(description="Name of the tool that was called")
 
     @classmethod
@@ -64,13 +59,13 @@ def from_call_tool_result(
         """Create an MCPToolObservation from a CallToolResult."""
         content: list[mcp.types.ContentBlock] = result.content
         text_parts = []
-        images = []
+        output_content: list[TextContent | ImageContent] = []
 
         for block in content:
             if isinstance(block, mcp.types.TextContent):
                 text_parts.append(block.text)
             elif isinstance(block, mcp.types.ImageContent):
-                images.append(
+                output_content.append(
                     ImageContent(
                         image_urls=[f"data:{block.mimeType};base64,{block.data}"],
                     )
@@ -91,35 +86,22 @@ def from_call_tool_result(
             # When there is an error, don't populate output
             return cls(
                 error=error_msg,
-                images=images,
                 tool_name=tool_name,
             )
         else:
             # When success, don't populate error
-            output_msg = f"{header}\n{text_content}" if text_content else header
+            # Combine text and images in output
+            if text_content:
+                output_msg = f"{header}\n{text_content}"
+                output_content.insert(0, TextContent(text=output_msg))
+            else:
+                output_content.insert(0, TextContent(text=header))
+
             return cls(
-                output=output_msg,
-                images=images,
+                output=output_content,
                 tool_name=tool_name,
             )
 
-    @property
-    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        """Return structured content with images for LLM consumption.
-
-        Overrides base to preserve image content alongside text.
-        """
-        result: list[TextContent | ImageContent] = []
-
-        if self.error:
-            result.append(self._format_error())
-        elif self.output:
-            result.append(TextContent(text=self.output))
-
-        # Append images
-        result.extend(self.images)
-        return result
-
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation of this observation."""
@@ -131,15 +113,16 @@ def visualize(self) -> Text:
             if self.error:
                 content.append(self.error + "\n")
         elif self.output:
-            # Try to parse as JSON for better display
-            try:
-                parsed = json.loads(self.output)
-                content.append(display_dict(parsed))
-            except (json.JSONDecodeError, TypeError):
-                content.append(self.output + "\n")
-
-        # Show images if present
-        for image in self.images:
-            content.append(f"[Image with {len(image.image_urls)} URLs]\n")
+            # Display all content blocks
+            for block in self.output:
+                if isinstance(block, TextContent):
+                    # Try to parse as JSON for better display
+                    try:
+                        parsed = json.loads(block.text)
+                        content.append(display_dict(parsed))
+                    except (json.JSONDecodeError, TypeError):
+                        content.append(block.text + "\n")
+                elif isinstance(block, ImageContent):
+                    content.append(f"[Image with {len(block.image_urls)} URLs]\n")
 
         return content
diff --git a/openhands-sdk/openhands/sdk/tool/builtins/finish.py b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
index 7cb30f4ebf..369a9ad63a 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/finish.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
@@ -3,6 +3,7 @@
 from pydantic import Field
 from rich.text import Text
 
+from openhands.sdk.llm.message import TextContent
 from openhands.sdk.tool.tool import (
     Action,
     Observation,
@@ -56,7 +57,7 @@ def __call__(
         action: FinishAction,
         conversation: "BaseConversation | None" = None,  # noqa: ARG002
     ) -> FinishObservation:
-        return FinishObservation(output=action.message)
+        return FinishObservation(output=[TextContent(text=action.message)])
 
 
 FinishTool = ToolDefinition(
diff --git a/openhands-sdk/openhands/sdk/tool/builtins/think.py b/openhands-sdk/openhands/sdk/tool/builtins/think.py
index a698eff418..4913fa37ba 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/think.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/think.py
@@ -3,6 +3,7 @@
 from pydantic import Field
 from rich.text import Text
 
+from openhands.sdk.llm.message import TextContent
 from openhands.sdk.tool.tool import (
     Action,
     Observation,
@@ -68,7 +69,9 @@ def __call__(
         _: ThinkAction,
         conversation: "BaseConversation | None" = None,  # noqa: ARG002
     ) -> ThinkObservation:
-        return ThinkObservation(output="Your thought has been logged.")
+        return ThinkObservation(
+            output=[TextContent(text="Your thought has been logged.")]
+        )
 
 
 ThinkTool = ToolDefinition(
diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index 816936ab37..c56c93c272 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -197,15 +197,16 @@ class Observation(Schema, ABC):
     """Base schema for output observation."""
 
     # Standardized primary output and error handling
-    output: str = Field(
-        default="", description="Primary text output from the tool operation"
+    output: list[TextContent | ImageContent] = Field(
+        default_factory=list,
+        description=(
+            "Output returned from the tool converted to LLM Ready "
+            "TextContent or ImageContent"
+        ),
     )
     error: str | None = Field(
         default=None, description="Error message if operation failed"
     )
-    command: str | None = Field(
-        default=None, description="The command that was executed, if applicable"
-    )
 
     @property
     def has_error(self) -> bool:
@@ -217,8 +218,8 @@ def has_error(self) -> bool:
     def result_status(self) -> ObservationStatus:
         return ObservationStatus.ERROR if self.has_error else ObservationStatus.SUCCESS
 
-    def _format_error(self) -> TextContent:
-        return TextContent(text=f"Tool Execution Error: {self.error}")
+    def _format_error(self) -> list[TextContent | ImageContent]:
+        return [TextContent(text=f"Tool Execution Error: {self.error}")]
 
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
@@ -227,15 +228,9 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         Subclasses can override to provide richer content (e.g., images, diffs),
         but should preserve the error-first convention.
         """
-        # Prepend command if present
-        command_prefix = f"Command: {self.command}\n\n" if self.command else ""
-
         if self.error:
-            error_text = self._format_error().text
-            return [TextContent(text=command_prefix + error_text)]
-        elif self.output:
-            return [TextContent(text=command_prefix + self.output)]
-        return []
+            return self._format_error()
+        return self.output
 
     @property
     def visualize(self) -> Text:
diff --git a/openhands-tools/openhands/tools/delegate/impl.py b/openhands-tools/openhands/tools/delegate/impl.py
index f2014ee47f..a2f22c6947 100644
--- a/openhands-tools/openhands/tools/delegate/impl.py
+++ b/openhands-tools/openhands/tools/delegate/impl.py
@@ -5,6 +5,7 @@
 
 from openhands.sdk.conversation.impl.local_conversation import LocalConversation
 from openhands.sdk.conversation.response_utils import get_agent_final_response
+from openhands.sdk.llm import TextContent
 from openhands.sdk.logger import get_logger
 from openhands.sdk.tool.tool import ToolExecutor
 from openhands.tools.delegate.definition import DelegateObservation
@@ -59,8 +60,8 @@ def __call__(  # type: ignore[override]
             return self._delegate_tasks(action)
         else:
             return DelegateObservation(
-                command=action.command,
-                error="Unsupported command. Available commands: spawn, delegate",
+                error=f"Unsupported command: {action.command}. "
+                "Available commands: spawn, delegate",
             )
 
     def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
@@ -75,13 +76,11 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
         """
         if not action.ids:
             return DelegateObservation(
-                command="spawn",
                 error="At least one ID is required for spawn action",
             )
 
         if len(self._sub_agents) + len(action.ids) > self._max_children:
             return DelegateObservation(
-                command="spawn",
                 error=(
                     f"Cannot spawn {len(action.ids)} agents. "
                     f"Already have {len(self._sub_agents)} agents, "
@@ -116,14 +115,12 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
             agent_list = ", ".join(action.ids)
             message = f"Successfully spawned {len(action.ids)} sub-agents: {agent_list}"
             return DelegateObservation(
-                command="spawn",
-                output=message,
+                output=[TextContent(text=message)],
             )
 
         except Exception as e:
             logger.error(f"Error: failed to spawn agents: {e}", exc_info=True)
             return DelegateObservation(
-                command="spawn",
                 error=f"failed to spawn agents: {str(e)}",
             )
 
@@ -140,7 +137,6 @@ def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
         """
         if not action.tasks:
             return DelegateObservation(
-                command="delegate",
                 error="at least one task is required for delegate action",
             )
 
@@ -148,7 +144,6 @@ def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
         missing_agents = set(action.tasks.keys()) - set(self._sub_agents.keys())
         if missing_agents:
             return DelegateObservation(
-                command="delegate",
                 error=(
                     f"sub-agents not found: {', '.join(missing_agents)}. "
                     f"Available agents: {', '.join(self._sub_agents.keys())}"
@@ -211,24 +206,22 @@ def run_task(agent_id: str, conversation: LocalConversation, task: str):
                     all_results.append(f"Agent {agent_id}: No result")
 
             # Create comprehensive message with results
-            output = f"Completed delegation of {len(action.tasks)} tasks"
+            output_text = f"Completed delegation of {len(action.tasks)} tasks"
             if errors:
-                output += f" with {len(errors)} errors"
+                output_text += f" with {len(errors)} errors"
 
             if all_results:
                 results_text = "\n".join(
                     f"{i}. {result}" for i, result in enumerate(all_results, 1)
                 )
-                output += f"\n\nResults:\n{results_text}"
+                output_text += f"\n\nResults:\n{results_text}"
 
             return DelegateObservation(
-                command="delegate",
-                error=output,
+                output=[TextContent(text=output_text)],
             )
 
         except Exception as e:
             logger.error(f"Failed to delegate tasks: {e}", exc_info=True)
             return DelegateObservation(
-                command="delegate",
                 error=f"failed to delegate tasks: {str(e)}",
             )
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index 782613a3f1..72b9ddaf8d 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -79,6 +79,7 @@ def visualize(self) -> Text:
 class ExecuteBashObservation(Observation):
     """A ToolResult that can be rendered as a CLI output."""
 
+    output: str = ""  # type: ignore[assignment]
     exit_code: int | None = Field(
         default=None,
         description="The exit code of the command. -1 indicates the process hit the soft timeout and is not yet finished.",  # noqa
@@ -98,6 +99,10 @@ def command_id(self) -> int | None:
 
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
+        if self.error:
+            # When there's an error, format it appropriately
+            return [TextContent(text=f"Tool Execution Error: {self.error}")]
+
         ret = f"{self.metadata.prefix}{self.output}{self.metadata.suffix}"
         if self.metadata.working_dir:
             ret += f"\n[Current working directory: {self.metadata.working_dir}]"
@@ -105,8 +110,6 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
             ret += f"\n[Python interpreter: {self.metadata.py_interpreter_path}]"
         if self.metadata.exit_code != -1:
             ret += f"\n[Command finished with exit code {self.metadata.exit_code}]"
-        if self.error:
-            ret = f"[There was an error during command execution.]\n{ret}"
         return [TextContent(text=maybe_truncate(ret, MAX_CMD_OUTPUT_SIZE))]
 
     @property
diff --git a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
index fad4f457b5..6e8c9d7bef 100644
--- a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
+++ b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
@@ -188,7 +188,6 @@ def _handle_completed_command(
         self._ready_for_next_command()
         return ExecuteBashObservation(
             output=command_output,
-            command=command,
             metadata=metadata,
         )
 
@@ -222,7 +221,6 @@ def _handle_nochange_timeout_command(
         )
         return ExecuteBashObservation(
             output=command_output,
-            command=command,
             metadata=metadata,
         )
 
@@ -255,10 +253,8 @@ def _handle_hard_timeout_command(
             metadata,
             continue_prefix="[Below is the output of the previous command.]\n",
         )
-
         return ExecuteBashObservation(
             output=command_output,
-            command=command,
             metadata=metadata,
         )
 
@@ -314,12 +310,10 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
         }:
             if command == "":
                 return ExecuteBashObservation(
-                    output="ERROR: No previous running command to retrieve logs from.",
                     error="No previous running command to retrieve logs from.",
                 )
             if is_input:
                 return ExecuteBashObservation(
-                    output="ERROR: No previous running command to interact with.",
                     error="No previous running command to interact with.",
                 )
 
@@ -331,7 +325,7 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
             )
             return ExecuteBashObservation(
                 error=(
-                    "ERROR: Cannot execute multiple commands at once.\n"
+                    "Cannot execute multiple commands at once.\n"
                     "Please run each command separately OR chain them into a single "
                     f"command via && or ;\nProvided commands:\n{commands_list}"
                 ),
@@ -387,7 +381,6 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
             )
             obs = ExecuteBashObservation(
                 output=command_output,
-                command=command,
                 metadata=metadata,
             )
             logger.debug(f"RETURNING OBSERVATION (previous-command): {obs}")
diff --git a/tests/sdk/context/test_view_batch_atomicity.py b/tests/sdk/context/test_view_batch_atomicity.py
index d4e0b4c8bc..7bc134da4b 100644
--- a/tests/sdk/context/test_view_batch_atomicity.py
+++ b/tests/sdk/context/test_view_batch_atomicity.py
@@ -57,7 +57,7 @@ def create_observation_event(
 ) -> ObservationEvent:
     """Helper to create an ObservationEvent."""
     observation = MCPToolObservation(
-        output=content,
+        output=[TextContent(text=content)],
         tool_name=tool_name,
     )
     return ObservationEvent(
diff --git a/tests/sdk/conversation/local/test_confirmation_mode.py b/tests/sdk/conversation/local/test_confirmation_mode.py
index c1daa8dc36..04913ced0c 100644
--- a/tests/sdk/conversation/local/test_confirmation_mode.py
+++ b/tests/sdk/conversation/local/test_confirmation_mode.py
@@ -525,7 +525,10 @@ def test_single_finish_action_skips_confirmation_entirely(self):
             e for e in self.conversation.state.events if isinstance(e, ObservationEvent)
         ]
         assert len(obs_events) == 1
-        assert obs_events[0].observation.output == "Task completed successfully!"
+        assert len(obs_events[0].observation.output) == 1
+        output_block = obs_events[0].observation.output[0]
+        assert isinstance(output_block, TextContent)
+        assert output_block.text == "Task completed successfully!"
 
     def test_think_and_finish_action_skips_confirmation_entirely(self):
         """First step: ThinkAction (skips confirmation). Second step: FinishAction."""
@@ -566,11 +569,17 @@ def test_think_and_finish_action_skips_confirmation_entirely(self):
 
         # 1) ThinkAction observation
         assert hasattr(obs_events[0].observation, "output")
-        assert obs_events[0].observation.output == "Your thought has been logged."
+        assert len(obs_events[0].observation.output) == 1
+        think_output = obs_events[0].observation.output[0]
+        assert isinstance(think_output, TextContent)
+        assert think_output.text == "Your thought has been logged."
 
         # 2) FinishAction observation
         assert hasattr(obs_events[1].observation, "output")
-        assert obs_events[1].observation.output == "Analysis complete"
+        assert len(obs_events[1].observation.output) == 1
+        finish_output = obs_events[1].observation.output[0]
+        assert isinstance(finish_output, TextContent)
+        assert finish_output.text == "Analysis complete"
 
     def test_pause_during_confirmation_preserves_waiting_status(self):
         """Test that pausing during WAITING_FOR_CONFIRMATION preserves the status.
diff --git a/tests/sdk/mcp/test_mcp_tool.py b/tests/sdk/mcp/test_mcp_tool.py
index 3e8bc28813..31d313c7b9 100644
--- a/tests/sdk/mcp/test_mcp_tool.py
+++ b/tests/sdk/mcp/test_mcp_tool.py
@@ -5,7 +5,7 @@
 
 import mcp.types
 
-from openhands.sdk.llm import TextContent
+from openhands.sdk.llm import ImageContent, TextContent
 from openhands.sdk.mcp.client import MCPClient
 from openhands.sdk.mcp.definition import MCPToolObservation
 from openhands.sdk.mcp.tool import MCPToolDefinition, MCPToolExecutor
@@ -38,9 +38,10 @@ def test_from_call_tool_result_success(self):
 
         assert observation.tool_name == "test_tool"
         assert observation.output is not None
-        assert "[Tool 'test_tool' executed.]" in observation.output
-        assert "Operation completed successfully" in observation.output
-        assert len(observation.images) == 0
+        assert len(observation.output) == 1
+        assert isinstance(observation.output[0], TextContent)
+        assert "[Tool 'test_tool' executed.]" in observation.output[0].text
+        assert "Operation completed successfully" in observation.output[0].text
         assert observation.has_error is False
 
     def test_from_call_tool_result_error(self):
@@ -59,7 +60,7 @@ def test_from_call_tool_result_error(self):
         assert "[Tool 'test_tool' executed.]" in observation.error
         assert "[An error occurred during execution.]" in observation.error
         assert "Operation failed" in observation.error
-        assert len(observation.images) == 0
+        assert len(observation.output) == 0
         assert observation.has_error is True
 
     def test_from_call_tool_result_with_image(self):
@@ -80,17 +81,21 @@ def test_from_call_tool_result_with_image(self):
 
         assert observation.tool_name == "test_tool"
         assert observation.output is not None
-        assert "[Tool 'test_tool' executed.]" in observation.output
-        assert "Here's the image:" in observation.output
-        assert len(observation.images) == 1
-        assert hasattr(observation.images[0], "image_urls")
+        assert len(observation.output) == 2
+        # First item is text
+        assert isinstance(observation.output[0], TextContent)
+        assert "[Tool 'test_tool' executed.]" in observation.output[0].text
+        assert "Here's the image:" in observation.output[0].text
+        # Second item is image
+        assert isinstance(observation.output[1], ImageContent)
+        assert hasattr(observation.output[1], "image_urls")
         assert observation.has_error is False
 
     def test_to_llm_content_success(self):
         """Test agent observation formatting for success."""
         observation = MCPToolObservation(
             tool_name="test_tool",
-            output="[Tool 'test_tool' executed.]\nSuccess result",
+            output=[TextContent(text="[Tool 'test_tool' executed.]\nSuccess result")],
         )
 
         agent_obs = observation.to_llm_content
diff --git a/tests/sdk/mcp/test_mcp_tool_kind_field.py b/tests/sdk/mcp/test_mcp_tool_kind_field.py
index 87f20139fd..50195a6660 100644
--- a/tests/sdk/mcp/test_mcp_tool_kind_field.py
+++ b/tests/sdk/mcp/test_mcp_tool_kind_field.py
@@ -91,7 +91,13 @@ def test_real_mcp_tool_execution_without_kind_field(fetch_tool):
         content_str = observation.error
     else:
         assert observation.output is not None
-        content_str = observation.output
+        # Extract text from content blocks
+        from openhands.sdk.llm import TextContent
+
+        text_parts = [
+            block.text for block in observation.output if isinstance(block, TextContent)
+        ]
+        content_str = " ".join(text_parts)
 
     # Check that the response doesn't contain validation error about 'kind'
     if "error" in content_str.lower():
diff --git a/tests/tools/browser_use/test_browser_observation.py b/tests/tools/browser_use/test_browser_observation.py
index b8ae22e000..35bd1955db 100644
--- a/tests/tools/browser_use/test_browser_observation.py
+++ b/tests/tools/browser_use/test_browser_observation.py
@@ -6,18 +6,20 @@
 
 def test_browser_observation_basic_output():
     """Test basic BrowserObservation creation with output."""
-    observation = BrowserObservation(output="Test output")
+    observation = BrowserObservation(output=[TextContent(text="Test output")])
 
-    assert observation.output == "Test output"
+    assert len(observation.output) == 1
+    assert isinstance(observation.output[0], TextContent)
+    assert observation.output[0].text == "Test output"
     assert observation.error is None
     assert observation.screenshot_data is None
 
 
 def test_browser_observation_with_error():
     """Test BrowserObservation with error."""
-    observation = BrowserObservation(output="", error="Test error")
+    observation = BrowserObservation(error="Test error")
 
-    assert observation.output == ""
+    assert len(observation.output) == 0
     assert observation.error == "Test error"
     assert observation.screenshot_data is None
 
@@ -26,17 +28,19 @@ def test_browser_observation_with_screenshot():
     """Test BrowserObservation with screenshot data."""
     screenshot_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77zgAAAABJRU5ErkJggg=="  # noqa: E501
     observation = BrowserObservation(
-        output="Screenshot taken", screenshot_data=screenshot_data
+        output=[TextContent(text="Screenshot taken")], screenshot_data=screenshot_data
     )
 
-    assert observation.output == "Screenshot taken"
+    assert len(observation.output) == 1
+    assert isinstance(observation.output[0], TextContent)
+    assert observation.output[0].text == "Screenshot taken"
     assert observation.error is None
     assert observation.screenshot_data == screenshot_data
 
 
 def test_browser_observation_to_llm_content_text_only():
     """Test to_llm_content property with text only."""
-    observation = BrowserObservation(output="Test output")
+    observation = BrowserObservation(output=[TextContent(text="Test output")])
     agent_obs = observation.to_llm_content
 
     assert len(agent_obs) == 1
@@ -48,7 +52,7 @@ def test_browser_observation_to_llm_content_with_screenshot():
     """Test to_llm_content property with screenshot."""
     screenshot_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77zgAAAABJRU5ErkJggg=="  # noqa: E501
     observation = BrowserObservation(
-        output="Screenshot taken", screenshot_data=screenshot_data
+        output=[TextContent(text="Screenshot taken")], screenshot_data=screenshot_data
     )
     agent_obs = observation.to_llm_content
 
@@ -63,7 +67,7 @@ def test_browser_observation_to_llm_content_with_screenshot():
 
 def test_browser_observation_to_llm_content_with_error():
     """Test to_llm_content property with error."""
-    observation = BrowserObservation(output="", error="Test error")
+    observation = BrowserObservation(error="Test error")
     agent_obs = observation.to_llm_content
 
     assert len(agent_obs) == 1
@@ -75,7 +79,7 @@ def test_browser_observation_output_truncation():
     """Test output truncation for very long outputs."""
     # Create a very long output string
     long_output = "x" * 100000  # 100k characters
-    observation = BrowserObservation(output=long_output)
+    observation = BrowserObservation(output=[TextContent(text=long_output)])
 
     agent_obs = observation.to_llm_content
 
@@ -89,7 +93,9 @@ def test_browser_observation_output_truncation():
 def test_browser_observation_screenshot_data_url_conversion():
     """Test that screenshot data is properly converted to data URL."""
     screenshot_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77zgAAAABJRU5ErkJggg=="  # noqa: E501
-    observation = BrowserObservation(output="Test", screenshot_data=screenshot_data)
+    observation = BrowserObservation(
+        output=[TextContent(text="Test")], screenshot_data=screenshot_data
+    )
 
     agent_obs = observation.to_llm_content
     expected_data_url = f"data:image/png;base64,{screenshot_data}"
@@ -101,10 +107,14 @@ def test_browser_observation_screenshot_data_url_conversion():
 
 def test_browser_observation_empty_screenshot_handling():
     """Test handling of empty or None screenshot data."""
-    observation = BrowserObservation(output="Test", screenshot_data="")
+    observation = BrowserObservation(
+        output=[TextContent(text="Test")], screenshot_data=""
+    )
     agent_obs = observation.to_llm_content
     assert len(agent_obs) == 1  # Only text content, no image
 
-    observation = BrowserObservation(output="Test", screenshot_data=None)
+    observation = BrowserObservation(
+        output=[TextContent(text="Test")], screenshot_data=None
+    )
     agent_obs = observation.to_llm_content
     assert len(agent_obs) == 1  # Only text content, no image
diff --git a/tests/tools/delegation/test_delegation.py b/tests/tools/delegation/test_delegation.py
index 3e27ddd0b2..2ed0f9c57c 100644
--- a/tests/tools/delegation/test_delegation.py
+++ b/tests/tools/delegation/test_delegation.py
@@ -6,7 +6,7 @@
 from pydantic import SecretStr
 
 from openhands.sdk.conversation.state import AgentExecutionStatus
-from openhands.sdk.llm import LLM
+from openhands.sdk.llm import LLM, TextContent
 from openhands.tools.delegate import (
     DelegateAction,
     DelegateExecutor,
@@ -67,21 +67,30 @@ def test_delegate_observation_creation():
     """Test creating DelegateObservation instances."""
     # Test spawn observation
     spawn_observation = DelegateObservation(
-        output="spawn: Sub-agents created successfully",
+        output=[TextContent(text="spawn: Sub-agents created successfully")],
     )
-    assert spawn_observation.output == "spawn: Sub-agents created successfully"
+    assert len(spawn_observation.output) == 1
+    assert isinstance(spawn_observation.output[0], TextContent)
+    assert spawn_observation.output[0].text == "spawn: Sub-agents created successfully"
     # spawn observation doesn't have results field anymore
 
     # Test delegate observation
     delegate_observation = DelegateObservation(
-        output=(
-            "delegate: Tasks completed successfully\n\nResults:\n"
-            "1. Result 1\n2. Result 2"
-        ),
+        output=[
+            TextContent(
+                text=(
+                    "delegate: Tasks completed successfully\n\nResults:\n"
+                    "1. Result 1\n2. Result 2"
+                )
+            )
+        ],
     )
-    assert "Tasks completed successfully" in delegate_observation.output
-    assert "Result 1" in delegate_observation.output
-    assert "Result 2" in delegate_observation.output
+    assert len(delegate_observation.output) == 1
+    output_block = delegate_observation.output[0]
+    assert isinstance(output_block, TextContent)
+    assert "Tasks completed successfully" in output_block.text
+    assert "Result 1" in output_block.text
+    assert "Result 2" in output_block.text
 
 
 def test_delegate_executor_delegate():
@@ -91,7 +100,9 @@ def test_delegate_executor_delegate():
     # First spawn some agents
     spawn_action = DelegateAction(command="spawn", ids=["agent1", "agent2"])
     spawn_observation = executor(spawn_action, parent_conversation)
-    assert "Successfully spawned" in spawn_observation.output
+    output_block = spawn_observation.output[0]
+    assert isinstance(output_block, TextContent)
+    assert "Successfully spawned" in output_block.text
 
     # Then delegate tasks to them
     delegate_action = DelegateAction(
@@ -101,19 +112,25 @@ def test_delegate_executor_delegate():
 
     with patch.object(executor, "_delegate_tasks") as mock_delegate:
         mock_observation = DelegateObservation(
-            output=(
-                "delegate: Tasks completed successfully\n\nResults:\n"
-                "1. Agent agent1: Code analysis complete\n"
-                "2. Agent agent2: Tests written"
-            ),
+            output=[
+                TextContent(
+                    text=(
+                        "delegate: Tasks completed successfully\n\nResults:\n"
+                        "1. Agent agent1: Code analysis complete\n"
+                        "2. Agent agent2: Tests written"
+                    )
+                )
+            ],
         )
         mock_delegate.return_value = mock_observation
 
         observation = executor(delegate_action, parent_conversation)
 
     assert isinstance(observation, DelegateObservation)
-    assert "Agent agent1: Code analysis complete" in observation.output
-    assert "Agent agent2: Tests written" in observation.output
+    obs_block = observation.output[0]
+    assert isinstance(obs_block, TextContent)
+    assert "Agent agent1: Code analysis complete" in obs_block.text
+    assert "Agent agent2: Tests written" in obs_block.text
 
 
 def test_delegate_executor_missing_task():
diff --git a/tests/tools/execute_bash/test_bash_session.py b/tests/tools/execute_bash/test_bash_session.py
index 98d8af4861..3bed2dfdb2 100644
--- a/tests/tools/execute_bash/test_bash_session.py
+++ b/tests/tools/execute_bash/test_bash_session.py
@@ -318,12 +318,13 @@ def test_empty_command_error(terminal_type):
     obs = session.execute(ExecuteBashAction(command=""))
 
     assert obs.has_error is True
-    assert obs.output == "ERROR: No previous running command to retrieve logs from."
+    assert obs.output == ""  # When there's an error, output should not be populated
+    assert obs.error == "No previous running command to retrieve logs from."
     assert len(obs.to_llm_content) == 1
     assert isinstance(obs.to_llm_content[0], TextContent)
-    assert "There was an error during command execution." in obs.to_llm_content[0].text
+    assert "Tool Execution Error:" in obs.to_llm_content[0].text
     assert (
-        "ERROR: No previous running command to retrieve logs from."
+        "No previous running command to retrieve logs from."
         in obs.to_llm_content[0].text
     )
     assert obs.metadata.exit_code == -1

From 5c422bec3d0107c33c1f7407ba706babe624d144 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 28 Oct 2025 14:36:18 +0000
Subject: [PATCH 16/76] refactor: simplify MCPToolObservation to use base
 output field

- Removed custom content field, use base class output field instead
- Converted content items are now stored directly in output field
- On error, the header and all content items are joined into the error
  string (image items are represented by an "[Image]" placeholder)
- Removed unnecessary complexity from from_call_tool_result
- Updated tests to match new structure (header + content items)

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/mcp/definition.py | 40 +++++++++----------
 tests/sdk/mcp/test_mcp_tool.py                | 23 ++++++-----
 2 files changed, 33 insertions(+), 30 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/mcp/definition.py b/openhands-sdk/openhands/sdk/mcp/definition.py
index 3fd0761032..83d98c762a 100644
--- a/openhands-sdk/openhands/sdk/mcp/definition.py
+++ b/openhands-sdk/openhands/sdk/mcp/definition.py
@@ -58,14 +58,13 @@ def from_call_tool_result(
     ) -> "MCPToolObservation":
         """Create an MCPToolObservation from a CallToolResult."""
         content: list[mcp.types.ContentBlock] = result.content
-        text_parts = []
-        output_content: list[TextContent | ImageContent] = []
+        converted_content: list[TextContent | ImageContent] = []
 
         for block in content:
             if isinstance(block, mcp.types.TextContent):
-                text_parts.append(block.text)
+                converted_content.append(TextContent(text=block.text))
             elif isinstance(block, mcp.types.ImageContent):
-                output_content.append(
+                converted_content.append(
                     ImageContent(
                         image_urls=[f"data:{block.mimeType};base64,{block.data}"],
                     )
@@ -75,30 +74,31 @@ def from_call_tool_result(
                     f"Unsupported MCP content block type: {type(block)}. Ignoring."
                 )
 
-        header = f"[Tool '{tool_name}' executed.]"
-        text_content = "\n".join(text_parts) if text_parts else ""
+        # Build initial message
+        initial_message = f"[Tool '{tool_name}' executed.]"
+        if result.isError:
+            initial_message += "\n[An error occurred during execution.]"
+
+        # Prepend initial message to content
+        content_with_header = [TextContent(text=initial_message)] + converted_content
 
         # Populate error or output field based on result status
         if result.isError:
-            error_msg = (
-                f"{header}\n[An error occurred during execution.]\n{text_content}"
-            )
-            # When there is an error, don't populate output
+            # When there is an error, populate error field only with all content
             return cls(
-                error=error_msg,
+                error="\n".join(
+                    [initial_message]
+                    + [
+                        c.text if isinstance(c, TextContent) else "[Image]"
+                        for c in converted_content
+                    ]
+                ),
                 tool_name=tool_name,
             )
         else:
-            # When success, don't populate error
-            # Combine text and images in output
-            if text_content:
-                output_msg = f"{header}\n{text_content}"
-                output_content.insert(0, TextContent(text=output_msg))
-            else:
-                output_content.insert(0, TextContent(text=header))
-
+            # When success, populate output field only
             return cls(
-                output=output_content,
+                output=content_with_header,
                 tool_name=tool_name,
             )
 
diff --git a/tests/sdk/mcp/test_mcp_tool.py b/tests/sdk/mcp/test_mcp_tool.py
index 31d313c7b9..bfff2d6193 100644
--- a/tests/sdk/mcp/test_mcp_tool.py
+++ b/tests/sdk/mcp/test_mcp_tool.py
@@ -38,10 +38,11 @@ def test_from_call_tool_result_success(self):
 
         assert observation.tool_name == "test_tool"
         assert observation.output is not None
-        assert len(observation.output) == 1
+        assert len(observation.output) == 2
         assert isinstance(observation.output[0], TextContent)
-        assert "[Tool 'test_tool' executed.]" in observation.output[0].text
-        assert "Operation completed successfully" in observation.output[0].text
+        assert observation.output[0].text == "[Tool 'test_tool' executed.]"
+        assert isinstance(observation.output[1], TextContent)
+        assert observation.output[1].text == "Operation completed successfully"
         assert observation.has_error is False
 
     def test_from_call_tool_result_error(self):
@@ -81,14 +82,16 @@ def test_from_call_tool_result_with_image(self):
 
         assert observation.tool_name == "test_tool"
         assert observation.output is not None
-        assert len(observation.output) == 2
-        # First item is text
+        assert len(observation.output) == 3
+        # First item is header
         assert isinstance(observation.output[0], TextContent)
-        assert "[Tool 'test_tool' executed.]" in observation.output[0].text
-        assert "Here's the image:" in observation.output[0].text
-        # Second item is image
-        assert isinstance(observation.output[1], ImageContent)
-        assert hasattr(observation.output[1], "image_urls")
+        assert observation.output[0].text == "[Tool 'test_tool' executed.]"
+        # Second item is text
+        assert isinstance(observation.output[1], TextContent)
+        assert observation.output[1].text == "Here's the image:"
+        # Third item is image
+        assert isinstance(observation.output[2], ImageContent)
+        assert hasattr(observation.output[2], "image_urls")
         assert observation.has_error is False
 
     def test_to_llm_content_success(self):

From 09cee6bc246ac8a777e09dafbd9e996f6093d823 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Fri, 31 Oct 2025 09:13:16 +0100
Subject: [PATCH 17/76] update tool base schema

---
 openhands-sdk/openhands/sdk/tool/schema.py | 29 ++++++++++++++++------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index c56c93c272..d7a2cff7ca 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -208,6 +208,14 @@ class Observation(Schema, ABC):
         default=None, description="Error message if operation failed"
     )
 
+    @property
+    def command(self) -> str | None:
+        """
+        The command that was executed to produce this observation.
+        Subclasses can override to provide the actual command run.
+        """
+        return None
+
     @property
     def has_error(self) -> bool:
         # Support both string and boolean-style error flags across subclasses.
@@ -218,19 +226,24 @@ def has_error(self) -> bool:
     def result_status(self) -> ObservationStatus:
         return ObservationStatus.ERROR if self.has_error else ObservationStatus.SUCCESS
 
-    def _format_error(self) -> list[TextContent | ImageContent]:
-        return [TextContent(text=f"Tool Execution Error: {self.error}")]
+    def _format_error(self) -> TextContent:
+        return TextContent(text=f"Tool Execution Error: {self.error}")
 
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        """Default content formatting prioritizing error then output.
-
-        Subclasses can override to provide richer content (e.g., images, diffs),
-        but should preserve the error-first convention.
         """
+        Default content formatting for converting observation to LLM readable content.
+        Subclasses can override to provide richer content (e.g., images, diffs).
+        Errors can be partial so both output and error are included if present.
+        """
+        llm_content: list[TextContent | ImageContent] = []
+        if self.command:
+            llm_content.append(TextContent(text=f"Executed Command: {self.command}\n"))
         if self.error:
-            return self._format_error()
-        return self.output
+            llm_content.append(self._format_error())
+        if self.output:
+            llm_content.extend(self.output)
+        return llm_content
 
     @property
     def visualize(self) -> Text:

From 157140cca4ef065e00ccd8d5a49a9b94e5f1c297 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 31 Oct 2025 10:05:58 +0000
Subject: [PATCH 18/76] refactor: update all Observation subclasses to use
 standardized base fields

- Updated all Observation subclasses to use standardized output and error fields
- Added command property to all observations with type narrowing where needed
- MCP observations: removed redundant to_llm_content overrides, removed error header duplication
- FinishObservation/ThinkObservation: empty output list as specified
- DelegateObservation: added cmd field and command property with Literal type
- ExecuteBashObservation: added cmd field and command property, use raw_output for command output
- FileEditorObservation: added cmd field with Literal type narrowing
- TaskTrackerObservation: added cmd field with Literal type narrowing
- BrowserObservation: wrapped output strings in TextContent
- GrepObservation/GlobObservation: wrapped output strings in TextContent
- Updated all tests to use new field names (cmd, raw_output) and TextContent wrapping

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/mcp/definition.py |   2 -
 openhands-sdk/openhands/sdk/mcp/tool.py       |   3 -
 .../openhands/sdk/tool/builtins/finish.py     |   5 +-
 .../openhands/sdk/tool/builtins/think.py      |   5 +-
 .../openhands/tools/browser_use/definition.py |   8 +-
 .../openhands/tools/browser_use/impl.py       |  12 +-
 .../openhands/tools/delegate/definition.py    |   7 +
 .../openhands/tools/delegate/impl.py          |   9 +
 .../tools/execute_bash/definition.py          |  15 +-
 .../openhands/tools/execute_bash/impl.py      |  18 +-
 .../execute_bash/terminal/terminal_session.py |  15 +-
 .../openhands/tools/file_editor/definition.py |   8 +-
 .../openhands/tools/file_editor/editor.py     |  39 ++--
 .../openhands/tools/file_editor/impl.py       |   6 +-
 openhands-tools/openhands/tools/glob/impl.py  |   3 +-
 openhands-tools/openhands/tools/grep/impl.py  |   5 +-
 .../tools/task_tracker/definition.py          |  39 +++-
 tests/cross/test_agent_secrets_integration.py |  24 +--
 tests/cross/test_stuck_detector.py            |  12 +-
 .../local/test_confirmation_mode.py           |  20 +-
 tests/sdk/mcp/test_mcp_tool.py                |   1 -
 tests/tools/browser_use/conftest.py           |   8 +-
 .../browser_use/test_browser_executor.py      |   5 +-
 .../browser_use/test_browser_executor_e2e.py  |  27 ++-
 .../browser_use/test_browser_observation.py   |   2 +-
 tests/tools/delegation/test_delegation.py     |   3 +
 .../execute_bash/test_bash_ps1_metadata.py    |   9 +-
 tests/tools/execute_bash/test_bash_reset.py   |  32 +--
 tests/tools/execute_bash/test_bash_session.py | 132 ++++++------
 tests/tools/execute_bash/test_bash_tool.py    |   4 +-
 .../test_bash_tool_auto_detection.py          |   4 +-
 .../test_observation_truncation.py            |  22 +-
 .../execute_bash/test_secrets_masking.py      |  14 +-
 tests/tools/file_editor/conftest.py           |   6 +
 .../file_editor/test_basic_operations.py      | 203 ++++++++++--------
 .../tools/file_editor/test_error_handling.py  |   4 +-
 .../file_editor/test_file_editor_tool.py      |  12 +-
 tests/tools/file_editor/test_memory_usage.py  |   4 +-
 .../test_view_supported_binary_files.py       |  14 +-
 .../tools/file_editor/test_visualize_diff.py  |  16 +-
 .../tools/file_editor/utils/test_encoding.py  |  80 +++++--
 41 files changed, 500 insertions(+), 357 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/mcp/definition.py b/openhands-sdk/openhands/sdk/mcp/definition.py
index 83d98c762a..bb34849773 100644
--- a/openhands-sdk/openhands/sdk/mcp/definition.py
+++ b/openhands-sdk/openhands/sdk/mcp/definition.py
@@ -76,8 +76,6 @@ def from_call_tool_result(
 
         # Build initial message
         initial_message = f"[Tool '{tool_name}' executed.]"
-        if result.isError:
-            initial_message += "\n[An error occurred during execution.]"
 
         # Prepend initial message to content
         content_with_header = [TextContent(text=initial_message)] + converted_content
diff --git a/openhands-sdk/openhands/sdk/mcp/tool.py b/openhands-sdk/openhands/sdk/mcp/tool.py
index cef0e61776..62e3f825f8 100644
--- a/openhands-sdk/openhands/sdk/mcp/tool.py
+++ b/openhands-sdk/openhands/sdk/mcp/tool.py
@@ -12,7 +12,6 @@
 from litellm import ChatCompletionToolParam
 from pydantic import Field, ValidationError
 
-from openhands.sdk.llm import TextContent
 from openhands.sdk.logger import get_logger
 from openhands.sdk.mcp.client import MCPClient
 from openhands.sdk.mcp.definition import MCPToolAction, MCPToolObservation
@@ -68,7 +67,6 @@ async def call_tool(self, action: MCPToolAction) -> MCPToolObservation:
                 error_msg = f"Error calling MCP tool {self.tool_name}: {str(e)}"
                 logger.error(error_msg, exc_info=True)
                 return MCPToolObservation(
-                    content=[TextContent(text=error_msg)],
                     error=error_msg,
                     tool_name=self.tool_name,
                 )
@@ -148,7 +146,6 @@ def __call__(
             error_msg = f"Validation error for MCP tool '{self.name}' args: {e}"
             logger.error(error_msg, exc_info=True)
             return MCPToolObservation(
-                content=[TextContent(text=error_msg)],
                 error=error_msg,
                 tool_name=self.name,
             )
diff --git a/openhands-sdk/openhands/sdk/tool/builtins/finish.py b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
index 369a9ad63a..dd0e44f623 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/finish.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
@@ -3,7 +3,6 @@
 from pydantic import Field
 from rich.text import Text
 
-from openhands.sdk.llm.message import TextContent
 from openhands.sdk.tool.tool import (
     Action,
     Observation,
@@ -54,10 +53,10 @@ class FinishObservation(Observation):
 class FinishExecutor(ToolExecutor):
     def __call__(
         self,
-        action: FinishAction,
+        action: FinishAction,  # noqa: ARG002
         conversation: "BaseConversation | None" = None,  # noqa: ARG002
     ) -> FinishObservation:
-        return FinishObservation(output=[TextContent(text=action.message)])
+        return FinishObservation()
 
 
 FinishTool = ToolDefinition(
diff --git a/openhands-sdk/openhands/sdk/tool/builtins/think.py b/openhands-sdk/openhands/sdk/tool/builtins/think.py
index 4913fa37ba..3d2608fde4 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/think.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/think.py
@@ -3,7 +3,6 @@
 from pydantic import Field
 from rich.text import Text
 
-from openhands.sdk.llm.message import TextContent
 from openhands.sdk.tool.tool import (
     Action,
     Observation,
@@ -69,9 +68,7 @@ def __call__(
         _: ThinkAction,
         conversation: "BaseConversation | None" = None,  # noqa: ARG002
     ) -> ThinkObservation:
-        return ThinkObservation(
-            output=[TextContent(text="Your thought has been logged.")]
-        )
+        return ThinkObservation()
 
 
 ThinkTool = ToolDefinition(
diff --git a/openhands-tools/openhands/tools/browser_use/definition.py b/openhands-tools/openhands/tools/browser_use/definition.py
index ec63ad687c..682580f9bb 100644
--- a/openhands-tools/openhands/tools/browser_use/definition.py
+++ b/openhands-tools/openhands/tools/browser_use/definition.py
@@ -35,10 +35,14 @@ class BrowserObservation(Observation):
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.error:
-            return [TextContent(text=f"Error: {self.error}")]
+            return [TextContent(text=f"Tool Execution Error: {self.error}")]
 
+        # Extract text from output list
+        output_text = "".join(
+            [c.text for c in self.output if isinstance(c, TextContent)]
+        )
         content: list[TextContent | ImageContent] = [
-            TextContent(text=maybe_truncate(self.output, MAX_BROWSER_OUTPUT_SIZE))
+            TextContent(text=maybe_truncate(output_text, MAX_BROWSER_OUTPUT_SIZE))
         ]
 
         if self.screenshot_data:
diff --git a/openhands-tools/openhands/tools/browser_use/impl.py b/openhands-tools/openhands/tools/browser_use/impl.py
index cbe6aebf1b..7e7d8cbb0c 100644
--- a/openhands-tools/openhands/tools/browser_use/impl.py
+++ b/openhands-tools/openhands/tools/browser_use/impl.py
@@ -14,6 +14,7 @@
 
 from openhands.sdk.logger import DEBUG, get_logger
 from openhands.sdk.tool import ToolExecutor
+from openhands.sdk.tool.schema import TextContent
 from openhands.sdk.utils.async_executor import AsyncExecutor
 from openhands.tools.browser_use.definition import BrowserAction, BrowserObservation
 from openhands.tools.browser_use.server import CustomBrowserUseServer
@@ -225,13 +226,13 @@ async def _execute_action(self, action):
                 result = await self.close_tab(action.tab_id)
             else:
                 error_msg = f"Unsupported action type: {type(action)}"
-                return BrowserObservation(output="", error=error_msg)
+                return BrowserObservation(error=error_msg)
 
-            return BrowserObservation(output=result)
+            return BrowserObservation(output=[TextContent(text=result)])
         except Exception as e:
             error_msg = f"Browser operation failed: {str(e)}"
             logger.error(error_msg, exc_info=True)
-            return BrowserObservation(output="", error=error_msg)
+            return BrowserObservation(error=error_msg)
 
     async def _ensure_initialized(self):
         """Ensure browser session is initialized."""
@@ -282,13 +283,14 @@ async def get_state(self, include_screenshot: bool = False):
                 # Return clean JSON + separate screenshot data
                 clean_json = json.dumps(result_data, indent=2)
                 return BrowserObservation(
-                    output=clean_json, screenshot_data=screenshot_data
+                    output=[TextContent(text=clean_json)],
+                    screenshot_data=screenshot_data,
                 )
             except json.JSONDecodeError:
                 # If JSON parsing fails, return as-is
                 pass
 
-        return BrowserObservation(output=result_json)
+        return BrowserObservation(output=[TextContent(text=result_json)])
 
     # Tab Management
     async def list_tabs(self) -> str:
diff --git a/openhands-tools/openhands/tools/delegate/definition.py b/openhands-tools/openhands/tools/delegate/definition.py
index 14eb2f0b14..f1d84459e2 100644
--- a/openhands-tools/openhands/tools/delegate/definition.py
+++ b/openhands-tools/openhands/tools/delegate/definition.py
@@ -43,6 +43,13 @@ class DelegateAction(Action):
 class DelegateObservation(Observation):
     """Observation from delegation operations."""
 
+    cmd: CommandLiteral = Field(description="The command that was executed")
+
+    @property
+    def command(self) -> CommandLiteral:
+        """Return the command that was executed, type-narrowed to CommandLiteral."""
+        return self.cmd
+
 
 TOOL_DESCRIPTION = """Delegation tool for spawning sub-agents and delegating tasks to them.
 
diff --git a/openhands-tools/openhands/tools/delegate/impl.py b/openhands-tools/openhands/tools/delegate/impl.py
index a2f22c6947..e169fe994c 100644
--- a/openhands-tools/openhands/tools/delegate/impl.py
+++ b/openhands-tools/openhands/tools/delegate/impl.py
@@ -60,6 +60,7 @@ def __call__(  # type: ignore[override]
             return self._delegate_tasks(action)
         else:
             return DelegateObservation(
+                cmd=action.command,
                 error=f"Unsupported command: {action.command}. "
                 "Available commands: spawn, delegate",
             )
@@ -76,11 +77,13 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
         """
         if not action.ids:
             return DelegateObservation(
+                cmd=action.command,
                 error="At least one ID is required for spawn action",
             )
 
         if len(self._sub_agents) + len(action.ids) > self._max_children:
             return DelegateObservation(
+                cmd=action.command,
                 error=(
                     f"Cannot spawn {len(action.ids)} agents. "
                     f"Already have {len(self._sub_agents)} agents, "
@@ -115,12 +118,14 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
             agent_list = ", ".join(action.ids)
             message = f"Successfully spawned {len(action.ids)} sub-agents: {agent_list}"
             return DelegateObservation(
+                cmd=action.command,
                 output=[TextContent(text=message)],
             )
 
         except Exception as e:
             logger.error(f"Error: failed to spawn agents: {e}", exc_info=True)
             return DelegateObservation(
+                cmd=action.command,
                 error=f"failed to spawn agents: {str(e)}",
             )
 
@@ -137,6 +142,7 @@ def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
         """
         if not action.tasks:
             return DelegateObservation(
+                cmd=action.command,
                 error="at least one task is required for delegate action",
             )
 
@@ -144,6 +150,7 @@ def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
         missing_agents = set(action.tasks.keys()) - set(self._sub_agents.keys())
         if missing_agents:
             return DelegateObservation(
+                cmd=action.command,
                 error=(
                     f"sub-agents not found: {', '.join(missing_agents)}. "
                     f"Available agents: {', '.join(self._sub_agents.keys())}"
@@ -217,11 +224,13 @@ def run_task(agent_id: str, conversation: LocalConversation, task: str):
                 output_text += f"\n\nResults:\n{results_text}"
 
             return DelegateObservation(
+                cmd=action.command,
                 output=[TextContent(text=output_text)],
             )
 
         except Exception as e:
             logger.error(f"Failed to delegate tasks: {e}", exc_info=True)
             return DelegateObservation(
+                cmd=action.command,
                 error=f"failed to delegate tasks: {str(e)}",
             )
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index 316e6122d3..c765a6fb7a 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -79,7 +79,9 @@ def visualize(self) -> Text:
 class ExecuteBashObservation(Observation):
     """A ToolResult that can be rendered as a CLI output."""
 
-    output: str = ""  # type: ignore[assignment]
+    # Internal string output field (raw command output)
+    raw_output: str = Field(default="", description="Raw command output string")
+    cmd: str | None = Field(default=None, description="The command that was executed")
     exit_code: int | None = Field(
         default=None,
         description="The exit code of the command. -1 indicates the process hit the soft timeout and is not yet finished.",  # noqa
@@ -92,6 +94,11 @@ class ExecuteBashObservation(Observation):
         description="Additional metadata captured from PS1 after command execution.",
     )
 
+    @property
+    def command(self) -> str | None:
+        """Return the command that was executed."""
+        return self.cmd
+
     @property
     def command_id(self) -> int | None:
         """Get the command ID from metadata."""
@@ -103,7 +110,7 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
             # When there's an error, format it appropriately
             return [TextContent(text=f"Tool Execution Error: {self.error}")]
 
-        ret = f"{self.metadata.prefix}{self.output}{self.metadata.suffix}"
+        ret = f"{self.metadata.prefix}{self.raw_output}{self.metadata.suffix}"
         if self.metadata.working_dir:
             ret += f"\n[Current working directory: {self.metadata.working_dir}]"
         if self.metadata.py_interpreter_path:
@@ -123,9 +130,9 @@ def visualize(self) -> Text:
             content.append("Command execution error\n", style="red")
 
         # Add command output with proper styling
-        if self.output:
+        if self.raw_output:
             # Style the output based on content
-            output_lines = self.output.split("\n")
+            output_lines = self.raw_output.split("\n")
             for line in output_lines:
                 if line.strip():
                     # Color error-like lines differently
diff --git a/openhands-tools/openhands/tools/execute_bash/impl.py b/openhands-tools/openhands/tools/execute_bash/impl.py
index a4dd0c652d..621a8f7e9c 100644
--- a/openhands-tools/openhands/tools/execute_bash/impl.py
+++ b/openhands-tools/openhands/tools/execute_bash/impl.py
@@ -112,11 +112,11 @@ def reset(self) -> ExecuteBashObservation:
         )
 
         return ExecuteBashObservation(
-            output=(
+            raw_output=(
                 "Terminal session has been reset. All previous environment "
                 "variables and session state have been cleared."
             ),
-            command="[RESET]",
+            cmd="[RESET]",
             exit_code=0,
         )
 
@@ -143,10 +143,10 @@ def __call__(
                 command_result = self.session.execute(command_action)
                 observation = command_result.model_copy(
                     update={
-                        "output": (
-                            reset_result.output + "\n\n" + command_result.output
+                        "raw_output": (
+                            reset_result.raw_output + "\n\n" + command_result.raw_output
                         ),
-                        "command": f"[RESET] {action.command}",
+                        "cmd": f"[RESET] {action.command}",
                     }
                 )
             else:
@@ -158,15 +158,15 @@ def __call__(
             observation = self.session.execute(action)
 
         # Apply automatic secrets masking
-        if observation.output and conversation is not None:
+        if observation.raw_output and conversation is not None:
             try:
                 secret_registry = conversation.state.secret_registry
                 masked_output = secret_registry.mask_secrets_in_output(
-                    observation.output
+                    observation.raw_output
                 )
                 if masked_output:
-                    data = observation.model_dump(exclude={"output"})
-                    return ExecuteBashObservation(**data, output=masked_output)
+                    data = observation.model_dump(exclude={"raw_output"})
+                    return ExecuteBashObservation(**data, raw_output=masked_output)
             except Exception:
                 pass
 
diff --git a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
index 6e8c9d7bef..995026e929 100644
--- a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
+++ b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
@@ -187,7 +187,8 @@ def _handle_completed_command(
         self.prev_output = ""  # Reset previous command output
         self._ready_for_next_command()
         return ExecuteBashObservation(
-            output=command_output,
+            cmd=command,
+            raw_output=command_output,
             metadata=metadata,
         )
 
@@ -220,7 +221,8 @@ def _handle_nochange_timeout_command(
             continue_prefix="[Below is the output of the previous command.]\n",
         )
         return ExecuteBashObservation(
-            output=command_output,
+            cmd=command,
+            raw_output=command_output,
             metadata=metadata,
         )
 
@@ -254,7 +256,8 @@ def _handle_hard_timeout_command(
             continue_prefix="[Below is the output of the previous command.]\n",
         )
         return ExecuteBashObservation(
-            output=command_output,
+            cmd=command,
+            raw_output=command_output,
             metadata=metadata,
         )
 
@@ -310,10 +313,12 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
         }:
             if command == "":
                 return ExecuteBashObservation(
+                    cmd=command,
                     error="No previous running command to retrieve logs from.",
                 )
             if is_input:
                 return ExecuteBashObservation(
+                    cmd=command,
                     error="No previous running command to interact with.",
                 )
 
@@ -324,6 +329,7 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
                 f"({i + 1}) {cmd}" for i, cmd in enumerate(splited_commands)
             )
             return ExecuteBashObservation(
+                cmd=command,
                 error=(
                     "Cannot execute multiple commands at once.\n"
                     "Please run each command separately OR chain them into a single "
@@ -380,7 +386,8 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
                 continue_prefix="[Below is the output of the previous command.]\n",
             )
             obs = ExecuteBashObservation(
-                output=command_output,
+                cmd=command,
+                raw_output=command_output,
                 metadata=metadata,
             )
             logger.debug(f"RETURNING OBSERVATION (previous-command): {obs}")
diff --git a/openhands-tools/openhands/tools/file_editor/definition.py b/openhands-tools/openhands/tools/file_editor/definition.py
index 9331c2e317..b8db6bf4e1 100644
--- a/openhands-tools/openhands/tools/file_editor/definition.py
+++ b/openhands-tools/openhands/tools/file_editor/definition.py
@@ -65,12 +65,18 @@ class FileEditorAction(Action):
 class FileEditorObservation(Observation):
     """A ToolResult that can be rendered as a CLI output."""
 
-    command: CommandLiteral = Field(  # type: ignore[assignment]
+    cmd: CommandLiteral = Field(
         description=(
             "The command that was run: `view`, `create`, `str_replace`, "
             "`insert`, or `undo_edit`."
         )
     )
+
+    @property
+    def command(self) -> CommandLiteral:
+        """Return the command that was executed, type-narrowed to CommandLiteral."""
+        return self.cmd
+
     path: str | None = Field(default=None, description="The file path that was edited.")
     prev_exist: bool = Field(
         default=True,
diff --git a/openhands-tools/openhands/tools/file_editor/editor.py b/openhands-tools/openhands/tools/file_editor/editor.py
index 509bba6323..f7517bc360 100644
--- a/openhands-tools/openhands/tools/file_editor/editor.py
+++ b/openhands-tools/openhands/tools/file_editor/editor.py
@@ -8,6 +8,7 @@
 from binaryornot.check import is_binary
 
 from openhands.sdk.logger import get_logger
+from openhands.sdk.tool.schema import TextContent
 from openhands.sdk.utils.truncate import maybe_truncate
 from openhands.tools.file_editor.definition import (
     CommandLiteral,
@@ -109,11 +110,11 @@ def __call__(
             self.write_file(_path, file_text)
             self._history_manager.add_history(_path, file_text)
             return FileEditorObservation(
-                command=command,
+                cmd=command,
                 path=str(_path),
                 new_content=file_text,
                 prev_exist=False,
-                output=f"File created successfully at: {_path}",
+                output=[TextContent(text=f"File created successfully at: {_path}")],
             )
         elif command == "str_replace":
             if old_str is None:
@@ -254,8 +255,8 @@ def str_replace(
             "file again if necessary."
         )
         return FileEditorObservation(
-            command="str_replace",
-            output=success_message,
+            cmd="str_replace",
+            output=[TextContent(text=success_message)],
             prev_exist=True,
             path=str(path),
             old_content=file_content,
@@ -314,8 +315,8 @@ def view(
                     )
                 stdout = "\n".join(msg)
             return FileEditorObservation(
-                command="view",
-                output=stdout,
+                cmd="view",
+                output=[TextContent(text=stdout)],
                 error=stderr,
                 path=str(path),
                 prev_exist=True,
@@ -331,8 +332,8 @@ def view(
             output = self._make_output(file_content, str(path), start_line)
 
             return FileEditorObservation(
-                command="view",
-                output=output,
+                cmd="view",
+                output=[TextContent(text=output)],
                 path=str(path),
                 prev_exist=True,
             )
@@ -384,9 +385,9 @@ def view(
             output = f"NOTE: {warning_message}\n{output}"
 
         return FileEditorObservation(
-            command="view",
+            cmd="view",
             path=str(path),
-            output=output,
+            output=[TextContent(text=output)],
             prev_exist=True,
         )
 
@@ -497,8 +498,8 @@ def insert(
             "indentation, no duplicate lines, etc). Edit the file again if necessary."
         )
         return FileEditorObservation(
-            command="insert",
-            output=success_message,
+            cmd="insert",
+            output=[TextContent(text=success_message)],
             prev_exist=True,
             path=str(path),
             old_content=file_text,
@@ -566,11 +567,15 @@ def undo_edit(self, path: Path) -> FileEditorObservation:
         self.write_file(path, old_text)
 
         return FileEditorObservation(
-            command="undo_edit",
-            output=(
-                f"Last edit to {path} undone successfully. "
-                f"{self._make_output(old_text, str(path))}"
-            ),
+            cmd="undo_edit",
+            output=[
+                TextContent(
+                    text=(
+                        f"Last edit to {path} undone successfully. "
+                        f"{self._make_output(old_text, str(path))}"
+                    )
+                )
+            ],
             path=str(path),
             prev_exist=True,
             old_content=current_text,
diff --git a/openhands-tools/openhands/tools/file_editor/impl.py b/openhands-tools/openhands/tools/file_editor/impl.py
index afc1f7906d..309ae6e7f5 100644
--- a/openhands-tools/openhands/tools/file_editor/impl.py
+++ b/openhands-tools/openhands/tools/file_editor/impl.py
@@ -44,7 +44,7 @@ def __call__(
             action_path = Path(action.path).resolve()
             if action_path not in self.allowed_edits_files:
                 return FileEditorObservation(
-                    command=action.command,
+                    cmd=action.command,
                     error=f"Operation '{action.command}' is not allowed "
                     f"on file '{action_path}'. "
                     f"Only the following files can be edited: "
@@ -63,7 +63,7 @@ def __call__(
                 insert_line=action.insert_line,
             )
         except ToolError as e:
-            result = FileEditorObservation(command=action.command, error=e.message)
+            result = FileEditorObservation(cmd=action.command, error=e.message)
         assert result is not None, "file_editor should always return a result"
         return result
 
@@ -95,6 +95,6 @@ def file_editor(
             insert_line=insert_line,
         )
     except ToolError as e:
-        result = FileEditorObservation(command=command, error=e.message)
+        result = FileEditorObservation(cmd=command, error=e.message)
     assert result is not None, "file_editor should always return a result"
     return result
diff --git a/openhands-tools/openhands/tools/glob/impl.py b/openhands-tools/openhands/tools/glob/impl.py
index 29f6cbbd1d..5ebcf21263 100644
--- a/openhands-tools/openhands/tools/glob/impl.py
+++ b/openhands-tools/openhands/tools/glob/impl.py
@@ -8,6 +8,7 @@
 from typing import TYPE_CHECKING
 
 from openhands.sdk.tool import ToolExecutor
+from openhands.sdk.tool.schema import TextContent
 
 
 if TYPE_CHECKING:
@@ -102,7 +103,7 @@ def __call__(
                 pattern=original_pattern,
                 search_path=str(search_path),
                 truncated=truncated,
-                output=output,
+                output=[TextContent(text=output)],
             )
 
         except Exception as e:
diff --git a/openhands-tools/openhands/tools/grep/impl.py b/openhands-tools/openhands/tools/grep/impl.py
index 4f22e5f3c2..c38ca6a910 100644
--- a/openhands-tools/openhands/tools/grep/impl.py
+++ b/openhands-tools/openhands/tools/grep/impl.py
@@ -6,6 +6,7 @@
 from typing import TYPE_CHECKING
 
 from openhands.sdk.tool import ToolExecutor
+from openhands.sdk.tool.schema import TextContent
 
 
 if TYPE_CHECKING:
@@ -180,7 +181,7 @@ def _execute_with_ripgrep(
             search_path=str(search_path),
             include_pattern=action.include,
             truncated=truncated,
-            output=output,
+            output=[TextContent(text=output)],
         )
 
     def _execute_with_grep(
@@ -243,5 +244,5 @@ def _execute_with_grep(
             search_path=str(search_path),
             include_pattern=action.include,
             truncated=truncated,
-            output=output,
+            output=[TextContent(text=output)],
         )
diff --git a/openhands-tools/openhands/tools/task_tracker/definition.py b/openhands-tools/openhands/tools/task_tracker/definition.py
index d408b3a092..a3c0fba5fc 100644
--- a/openhands-tools/openhands/tools/task_tracker/definition.py
+++ b/openhands-tools/openhands/tools/task_tracker/definition.py
@@ -11,6 +11,7 @@
     from openhands.sdk.conversation.state import ConversationState
 from rich.text import Text
 
+from openhands.sdk.llm import TextContent
 from openhands.sdk.logger import get_logger
 from openhands.sdk.tool import (
     Action,
@@ -69,10 +70,18 @@ def visualize(self) -> Text:
 class TaskTrackerObservation(Observation):
     """This data class represents the result of a task tracking operation."""
 
+    cmd: Literal["view", "plan"] = Field(
+        description='The command that was executed: "view" or "plan".'
+    )
     task_list: list[TaskItem] = Field(
         default_factory=list, description="The current task list"
     )
 
+    @property
+    def command(self) -> Literal["view", "plan"]:
+        """Return the command that was executed, type-narrowed to Literal."""
+        return self.cmd
+
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation with task list formatting."""
@@ -166,23 +175,37 @@ def __call__(
             if self.save_dir:
                 self._save_tasks()
             return TaskTrackerObservation(
-                output=(
-                    f"Task list has been updated with {len(self._task_list)} item(s)."
-                ),
-                command=action.command,
+                output=[
+                    TextContent(
+                        text=(
+                            f"Task list has been updated with "
+                            f"{len(self._task_list)} item(s)."
+                        )
+                    )
+                ],
+                cmd=action.command,
                 task_list=self._task_list,
             )
         elif action.command == "view":
             # Return the current task list
             if not self._task_list:
                 return TaskTrackerObservation(
-                    output='No task list found. Use the "plan" command to create one.',
-                    command=action.command,
+                    output=[
+                        TextContent(
+                            text=(
+                                "No task list found. Use the "
+                                '"plan" command to create one.'
+                            )
+                        )
+                    ],
+                    cmd=action.command,
                     task_list=[],
                 )
             output = self._format_task_list(self._task_list)
             return TaskTrackerObservation(
-                output=output, command=action.command, task_list=self._task_list
+                output=[TextContent(text=output)],
+                cmd=action.command,
+                task_list=self._task_list,
             )
         else:
             return TaskTrackerObservation(
@@ -190,7 +213,7 @@ def __call__(
                     f"Unknown command: {action.command}. "
                     'Supported commands are "view" and "plan".'
                 ),
-                command=action.command,
+                cmd=action.command,
                 task_list=[],
             )
 
diff --git a/tests/cross/test_agent_secrets_integration.py b/tests/cross/test_agent_secrets_integration.py
index e67742010f..9d6df9deb9 100644
--- a/tests/cross/test_agent_secrets_integration.py
+++ b/tests/cross/test_agent_secrets_integration.py
@@ -234,13 +234,13 @@ def get_value(self):
     try:
         action = ExecuteBashAction(command="echo $API_KEY")
         result = bash_executor(action, conversation=conversation)
-        assert "test-api-key" not in result.output
-        assert "<secret-hidden>" in result.output
+        assert "test-api-key" not in result.raw_output
+        assert "<secret-hidden>" in result.raw_output
 
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "dynamic-secret" not in result.output
-        assert "<secret-hidden>" in result.output
+        assert "dynamic-secret" not in result.raw_output
+        assert "<secret-hidden>" in result.raw_output
 
     finally:
         bash_executor.close()
@@ -265,13 +265,13 @@ def get_value(self):
     try:
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "changing-secret" not in result.output
-        assert "<secret-hidden>" in result.output
+        assert "changing-secret" not in result.raw_output
+        assert "<secret-hidden>" in result.raw_output
 
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "changing-secret" not in result.output
-        assert "<secret-hidden>" in result.output
+        assert "changing-secret" not in result.raw_output
+        assert "<secret-hidden>" in result.raw_output
 
     finally:
         bash_executor.close()
@@ -303,13 +303,13 @@ def get_value(self):
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
         print(result)
-        assert "changing-secret" not in result.output
-        assert "<secret-hidden>" in result.output
+        assert "changing-secret" not in result.raw_output
+        assert "<secret-hidden>" in result.raw_output
 
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "changing-secret" not in result.output
-        assert "<secret-hidden>" in result.output
+        assert "changing-secret" not in result.raw_output
+        assert "<secret-hidden>" in result.raw_output
         assert dynamic_secret.raised_on_second
 
     finally:
diff --git a/tests/cross/test_stuck_detector.py b/tests/cross/test_stuck_detector.py
index 06ae4ae0fd..a355f47939 100644
--- a/tests/cross/test_stuck_detector.py
+++ b/tests/cross/test_stuck_detector.py
@@ -59,7 +59,7 @@ def test_history_too_short():
     observation = ObservationEvent(
         source="environment",
         observation=ExecuteBashObservation(
-            output="file1.txt\nfile2.txt", command="ls", exit_code=0
+            raw_output="file1.txt\nfile2.txt", cmd="ls", exit_code=0
         ),
         action_id=action.id,
         tool_name="execute_bash",
@@ -108,7 +108,7 @@ def test_repeating_action_observation_not_stuck_less_than_4_repeats():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                output="file1.txt\nfile2.txt", command="ls", exit_code=0
+                raw_output="file1.txt\nfile2.txt", cmd="ls", exit_code=0
             ),
             action_id=action.id,
             tool_name="execute_bash",
@@ -157,7 +157,7 @@ def test_repeating_action_observation_stuck():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                output="file1.txt\nfile2.txt", command="ls", exit_code=0
+                raw_output="file1.txt\nfile2.txt", cmd="ls", exit_code=0
             ),
             action_id=action.id,
             tool_name="execute_bash",
@@ -298,7 +298,7 @@ def test_not_stuck_with_different_actions():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                output=f"output from {cmd}", command=cmd, exit_code=0
+                raw_output=f"output from {cmd}", cmd=cmd, exit_code=0
             ),
             action_id=action.id,
             tool_name="execute_bash",
@@ -347,7 +347,7 @@ def test_reset_after_user_message():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                output="file1.txt\nfile2.txt", command="ls", exit_code=0
+                raw_output="file1.txt\nfile2.txt", cmd="ls", exit_code=0
             ),
             action_id=action.id,
             tool_name="execute_bash",
@@ -390,7 +390,7 @@ def test_reset_after_user_message():
     observation = ObservationEvent(
         source="environment",
         observation=ExecuteBashObservation(
-            output="/home/user", command="pwd", exit_code=0
+            raw_output="/home/user", cmd="pwd", exit_code=0
         ),
         action_id=action.id,
         tool_name="execute_bash",
diff --git a/tests/sdk/conversation/local/test_confirmation_mode.py b/tests/sdk/conversation/local/test_confirmation_mode.py
index 04913ced0c..30d41148f4 100644
--- a/tests/sdk/conversation/local/test_confirmation_mode.py
+++ b/tests/sdk/conversation/local/test_confirmation_mode.py
@@ -525,10 +525,8 @@ def test_single_finish_action_skips_confirmation_entirely(self):
             e for e in self.conversation.state.events if isinstance(e, ObservationEvent)
         ]
         assert len(obs_events) == 1
-        assert len(obs_events[0].observation.output) == 1
-        output_block = obs_events[0].observation.output[0]
-        assert isinstance(output_block, TextContent)
-        assert output_block.text == "Task completed successfully!"
+        # FinishObservation should have empty output per base behavior
+        assert len(obs_events[0].observation.output) == 0
 
     def test_think_and_finish_action_skips_confirmation_entirely(self):
         """First step: ThinkAction (skips confirmation). Second step: FinishAction."""
@@ -567,19 +565,13 @@ def test_think_and_finish_action_skips_confirmation_entirely(self):
         ]
         assert len(obs_events) == 2
 
-        # 1) ThinkAction observation
+        # 1) ThinkAction observation - should have empty output per base behavior
         assert hasattr(obs_events[0].observation, "output")
-        assert len(obs_events[0].observation.output) == 1
-        think_output = obs_events[0].observation.output[0]
-        assert isinstance(think_output, TextContent)
-        assert think_output.text == "Your thought has been logged."
+        assert len(obs_events[0].observation.output) == 0
 
-        # 2) FinishAction observation
+        # 2) FinishAction observation - should have empty output per base behavior
         assert hasattr(obs_events[1].observation, "output")
-        assert len(obs_events[1].observation.output) == 1
-        finish_output = obs_events[1].observation.output[0]
-        assert isinstance(finish_output, TextContent)
-        assert finish_output.text == "Analysis complete"
+        assert len(obs_events[1].observation.output) == 0
 
     def test_pause_during_confirmation_preserves_waiting_status(self):
         """Test that pausing during WAITING_FOR_CONFIRMATION preserves the status.
diff --git a/tests/sdk/mcp/test_mcp_tool.py b/tests/sdk/mcp/test_mcp_tool.py
index bfff2d6193..cad27c5969 100644
--- a/tests/sdk/mcp/test_mcp_tool.py
+++ b/tests/sdk/mcp/test_mcp_tool.py
@@ -59,7 +59,6 @@ def test_from_call_tool_result_error(self):
         assert observation.tool_name == "test_tool"
         assert observation.error is not None
         assert "[Tool 'test_tool' executed.]" in observation.error
-        assert "[An error occurred during execution.]" in observation.error
         assert "Operation failed" in observation.error
         assert len(observation.output) == 0
         assert observation.has_error is True
diff --git a/tests/tools/browser_use/conftest.py b/tests/tools/browser_use/conftest.py
index 0cce6b37dd..422cc8dc35 100644
--- a/tests/tools/browser_use/conftest.py
+++ b/tests/tools/browser_use/conftest.py
@@ -4,6 +4,7 @@
 
 import pytest
 
+from openhands.sdk.tool.schema import TextContent
 from openhands.tools.browser_use.definition import BrowserObservation
 from openhands.tools.browser_use.impl import BrowserToolExecutor
 
@@ -31,7 +32,7 @@ def create_mock_browser_response(
 ):
     """Helper to create mock browser responses."""
     return BrowserObservation(
-        output=output, error=error, screenshot_data=screenshot_data
+        output=[TextContent(text=output)], error=error, screenshot_data=screenshot_data
     )
 
 
@@ -42,7 +43,10 @@ def assert_browser_observation_success(
     assert isinstance(observation, BrowserObservation)
     assert observation.error is None
     if expected_output:
-        assert expected_output in observation.output
+        output_text = "".join(
+            [c.text for c in observation.output if isinstance(c, TextContent)]
+        )
+        assert expected_output in output_text
 
 
 def assert_browser_observation_error(
diff --git a/tests/tools/browser_use/test_browser_executor.py b/tests/tools/browser_use/test_browser_executor.py
index d200f1e727..161a403554 100644
--- a/tests/tools/browser_use/test_browser_executor.py
+++ b/tests/tools/browser_use/test_browser_executor.py
@@ -2,6 +2,7 @@
 
 from unittest.mock import AsyncMock, patch
 
+from openhands.sdk.tool.schema import TextContent
 from openhands.tools.browser_use.definition import (
     BrowserClickAction,
     BrowserGetStateAction,
@@ -73,7 +74,7 @@ async def test_browser_executor_action_routing_get_state(
 ):
     """Test that get_state actions are routed correctly and return directly."""
     expected_observation = BrowserObservation(
-        output="State retrieved", screenshot_data="base64data"
+        output=[TextContent(text="State retrieved")], screenshot_data="base64data"
     )
     mock_get_state.return_value = expected_observation
 
@@ -113,7 +114,7 @@ def test_browser_executor_async_execution(mock_browser_executor):
     with patch.object(
         mock_browser_executor, "_execute_action", new_callable=AsyncMock
     ) as mock_execute:
-        expected_result = BrowserObservation(output="Test result")
+        expected_result = BrowserObservation(output=[TextContent(text="Test result")])
         mock_execute.return_value = expected_result
 
         action = BrowserNavigateAction(url="https://example.com")
diff --git a/tests/tools/browser_use/test_browser_executor_e2e.py b/tests/tools/browser_use/test_browser_executor_e2e.py
index 1c2cd9e4dd..25f12a90b9 100644
--- a/tests/tools/browser_use/test_browser_executor_e2e.py
+++ b/tests/tools/browser_use/test_browser_executor_e2e.py
@@ -6,6 +6,7 @@
 
 import pytest
 
+from openhands.sdk.tool.schema import TextContent
 from openhands.tools.browser_use.definition import (
     BrowserClickAction,
     BrowserCloseTabAction,
@@ -22,6 +23,11 @@
 from openhands.tools.browser_use.impl import BrowserToolExecutor
 
 
+def get_output_text(observation: BrowserObservation) -> str:
+    """Join the text of every TextContent item in the observation's output."""
+    return "".join(b.text for b in observation.output if isinstance(b, TextContent))
+
+
 # Test HTML content for browser operations
 TEST_HTML = """<!DOCTYPE html>
 <html lang="en">
@@ -172,10 +178,8 @@ def test_navigate_action(
 
         assert isinstance(result, BrowserObservation)
         assert result.error is None
-        assert (
-            "successfully" in result.output.lower()
-            or "navigated" in result.output.lower()
-        )
+        output_text = get_output_text(result).lower()
+        assert "successfully" in output_text or "navigated" in output_text
 
     def test_get_state_action(
         self, browser_executor: BrowserToolExecutor, test_server: str
@@ -191,7 +195,7 @@ def test_get_state_action(
 
         assert isinstance(result, BrowserObservation)
         assert result.error is None
-        assert "Browser Test Page" in result.output
+        assert "Browser Test Page" in get_output_text(result)
 
     def test_get_state_with_screenshot(
         self, browser_executor: BrowserToolExecutor, test_server: str
@@ -224,7 +228,7 @@ def test_click_action(
 
         # Parse the state to find button index
         # The test button should be indexed in the interactive elements
-        assert "Click Me" in state_result.output
+        assert "Click Me" in get_output_text(state_result)
 
         # Try to click the first interactive element (likely the button)
         click_action = BrowserClickAction(index=0)
@@ -244,7 +248,8 @@ def test_type_action(self, browser_executor: BrowserToolExecutor, test_server: s
         state_result = browser_executor(get_state_action)
 
         # Look for input field in the state
-        assert "test-input" in state_result.output or "Type here" in state_result.output
+        state_output = get_output_text(state_result)
+        assert "test-input" in state_output or "Type here" in state_output
 
         # Find the input field index and type into it
         # This assumes the input field is one of the interactive elements
@@ -290,7 +295,7 @@ def test_get_content_action(
 
         assert isinstance(result, BrowserObservation)
         assert result.error is None
-        assert "Browser Test Page" in result.output
+        assert "Browser Test Page" in get_output_text(result)
 
         # Get content with links
         content_with_links_action = BrowserGetContentAction(
@@ -300,7 +305,7 @@ def test_get_content_action(
 
         assert isinstance(result, BrowserObservation)
         assert result.error is None
-        assert "Browser Test Page" in result.output
+        assert "Browser Test Page" in get_output_text(result)
 
     def test_navigate_new_tab(
         self, browser_executor: BrowserToolExecutor, test_server: str
@@ -328,7 +333,7 @@ def test_list_tabs_action(
         assert isinstance(result, BrowserObservation)
         assert result.error is None
         # Should contain tab information
-        assert len(result.output) > 0
+        assert len(get_output_text(result)) > 0
 
     def test_go_back_action(
         self, browser_executor: BrowserToolExecutor, test_server: str
@@ -370,7 +375,7 @@ def test_switch_tab_action(
 
         # Parse tab information to get a tab ID
         # This is a simplified approach - in practice you'd parse the JSON response
-        if "tab" in tabs_result.output.lower():
+        if "tab" in get_output_text(tabs_result).lower():
             # Try to switch to first tab (assuming tab ID format)
             switch_action = BrowserSwitchTabAction(tab_id="0")
             result = browser_executor(switch_action)
diff --git a/tests/tools/browser_use/test_browser_observation.py b/tests/tools/browser_use/test_browser_observation.py
index 35bd1955db..e4bf1174e0 100644
--- a/tests/tools/browser_use/test_browser_observation.py
+++ b/tests/tools/browser_use/test_browser_observation.py
@@ -72,7 +72,7 @@ def test_browser_observation_to_llm_content_with_error():
 
     assert len(agent_obs) == 1
     assert isinstance(agent_obs[0], TextContent)
-    assert agent_obs[0].text == "Error: Test error"
+    assert agent_obs[0].text == "Tool Execution Error: Test error"
 
 
 def test_browser_observation_output_truncation():
diff --git a/tests/tools/delegation/test_delegation.py b/tests/tools/delegation/test_delegation.py
index 2ed0f9c57c..d94b5307df 100644
--- a/tests/tools/delegation/test_delegation.py
+++ b/tests/tools/delegation/test_delegation.py
@@ -67,6 +67,7 @@ def test_delegate_observation_creation():
     """Test creating DelegateObservation instances."""
     # Test spawn observation
     spawn_observation = DelegateObservation(
+        cmd="spawn",
         output=[TextContent(text="spawn: Sub-agents created successfully")],
     )
     assert len(spawn_observation.output) == 1
@@ -76,6 +77,7 @@ def test_delegate_observation_creation():
 
     # Test delegate observation
     delegate_observation = DelegateObservation(
+        cmd="delegate",
         output=[
             TextContent(
                 text=(
@@ -112,6 +114,7 @@ def test_delegate_executor_delegate():
 
     with patch.object(executor, "_delegate_tasks") as mock_delegate:
         mock_observation = DelegateObservation(
+            cmd="delegate",
             output=[
                 TextContent(
                     text=(
diff --git a/tests/tools/execute_bash/test_bash_ps1_metadata.py b/tests/tools/execute_bash/test_bash_ps1_metadata.py
index c5477f605a..bb4ef60825 100644
--- a/tests/tools/execute_bash/test_bash_ps1_metadata.py
+++ b/tests/tools/execute_bash/test_bash_ps1_metadata.py
@@ -273,7 +273,7 @@ def test_cmd_output_observation_properties():
     # Test with successful command
     metadata = CmdOutputMetadata(exit_code=0, pid=123)
     obs = ExecuteBashObservation(
-        command="ls", output="file1\nfile2", exit_code=0, metadata=metadata
+        cmd="ls", raw_output="file1\nfile2", exit_code=0, metadata=metadata
     )
     assert obs.command_id == 123
     assert obs.exit_code == 0
@@ -288,8 +288,8 @@ def test_cmd_output_observation_properties():
     # Test with failed command
     metadata = CmdOutputMetadata(exit_code=1, pid=456)
     obs = ExecuteBashObservation(
-        command="invalid",
-        output="error",
+        cmd="invalid",
+        raw_output="",
         exit_code=1,
         error="Command failed",
         metadata=metadata,
@@ -299,7 +299,8 @@ def test_cmd_output_observation_properties():
     assert obs.has_error
     assert len(obs.to_llm_content) == 1
     assert isinstance(obs.to_llm_content[0], TextContent)
-    assert "exit code 1" in obs.to_llm_content[0].text
+    # When there's an error, only error message is returned
+    assert "Tool Execution Error: Command failed" == obs.to_llm_content[0].text
     assert obs.has_error
 
 
diff --git a/tests/tools/execute_bash/test_bash_reset.py b/tests/tools/execute_bash/test_bash_reset.py
index e19e7d2286..31ccc6b26d 100644
--- a/tests/tools/execute_bash/test_bash_reset.py
+++ b/tests/tools/execute_bash/test_bash_reset.py
@@ -43,13 +43,13 @@ def test_bash_reset_basic():
         action = ExecuteBashAction(command="echo $TEST_VAR")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert "hello" in result.output
+        assert "hello" in result.raw_output
 
         # Reset the terminal
         reset_action = ExecuteBashAction(command="", reset=True)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in reset_result.output
+        assert "Terminal session has been reset" in reset_result.raw_output
         assert reset_result.command == "[RESET]"
 
         # Verify the variable is no longer set after reset
@@ -57,7 +57,7 @@ def test_bash_reset_basic():
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
         # The variable should be empty after reset
-        assert result.output.strip() == ""
+        assert result.raw_output.strip() == ""
 
 
 def test_bash_reset_with_command():
@@ -78,15 +78,15 @@ def test_bash_reset_with_command():
         )
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in reset_result.output
-        assert "hello from fresh terminal" in reset_result.output
+        assert "Terminal session has been reset" in reset_result.raw_output
+        assert "hello from fresh terminal" in reset_result.raw_output
         assert reset_result.command == "[RESET] echo 'hello from fresh terminal'"
 
         # Verify the variable is no longer set (confirming reset worked)
         action = ExecuteBashAction(command="echo $TEST_VAR")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert result.output.strip() == ""
+        assert result.raw_output.strip() == ""
 
 
 def test_bash_reset_working_directory():
@@ -99,7 +99,7 @@ def test_bash_reset_working_directory():
         action = ExecuteBashAction(command="pwd")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert temp_dir in result.output
+        assert temp_dir in result.raw_output
 
         # Change directory
         action = ExecuteBashAction(command="cd /home")
@@ -110,19 +110,19 @@ def test_bash_reset_working_directory():
         action = ExecuteBashAction(command="pwd")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert "/home" in result.output
+        assert "/home" in result.raw_output
 
         # Reset the terminal
         reset_action = ExecuteBashAction(command="", reset=True)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in reset_result.output
+        assert "Terminal session has been reset" in reset_result.raw_output
 
         # Verify working directory is back to original
         action = ExecuteBashAction(command="pwd")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert temp_dir in result.output
+        assert temp_dir in result.raw_output
 
 
 def test_bash_reset_multiple_times():
@@ -135,25 +135,25 @@ def test_bash_reset_multiple_times():
         reset_action = ExecuteBashAction(command="", reset=True)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in reset_result.output
+        assert "Terminal session has been reset" in reset_result.raw_output
 
         # Execute a command after first reset
         action = ExecuteBashAction(command="echo 'after first reset'")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert "after first reset" in result.output
+        assert "after first reset" in result.raw_output
 
         # Second reset
         reset_action = ExecuteBashAction(command="", reset=True)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in reset_result.output
+        assert "Terminal session has been reset" in reset_result.raw_output
 
         # Execute a command after second reset
         action = ExecuteBashAction(command="echo 'after second reset'")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert "after second reset" in result.output
+        assert "after second reset" in result.raw_output
 
 
 def test_bash_reset_with_timeout():
@@ -166,7 +166,7 @@ def test_bash_reset_with_timeout():
         reset_action = ExecuteBashAction(command="", reset=True, timeout=5.0)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in reset_result.output
+        assert "Terminal session has been reset" in reset_result.raw_output
         assert reset_result.command == "[RESET]"
 
 
@@ -196,5 +196,5 @@ def test_bash_reset_only_with_empty_command():
         reset_action = ExecuteBashAction(command="", reset=True)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in reset_result.output
+        assert "Terminal session has been reset" in reset_result.raw_output
         assert reset_result.command == "[RESET]"
diff --git a/tests/tools/execute_bash/test_bash_session.py b/tests/tools/execute_bash/test_bash_session.py
index 3bed2dfdb2..9ee6603bd8 100644
--- a/tests/tools/execute_bash/test_bash_session.py
+++ b/tests/tools/execute_bash/test_bash_session.py
@@ -43,7 +43,7 @@ def test_session_initialization(terminal_type):
         session.initialize()
         obs = session.execute(ExecuteBashAction(command="pwd"))
 
-        assert temp_dir in obs.output
+        assert temp_dir in obs.raw_output
         assert "[The command completed with exit code 0.]" in obs.metadata.suffix
         session.close()
 
@@ -66,7 +66,7 @@ def test_cwd_property(tmp_path, terminal_type):
 
     # For other implementations, just verify the command executed successfully
     obs = session.execute(ExecuteBashAction(command="pwd"))
-    assert str(random_dir) in obs.output
+    assert str(random_dir) in obs.raw_output
 
     # Note: CWD tracking may vary between terminal implementations
     # For tmux, it should track properly. For subprocess, it may not.
@@ -84,7 +84,7 @@ def test_basic_command(terminal_type):
     # Test simple command
     obs = session.execute(ExecuteBashAction(command="echo 'hello world'"))
 
-    assert "hello world" in obs.output
+    assert "hello world" in obs.raw_output
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
     # Note: prefix may vary between terminal implementations
     assert obs.metadata.exit_code == 0
@@ -95,16 +95,16 @@ def test_basic_command(terminal_type):
 
     # Note: Exit code handling may vary between terminal implementations
     # The important thing is that the error message is captured
-    assert "nonexistent_command: command not found" in obs.output
+    assert "nonexistent_command: command not found" in obs.raw_output
     assert session.prev_status == TerminalCommandStatus.COMPLETED
 
     # Test multiple commands in sequence
     obs = session.execute(
         ExecuteBashAction(command='echo "first" && echo "second" && echo "third"')
     )
-    assert "first" in obs.output
-    assert "second" in obs.output
-    assert "third" in obs.output
+    assert "first" in obs.raw_output
+    assert "second" in obs.raw_output
+    assert "third" in obs.raw_output
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
     # Note: prefix may vary between terminal implementations
     assert obs.metadata.exit_code == 0
@@ -125,7 +125,7 @@ def test_environment_variable_persistence(terminal_type):
 
     # Use the environment variable in a subsequent command
     obs = session.execute(ExecuteBashAction(command="echo $TEST_VAR"))
-    assert "hello world" in obs.output
+    assert "hello world" in obs.raw_output
     assert obs.metadata.exit_code == 0
 
     session.close()
@@ -151,8 +151,8 @@ def test_environment_variable_inheritance_from_parent(terminal_type):
 
         # Check if the environment variable is available in the terminal
         obs = session.execute(ExecuteBashAction(command=f"echo ${test_var_name}"))
-        assert test_var_value in obs.output, (
-            f"Expected '{test_var_value}' in output, but got: {obs.output}"
+        assert test_var_value in obs.raw_output, (
+            f"Expected '{test_var_value}' in output, but got: {obs.raw_output}"
         )
         assert obs.metadata.exit_code == 0
 
@@ -176,7 +176,7 @@ def test_long_running_command_follow_by_execute():
         ExecuteBashAction(command="echo 1; sleep 3; echo 2; sleep 3; echo 3")
     )
 
-    assert "1" in obs.output  # First number should appear before timeout
+    assert "1" in obs.raw_output  # First number should appear before timeout
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
     assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
     assert obs.metadata.suffix == get_no_change_timeout_suffix(2)
@@ -185,7 +185,7 @@ def test_long_running_command_follow_by_execute():
     # Continue watching output
     obs = session.execute(ExecuteBashAction(command="", is_input=True))
 
-    assert "2" in obs.output
+    assert "2" in obs.raw_output
     assert obs.metadata.prefix == "[Below is the output of the previous command.]\n"
     assert obs.metadata.suffix == get_no_change_timeout_suffix(2)
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
@@ -194,7 +194,7 @@ def test_long_running_command_follow_by_execute():
     # Test command that produces no output
     obs = session.execute(ExecuteBashAction(command="sleep 15"))
 
-    assert "3" not in obs.output
+    assert "3" not in obs.raw_output
     assert obs.metadata.prefix == "[Below is the output of the previous command.]\n"
     assert "The previous command is still running" in obs.metadata.suffix
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
@@ -205,7 +205,9 @@ def test_long_running_command_follow_by_execute():
     # Run it again, this time it should produce output and then start a new command
     obs = session.execute(ExecuteBashAction(command="sleep 15"))
 
-    assert "3" in obs.output  # Should see the final output from the previous command
+    assert (
+        "3" in obs.raw_output
+    )  # Should see the final output from the previous command
     assert obs.metadata.exit_code == -1  # -1 indicates new command is still running
     assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
 
@@ -227,7 +229,7 @@ def test_interactive_command(terminal_type):
         )
     )
 
-    assert "Enter name:" in obs.output
+    assert "Enter name:" in obs.raw_output
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
     assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
     assert obs.metadata.suffix == get_no_change_timeout_suffix(3)
@@ -236,7 +238,7 @@ def test_interactive_command(terminal_type):
     # Send input
     obs = session.execute(ExecuteBashAction(command="John", is_input=True))
 
-    assert "Hello John" in obs.output
+    assert "Hello John" in obs.raw_output
     assert obs.metadata.exit_code == 0
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
     assert obs.metadata.prefix == ""
@@ -266,7 +268,7 @@ def test_interactive_command(terminal_type):
 
     obs = session.execute(ExecuteBashAction(command="EOF", is_input=True))
 
-    assert "line 1" in obs.output and "line 2" in obs.output
+    assert "line 1" in obs.raw_output and "line 2" in obs.raw_output
     assert obs.metadata.exit_code == 0
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
     assert obs.metadata.prefix == ""
@@ -287,7 +289,7 @@ def test_ctrl_c(terminal_type):
         ExecuteBashAction(command="while true; do echo 'looping'; sleep 3; done"),
     )
 
-    assert "looping" in obs.output
+    assert "looping" in obs.raw_output
     assert obs.metadata.suffix == get_no_change_timeout_suffix(2)
     assert obs.metadata.prefix == ""
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
@@ -318,7 +320,7 @@ def test_empty_command_error(terminal_type):
     obs = session.execute(ExecuteBashAction(command=""))
 
     assert obs.has_error is True
-    assert obs.output == ""  # When there's an error, output should not be populated
+    assert obs.raw_output == ""  # When there's an error, output should not be populated
     assert obs.error == "No previous running command to retrieve logs from."
     assert len(obs.to_llm_content) == 1
     assert isinstance(obs.to_llm_content[0], TextContent)
@@ -357,22 +359,22 @@ def test_command_output_continuation(terminal_type):
     if session.prev_status == TerminalCommandStatus.COMPLETED:
         # If the command completed immediately, verify we got all the output
         logger.info("Command completed immediately", extra={"msg_type": "TEST_INFO"})
-        assert "1" in obs.output
-        assert "2" in obs.output
-        assert "3" in obs.output
-        assert "4" in obs.output
-        assert "5" in obs.output
+        assert "1" in obs.raw_output
+        assert "2" in obs.raw_output
+        assert "3" in obs.raw_output
+        assert "4" in obs.raw_output
+        assert "5" in obs.raw_output
         assert "[The command completed with exit code 0.]" in obs.metadata.suffix
     else:
         # If the command timed out, verify we got the timeout message
         assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
-        assert "1" in obs.output
+        assert "1" in obs.raw_output
         assert "[The command has no new output after 1 seconds." in obs.metadata.suffix
 
         # Continue getting output until we see all numbers
         numbers_seen = set()
         for i in range(1, 6):
-            if str(i) in obs.output:
+            if str(i) in obs.raw_output:
                 numbers_seen.add(i)
 
         # We need to see numbers 2-5 and then the command completion
@@ -384,7 +386,7 @@ def test_command_output_continuation(terminal_type):
 
             # Check for numbers in the output
             for i in range(1, 6):
-                if str(i) in obs.output and i not in numbers_seen:
+                if str(i) in obs.raw_output and i not in numbers_seen:
                     numbers_seen.add(i)
                     logger.info(
                         f"Found number {i} in output", extra={"msg_type": "TEST_INFO"}
@@ -424,8 +426,8 @@ def test_long_output(terminal_type):
         ExecuteBashAction(command='for i in {1..5000}; do echo "Line $i"; done')
     )
 
-    assert "Line 1" in obs.output
-    assert "Line 5000" in obs.output
+    assert "Line 1" in obs.raw_output
+    assert "Line 5000" in obs.raw_output
     assert obs.metadata.exit_code == 0
     assert obs.metadata.prefix == ""
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
@@ -444,8 +446,8 @@ def test_long_output_exceed_history_limit(terminal_type):
     )
 
     assert "Previous command outputs are truncated" in obs.metadata.prefix
-    assert "Line 40000" in obs.output
-    assert "Line 50000" in obs.output
+    assert "Line 40000" in obs.raw_output
+    assert "Line 50000" in obs.raw_output
     assert obs.metadata.exit_code == 0
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
 
@@ -465,7 +467,7 @@ def test_multiline_command():
         )
     )
 
-    assert "inside if" in obs.output
+    assert "inside if" in obs.raw_output
     assert obs.metadata.exit_code == 0
     assert obs.metadata.prefix == ""
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
@@ -489,21 +491,21 @@ def test_python_interactive_input(terminal_type):
     # Start Python with the interactive script
     obs = session.execute(ExecuteBashAction(command=f'python3 -c "{python_script}"'))
 
-    assert "Enter your name:" in obs.output
+    assert "Enter your name:" in obs.raw_output
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
     assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
 
     # Send first input (name)
     obs = session.execute(ExecuteBashAction(command="Alice", is_input=True))
 
-    assert "Enter your age:" in obs.output
+    assert "Enter your age:" in obs.raw_output
     assert obs.metadata.exit_code == -1
     assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
 
     # Send second input (age)
     obs = session.execute(ExecuteBashAction(command="25", is_input=True))
 
-    assert "Hello Alice, you are 25 years old" in obs.output
+    assert "Hello Alice, you are 25 years old" in obs.raw_output
     assert obs.metadata.exit_code == 0
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
     assert session.prev_status == TerminalCommandStatus.COMPLETED
@@ -516,7 +518,7 @@ def _run_bash_action(session, command: str, **kwargs):
     action = ExecuteBashAction(command=command, **kwargs)
     obs = session.execute(action)
     logger.info(f"Command: {command}")
-    logger.info(f"Output: {obs.output}")
+    logger.info(f"Output: {obs.raw_output}")
     logger.info(f"Exit code: {obs.metadata.exit_code}")
     return obs
 
@@ -536,12 +538,12 @@ def test_bash_server(terminal_type):
                 session, "python -u -m http.server 8081", timeout=1.0
             )
             assert obs.metadata.exit_code == -1
-            assert "Serving HTTP on" in obs.output
+            assert "Serving HTTP on" in obs.raw_output
 
             # Send Ctrl+C to interrupt
             obs = _run_bash_action(session, "C-c", is_input=True)
             assert "CTRL+C was sent" in obs.metadata.suffix
-            assert "Keyboard interrupt received, exiting." in obs.output
+            assert "Keyboard interrupt received, exiting." in obs.raw_output
 
             # Verify we can run commands after interrupt
             obs = _run_bash_action(session, "ls")
@@ -552,7 +554,7 @@ def test_bash_server(terminal_type):
                 session, "python -u -m http.server 8081", timeout=1.0
             )
             assert obs.metadata.exit_code == -1
-            assert "Serving HTTP on" in obs.output
+            assert "Serving HTTP on" in obs.raw_output
 
         finally:
             session.close()
@@ -579,7 +581,7 @@ def test_bash_background_server(terminal_type):
             obs = _run_bash_action(session, f"curl http://localhost:{server_port}")
             assert obs.metadata.exit_code == 0
             # Check for content typical of python http.server directory listing
-            assert "Directory listing for" in obs.output
+            assert "Directory listing for" in obs.raw_output
 
             # Kill the server
             obs = _run_bash_action(session, 'pkill -f "http.server"')
@@ -602,17 +604,17 @@ def test_multiline_commands(terminal_type):
             # single multiline command
             obs = _run_bash_action(session, 'echo \\\n -e "foo"')
             assert obs.metadata.exit_code == 0
-            assert "foo" in obs.output
+            assert "foo" in obs.raw_output
 
             # test multiline echo
             obs = _run_bash_action(session, 'echo -e "hello\nworld"')
             assert obs.metadata.exit_code == 0
-            assert "hello\nworld" in obs.output
+            assert "hello\nworld" in obs.raw_output
 
             # test whitespace
             obs = _run_bash_action(session, 'echo -e "a\\n\\n\\nz"')
             assert obs.metadata.exit_code == 0
-            assert "\n\n\n" in obs.output
+            assert "\n\n\n" in obs.raw_output
         finally:
             session.close()
 
@@ -635,7 +637,7 @@ def test_complex_commands(terminal_type):
         try:
             obs = _run_bash_action(session, cmd)
             assert obs.metadata.exit_code == 0
-            assert "Got 3 heads in a row after 3 flips!" in obs.output
+            assert "Got 3 heads in a row after 3 flips!" in obs.raw_output
         finally:
             session.close()
 
@@ -652,8 +654,8 @@ def test_no_ps2_in_output(terminal_type):
             obs = _run_bash_action(session, 'echo -e "hello\nworld"')
             assert obs.metadata.exit_code == 0
 
-            assert "hello\nworld" in obs.output
-            assert ">" not in obs.output
+            assert "hello\nworld" in obs.raw_output
+            assert ">" not in obs.raw_output
         finally:
             session.close()
 
@@ -683,11 +685,11 @@ def test_multiline_command_loop(terminal_type):
         try:
             obs = _run_bash_action(session, init_cmd)
             assert obs.metadata.exit_code == 0
-            assert "created files" in obs.output
+            assert "created files" in obs.raw_output
 
             obs = _run_bash_action(session, follow_up_cmd)
             assert obs.metadata.exit_code == 0
-            assert "success" in obs.output
+            assert "success" in obs.raw_output
         finally:
             session.close()
 
@@ -725,7 +727,7 @@ def test_multiple_multiline_commands(terminal_type):
             for cmd in cmds:
                 obs = _run_bash_action(session, cmd)
                 assert obs.metadata.exit_code == 0
-                results.append(obs.output)
+                results.append(obs.raw_output)
 
             # Verify all expected outputs are present
             assert "total 0" in results[0]  # ls -l
@@ -758,21 +760,21 @@ def test_cmd_run(terminal_type):
 
             obs = _run_bash_action(session, "ls -l")
             assert obs.metadata.exit_code == 0
-            assert "total 0" in obs.output
+            assert "total 0" in obs.raw_output
 
             obs = _run_bash_action(session, "mkdir test")
             assert obs.metadata.exit_code == 0
 
             obs = _run_bash_action(session, "ls -l")
             assert obs.metadata.exit_code == 0
-            assert "test" in obs.output
+            assert "test" in obs.raw_output
 
             obs = _run_bash_action(session, "touch test/foo.txt")
             assert obs.metadata.exit_code == 0
 
             obs = _run_bash_action(session, "ls -l test")
             assert obs.metadata.exit_code == 0
-            assert "foo.txt" in obs.output
+            assert "foo.txt" in obs.raw_output
 
             # clean up
             _run_bash_action(session, "rm -rf test")
@@ -794,7 +796,7 @@ def test_run_as_user_correct_home_dir(terminal_type):
             obs = _run_bash_action(session, "cd ~ && pwd")
             assert obs.metadata.exit_code == 0
             home = os.getenv("HOME")
-            assert home and home in obs.output
+            assert home and home in obs.raw_output
         finally:
             session.close()
 
@@ -809,8 +811,8 @@ def test_multi_cmd_run_in_single_line(terminal_type):
             # Original Linux version using &&
             obs = _run_bash_action(session, "pwd && ls -l")
             assert obs.metadata.exit_code == 0
-            assert temp_dir in obs.output
-            assert "total 0" in obs.output
+            assert temp_dir in obs.raw_output
+            assert "total 0" in obs.raw_output
         finally:
             session.close()
 
@@ -833,7 +835,7 @@ def test_stateful_cmd(terminal_type):
 
             obs = _run_bash_action(session, "pwd")
             assert obs.metadata.exit_code == 0
-            assert f"{temp_dir}/test" in obs.output.strip()
+            assert f"{temp_dir}/test" in obs.raw_output.strip()
         finally:
             session.close()
 
@@ -864,7 +866,7 @@ def test_python_version(terminal_type):
         try:
             obs = _run_bash_action(session, "python --version")
             assert obs.metadata.exit_code == 0
-            assert "Python 3" in obs.output
+            assert "Python 3" in obs.raw_output
         finally:
             session.close()
 
@@ -884,7 +886,7 @@ def test_pwd_property(terminal_type):
 
             obs = _run_bash_action(session, "cd random_dir && pwd")
             assert obs.metadata.exit_code == 0
-            assert "random_dir" in obs.output
+            assert "random_dir" in obs.raw_output
         finally:
             session.close()
 
@@ -913,10 +915,10 @@ def test_long_output_from_nested_directories(terminal_type):
             assert obs.metadata.exit_code == 0
 
             # Verify output contains expected files
-            assert "folder_1" in obs.output
-            assert "file_1.txt" in obs.output
-            assert "folder_100" in obs.output
-            assert "file_100.txt" in obs.output
+            assert "folder_1" in obs.raw_output
+            assert "file_1.txt" in obs.raw_output
+            assert "folder_100" in obs.raw_output
+            assert "file_100.txt" in obs.raw_output
         finally:
             session.close()
 
@@ -950,7 +952,7 @@ def test_command_backslash(terminal_type):
             )
             obs = _run_bash_action(session, cmd)
             assert obs.metadata.exit_code == 0
-            assert "/tmp/test_dir/file_1.txt" in obs.output
+            assert "/tmp/test_dir/file_1.txt" in obs.raw_output
         finally:
             session.close()
 
@@ -974,7 +976,7 @@ def test_bash_remove_prefix(terminal_type):
             # Check git remote - same for both platforms
             obs = _run_bash_action(session, "git remote -v")
             assert obs.metadata.exit_code == 0
-            assert "https://github.com/OpenHands/OpenHands" in obs.output
-            assert "git remote -v" not in obs.output
+            assert "https://github.com/OpenHands/OpenHands" in obs.raw_output
+            assert "git remote -v" not in obs.raw_output
         finally:
             session.close()
diff --git a/tests/tools/execute_bash/test_bash_tool.py b/tests/tools/execute_bash/test_bash_tool.py
index 985b4d94e2..c6a34259f9 100644
--- a/tests/tools/execute_bash/test_bash_tool.py
+++ b/tests/tools/execute_bash/test_bash_tool.py
@@ -69,7 +69,7 @@ def test_bash_tool_execution():
         # Check the result
         assert result is not None
         assert isinstance(result, ExecuteBashObservation)
-        assert "Hello, World!" in result.output
+        assert "Hello, World!" in result.raw_output
 
 
 def test_bash_tool_working_directory():
@@ -87,7 +87,7 @@ def test_bash_tool_working_directory():
 
         # Check that the working directory is correct
         assert isinstance(result, ExecuteBashObservation)
-        assert temp_dir in result.output
+        assert temp_dir in result.raw_output
 
 
 def test_bash_tool_to_openai_tool():
diff --git a/tests/tools/execute_bash/test_bash_tool_auto_detection.py b/tests/tools/execute_bash/test_bash_tool_auto_detection.py
index 500595f38f..75911d924e 100644
--- a/tests/tools/execute_bash/test_bash_tool_auto_detection.py
+++ b/tests/tools/execute_bash/test_bash_tool_auto_detection.py
@@ -52,7 +52,7 @@ def test_default_auto_detection():
         # Test that it works
         action = ExecuteBashAction(command="echo 'Auto-detection test'")
         obs = executor(action)
-        assert "Auto-detection test" in obs.output
+        assert "Auto-detection test" in obs.raw_output
 
 
 def test_forced_terminal_types():
@@ -138,7 +138,7 @@ def test_backward_compatibility():
         assert tool.executor is not None
         action = ExecuteBashAction(command="echo 'Backward compatibility test'")
         obs = tool.executor(action)
-        assert "Backward compatibility test" in obs.output
+        assert "Backward compatibility test" in obs.raw_output
         assert obs.metadata.exit_code == 0
 
 
diff --git a/tests/tools/execute_bash/test_observation_truncation.py b/tests/tools/execute_bash/test_observation_truncation.py
index 3f27759c76..33df408141 100644
--- a/tests/tools/execute_bash/test_observation_truncation.py
+++ b/tests/tools/execute_bash/test_observation_truncation.py
@@ -18,7 +18,7 @@ def test_execute_bash_observation_truncation_under_limit():
     )
 
     observation = ExecuteBashObservation(
-        output="Short output",
+        raw_output="Short output",
         metadata=metadata,
         error=None,
     )
@@ -52,7 +52,7 @@ def test_execute_bash_observation_truncation_over_limit():
     long_output = "A" * (MAX_CMD_OUTPUT_SIZE + 1000)
 
     observation = ExecuteBashObservation(
-        output=long_output,
+        raw_output=long_output,
         metadata=metadata,
         error=None,
     )
@@ -89,7 +89,7 @@ def test_execute_bash_observation_truncation_with_error():
     long_output = "B" * (MAX_CMD_OUTPUT_SIZE + 500)
 
     observation = ExecuteBashObservation(
-        output=long_output,
+        raw_output=long_output,
         metadata=metadata,
         error="Command failed",
     )
@@ -99,16 +99,8 @@ def test_execute_bash_observation_truncation_with_error():
     assert isinstance(result[0], TextContent)
     result = result[0].text
 
-    # The result should be truncated and have error prefix
-    assert result.startswith("[There was an error during command execution.]")
-    assert len(result) < len(long_output) + 300  # Account for metadata and error prefix
-    # With head-and-tail truncation, should end with original content + metadata
-    expected_end = (
-        "B\n[Current working directory: /test]\n[Python interpreter: /usr/bin/python]\n"
-        "[Command finished with exit code 1]"
-    )
-    assert result.endswith(expected_end)
-    assert "<response clipped>" in result  # Should contain truncation notice
+    # When there's an error, only the error message is returned
+    assert result == "Tool Execution Error: Command failed"
 
 
 def test_execute_bash_observation_truncation_exact_limit():
@@ -132,7 +124,7 @@ def test_execute_bash_observation_truncation_exact_limit():
     exact_output = "C" * exact_output_size
 
     observation = ExecuteBashObservation(
-        output=exact_output,
+        raw_output=exact_output,
         metadata=metadata,
         error=None,
     )
@@ -162,7 +154,7 @@ def test_execute_bash_observation_truncation_with_prefix_suffix():
     long_output = "D" * (MAX_CMD_OUTPUT_SIZE + 200)
 
     observation = ExecuteBashObservation(
-        output=long_output,
+        raw_output=long_output,
         metadata=metadata,
         error=None,
     )
diff --git a/tests/tools/execute_bash/test_secrets_masking.py b/tests/tools/execute_bash/test_secrets_masking.py
index ea9b8b0491..3924d781e1 100644
--- a/tests/tools/execute_bash/test_secrets_masking.py
+++ b/tests/tools/execute_bash/test_secrets_masking.py
@@ -24,8 +24,8 @@ def test_bash_executor_without_conversation():
             result = executor(action)
 
             # Check that the output is not masked (no conversation provided)
-            assert "secret-value-123" in result.output
-            assert "<secret-hidden>" not in result.output
+            assert "secret-value-123" in result.raw_output
+            assert "<secret-hidden>" not in result.raw_output
 
         finally:
             executor.close()
@@ -60,9 +60,9 @@ def test_bash_executor_with_conversation_secrets():
             mock_session = Mock()
             # session.execute returns ExecuteBashObservation
             mock_observation = ExecuteBashObservation(
-                command="echo 'Token: $SECRET_TOKEN, Key: $API_KEY'",
+                cmd="echo 'Token: $SECRET_TOKEN, Key: $API_KEY'",
                 exit_code=0,
-                output="Token: secret-value-123, Key: another-secret-456",
+                raw_output="Token: secret-value-123, Key: another-secret-456",
             )
             mock_session.execute.return_value = mock_observation
             executor.session = mock_session
@@ -77,10 +77,10 @@ def test_bash_executor_with_conversation_secrets():
             assert mock_session.execute.called
 
             # Check that both secrets were masked in the output
-            assert "secret-value-123" not in result.output
-            assert "another-secret-456" not in result.output
+            assert "secret-value-123" not in result.raw_output
+            assert "another-secret-456" not in result.raw_output
             # SecretsManager uses <secret-hidden> as the mask
-            assert "<secret-hidden>" in result.output
+            assert "<secret-hidden>" in result.raw_output
 
         finally:
             executor.close()
diff --git a/tests/tools/file_editor/conftest.py b/tests/tools/file_editor/conftest.py
index d1f52f659e..2d37f9ab4a 100644
--- a/tests/tools/file_editor/conftest.py
+++ b/tests/tools/file_editor/conftest.py
@@ -3,6 +3,7 @@
 
 import pytest
 
+from openhands.sdk.tool.schema import TextContent
 from openhands.tools.file_editor.definition import (
     FileEditorObservation,
 )
@@ -75,3 +76,8 @@ def create_test_file(path: Path, content: str):
     """Helper to create a test file with given content."""
     path.write_text(content)
     return path
+
+
+def get_output_text(result: FileEditorObservation) -> str:
+    """Join the text of all TextContent items in result.output (others skipped)."""
+    return "".join([c.text for c in result.output if isinstance(c, TextContent)])
diff --git a/tests/tools/file_editor/test_basic_operations.py b/tests/tools/file_editor/test_basic_operations.py
index 16624d890c..ff3f56d4e5 100644
--- a/tests/tools/file_editor/test_basic_operations.py
+++ b/tests/tools/file_editor/test_basic_operations.py
@@ -21,6 +21,7 @@
 
 from .conftest import (
     assert_successful_result,
+    get_output_text,
 )
 
 
@@ -62,11 +63,13 @@ def test_file_editor_happy_path(temp_file):
     # Validate the result
     assert_successful_result(result, str(temp_file))
     assert (
-        result.output is not None
-        and "The file" in result.output
-        and "has been edited" in result.output
+        get_output_text(result) is not None
+        and "The file" in get_output_text(result)
+        and "has been edited" in get_output_text(result)
     )
-    assert result.output is not None and "This is a sample file." in result.output
+    assert get_output_text(
+        result
+    ) is not None and "This is a sample file." in get_output_text(result)
     assert result.path == str(temp_file)
     assert result.prev_exist is True
     assert (
@@ -105,16 +108,19 @@ def test_file_editor_view_operation(temp_file):
 
     # Validate the result
     assert_successful_result(result, str(temp_file))
+    assert get_output_text(
+        result
+    ) is not None and "Here's the result of running `cat -n`" in get_output_text(result)
     assert (
-        result.output is not None
-        and "Here's the result of running `cat -n`" in result.output
+        get_output_text(result) is not None
+        and "This is a file with XML tags parsing logic..." in get_output_text(result)
     )
-    assert (
-        result.output is not None
-        and "This is a file with XML tags parsing logic..." in result.output
-    )
-    assert result.output is not None and "match = re.search(" in result.output
-    assert result.output is not None and "...More text here." in result.output
+    assert get_output_text(
+        result
+    ) is not None and "match = re.search(" in get_output_text(result)
+    assert get_output_text(
+        result
+    ) is not None and "...More text here." in get_output_text(result)
 
 
 def test_successful_operations(temp_file):
@@ -130,11 +136,10 @@ def test_successful_operations(temp_file):
         path=str(temp_file),
     )
     assert_successful_result(result)
-    assert (
-        result.output is not None
-        and "Here's the result of running `cat -n`" in result.output
-    )
-    assert result.output is not None and "line 1" in result.output
+    assert get_output_text(
+        result
+    ) is not None and "Here's the result of running `cat -n`" in get_output_text(result)
+    assert get_output_text(result) is not None and "line 1" in get_output_text(result)
 
     # Test str_replace
     result = file_editor(
@@ -144,8 +149,12 @@ def test_successful_operations(temp_file):
         new_str="replaced line",
     )
     assert_successful_result(result)
-    assert result.output is not None and "has been edited" in result.output
-    assert result.output is not None and "replaced line" in result.output
+    assert get_output_text(result) is not None and "has been edited" in get_output_text(
+        result
+    )
+    assert get_output_text(result) is not None and "replaced line" in get_output_text(
+        result
+    )
 
     # Test insert
     result = file_editor(
@@ -155,8 +164,12 @@ def test_successful_operations(temp_file):
         new_str="inserted line",
     )
     assert_successful_result(result)
-    assert result.output is not None and "has been edited" in result.output
-    assert result.output is not None and "inserted line" in result.output
+    assert get_output_text(result) is not None and "has been edited" in get_output_text(
+        result
+    )
+    assert get_output_text(result) is not None and "inserted line" in get_output_text(
+        result
+    )
 
     # Test undo
     result = file_editor(
@@ -164,7 +177,9 @@ def test_successful_operations(temp_file):
         path=str(temp_file),
     )
     assert_successful_result(result)
-    assert result.output is not None and "undone successfully" in result.output
+    assert get_output_text(
+        result
+    ) is not None and "undone successfully" in get_output_text(result)
 
 
 def test_tab_expansion(temp_file):
@@ -181,8 +196,12 @@ def test_tab_expansion(temp_file):
     )
     assert_successful_result(result)
     # Tabs should be preserved in output
-    assert result.output is not None and "\tindented" in result.output
-    assert result.output is not None and "line\twith\ttabs" in result.output
+    assert get_output_text(result) is not None and "\tindented" in get_output_text(
+        result
+    )
+    assert get_output_text(
+        result
+    ) is not None and "line\twith\ttabs" in get_output_text(result)
 
     # Test str_replace with tabs in old_str
     result = file_editor(
@@ -192,7 +211,9 @@ def test_tab_expansion(temp_file):
         new_str="replaced line",
     )
     assert_successful_result(result)
-    assert result.output is not None and "replaced line" in result.output
+    assert get_output_text(result) is not None and "replaced line" in get_output_text(
+        result
+    )
 
     # Test str_replace with tabs in new_str
     result = file_editor(
@@ -202,7 +223,9 @@ def test_tab_expansion(temp_file):
         new_str="new\tline\twith\ttabs",
     )
     assert_successful_result(result)
-    assert result.output is not None and "new\tline\twith\ttabs" in result.output
+    assert get_output_text(
+        result
+    ) is not None and "new\tline\twith\ttabs" in get_output_text(result)
 
     # Test insert with tabs
     result = file_editor(
@@ -212,7 +235,9 @@ def test_tab_expansion(temp_file):
         new_str="\tindented\tline",
     )
     assert_successful_result(result)
-    assert result.output is not None and "\tindented\tline" in result.output
+    assert get_output_text(
+        result
+    ) is not None and "\tindented\tline" in get_output_text(result)
 
 
 def test_create_operation(temp_file):
@@ -229,7 +254,9 @@ def test_create_operation(temp_file):
     )
 
     assert_successful_result(result, str(temp_file))
-    assert result.output is not None and "created successfully" in result.output
+    assert get_output_text(
+        result
+    ) is not None and "created successfully" in get_output_text(result)
     assert result.prev_exist is False
     assert result.new_content == content
 
@@ -258,29 +285,31 @@ def test_view_operation_truncation(temp_file):
     )
 
     assert_successful_result(result)
-    assert result.output is not None
+    assert get_output_text(result) is not None
 
     # Check that truncation notice is present
-    assert TEXT_FILE_CONTENT_TRUNCATED_NOTICE in result.output
+    assert TEXT_FILE_CONTENT_TRUNCATED_NOTICE in get_output_text(result)
 
     # The content should be truncated before line numbers are added
     # So the final output will be longer than MAX_RESPONSE_LEN_CHAR due to formatting
     # but the original content was truncated
-    assert "Here's the result of running `cat -n`" in result.output
+    assert "Here's the result of running `cat -n`" in get_output_text(result)
 
     # With head-and-tail truncation, should contain both start and end content
     # The line numbers will show as "     1\tA..." at start and end with "A"
-    assert "\tA" in result.output  # Should have A's with tab formatting
+    assert "\tA" in get_output_text(result)  # Should have A's with tab formatting
 
 
 def test_view_file(editor):
     editor, test_file = editor
     result = editor(command="view", path=str(test_file))
     assert isinstance(result, FileEditorObservation)
-    assert f"Here's the result of running `cat -n` on {test_file}:" in result.output
-    assert "1\tThis is a test file." in result.output
-    assert "2\tThis file is for testing purposes." in result.output
-    assert "3\t" not in result.output  # No extra line
+    assert f"Here's the result of running `cat -n` on {test_file}:" in get_output_text(
+        result
+    )
+    assert "1\tThis is a test file." in get_output_text(result)
+    assert "2\tThis file is for testing purposes." in get_output_text(result)
+    assert "3\t" not in get_output_text(result)  # No extra line
 
 
 def test_view_directory(editor):
@@ -288,7 +317,7 @@ def test_view_directory(editor):
     parent_dir = test_file.parent
     result = editor(command="view", path=str(parent_dir))
     assert (
-        result.output
+        get_output_text(result)
         == f"""Here's the files and directories up to 2 levels deep in {parent_dir}, excluding hidden items:
 {parent_dir}/
 {parent_dir}/test.txt"""  # noqa: E501
@@ -315,11 +344,13 @@ def test_view_with_a_specific_range(editor):
 
     # View file in range 50-100
     result = editor(command="view", path=str(test_file), view_range=[50, 100])
-    assert f"Here's the result of running `cat -n` on {test_file}:" in result.output
-    assert "    49\tLine 49" not in result.output
-    assert "    50\tLine 50" in result.output
-    assert "   100\tLine 100" in result.output
-    assert "101" not in result.output
+    assert f"Here's the result of running `cat -n` on {test_file}:" in get_output_text(
+        result
+    )
+    assert "    49\tLine 49" not in get_output_text(result)
+    assert "    50\tLine 50" in get_output_text(result)
+    assert "   100\tLine 100" in get_output_text(result)
+    assert "101" not in get_output_text(result)
 
 
 def test_create_file(editor):
@@ -328,7 +359,7 @@ def test_create_file(editor):
     result = editor(command="create", path=str(new_file), file_text="New file content")
     assert new_file.exists()
     assert new_file.read_text() == "New file content"
-    assert "File created successfully" in result.output
+    assert "File created successfully" in get_output_text(result)
 
 
 def test_create_with_empty_string(editor):
@@ -337,12 +368,14 @@ def test_create_with_empty_string(editor):
     result = editor(command="create", path=str(new_file), file_text="")
     assert new_file.exists()
     assert new_file.read_text() == ""
-    assert "File created successfully" in result.output
+    assert "File created successfully" in get_output_text(result)
 
     # Test the view command showing an empty line
     result = editor(command="view", path=str(new_file))
-    assert f"Here's the result of running `cat -n` on {new_file}:" in result.output
-    assert "1\t" in result.output  # Check for empty line
+    assert f"Here's the result of running `cat -n` on {new_file}:" in get_output_text(
+        result
+    )
+    assert "1\t" in get_output_text(result)  # Check for empty line
 
 
 def test_create_with_none_file_text(editor):
@@ -365,7 +398,7 @@ def test_str_replace_no_linting(editor):
 
     # Test str_replace command
     assert (
-        result.output
+        get_output_text(result)
         == f"""The file {test_file} has been edited. Here's the result of running `cat -n` on a snippet of {test_file}:
      1\tThis is a sample file.
      2\tThis file is for testing purposes.
@@ -388,7 +421,7 @@ def test_str_replace_multi_line_no_linting(editor):
 
     # Test str_replace command
     assert (
-        result.output
+        get_output_text(result)
         == f"""The file {test_file} has been edited. Here's the result of running `cat -n` on a snippet of {test_file}:
      1\tThis is a sample file.
      2\tThis file is for testing purposes.
@@ -407,7 +440,7 @@ def test_str_replace_multi_line_with_tabs_no_linting(editor_python_file_with_tab
     assert isinstance(result, FileEditorObservation)
 
     assert (
-        result.output
+        get_output_text(result)
         == f"""The file {test_file} has been edited. Here's the result of running `cat -n` on a snippet of {test_file}:
      1\tdef test():
      2\t\tprint("Hello, Universe!")
@@ -510,7 +543,7 @@ def test_insert_no_linting(editor):
     assert isinstance(result, FileEditorObservation)
     assert "Inserted line" in test_file.read_text()
     assert (
-        result.output
+        get_output_text(result)
         == f"""The file {test_file} has been edited. Here's the result of running `cat -n` on a snippet of the edited file:
      1\tThis is a test file.
      2\tInserted line
@@ -559,7 +592,7 @@ def test_insert_chinese_text_into_english_file(editor):
     assert isinstance(result, FileEditorObservation)
     assert "中文文本" in test_file.read_text()
     assert (
-        result.output
+        get_output_text(result)
         == f"""The file {test_file} has been edited. Here's the result of running `cat -n` on a snippet of the edited file:
      1\t中文文本
      2\tThis is a test file.
@@ -592,7 +625,7 @@ def test_undo_edit(editor):
     # Undo the edit
     result = editor(command="undo_edit", path=str(test_file))
     assert isinstance(result, FileEditorObservation)
-    assert "Last edit to" in result.output
+    assert "Last edit to" in get_output_text(result)
     assert "test file" in test_file.read_text()  # Original content restored
 
 
@@ -615,13 +648,13 @@ def test_multiple_undo_edits(editor):
     # Undo the last edit
     result = editor(command="undo_edit", path=str(test_file))
     assert isinstance(result, FileEditorObservation)
-    assert "Last edit to" in result.output
+    assert "Last edit to" in get_output_text(result)
     assert "sample file v1" in test_file.read_text()  # Previous content restored
 
     # Undo the first edit
     result = editor(command="undo_edit", path=str(test_file))
     assert isinstance(result, FileEditorObservation)
-    assert "Last edit to" in result.output
+    assert "Last edit to" in get_output_text(result)
     assert "test file" in test_file.read_text()  # Original content restored
 
 
@@ -697,16 +730,17 @@ def test_view_directory_with_hidden_files(tmp_path):
 
     # Verify output
     assert isinstance(result, FileEditorObservation)
-    assert str(test_dir) in result.output
-    assert "visible.txt" in result.output  # Visible file is shown
-    assert "visible_dir" in result.output  # Visible directory is shown
-    assert ".hidden1" not in result.output  # Hidden files not shown
-    assert ".hidden2" not in result.output
-    assert ".hidden_dir" not in result.output
+    assert str(test_dir) in get_output_text(result)
+    assert "visible.txt" in get_output_text(result)  # Visible file is shown
+    assert "visible_dir" in get_output_text(result)  # Visible directory is shown
+    assert ".hidden1" not in get_output_text(result)  # Hidden files not shown
+    assert ".hidden2" not in get_output_text(result)
+    assert ".hidden_dir" not in get_output_text(result)
     assert (
-        "3 hidden files/directories in this directory are excluded" in result.output
+        "3 hidden files/directories in this directory are excluded"
+        in get_output_text(result)
     )  # Shows count of hidden items in current dir only
-    assert "ls -la" in result.output  # Shows command to view hidden files
+    assert "ls -la" in get_output_text(result)  # Shows command to view hidden files
 
 
 def test_view_symlinked_directory(tmp_path):
@@ -732,11 +766,11 @@ def test_view_symlinked_directory(tmp_path):
 
     # Verify that all files are listed through the symlink
     assert isinstance(result, FileEditorObservation)
-    assert str(symlink_dir) in result.output
-    assert "file1.txt" in result.output
-    assert "file2.txt" in result.output
-    assert "subdir" in result.output
-    assert "file3.txt" in result.output
+    assert str(symlink_dir) in get_output_text(result)
+    assert "file1.txt" in get_output_text(result)
+    assert "file2.txt" in get_output_text(result)
+    assert "subdir" in get_output_text(result)
+    assert "file3.txt" in get_output_text(result)
 
 
 def test_view_large_directory_with_truncation(editor, tmp_path):
@@ -749,7 +783,7 @@ def test_view_large_directory_with_truncation(editor, tmp_path):
 
     result = editor(command="view", path=str(large_dir))
     assert isinstance(result, FileEditorObservation)
-    assert DIRECTORY_CONTENT_TRUNCATED_NOTICE in result.output
+    assert DIRECTORY_CONTENT_TRUNCATED_NOTICE in get_output_text(result)
 
 
 def test_view_directory_on_hidden_path(tmp_path):
@@ -791,22 +825,23 @@ def test_view_directory_on_hidden_path(tmp_path):
     # Verify output
     assert isinstance(result, FileEditorObservation)
     # Depth 1: Visible files/dirs shown, hidden files/dirs not shown
-    assert "visible1.txt" in result.output
-    assert "visible_dir" in result.output
-    assert ".hidden1" not in result.output
-    assert ".hidden_dir" not in result.output
+    assert "visible1.txt" in get_output_text(result)
+    assert "visible_dir" in get_output_text(result)
+    assert ".hidden1" not in get_output_text(result)
+    assert ".hidden_dir" not in get_output_text(result)
 
     # Depth 2: Files in visible_dir shown
-    assert "visible2.txt" in result.output
-    assert ".hidden2" not in result.output
+    assert "visible2.txt" in get_output_text(result)
+    assert ".hidden2" not in get_output_text(result)
 
     # Depth 2: Files in hidden_dir not shown
-    assert "visible3.txt" not in result.output
-    assert ".hidden3" not in result.output
+    assert "visible3.txt" not in get_output_text(result)
+    assert ".hidden3" not in get_output_text(result)
 
     # Hidden file count only includes depth 1
     assert (
-        "2 hidden files/directories in this directory are excluded" in result.output
+        "2 hidden files/directories in this directory are excluded"
+        in get_output_text(result)
     )  # Only .hidden1 and .hidden_dir at depth 1
 
 
@@ -819,7 +854,7 @@ def test_view_large_file_with_truncation(editor, tmp_path):
 
     result = editor(command="view", path=str(large_file))
     assert isinstance(result, FileEditorObservation)
-    assert TEXT_FILE_CONTENT_TRUNCATED_NOTICE in result.output
+    assert TEXT_FILE_CONTENT_TRUNCATED_NOTICE in get_output_text(result)
 
 
 def test_validate_path_suggests_absolute_path(editor, tmp_path):
@@ -868,8 +903,8 @@ def test_str_replace_and_insert_snippet_output_on_a_large_file(editor):
 
     # View file
     result = editor(command="view", path=str(test_file))
-    assert "     1\tLine 1" in result.output
-    assert "   500\tLine 500" in result.output
+    assert "     1\tLine 1" in get_output_text(result)
+    assert "   500\tLine 500" in get_output_text(result)
 
     # Replace line 500's content with '500 new'
     result = editor(
@@ -878,14 +913,14 @@ def test_str_replace_and_insert_snippet_output_on_a_large_file(editor):
         old_str="Line 500",
         new_str="500 new",
     )
-    assert "   500\t500 new" in result.output
+    assert "   500\t500 new" in get_output_text(result)
 
     # Delete the line '500 new'
     result = editor(
         command="str_replace", path=str(test_file), old_str="500 new\n", new_str=""
     )
-    assert "   499\tLine 499" in result.output
-    assert "   500\tLine 501" in result.output
+    assert "   499\tLine 499" in get_output_text(result)
+    assert "   500\tLine 501" in get_output_text(result)
 
     # Insert content at line 500
     result = editor(
@@ -894,4 +929,4 @@ def test_str_replace_and_insert_snippet_output_on_a_large_file(editor):
         insert_line=499,
         new_str="Inserted line at 500",
     )
-    assert "   500\tInserted line at 500" in result.output
+    assert "   500\tInserted line at 500" in get_output_text(result)
diff --git a/tests/tools/file_editor/test_error_handling.py b/tests/tools/file_editor/test_error_handling.py
index 02ced3bb61..64924be03c 100644
--- a/tests/tools/file_editor/test_error_handling.py
+++ b/tests/tools/file_editor/test_error_handling.py
@@ -2,7 +2,7 @@
 
 from openhands.tools.file_editor.impl import file_editor
 
-from .conftest import assert_error_result
+from .conftest import assert_error_result, get_output_text
 
 
 def test_validation_error_formatting():
@@ -88,7 +88,7 @@ def test_view_range_validation(temp_file):
     assert result.error is None
     assert (
         "NOTE: We only show up to 3 since there're only 3 lines in this file."
-        in result.output
+        in get_output_text(result)
     )
 
     # Test invalid range order
diff --git a/tests/tools/file_editor/test_file_editor_tool.py b/tests/tools/file_editor/test_file_editor_tool.py
index 31c2b9ee1c..c43c5036d5 100644
--- a/tests/tools/file_editor/test_file_editor_tool.py
+++ b/tests/tools/file_editor/test_file_editor_tool.py
@@ -16,6 +16,8 @@
     FileEditorTool,
 )
 
+from .conftest import get_output_text
+
 
 def _create_test_conv_state(temp_dir: str) -> ConversationState:
     """Helper to create a test conversation state."""
@@ -95,9 +97,9 @@ def test_file_editor_tool_view_file():
         assert result is not None
         assert isinstance(result, FileEditorObservation)
         assert not result.error
-        assert "Line 1" in result.output
-        assert "Line 2" in result.output
-        assert "Line 3" in result.output
+        assert "Line 1" in get_output_text(result)
+        assert "Line 2" in get_output_text(result)
+        assert "Line 3" in get_output_text(result)
 
 
 def test_file_editor_tool_str_replace():
@@ -178,8 +180,8 @@ def test_file_editor_tool_view_directory():
         assert result is not None
         assert isinstance(result, FileEditorObservation)
         assert not result.error
-        assert "file1.txt" in result.output
-        assert "file2.txt" in result.output
+        assert "file1.txt" in get_output_text(result)
+        assert "file2.txt" in get_output_text(result)
 
 
 def test_file_editor_tool_includes_working_directory_in_description():
diff --git a/tests/tools/file_editor/test_memory_usage.py b/tests/tools/file_editor/test_memory_usage.py
index dd7c351f93..2a028f296a 100644
--- a/tests/tools/file_editor/test_memory_usage.py
+++ b/tests/tools/file_editor/test_memory_usage.py
@@ -11,7 +11,7 @@
 
 from openhands.tools.file_editor import file_editor
 
-from .conftest import assert_successful_result
+from .conftest import assert_successful_result, get_output_text
 
 
 # Apply the forked marker and serialize execution across workers
@@ -71,7 +71,7 @@ def test_file_read_memory_usage(temp_file):
 
     # Pull output before measuring and drop references to encourage GC
     assert_successful_result(result)
-    content = result.output
+    content = get_output_text(result)
     del result
     gc.collect()
 
diff --git a/tests/tools/file_editor/test_view_supported_binary_files.py b/tests/tools/file_editor/test_view_supported_binary_files.py
index e29ae932a6..f8938e0622 100644
--- a/tests/tools/file_editor/test_view_supported_binary_files.py
+++ b/tests/tools/file_editor/test_view_supported_binary_files.py
@@ -6,7 +6,7 @@
     FileEditorObservation,
 )
 
-from .conftest import assert_successful_result
+from .conftest import assert_successful_result, get_output_text
 
 
 def test_view_pdf_file():
@@ -74,12 +74,16 @@ def test_view_pdf_file():
 
         assert isinstance(result, FileEditorObservation)
         assert_successful_result(result)
-        assert f"Here's the result of running `cat -n` on {test_file}" in result.output
+        assert (
+            f"Here's the result of running `cat -n` on {test_file}"
+            in get_output_text(result)
+        )
 
         # Check for specific content present in the PDF
-        assert (
-            result.output is not None
-            and "Printer-Friendly Caltrain Schedule" in result.output
+        assert get_output_text(
+            result
+        ) is not None and "Printer-Friendly Caltrain Schedule" in get_output_text(
+            result
         )
     finally:
         # Clean up the temporary file
diff --git a/tests/tools/file_editor/test_visualize_diff.py b/tests/tools/file_editor/test_visualize_diff.py
index 3ad341e6a6..e316a88fa5 100644
--- a/tests/tools/file_editor/test_visualize_diff.py
+++ b/tests/tools/file_editor/test_visualize_diff.py
@@ -20,7 +20,7 @@ def test_visualize_diff_simple_replacement():
     return True"""
 
     observation = FileEditorObservation(
-        command="str_replace",
+        cmd="str_replace",
         path="/test/file.py",
         old_content=old_content,
         new_content=new_content,
@@ -50,7 +50,7 @@ def test_visualize_diff_no_changes():
     return True"""
 
     observation = FileEditorObservation(
-        command="str_replace",
+        cmd="str_replace",
         path="/test/file.py",
         old_content=content,
         new_content=content,
@@ -93,7 +93,7 @@ def main():
     calculate(x, y)"""
 
     observation = FileEditorObservation(
-        command="str_replace",
+        cmd="str_replace",
         path="/test/calc.py",
         old_content=old_content,
         new_content=new_content,
@@ -121,7 +121,7 @@ def test_visualize_diff_attempted_edit():
     new_content = "new line"
 
     observation = FileEditorObservation(
-        command="str_replace",
+        cmd="str_replace",
         path="/test/file.py",
         old_content=old_content,
         new_content=new_content,
@@ -149,7 +149,7 @@ def test_visualize_diff_caching():
     new_content = "new line"
 
     observation = FileEditorObservation(
-        command="str_replace",
+        cmd="str_replace",
         path="/test/file.py",
         old_content=old_content,
         new_content=new_content,
@@ -190,7 +190,7 @@ def test_visualize_diff_custom_context_lines():
 line7"""
 
     observation = FileEditorObservation(
-        command="str_replace",
+        cmd="str_replace",
         path="/test/file.py",
         old_content=old_content,
         new_content=new_content,
@@ -232,7 +232,7 @@ def test_get_edit_groups():
 line3"""
 
     observation = FileEditorObservation(
-        command="str_replace",
+        cmd="str_replace",
         path="/test/file.py",
         old_content=old_content,
         new_content=new_content,
@@ -280,7 +280,7 @@ def test_get_edit_groups_no_content():
 def test_visualize_diff_none_content():
     """Test visualize_diff when content is None."""
     observation = FileEditorObservation(
-        command="str_replace",
+        cmd="str_replace",
         path="/test/file.py",
         old_content=None,
         new_content=None,
diff --git a/tests/tools/file_editor/utils/test_encoding.py b/tests/tools/file_editor/utils/test_encoding.py
index 9bd47dd477..0bfce2c26b 100644
--- a/tests/tools/file_editor/utils/test_encoding.py
+++ b/tests/tools/file_editor/utils/test_encoding.py
@@ -15,6 +15,8 @@
     with_encoding,
 )
 
+from ..conftest import get_output_text
+
 
 @pytest.fixture
 def temp_file():
@@ -287,9 +289,15 @@ def test_view_non_utf8_file(temp_non_utf8_file):
     # Parse the result - now using direct access
 
     # Verify the content was read correctly
-    assert result.output is not None and "Привет, мир!" in result.output
-    assert result.output is not None and "Тестовый файл с кириллицей" in result.output
-    assert result.output is not None and "Это тестовая строка" in result.output
+    assert get_output_text(result) is not None and "Привет, мир!" in get_output_text(
+        result
+    )
+    assert get_output_text(
+        result
+    ) is not None and "Тестовый файл с кириллицей" in get_output_text(result)
+    assert get_output_text(
+        result
+    ) is not None and "Это тестовая строка" in get_output_text(result)
 
 
 def test_view_range_non_utf8_file(temp_non_utf8_file):
@@ -305,11 +313,17 @@ def test_view_range_non_utf8_file(temp_non_utf8_file):
     # Parse the result - now using direct access
 
     # Verify the content was read correctly
-    assert result.output is not None and "Тестовый файл с кириллицей" in result.output
-    assert result.output is not None and "Привет, мир!" in result.output
+    assert get_output_text(
+        result
+    ) is not None and "Тестовый файл с кириллицей" in get_output_text(result)
+    assert get_output_text(result) is not None and "Привет, мир!" in get_output_text(
+        result
+    )
 
     # Verify that line 6 is not included
-    assert result.output is not None and "Это тестовая строка" not in result.output
+    assert get_output_text(
+        result
+    ) is not None and "Это тестовая строка" not in get_output_text(result)
 
 
 def test_str_replace_non_utf8_file(temp_non_utf8_file):
@@ -326,8 +340,12 @@ def test_str_replace_non_utf8_file(temp_non_utf8_file):
     # Parse the result - now using direct access
 
     # Verify the replacement was successful
-    assert result.output is not None and "Здравствуй, мир!" in result.output
-    assert result.output is not None and "Привет, мир!" not in result.output
+    assert get_output_text(
+        result
+    ) is not None and "Здравствуй, мир!" in get_output_text(result)
+    assert get_output_text(
+        result
+    ) is not None and "Привет, мир!" not in get_output_text(result)
 
     # Verify the file was saved with the correct encoding
     with open(temp_non_utf8_file, "rb") as f:
@@ -354,7 +372,9 @@ def test_insert_non_utf8_file(temp_non_utf8_file):
     # Parse the result - now using direct access
 
     # Verify the insertion was successful
-    assert result.output is not None and "Новая переменная" in result.output
+    assert get_output_text(
+        result
+    ) is not None and "Новая переменная" in get_output_text(result)
 
     # Verify the file was saved with the correct encoding
     with open(temp_non_utf8_file, "rb") as f:
@@ -391,9 +411,9 @@ def test_create_non_utf8_file():
         # Parse the result - now using direct access
 
         # Verify the file was created successfully
-        assert (
-            result.output is not None and "File created successfully" in result.output
-        )
+        assert get_output_text(
+            result
+        ) is not None and "File created successfully" in get_output_text(result)
 
         # Read the file with cp1251 encoding to verify content
         encoding_manager = EncodingManager()
@@ -433,7 +453,9 @@ def test_undo_edit_non_utf8_file(temp_non_utf8_file):
     # Parse the result - now using direct access
 
     # Verify the undo was successful
-    assert result.output is not None and "undone successfully" in result.output
+    assert get_output_text(
+        result
+    ) is not None and "undone successfully" in get_output_text(result)
 
     # Verify the original content was restored with the correct encoding
     with open(temp_non_utf8_file, "rb") as f:
@@ -455,7 +477,9 @@ def test_complex_workflow_non_utf8_file(temp_non_utf8_file):
         path=str(temp_non_utf8_file),
     )
     # Parse the result - now using direct access
-    assert result.output is not None and "Привет, мир!" in result.output
+    assert get_output_text(result) is not None and "Привет, мир!" in get_output_text(
+        result
+    )
 
     # 2. Replace text
     result = file_editor(
@@ -465,7 +489,9 @@ def test_complex_workflow_non_utf8_file(temp_non_utf8_file):
         new_str="Здравствуй, мир!",
     )
     # Parse the result - now using direct access
-    assert result.output is not None and "Здравствуй, мир!" in result.output
+    assert get_output_text(
+        result
+    ) is not None and "Здравствуй, мир!" in get_output_text(result)
 
     # 3. Insert text
     result = file_editor(
@@ -475,7 +501,9 @@ def test_complex_workflow_non_utf8_file(temp_non_utf8_file):
         new_str="# Добавленная строка\nboolean_var = True",
     )
     # Parse the result - now using direct access
-    assert result.output is not None and "Добавленная строка" in result.output
+    assert get_output_text(
+        result
+    ) is not None and "Добавленная строка" in get_output_text(result)
 
     # 4. View specific range
     result = file_editor(
@@ -484,8 +512,12 @@ def test_complex_workflow_non_utf8_file(temp_non_utf8_file):
         view_range=[5, 7],
     )
     # Parse the result - now using direct access
-    assert result.output is not None and "Добавленная строка" in result.output
-    assert result.output is not None and "boolean_var = True" in result.output
+    assert get_output_text(
+        result
+    ) is not None and "Добавленная строка" in get_output_text(result)
+    assert get_output_text(
+        result
+    ) is not None and "boolean_var = True" in get_output_text(result)
 
     # 5. Undo the last edit
     result = file_editor(
@@ -493,7 +525,9 @@ def test_complex_workflow_non_utf8_file(temp_non_utf8_file):
         path=str(temp_non_utf8_file),
     )
     # Parse the result - now using direct access
-    assert result.output is not None and "undone successfully" in result.output
+    assert get_output_text(
+        result
+    ) is not None and "undone successfully" in get_output_text(result)
 
     # 6. Verify the file content after all operations
     with open(temp_non_utf8_file, "rb") as f:
@@ -532,7 +566,7 @@ def test_mixed_encoding_workflow():
             path=path1,
         )
         # Parse the result - now using direct access
-        assert "Текст в кодировке CP1251" in result1.output
+        assert "Текст в кодировке CP1251" in get_output_text(result1)
 
         # 2. View the UTF-8 file
         result2 = file_editor(
@@ -540,7 +574,7 @@ def test_mixed_encoding_workflow():
             path=path2,
         )
         # Parse the result - now using direct access
-        assert "Текст в кодировке UTF-8" in result2.output
+        assert "Текст в кодировке UTF-8" in get_output_text(result2)
 
         # 3. Edit the cp1251 file
         result3 = file_editor(
@@ -550,7 +584,7 @@ def test_mixed_encoding_workflow():
             new_str="Измененный текст в CP1251",
         )
         # Parse the result - now using direct access
-        assert "Измененный текст в CP1251" in result3.output
+        assert "Измененный текст в CP1251" in get_output_text(result3)
 
         # 4. Edit the UTF-8 file
         result4 = file_editor(
@@ -560,7 +594,7 @@ def test_mixed_encoding_workflow():
             new_str="Измененный текст в UTF-8",
         )
         # Parse the result - now using direct access
-        assert "Измененный текст в UTF-8" in result4.output
+        assert "Измененный текст в UTF-8" in get_output_text(result4)
 
         # 5. Verify both files maintain their original encodings
         with open(path1, "rb") as f:

From 651f957068ce4645530671cd78012b8b5e462610 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 31 Oct 2025 11:04:46 +0000
Subject: [PATCH 19/76] fix: resolve type errors in examples,
 planning_file_editor, and test_registry

- Use raw_output instead of output in custom_tools.py example for ExecuteBashObservation
- Use cmd parameter instead of command in PlanningFileEditorObservation
- Rename output field to message in test_registry.py to avoid conflict with base Observation.output

Co-authored-by: openhands <openhands@all-hands.dev>
---
 examples/01_standalone_sdk/02_custom_tools.py          |  4 ++--
 .../openhands/tools/planning_file_editor/impl.py       |  2 +-
 tests/sdk/tool/test_registry.py                        | 10 +++++-----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/examples/01_standalone_sdk/02_custom_tools.py b/examples/01_standalone_sdk/02_custom_tools.py
index 6b7dc3bf34..acf8dd60ca 100644
--- a/examples/01_standalone_sdk/02_custom_tools.py
+++ b/examples/01_standalone_sdk/02_custom_tools.py
@@ -93,8 +93,8 @@ def __call__(self, action: GrepAction, conversation=None) -> GrepObservation:  #
         files: set[str] = set()
 
         # grep returns exit code 1 when no matches; treat as empty
-        if result.output.strip():
-            for line in result.output.strip().splitlines():
+        if result.raw_output.strip():
+            for line in result.raw_output.strip().splitlines():
                 matches.append(line)
                 # Expect "path:line:content" — take the file part before first ":"
                 file_path = line.split(":", 1)[0]
diff --git a/openhands-tools/openhands/tools/planning_file_editor/impl.py b/openhands-tools/openhands/tools/planning_file_editor/impl.py
index 1b2f45a43c..822e7f266c 100644
--- a/openhands-tools/openhands/tools/planning_file_editor/impl.py
+++ b/openhands-tools/openhands/tools/planning_file_editor/impl.py
@@ -59,7 +59,7 @@ def __call__(
 
         # Convert FileEditorObservation to PlanningFileEditorObservation
         return PlanningFileEditorObservation(
-            command=action.command,
+            cmd=action.command,
             output=file_editor_obs.output,
             error=file_editor_obs.error,
         )
diff --git a/tests/sdk/tool/test_registry.py b/tests/sdk/tool/test_registry.py
index 3524a878fc..6bd32db394 100644
--- a/tests/sdk/tool/test_registry.py
+++ b/tests/sdk/tool/test_registry.py
@@ -26,16 +26,16 @@ class _HelloAction(Action):
 
 
 class _HelloObservation(Observation):
-    output: str = ""
+    message: str = ""
 
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        return [TextContent(text=self.output)]
+        return [TextContent(text=self.message)]
 
 
 class _HelloExec(ToolExecutor[_HelloAction, _HelloObservation]):
     def __call__(self, action: _HelloAction, conversation=None) -> _HelloObservation:
-        return _HelloObservation(output=f"Hello, {action.name}!")
+        return _HelloObservation(message=f"Hello, {action.name}!")
 
 
 class _ConfigurableHelloTool(ToolDefinition):
@@ -55,7 +55,7 @@ def __call__(
                 self, action: _HelloAction, conversation=None
             ) -> _HelloObservation:
                 return _HelloObservation(
-                    output=f"{self._greeting}, {action.name}{self._punctuation}"
+                    message=f"{self._greeting}, {action.name}{self._punctuation}"
                 )
 
         return [
@@ -133,4 +133,4 @@ def test_register_tool_type_uses_create_params():
 
     observation = tool(_HelloAction(name="Alice"))
     assert isinstance(observation, _HelloObservation)
-    assert observation.output == "Howdy, Alice?"
+    assert observation.message == "Howdy, Alice?"

From 2412728079bb5e78904b45a2495b1e5679975773 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 31 Oct 2025 11:10:46 +0000
Subject: [PATCH 20/76] refactor: rename _format_error to format_error and use
 it consistently

- Renamed Observation._format_error() to format_error() (public method)
- Updated BrowserObservation.to_llm_content() to use self.format_error()
- Updated ExecuteBashObservation.to_llm_content() to use self.format_error()
- Eliminated hardcoded 'Tool Execution Error:' strings in observation subclasses
- All error formatting now goes through the base class format_error() method

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/tool/schema.py                 | 5 +++--
 openhands-tools/openhands/tools/browser_use/definition.py  | 2 +-
 openhands-tools/openhands/tools/execute_bash/definition.py | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index d7a2cff7ca..42223238f9 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -226,7 +226,8 @@ def has_error(self) -> bool:
     def result_status(self) -> ObservationStatus:
         return ObservationStatus.ERROR if self.has_error else ObservationStatus.SUCCESS
 
-    def _format_error(self) -> TextContent:
+    def format_error(self) -> TextContent:
+        """Format the error message for LLM display."""
         return TextContent(text=f"Tool Execution Error: {self.error}")
 
     @property
@@ -240,7 +241,7 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.command:
             llm_content.append(TextContent(text=f"Executed Command: {self.command}\n"))
         if self.error:
-            llm_content.append(self._format_error())
+            llm_content.append(self.format_error())
         if self.output:
             llm_content.extend(self.output)
         return llm_content
diff --git a/openhands-tools/openhands/tools/browser_use/definition.py b/openhands-tools/openhands/tools/browser_use/definition.py
index 682580f9bb..7ede04331d 100644
--- a/openhands-tools/openhands/tools/browser_use/definition.py
+++ b/openhands-tools/openhands/tools/browser_use/definition.py
@@ -35,7 +35,7 @@ class BrowserObservation(Observation):
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.error:
-            return [TextContent(text=f"Tool Execution Error: {self.error}")]
+            return [self.format_error()]
 
         # Extract text from output list
         output_text = "".join(
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index c765a6fb7a..5552fa5bd4 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -108,7 +108,7 @@ def command_id(self) -> int | None:
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.error:
             # When there's an error, format it appropriately
-            return [TextContent(text=f"Tool Execution Error: {self.error}")]
+            return [self.format_error()]
 
         ret = f"{self.metadata.prefix}{self.raw_output}{self.metadata.suffix}"
         if self.metadata.working_dir:

From da0e0bea22e363745e647753d14fcd283416cb85 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 31 Oct 2025 12:27:42 +0000
Subject: [PATCH 21/76] refactor: preserve truncation in
 ExecuteBashObservation.to_llm_content

- Keep custom to_llm_content implementation with truncation (preserving main branch behavior)
- When error is set, return formatted error: "Tool Execution Error: {error}"
- Otherwise, build output string with metadata and apply maybe_truncate()
- No changes to how observations are created - errors still populate error field with full message
- Output format stays almost identical to main branch (only error header changes from "[There was an error during command execution.]" to "Tool Execution Error:")

This addresses feedback: ExecuteBashObservation CAN override to_llm_content and must preserve truncation functionality.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-tools/openhands/tools/execute_bash/definition.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index 5552fa5bd4..e602f64332 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -107,9 +107,11 @@ def command_id(self) -> int | None:
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.error:
-            # When there's an error, format it appropriately
-            return [self.format_error()]
+            # When there's an error, return formatted error message
+            # Use custom header instead of base class format
+            return [TextContent(text=f"Tool Execution Error: {self.error}")]
 
+        # Build output string with metadata (same as main branch)
         ret = f"{self.metadata.prefix}{self.raw_output}{self.metadata.suffix}"
         if self.metadata.working_dir:
             ret += f"\n[Current working directory: {self.metadata.working_dir}]"
@@ -117,6 +119,7 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
             ret += f"\n[Python interpreter: {self.metadata.py_interpreter_path}]"
         if self.metadata.exit_code != -1:
             ret += f"\n[Command finished with exit code {self.metadata.exit_code}]"
+        # Apply truncation to the entire output (preserving main branch behavior)
         return [TextContent(text=maybe_truncate(ret, MAX_CMD_OUTPUT_SIZE))]
 
     @property

From e4f3efd07896e29237bebe5d45f6abda5389d39c Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 31 Oct 2025 13:05:20 +0000
Subject: [PATCH 22/76] refactor: apply metadata prefix/suffix to error output

- Error output now includes metadata prefix and suffix: error_msg = f"{self.metadata.prefix}{self.error}{self.metadata.suffix}"
- Remove unnecessary comments
- All 144 tests pass

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-tools/openhands/tools/execute_bash/definition.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index e602f64332..a3ffd6b1b0 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -107,11 +107,9 @@ def command_id(self) -> int | None:
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.error:
-            # When there's an error, return formatted error message
-            # Use custom header instead of base class format
-            return [TextContent(text=f"Tool Execution Error: {self.error}")]
+            error_msg = f"{self.metadata.prefix}{self.error}{self.metadata.suffix}"
+            return [TextContent(text=f"Tool Execution Error: {error_msg}")]
 
-        # Build output string with metadata (same as main branch)
         ret = f"{self.metadata.prefix}{self.raw_output}{self.metadata.suffix}"
         if self.metadata.working_dir:
             ret += f"\n[Current working directory: {self.metadata.working_dir}]"
@@ -119,7 +117,6 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
             ret += f"\n[Python interpreter: {self.metadata.py_interpreter_path}]"
         if self.metadata.exit_code != -1:
             ret += f"\n[Command finished with exit code {self.metadata.exit_code}]"
-        # Apply truncation to the entire output (preserving main branch behavior)
         return [TextContent(text=maybe_truncate(ret, MAX_CMD_OUTPUT_SIZE))]
 
     @property

From 353b3e9383f5480f3105e377d73db6798d606616 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Fri, 31 Oct 2025 15:11:35 +0100
Subject: [PATCH 23/76] update doc: use docstrings for Observation helpers

---
 openhands-sdk/openhands/sdk/tool/schema.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index 42223238f9..fa4e563c68 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -196,7 +196,6 @@ class ObservationStatus(str, Enum):
 class Observation(Schema, ABC):
     """Base schema for output observation."""
 
-    # Standardized primary output and error handling
     output: list[TextContent | ImageContent] = Field(
         default_factory=list,
         description=(
@@ -218,12 +217,15 @@ def command(self) -> str | None:
 
     @property
     def has_error(self) -> bool:
-        # Support both string and boolean-style error flags across subclasses.
-        # Using bool() handles: None/""/False -> False; non-empty str/True -> True.
+        """
+        Check if the observation indicates an error.
+        """
         return bool(self.error)
 
     @property
     def result_status(self) -> ObservationStatus:
+        """
+        Get the observation result status based on presence of error."""
         return ObservationStatus.ERROR if self.has_error else ObservationStatus.SUCCESS
 
     def format_error(self) -> TextContent:

From 85773de7eee66219d7098c2da1596049e39f66b4 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 31 Oct 2025 14:12:11 +0000
Subject: [PATCH 24/76] refactor: simplify ExecuteBashObservation to use only
 output field from base class

- Removed redundant raw_output field from ExecuteBashObservation
- All creation sites now use output=[TextContent(text=...)] pattern
- Updated to_llm_content and visualize to extract text using isinstance checks
- Updated impl.py to merge outputs when combining reset and command results
- Added get_output_text() helper in conftest.py for type-safe text extraction
- Updated all tests to read output via get_output_text() (replacing .raw_output) instead of direct .output[0].text access
- Fixed example to use output field with proper type checking

This aligns ExecuteBashObservation with the base Observation class pattern
where errors are set directly and output contains TextContent objects.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 examples/01_standalone_sdk/02_custom_tools.py |   6 +-
 .../tools/execute_bash/definition.py          |  12 +-
 .../openhands/tools/execute_bash/impl.py      |  43 ++++--
 .../execute_bash/terminal/terminal_session.py |   9 +-
 tests/tools/execute_bash/conftest.py          |  14 ++
 .../execute_bash/test_bash_ps1_metadata.py    |   9 +-
 tests/tools/execute_bash/test_bash_reset.py   |  33 ++---
 tests/tools/execute_bash/test_bash_session.py | 135 +++++++++---------
 tests/tools/execute_bash/test_bash_tool.py    |   5 +-
 .../test_bash_tool_auto_detection.py          |   5 +-
 .../test_observation_truncation.py            |  10 +-
 .../execute_bash/test_secrets_masking.py      |  16 ++-
 12 files changed, 172 insertions(+), 125 deletions(-)

diff --git a/examples/01_standalone_sdk/02_custom_tools.py b/examples/01_standalone_sdk/02_custom_tools.py
index acf8dd60ca..cfd1133d4c 100644
--- a/examples/01_standalone_sdk/02_custom_tools.py
+++ b/examples/01_standalone_sdk/02_custom_tools.py
@@ -93,8 +93,10 @@ def __call__(self, action: GrepAction, conversation=None) -> GrepObservation:  #
         files: set[str] = set()
 
         # grep returns exit code 1 when no matches; treat as empty
-        if result.raw_output.strip():
-            for line in result.raw_output.strip().splitlines():
+        first_item = result.output[0] if result.output else None
+        output_text = first_item.text if isinstance(first_item, TextContent) else ""
+        if output_text.strip():
+            for line in output_text.strip().splitlines():
                 matches.append(line)
                 # Expect "path:line:content" — take the file part before first ":"
                 file_path = line.split(":", 1)[0]
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index a3ffd6b1b0..26cb972380 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -79,8 +79,6 @@ def visualize(self) -> Text:
 class ExecuteBashObservation(Observation):
     """A ToolResult that can be rendered as a CLI output."""
 
-    # Internal string output field (raw command output)
-    raw_output: str = Field(default="", description="Raw command output string")
     cmd: str | None = Field(default=None, description="The command that was executed")
     exit_code: int | None = Field(
         default=None,
@@ -110,7 +108,9 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
             error_msg = f"{self.metadata.prefix}{self.error}{self.metadata.suffix}"
             return [TextContent(text=f"Tool Execution Error: {error_msg}")]
 
-        ret = f"{self.metadata.prefix}{self.raw_output}{self.metadata.suffix}"
+        first_item = self.output[0] if self.output else None
+        output_text = first_item.text if isinstance(first_item, TextContent) else ""
+        ret = f"{self.metadata.prefix}{output_text}{self.metadata.suffix}"
         if self.metadata.working_dir:
             ret += f"\n[Current working directory: {self.metadata.working_dir}]"
         if self.metadata.py_interpreter_path:
@@ -130,9 +130,11 @@ def visualize(self) -> Text:
             content.append("Command execution error\n", style="red")
 
         # Add command output with proper styling
-        if self.raw_output:
+        first_item = self.output[0] if self.output else None
+        output_text = first_item.text if isinstance(first_item, TextContent) else ""
+        if output_text:
             # Style the output based on content
-            output_lines = self.raw_output.split("\n")
+            output_lines = output_text.split("\n")
             for line in output_lines:
                 if line.strip():
                     # Color error-like lines differently
diff --git a/openhands-tools/openhands/tools/execute_bash/impl.py b/openhands-tools/openhands/tools/execute_bash/impl.py
index 621a8f7e9c..8893e3064b 100644
--- a/openhands-tools/openhands/tools/execute_bash/impl.py
+++ b/openhands-tools/openhands/tools/execute_bash/impl.py
@@ -3,6 +3,7 @@
 
 from openhands.sdk.logger import get_logger
 from openhands.sdk.tool import ToolExecutor
+from openhands.sdk.tool.schema import TextContent
 
 
 if TYPE_CHECKING:
@@ -112,10 +113,14 @@ def reset(self) -> ExecuteBashObservation:
         )
 
         return ExecuteBashObservation(
-            raw_output=(
-                "Terminal session has been reset. All previous environment "
-                "variables and session state have been cleared."
-            ),
+            output=[
+                TextContent(
+                    text=(
+                        "Terminal session has been reset. All previous environment "
+                        "variables and session state have been cleared."
+                    )
+                )
+            ],
             cmd="[RESET]",
             exit_code=0,
         )
@@ -141,11 +146,21 @@ def __call__(
                 )
                 self._export_envs(command_action, conversation)
                 command_result = self.session.execute(command_action)
+                reset_text = (
+                    reset_result.output[0].text
+                    if reset_result.output
+                    and isinstance(reset_result.output[0], TextContent)
+                    else ""
+                )
+                command_text = (
+                    command_result.output[0].text
+                    if command_result.output
+                    and isinstance(command_result.output[0], TextContent)
+                    else ""
+                )
                 observation = command_result.model_copy(
                     update={
-                        "raw_output": (
-                            reset_result.raw_output + "\n\n" + command_result.raw_output
-                        ),
+                        "output": [TextContent(text=f"{reset_text}\n\n{command_text}")],
                         "cmd": f"[RESET] {action.command}",
                     }
                 )
@@ -158,15 +173,17 @@ def __call__(
             observation = self.session.execute(action)
 
         # Apply automatic secrets masking
-        if observation.raw_output and conversation is not None:
+        first_item = observation.output[0] if observation.output else None
+        output_text = first_item.text if isinstance(first_item, TextContent) else ""
+        if output_text and conversation is not None:
             try:
                 secret_registry = conversation.state.secret_registry
-                masked_output = secret_registry.mask_secrets_in_output(
-                    observation.raw_output
-                )
+                masked_output = secret_registry.mask_secrets_in_output(output_text)
                 if masked_output:
-                    data = observation.model_dump(exclude={"raw_output"})
-                    return ExecuteBashObservation(**data, raw_output=masked_output)
+                    data = observation.model_dump(exclude={"output"})
+                    return ExecuteBashObservation(
+                        **data, output=[TextContent(text=masked_output)]
+                    )
             except Exception:
                 pass
 
diff --git a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
index 995026e929..1bbcee98a3 100644
--- a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
+++ b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
@@ -5,6 +5,7 @@
 from enum import Enum
 
 from openhands.sdk.logger import get_logger
+from openhands.sdk.tool.schema import TextContent
 from openhands.tools.execute_bash.constants import (
     CMD_OUTPUT_PS1_END,
     NO_CHANGE_TIMEOUT_SECONDS,
@@ -188,7 +189,7 @@ def _handle_completed_command(
         self._ready_for_next_command()
         return ExecuteBashObservation(
             cmd=command,
-            raw_output=command_output,
+            output=[TextContent(text=command_output)],
             metadata=metadata,
         )
 
@@ -222,7 +223,7 @@ def _handle_nochange_timeout_command(
         )
         return ExecuteBashObservation(
             cmd=command,
-            raw_output=command_output,
+            output=[TextContent(text=command_output)],
             metadata=metadata,
         )
 
@@ -257,7 +258,7 @@ def _handle_hard_timeout_command(
         )
         return ExecuteBashObservation(
             cmd=command,
-            raw_output=command_output,
+            output=[TextContent(text=command_output)],
             metadata=metadata,
         )
 
@@ -387,7 +388,7 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
             )
             obs = ExecuteBashObservation(
                 cmd=command,
-                raw_output=command_output,
+                output=[TextContent(text=command_output)],
                 metadata=metadata,
             )
             logger.debug(f"RETURNING OBSERVATION (previous-command): {obs}")
diff --git a/tests/tools/execute_bash/conftest.py b/tests/tools/execute_bash/conftest.py
index f566d3fb9b..8fdf11310e 100644
--- a/tests/tools/execute_bash/conftest.py
+++ b/tests/tools/execute_bash/conftest.py
@@ -3,13 +3,27 @@
 import tempfile
 
 from openhands.sdk.logger import get_logger
+from openhands.sdk.tool.schema import TextContent
 from openhands.tools.execute_bash.constants import TIMEOUT_MESSAGE_TEMPLATE
+from openhands.tools.execute_bash.definition import ExecuteBashObservation
 from openhands.tools.execute_bash.terminal import create_terminal_session
 
 
 logger = get_logger(__name__)
 
 
+def get_output_text(obs: ExecuteBashObservation) -> str:
+    """Extract text from observation output field.
+
+    This helper handles type-safe extraction of text from the observation's
+    output field, which contains Content items (TextContent or ImageContent).
+    """
+    if not obs.output:
+        return ""
+    first_item = obs.output[0]
+    return first_item.text if isinstance(first_item, TextContent) else ""
+
+
 def get_no_change_timeout_suffix(timeout_seconds):
     """Helper function to generate the expected no-change timeout suffix."""
     return (
diff --git a/tests/tools/execute_bash/test_bash_ps1_metadata.py b/tests/tools/execute_bash/test_bash_ps1_metadata.py
index bb4ef60825..d3bee61c6b 100644
--- a/tests/tools/execute_bash/test_bash_ps1_metadata.py
+++ b/tests/tools/execute_bash/test_bash_ps1_metadata.py
@@ -1,6 +1,5 @@
 import json
 
-from openhands.sdk import TextContent
 from openhands.tools.execute_bash.constants import (
     CMD_OUTPUT_METADATA_PS1_REGEX,
     CMD_OUTPUT_PS1_BEGIN,
@@ -270,10 +269,15 @@ def test_ps1_metadata_regex_pattern():
 
 def test_cmd_output_observation_properties():
     """Test ExecuteBashObservation class properties"""
+    from openhands.sdk.tool.schema import TextContent
+
     # Test with successful command
     metadata = CmdOutputMetadata(exit_code=0, pid=123)
     obs = ExecuteBashObservation(
-        cmd="ls", raw_output="file1\nfile2", exit_code=0, metadata=metadata
+        cmd="ls",
+        output=[TextContent(text="file1\nfile2")],
+        exit_code=0,
+        metadata=metadata,
     )
     assert obs.command_id == 123
     assert obs.exit_code == 0
@@ -289,7 +293,6 @@ def test_cmd_output_observation_properties():
     metadata = CmdOutputMetadata(exit_code=1, pid=456)
     obs = ExecuteBashObservation(
         cmd="invalid",
-        raw_output="",
         exit_code=1,
         error="Command failed",
         metadata=metadata,
diff --git a/tests/tools/execute_bash/test_bash_reset.py b/tests/tools/execute_bash/test_bash_reset.py
index 31ccc6b26d..9e16150222 100644
--- a/tests/tools/execute_bash/test_bash_reset.py
+++ b/tests/tools/execute_bash/test_bash_reset.py
@@ -15,6 +15,7 @@
     ExecuteBashAction,
     ExecuteBashObservation,
 )
+from tests.tools.execute_bash.conftest import get_output_text
 
 
 def _create_conv_state(working_dir: str) -> ConversationState:
@@ -43,13 +44,13 @@ def test_bash_reset_basic():
         action = ExecuteBashAction(command="echo $TEST_VAR")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert "hello" in result.raw_output
+        assert "hello" in get_output_text(result)
 
         # Reset the terminal
         reset_action = ExecuteBashAction(command="", reset=True)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in reset_result.raw_output
+        assert "Terminal session has been reset" in get_output_text(reset_result)
         assert reset_result.command == "[RESET]"
 
         # Verify the variable is no longer set after reset
@@ -57,7 +58,7 @@ def test_bash_reset_basic():
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
         # The variable should be empty after reset
-        assert result.raw_output.strip() == ""
+        assert get_output_text(result).strip() == ""
 
 
 def test_bash_reset_with_command():
@@ -78,15 +79,15 @@ def test_bash_reset_with_command():
         )
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in reset_result.raw_output
-        assert "hello from fresh terminal" in reset_result.raw_output
+        assert "Terminal session has been reset" in get_output_text(reset_result)
+        assert "hello from fresh terminal" in get_output_text(reset_result)
         assert reset_result.command == "[RESET] echo 'hello from fresh terminal'"
 
         # Verify the variable is no longer set (confirming reset worked)
         action = ExecuteBashAction(command="echo $TEST_VAR")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert result.raw_output.strip() == ""
+        assert get_output_text(result).strip() == ""
 
 
 def test_bash_reset_working_directory():
@@ -99,7 +100,7 @@ def test_bash_reset_working_directory():
         action = ExecuteBashAction(command="pwd")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert temp_dir in result.raw_output
+        assert temp_dir in get_output_text(result)
 
         # Change directory
         action = ExecuteBashAction(command="cd /home")
@@ -110,19 +111,19 @@ def test_bash_reset_working_directory():
         action = ExecuteBashAction(command="pwd")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert "/home" in result.raw_output
+        assert "/home" in get_output_text(result)
 
         # Reset the terminal
         reset_action = ExecuteBashAction(command="", reset=True)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in reset_result.raw_output
+        assert "Terminal session has been reset" in get_output_text(reset_result)
 
         # Verify working directory is back to original
         action = ExecuteBashAction(command="pwd")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert temp_dir in result.raw_output
+        assert temp_dir in get_output_text(result)
 
 
 def test_bash_reset_multiple_times():
@@ -135,25 +136,25 @@ def test_bash_reset_multiple_times():
         reset_action = ExecuteBashAction(command="", reset=True)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in reset_result.raw_output
+        assert "Terminal session has been reset" in get_output_text(reset_result)
 
         # Execute a command after first reset
         action = ExecuteBashAction(command="echo 'after first reset'")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert "after first reset" in result.raw_output
+        assert "after first reset" in get_output_text(result)
 
         # Second reset
         reset_action = ExecuteBashAction(command="", reset=True)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in reset_result.raw_output
+        assert "Terminal session has been reset" in get_output_text(reset_result)
 
         # Execute a command after second reset
         action = ExecuteBashAction(command="echo 'after second reset'")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert "after second reset" in result.raw_output
+        assert "after second reset" in get_output_text(result)
 
 
 def test_bash_reset_with_timeout():
@@ -166,7 +167,7 @@ def test_bash_reset_with_timeout():
         reset_action = ExecuteBashAction(command="", reset=True, timeout=5.0)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in reset_result.raw_output
+        assert "Terminal session has been reset" in get_output_text(reset_result)
         assert reset_result.command == "[RESET]"
 
 
@@ -196,5 +197,5 @@ def test_bash_reset_only_with_empty_command():
         reset_action = ExecuteBashAction(command="", reset=True)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in reset_result.raw_output
+        assert "Terminal session has been reset" in get_output_text(reset_result)
         assert reset_result.command == "[RESET]"
diff --git a/tests/tools/execute_bash/test_bash_session.py b/tests/tools/execute_bash/test_bash_session.py
index 9ee6603bd8..5ae356480e 100644
--- a/tests/tools/execute_bash/test_bash_session.py
+++ b/tests/tools/execute_bash/test_bash_session.py
@@ -23,7 +23,7 @@
     create_terminal_session,
 )
 
-from .conftest import get_no_change_timeout_suffix
+from .conftest import get_no_change_timeout_suffix, get_output_text
 
 
 logger = get_logger(__name__)
@@ -43,7 +43,7 @@ def test_session_initialization(terminal_type):
         session.initialize()
         obs = session.execute(ExecuteBashAction(command="pwd"))
 
-        assert temp_dir in obs.raw_output
+        assert temp_dir in get_output_text(obs)
         assert "[The command completed with exit code 0.]" in obs.metadata.suffix
         session.close()
 
@@ -66,7 +66,7 @@ def test_cwd_property(tmp_path, terminal_type):
 
     # For other implementations, just verify the command executed successfully
     obs = session.execute(ExecuteBashAction(command="pwd"))
-    assert str(random_dir) in obs.raw_output
+    assert str(random_dir) in get_output_text(obs)
 
     # Note: CWD tracking may vary between terminal implementations
     # For tmux, it should track properly. For subprocess, it may not.
@@ -84,7 +84,7 @@ def test_basic_command(terminal_type):
     # Test simple command
     obs = session.execute(ExecuteBashAction(command="echo 'hello world'"))
 
-    assert "hello world" in obs.raw_output
+    assert "hello world" in get_output_text(obs)
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
     # Note: prefix may vary between terminal implementations
     assert obs.metadata.exit_code == 0
@@ -95,16 +95,16 @@ def test_basic_command(terminal_type):
 
     # Note: Exit code handling may vary between terminal implementations
     # The important thing is that the error message is captured
-    assert "nonexistent_command: command not found" in obs.raw_output
+    assert "nonexistent_command: command not found" in get_output_text(obs)
     assert session.prev_status == TerminalCommandStatus.COMPLETED
 
     # Test multiple commands in sequence
     obs = session.execute(
         ExecuteBashAction(command='echo "first" && echo "second" && echo "third"')
     )
-    assert "first" in obs.raw_output
-    assert "second" in obs.raw_output
-    assert "third" in obs.raw_output
+    assert "first" in get_output_text(obs)
+    assert "second" in get_output_text(obs)
+    assert "third" in get_output_text(obs)
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
     # Note: prefix may vary between terminal implementations
     assert obs.metadata.exit_code == 0
@@ -125,7 +125,7 @@ def test_environment_variable_persistence(terminal_type):
 
     # Use the environment variable in a subsequent command
     obs = session.execute(ExecuteBashAction(command="echo $TEST_VAR"))
-    assert "hello world" in obs.raw_output
+    assert "hello world" in get_output_text(obs)
     assert obs.metadata.exit_code == 0
 
     session.close()
@@ -151,8 +151,8 @@ def test_environment_variable_inheritance_from_parent(terminal_type):
 
         # Check if the environment variable is available in the terminal
         obs = session.execute(ExecuteBashAction(command=f"echo ${test_var_name}"))
-        assert test_var_value in obs.raw_output, (
-            f"Expected '{test_var_value}' in output, but got: {obs.raw_output}"
+        assert test_var_value in get_output_text(obs), (
+            f"Expected '{test_var_value}' in output, but got: {get_output_text(obs)}"
         )
         assert obs.metadata.exit_code == 0
 
@@ -176,7 +176,7 @@ def test_long_running_command_follow_by_execute():
         ExecuteBashAction(command="echo 1; sleep 3; echo 2; sleep 3; echo 3")
     )
 
-    assert "1" in obs.raw_output  # First number should appear before timeout
+    assert "1" in get_output_text(obs)  # First number should appear before timeout
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
     assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
     assert obs.metadata.suffix == get_no_change_timeout_suffix(2)
@@ -185,7 +185,7 @@ def test_long_running_command_follow_by_execute():
     # Continue watching output
     obs = session.execute(ExecuteBashAction(command="", is_input=True))
 
-    assert "2" in obs.raw_output
+    assert "2" in get_output_text(obs)
     assert obs.metadata.prefix == "[Below is the output of the previous command.]\n"
     assert obs.metadata.suffix == get_no_change_timeout_suffix(2)
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
@@ -194,7 +194,7 @@ def test_long_running_command_follow_by_execute():
     # Test command that produces no output
     obs = session.execute(ExecuteBashAction(command="sleep 15"))
 
-    assert "3" not in obs.raw_output
+    assert "3" not in get_output_text(obs)
     assert obs.metadata.prefix == "[Below is the output of the previous command.]\n"
     assert "The previous command is still running" in obs.metadata.suffix
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
@@ -205,8 +205,8 @@ def test_long_running_command_follow_by_execute():
     # Run it again, this time it should produce output and then start a new command
     obs = session.execute(ExecuteBashAction(command="sleep 15"))
 
-    assert (
-        "3" in obs.raw_output
+    assert "3" in get_output_text(
+        obs
     )  # Should see the final output from the previous command
     assert obs.metadata.exit_code == -1  # -1 indicates new command is still running
     assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
@@ -229,7 +229,7 @@ def test_interactive_command(terminal_type):
         )
     )
 
-    assert "Enter name:" in obs.raw_output
+    assert "Enter name:" in get_output_text(obs)
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
     assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
     assert obs.metadata.suffix == get_no_change_timeout_suffix(3)
@@ -238,7 +238,7 @@ def test_interactive_command(terminal_type):
     # Send input
     obs = session.execute(ExecuteBashAction(command="John", is_input=True))
 
-    assert "Hello John" in obs.raw_output
+    assert "Hello John" in get_output_text(obs)
     assert obs.metadata.exit_code == 0
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
     assert obs.metadata.prefix == ""
@@ -268,7 +268,7 @@ def test_interactive_command(terminal_type):
 
     obs = session.execute(ExecuteBashAction(command="EOF", is_input=True))
 
-    assert "line 1" in obs.raw_output and "line 2" in obs.raw_output
+    assert "line 1" in get_output_text(obs) and "line 2" in get_output_text(obs)
     assert obs.metadata.exit_code == 0
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
     assert obs.metadata.prefix == ""
@@ -289,7 +289,7 @@ def test_ctrl_c(terminal_type):
         ExecuteBashAction(command="while true; do echo 'looping'; sleep 3; done"),
     )
 
-    assert "looping" in obs.raw_output
+    assert "looping" in get_output_text(obs)
     assert obs.metadata.suffix == get_no_change_timeout_suffix(2)
     assert obs.metadata.prefix == ""
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
@@ -320,7 +320,7 @@ def test_empty_command_error(terminal_type):
     obs = session.execute(ExecuteBashAction(command=""))
 
     assert obs.has_error is True
-    assert obs.raw_output == ""  # When there's an error, output should not be populated
+    assert not obs.output  # When there's an error, output should not be populated
     assert obs.error == "No previous running command to retrieve logs from."
     assert len(obs.to_llm_content) == 1
     assert isinstance(obs.to_llm_content[0], TextContent)
@@ -359,22 +359,22 @@ def test_command_output_continuation(terminal_type):
     if session.prev_status == TerminalCommandStatus.COMPLETED:
         # If the command completed immediately, verify we got all the output
         logger.info("Command completed immediately", extra={"msg_type": "TEST_INFO"})
-        assert "1" in obs.raw_output
-        assert "2" in obs.raw_output
-        assert "3" in obs.raw_output
-        assert "4" in obs.raw_output
-        assert "5" in obs.raw_output
+        assert "1" in get_output_text(obs)
+        assert "2" in get_output_text(obs)
+        assert "3" in get_output_text(obs)
+        assert "4" in get_output_text(obs)
+        assert "5" in get_output_text(obs)
         assert "[The command completed with exit code 0.]" in obs.metadata.suffix
     else:
         # If the command timed out, verify we got the timeout message
         assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
-        assert "1" in obs.raw_output
+        assert "1" in get_output_text(obs)
         assert "[The command has no new output after 1 seconds." in obs.metadata.suffix
 
         # Continue getting output until we see all numbers
         numbers_seen = set()
         for i in range(1, 6):
-            if str(i) in obs.raw_output:
+            if str(i) in get_output_text(obs):
                 numbers_seen.add(i)
 
         # We need to see numbers 2-5 and then the command completion
@@ -386,7 +386,7 @@ def test_command_output_continuation(terminal_type):
 
             # Check for numbers in the output
             for i in range(1, 6):
-                if str(i) in obs.raw_output and i not in numbers_seen:
+                if str(i) in get_output_text(obs) and i not in numbers_seen:
                     numbers_seen.add(i)
                     logger.info(
                         f"Found number {i} in output", extra={"msg_type": "TEST_INFO"}
@@ -426,8 +426,8 @@ def test_long_output(terminal_type):
         ExecuteBashAction(command='for i in {1..5000}; do echo "Line $i"; done')
     )
 
-    assert "Line 1" in obs.raw_output
-    assert "Line 5000" in obs.raw_output
+    assert "Line 1" in get_output_text(obs)
+    assert "Line 5000" in get_output_text(obs)
     assert obs.metadata.exit_code == 0
     assert obs.metadata.prefix == ""
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
@@ -446,8 +446,8 @@ def test_long_output_exceed_history_limit(terminal_type):
     )
 
     assert "Previous command outputs are truncated" in obs.metadata.prefix
-    assert "Line 40000" in obs.raw_output
-    assert "Line 50000" in obs.raw_output
+    assert "Line 40000" in get_output_text(obs)
+    assert "Line 50000" in get_output_text(obs)
     assert obs.metadata.exit_code == 0
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
 
@@ -467,7 +467,7 @@ def test_multiline_command():
         )
     )
 
-    assert "inside if" in obs.raw_output
+    assert "inside if" in get_output_text(obs)
     assert obs.metadata.exit_code == 0
     assert obs.metadata.prefix == ""
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
@@ -491,21 +491,21 @@ def test_python_interactive_input(terminal_type):
     # Start Python with the interactive script
     obs = session.execute(ExecuteBashAction(command=f'python3 -c "{python_script}"'))
 
-    assert "Enter your name:" in obs.raw_output
+    assert "Enter your name:" in get_output_text(obs)
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
     assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
 
     # Send first input (name)
     obs = session.execute(ExecuteBashAction(command="Alice", is_input=True))
 
-    assert "Enter your age:" in obs.raw_output
+    assert "Enter your age:" in get_output_text(obs)
     assert obs.metadata.exit_code == -1
     assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
 
     # Send second input (age)
     obs = session.execute(ExecuteBashAction(command="25", is_input=True))
 
-    assert "Hello Alice, you are 25 years old" in obs.raw_output
+    assert "Hello Alice, you are 25 years old" in get_output_text(obs)
     assert obs.metadata.exit_code == 0
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
     assert session.prev_status == TerminalCommandStatus.COMPLETED
@@ -518,7 +518,8 @@ def _run_bash_action(session, command: str, **kwargs):
     action = ExecuteBashAction(command=command, **kwargs)
     obs = session.execute(action)
     logger.info(f"Command: {command}")
-    logger.info(f"Output: {obs.raw_output}")
+    output_text = get_output_text(obs) if obs.output else ""
+    logger.info(f"Output: {output_text}")
     logger.info(f"Exit code: {obs.metadata.exit_code}")
     return obs
 
@@ -538,12 +539,12 @@ def test_bash_server(terminal_type):
                 session, "python -u -m http.server 8081", timeout=1.0
             )
             assert obs.metadata.exit_code == -1
-            assert "Serving HTTP on" in obs.raw_output
+            assert "Serving HTTP on" in get_output_text(obs)
 
             # Send Ctrl+C to interrupt
             obs = _run_bash_action(session, "C-c", is_input=True)
             assert "CTRL+C was sent" in obs.metadata.suffix
-            assert "Keyboard interrupt received, exiting." in obs.raw_output
+            assert "Keyboard interrupt received, exiting." in get_output_text(obs)
 
             # Verify we can run commands after interrupt
             obs = _run_bash_action(session, "ls")
@@ -554,7 +555,7 @@ def test_bash_server(terminal_type):
                 session, "python -u -m http.server 8081", timeout=1.0
             )
             assert obs.metadata.exit_code == -1
-            assert "Serving HTTP on" in obs.raw_output
+            assert "Serving HTTP on" in get_output_text(obs)
 
         finally:
             session.close()
@@ -581,7 +582,7 @@ def test_bash_background_server(terminal_type):
             obs = _run_bash_action(session, f"curl http://localhost:{server_port}")
             assert obs.metadata.exit_code == 0
             # Check for content typical of python http.server directory listing
-            assert "Directory listing for" in obs.raw_output
+            assert "Directory listing for" in get_output_text(obs)
 
             # Kill the server
             obs = _run_bash_action(session, 'pkill -f "http.server"')
@@ -604,17 +605,17 @@ def test_multiline_commands(terminal_type):
             # single multiline command
             obs = _run_bash_action(session, 'echo \\\n -e "foo"')
             assert obs.metadata.exit_code == 0
-            assert "foo" in obs.raw_output
+            assert "foo" in get_output_text(obs)
 
             # test multiline echo
             obs = _run_bash_action(session, 'echo -e "hello\nworld"')
             assert obs.metadata.exit_code == 0
-            assert "hello\nworld" in obs.raw_output
+            assert "hello\nworld" in get_output_text(obs)
 
             # test whitespace
             obs = _run_bash_action(session, 'echo -e "a\\n\\n\\nz"')
             assert obs.metadata.exit_code == 0
-            assert "\n\n\n" in obs.raw_output
+            assert "\n\n\n" in get_output_text(obs)
         finally:
             session.close()
 
@@ -637,7 +638,7 @@ def test_complex_commands(terminal_type):
         try:
             obs = _run_bash_action(session, cmd)
             assert obs.metadata.exit_code == 0
-            assert "Got 3 heads in a row after 3 flips!" in obs.raw_output
+            assert "Got 3 heads in a row after 3 flips!" in get_output_text(obs)
         finally:
             session.close()
 
@@ -654,8 +655,8 @@ def test_no_ps2_in_output(terminal_type):
             obs = _run_bash_action(session, 'echo -e "hello\nworld"')
             assert obs.metadata.exit_code == 0
 
-            assert "hello\nworld" in obs.raw_output
-            assert ">" not in obs.raw_output
+            assert "hello\nworld" in get_output_text(obs)
+            assert ">" not in get_output_text(obs)
         finally:
             session.close()
 
@@ -685,11 +686,11 @@ def test_multiline_command_loop(terminal_type):
         try:
             obs = _run_bash_action(session, init_cmd)
             assert obs.metadata.exit_code == 0
-            assert "created files" in obs.raw_output
+            assert "created files" in get_output_text(obs)
 
             obs = _run_bash_action(session, follow_up_cmd)
             assert obs.metadata.exit_code == 0
-            assert "success" in obs.raw_output
+            assert "success" in get_output_text(obs)
         finally:
             session.close()
 
@@ -727,7 +728,7 @@ def test_multiple_multiline_commands(terminal_type):
             for cmd in cmds:
                 obs = _run_bash_action(session, cmd)
                 assert obs.metadata.exit_code == 0
-                results.append(obs.raw_output)
+                results.append(get_output_text(obs))
 
             # Verify all expected outputs are present
             assert "total 0" in results[0]  # ls -l
@@ -760,21 +761,21 @@ def test_cmd_run(terminal_type):
 
             obs = _run_bash_action(session, "ls -l")
             assert obs.metadata.exit_code == 0
-            assert "total 0" in obs.raw_output
+            assert "total 0" in get_output_text(obs)
 
             obs = _run_bash_action(session, "mkdir test")
             assert obs.metadata.exit_code == 0
 
             obs = _run_bash_action(session, "ls -l")
             assert obs.metadata.exit_code == 0
-            assert "test" in obs.raw_output
+            assert "test" in get_output_text(obs)
 
             obs = _run_bash_action(session, "touch test/foo.txt")
             assert obs.metadata.exit_code == 0
 
             obs = _run_bash_action(session, "ls -l test")
             assert obs.metadata.exit_code == 0
-            assert "foo.txt" in obs.raw_output
+            assert "foo.txt" in get_output_text(obs)
 
             # clean up
             _run_bash_action(session, "rm -rf test")
@@ -796,7 +797,7 @@ def test_run_as_user_correct_home_dir(terminal_type):
             obs = _run_bash_action(session, "cd ~ && pwd")
             assert obs.metadata.exit_code == 0
             home = os.getenv("HOME")
-            assert home and home in obs.raw_output
+            assert home and home in get_output_text(obs)
         finally:
             session.close()
 
@@ -811,8 +812,8 @@ def test_multi_cmd_run_in_single_line(terminal_type):
             # Original Linux version using &&
             obs = _run_bash_action(session, "pwd && ls -l")
             assert obs.metadata.exit_code == 0
-            assert temp_dir in obs.raw_output
-            assert "total 0" in obs.raw_output
+            assert temp_dir in get_output_text(obs)
+            assert "total 0" in get_output_text(obs)
         finally:
             session.close()
 
@@ -835,7 +836,7 @@ def test_stateful_cmd(terminal_type):
 
             obs = _run_bash_action(session, "pwd")
             assert obs.metadata.exit_code == 0
-            assert f"{temp_dir}/test" in obs.raw_output.strip()
+            assert f"{temp_dir}/test" in get_output_text(obs).strip()
         finally:
             session.close()
 
@@ -866,7 +867,7 @@ def test_python_version(terminal_type):
         try:
             obs = _run_bash_action(session, "python --version")
             assert obs.metadata.exit_code == 0
-            assert "Python 3" in obs.raw_output
+            assert "Python 3" in get_output_text(obs)
         finally:
             session.close()
 
@@ -886,7 +887,7 @@ def test_pwd_property(terminal_type):
 
             obs = _run_bash_action(session, "cd random_dir && pwd")
             assert obs.metadata.exit_code == 0
-            assert "random_dir" in obs.raw_output
+            assert "random_dir" in get_output_text(obs)
         finally:
             session.close()
 
@@ -915,10 +916,10 @@ def test_long_output_from_nested_directories(terminal_type):
             assert obs.metadata.exit_code == 0
 
             # Verify output contains expected files
-            assert "folder_1" in obs.raw_output
-            assert "file_1.txt" in obs.raw_output
-            assert "folder_100" in obs.raw_output
-            assert "file_100.txt" in obs.raw_output
+            assert "folder_1" in get_output_text(obs)
+            assert "file_1.txt" in get_output_text(obs)
+            assert "folder_100" in get_output_text(obs)
+            assert "file_100.txt" in get_output_text(obs)
         finally:
             session.close()
 
@@ -952,7 +953,7 @@ def test_command_backslash(terminal_type):
             )
             obs = _run_bash_action(session, cmd)
             assert obs.metadata.exit_code == 0
-            assert "/tmp/test_dir/file_1.txt" in obs.raw_output
+            assert "/tmp/test_dir/file_1.txt" in get_output_text(obs)
         finally:
             session.close()
 
@@ -976,7 +977,7 @@ def test_bash_remove_prefix(terminal_type):
             # Check git remote - same for both platforms
             obs = _run_bash_action(session, "git remote -v")
             assert obs.metadata.exit_code == 0
-            assert "https://github.com/OpenHands/OpenHands" in obs.raw_output
-            assert "git remote -v" not in obs.raw_output
+            assert "https://github.com/OpenHands/OpenHands" in get_output_text(obs)
+            assert "git remote -v" not in get_output_text(obs)
         finally:
             session.close()
diff --git a/tests/tools/execute_bash/test_bash_tool.py b/tests/tools/execute_bash/test_bash_tool.py
index c6a34259f9..cc04a57452 100644
--- a/tests/tools/execute_bash/test_bash_tool.py
+++ b/tests/tools/execute_bash/test_bash_tool.py
@@ -14,6 +14,7 @@
     ExecuteBashAction,
     ExecuteBashObservation,
 )
+from tests.tools.execute_bash.conftest import get_output_text
 
 
 def _create_test_conv_state(temp_dir: str) -> ConversationState:
@@ -69,7 +70,7 @@ def test_bash_tool_execution():
         # Check the result
         assert result is not None
         assert isinstance(result, ExecuteBashObservation)
-        assert "Hello, World!" in result.raw_output
+        assert "Hello, World!" in get_output_text(result)
 
 
 def test_bash_tool_working_directory():
@@ -87,7 +88,7 @@ def test_bash_tool_working_directory():
 
         # Check that the working directory is correct
         assert isinstance(result, ExecuteBashObservation)
-        assert temp_dir in result.raw_output
+        assert temp_dir in get_output_text(result)
 
 
 def test_bash_tool_to_openai_tool():
diff --git a/tests/tools/execute_bash/test_bash_tool_auto_detection.py b/tests/tools/execute_bash/test_bash_tool_auto_detection.py
index 75911d924e..5e3de19914 100644
--- a/tests/tools/execute_bash/test_bash_tool_auto_detection.py
+++ b/tests/tools/execute_bash/test_bash_tool_auto_detection.py
@@ -18,6 +18,7 @@
     TerminalSession,
     TmuxTerminal,
 )
+from tests.tools.execute_bash.conftest import get_output_text
 
 
 def _create_conv_state(working_dir: str) -> ConversationState:
@@ -52,7 +53,7 @@ def test_default_auto_detection():
         # Test that it works
         action = ExecuteBashAction(command="echo 'Auto-detection test'")
         obs = executor(action)
-        assert "Auto-detection test" in obs.raw_output
+        assert "Auto-detection test" in get_output_text(obs)
 
 
 def test_forced_terminal_types():
@@ -138,7 +139,7 @@ def test_backward_compatibility():
         assert tool.executor is not None
         action = ExecuteBashAction(command="echo 'Backward compatibility test'")
         obs = tool.executor(action)
-        assert "Backward compatibility test" in obs.raw_output
+        assert "Backward compatibility test" in get_output_text(obs)
         assert obs.metadata.exit_code == 0
 
 
diff --git a/tests/tools/execute_bash/test_observation_truncation.py b/tests/tools/execute_bash/test_observation_truncation.py
index 33df408141..75fee8597e 100644
--- a/tests/tools/execute_bash/test_observation_truncation.py
+++ b/tests/tools/execute_bash/test_observation_truncation.py
@@ -18,7 +18,7 @@ def test_execute_bash_observation_truncation_under_limit():
     )
 
     observation = ExecuteBashObservation(
-        raw_output="Short output",
+        output=[TextContent(text="Short output")],
         metadata=metadata,
         error=None,
     )
@@ -52,7 +52,7 @@ def test_execute_bash_observation_truncation_over_limit():
     long_output = "A" * (MAX_CMD_OUTPUT_SIZE + 1000)
 
     observation = ExecuteBashObservation(
-        raw_output=long_output,
+        output=[TextContent(text=long_output)],
         metadata=metadata,
         error=None,
     )
@@ -89,7 +89,7 @@ def test_execute_bash_observation_truncation_with_error():
     long_output = "B" * (MAX_CMD_OUTPUT_SIZE + 500)
 
     observation = ExecuteBashObservation(
-        raw_output=long_output,
+        output=[TextContent(text=long_output)],
         metadata=metadata,
         error="Command failed",
     )
@@ -124,7 +124,7 @@ def test_execute_bash_observation_truncation_exact_limit():
     exact_output = "C" * exact_output_size
 
     observation = ExecuteBashObservation(
-        raw_output=exact_output,
+        output=[TextContent(text=exact_output)],
         metadata=metadata,
         error=None,
     )
@@ -154,7 +154,7 @@ def test_execute_bash_observation_truncation_with_prefix_suffix():
     long_output = "D" * (MAX_CMD_OUTPUT_SIZE + 200)
 
     observation = ExecuteBashObservation(
-        raw_output=long_output,
+        output=[TextContent(text=long_output)],
         metadata=metadata,
         error=None,
     )
diff --git a/tests/tools/execute_bash/test_secrets_masking.py b/tests/tools/execute_bash/test_secrets_masking.py
index 3924d781e1..69c8507480 100644
--- a/tests/tools/execute_bash/test_secrets_masking.py
+++ b/tests/tools/execute_bash/test_secrets_masking.py
@@ -8,8 +8,10 @@
 from openhands.sdk.agent import Agent
 from openhands.sdk.conversation import Conversation
 from openhands.sdk.llm import LLM
+from openhands.sdk.tool.schema import TextContent
 from openhands.tools.execute_bash import ExecuteBashAction, ExecuteBashObservation
 from openhands.tools.execute_bash.impl import BashExecutor
+from tests.tools.execute_bash.conftest import get_output_text
 
 
 def test_bash_executor_without_conversation():
@@ -24,8 +26,8 @@ def test_bash_executor_without_conversation():
             result = executor(action)
 
             # Check that the output is not masked (no conversation provided)
-            assert "secret-value-123" in result.raw_output
-            assert "<secret-hidden>" not in result.raw_output
+            assert "secret-value-123" in get_output_text(result)
+            assert "<secret-hidden>" not in get_output_text(result)
 
         finally:
             executor.close()
@@ -62,7 +64,9 @@ def test_bash_executor_with_conversation_secrets():
             mock_observation = ExecuteBashObservation(
                 cmd="echo 'Token: $SECRET_TOKEN, Key: $API_KEY'",
                 exit_code=0,
-                raw_output="Token: secret-value-123, Key: another-secret-456",
+                output=[
+                    TextContent(text="Token: secret-value-123, Key: another-secret-456")
+                ],
             )
             mock_session.execute.return_value = mock_observation
             executor.session = mock_session
@@ -77,10 +81,10 @@ def test_bash_executor_with_conversation_secrets():
             assert mock_session.execute.called
 
             # Check that both secrets were masked in the output
-            assert "secret-value-123" not in result.raw_output
-            assert "another-secret-456" not in result.raw_output
+            assert "secret-value-123" not in get_output_text(result)
+            assert "another-secret-456" not in get_output_text(result)
             # SecretsManager uses <secret-hidden> as the mask
-            assert "<secret-hidden>" in result.raw_output
+            assert "<secret-hidden>" in get_output_text(result)
 
         finally:
             executor.close()

From 01479d7fdada458f2562ac4a5e8a97384148bad3 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 31 Oct 2025 14:31:29 +0000
Subject: [PATCH 25/76] fix: add raw_output property to ExecuteBashObservation
 and update tests

- Added raw_output property to ExecuteBashObservation for backward compatibility
  - Property returns the text of the first item in the output field when that
    item is a TextContent, and an empty string otherwise
  - Enables tests and code to access raw output text seamlessly

- Updated test files to use correct observation field names
  - Changed test_stuck_detector.py to use output=[TextContent(text=...)] instead of raw_output=
  - Tests can still access raw_output via the property for assertions

- All previously failing tests now pass:
  - test_mask_secrets
  - test_mask_changing_secrets
  - test_masking_persists
  - test_history_too_short
  - test_repeating_action_observation_not_stuck_less_than_4_repeats
  - test_repeating_action_observation_stuck
  - test_not_stuck_with_different_actions
  - test_reset_after_user_message

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../openhands/tools/execute_bash/definition.py       |  9 +++++++++
 tests/cross/test_stuck_detector.py                   | 12 ++++++------
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index 26cb972380..afe3fe32bf 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -102,6 +102,15 @@ def command_id(self) -> int | None:
         """Get the command ID from metadata."""
         return self.metadata.pid
 
+    @property
+    def raw_output(self) -> str:
+        """Return the raw output text for backward compatibility.
+
+        Extracts the text from the first TextContent item in output.
+        """
+        first_item = self.output[0] if self.output else None
+        return first_item.text if isinstance(first_item, TextContent) else ""
+
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.error:
diff --git a/tests/cross/test_stuck_detector.py b/tests/cross/test_stuck_detector.py
index a355f47939..68c676e79d 100644
--- a/tests/cross/test_stuck_detector.py
+++ b/tests/cross/test_stuck_detector.py
@@ -59,7 +59,7 @@ def test_history_too_short():
     observation = ObservationEvent(
         source="environment",
         observation=ExecuteBashObservation(
-            raw_output="file1.txt\nfile2.txt", cmd="ls", exit_code=0
+            output=[TextContent(text="file1.txt\nfile2.txt")], cmd="ls", exit_code=0
         ),
         action_id=action.id,
         tool_name="execute_bash",
@@ -108,7 +108,7 @@ def test_repeating_action_observation_not_stuck_less_than_4_repeats():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                raw_output="file1.txt\nfile2.txt", cmd="ls", exit_code=0
+                output=[TextContent(text="file1.txt\nfile2.txt")], cmd="ls", exit_code=0
             ),
             action_id=action.id,
             tool_name="execute_bash",
@@ -157,7 +157,7 @@ def test_repeating_action_observation_stuck():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                raw_output="file1.txt\nfile2.txt", cmd="ls", exit_code=0
+                output=[TextContent(text="file1.txt\nfile2.txt")], cmd="ls", exit_code=0
             ),
             action_id=action.id,
             tool_name="execute_bash",
@@ -298,7 +298,7 @@ def test_not_stuck_with_different_actions():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                raw_output=f"output from {cmd}", cmd=cmd, exit_code=0
+                output=[TextContent(text=f"output from {cmd}")], cmd=cmd, exit_code=0
             ),
             action_id=action.id,
             tool_name="execute_bash",
@@ -347,7 +347,7 @@ def test_reset_after_user_message():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                raw_output="file1.txt\nfile2.txt", cmd="ls", exit_code=0
+                output=[TextContent(text="file1.txt\nfile2.txt")], cmd="ls", exit_code=0
             ),
             action_id=action.id,
             tool_name="execute_bash",
@@ -390,7 +390,7 @@ def test_reset_after_user_message():
     observation = ObservationEvent(
         source="environment",
         observation=ExecuteBashObservation(
-            raw_output="/home/user", cmd="pwd", exit_code=0
+            output=[TextContent(text="/home/user")], cmd="pwd", exit_code=0
         ),
         action_id=action.id,
         tool_name="execute_bash",

From 6285b8587a257eceae3405ac23918a56911f6335 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 31 Oct 2025 14:57:24 +0000
Subject: [PATCH 26/76] refactor: remove command property from base Observation
 class

- Removed the command property from base Observation, along with the
  "Executed Command: ..." line that to_llm_content emitted for it
- Renamed cmd field to command in all Observation subclasses:
  * ExecuteBashObservation
  * FileEditorObservation
  * DelegateObservation
  * TaskTrackerObservation
- Updated all executor implementations to use command instead of cmd
- Updated test files to use command field consistently
- Fixed subprocess.TimeoutExpired to use correct parameter name (cmd)

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/tool/schema.py    | 10 ----------
 .../openhands/tools/delegate/definition.py    |  7 +------
 .../openhands/tools/delegate/impl.py          | 18 ++++++++---------
 .../tools/execute_bash/definition.py          |  9 +++------
 .../openhands/tools/execute_bash/impl.py      |  2 +-
 .../execute_bash/terminal/terminal_session.py | 14 ++++++-------
 .../openhands/tools/file_editor/definition.py |  7 +------
 .../openhands/tools/file_editor/editor.py     | 14 ++++++-------
 .../openhands/tools/file_editor/impl.py       |  6 +++---
 .../tools/task_tracker/definition.py          | 15 +++++---------
 tests/cross/test_stuck_detector.py            | 20 +++++++++++++------
 tests/tools/delegation/test_delegation.py     |  6 +++---
 .../execute_bash/test_bash_ps1_metadata.py    |  4 ++--
 .../execute_bash/test_secrets_masking.py      |  2 +-
 .../tools/file_editor/test_visualize_diff.py  | 16 +++++++--------
 15 files changed, 65 insertions(+), 85 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index fa4e563c68..c436bf8569 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -207,14 +207,6 @@ class Observation(Schema, ABC):
         default=None, description="Error message if operation failed"
     )
 
-    @property
-    def command(self) -> str | None:
-        """
-        The command that was executed to produce this observation.
-        Subclasses can override to provide the actual command run.
-        """
-        return None
-
     @property
     def has_error(self) -> bool:
         """
@@ -240,8 +232,6 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         Errors can be partial so both output and error are included if present.
         """
         llm_content: list[TextContent | ImageContent] = []
-        if self.command:
-            llm_content.append(TextContent(text=f"Executed Command: {self.command}\n"))
         if self.error:
             llm_content.append(self.format_error())
         if self.output:
diff --git a/openhands-tools/openhands/tools/delegate/definition.py b/openhands-tools/openhands/tools/delegate/definition.py
index f1d84459e2..1712523624 100644
--- a/openhands-tools/openhands/tools/delegate/definition.py
+++ b/openhands-tools/openhands/tools/delegate/definition.py
@@ -43,12 +43,7 @@ class DelegateAction(Action):
 class DelegateObservation(Observation):
     """Observation from delegation operations."""
 
-    cmd: CommandLiteral = Field(description="The command that was executed")
-
-    @property
-    def command(self) -> CommandLiteral:
-        """Return the command that was executed, type-narrowed to CommandLiteral."""
-        return self.cmd
+    command: CommandLiteral = Field(description="The command that was executed")
 
 
 TOOL_DESCRIPTION = """Delegation tool for spawning sub-agents and delegating tasks to them.
diff --git a/openhands-tools/openhands/tools/delegate/impl.py b/openhands-tools/openhands/tools/delegate/impl.py
index e169fe994c..e436cf0439 100644
--- a/openhands-tools/openhands/tools/delegate/impl.py
+++ b/openhands-tools/openhands/tools/delegate/impl.py
@@ -60,7 +60,7 @@ def __call__(  # type: ignore[override]
             return self._delegate_tasks(action)
         else:
             return DelegateObservation(
-                cmd=action.command,
+                command=action.command,
                 error=f"Unsupported command: {action.command}. "
                 "Available commands: spawn, delegate",
             )
@@ -77,13 +77,13 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
         """
         if not action.ids:
             return DelegateObservation(
-                cmd=action.command,
+                command=action.command,
                 error="At least one ID is required for spawn action",
             )
 
         if len(self._sub_agents) + len(action.ids) > self._max_children:
             return DelegateObservation(
-                cmd=action.command,
+                command=action.command,
                 error=(
                     f"Cannot spawn {len(action.ids)} agents. "
                     f"Already have {len(self._sub_agents)} agents, "
@@ -118,14 +118,14 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
             agent_list = ", ".join(action.ids)
             message = f"Successfully spawned {len(action.ids)} sub-agents: {agent_list}"
             return DelegateObservation(
-                cmd=action.command,
+                command=action.command,
                 output=[TextContent(text=message)],
             )
 
         except Exception as e:
             logger.error(f"Error: failed to spawn agents: {e}", exc_info=True)
             return DelegateObservation(
-                cmd=action.command,
+                command=action.command,
                 error=f"failed to spawn agents: {str(e)}",
             )
 
@@ -142,7 +142,7 @@ def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
         """
         if not action.tasks:
             return DelegateObservation(
-                cmd=action.command,
+                command=action.command,
                 error="at least one task is required for delegate action",
             )
 
@@ -150,7 +150,7 @@ def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
         missing_agents = set(action.tasks.keys()) - set(self._sub_agents.keys())
         if missing_agents:
             return DelegateObservation(
-                cmd=action.command,
+                command=action.command,
                 error=(
                     f"sub-agents not found: {', '.join(missing_agents)}. "
                     f"Available agents: {', '.join(self._sub_agents.keys())}"
@@ -224,13 +224,13 @@ def run_task(agent_id: str, conversation: LocalConversation, task: str):
                 output_text += f"\n\nResults:\n{results_text}"
 
             return DelegateObservation(
-                cmd=action.command,
+                command=action.command,
                 output=[TextContent(text=output_text)],
             )
 
         except Exception as e:
             logger.error(f"Failed to delegate tasks: {e}", exc_info=True)
             return DelegateObservation(
-                cmd=action.command,
+                command=action.command,
                 error=f"failed to delegate tasks: {str(e)}",
             )
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index afe3fe32bf..6759f64dc7 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -79,7 +79,9 @@ def visualize(self) -> Text:
 class ExecuteBashObservation(Observation):
     """A ToolResult that can be rendered as a CLI output."""
 
-    cmd: str | None = Field(default=None, description="The command that was executed")
+    command: str | None = Field(
+        default=None, description="The command that was executed"
+    )
     exit_code: int | None = Field(
         default=None,
         description="The exit code of the command. -1 indicates the process hit the soft timeout and is not yet finished.",  # noqa
@@ -92,11 +94,6 @@ class ExecuteBashObservation(Observation):
         description="Additional metadata captured from PS1 after command execution.",
     )
 
-    @property
-    def command(self) -> str | None:
-        """Return the command that was executed."""
-        return self.cmd
-
     @property
     def command_id(self) -> int | None:
         """Get the command ID from metadata."""
diff --git a/openhands-tools/openhands/tools/execute_bash/impl.py b/openhands-tools/openhands/tools/execute_bash/impl.py
index 8893e3064b..e7e1a76e8f 100644
--- a/openhands-tools/openhands/tools/execute_bash/impl.py
+++ b/openhands-tools/openhands/tools/execute_bash/impl.py
@@ -121,7 +121,7 @@ def reset(self) -> ExecuteBashObservation:
                     )
                 )
             ],
-            cmd="[RESET]",
+            command="[RESET]",
             exit_code=0,
         )
 
diff --git a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
index 1bbcee98a3..6ed1171bbf 100644
--- a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
+++ b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
@@ -188,7 +188,7 @@ def _handle_completed_command(
         self.prev_output = ""  # Reset previous command output
         self._ready_for_next_command()
         return ExecuteBashObservation(
-            cmd=command,
+            command=command,
             output=[TextContent(text=command_output)],
             metadata=metadata,
         )
@@ -222,7 +222,7 @@ def _handle_nochange_timeout_command(
             continue_prefix="[Below is the output of the previous command.]\n",
         )
         return ExecuteBashObservation(
-            cmd=command,
+            command=command,
             output=[TextContent(text=command_output)],
             metadata=metadata,
         )
@@ -257,7 +257,7 @@ def _handle_hard_timeout_command(
             continue_prefix="[Below is the output of the previous command.]\n",
         )
         return ExecuteBashObservation(
-            cmd=command,
+            command=command,
             output=[TextContent(text=command_output)],
             metadata=metadata,
         )
@@ -314,12 +314,12 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
         }:
             if command == "":
                 return ExecuteBashObservation(
-                    cmd=command,
+                    command=command,
                     error="No previous running command to retrieve logs from.",
                 )
             if is_input:
                 return ExecuteBashObservation(
-                    cmd=command,
+                    command=command,
                     error="No previous running command to interact with.",
                 )
 
@@ -330,7 +330,7 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
                 f"({i + 1}) {cmd}" for i, cmd in enumerate(splited_commands)
             )
             return ExecuteBashObservation(
-                cmd=command,
+                command=command,
                 error=(
                     "Cannot execute multiple commands at once.\n"
                     "Please run each command separately OR chain them into a single "
@@ -387,7 +387,7 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
                 continue_prefix="[Below is the output of the previous command.]\n",
             )
             obs = ExecuteBashObservation(
-                cmd=command,
+                command=command,
                 output=[TextContent(text=command_output)],
                 metadata=metadata,
             )
diff --git a/openhands-tools/openhands/tools/file_editor/definition.py b/openhands-tools/openhands/tools/file_editor/definition.py
index b8db6bf4e1..0c6c9f3d26 100644
--- a/openhands-tools/openhands/tools/file_editor/definition.py
+++ b/openhands-tools/openhands/tools/file_editor/definition.py
@@ -65,18 +65,13 @@ class FileEditorAction(Action):
 class FileEditorObservation(Observation):
     """A ToolResult that can be rendered as a CLI output."""
 
-    cmd: CommandLiteral = Field(
+    command: CommandLiteral = Field(
         description=(
             "The command that was run: `view`, `create`, `str_replace`, "
             "`insert`, or `undo_edit`."
         )
     )
 
-    @property
-    def command(self) -> CommandLiteral:
-        """Return the command that was executed, type-narrowed to CommandLiteral."""
-        return self.cmd
-
     path: str | None = Field(default=None, description="The file path that was edited.")
     prev_exist: bool = Field(
         default=True,
diff --git a/openhands-tools/openhands/tools/file_editor/editor.py b/openhands-tools/openhands/tools/file_editor/editor.py
index f7517bc360..2e21c62b2d 100644
--- a/openhands-tools/openhands/tools/file_editor/editor.py
+++ b/openhands-tools/openhands/tools/file_editor/editor.py
@@ -110,7 +110,7 @@ def __call__(
             self.write_file(_path, file_text)
             self._history_manager.add_history(_path, file_text)
             return FileEditorObservation(
-                cmd=command,
+                command=command,
                 path=str(_path),
                 new_content=file_text,
                 prev_exist=False,
@@ -255,7 +255,7 @@ def str_replace(
             "file again if necessary."
         )
         return FileEditorObservation(
-            cmd="str_replace",
+            command="str_replace",
             output=[TextContent(text=success_message)],
             prev_exist=True,
             path=str(path),
@@ -315,7 +315,7 @@ def view(
                     )
                 stdout = "\n".join(msg)
             return FileEditorObservation(
-                cmd="view",
+                command="view",
                 output=[TextContent(text=stdout)],
                 error=stderr,
                 path=str(path),
@@ -332,7 +332,7 @@ def view(
             output = self._make_output(file_content, str(path), start_line)
 
             return FileEditorObservation(
-                cmd="view",
+                command="view",
                 output=[TextContent(text=output)],
                 path=str(path),
                 prev_exist=True,
@@ -385,7 +385,7 @@ def view(
             output = f"NOTE: {warning_message}\n{output}"
 
         return FileEditorObservation(
-            cmd="view",
+            command="view",
             path=str(path),
             output=[TextContent(text=output)],
             prev_exist=True,
@@ -498,7 +498,7 @@ def insert(
             "indentation, no duplicate lines, etc). Edit the file again if necessary."
         )
         return FileEditorObservation(
-            cmd="insert",
+            command="insert",
             output=[TextContent(text=success_message)],
             prev_exist=True,
             path=str(path),
@@ -567,7 +567,7 @@ def undo_edit(self, path: Path) -> FileEditorObservation:
         self.write_file(path, old_text)
 
         return FileEditorObservation(
-            cmd="undo_edit",
+            command="undo_edit",
             output=[
                 TextContent(
                     text=(
diff --git a/openhands-tools/openhands/tools/file_editor/impl.py b/openhands-tools/openhands/tools/file_editor/impl.py
index 309ae6e7f5..afc1f7906d 100644
--- a/openhands-tools/openhands/tools/file_editor/impl.py
+++ b/openhands-tools/openhands/tools/file_editor/impl.py
@@ -44,7 +44,7 @@ def __call__(
             action_path = Path(action.path).resolve()
             if action_path not in self.allowed_edits_files:
                 return FileEditorObservation(
-                    cmd=action.command,
+                    command=action.command,
                     error=f"Operation '{action.command}' is not allowed "
                     f"on file '{action_path}'. "
                     f"Only the following files can be edited: "
@@ -63,7 +63,7 @@ def __call__(
                 insert_line=action.insert_line,
             )
         except ToolError as e:
-            result = FileEditorObservation(cmd=action.command, error=e.message)
+            result = FileEditorObservation(command=action.command, error=e.message)
         assert result is not None, "file_editor should always return a result"
         return result
 
@@ -95,6 +95,6 @@ def file_editor(
             insert_line=insert_line,
         )
     except ToolError as e:
-        result = FileEditorObservation(cmd=command, error=e.message)
+        result = FileEditorObservation(command=command, error=e.message)
     assert result is not None, "file_editor should always return a result"
     return result
diff --git a/openhands-tools/openhands/tools/task_tracker/definition.py b/openhands-tools/openhands/tools/task_tracker/definition.py
index a3c0fba5fc..9e9ca4552b 100644
--- a/openhands-tools/openhands/tools/task_tracker/definition.py
+++ b/openhands-tools/openhands/tools/task_tracker/definition.py
@@ -70,18 +70,13 @@ def visualize(self) -> Text:
 class TaskTrackerObservation(Observation):
     """This data class represents the result of a task tracking operation."""
 
-    cmd: Literal["view", "plan"] = Field(
+    command: Literal["view", "plan"] = Field(
         description='The command that was executed: "view" or "plan".'
     )
     task_list: list[TaskItem] = Field(
         default_factory=list, description="The current task list"
     )
 
-    @property
-    def command(self) -> Literal["view", "plan"]:
-        """Return the command that was executed, type-narrowed to Literal."""
-        return self.cmd
-
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation with task list formatting."""
@@ -183,7 +178,7 @@ def __call__(
                         )
                     )
                 ],
-                cmd=action.command,
+                command=action.command,
                 task_list=self._task_list,
             )
         elif action.command == "view":
@@ -198,13 +193,13 @@ def __call__(
                             )
                         )
                     ],
-                    cmd=action.command,
+                    command=action.command,
                     task_list=[],
                 )
             output = self._format_task_list(self._task_list)
             return TaskTrackerObservation(
                 output=[TextContent(text=output)],
-                cmd=action.command,
+                command=action.command,
                 task_list=self._task_list,
             )
         else:
@@ -213,7 +208,7 @@ def __call__(
                     f"Unknown command: {action.command}. "
                     'Supported commands are "view" and "plan".'
                 ),
-                cmd=action.command,
+                command=action.command,
                 task_list=[],
             )
 
diff --git a/tests/cross/test_stuck_detector.py b/tests/cross/test_stuck_detector.py
index 68c676e79d..e3d9a30e8d 100644
--- a/tests/cross/test_stuck_detector.py
+++ b/tests/cross/test_stuck_detector.py
@@ -59,7 +59,7 @@ def test_history_too_short():
     observation = ObservationEvent(
         source="environment",
         observation=ExecuteBashObservation(
-            output=[TextContent(text="file1.txt\nfile2.txt")], cmd="ls", exit_code=0
+            output=[TextContent(text="file1.txt\nfile2.txt")], command="ls", exit_code=0
         ),
         action_id=action.id,
         tool_name="execute_bash",
@@ -108,7 +108,9 @@ def test_repeating_action_observation_not_stuck_less_than_4_repeats():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                output=[TextContent(text="file1.txt\nfile2.txt")], cmd="ls", exit_code=0
+                output=[TextContent(text="file1.txt\nfile2.txt")],
+                command="ls",
+                exit_code=0,
             ),
             action_id=action.id,
             tool_name="execute_bash",
@@ -157,7 +159,9 @@ def test_repeating_action_observation_stuck():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                output=[TextContent(text="file1.txt\nfile2.txt")], cmd="ls", exit_code=0
+                output=[TextContent(text="file1.txt\nfile2.txt")],
+                command="ls",
+                exit_code=0,
             ),
             action_id=action.id,
             tool_name="execute_bash",
@@ -298,7 +302,9 @@ def test_not_stuck_with_different_actions():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                output=[TextContent(text=f"output from {cmd}")], cmd=cmd, exit_code=0
+                output=[TextContent(text=f"output from {cmd}")],
+                command=cmd,
+                exit_code=0,
             ),
             action_id=action.id,
             tool_name="execute_bash",
@@ -347,7 +353,9 @@ def test_reset_after_user_message():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                output=[TextContent(text="file1.txt\nfile2.txt")], cmd="ls", exit_code=0
+                output=[TextContent(text="file1.txt\nfile2.txt")],
+                command="ls",
+                exit_code=0,
             ),
             action_id=action.id,
             tool_name="execute_bash",
@@ -390,7 +398,7 @@ def test_reset_after_user_message():
     observation = ObservationEvent(
         source="environment",
         observation=ExecuteBashObservation(
-            output=[TextContent(text="/home/user")], cmd="pwd", exit_code=0
+            output=[TextContent(text="/home/user")], command="pwd", exit_code=0
         ),
         action_id=action.id,
         tool_name="execute_bash",
diff --git a/tests/tools/delegation/test_delegation.py b/tests/tools/delegation/test_delegation.py
index d94b5307df..f61b9f519c 100644
--- a/tests/tools/delegation/test_delegation.py
+++ b/tests/tools/delegation/test_delegation.py
@@ -67,7 +67,7 @@ def test_delegate_observation_creation():
     """Test creating DelegateObservation instances."""
     # Test spawn observation
     spawn_observation = DelegateObservation(
-        cmd="spawn",
+        command="spawn",
         output=[TextContent(text="spawn: Sub-agents created successfully")],
     )
     assert len(spawn_observation.output) == 1
@@ -77,7 +77,7 @@ def test_delegate_observation_creation():
 
     # Test delegate observation
     delegate_observation = DelegateObservation(
-        cmd="delegate",
+        command="delegate",
         output=[
             TextContent(
                 text=(
@@ -114,7 +114,7 @@ def test_delegate_executor_delegate():
 
     with patch.object(executor, "_delegate_tasks") as mock_delegate:
         mock_observation = DelegateObservation(
-            cmd="delegate",
+            command="delegate",
             output=[
                 TextContent(
                     text=(
diff --git a/tests/tools/execute_bash/test_bash_ps1_metadata.py b/tests/tools/execute_bash/test_bash_ps1_metadata.py
index d3bee61c6b..44d36c68cd 100644
--- a/tests/tools/execute_bash/test_bash_ps1_metadata.py
+++ b/tests/tools/execute_bash/test_bash_ps1_metadata.py
@@ -274,7 +274,7 @@ def test_cmd_output_observation_properties():
     # Test with successful command
     metadata = CmdOutputMetadata(exit_code=0, pid=123)
     obs = ExecuteBashObservation(
-        cmd="ls",
+        command="ls",
         output=[TextContent(text="file1\nfile2")],
         exit_code=0,
         metadata=metadata,
@@ -292,7 +292,7 @@ def test_cmd_output_observation_properties():
     # Test with failed command
     metadata = CmdOutputMetadata(exit_code=1, pid=456)
     obs = ExecuteBashObservation(
-        cmd="invalid",
+        command="invalid",
         exit_code=1,
         error="Command failed",
         metadata=metadata,
diff --git a/tests/tools/execute_bash/test_secrets_masking.py b/tests/tools/execute_bash/test_secrets_masking.py
index 69c8507480..23c6b6cdc1 100644
--- a/tests/tools/execute_bash/test_secrets_masking.py
+++ b/tests/tools/execute_bash/test_secrets_masking.py
@@ -62,7 +62,7 @@ def test_bash_executor_with_conversation_secrets():
             mock_session = Mock()
             # session.execute returns ExecuteBashObservation
             mock_observation = ExecuteBashObservation(
-                cmd="echo 'Token: $SECRET_TOKEN, Key: $API_KEY'",
+                command="echo 'Token: $SECRET_TOKEN, Key: $API_KEY'",
                 exit_code=0,
                 output=[
                     TextContent(text="Token: secret-value-123, Key: another-secret-456")
diff --git a/tests/tools/file_editor/test_visualize_diff.py b/tests/tools/file_editor/test_visualize_diff.py
index e316a88fa5..3ad341e6a6 100644
--- a/tests/tools/file_editor/test_visualize_diff.py
+++ b/tests/tools/file_editor/test_visualize_diff.py
@@ -20,7 +20,7 @@ def test_visualize_diff_simple_replacement():
     return True"""
 
     observation = FileEditorObservation(
-        cmd="str_replace",
+        command="str_replace",
         path="/test/file.py",
         old_content=old_content,
         new_content=new_content,
@@ -50,7 +50,7 @@ def test_visualize_diff_no_changes():
     return True"""
 
     observation = FileEditorObservation(
-        cmd="str_replace",
+        command="str_replace",
         path="/test/file.py",
         old_content=content,
         new_content=content,
@@ -93,7 +93,7 @@ def main():
     calculate(x, y)"""
 
     observation = FileEditorObservation(
-        cmd="str_replace",
+        command="str_replace",
         path="/test/calc.py",
         old_content=old_content,
         new_content=new_content,
@@ -121,7 +121,7 @@ def test_visualize_diff_attempted_edit():
     new_content = "new line"
 
     observation = FileEditorObservation(
-        cmd="str_replace",
+        command="str_replace",
         path="/test/file.py",
         old_content=old_content,
         new_content=new_content,
@@ -149,7 +149,7 @@ def test_visualize_diff_caching():
     new_content = "new line"
 
     observation = FileEditorObservation(
-        cmd="str_replace",
+        command="str_replace",
         path="/test/file.py",
         old_content=old_content,
         new_content=new_content,
@@ -190,7 +190,7 @@ def test_visualize_diff_custom_context_lines():
 line7"""
 
     observation = FileEditorObservation(
-        cmd="str_replace",
+        command="str_replace",
         path="/test/file.py",
         old_content=old_content,
         new_content=new_content,
@@ -232,7 +232,7 @@ def test_get_edit_groups():
 line3"""
 
     observation = FileEditorObservation(
-        cmd="str_replace",
+        command="str_replace",
         path="/test/file.py",
         old_content=old_content,
         new_content=new_content,
@@ -280,7 +280,7 @@ def test_get_edit_groups_no_content():
 def test_visualize_diff_none_content():
     """Test visualize_diff when content is None."""
     observation = FileEditorObservation(
-        cmd="str_replace",
+        command="str_replace",
         path="/test/file.py",
         old_content=None,
         new_content=None,

From 6470d0ab97b224034a0042ac07365365e833159b Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 31 Oct 2025 16:10:53 +0000
Subject: [PATCH 27/76] fix: update planning_file_editor to use command instead
 of cmd

- Updated PlanningFileEditorObservation initialization to use command field
- Aligns with FileEditorObservation parent class field naming

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-tools/openhands/tools/planning_file_editor/impl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openhands-tools/openhands/tools/planning_file_editor/impl.py b/openhands-tools/openhands/tools/planning_file_editor/impl.py
index 822e7f266c..1b2f45a43c 100644
--- a/openhands-tools/openhands/tools/planning_file_editor/impl.py
+++ b/openhands-tools/openhands/tools/planning_file_editor/impl.py
@@ -59,7 +59,7 @@ def __call__(
 
         # Convert FileEditorObservation to PlanningFileEditorObservation
         return PlanningFileEditorObservation(
-            cmd=action.command,
+            command=action.command,
             output=file_editor_obs.output,
             error=file_editor_obs.error,
         )

From 2d12008cdc393d70c25d8a806e0f8aff439f6658 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 31 Oct 2025 16:27:24 +0000
Subject: [PATCH 28/76] fix: use 'command' field instead of 'cmd' when bash
 reset is combined with a command

- Fixed model_copy update to use 'command' field instead of 'cmd'
- Ensures [RESET] prefix is properly included in command field

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-tools/openhands/tools/execute_bash/impl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openhands-tools/openhands/tools/execute_bash/impl.py b/openhands-tools/openhands/tools/execute_bash/impl.py
index e7e1a76e8f..33078dc120 100644
--- a/openhands-tools/openhands/tools/execute_bash/impl.py
+++ b/openhands-tools/openhands/tools/execute_bash/impl.py
@@ -161,7 +161,7 @@ def __call__(
                 observation = command_result.model_copy(
                     update={
                         "output": [TextContent(text=f"{reset_text}\n\n{command_text}")],
-                        "cmd": f"[RESET] {action.command}",
+                        "command": f"[RESET] {action.command}",
                     }
                 )
             else:

From b252b156825c7a0ed0dd1a763faf4667d7c84920 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Sat, 1 Nov 2025 09:30:22 +0000
Subject: [PATCH 29/76] Add ergonomic helpers for standardized 'output' field

- Added Observation.text_output() static method for easy text wrapping
- Added output_as_text property for easy text extraction
- Updated ExecuteBash, Glob, and the custom tools example to use new helpers
- Reduces ceremony of wrapping/unwrapping TextContent

This addresses the typing complexity while maintaining the standardized
'output' field across all observations (vs 'message', 'content', etc.)

Co-authored-by: openhands <openhands@all-hands.dev>
---
 examples/01_standalone_sdk/02_custom_tools.py |  3 +-
 openhands-sdk/openhands/sdk/tool/schema.py    | 36 +++++++++++++++----
 .../tools/execute_bash/definition.py          | 11 +++---
 openhands-tools/openhands/tools/glob/impl.py  |  4 +--
 4 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/examples/01_standalone_sdk/02_custom_tools.py b/examples/01_standalone_sdk/02_custom_tools.py
index cfd1133d4c..ee3e2dc976 100644
--- a/examples/01_standalone_sdk/02_custom_tools.py
+++ b/examples/01_standalone_sdk/02_custom_tools.py
@@ -93,8 +93,7 @@ def __call__(self, action: GrepAction, conversation=None) -> GrepObservation:  #
         files: set[str] = set()
 
         # grep returns exit code 1 when no matches; treat as empty
-        first_item = result.output[0] if result.output else None
-        output_text = first_item.text if isinstance(first_item, TextContent) else ""
+        output_text = result.output_as_text
         if output_text.strip():
             for line in output_text.strip().splitlines():
                 matches.append(line)
diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index c436bf8569..1974d21539 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -194,7 +194,13 @@ class ObservationStatus(str, Enum):
 
 
 class Observation(Schema, ABC):
-    """Base schema for output observation."""
+    """Base schema for output observation.
+
+    All observations use a standardized 'output' field (not 'message', 'content', etc.)
+    to represent what the tool produced. For simple text output (most tools), use the
+    text_output() helper. For rich content with images (Browser, MCP), build the list
+    directly.
+    """
 
     output: list[TextContent | ImageContent] = Field(
         default_factory=list,
@@ -207,17 +213,35 @@ class Observation(Schema, ABC):
         default=None, description="Error message if operation failed"
     )
 
-    @property
-    def has_error(self) -> bool:
+    @staticmethod
+    def text_output(text: str) -> list[TextContent | ImageContent]:
+        """Helper to create output from plain text.
+
+        Use this for the common case of text-only output:
+            return MyObservation(output=Observation.text_output("result"))
+
+        Instead of manually wrapping:
+            return MyObservation(output=[TextContent(text="result")])
         """
-        Check if the observation indicates an error.
+        return [TextContent(text=text)]
+
+    @property
+    def output_as_text(self) -> str:
+        """Extract output as plain text.
+
+        Convenience property for text-only observations. Concatenates all
+        TextContent items, ignoring images.
         """
+        return "".join(c.text for c in self.output if isinstance(c, TextContent))
+
+    @property
+    def has_error(self) -> bool:
+        """Check if the observation indicates an error."""
         return bool(self.error)
 
     @property
     def result_status(self) -> ObservationStatus:
-        """
-        Get the observation result status based on presence of error."""
+        """Get the observation result status based on presence of error."""
         return ObservationStatus.ERROR if self.has_error else ObservationStatus.SUCCESS
 
     def format_error(self) -> TextContent:
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index 6759f64dc7..a3ca371e39 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -103,10 +103,9 @@ def command_id(self) -> int | None:
     def raw_output(self) -> str:
         """Return the raw output text for backward compatibility.
 
-        Extracts the text from the first TextContent item in output.
+        Extracts the text from output using the helper property.
         """
-        first_item = self.output[0] if self.output else None
-        return first_item.text if isinstance(first_item, TextContent) else ""
+        return self.output_as_text
 
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
@@ -114,8 +113,7 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
             error_msg = f"{self.metadata.prefix}{self.error}{self.metadata.suffix}"
             return [TextContent(text=f"Tool Execution Error: {error_msg}")]
 
-        first_item = self.output[0] if self.output else None
-        output_text = first_item.text if isinstance(first_item, TextContent) else ""
+        output_text = self.output_as_text
         ret = f"{self.metadata.prefix}{output_text}{self.metadata.suffix}"
         if self.metadata.working_dir:
             ret += f"\n[Current working directory: {self.metadata.working_dir}]"
@@ -136,8 +134,7 @@ def visualize(self) -> Text:
             content.append("Command execution error\n", style="red")
 
         # Add command output with proper styling
-        first_item = self.output[0] if self.output else None
-        output_text = first_item.text if isinstance(first_item, TextContent) else ""
+        output_text = self.output_as_text
         if output_text:
             # Style the output based on content
             output_lines = output_text.split("\n")
diff --git a/openhands-tools/openhands/tools/glob/impl.py b/openhands-tools/openhands/tools/glob/impl.py
index 5ebcf21263..824c2df0af 100644
--- a/openhands-tools/openhands/tools/glob/impl.py
+++ b/openhands-tools/openhands/tools/glob/impl.py
@@ -8,7 +8,7 @@
 from typing import TYPE_CHECKING
 
 from openhands.sdk.tool import ToolExecutor
-from openhands.sdk.tool.schema import TextContent
+from openhands.sdk.tool.schema import Observation
 
 
 if TYPE_CHECKING:
@@ -103,7 +103,7 @@ def __call__(
                 pattern=original_pattern,
                 search_path=str(search_path),
                 truncated=truncated,
-                output=[TextContent(text=output)],
+                output=Observation.text_output(output),
             )
 
         except Exception as e:

From 21d2d56453ad87d7bb2eac9e7dde5c6447948e4a Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Sat, 1 Nov 2025 10:36:16 +0100
Subject: [PATCH 30/76] Revert "Add ergonomic helpers for standardized 'output'
 field"

This reverts commit b252b156825c7a0ed0dd1a763faf4667d7c84920.
---
 examples/01_standalone_sdk/02_custom_tools.py |  3 +-
 openhands-sdk/openhands/sdk/tool/schema.py    | 36 ++++---------------
 .../tools/execute_bash/definition.py          | 11 +++---
 openhands-tools/openhands/tools/glob/impl.py  |  4 +--
 4 files changed, 17 insertions(+), 37 deletions(-)

diff --git a/examples/01_standalone_sdk/02_custom_tools.py b/examples/01_standalone_sdk/02_custom_tools.py
index ee3e2dc976..cfd1133d4c 100644
--- a/examples/01_standalone_sdk/02_custom_tools.py
+++ b/examples/01_standalone_sdk/02_custom_tools.py
@@ -93,7 +93,8 @@ def __call__(self, action: GrepAction, conversation=None) -> GrepObservation:  #
         files: set[str] = set()
 
         # grep returns exit code 1 when no matches; treat as empty
-        output_text = result.output_as_text
+        first_item = result.output[0] if result.output else None
+        output_text = first_item.text if isinstance(first_item, TextContent) else ""
         if output_text.strip():
             for line in output_text.strip().splitlines():
                 matches.append(line)
diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index 1974d21539..c436bf8569 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -194,13 +194,7 @@ class ObservationStatus(str, Enum):
 
 
 class Observation(Schema, ABC):
-    """Base schema for output observation.
-
-    All observations use a standardized 'output' field (not 'message', 'content', etc.)
-    to represent what the tool produced. For simple text output (most tools), use the
-    text_output() helper. For rich content with images (Browser, MCP), build the list
-    directly.
-    """
+    """Base schema for output observation."""
 
     output: list[TextContent | ImageContent] = Field(
         default_factory=list,
@@ -213,35 +207,17 @@ class Observation(Schema, ABC):
         default=None, description="Error message if operation failed"
     )
 
-    @staticmethod
-    def text_output(text: str) -> list[TextContent | ImageContent]:
-        """Helper to create output from plain text.
-
-        Use this for the common case of text-only output:
-            return MyObservation(output=Observation.text_output("result"))
-
-        Instead of manually wrapping:
-            return MyObservation(output=[TextContent(text="result")])
-        """
-        return [TextContent(text=text)]
-
-    @property
-    def output_as_text(self) -> str:
-        """Extract output as plain text.
-
-        Convenience property for text-only observations. Concatenates all
-        TextContent items, ignoring images.
-        """
-        return "".join(c.text for c in self.output if isinstance(c, TextContent))
-
     @property
     def has_error(self) -> bool:
-        """Check if the observation indicates an error."""
+        """
+        Check if the observation indicates an error.
+        """
         return bool(self.error)
 
     @property
     def result_status(self) -> ObservationStatus:
-        """Get the observation result status based on presence of error."""
+        """
+        Get the observation result status based on presence of error."""
         return ObservationStatus.ERROR if self.has_error else ObservationStatus.SUCCESS
 
     def format_error(self) -> TextContent:
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index a3ca371e39..6759f64dc7 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -103,9 +103,10 @@ def command_id(self) -> int | None:
     def raw_output(self) -> str:
         """Return the raw output text for backward compatibility.
 
-        Extracts the text from output using the helper property.
+        Extracts the text from the first TextContent item in output.
         """
-        return self.output_as_text
+        first_item = self.output[0] if self.output else None
+        return first_item.text if isinstance(first_item, TextContent) else ""
 
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
@@ -113,7 +114,8 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
             error_msg = f"{self.metadata.prefix}{self.error}{self.metadata.suffix}"
             return [TextContent(text=f"Tool Execution Error: {error_msg}")]
 
-        output_text = self.output_as_text
+        first_item = self.output[0] if self.output else None
+        output_text = first_item.text if isinstance(first_item, TextContent) else ""
         ret = f"{self.metadata.prefix}{output_text}{self.metadata.suffix}"
         if self.metadata.working_dir:
             ret += f"\n[Current working directory: {self.metadata.working_dir}]"
@@ -134,7 +136,8 @@ def visualize(self) -> Text:
             content.append("Command execution error\n", style="red")
 
         # Add command output with proper styling
-        output_text = self.output_as_text
+        first_item = self.output[0] if self.output else None
+        output_text = first_item.text if isinstance(first_item, TextContent) else ""
         if output_text:
             # Style the output based on content
             output_lines = output_text.split("\n")
diff --git a/openhands-tools/openhands/tools/glob/impl.py b/openhands-tools/openhands/tools/glob/impl.py
index 824c2df0af..5ebcf21263 100644
--- a/openhands-tools/openhands/tools/glob/impl.py
+++ b/openhands-tools/openhands/tools/glob/impl.py
@@ -8,7 +8,7 @@
 from typing import TYPE_CHECKING
 
 from openhands.sdk.tool import ToolExecutor
-from openhands.sdk.tool.schema import Observation
+from openhands.sdk.tool.schema import TextContent
 
 
 if TYPE_CHECKING:
@@ -103,7 +103,7 @@ def __call__(
                 pattern=original_pattern,
                 search_path=str(search_path),
                 truncated=truncated,
-                output=Observation.text_output(output),
+                output=[TextContent(text=output)],
             )
 
         except Exception as e:

From 9b1868ac0b30b60881f663ffa5297ccc2ff9b911 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Sun, 2 Nov 2025 14:11:51 +0000
Subject: [PATCH 31/76] feat: support str and list types for Observation.output

- Updated Observation.output type to str | list[TextContent | ImageContent]
- Most non-MCP tools now use simple str output for efficiency
- MCP tools continue to use list[TextContent | ImageContent] for rich content
- Updated to_llm_content to handle both types with isinstance checks
- Updated the ExecuteBash, FileEditor, and Browser observation subclasses to use str
- Updated test helper functions to handle both output types
- All pre-commit checks and tests passing

Co-authored-by: openhands <openhands@all-hands.dev>
---
 examples/01_standalone_sdk/02_custom_tools.py |  8 ++-
 openhands-sdk/openhands/sdk/tool/schema.py    | 15 +++--
 .../openhands/tools/browser_use/definition.py | 12 ++--
 .../openhands/tools/browser_use/impl.py       |  7 +--
 .../tools/execute_bash/definition.py          | 29 +++++----
 .../openhands/tools/execute_bash/impl.py      | 60 +++++++++++--------
 .../execute_bash/terminal/terminal_session.py |  9 ++-
 .../openhands/tools/file_editor/editor.py     | 25 ++++----
 .../browser_use/test_browser_executor_e2e.py  |  2 +
 tests/tools/execute_bash/conftest.py          |  4 +-
 tests/tools/file_editor/conftest.py           |  4 +-
 11 files changed, 102 insertions(+), 73 deletions(-)

diff --git a/examples/01_standalone_sdk/02_custom_tools.py b/examples/01_standalone_sdk/02_custom_tools.py
index cfd1133d4c..0f84bb7a06 100644
--- a/examples/01_standalone_sdk/02_custom_tools.py
+++ b/examples/01_standalone_sdk/02_custom_tools.py
@@ -93,8 +93,12 @@ def __call__(self, action: GrepAction, conversation=None) -> GrepObservation:  #
         files: set[str] = set()
 
         # grep returns exit code 1 when no matches; treat as empty
-        first_item = result.output[0] if result.output else None
-        output_text = first_item.text if isinstance(first_item, TextContent) else ""
+        if isinstance(result.output, str):
+            output_text = result.output
+        else:
+            first_item = result.output[0] if result.output else None
+            output_text = first_item.text if isinstance(first_item, TextContent) else ""
+
         if output_text.strip():
             for line in output_text.strip().splitlines():
                 matches.append(line)
diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index c436bf8569..fb7ac97749 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -196,11 +196,11 @@ class ObservationStatus(str, Enum):
 class Observation(Schema, ABC):
     """Base schema for output observation."""
 
-    output: list[TextContent | ImageContent] = Field(
-        default_factory=list,
+    output: str | list[TextContent | ImageContent] = Field(
+        default="",
         description=(
-            "Output returned from the tool converted to LLM Ready "
-            "TextContent or ImageContent"
+            "Output returned from the tool. Can be a simple string for most tools, "
+            "or a list of TextContent/ImageContent for tools that need rich content."
         ),
     )
     error: str | None = Field(
@@ -235,7 +235,12 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.error:
             llm_content.append(self.format_error())
         if self.output:
-            llm_content.extend(self.output)
+            # Handle both str and list types
+            if isinstance(self.output, str):
+                llm_content.append(TextContent(text=self.output))
+            else:
+                # It's a list of TextContent | ImageContent
+                llm_content.extend(self.output)
         return llm_content
 
     @property
diff --git a/openhands-tools/openhands/tools/browser_use/definition.py b/openhands-tools/openhands/tools/browser_use/definition.py
index 7ede04331d..bdaa4171c8 100644
--- a/openhands-tools/openhands/tools/browser_use/definition.py
+++ b/openhands-tools/openhands/tools/browser_use/definition.py
@@ -37,10 +37,14 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.error:
             return [self.format_error()]
 
-        # Extract text from output list
-        output_text = "".join(
-            [c.text for c in self.output if isinstance(c, TextContent)]
-        )
+        # Extract text from output (handle both str and list types)
+        if isinstance(self.output, str):
+            output_text = self.output
+        else:
+            output_text = "".join(
+                [c.text for c in self.output if isinstance(c, TextContent)]
+            )
+
         content: list[TextContent | ImageContent] = [
             TextContent(text=maybe_truncate(output_text, MAX_BROWSER_OUTPUT_SIZE))
         ]
diff --git a/openhands-tools/openhands/tools/browser_use/impl.py b/openhands-tools/openhands/tools/browser_use/impl.py
index 7e7d8cbb0c..1dd397e025 100644
--- a/openhands-tools/openhands/tools/browser_use/impl.py
+++ b/openhands-tools/openhands/tools/browser_use/impl.py
@@ -14,7 +14,6 @@
 
 from openhands.sdk.logger import DEBUG, get_logger
 from openhands.sdk.tool import ToolExecutor
-from openhands.sdk.tool.schema import TextContent
 from openhands.sdk.utils.async_executor import AsyncExecutor
 from openhands.tools.browser_use.definition import BrowserAction, BrowserObservation
 from openhands.tools.browser_use.server import CustomBrowserUseServer
@@ -228,7 +227,7 @@ async def _execute_action(self, action):
                 error_msg = f"Unsupported action type: {type(action)}"
                 return BrowserObservation(error=error_msg)
 
-            return BrowserObservation(output=[TextContent(text=result)])
+            return BrowserObservation(output=result)
         except Exception as e:
             error_msg = f"Browser operation failed: {str(e)}"
             logger.error(error_msg, exc_info=True)
@@ -283,14 +282,14 @@ async def get_state(self, include_screenshot: bool = False):
                 # Return clean JSON + separate screenshot data
                 clean_json = json.dumps(result_data, indent=2)
                 return BrowserObservation(
-                    output=[TextContent(text=clean_json)],
+                    output=clean_json,
                     screenshot_data=screenshot_data,
                 )
             except json.JSONDecodeError:
                 # If JSON parsing fails, return as-is
                 pass
 
-        return BrowserObservation(output=[TextContent(text=result_json)])
+        return BrowserObservation(output=result_json)
 
     # Tab Management
     async def list_tabs(self) -> str:
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index 6759f64dc7..5bb2bcd4b4 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -101,12 +101,12 @@ def command_id(self) -> int | None:
 
     @property
     def raw_output(self) -> str:
-        """Return the raw output text for backward compatibility.
-
-        Extracts the text from the first TextContent item in output.
-        """
-        first_item = self.output[0] if self.output else None
-        return first_item.text if isinstance(first_item, TextContent) else ""
+        """Return the raw output text for backward compatibility."""
+        if isinstance(self.output, str):
+            return self.output
+        else:
+            first_item = self.output[0] if self.output else None
+            return first_item.text if isinstance(first_item, TextContent) else ""
 
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
@@ -114,8 +114,13 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
             error_msg = f"{self.metadata.prefix}{self.error}{self.metadata.suffix}"
             return [TextContent(text=f"Tool Execution Error: {error_msg}")]
 
-        first_item = self.output[0] if self.output else None
-        output_text = first_item.text if isinstance(first_item, TextContent) else ""
+        # Handle both str and list types
+        if isinstance(self.output, str):
+            output_text = self.output
+        else:
+            first_item = self.output[0] if self.output else None
+            output_text = first_item.text if isinstance(first_item, TextContent) else ""
+
         ret = f"{self.metadata.prefix}{output_text}{self.metadata.suffix}"
         if self.metadata.working_dir:
             ret += f"\n[Current working directory: {self.metadata.working_dir}]"
@@ -136,8 +141,12 @@ def visualize(self) -> Text:
             content.append("Command execution error\n", style="red")
 
         # Add command output with proper styling
-        first_item = self.output[0] if self.output else None
-        output_text = first_item.text if isinstance(first_item, TextContent) else ""
+        if isinstance(self.output, str):
+            output_text = self.output
+        else:
+            first_item = self.output[0] if self.output else None
+            output_text = first_item.text if isinstance(first_item, TextContent) else ""
+
         if output_text:
             # Style the output based on content
             output_lines = output_text.split("\n")
diff --git a/openhands-tools/openhands/tools/execute_bash/impl.py b/openhands-tools/openhands/tools/execute_bash/impl.py
index 9cec73323e..9a81f5b501 100644
--- a/openhands-tools/openhands/tools/execute_bash/impl.py
+++ b/openhands-tools/openhands/tools/execute_bash/impl.py
@@ -113,14 +113,10 @@ def reset(self) -> ExecuteBashObservation:
         )
 
         return ExecuteBashObservation(
-            output=[
-                TextContent(
-                    text=(
-                        "Terminal session has been reset. All previous environment "
-                        "variables and session state have been cleared."
-                    )
-                )
-            ],
+            output=(
+                "Terminal session has been reset. All previous environment "
+                "variables and session state have been cleared."
+            ),
             command="[RESET]",
             exit_code=0,
         )
@@ -146,21 +142,31 @@ def __call__(
                 )
                 self._export_envs(command_action, conversation)
                 command_result = self.session.execute(command_action)
-                reset_text = (
-                    reset_result.output[0].text
-                    if reset_result.output
-                    and isinstance(reset_result.output[0], TextContent)
-                    else ""
-                )
-                command_text = (
-                    command_result.output[0].text
-                    if command_result.output
-                    and isinstance(command_result.output[0], TextContent)
-                    else ""
-                )
+
+                # Extract text from output (handle both str and list types)
+                if isinstance(reset_result.output, str):
+                    reset_text = reset_result.output
+                else:
+                    reset_text = (
+                        reset_result.output[0].text
+                        if reset_result.output
+                        and isinstance(reset_result.output[0], TextContent)
+                        else ""
+                    )
+
+                if isinstance(command_result.output, str):
+                    command_text = command_result.output
+                else:
+                    command_text = (
+                        command_result.output[0].text
+                        if command_result.output
+                        and isinstance(command_result.output[0], TextContent)
+                        else ""
+                    )
+
                 observation = command_result.model_copy(
                     update={
-                        "output": [TextContent(text=f"{reset_text}\n\n{command_text}")],
+                        "output": f"{reset_text}\n\n{command_text}",
                         "command": f"[RESET] {action.command}",
                     }
                 )
@@ -173,17 +179,19 @@ def __call__(
             observation = self.session.execute(action)
 
         # Apply automatic secrets masking
-        first_item = observation.output[0] if observation.output else None
-        output_text = first_item.text if isinstance(first_item, TextContent) else ""
+        if isinstance(observation.output, str):
+            output_text = observation.output
+        else:
+            first_item = observation.output[0] if observation.output else None
+            output_text = first_item.text if isinstance(first_item, TextContent) else ""
+
         if output_text and conversation is not None:
             try:
                 secret_registry = conversation.state.secret_registry
                 masked_output = secret_registry.mask_secrets_in_output(output_text)
                 if masked_output:
                     data = observation.model_dump(exclude={"output"})
-                    return ExecuteBashObservation(
-                        **data, output=[TextContent(text=masked_output)]
-                    )
+                    return ExecuteBashObservation(**data, output=masked_output)
             except Exception:
                 pass
 
diff --git a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
index 6ed1171bbf..909c2bc5b5 100644
--- a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
+++ b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
@@ -5,7 +5,6 @@
 from enum import Enum
 
 from openhands.sdk.logger import get_logger
-from openhands.sdk.tool.schema import TextContent
 from openhands.tools.execute_bash.constants import (
     CMD_OUTPUT_PS1_END,
     NO_CHANGE_TIMEOUT_SECONDS,
@@ -189,7 +188,7 @@ def _handle_completed_command(
         self._ready_for_next_command()
         return ExecuteBashObservation(
             command=command,
-            output=[TextContent(text=command_output)],
+            output=command_output,
             metadata=metadata,
         )
 
@@ -223,7 +222,7 @@ def _handle_nochange_timeout_command(
         )
         return ExecuteBashObservation(
             command=command,
-            output=[TextContent(text=command_output)],
+            output=command_output,
             metadata=metadata,
         )
 
@@ -258,7 +257,7 @@ def _handle_hard_timeout_command(
         )
         return ExecuteBashObservation(
             command=command,
-            output=[TextContent(text=command_output)],
+            output=command_output,
             metadata=metadata,
         )
 
@@ -388,7 +387,7 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
             )
             obs = ExecuteBashObservation(
                 command=command,
-                output=[TextContent(text=command_output)],
+                output=command_output,
                 metadata=metadata,
             )
             logger.debug(f"RETURNING OBSERVATION (previous-command): {obs}")
diff --git a/openhands-tools/openhands/tools/file_editor/editor.py b/openhands-tools/openhands/tools/file_editor/editor.py
index 2e21c62b2d..509bba6323 100644
--- a/openhands-tools/openhands/tools/file_editor/editor.py
+++ b/openhands-tools/openhands/tools/file_editor/editor.py
@@ -8,7 +8,6 @@
 from binaryornot.check import is_binary
 
 from openhands.sdk.logger import get_logger
-from openhands.sdk.tool.schema import TextContent
 from openhands.sdk.utils.truncate import maybe_truncate
 from openhands.tools.file_editor.definition import (
     CommandLiteral,
@@ -114,7 +113,7 @@ def __call__(
                 path=str(_path),
                 new_content=file_text,
                 prev_exist=False,
-                output=[TextContent(text=f"File created successfully at: {_path}")],
+                output=f"File created successfully at: {_path}",
             )
         elif command == "str_replace":
             if old_str is None:
@@ -256,7 +255,7 @@ def str_replace(
         )
         return FileEditorObservation(
             command="str_replace",
-            output=[TextContent(text=success_message)],
+            output=success_message,
             prev_exist=True,
             path=str(path),
             old_content=file_content,
@@ -316,7 +315,7 @@ def view(
                 stdout = "\n".join(msg)
             return FileEditorObservation(
                 command="view",
-                output=[TextContent(text=stdout)],
+                output=stdout,
                 error=stderr,
                 path=str(path),
                 prev_exist=True,
@@ -333,7 +332,7 @@ def view(
 
             return FileEditorObservation(
                 command="view",
-                output=[TextContent(text=output)],
+                output=output,
                 path=str(path),
                 prev_exist=True,
             )
@@ -387,7 +386,7 @@ def view(
         return FileEditorObservation(
             command="view",
             path=str(path),
-            output=[TextContent(text=output)],
+            output=output,
             prev_exist=True,
         )
 
@@ -499,7 +498,7 @@ def insert(
         )
         return FileEditorObservation(
             command="insert",
-            output=[TextContent(text=success_message)],
+            output=success_message,
             prev_exist=True,
             path=str(path),
             old_content=file_text,
@@ -568,14 +567,10 @@ def undo_edit(self, path: Path) -> FileEditorObservation:
 
         return FileEditorObservation(
             command="undo_edit",
-            output=[
-                TextContent(
-                    text=(
-                        f"Last edit to {path} undone successfully. "
-                        f"{self._make_output(old_text, str(path))}"
-                    )
-                )
-            ],
+            output=(
+                f"Last edit to {path} undone successfully. "
+                f"{self._make_output(old_text, str(path))}"
+            ),
             path=str(path),
             prev_exist=True,
             old_content=current_text,
diff --git a/tests/tools/browser_use/test_browser_executor_e2e.py b/tests/tools/browser_use/test_browser_executor_e2e.py
index 25f12a90b9..a7d0f0b54e 100644
--- a/tests/tools/browser_use/test_browser_executor_e2e.py
+++ b/tests/tools/browser_use/test_browser_executor_e2e.py
@@ -25,6 +25,8 @@
 
 def get_output_text(observation: BrowserObservation) -> str:
     """Extract text from observation output."""
+    if isinstance(observation.output, str):
+        return observation.output
     return "".join([c.text for c in observation.output if isinstance(c, TextContent)])
 
 
diff --git a/tests/tools/execute_bash/conftest.py b/tests/tools/execute_bash/conftest.py
index 8fdf11310e..e61ea22dc6 100644
--- a/tests/tools/execute_bash/conftest.py
+++ b/tests/tools/execute_bash/conftest.py
@@ -16,8 +16,10 @@ def get_output_text(obs: ExecuteBashObservation) -> str:
     """Extract text from observation output field.
 
     This helper handles type-safe extraction of text from the observation's
-    output field, which contains Content items (TextContent or ImageContent).
+    output field, which can be a str or list of Content items.
     """
+    if isinstance(obs.output, str):
+        return obs.output
     if not obs.output:
         return ""
     first_item = obs.output[0]
diff --git a/tests/tools/file_editor/conftest.py b/tests/tools/file_editor/conftest.py
index 2d37f9ab4a..107b226b33 100644
--- a/tests/tools/file_editor/conftest.py
+++ b/tests/tools/file_editor/conftest.py
@@ -79,5 +79,7 @@ def create_test_file(path: Path, content: str):
 
 
 def get_output_text(result: FileEditorObservation) -> str:
-    """Extract text content from a FileEditorObservation's output list."""
+    """Extract text content from a FileEditorObservation's output."""
+    if isinstance(result.output, str):
+        return result.output
     return "".join([c.text for c in result.output if isinstance(c, TextContent)])

From c47440514b74ba20f54e1937e85acd8cfecb2ecd Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Sun, 2 Nov 2025 14:40:28 +0000
Subject: [PATCH 32/76] refactor: update remaining tools to use str output
 instead of list

- Updated delegate, grep, glob, and task_tracker tools to use str output
- Simplified example 02_custom_tools.py to use assert for type narrowing
- Fixed test helper assert_browser_observation_success to handle both str and list[TextContent] output types
- Removed unused TextContent imports from tool implementations

This completes the migration to the new Observation.output type that supports both str and list[TextContent | ImageContent].

Co-authored-by: openhands <openhands@all-hands.dev>
---
 examples/01_standalone_sdk/02_custom_tools.py |  7 ++----
 .../openhands/tools/delegate/impl.py          |  5 ++--
 openhands-tools/openhands/tools/glob/impl.py  |  3 +--
 openhands-tools/openhands/tools/grep/impl.py  |  5 ++--
 .../tools/task_tracker/definition.py          | 25 ++++++-------------
 tests/tools/browser_use/conftest.py           |  9 ++++---
 6 files changed, 20 insertions(+), 34 deletions(-)

diff --git a/examples/01_standalone_sdk/02_custom_tools.py b/examples/01_standalone_sdk/02_custom_tools.py
index 0f84bb7a06..f2a81d33f6 100644
--- a/examples/01_standalone_sdk/02_custom_tools.py
+++ b/examples/01_standalone_sdk/02_custom_tools.py
@@ -93,11 +93,8 @@ def __call__(self, action: GrepAction, conversation=None) -> GrepObservation:  #
         files: set[str] = set()
 
         # grep returns exit code 1 when no matches; treat as empty
-        if isinstance(result.output, str):
-            output_text = result.output
-        else:
-            first_item = result.output[0] if result.output else None
-            output_text = first_item.text if isinstance(first_item, TextContent) else ""
+        assert isinstance(result.output, str)
+        output_text = result.output
 
         if output_text.strip():
             for line in output_text.strip().splitlines():
diff --git a/openhands-tools/openhands/tools/delegate/impl.py b/openhands-tools/openhands/tools/delegate/impl.py
index e436cf0439..9957a2ca02 100644
--- a/openhands-tools/openhands/tools/delegate/impl.py
+++ b/openhands-tools/openhands/tools/delegate/impl.py
@@ -5,7 +5,6 @@
 
 from openhands.sdk.conversation.impl.local_conversation import LocalConversation
 from openhands.sdk.conversation.response_utils import get_agent_final_response
-from openhands.sdk.llm import TextContent
 from openhands.sdk.logger import get_logger
 from openhands.sdk.tool.tool import ToolExecutor
 from openhands.tools.delegate.definition import DelegateObservation
@@ -119,7 +118,7 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
             message = f"Successfully spawned {len(action.ids)} sub-agents: {agent_list}"
             return DelegateObservation(
                 command=action.command,
-                output=[TextContent(text=message)],
+                output=message,
             )
 
         except Exception as e:
@@ -225,7 +224,7 @@ def run_task(agent_id: str, conversation: LocalConversation, task: str):
 
             return DelegateObservation(
                 command=action.command,
-                output=[TextContent(text=output_text)],
+                output=output_text,
             )
 
         except Exception as e:
diff --git a/openhands-tools/openhands/tools/glob/impl.py b/openhands-tools/openhands/tools/glob/impl.py
index 5ebcf21263..29f6cbbd1d 100644
--- a/openhands-tools/openhands/tools/glob/impl.py
+++ b/openhands-tools/openhands/tools/glob/impl.py
@@ -8,7 +8,6 @@
 from typing import TYPE_CHECKING
 
 from openhands.sdk.tool import ToolExecutor
-from openhands.sdk.tool.schema import TextContent
 
 
 if TYPE_CHECKING:
@@ -103,7 +102,7 @@ def __call__(
                 pattern=original_pattern,
                 search_path=str(search_path),
                 truncated=truncated,
-                output=[TextContent(text=output)],
+                output=output,
             )
 
         except Exception as e:
diff --git a/openhands-tools/openhands/tools/grep/impl.py b/openhands-tools/openhands/tools/grep/impl.py
index c38ca6a910..4f22e5f3c2 100644
--- a/openhands-tools/openhands/tools/grep/impl.py
+++ b/openhands-tools/openhands/tools/grep/impl.py
@@ -6,7 +6,6 @@
 from typing import TYPE_CHECKING
 
 from openhands.sdk.tool import ToolExecutor
-from openhands.sdk.tool.schema import TextContent
 
 
 if TYPE_CHECKING:
@@ -181,7 +180,7 @@ def _execute_with_ripgrep(
             search_path=str(search_path),
             include_pattern=action.include,
             truncated=truncated,
-            output=[TextContent(text=output)],
+            output=output,
         )
 
     def _execute_with_grep(
@@ -244,5 +243,5 @@ def _execute_with_grep(
             search_path=str(search_path),
             include_pattern=action.include,
             truncated=truncated,
-            output=[TextContent(text=output)],
+            output=output,
         )
diff --git a/openhands-tools/openhands/tools/task_tracker/definition.py b/openhands-tools/openhands/tools/task_tracker/definition.py
index 9e9ca4552b..f7fb628ef4 100644
--- a/openhands-tools/openhands/tools/task_tracker/definition.py
+++ b/openhands-tools/openhands/tools/task_tracker/definition.py
@@ -11,7 +11,6 @@
     from openhands.sdk.conversation.state import ConversationState
 from rich.text import Text
 
-from openhands.sdk.llm import TextContent
 from openhands.sdk.logger import get_logger
 from openhands.sdk.tool import (
     Action,
@@ -170,14 +169,9 @@ def __call__(
             if self.save_dir:
                 self._save_tasks()
             return TaskTrackerObservation(
-                output=[
-                    TextContent(
-                        text=(
-                            f"Task list has been updated with "
-                            f"{len(self._task_list)} item(s)."
-                        )
-                    )
-                ],
+                output=(
+                    f"Task list has been updated with {len(self._task_list)} item(s)."
+                ),
                 command=action.command,
                 task_list=self._task_list,
             )
@@ -185,20 +179,15 @@ def __call__(
             # Return the current task list
             if not self._task_list:
                 return TaskTrackerObservation(
-                    output=[
-                        TextContent(
-                            text=(
-                                "No task list found. Use the "
-                                '"plan" command to create one.'
-                            )
-                        )
-                    ],
+                    output=(
+                        'No task list found. Use the "plan" command to create one.'
+                    ),
                     command=action.command,
                     task_list=[],
                 )
             output = self._format_task_list(self._task_list)
             return TaskTrackerObservation(
-                output=[TextContent(text=output)],
+                output=output,
                 command=action.command,
                 task_list=self._task_list,
             )
diff --git a/tests/tools/browser_use/conftest.py b/tests/tools/browser_use/conftest.py
index 422cc8dc35..2e189137f5 100644
--- a/tests/tools/browser_use/conftest.py
+++ b/tests/tools/browser_use/conftest.py
@@ -43,9 +43,12 @@ def assert_browser_observation_success(
     assert isinstance(observation, BrowserObservation)
     assert observation.error is None
     if expected_output:
-        output_text = "".join(
-            [c.text for c in observation.output if isinstance(c, TextContent)]
-        )
+        if isinstance(observation.output, str):
+            output_text = observation.output
+        else:
+            output_text = "".join(
+                [c.text for c in observation.output if isinstance(c, TextContent)]
+            )
         assert expected_output in output_text
 
 

From 23bb43d05ed177c383b3854aa52f51e22d315b72 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 3 Nov 2025 08:01:01 +0000
Subject: [PATCH 33/76] fix: update delegation tests to work with standardized
 Observation base

The new Observation base class now uses output: str | list[TextContent | ImageContent]
instead of just list[TextContent | ImageContent]. The DelegateExecutor correctly returns observations
with output as a string, so the tests need to be updated to check the string directly
instead of treating it as a list and accessing output[0].

Changes:
- Updated test_delegate_observation_creation to create observations with string output
- Updated test_delegate_executor_delegate to check output as a string
- Verified to_llm_content property returns the expected TextContent list

Co-authored-by: openhands <openhands@all-hands.dev>
---
 tests/tools/delegation/test_delegation.py | 72 +++++++++++------------
 1 file changed, 34 insertions(+), 38 deletions(-)

diff --git a/tests/tools/delegation/test_delegation.py b/tests/tools/delegation/test_delegation.py
index f61b9f519c..6ab4114edd 100644
--- a/tests/tools/delegation/test_delegation.py
+++ b/tests/tools/delegation/test_delegation.py
@@ -65,34 +65,36 @@ def test_delegate_action_creation():
 
 def test_delegate_observation_creation():
     """Test creating DelegateObservation instances."""
-    # Test spawn observation
+    # Test spawn observation with string output
     spawn_observation = DelegateObservation(
         command="spawn",
-        output=[TextContent(text="spawn: Sub-agents created successfully")],
+        output="spawn: Sub-agents created successfully",
     )
-    assert len(spawn_observation.output) == 1
-    assert isinstance(spawn_observation.output[0], TextContent)
-    assert spawn_observation.output[0].text == "spawn: Sub-agents created successfully"
-    # spawn observation doesn't have results field anymore
-
-    # Test delegate observation
+    assert isinstance(spawn_observation.output, str)
+    assert spawn_observation.output == "spawn: Sub-agents created successfully"
+    # Verify to_llm_content returns TextContent
+    llm_content = spawn_observation.to_llm_content
+    assert len(llm_content) == 1
+    assert isinstance(llm_content[0], TextContent)
+    assert llm_content[0].text == "spawn: Sub-agents created successfully"
+
+    # Test delegate observation with string output
     delegate_observation = DelegateObservation(
         command="delegate",
-        output=[
-            TextContent(
-                text=(
-                    "delegate: Tasks completed successfully\n\nResults:\n"
-                    "1. Result 1\n2. Result 2"
-                )
-            )
-        ],
+        output=(
+            "delegate: Tasks completed successfully\n\nResults:\n"
+            "1. Result 1\n2. Result 2"
+        ),
     )
-    assert len(delegate_observation.output) == 1
-    output_block = delegate_observation.output[0]
-    assert isinstance(output_block, TextContent)
-    assert "Tasks completed successfully" in output_block.text
-    assert "Result 1" in output_block.text
-    assert "Result 2" in output_block.text
+    assert isinstance(delegate_observation.output, str)
+    assert "Tasks completed successfully" in delegate_observation.output
+    assert "Result 1" in delegate_observation.output
+    assert "Result 2" in delegate_observation.output
+    # Verify to_llm_content
+    llm_content = delegate_observation.to_llm_content
+    assert len(llm_content) == 1
+    assert isinstance(llm_content[0], TextContent)
+    assert "Tasks completed successfully" in llm_content[0].text
 
 
 def test_delegate_executor_delegate():
@@ -102,9 +104,8 @@ def test_delegate_executor_delegate():
     # First spawn some agents
     spawn_action = DelegateAction(command="spawn", ids=["agent1", "agent2"])
     spawn_observation = executor(spawn_action, parent_conversation)
-    output_block = spawn_observation.output[0]
-    assert isinstance(output_block, TextContent)
-    assert "Successfully spawned" in output_block.text
+    assert isinstance(spawn_observation.output, str)
+    assert "Successfully spawned" in spawn_observation.output
 
     # Then delegate tasks to them
     delegate_action = DelegateAction(
@@ -115,25 +116,20 @@ def test_delegate_executor_delegate():
     with patch.object(executor, "_delegate_tasks") as mock_delegate:
         mock_observation = DelegateObservation(
             command="delegate",
-            output=[
-                TextContent(
-                    text=(
-                        "delegate: Tasks completed successfully\n\nResults:\n"
-                        "1. Agent agent1: Code analysis complete\n"
-                        "2. Agent agent2: Tests written"
-                    )
-                )
-            ],
+            output=(
+                "delegate: Tasks completed successfully\n\nResults:\n"
+                "1. Agent agent1: Code analysis complete\n"
+                "2. Agent agent2: Tests written"
+            ),
         )
         mock_delegate.return_value = mock_observation
 
         observation = executor(delegate_action, parent_conversation)
 
     assert isinstance(observation, DelegateObservation)
-    obs_block = observation.output[0]
-    assert isinstance(obs_block, TextContent)
-    assert "Agent agent1: Code analysis complete" in obs_block.text
-    assert "Agent agent2: Tests written" in obs_block.text
+    assert isinstance(observation.output, str)
+    assert "Agent agent1: Code analysis complete" in observation.output
+    assert "Agent agent2: Tests written" in observation.output
 
 
 def test_delegate_executor_missing_task():

From 0e712a97d49f0306b378e7b201acd4941b3b329f Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 16:56:30 +0000
Subject: [PATCH 34/76] refactor: update Observation base class to use
 'content' and 'is_error' fields
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

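- Changed base Observation signature: renamed 'output' → 'content' and
  replaced 'error: str | None' with 'is_error: bool'
- Removed ObservationStatus enum class along with the has_error,
  result_status and format_error helpers on the base class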
- Updated to_llm_content to prepend 'Tool Execution Error. ' when is_error=True
- Updated all 10 observation subclasses to follow the new pattern:
  - ThinkObservation: adjusted executor and visualize override
  - FinishObservation: adjusted executor and visualize override
  - ExecuteBashObservation: removed raw_output property, adapted to new pattern
  - MCPToolObservation: adapted to new base pattern
  - BrowserObservation: adapted to new base pattern
  - FileEditorObservation: adapted to new pattern, fixed is_error references
  - GlobObservation: updated all error cases to use content= and is_error=True
  - GrepObservation: updated all cases to use content= field
  - TaskTrackerObservation: updated all observation returns
  - DelegateObservation: updated all observation returns to use content= and is_error=True
- Updated tests to use 'content' instead of 'raw_output'

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/mcp/definition.py | 66 +++++++++-----
 .../openhands/sdk/tool/builtins/finish.py     |  9 +-
 .../openhands/sdk/tool/builtins/think.py      |  7 +-
 openhands-sdk/openhands/sdk/tool/schema.py    | 52 ++++-------
 .../openhands/tools/browser_use/definition.py | 27 +++---
 .../openhands/tools/browser_use/impl.py       | 10 +-
 .../openhands/tools/delegate/impl.py          | 25 +++--
 .../tools/execute_bash/definition.py          | 91 ++++++++++---------
 .../openhands/tools/execute_bash/impl.py      | 46 +++++-----
 .../execute_bash/terminal/terminal_session.py | 17 ++--
 .../openhands/tools/file_editor/definition.py |  4 +-
 .../openhands/tools/file_editor/editor.py     | 62 +++++++------
 .../openhands/tools/file_editor/impl.py       | 11 ++-
 openhands-tools/openhands/tools/glob/impl.py  | 16 ++--
 openhands-tools/openhands/tools/grep/impl.py  | 13 ++-
 .../tools/task_tracker/definition.py          | 11 ++-
 tests/cross/test_agent_secrets_integration.py | 24 ++---
 17 files changed, 266 insertions(+), 225 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/mcp/definition.py b/openhands-sdk/openhands/sdk/mcp/definition.py
index bb34849773..932cf83652 100644
--- a/openhands-sdk/openhands/sdk/mcp/definition.py
+++ b/openhands-sdk/openhands/sdk/mcp/definition.py
@@ -80,47 +80,63 @@ def from_call_tool_result(
         # Prepend initial message to content
         content_with_header = [TextContent(text=initial_message)] + converted_content
 
-        # Populate error or output field based on result status
+        # Populate content field and is_error flag based on result status
         if result.isError:
-            # When there is an error, populate error field only with all content
+            # When there is an error, populate content field with all content
+            # and set is_error=True
             return cls(
-                error="\n".join(
+                content="\n".join(
                     [initial_message]
                     + [
                         c.text if isinstance(c, TextContent) else "[Image]"
                         for c in converted_content
                     ]
                 ),
+                is_error=True,
                 tool_name=tool_name,
             )
         else:
-            # When success, populate output field only
+            # When success, populate content field only
             return cls(
-                output=content_with_header,
+                content=content_with_header,
                 tool_name=tool_name,
             )
 
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation of this observation."""
-        content = Text()
-        content.append(f"[MCP Tool '{self.tool_name}' Observation]\n", style="bold")
-
-        if self.has_error:
-            content.append("[Error during execution]\n", style="bold red")
-            if self.error:
-                content.append(self.error + "\n")
-        elif self.output:
+        content_obj = Text()
+        content_obj.append(f"[MCP Tool '{self.tool_name}' Observation]\n", style="bold")
+
+        if self.is_error:
+            content_obj.append("[Error during execution]\n", style="bold red")
+            # Handle both str and list types for content
+            if isinstance(self.content, str):
+                content_obj.append(self.content + "\n")
+            else:
+                for block in self.content:
+                    if isinstance(block, TextContent):
+                        content_obj.append(block.text + "\n")
+                    elif isinstance(block, ImageContent):
+                        content_obj.append(
+                            f"[Image with {len(block.image_urls)} URLs]\n"
+                        )
+        elif self.content:
             # Display all content blocks
-            for block in self.output:
-                if isinstance(block, TextContent):
-                    # Try to parse as JSON for better display
-                    try:
-                        parsed = json.loads(block.text)
-                        content.append(display_dict(parsed))
-                    except (json.JSONDecodeError, TypeError):
-                        content.append(block.text + "\n")
-                elif isinstance(block, ImageContent):
-                    content.append(f"[Image with {len(block.image_urls)} URLs]\n")
-
-        return content
+            if isinstance(self.content, str):
+                content_obj.append(self.content + "\n")
+            else:
+                for block in self.content:
+                    if isinstance(block, TextContent):
+                        # Try to parse as JSON for better display
+                        try:
+                            parsed = json.loads(block.text)
+                            content_obj.append(display_dict(parsed))
+                        except (json.JSONDecodeError, TypeError):
+                            content_obj.append(block.text + "\n")
+                    elif isinstance(block, ImageContent):
+                        content_obj.append(
+                            f"[Image with {len(block.image_urls)} URLs]\n"
+                        )
+
+        return content_obj
diff --git a/openhands-sdk/openhands/sdk/tool/builtins/finish.py b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
index 177afa76a2..172188450a 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/finish.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
@@ -37,6 +37,11 @@ class FinishObservation(Observation):
     extra fields are needed here.
     """
 
+    @property
+    def visualize(self) -> Text:
+        """Return an empty Text representation since the message is in the action."""
+        return Text()
+
 
 TOOL_DESCRIPTION = """Signals the completion of the current task or conversation.
 
@@ -55,10 +60,10 @@ class FinishObservation(Observation):
 class FinishExecutor(ToolExecutor):
     def __call__(
         self,
-        action: FinishAction,  # noqa: ARG002
+        action: FinishAction,
         conversation: "BaseConversation | None" = None,  # noqa: ARG002
     ) -> FinishObservation:
-        return FinishObservation()
+        return FinishObservation(content=action.message)
 
 
 class FinishTool(ToolDefinition[FinishAction, FinishObservation]):
diff --git a/openhands-sdk/openhands/sdk/tool/builtins/think.py b/openhands-sdk/openhands/sdk/tool/builtins/think.py
index 3c8867bb92..8455c68189 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/think.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/think.py
@@ -51,6 +51,11 @@ class ThinkObservation(Observation):
     fields are needed here.
     """
 
+    @property
+    def visualize(self) -> Text:
+        """Return an empty Text representation since the thought is in the action."""
+        return Text()
+
 
 THINK_DESCRIPTION = """Use the tool to think about something. It will not obtain new information or make any changes to the repository, but just log the thought. Use it when complex reasoning or brainstorming is needed.
 
@@ -70,7 +75,7 @@ def __call__(
         _: ThinkAction,
         conversation: "BaseConversation | None" = None,  # noqa: ARG002
     ) -> ThinkObservation:
-        return ThinkObservation()
+        return ThinkObservation(content="Your thought has been logged.")
 
 
 class ThinkTool(ToolDefinition[ThinkAction, ThinkObservation]):
diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index fb7ac97749..e7377812f1 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -1,6 +1,5 @@
 from abc import ABC
 from collections.abc import Sequence
-from enum import Enum
 from typing import Any, ClassVar, TypeVar
 
 from pydantic import ConfigDict, Field, create_model
@@ -188,59 +187,42 @@ def visualize(self) -> Text:
         return content
 
 
-class ObservationStatus(str, Enum):
-    SUCCESS = "success"
-    ERROR = "error"
-
-
 class Observation(Schema, ABC):
     """Base schema for output observation."""
 
-    output: str | list[TextContent | ImageContent] = Field(
+    content: str | list[TextContent | ImageContent] = Field(
         default="",
         description=(
-            "Output returned from the tool. Can be a simple string for most tools, "
-            "or a list of TextContent/ImageContent for tools that need rich content."
+            "Content returned from the tool. Can be a simple string for most tools, "
+            "or a list of TextContent/ImageContent for tools that need rich content. "
+            "When there is an error, it should be written in this field."
         ),
     )
-    error: str | None = Field(
-        default=None, description="Error message if operation failed"
+    is_error: bool = Field(
+        default=False, description="Whether the observation indicates an error"
     )
 
-    @property
-    def has_error(self) -> bool:
-        """
-        Check if the observation indicates an error.
-        """
-        return bool(self.error)
-
-    @property
-    def result_status(self) -> ObservationStatus:
-        """
-        Get the observation result status based on presence of error."""
-        return ObservationStatus.ERROR if self.has_error else ObservationStatus.SUCCESS
-
-    def format_error(self) -> TextContent:
-        """Format the error message for LLM display."""
-        return TextContent(text=f"Tool Execution Error: {self.error}")
-
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         """
         Default content formatting for converting observation to LLM readable content.
         Subclasses can override to provide richer content (e.g., images, diffs).
-        Errors can be partial so both output and error are included if present.
         """
         llm_content: list[TextContent | ImageContent] = []
-        if self.error:
-            llm_content.append(self.format_error())
-        if self.output:
+
+        # If is_error is true, prepend error message
+        if self.is_error:
+            llm_content.append(TextContent(text="Tool Execution Error. "))
+
+        # Add content
+        if self.content:
             # Handle both str and list types
-            if isinstance(self.output, str):
-                llm_content.append(TextContent(text=self.output))
+            if isinstance(self.content, str):
+                llm_content.append(TextContent(text=self.content))
             else:
                 # It's a list of TextContent | ImageContent
-                llm_content.extend(self.output)
+                llm_content.extend(self.content)
+
         return llm_content
 
     @property
diff --git a/openhands-tools/openhands/tools/browser_use/definition.py b/openhands-tools/openhands/tools/browser_use/definition.py
index 2c97953455..63e059d82a 100644
--- a/openhands-tools/openhands/tools/browser_use/definition.py
+++ b/openhands-tools/openhands/tools/browser_use/definition.py
@@ -33,20 +33,23 @@ class BrowserObservation(Observation):
 
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        if self.error:
-            return [self.format_error()]
+        llm_content: list[TextContent | ImageContent] = []
 
-        # Extract text from output (handle both str and list types)
-        if isinstance(self.output, str):
-            output_text = self.output
+        # If is_error is true, prepend error message
+        if self.is_error:
+            llm_content.append(TextContent(text="Tool Execution Error. "))
+
+        # Extract text from content (handle both str and list types)
+        if isinstance(self.content, str):
+            content_text = self.content
         else:
-            output_text = "".join(
-                [c.text for c in self.output if isinstance(c, TextContent)]
+            content_text = "".join(
+                [c.text for c in self.content if isinstance(c, TextContent)]
             )
 
-        content: list[TextContent | ImageContent] = [
-            TextContent(text=maybe_truncate(output_text, MAX_BROWSER_OUTPUT_SIZE))
-        ]
+        llm_content.append(
+            TextContent(text=maybe_truncate(content_text, MAX_BROWSER_OUTPUT_SIZE))
+        )
 
         if self.screenshot_data:
             mime_type = "image/png"
@@ -60,9 +63,9 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
                 mime_type = "image/webp"
             # Convert base64 to data URL format for ImageContent
             data_url = f"data:{mime_type};base64,{self.screenshot_data}"
-            content.append(ImageContent(image_urls=[data_url]))
+            llm_content.append(ImageContent(image_urls=[data_url]))
 
-        return content
+        return llm_content
 
 
 # ============================================
diff --git a/openhands-tools/openhands/tools/browser_use/impl.py b/openhands-tools/openhands/tools/browser_use/impl.py
index 1dd397e025..3b872a3edb 100644
--- a/openhands-tools/openhands/tools/browser_use/impl.py
+++ b/openhands-tools/openhands/tools/browser_use/impl.py
@@ -225,13 +225,13 @@ async def _execute_action(self, action):
                 result = await self.close_tab(action.tab_id)
             else:
                 error_msg = f"Unsupported action type: {type(action)}"
-                return BrowserObservation(error=error_msg)
+                return BrowserObservation(content=error_msg, is_error=True)
 
-            return BrowserObservation(output=result)
+            return BrowserObservation(content=result)
         except Exception as e:
             error_msg = f"Browser operation failed: {str(e)}"
             logger.error(error_msg, exc_info=True)
-            return BrowserObservation(error=error_msg)
+            return BrowserObservation(content=error_msg, is_error=True)
 
     async def _ensure_initialized(self):
         """Ensure browser session is initialized."""
@@ -282,14 +282,14 @@ async def get_state(self, include_screenshot: bool = False):
                 # Return clean JSON + separate screenshot data
                 clean_json = json.dumps(result_data, indent=2)
                 return BrowserObservation(
-                    output=clean_json,
+                    content=clean_json,
                     screenshot_data=screenshot_data,
                 )
             except json.JSONDecodeError:
                 # If JSON parsing fails, return as-is
                 pass
 
-        return BrowserObservation(output=result_json)
+        return BrowserObservation(content=result_json)
 
     # Tab Management
     async def list_tabs(self) -> str:
diff --git a/openhands-tools/openhands/tools/delegate/impl.py b/openhands-tools/openhands/tools/delegate/impl.py
index 9957a2ca02..ec83ed4d5a 100644
--- a/openhands-tools/openhands/tools/delegate/impl.py
+++ b/openhands-tools/openhands/tools/delegate/impl.py
@@ -60,8 +60,9 @@ def __call__(  # type: ignore[override]
         else:
             return DelegateObservation(
                 command=action.command,
-                error=f"Unsupported command: {action.command}. "
+                content=f"Unsupported command: {action.command}. "
                 "Available commands: spawn, delegate",
+                is_error=True,
             )
 
     def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
@@ -77,17 +78,19 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
         if not action.ids:
             return DelegateObservation(
                 command=action.command,
-                error="At least one ID is required for spawn action",
+                content="At least one ID is required for spawn action",
+                is_error=True,
             )
 
         if len(self._sub_agents) + len(action.ids) > self._max_children:
             return DelegateObservation(
                 command=action.command,
-                error=(
+                content=(
                     f"Cannot spawn {len(action.ids)} agents. "
                     f"Already have {len(self._sub_agents)} agents, "
                     f"maximum is {self._max_children}"
                 ),
+                is_error=True,
             )
 
         try:
@@ -118,14 +121,15 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
             message = f"Successfully spawned {len(action.ids)} sub-agents: {agent_list}"
             return DelegateObservation(
                 command=action.command,
-                output=message,
+                content=message,
             )
 
         except Exception as e:
             logger.error(f"Error: failed to spawn agents: {e}", exc_info=True)
             return DelegateObservation(
                 command=action.command,
-                error=f"failed to spawn agents: {str(e)}",
+                content=f"failed to spawn agents: {str(e)}",
+                is_error=True,
             )
 
     def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
@@ -142,7 +146,8 @@ def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
         if not action.tasks:
             return DelegateObservation(
                 command=action.command,
-                error="at least one task is required for delegate action",
+                content="at least one task is required for delegate action",
+                is_error=True,
             )
 
         # Check that all requested agent IDs exist
@@ -150,10 +155,11 @@ def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
         if missing_agents:
             return DelegateObservation(
                 command=action.command,
-                error=(
+                content=(
                     f"sub-agents not found: {', '.join(missing_agents)}. "
                     f"Available agents: {', '.join(self._sub_agents.keys())}"
                 ),
+                is_error=True,
             )
 
         try:
@@ -224,12 +230,13 @@ def run_task(agent_id: str, conversation: LocalConversation, task: str):
 
             return DelegateObservation(
                 command=action.command,
-                output=output_text,
+                content=output_text,
             )
 
         except Exception as e:
             logger.error(f"Failed to delegate tasks: {e}", exc_info=True)
             return DelegateObservation(
                 command=action.command,
-                error=f"failed to delegate tasks: {str(e)}",
+                content=f"failed to delegate tasks: {str(e)}",
+                is_error=True,
             )
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index d9490f08a4..dcaa435a97 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -100,57 +100,56 @@ def command_id(self) -> int | None:
         """Get the command ID from metadata."""
         return self.metadata.pid
 
-    @property
-    def raw_output(self) -> str:
-        """Return the raw output text for backward compatibility."""
-        if isinstance(self.output, str):
-            return self.output
-        else:
-            first_item = self.output[0] if self.output else None
-            return first_item.text if isinstance(first_item, TextContent) else ""
-
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        if self.error:
-            error_msg = f"{self.metadata.prefix}{self.error}{self.metadata.suffix}"
-            return [TextContent(text=f"Tool Execution Error: {error_msg}")]
+        llm_content: list[TextContent | ImageContent] = []
 
-        # Handle both str and list types
-        if isinstance(self.output, str):
-            output_text = self.output
+        # If is_error is true, prepend error message
+        if self.is_error:
+            llm_content.append(TextContent(text="Tool Execution Error. "))
+
+        # Handle both str and list types for content
+        if isinstance(self.content, str):
+            content_text = self.content
         else:
-            first_item = self.output[0] if self.output else None
-            output_text = first_item.text if isinstance(first_item, TextContent) else ""
+            first_item = self.content[0] if self.content else None
+            content_text = (
+                first_item.text if isinstance(first_item, TextContent) else ""
+            )
 
-        ret = f"{self.metadata.prefix}{output_text}{self.metadata.suffix}"
+        ret = f"{self.metadata.prefix}{content_text}{self.metadata.suffix}"
         if self.metadata.working_dir:
             ret += f"\n[Current working directory: {self.metadata.working_dir}]"
         if self.metadata.py_interpreter_path:
             ret += f"\n[Python interpreter: {self.metadata.py_interpreter_path}]"
         if self.metadata.exit_code != -1:
             ret += f"\n[Command finished with exit code {self.metadata.exit_code}]"
-        return [TextContent(text=maybe_truncate(ret, MAX_CMD_OUTPUT_SIZE))]
+        llm_content.append(TextContent(text=maybe_truncate(ret, MAX_CMD_OUTPUT_SIZE)))
+
+        return llm_content
 
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation with terminal-style output formatting."""
-        content = Text()
+        content_obj = Text()
 
         # Add error indicator if present
-        if self.error:
-            content.append("❌ ", style="red bold")
-            content.append("Command execution error\n", style="red")
+        if self.is_error:
+            content_obj.append("❌ ", style="red bold")
+            content_obj.append("Command execution error\n", style="red")
 
         # Add command output with proper styling
-        if isinstance(self.output, str):
-            output_text = self.output
+        if isinstance(self.content, str):
+            content_text = self.content
         else:
-            first_item = self.output[0] if self.output else None
-            output_text = first_item.text if isinstance(first_item, TextContent) else ""
+            first_item = self.content[0] if self.content else None
+            content_text = (
+                first_item.text if isinstance(first_item, TextContent) else ""
+            )
 
-        if output_text:
+        if content_text:
             # Style the output based on content
-            output_lines = output_text.split("\n")
+            output_lines = content_text.split("\n")
             for line in output_lines:
                 if line.strip():
                     # Color error-like lines differently
@@ -158,28 +157,28 @@ def visualize(self) -> Text:
                         keyword in line.lower()
                         for keyword in ["error", "failed", "exception", "traceback"]
                     ):
-                        content.append(line, style="red")
+                        content_obj.append(line, style="red")
                     elif any(
                         keyword in line.lower() for keyword in ["warning", "warn"]
                     ):
-                        content.append(line, style="yellow")
+                        content_obj.append(line, style="yellow")
                     elif line.startswith("+ "):  # bash -x output
-                        content.append(line, style="cyan")
+                        content_obj.append(line, style="cyan")
                     else:
-                        content.append(line, style="white")
-                content.append("\n")
+                        content_obj.append(line, style="white")
+                content_obj.append("\n")
 
         # Add metadata with styling
         if hasattr(self, "metadata") and self.metadata:
             if self.metadata.working_dir:
-                content.append("\n📁 ", style="blue")
-                content.append(
+                content_obj.append("\n📁 ", style="blue")
+                content_obj.append(
                     f"Working directory: {self.metadata.working_dir}", style="blue"
                 )
 
             if self.metadata.py_interpreter_path:
-                content.append("\n🐍 ", style="green")
-                content.append(
+                content_obj.append("\n🐍 ", style="green")
+                content_obj.append(
                     f"Python interpreter: {self.metadata.py_interpreter_path}",
                     style="green",
                 )
@@ -189,20 +188,22 @@ def visualize(self) -> Text:
                 and self.metadata.exit_code is not None
             ):
                 if self.metadata.exit_code == 0:
-                    content.append("\n✅ ", style="green")
-                    content.append(
+                    content_obj.append("\n✅ ", style="green")
+                    content_obj.append(
                         f"Exit code: {self.metadata.exit_code}", style="green"
                     )
                 elif self.metadata.exit_code == -1:
-                    content.append("\n⏳ ", style="yellow")
-                    content.append(
+                    content_obj.append("\n⏳ ", style="yellow")
+                    content_obj.append(
                         "Process still running (soft timeout)", style="yellow"
                     )
                 else:
-                    content.append("\n❌ ", style="red")
-                    content.append(f"Exit code: {self.metadata.exit_code}", style="red")
+                    content_obj.append("\n❌ ", style="red")
+                    content_obj.append(
+                        f"Exit code: {self.metadata.exit_code}", style="red"
+                    )
 
-        return content
+        return content_obj
 
 
 TOOL_DESCRIPTION = """Execute a bash command in the terminal within a persistent shell session.
diff --git a/openhands-tools/openhands/tools/execute_bash/impl.py b/openhands-tools/openhands/tools/execute_bash/impl.py
index 9a81f5b501..5741b294a3 100644
--- a/openhands-tools/openhands/tools/execute_bash/impl.py
+++ b/openhands-tools/openhands/tools/execute_bash/impl.py
@@ -113,7 +113,7 @@ def reset(self) -> ExecuteBashObservation:
         )
 
         return ExecuteBashObservation(
-            output=(
+            content=(
                 "Terminal session has been reset. All previous environment "
                 "variables and session state have been cleared."
             ),
@@ -143,30 +143,30 @@ def __call__(
                 self._export_envs(command_action, conversation)
                 command_result = self.session.execute(command_action)
 
-                # Extract text from output (handle both str and list types)
-                if isinstance(reset_result.output, str):
-                    reset_text = reset_result.output
+                # Extract text from content (handle both str and list types)
+                if isinstance(reset_result.content, str):
+                    reset_text = reset_result.content
                 else:
                     reset_text = (
-                        reset_result.output[0].text
-                        if reset_result.output
-                        and isinstance(reset_result.output[0], TextContent)
+                        reset_result.content[0].text
+                        if reset_result.content
+                        and isinstance(reset_result.content[0], TextContent)
                         else ""
                     )
 
-                if isinstance(command_result.output, str):
-                    command_text = command_result.output
+                if isinstance(command_result.content, str):
+                    command_text = command_result.content
                 else:
                     command_text = (
-                        command_result.output[0].text
-                        if command_result.output
-                        and isinstance(command_result.output[0], TextContent)
+                        command_result.content[0].text
+                        if command_result.content
+                        and isinstance(command_result.content[0], TextContent)
                         else ""
                     )
 
                 observation = command_result.model_copy(
                     update={
-                        "output": f"{reset_text}\n\n{command_text}",
+                        "content": f"{reset_text}\n\n{command_text}",
                         "command": f"[RESET] {action.command}",
                     }
                 )
@@ -179,19 +179,21 @@ def __call__(
             observation = self.session.execute(action)
 
         # Apply automatic secrets masking
-        if isinstance(observation.output, str):
-            output_text = observation.output
+        if isinstance(observation.content, str):
+            content_text = observation.content
         else:
-            first_item = observation.output[0] if observation.output else None
-            output_text = first_item.text if isinstance(first_item, TextContent) else ""
+            first_item = observation.content[0] if observation.content else None
+            content_text = (
+                first_item.text if isinstance(first_item, TextContent) else ""
+            )
 
-        if output_text and conversation is not None:
+        if content_text and conversation is not None:
             try:
                 secret_registry = conversation.state.secret_registry
-                masked_output = secret_registry.mask_secrets_in_output(output_text)
-                if masked_output:
-                    data = observation.model_dump(exclude={"output"})
-                    return ExecuteBashObservation(**data, output=masked_output)
+                masked_content = secret_registry.mask_secrets_in_output(content_text)
+                if masked_content:
+                    data = observation.model_dump(exclude={"content"})
+                    return ExecuteBashObservation(**data, content=masked_content)
             except Exception:
                 pass
 
diff --git a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
index 909c2bc5b5..667a1251a4 100644
--- a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
+++ b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
@@ -188,7 +188,7 @@ def _handle_completed_command(
         self._ready_for_next_command()
         return ExecuteBashObservation(
             command=command,
-            output=command_output,
+            content=command_output,
             metadata=metadata,
         )
 
@@ -222,7 +222,7 @@ def _handle_nochange_timeout_command(
         )
         return ExecuteBashObservation(
             command=command,
-            output=command_output,
+            content=command_output,
             metadata=metadata,
         )
 
@@ -257,7 +257,7 @@ def _handle_hard_timeout_command(
         )
         return ExecuteBashObservation(
             command=command,
-            output=command_output,
+            content=command_output,
             metadata=metadata,
         )
 
@@ -314,12 +314,14 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
             if command == "":
                 return ExecuteBashObservation(
                     command=command,
-                    error="No previous running command to retrieve logs from.",
+                    content="No previous running command to retrieve logs from.",
+                    is_error=True,
                 )
             if is_input:
                 return ExecuteBashObservation(
                     command=command,
-                    error="No previous running command to interact with.",
+                    content="No previous running command to interact with.",
+                    is_error=True,
                 )
 
         # Check if the command is a single command or multiple commands
@@ -330,11 +332,12 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
             )
             return ExecuteBashObservation(
                 command=command,
-                error=(
+                content=(
                     "Cannot execute multiple commands at once.\n"
                     "Please run each command separately OR chain them into a single "
                     f"command via && or ;\nProvided commands:\n{commands_list}"
                 ),
+                is_error=True,
             )
 
         # Get initial state before sending command
@@ -387,7 +390,7 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
             )
             obs = ExecuteBashObservation(
                 command=command,
-                output=command_output,
+                content=command_output,
                 metadata=metadata,
             )
             logger.debug(f"RETURNING OBSERVATION (previous-command): {obs}")
diff --git a/openhands-tools/openhands/tools/file_editor/definition.py b/openhands-tools/openhands/tools/file_editor/definition.py
index 4482e9da33..49beccb1af 100644
--- a/openhands-tools/openhands/tools/file_editor/definition.py
+++ b/openhands-tools/openhands/tools/file_editor/definition.py
@@ -100,7 +100,7 @@ def visualize(self) -> Text:
         assert self.path is not None, "path should be set for meaningful diff"
         # Generate and cache diff visualization
         if not self._diff_cache:
-            change_applied = self.command != "view" and not self.error
+            change_applied = self.command != "view" and not self.is_error
             self._diff_cache = visualize_diff(
                 self.path,
                 self.old_content,
@@ -114,7 +114,7 @@ def visualize(self) -> Text:
     @property
     def _has_meaningful_diff(self) -> bool:
         """Check if there's a meaningful diff to display."""
-        if self.error:
+        if self.is_error:
             return False
 
         if not self.path:
diff --git a/openhands-tools/openhands/tools/file_editor/editor.py b/openhands-tools/openhands/tools/file_editor/editor.py
index 509bba6323..271074aba7 100644
--- a/openhands-tools/openhands/tools/file_editor/editor.py
+++ b/openhands-tools/openhands/tools/file_editor/editor.py
@@ -113,7 +113,7 @@ def __call__(
                 path=str(_path),
                 new_content=file_text,
                 prev_exist=False,
-                output=f"File created successfully at: {_path}",
+                content=f"File created successfully at: {_path}",
             )
         elif command == "str_replace":
             if old_str is None:
@@ -255,7 +255,7 @@ def str_replace(
         )
         return FileEditorObservation(
             command="str_replace",
-            output=success_message,
+            content=success_message,
             prev_exist=True,
             path=str(path),
             old_content=file_content,
@@ -293,30 +293,36 @@ def view(
                 rf"-path '{path}/*/\.*' \) | sort",
                 truncate_notice=DIRECTORY_CONTENT_TRUNCATED_NOTICE,
             )
-            if not stderr:
-                # Add trailing slashes to directories
-                paths = stdout.strip().split("\n") if stdout.strip() else []
-                formatted_paths = []
-                for p in paths:
-                    if Path(p).is_dir():
-                        formatted_paths.append(f"{p}/")
-                    else:
-                        formatted_paths.append(p)
-
-                msg = [
-                    f"Here's the files and directories up to 2 levels deep in {path}, "
-                    "excluding hidden items:\n" + "\n".join(formatted_paths)
-                ]
-                if hidden_count > 0:
-                    msg.append(
-                        f"\n{hidden_count} hidden files/directories in this directory "
-                        f"are excluded. You can use 'ls -la {path}' to see them."
-                    )
-                stdout = "\n".join(msg)
+            if stderr:
+                return FileEditorObservation(
+                    command="view",
+                    content=stderr,
+                    is_error=True,
+                    path=str(path),
+                    prev_exist=True,
+                )
+            # Add trailing slashes to directories
+            paths = stdout.strip().split("\n") if stdout.strip() else []
+            formatted_paths = []
+            for p in paths:
+                if Path(p).is_dir():
+                    formatted_paths.append(f"{p}/")
+                else:
+                    formatted_paths.append(p)
+
+            msg = [
+                f"Here's the files and directories up to 2 levels deep in {path}, "
+                "excluding hidden items:\n" + "\n".join(formatted_paths)
+            ]
+            if hidden_count > 0:
+                msg.append(
+                    f"\n{hidden_count} hidden files/directories in this directory "
+                    f"are excluded. You can use 'ls -la {path}' to see them."
+                )
+            stdout = "\n".join(msg)
             return FileEditorObservation(
                 command="view",
-                output=stdout,
-                error=stderr,
+                content=stdout,
                 path=str(path),
                 prev_exist=True,
             )
@@ -332,7 +338,7 @@ def view(
 
             return FileEditorObservation(
                 command="view",
-                output=output,
+                content=output,
                 path=str(path),
                 prev_exist=True,
             )
@@ -386,7 +392,7 @@ def view(
         return FileEditorObservation(
             command="view",
             path=str(path),
-            output=output,
+            content=output,
             prev_exist=True,
         )
 
@@ -498,7 +504,7 @@ def insert(
         )
         return FileEditorObservation(
             command="insert",
-            output=success_message,
+            content=success_message,
             prev_exist=True,
             path=str(path),
             old_content=file_text,
@@ -567,7 +573,7 @@ def undo_edit(self, path: Path) -> FileEditorObservation:
 
         return FileEditorObservation(
             command="undo_edit",
-            output=(
+            content=(
                 f"Last edit to {path} undone successfully. "
                 f"{self._make_output(old_text, str(path))}"
             ),
diff --git a/openhands-tools/openhands/tools/file_editor/impl.py b/openhands-tools/openhands/tools/file_editor/impl.py
index afc1f7906d..f6b8732a0b 100644
--- a/openhands-tools/openhands/tools/file_editor/impl.py
+++ b/openhands-tools/openhands/tools/file_editor/impl.py
@@ -45,10 +45,11 @@ def __call__(
             if action_path not in self.allowed_edits_files:
                 return FileEditorObservation(
                     command=action.command,
-                    error=f"Operation '{action.command}' is not allowed "
+                    content=f"Operation '{action.command}' is not allowed "
                     f"on file '{action_path}'. "
                     f"Only the following files can be edited: "
                     f"{sorted(str(p) for p in self.allowed_edits_files)}",
+                    is_error=True,
                 )
 
         result: FileEditorObservation | None = None
@@ -63,7 +64,9 @@ def __call__(
                 insert_line=action.insert_line,
             )
         except ToolError as e:
-            result = FileEditorObservation(command=action.command, error=e.message)
+            result = FileEditorObservation(
+                command=action.command, content=e.message, is_error=True
+            )
         assert result is not None, "file_editor should always return a result"
         return result
 
@@ -95,6 +98,8 @@ def file_editor(
             insert_line=insert_line,
         )
     except ToolError as e:
-        result = FileEditorObservation(command=command, error=e.message)
+        result = FileEditorObservation(
+            command=command, content=e.message, is_error=True
+        )
     assert result is not None, "file_editor should always return a result"
     return result
diff --git a/openhands-tools/openhands/tools/glob/impl.py b/openhands-tools/openhands/tools/glob/impl.py
index 29f6cbbd1d..d0e00bda53 100644
--- a/openhands-tools/openhands/tools/glob/impl.py
+++ b/openhands-tools/openhands/tools/glob/impl.py
@@ -71,7 +71,8 @@ def __call__(
                     files=[],
                     pattern=original_pattern,
                     search_path=str(search_path),
-                    error=f"Search path '{search_path}' is not a valid directory",
+                    content=f"Search path '{search_path}' is not a valid directory",
+                    is_error=True,
                 )
 
             if self._ripgrep_available:
@@ -79,20 +80,20 @@ def __call__(
             else:
                 files, truncated = self._execute_with_glob(pattern, search_path)
 
-            # Format output message
+            # Format content message
             if not files:
-                output = (
+                content = (
                     f"No files found matching pattern '{original_pattern}' "
                     f"in directory '{search_path}'"
                 )
             else:
                 file_list = "\n".join(files)
-                output = (
+                content = (
                     f"Found {len(files)} file(s) matching pattern "
                     f"'{original_pattern}' in '{search_path}':\n{file_list}"
                 )
                 if truncated:
-                    output += (
+                    content += (
                         "\n\n[Results truncated to first 100 files. "
                         "Consider using a more specific pattern.]"
                     )
@@ -102,7 +103,7 @@ def __call__(
                 pattern=original_pattern,
                 search_path=str(search_path),
                 truncated=truncated,
-                output=output,
+                content=content,
             )
 
         except Exception as e:
@@ -119,7 +120,8 @@ def __call__(
                 files=[],
                 pattern=action.pattern,
                 search_path=error_search_path,
-                error=str(e),
+                content=str(e),
+                is_error=True,
             )
 
     def _execute_with_ripgrep(
diff --git a/openhands-tools/openhands/tools/grep/impl.py b/openhands-tools/openhands/tools/grep/impl.py
index 4f22e5f3c2..e936725853 100644
--- a/openhands-tools/openhands/tools/grep/impl.py
+++ b/openhands-tools/openhands/tools/grep/impl.py
@@ -60,7 +60,8 @@ def __call__(
                         pattern=action.pattern,
                         search_path=str(search_path),
                         include_pattern=action.include,
-                        error=f"Search path '{action.path}' is not a valid directory",
+                        content=f"Search path '{action.path}' is not a valid directory",
+                        is_error=True,
                     )
             else:
                 search_path = self.working_dir
@@ -74,7 +75,8 @@ def __call__(
                     pattern=action.pattern,
                     search_path=str(search_path),
                     include_pattern=action.include,
-                    error=f"Invalid regex pattern: {e}",
+                    content=f"Invalid regex pattern: {e}",
+                    is_error=True,
                 )
 
             if self._ripgrep_available:
@@ -97,7 +99,8 @@ def __call__(
                 pattern=action.pattern,
                 search_path=error_search_path,
                 include_pattern=action.include,
-                error=str(e),
+                content=str(e),
+                is_error=True,
             )
 
     def _format_output(
@@ -180,7 +183,7 @@ def _execute_with_ripgrep(
             search_path=str(search_path),
             include_pattern=action.include,
             truncated=truncated,
-            output=output,
+            content=output,
         )
 
     def _execute_with_grep(
@@ -243,5 +246,5 @@ def _execute_with_grep(
             search_path=str(search_path),
             include_pattern=action.include,
             truncated=truncated,
-            output=output,
+            content=output,
         )
diff --git a/openhands-tools/openhands/tools/task_tracker/definition.py b/openhands-tools/openhands/tools/task_tracker/definition.py
index 0ed41192d5..d9e6f96d4c 100644
--- a/openhands-tools/openhands/tools/task_tracker/definition.py
+++ b/openhands-tools/openhands/tools/task_tracker/definition.py
@@ -169,7 +169,7 @@ def __call__(
             if self.save_dir:
                 self._save_tasks()
             return TaskTrackerObservation(
-                output=(
+                content=(
                     f"Task list has been updated with {len(self._task_list)} item(s)."
                 ),
                 command=action.command,
@@ -179,24 +179,25 @@ def __call__(
             # Return the current task list
             if not self._task_list:
                 return TaskTrackerObservation(
-                    output=(
+                    content=(
                         'No task list found. Use the "plan" command to create one.'
                     ),
                     command=action.command,
                     task_list=[],
                 )
-            output = self._format_task_list(self._task_list)
+            content = self._format_task_list(self._task_list)
             return TaskTrackerObservation(
-                output=output,
+                content=content,
                 command=action.command,
                 task_list=self._task_list,
             )
         else:
             return TaskTrackerObservation(
-                error=(
+                content=(
                     f"Unknown command: {action.command}. "
                     'Supported commands are "view" and "plan".'
                 ),
+                is_error=True,
                 command=action.command,
                 task_list=[],
             )
diff --git a/tests/cross/test_agent_secrets_integration.py b/tests/cross/test_agent_secrets_integration.py
index 9d6df9deb9..911badcc91 100644
--- a/tests/cross/test_agent_secrets_integration.py
+++ b/tests/cross/test_agent_secrets_integration.py
@@ -234,13 +234,13 @@ def get_value(self):
     try:
         action = ExecuteBashAction(command="echo $API_KEY")
         result = bash_executor(action, conversation=conversation)
-        assert "test-api-key" not in result.raw_output
-        assert "<secret-hidden>" in result.raw_output
+        assert "test-api-key" not in result.content
+        assert "<secret-hidden>" in result.content
 
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "dynamic-secret" not in result.raw_output
-        assert "<secret-hidden>" in result.raw_output
+        assert "dynamic-secret" not in result.content
+        assert "<secret-hidden>" in result.content
 
     finally:
         bash_executor.close()
@@ -265,13 +265,13 @@ def get_value(self):
     try:
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "changing-secret" not in result.raw_output
-        assert "<secret-hidden>" in result.raw_output
+        assert "changing-secret" not in result.content
+        assert "<secret-hidden>" in result.content
 
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "changing-secret" not in result.raw_output
-        assert "<secret-hidden>" in result.raw_output
+        assert "changing-secret" not in result.content
+        assert "<secret-hidden>" in result.content
 
     finally:
         bash_executor.close()
@@ -303,13 +303,13 @@ def get_value(self):
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
         print(result)
-        assert "changing-secret" not in result.raw_output
-        assert "<secret-hidden>" in result.raw_output
+        assert "changing-secret" not in result.content
+        assert "<secret-hidden>" in result.content
 
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "changing-secret" not in result.raw_output
-        assert "<secret-hidden>" in result.raw_output
+        assert "changing-secret" not in result.content
+        assert "<secret-hidden>" in result.content
         assert dynamic_secret.raised_on_second
 
     finally:

From 1ffcf6bf93d7dbaefc2cbc32b05ecce815f975cd Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 17:05:26 +0000
Subject: [PATCH 35/76] test: update all tests to use new observation schema
 with content and is_error

- Replace .has_error with .is_error across all test files
- Replace .output with .content in observation assertions
- Update observation creation to use content= and is_error= instead of output= and error=
- Fix assertions to match new to_llm_content behavior (prepends 'Tool Execution Error. ' when is_error=True)
- Handle content field being either str or list[TextContent | ImageContent] in tests
- Updated test files:
  - tests/sdk/mcp/test_mcp_tool.py
  - tests/sdk/mcp/test_mcp_tool_kind_field.py
  - tests/sdk/mcp/test_mcp_security_risk.py
  - tests/tools/browser_use/conftest.py
  - tests/tools/browser_use/test_browser_observation.py
  - tests/tools/execute_bash/conftest.py
  - tests/tools/execute_bash/test_bash_ps1_metadata.py
  - tests/tools/execute_bash/test_bash_session.py
  - tests/tools/glob/test_glob_executor.py
  - tests/tools/glob/test_glob_tool.py
  - tests/tools/grep/test_grep_executor.py
  - tests/tools/grep/test_grep_tool.py
  - tests/tools/delegation/test_delegation.py

Co-authored-by: openhands <openhands@all-hands.dev>
---
 tests/sdk/mcp/test_mcp_security_risk.py       |  2 +-
 tests/sdk/mcp/test_mcp_tool.py                | 71 ++++++++++---------
 tests/sdk/mcp/test_mcp_tool_kind_field.py     | 12 ++--
 tests/tools/browser_use/conftest.py           | 18 +++--
 .../browser_use/test_browser_observation.py   | 46 ++++++------
 tests/tools/delegation/test_delegation.py     | 43 ++++++-----
 tests/tools/execute_bash/conftest.py          | 12 ++--
 .../execute_bash/test_bash_ps1_metadata.py    | 17 ++---
 tests/tools/execute_bash/test_bash_session.py | 19 +++--
 tests/tools/glob/test_glob_executor.py        | 26 +++----
 tests/tools/glob/test_glob_tool.py            | 16 ++---
 tests/tools/grep/test_grep_executor.py        | 24 +++----
 tests/tools/grep/test_grep_tool.py            | 22 +++---
 13 files changed, 172 insertions(+), 156 deletions(-)

diff --git a/tests/sdk/mcp/test_mcp_security_risk.py b/tests/sdk/mcp/test_mcp_security_risk.py
index e014ff9bf7..aa0649c411 100644
--- a/tests/sdk/mcp/test_mcp_security_risk.py
+++ b/tests/sdk/mcp/test_mcp_security_risk.py
@@ -180,4 +180,4 @@ def test_mcp_tool_validates_correctly_after_security_risk_pop():
     # 4. Execute the action (this should also work)
     observation = tool(action)
     assert isinstance(observation, MCPToolObservation)
-    assert not observation.has_error
+    assert not observation.is_error
diff --git a/tests/sdk/mcp/test_mcp_tool.py b/tests/sdk/mcp/test_mcp_tool.py
index cad27c5969..79ec2d05ce 100644
--- a/tests/sdk/mcp/test_mcp_tool.py
+++ b/tests/sdk/mcp/test_mcp_tool.py
@@ -37,13 +37,13 @@ def test_from_call_tool_result_success(self):
         )
 
         assert observation.tool_name == "test_tool"
-        assert observation.output is not None
-        assert len(observation.output) == 2
-        assert isinstance(observation.output[0], TextContent)
-        assert observation.output[0].text == "[Tool 'test_tool' executed.]"
-        assert isinstance(observation.output[1], TextContent)
-        assert observation.output[1].text == "Operation completed successfully"
-        assert observation.has_error is False
+        assert observation.content is not None
+        assert len(observation.content) == 2
+        assert isinstance(observation.content[0], TextContent)
+        assert observation.content[0].text == "[Tool 'test_tool' executed.]"
+        assert isinstance(observation.content[1], TextContent)
+        assert observation.content[1].text == "Operation completed successfully"
+        assert observation.is_error is False
 
     def test_from_call_tool_result_error(self):
         """Test creating observation from error MCP result."""
@@ -57,11 +57,9 @@ def test_from_call_tool_result_error(self):
         )
 
         assert observation.tool_name == "test_tool"
-        assert observation.error is not None
-        assert "[Tool 'test_tool' executed.]" in observation.error
-        assert "Operation failed" in observation.error
-        assert len(observation.output) == 0
-        assert observation.has_error is True
+        assert observation.is_error is True
+        assert "[Tool 'test_tool' executed.]" in observation.content
+        assert "Operation failed" in observation.content
 
     def test_from_call_tool_result_with_image(self):
         """Test creating observation from MCP result with image content."""
@@ -80,24 +78,24 @@ def test_from_call_tool_result_with_image(self):
         )
 
         assert observation.tool_name == "test_tool"
-        assert observation.output is not None
-        assert len(observation.output) == 3
+        assert observation.content is not None
+        assert len(observation.content) == 3
         # First item is header
-        assert isinstance(observation.output[0], TextContent)
-        assert observation.output[0].text == "[Tool 'test_tool' executed.]"
+        assert isinstance(observation.content[0], TextContent)
+        assert observation.content[0].text == "[Tool 'test_tool' executed.]"
         # Second item is text
-        assert isinstance(observation.output[1], TextContent)
-        assert observation.output[1].text == "Here's the image:"
+        assert isinstance(observation.content[1], TextContent)
+        assert observation.content[1].text == "Here's the image:"
         # Third item is image
-        assert isinstance(observation.output[2], ImageContent)
-        assert hasattr(observation.output[2], "image_urls")
-        assert observation.has_error is False
+        assert isinstance(observation.content[2], ImageContent)
+        assert hasattr(observation.content[2], "image_urls")
+        assert observation.is_error is False
 
     def test_to_llm_content_success(self):
         """Test agent observation formatting for success."""
         observation = MCPToolObservation(
             tool_name="test_tool",
-            output=[TextContent(text="[Tool 'test_tool' executed.]\nSuccess result")],
+            content=[TextContent(text="[Tool 'test_tool' executed.]\nSuccess result")],
         )
 
         agent_obs = observation.to_llm_content
@@ -105,26 +103,28 @@ def test_to_llm_content_success(self):
         assert isinstance(agent_obs[0], TextContent)
         assert "[Tool 'test_tool' executed.]" in agent_obs[0].text
         assert "Success result" in agent_obs[0].text
-        assert "[An error occurred during execution.]" not in agent_obs[0].text
+        assert "Tool Execution Error." not in agent_obs[0].text
 
     def test_to_llm_content_error(self):
         """Test agent observation formatting for error."""
         observation = MCPToolObservation(
             tool_name="test_tool",
-            error=(
+            content=(
                 "[Tool 'test_tool' executed.]\n"
                 "[An error occurred during execution.]\n"
                 "Error occurred"
             ),
+            is_error=True,
         )
 
         agent_obs = observation.to_llm_content
-        assert len(agent_obs) == 1
+        assert len(agent_obs) == 2
         assert isinstance(agent_obs[0], TextContent)
-        assert "Tool Execution Error:" in agent_obs[0].text
-        assert "[Tool 'test_tool' executed.]" in agent_obs[0].text
-        assert "[An error occurred during execution.]" in agent_obs[0].text
-        assert "Error occurred" in agent_obs[0].text
+        assert agent_obs[0].text == "Tool Execution Error. "
+        assert isinstance(agent_obs[1], TextContent)
+        assert "[Tool 'test_tool' executed.]" in agent_obs[1].text
+        assert "[An error occurred during execution.]" in agent_obs[1].text
+        assert "Error occurred" in agent_obs[1].text
 
 
 class TestMCPToolExecutor:
@@ -162,7 +162,7 @@ def mock_call_async_from_sync(coro_func, **kwargs):
 
         assert isinstance(observation, MCPToolObservation)
         assert observation.tool_name == "test_tool"
-        assert observation.has_error is False
+        assert observation.is_error is False
 
     def test_call_tool_error(self):
         """Test tool execution with error."""
@@ -189,7 +189,7 @@ def mock_call_async_from_sync(coro_func, **kwargs):
 
         assert isinstance(observation, MCPToolObservation)
         assert observation.tool_name == "test_tool"
-        assert observation.has_error is True
+        assert observation.is_error is True
 
     def test_call_tool_exception(self):
         """Test tool execution with exception."""
@@ -200,8 +200,9 @@ def test_call_tool_exception(self):
         # Mock call_async_from_sync to return an error observation
         def mock_call_async_from_sync(coro_func, **kwargs):
             return MCPToolObservation(
-                error="Error calling MCP tool test_tool: Connection failed",
+                content="Error calling MCP tool test_tool: Connection failed",
                 tool_name="test_tool",
+                is_error=True,
             )
 
         self.mock_client.call_async_from_sync = mock_call_async_from_sync
@@ -210,9 +211,9 @@ def mock_call_async_from_sync(coro_func, **kwargs):
 
         assert isinstance(observation, MCPToolObservation)
         assert observation.tool_name == "test_tool"
-        assert observation.has_error is True
-        assert observation.error is not None
-        assert "Connection failed" in observation.error
+        assert observation.is_error is True
+        assert observation.is_error is True
+        assert "Connection failed" in observation.content
 
 
 class TestMCPTool:
diff --git a/tests/sdk/mcp/test_mcp_tool_kind_field.py b/tests/sdk/mcp/test_mcp_tool_kind_field.py
index 50195a6660..101ecbeeb1 100644
--- a/tests/sdk/mcp/test_mcp_tool_kind_field.py
+++ b/tests/sdk/mcp/test_mcp_tool_kind_field.py
@@ -86,16 +86,18 @@ def test_real_mcp_tool_execution_without_kind_field(fetch_tool):
 
     # Verify we got a valid response (not an error about 'kind')
     # Check output if no error, otherwise check error message
-    if observation.has_error:
-        assert observation.error is not None
-        content_str = observation.error
+    if observation.is_error:
+        assert observation.is_error is True
+        content_str = observation.content
     else:
-        assert observation.output is not None
+        assert observation.content is not None
         # Extract text from content blocks
         from openhands.sdk.llm import TextContent
 
         text_parts = [
-            block.text for block in observation.output if isinstance(block, TextContent)
+            block.text
+            for block in observation.content
+            if isinstance(block, TextContent)
         ]
         content_str = " ".join(text_parts)
 
diff --git a/tests/tools/browser_use/conftest.py b/tests/tools/browser_use/conftest.py
index 2e189137f5..bd2d0d42cf 100644
--- a/tests/tools/browser_use/conftest.py
+++ b/tests/tools/browser_use/conftest.py
@@ -31,8 +31,12 @@ def create_mock_browser_response(
     screenshot_data: str | None = None,
 ):
     """Helper to create mock browser responses."""
+    if error:
+        return BrowserObservation(
+            content=error, is_error=True, screenshot_data=screenshot_data
+        )
     return BrowserObservation(
-        output=[TextContent(text=output)], error=error, screenshot_data=screenshot_data
+        content=[TextContent(text=output)], screenshot_data=screenshot_data
     )
 
 
@@ -41,13 +45,13 @@ def assert_browser_observation_success(
 ):
     """Assert that a browser observation indicates success."""
     assert isinstance(observation, BrowserObservation)
-    assert observation.error is None
+    assert observation.is_error is False
     if expected_output:
-        if isinstance(observation.output, str):
-            output_text = observation.output
+        if isinstance(observation.content, str):
+            output_text = observation.content
         else:
             output_text = "".join(
-                [c.text for c in observation.output if isinstance(c, TextContent)]
+                [c.text for c in observation.content if isinstance(c, TextContent)]
             )
         assert expected_output in output_text
 
@@ -57,6 +61,6 @@ def assert_browser_observation_error(
 ):
     """Assert that a browser observation contains an error."""
     assert isinstance(observation, BrowserObservation)
-    assert observation.error is not None
+    assert observation.is_error is True
     if expected_error:
-        assert expected_error in observation.error
+        assert expected_error in observation.content
diff --git a/tests/tools/browser_use/test_browser_observation.py b/tests/tools/browser_use/test_browser_observation.py
index e4bf1174e0..c0e434ca9e 100644
--- a/tests/tools/browser_use/test_browser_observation.py
+++ b/tests/tools/browser_use/test_browser_observation.py
@@ -6,21 +6,21 @@
 
 def test_browser_observation_basic_output():
     """Test basic BrowserObservation creation with output."""
-    observation = BrowserObservation(output=[TextContent(text="Test output")])
+    observation = BrowserObservation(content=[TextContent(text="Test output")])
 
-    assert len(observation.output) == 1
-    assert isinstance(observation.output[0], TextContent)
-    assert observation.output[0].text == "Test output"
-    assert observation.error is None
+    assert len(observation.content) == 1
+    assert isinstance(observation.content[0], TextContent)
+    assert observation.content[0].text == "Test output"
+    assert observation.is_error is False
     assert observation.screenshot_data is None
 
 
 def test_browser_observation_with_error():
     """Test BrowserObservation with error."""
-    observation = BrowserObservation(error="Test error")
+    observation = BrowserObservation(content="Test error", is_error=True)
 
-    assert len(observation.output) == 0
-    assert observation.error == "Test error"
+    assert observation.content == "Test error"
+    assert observation.is_error is True
     assert observation.screenshot_data is None
 
 
@@ -28,19 +28,19 @@ def test_browser_observation_with_screenshot():
     """Test BrowserObservation with screenshot data."""
     screenshot_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77zgAAAABJRU5ErkJggg=="  # noqa: E501
     observation = BrowserObservation(
-        output=[TextContent(text="Screenshot taken")], screenshot_data=screenshot_data
+        content=[TextContent(text="Screenshot taken")], screenshot_data=screenshot_data
     )
 
-    assert len(observation.output) == 1
-    assert isinstance(observation.output[0], TextContent)
-    assert observation.output[0].text == "Screenshot taken"
-    assert observation.error is None
+    assert len(observation.content) == 1
+    assert isinstance(observation.content[0], TextContent)
+    assert observation.content[0].text == "Screenshot taken"
+    assert observation.is_error is False
     assert observation.screenshot_data == screenshot_data
 
 
 def test_browser_observation_to_llm_content_text_only():
     """Test to_llm_content property with text only."""
-    observation = BrowserObservation(output=[TextContent(text="Test output")])
+    observation = BrowserObservation(content=[TextContent(text="Test output")])
     agent_obs = observation.to_llm_content
 
     assert len(agent_obs) == 1
@@ -52,7 +52,7 @@ def test_browser_observation_to_llm_content_with_screenshot():
     """Test to_llm_content property with screenshot."""
     screenshot_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77zgAAAABJRU5ErkJggg=="  # noqa: E501
     observation = BrowserObservation(
-        output=[TextContent(text="Screenshot taken")], screenshot_data=screenshot_data
+        content=[TextContent(text="Screenshot taken")], screenshot_data=screenshot_data
     )
     agent_obs = observation.to_llm_content
 
@@ -67,19 +67,21 @@ def test_browser_observation_to_llm_content_with_screenshot():
 
 def test_browser_observation_to_llm_content_with_error():
     """Test to_llm_content property with error."""
-    observation = BrowserObservation(error="Test error")
+    observation = BrowserObservation(content="Test error", is_error=True)
     agent_obs = observation.to_llm_content
 
-    assert len(agent_obs) == 1
+    assert len(agent_obs) == 2
     assert isinstance(agent_obs[0], TextContent)
-    assert agent_obs[0].text == "Tool Execution Error: Test error"
+    assert agent_obs[0].text == "Tool Execution Error. "
+    assert isinstance(agent_obs[1], TextContent)
+    assert "Test error" in agent_obs[1].text
 
 
 def test_browser_observation_output_truncation():
     """Test output truncation for very long outputs."""
     # Create a very long output string
     long_output = "x" * 100000  # 100k characters
-    observation = BrowserObservation(output=[TextContent(text=long_output)])
+    observation = BrowserObservation(content=[TextContent(text=long_output)])
 
     agent_obs = observation.to_llm_content
 
@@ -94,7 +96,7 @@ def test_browser_observation_screenshot_data_url_conversion():
     """Test that screenshot data is properly converted to data URL."""
     screenshot_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77zgAAAABJRU5ErkJggg=="  # noqa: E501
     observation = BrowserObservation(
-        output=[TextContent(text="Test")], screenshot_data=screenshot_data
+        content=[TextContent(text="Test")], screenshot_data=screenshot_data
     )
 
     agent_obs = observation.to_llm_content
@@ -108,13 +110,13 @@ def test_browser_observation_screenshot_data_url_conversion():
 def test_browser_observation_empty_screenshot_handling():
     """Test handling of empty or None screenshot data."""
     observation = BrowserObservation(
-        output=[TextContent(text="Test")], screenshot_data=""
+        content=[TextContent(text="Test")], screenshot_data=""
     )
     agent_obs = observation.to_llm_content
     assert len(agent_obs) == 1  # Only text content, no image
 
     observation = BrowserObservation(
-        output=[TextContent(text="Test")], screenshot_data=None
+        content=[TextContent(text="Test")], screenshot_data=None
     )
     agent_obs = observation.to_llm_content
     assert len(agent_obs) == 1  # Only text content, no image
diff --git a/tests/tools/delegation/test_delegation.py b/tests/tools/delegation/test_delegation.py
index 6ab4114edd..aab044c9ee 100644
--- a/tests/tools/delegation/test_delegation.py
+++ b/tests/tools/delegation/test_delegation.py
@@ -68,10 +68,10 @@ def test_delegate_observation_creation():
     # Test spawn observation with string output
     spawn_observation = DelegateObservation(
         command="spawn",
-        output="spawn: Sub-agents created successfully",
+        content="spawn: Sub-agents created successfully",
     )
-    assert isinstance(spawn_observation.output, str)
-    assert spawn_observation.output == "spawn: Sub-agents created successfully"
+    assert isinstance(spawn_observation.content, str)
+    assert spawn_observation.content == "spawn: Sub-agents created successfully"
     # Verify to_llm_content returns TextContent
     llm_content = spawn_observation.to_llm_content
     assert len(llm_content) == 1
@@ -81,15 +81,15 @@ def test_delegate_observation_creation():
     # Test delegate observation with string output
     delegate_observation = DelegateObservation(
         command="delegate",
-        output=(
+        content=(
             "delegate: Tasks completed successfully\n\nResults:\n"
             "1. Result 1\n2. Result 2"
         ),
     )
-    assert isinstance(delegate_observation.output, str)
-    assert "Tasks completed successfully" in delegate_observation.output
-    assert "Result 1" in delegate_observation.output
-    assert "Result 2" in delegate_observation.output
+    assert isinstance(delegate_observation.content, str)
+    assert "Tasks completed successfully" in delegate_observation.content
+    assert "Result 1" in delegate_observation.content
+    assert "Result 2" in delegate_observation.content
     # Verify to_llm_content
     llm_content = delegate_observation.to_llm_content
     assert len(llm_content) == 1
@@ -104,8 +104,8 @@ def test_delegate_executor_delegate():
     # First spawn some agents
     spawn_action = DelegateAction(command="spawn", ids=["agent1", "agent2"])
     spawn_observation = executor(spawn_action, parent_conversation)
-    assert isinstance(spawn_observation.output, str)
-    assert "Successfully spawned" in spawn_observation.output
+    assert isinstance(spawn_observation.content, str)
+    assert "Successfully spawned" in spawn_observation.content
 
     # Then delegate tasks to them
     delegate_action = DelegateAction(
@@ -116,7 +116,7 @@ def test_delegate_executor_delegate():
     with patch.object(executor, "_delegate_tasks") as mock_delegate:
         mock_observation = DelegateObservation(
             command="delegate",
-            output=(
+            content=(
                 "delegate: Tasks completed successfully\n\nResults:\n"
                 "1. Agent agent1: Code analysis complete\n"
                 "2. Agent agent2: Tests written"
@@ -127,9 +127,9 @@ def test_delegate_executor_delegate():
         observation = executor(delegate_action, parent_conversation)
 
     assert isinstance(observation, DelegateObservation)
-    assert isinstance(observation.output, str)
-    assert "Agent agent1: Code analysis complete" in observation.output
-    assert "Agent agent2: Tests written" in observation.output
+    assert isinstance(observation.content, str)
+    assert "Agent agent1: Code analysis complete" in observation.content
+    assert "Agent agent2: Tests written" in observation.content
 
 
 def test_delegate_executor_missing_task():
@@ -143,11 +143,18 @@ def test_delegate_executor_missing_task():
 
     assert isinstance(observation, DelegateObservation)
     # Error message should be in the error field
-    assert observation.has_error
-    assert observation.error is not None
+    assert observation.is_error
+    assert observation.is_error is True
+    content_text = (
+        observation.content
+        if isinstance(observation.content, str)
+        else "".join(
+            [c.text for c in observation.content if isinstance(c, TextContent)]
+        )
+    )
     assert (
-        "task is required" in observation.error.lower()
-        or "at least one task" in observation.error.lower()
+        "task is required" in content_text.lower()
+        or "at least one task" in content_text.lower()
     )
 
 
diff --git a/tests/tools/execute_bash/conftest.py b/tests/tools/execute_bash/conftest.py
index e61ea22dc6..743eca4da9 100644
--- a/tests/tools/execute_bash/conftest.py
+++ b/tests/tools/execute_bash/conftest.py
@@ -13,16 +13,16 @@
 
 
 def get_output_text(obs: ExecuteBashObservation) -> str:
-    """Extract text from observation output field.
+    """Extract text from observation content field.
 
     This helper handles type-safe extraction of text from the observation's
-    output field, which can be a str or list of Content items.
+    content field, which can be a str or list of Content items.
     """
-    if isinstance(obs.output, str):
-        return obs.output
-    if not obs.output:
+    if isinstance(obs.content, str):
+        return obs.content
+    if not obs.content:
         return ""
-    first_item = obs.output[0]
+    first_item = obs.content[0]
     return first_item.text if isinstance(first_item, TextContent) else ""
 
 
diff --git a/tests/tools/execute_bash/test_bash_ps1_metadata.py b/tests/tools/execute_bash/test_bash_ps1_metadata.py
index 44d36c68cd..d33831f27a 100644
--- a/tests/tools/execute_bash/test_bash_ps1_metadata.py
+++ b/tests/tools/execute_bash/test_bash_ps1_metadata.py
@@ -275,13 +275,13 @@ def test_cmd_output_observation_properties():
     metadata = CmdOutputMetadata(exit_code=0, pid=123)
     obs = ExecuteBashObservation(
         command="ls",
-        output=[TextContent(text="file1\nfile2")],
+        content=[TextContent(text="file1\nfile2")],
         exit_code=0,
         metadata=metadata,
     )
     assert obs.command_id == 123
     assert obs.exit_code == 0
-    assert not obs.error
+    assert not obs.is_error
     assert len(obs.to_llm_content) == 1
     assert isinstance(obs.to_llm_content[0], TextContent)
     assert "exit code 0" in obs.to_llm_content[0].text
@@ -294,17 +294,18 @@ def test_cmd_output_observation_properties():
     obs = ExecuteBashObservation(
         command="invalid",
         exit_code=1,
-        error="Command failed",
+        content="Command failed",
+        is_error=True,
         metadata=metadata,
     )
     assert obs.command_id == 456
     assert obs.exit_code == 1
-    assert obs.has_error
-    assert len(obs.to_llm_content) == 1
+    assert obs.is_error
+    assert len(obs.to_llm_content) == 2
     assert isinstance(obs.to_llm_content[0], TextContent)
-    # When there's an error, only error message is returned
-    assert "Tool Execution Error: Command failed" == obs.to_llm_content[0].text
-    assert obs.has_error
+    assert obs.to_llm_content[0].text == "Tool Execution Error. "
+    assert isinstance(obs.to_llm_content[1], TextContent)
+    assert "Command failed" in obs.to_llm_content[1].text
 
 
 def test_ps1_metadata_empty_fields():
diff --git a/tests/tools/execute_bash/test_bash_session.py b/tests/tools/execute_bash/test_bash_session.py
index 5ae356480e..bc4ff492c2 100644
--- a/tests/tools/execute_bash/test_bash_session.py
+++ b/tests/tools/execute_bash/test_bash_session.py
@@ -319,15 +319,15 @@ def test_empty_command_error(terminal_type):
     # Test empty command without previous command
     obs = session.execute(ExecuteBashAction(command=""))
 
-    assert obs.has_error is True
-    assert not obs.output  # When there's an error, output should not be populated
-    assert obs.error == "No previous running command to retrieve logs from."
-    assert len(obs.to_llm_content) == 1
+    assert obs.is_error is True
+    assert obs.content == "No previous running command to retrieve logs from."
+    assert len(obs.to_llm_content) == 2
     assert isinstance(obs.to_llm_content[0], TextContent)
-    assert "Tool Execution Error:" in obs.to_llm_content[0].text
+    assert obs.to_llm_content[0].text == "Tool Execution Error. "
+    assert isinstance(obs.to_llm_content[1], TextContent)
     assert (
         "No previous running command to retrieve logs from."
-        in obs.to_llm_content[0].text
+        == obs.to_llm_content[1].text
     )
     assert obs.metadata.exit_code == -1
     assert obs.metadata.prefix == ""
@@ -518,7 +518,7 @@ def _run_bash_action(session, command: str, **kwargs):
     action = ExecuteBashAction(command=command, **kwargs)
     obs = session.execute(action)
     logger.info(f"Command: {command}")
-    output_text = get_output_text(obs) if obs.output else ""
+    output_text = get_output_text(obs) if obs.content else ""
     logger.info(f"Output: {output_text}")
     logger.info(f"Exit code: {obs.metadata.exit_code}")
     return obs
@@ -719,9 +719,8 @@ def test_multiple_multiline_commands(terminal_type):
 
             # First test that running multiple commands at once fails
             obs = _run_bash_action(session, joined_cmds)
-            assert obs.has_error is True
-            assert obs.error is not None
-            assert "Cannot execute multiple commands at once" in obs.error
+            assert obs.is_error is True
+            assert "Cannot execute multiple commands at once" in obs.content
 
             # Now run each command individually and verify they work
             results = []
diff --git a/tests/tools/glob/test_glob_executor.py b/tests/tools/glob/test_glob_executor.py
index e14ed8ac9e..a4c89f25ce 100644
--- a/tests/tools/glob/test_glob_executor.py
+++ b/tests/tools/glob/test_glob_executor.py
@@ -26,7 +26,7 @@ def test_glob_executor_basic_pattern():
         action = GlobAction(pattern="*.py")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.files) == 2
         assert all(f.endswith(".py") for f in observation.files)
         assert observation.pattern == "*.py"
@@ -49,7 +49,7 @@ def test_glob_executor_recursive_pattern():
         action = GlobAction(pattern="**/*.py")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.files) == 2
         assert all(f.endswith(".py") for f in observation.files)
 
@@ -70,7 +70,7 @@ def test_glob_executor_custom_path():
         action = GlobAction(pattern="*.txt", path=str(sub_dir))
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.files) == 2
         assert observation.search_path == str(sub_dir.resolve())
         assert all(str(sub_dir) in f for f in observation.files)
@@ -83,8 +83,8 @@ def test_glob_executor_invalid_path():
         action = GlobAction(pattern="*.py", path="/nonexistent/path")
         observation = executor(action)
 
-        assert observation.error is not None
-        assert "is not a valid directory" in observation.error
+        assert observation.is_error is True
+        assert "is not a valid directory" in observation.content
         assert len(observation.files) == 0
 
 
@@ -99,7 +99,7 @@ def test_glob_executor_no_matches():
         action = GlobAction(pattern="*.py")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.files) == 0
         assert not observation.truncated
 
@@ -116,7 +116,7 @@ def test_glob_executor_directories_excluded():
         action = GlobAction(pattern="*")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         # Should only find the file, not directories
         assert len(observation.files) == 1
         assert observation.files[0].endswith("file.txt")
@@ -143,7 +143,7 @@ def test_glob_executor_sorting():
         action = GlobAction(pattern="*.txt")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.files) == 3
 
         # Files should be sorted by modification time (newest first)
@@ -162,7 +162,7 @@ def test_glob_executor_truncation():
         action = GlobAction(pattern="*.txt")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.files) == 100
         assert observation.truncated is True
 
@@ -189,7 +189,7 @@ def test_glob_executor_complex_patterns():
         action = GlobAction(pattern="config.*")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.files) == 4  # All config files
         extensions = {Path(f).suffix for f in observation.files}
         assert extensions == {".json", ".yaml", ".yml", ".toml"}
@@ -206,7 +206,7 @@ def test_glob_executor_exception_handling():
         observation = executor(action)
 
         # Should not raise exception, even if there are no matches
-        assert observation.error is None or isinstance(observation.error, str)
+        assert observation.is_error is False or isinstance(observation.content, str)
         assert isinstance(observation.files, list)
 
 
@@ -220,7 +220,7 @@ def test_glob_executor_absolute_paths():
         action = GlobAction(pattern="*.py")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.files) == 1
 
         # Check that returned path is absolute
@@ -236,7 +236,7 @@ def test_glob_executor_empty_directory():
         action = GlobAction(pattern="*")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.files) == 0
         assert not observation.truncated
 
diff --git a/tests/tools/glob/test_glob_tool.py b/tests/tools/glob/test_glob_tool.py
index 3c3f69c374..46ad8519a6 100644
--- a/tests/tools/glob/test_glob_tool.py
+++ b/tests/tools/glob/test_glob_tool.py
@@ -85,7 +85,7 @@ def test_glob_tool_find_files():
         observation = tool.executor(action)
 
         assert isinstance(observation, GlobObservation)
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.files) == 3  # test.py, src/app.py, tests/test_main.py
         assert observation.pattern == "**/*.py"
         assert observation.search_path == str(Path(temp_dir).resolve())
@@ -119,7 +119,7 @@ def test_glob_tool_specific_directory():
         assert tool.executor is not None
         observation = tool.executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.files) == 2  # app.py, utils.py
         assert observation.search_path == str(src_dir.resolve())
 
@@ -143,7 +143,7 @@ def test_glob_tool_no_matches():
         assert tool.executor is not None
         observation = tool.executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.files) == 0
         assert observation.pattern == "**/*.py"
         assert not observation.truncated
@@ -161,8 +161,8 @@ def test_glob_tool_invalid_directory():
         assert tool.executor is not None
         observation = tool.executor(action)
 
-        assert observation.error is not None
-        assert "is not a valid directory" in observation.error
+        assert observation.is_error is True
+        assert "is not a valid directory" in observation.content
         assert len(observation.files) == 0
 
 
@@ -191,7 +191,7 @@ def test_glob_tool_complex_patterns():
         assert tool.executor is not None
         observation = tool.executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.files) == 4  # All config files
         assert observation.pattern == "config.*"
 
@@ -218,7 +218,7 @@ def test_glob_tool_directories_excluded():
         assert tool.executor is not None
         observation = tool.executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         # Should find all files recursively, but not directories
         assert len(observation.files) == 2  # app.py and src/utils.py
         # Verify both files are present
@@ -309,7 +309,7 @@ def test_glob_tool_truncation():
         assert tool.executor is not None
         observation = tool.executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.files) == 100  # Truncated to 100
         assert observation.truncated is True
 
diff --git a/tests/tools/grep/test_grep_executor.py b/tests/tools/grep/test_grep_executor.py
index 40ea9512f1..0f7be69c87 100644
--- a/tests/tools/grep/test_grep_executor.py
+++ b/tests/tools/grep/test_grep_executor.py
@@ -37,7 +37,7 @@ def test_grep_executor_basic_search():
         action = GrepAction(pattern="print")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.matches) == 2  # Two files containing "print"
         assert observation.pattern == "print"
         assert observation.search_path == str(Path(temp_dir).resolve())
@@ -59,7 +59,7 @@ def test_grep_executor_case_insensitive():
         action = GrepAction(pattern="print")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.matches) == 1  # File contains pattern (case-insensitive)
         assert "case_test.py" in observation.matches[0]
 
@@ -75,7 +75,7 @@ def test_grep_executor_include_filter():
         action = GrepAction(pattern="test", include="*.py")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.matches) == 1
         assert observation.matches[0].endswith(".py")
 
@@ -92,7 +92,7 @@ def test_grep_executor_custom_path():
         action = GrepAction(pattern="print", path=str(sub_dir))
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.matches) == 1
         assert observation.search_path == str(sub_dir.resolve())
         assert str(sub_dir) in str(observation.matches[0])
@@ -105,8 +105,8 @@ def test_grep_executor_invalid_path():
         action = GrepAction(pattern="test", path="/nonexistent/path")
         observation = executor(action)
 
-        assert observation.error is not None
-        assert "not a valid directory" in observation.error
+        assert observation.is_error is True
+        assert "not a valid directory" in observation.content
 
 
 def test_grep_executor_no_matches():
@@ -118,7 +118,7 @@ def test_grep_executor_no_matches():
         action = GrepAction(pattern="nonexistent")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.matches) == 0
 
 
@@ -132,7 +132,7 @@ def test_grep_executor_hidden_files_excluded():
         action = GrepAction(pattern="test")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.matches) == 1
         assert ".hidden" not in observation.matches[0]
 
@@ -155,7 +155,7 @@ def test_grep_executor_sorting():
         action = GrepAction(pattern="test")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.matches) == 2
         # Newest file should be first
         assert "new.py" in observation.matches[0]
@@ -173,7 +173,7 @@ def test_grep_executor_truncation():
         action = GrepAction(pattern="test")
         observation = executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.matches) == 100
         assert observation.truncated is True
 
@@ -185,5 +185,5 @@ def test_grep_executor_invalid_regex():
         action = GrepAction(pattern="[invalid")
         observation = executor(action)
 
-        assert observation.error is not None
-        assert "Invalid regex pattern" in observation.error
+        assert observation.is_error is True
+        assert "Invalid regex pattern" in observation.content
diff --git a/tests/tools/grep/test_grep_tool.py b/tests/tools/grep/test_grep_tool.py
index e5b6abc874..5373b5a50d 100644
--- a/tests/tools/grep/test_grep_tool.py
+++ b/tests/tools/grep/test_grep_tool.py
@@ -63,7 +63,7 @@ def test_grep_tool_basic_search():
         observation = tool.executor(action)
 
         assert isinstance(observation, GrepObservation)
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.matches) == 2  # Two files
         assert observation.pattern == "print"
         assert observation.search_path == str(Path(temp_dir).resolve())
@@ -89,7 +89,7 @@ def test_grep_tool_case_insensitive():
         assert tool.executor is not None
         observation = tool.executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.matches) == 1
 
 
@@ -107,7 +107,7 @@ def test_grep_tool_include_filter():
         assert tool.executor is not None
         observation = tool.executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.matches) == 1
         assert observation.matches[0].endswith(".py")
 
@@ -128,7 +128,7 @@ def test_grep_tool_specific_directory():
         assert tool.executor is not None
         observation = tool.executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.matches) == 1
         assert observation.search_path == str(src_dir.resolve())
         assert str(src_dir) in observation.matches[0]
@@ -147,7 +147,7 @@ def test_grep_tool_no_matches():
         assert tool.executor is not None
         observation = tool.executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.matches) == 0
         assert not observation.truncated
 
@@ -163,8 +163,8 @@ def test_grep_tool_invalid_regex():
         assert tool.executor is not None
         observation = tool.executor(action)
 
-        assert observation.error is not None
-        assert "Invalid regex pattern" in observation.error
+        assert observation.is_error is True
+        assert "Invalid regex pattern" in observation.content
 
 
 def test_grep_tool_invalid_directory():
@@ -178,8 +178,8 @@ def test_grep_tool_invalid_directory():
         assert tool.executor is not None
         observation = tool.executor(action)
 
-        assert observation.error is not None
-        assert "not a valid directory" in observation.error
+        assert observation.is_error is True
+        assert "not a valid directory" in observation.content
 
 
 def test_grep_tool_hidden_files_excluded():
@@ -196,7 +196,7 @@ def test_grep_tool_hidden_files_excluded():
         assert tool.executor is not None
         observation = tool.executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.matches) == 1
         assert ".hidden" not in observation.matches[0]
 
@@ -289,7 +289,7 @@ def test_grep_tool_truncation():
         assert tool.executor is not None
         observation = tool.executor(action)
 
-        assert observation.error is None
+        assert observation.is_error is False
         assert len(observation.matches) == 100
         assert observation.truncated is True
 

From 0b67da35f63f5e317c7e0ee70977fcf3307aa092 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 17:17:30 +0000
Subject: [PATCH 36/76] test: fix additional test failures after observation
 schema refactor

- Updated glob and grep tests to expect 2 items in to_llm_content when is_error=True
- Fixed grep consistency tests to use .is_error instead of .error
- Tests now properly handle the new error format, in which 'Tool Execution Error. '
  is emitted as a separate first content item ahead of the error message

Co-authored-by: openhands <openhands@all-hands.dev>
---
 tests/tools/glob/test_glob_tool.py   |  6 ++--
 tests/tools/grep/test_consistency.py | 48 ++++++++++++++--------------
 tests/tools/grep/test_grep_tool.py   |  5 +--
 3 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/tests/tools/glob/test_glob_tool.py b/tests/tools/glob/test_glob_tool.py
index 46ad8519a6..f3f0e27be7 100644
--- a/tests/tools/glob/test_glob_tool.py
+++ b/tests/tools/glob/test_glob_tool.py
@@ -287,9 +287,9 @@ def test_glob_tool_to_llm_content_error():
         observation = tool.executor(action)
 
         content = observation.to_llm_content
-        assert len(content) == 1
-        text_content = content[0].text
-        assert "Error:" in text_content
+        assert len(content) == 2
+        assert content[0].text == "Tool Execution Error. "
+        text_content = content[1].text
         assert "is not a valid directory" in text_content
 
 
diff --git a/tests/tools/grep/test_consistency.py b/tests/tools/grep/test_consistency.py
index 6c13ad361a..0c3b0a8eda 100644
--- a/tests/tools/grep/test_consistency.py
+++ b/tests/tools/grep/test_consistency.py
@@ -114,8 +114,8 @@ def test_basic_search_consistency(self, temp_dir_with_content):
         )
 
         # Both should succeed
-        assert not ripgrep_result.error
-        assert not fallback_result.error
+        assert not ripgrep_result.is_error
+        assert not fallback_result.is_error
 
         # Convert to sets of matching files for exact comparison
         ripgrep_matches = set(ripgrep_result.matches)
@@ -143,8 +143,8 @@ def test_case_insensitive_consistency(self, temp_dir_with_content):
         )
 
         # Both should succeed
-        assert not ripgrep_result.error
-        assert not fallback_result.error
+        assert not ripgrep_result.is_error
+        assert not fallback_result.is_error
 
         # Convert to sets for exact comparison
         ripgrep_matches = set(ripgrep_result.matches)
@@ -172,8 +172,8 @@ def test_include_pattern_consistency(self, temp_dir_with_content):
         )
 
         # Both should succeed
-        assert not ripgrep_result.error
-        assert not fallback_result.error
+        assert not ripgrep_result.is_error
+        assert not fallback_result.is_error
 
         # Convert to sets for exact comparison
         ripgrep_matches = set(ripgrep_result.matches)
@@ -205,8 +205,8 @@ def test_no_matches_consistency(self, temp_dir_with_content):
         )
 
         # Both should succeed with identical empty results
-        assert not ripgrep_result.error
-        assert not fallback_result.error
+        assert not ripgrep_result.is_error
+        assert not fallback_result.is_error
 
         # Convert to sets for exact comparison
         ripgrep_matches = set(ripgrep_result.matches)
@@ -229,8 +229,8 @@ def test_regex_pattern_consistency(self, temp_dir_with_content):
         )
 
         # Both should succeed
-        assert not ripgrep_result.error
-        assert not fallback_result.error
+        assert not ripgrep_result.is_error
+        assert not fallback_result.is_error
 
         # Convert to sets for exact comparison
         ripgrep_matches = set(ripgrep_result.matches)
@@ -258,8 +258,8 @@ def test_todo_comments_consistency(self, temp_dir_with_content):
         )
 
         # Both should succeed
-        assert not ripgrep_result.error
-        assert not fallback_result.error
+        assert not ripgrep_result.is_error
+        assert not fallback_result.is_error
 
         # Convert to sets for exact comparison
         ripgrep_matches = set(ripgrep_result.matches)
@@ -287,8 +287,8 @@ def test_error_patterns_consistency(self, temp_dir_with_content):
         )
 
         # Both should succeed
-        assert not ripgrep_result.error
-        assert not fallback_result.error
+        assert not ripgrep_result.is_error
+        assert not fallback_result.is_error
 
         # Convert to sets for exact comparison
         ripgrep_matches = set(ripgrep_result.matches)
@@ -316,8 +316,8 @@ def test_import_statements_consistency(self, temp_dir_with_content):
         )
 
         # Both should succeed
-        assert not ripgrep_result.error
-        assert not fallback_result.error
+        assert not ripgrep_result.is_error
+        assert not fallback_result.is_error
 
         # Convert to sets for exact comparison
         ripgrep_matches = set(ripgrep_result.matches)
@@ -345,8 +345,8 @@ def test_class_definitions_consistency(self, temp_dir_with_content):
         )
 
         # Both should succeed
-        assert not ripgrep_result.error
-        assert not fallback_result.error
+        assert not ripgrep_result.is_error
+        assert not fallback_result.is_error
 
         # Convert to sets for exact comparison
         ripgrep_matches = set(ripgrep_result.matches)
@@ -374,8 +374,8 @@ def test_deep_nested_search_consistency(self, temp_dir_with_content):
         )
 
         # Both should succeed
-        assert not ripgrep_result.error
-        assert not fallback_result.error
+        assert not ripgrep_result.is_error
+        assert not fallback_result.is_error
 
         # Convert to sets for exact comparison
         ripgrep_matches = set(ripgrep_result.matches)
@@ -409,8 +409,8 @@ def test_config_file_search_consistency(self, temp_dir_with_content):
             )
 
             # Both should succeed
-            assert not ripgrep_result.error
-            assert not fallback_result.error
+            assert not ripgrep_result.is_error
+            assert not fallback_result.is_error
 
             # Convert to sets for exact comparison
             ripgrep_matches = set(ripgrep_result.matches)
@@ -439,8 +439,8 @@ def test_hidden_files_search_consistency(self, temp_dir_with_content):
         )
 
         # Both should succeed
-        assert not ripgrep_result.error
-        assert not fallback_result.error
+        assert not ripgrep_result.is_error
+        assert not fallback_result.is_error
 
         # Convert to sets for exact comparison
         ripgrep_matches = set(ripgrep_result.matches)
diff --git a/tests/tools/grep/test_grep_tool.py b/tests/tools/grep/test_grep_tool.py
index 5373b5a50d..14199f8175 100644
--- a/tests/tools/grep/test_grep_tool.py
+++ b/tests/tools/grep/test_grep_tool.py
@@ -269,8 +269,9 @@ def test_grep_tool_to_llm_content_error():
         observation = tool.executor(action)
 
         content = observation.to_llm_content
-        text = content[0].text
-        assert "Error:" in text
+        assert len(content) == 2
+        assert content[0].text == "Tool Execution Error. "
+        text = content[1].text
         assert "Invalid regex pattern" in text
 
 

From d609bf1b35f812cc55d0b0632e6f835bbe5727f4 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 17:19:20 +0000
Subject: [PATCH 37/76] test: fix file_editor test helper to use content
 instead of output

- Updated get_output_text helper in conftest.py to access .content instead of .output
- Updated assert_successful_result to use .is_error instead of .error
- Updated assert_error_result to use .is_error and extract error from .content
- All file_editor encoding tests now passing

Co-authored-by: openhands <openhands@all-hands.dev>
---
 tests/tools/file_editor/conftest.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/tests/tools/file_editor/conftest.py b/tests/tools/file_editor/conftest.py
index 107b226b33..7a778d9427 100644
--- a/tests/tools/file_editor/conftest.py
+++ b/tests/tools/file_editor/conftest.py
@@ -57,7 +57,7 @@ def assert_successful_result(
 ):
     """Assert that a result is successful (no error)."""
     assert isinstance(result, FileEditorObservation)
-    assert result.error is None
+    assert not result.is_error
     if expected_path:
         assert result.path == expected_path
 
@@ -67,9 +67,14 @@ def assert_error_result(
 ):
     """Assert that a result contains an error."""
     assert isinstance(result, FileEditorObservation)
-    assert result.error is not None
+    assert result.is_error
     if expected_error_substring:
-        assert expected_error_substring in result.error
+        content_text = (
+            result.content
+            if isinstance(result.content, str)
+            else "".join([c.text for c in result.content if isinstance(c, TextContent)])
+        )
+        assert expected_error_substring in content_text
 
 
 def create_test_file(path: Path, content: str):
@@ -79,7 +84,7 @@ def create_test_file(path: Path, content: str):
 
 
 def get_output_text(result: FileEditorObservation) -> str:
-    """Extract text content from a FileEditorObservation's output."""
-    if isinstance(result.output, str):
-        return result.output
-    return "".join([c.text for c in result.output if isinstance(c, TextContent)])
+    """Extract text content from a FileEditorObservation's content."""
+    if isinstance(result.content, str):
+        return result.content
+    return "".join([c.text for c in result.content if isinstance(c, TextContent)])

From 23512252da68b589f29001a86764da1290e94841 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 17:45:54 +0000
Subject: [PATCH 38/76] test: update execute_bash and file_editor tests to use
 new observation API

- Replace output= with content= in ExecuteBashObservation
- Replace .error with .is_error checks
- Update error message assertions to use get_output_text()
- Remove the no-longer-used long_output variable from the with-error test in
  test_observation_truncation.py
- All execute_bash and file_editor tests now passing

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../test_observation_truncation.py            | 34 +++++++++----------
 .../execute_bash/test_secrets_masking.py      |  2 +-
 .../tools/file_editor/test_error_handling.py  | 29 ++++++++--------
 .../file_editor/test_file_editor_tool.py      |  8 ++---
 4 files changed, 36 insertions(+), 37 deletions(-)

diff --git a/tests/tools/execute_bash/test_observation_truncation.py b/tests/tools/execute_bash/test_observation_truncation.py
index 75fee8597e..3db0c8db3a 100644
--- a/tests/tools/execute_bash/test_observation_truncation.py
+++ b/tests/tools/execute_bash/test_observation_truncation.py
@@ -18,9 +18,8 @@ def test_execute_bash_observation_truncation_under_limit():
     )
 
     observation = ExecuteBashObservation(
-        output=[TextContent(text="Short output")],
+        content=[TextContent(text="Short output")],
         metadata=metadata,
-        error=None,
     )
 
     result = observation.to_llm_content
@@ -52,9 +51,8 @@ def test_execute_bash_observation_truncation_over_limit():
     long_output = "A" * (MAX_CMD_OUTPUT_SIZE + 1000)
 
     observation = ExecuteBashObservation(
-        output=[TextContent(text=long_output)],
+        content=[TextContent(text=long_output)],
         metadata=metadata,
-        error=None,
     )
 
     result = observation.to_llm_content
@@ -85,22 +83,26 @@ def test_execute_bash_observation_truncation_with_error():
         pid=123,
     )
 
-    # Create output that exceeds the limit
-    long_output = "B" * (MAX_CMD_OUTPUT_SIZE + 500)
-
     observation = ExecuteBashObservation(
-        output=[TextContent(text=long_output)],
+        content="Command failed",
         metadata=metadata,
-        error="Command failed",
+        is_error=True,
     )
 
     result = observation.to_llm_content
-    assert len(result) == 1
+    assert len(result) == 2
     assert isinstance(result[0], TextContent)
-    result = result[0].text
+    assert isinstance(result[1], TextContent)
+
+    # First part is the error prefix
+    assert result[0].text == "Tool Execution Error. "
 
-    # When there's an error, only the error message is returned
-    assert result == "Tool Execution Error: Command failed"
+    # Second part includes the error message with metadata
+    full_text = result[1].text
+    assert "Command failed" in full_text
+    assert "[Current working directory: /test]" in full_text
+    assert "[Python interpreter: /usr/bin/python]" in full_text
+    assert "[Command finished with exit code 1]" in full_text
 
 
 def test_execute_bash_observation_truncation_exact_limit():
@@ -124,9 +126,8 @@ def test_execute_bash_observation_truncation_exact_limit():
     exact_output = "C" * exact_output_size
 
     observation = ExecuteBashObservation(
-        output=[TextContent(text=exact_output)],
+        content=[TextContent(text=exact_output)],
         metadata=metadata,
-        error=None,
     )
 
     result = observation.to_llm_content
@@ -154,9 +155,8 @@ def test_execute_bash_observation_truncation_with_prefix_suffix():
     long_output = "D" * (MAX_CMD_OUTPUT_SIZE + 200)
 
     observation = ExecuteBashObservation(
-        output=[TextContent(text=long_output)],
+        content=[TextContent(text=long_output)],
         metadata=metadata,
-        error=None,
     )
 
     result = observation.to_llm_content
diff --git a/tests/tools/execute_bash/test_secrets_masking.py b/tests/tools/execute_bash/test_secrets_masking.py
index faaf4ae7dd..e546c211eb 100644
--- a/tests/tools/execute_bash/test_secrets_masking.py
+++ b/tests/tools/execute_bash/test_secrets_masking.py
@@ -64,7 +64,7 @@ def test_bash_executor_with_conversation_secrets():
             mock_observation = ExecuteBashObservation(
                 command="echo 'Token: $SECRET_TOKEN, Key: $API_KEY'",
                 exit_code=0,
-                output=[
+                content=[
                     TextContent(text="Token: secret-value-123, Key: another-secret-456")
                 ],
             )
diff --git a/tests/tools/file_editor/test_error_handling.py b/tests/tools/file_editor/test_error_handling.py
index 64924be03c..655daa3263 100644
--- a/tests/tools/file_editor/test_error_handling.py
+++ b/tests/tools/file_editor/test_error_handling.py
@@ -12,7 +12,7 @@ def test_validation_error_formatting():
         path="/nonexistent/file.txt",
     )
     assert_error_result(result)
-    assert result.error is not None and "does not exist" in result.error
+    assert result.is_error and "does not exist" in get_output_text(result)
 
     # Test directory validation for non-view commands
     result = file_editor(
@@ -23,8 +23,8 @@ def test_validation_error_formatting():
     )
     assert_error_result(result)
     assert (
-        result.error is not None
-        and "directory and only the `view` command" in result.error
+        result.is_error
+        and "directory and only the `view` command" in get_output_text(result)
     )
 
 
@@ -43,7 +43,7 @@ def test_str_replace_error_handling(temp_file):
         new_str="something",
     )
     assert_error_result(result)
-    assert result.error is not None and "did not appear verbatim" in result.error
+    assert result.is_error and "did not appear verbatim" in get_output_text(result)
 
     # Test multiple occurrences
     with open(temp_file, "w") as f:
@@ -56,8 +56,8 @@ def test_str_replace_error_handling(temp_file):
         new_str="new_line",
     )
     assert_error_result(result)
-    assert result.error is not None and "Multiple occurrences" in result.error
-    assert result.error is not None and "lines [1, 2]" in result.error
+    assert result.is_error and "Multiple occurrences" in get_output_text(result)
+    assert result.is_error and "lines [1, 2]" in get_output_text(result)
 
 
 def test_view_range_validation(temp_file):
@@ -74,8 +74,8 @@ def test_view_range_validation(temp_file):
         view_range=[1],  # Should be [start, end]
     )
     assert_error_result(result)
-    assert (
-        result.error is not None and "should be a list of two integers" in result.error
+    assert result.is_error and "should be a list of two integers" in get_output_text(
+        result
     )
 
     # Test out of bounds range: should clamp to file end and show a warning
@@ -85,7 +85,7 @@ def test_view_range_validation(temp_file):
         view_range=[1, 10],  # File only has 3 lines
     )
     # This should succeed but show a warning
-    assert result.error is None
+    assert not result.is_error
     assert (
         "NOTE: We only show up to 3 since there're only 3 lines in this file."
         in get_output_text(result)
@@ -98,9 +98,8 @@ def test_view_range_validation(temp_file):
         view_range=[3, 1],  # End before start
     )
     assert_error_result(result)
-    assert (
-        result.error is not None
-        and "should be greater than or equal to" in result.error
+    assert result.is_error and "should be greater than or equal to" in get_output_text(
+        result
     )
 
 
@@ -119,7 +118,7 @@ def test_insert_validation(temp_file):
         new_str="new line",
     )
     assert_error_result(result)
-    assert result.error is not None and "should be within the range" in result.error
+    assert result.is_error and "should be within the range" in get_output_text(result)
 
     # Test insert beyond file length
     result = file_editor(
@@ -129,7 +128,7 @@ def test_insert_validation(temp_file):
         new_str="new line",
     )
     assert_error_result(result)
-    assert result.error is not None and "should be within the range" in result.error
+    assert result.is_error and "should be within the range" in get_output_text(result)
 
 
 def test_undo_validation(temp_file):
@@ -145,4 +144,4 @@ def test_undo_validation(temp_file):
         path=temp_file,
     )
     assert_error_result(result)
-    assert result.error is not None and "No edit history found" in result.error
+    assert result.is_error and "No edit history found" in get_output_text(result)
diff --git a/tests/tools/file_editor/test_file_editor_tool.py b/tests/tools/file_editor/test_file_editor_tool.py
index c43c5036d5..700010544d 100644
--- a/tests/tools/file_editor/test_file_editor_tool.py
+++ b/tests/tools/file_editor/test_file_editor_tool.py
@@ -65,7 +65,7 @@ def test_file_editor_tool_create_file():
         # Check the result
         assert result is not None
         assert isinstance(result, FileEditorObservation)
-        assert not result.error
+        assert not result.is_error
         assert os.path.exists(test_file)
 
         # Check file contents
@@ -96,7 +96,7 @@ def test_file_editor_tool_view_file():
         # Check the result
         assert result is not None
         assert isinstance(result, FileEditorObservation)
-        assert not result.error
+        assert not result.is_error
         assert "Line 1" in get_output_text(result)
         assert "Line 2" in get_output_text(result)
         assert "Line 3" in get_output_text(result)
@@ -129,7 +129,7 @@ def test_file_editor_tool_str_replace():
         # Check the result
         assert result is not None
         assert isinstance(result, FileEditorObservation)
-        assert not result.error
+        assert not result.is_error
 
         # Check file contents
         with open(test_file) as f:
@@ -179,7 +179,7 @@ def test_file_editor_tool_view_directory():
         # Check the result
         assert result is not None
         assert isinstance(result, FileEditorObservation)
-        assert not result.error
+        assert not result.is_error
         assert "file1.txt" in get_output_text(result)
         assert "file2.txt" in get_output_text(result)
 

From b6ad77498e42fb3454b5acab2c5b0e09970808e5 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 17:50:33 +0000
Subject: [PATCH 39/76] test: update browser_use and stuck_detector tests to
 use new observation API

- Change output= to content= in all observation instantiations
- Change .error to .content for error message extraction
- Change .error is None to not .is_error for error checks
- Update get_output_text helper to use .content instead of .output

Co-authored-by: openhands <openhands@all-hands.dev>
---
 tests/cross/test_stuck_detector.py            | 14 ++++---
 .../browser_use/test_browser_executor.py      |  6 +--
 .../browser_use/test_browser_executor_e2e.py  | 42 +++++++++----------
 3 files changed, 32 insertions(+), 30 deletions(-)

diff --git a/tests/cross/test_stuck_detector.py b/tests/cross/test_stuck_detector.py
index e3d9a30e8d..6072e33d22 100644
--- a/tests/cross/test_stuck_detector.py
+++ b/tests/cross/test_stuck_detector.py
@@ -59,7 +59,9 @@ def test_history_too_short():
     observation = ObservationEvent(
         source="environment",
         observation=ExecuteBashObservation(
-            output=[TextContent(text="file1.txt\nfile2.txt")], command="ls", exit_code=0
+            content=[TextContent(text="file1.txt\nfile2.txt")],
+            command="ls",
+            exit_code=0,
         ),
         action_id=action.id,
         tool_name="execute_bash",
@@ -108,7 +110,7 @@ def test_repeating_action_observation_not_stuck_less_than_4_repeats():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                output=[TextContent(text="file1.txt\nfile2.txt")],
+                content=[TextContent(text="file1.txt\nfile2.txt")],
                 command="ls",
                 exit_code=0,
             ),
@@ -159,7 +161,7 @@ def test_repeating_action_observation_stuck():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                output=[TextContent(text="file1.txt\nfile2.txt")],
+                content=[TextContent(text="file1.txt\nfile2.txt")],
                 command="ls",
                 exit_code=0,
             ),
@@ -302,7 +304,7 @@ def test_not_stuck_with_different_actions():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                output=[TextContent(text=f"output from {cmd}")],
+                content=[TextContent(text=f"output from {cmd}")],
                 command=cmd,
                 exit_code=0,
             ),
@@ -353,7 +355,7 @@ def test_reset_after_user_message():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                output=[TextContent(text="file1.txt\nfile2.txt")],
+                content=[TextContent(text="file1.txt\nfile2.txt")],
                 command="ls",
                 exit_code=0,
             ),
@@ -398,7 +400,7 @@ def test_reset_after_user_message():
     observation = ObservationEvent(
         source="environment",
         observation=ExecuteBashObservation(
-            output=[TextContent(text="/home/user")], command="pwd", exit_code=0
+            content=[TextContent(text="/home/user")], command="pwd", exit_code=0
         ),
         action_id=action.id,
         tool_name="execute_bash",
diff --git a/tests/tools/browser_use/test_browser_executor.py b/tests/tools/browser_use/test_browser_executor.py
index 161a403554..35d72b78b9 100644
--- a/tests/tools/browser_use/test_browser_executor.py
+++ b/tests/tools/browser_use/test_browser_executor.py
@@ -74,7 +74,7 @@ async def test_browser_executor_action_routing_get_state(
 ):
     """Test that get_state actions are routed correctly and return directly."""
     expected_observation = BrowserObservation(
-        output=[TextContent(text="State retrieved")], screenshot_data="base64data"
+        content=[TextContent(text="State retrieved")], screenshot_data="base64data"
     )
     mock_get_state.return_value = expected_observation
 
@@ -106,7 +106,7 @@ async def test_browser_executor_error_wrapping(mock_navigate, mock_browser_execu
     result = await mock_browser_executor._execute_action(action)
 
     assert_browser_observation_error(result, "Browser operation failed")
-    assert "Browser error occurred" in result.error
+    assert "Browser error occurred" in result.content
 
 
 def test_browser_executor_async_execution(mock_browser_executor):
@@ -114,7 +114,7 @@ def test_browser_executor_async_execution(mock_browser_executor):
     with patch.object(
         mock_browser_executor, "_execute_action", new_callable=AsyncMock
     ) as mock_execute:
-        expected_result = BrowserObservation(output=[TextContent(text="Test result")])
+        expected_result = BrowserObservation(content=[TextContent(text="Test result")])
         mock_execute.return_value = expected_result
 
         action = BrowserNavigateAction(url="https://example.com")
diff --git a/tests/tools/browser_use/test_browser_executor_e2e.py b/tests/tools/browser_use/test_browser_executor_e2e.py
index a7d0f0b54e..26a6bf48e5 100644
--- a/tests/tools/browser_use/test_browser_executor_e2e.py
+++ b/tests/tools/browser_use/test_browser_executor_e2e.py
@@ -24,10 +24,10 @@
 
 
 def get_output_text(observation: BrowserObservation) -> str:
-    """Extract text from observation output."""
-    if isinstance(observation.output, str):
-        return observation.output
-    return "".join([c.text for c in observation.output if isinstance(c, TextContent)])
+    """Extract text from observation content."""
+    if isinstance(observation.content, str):
+        return observation.content
+    return "".join([c.text for c in observation.content if isinstance(c, TextContent)])
 
 
 # Test HTML content for browser operations
@@ -179,7 +179,7 @@ def test_navigate_action(
         result = browser_executor(action)
 
         assert isinstance(result, BrowserObservation)
-        assert result.error is None
+        assert not result.is_error
         output_text = get_output_text(result).lower()
         assert "successfully" in output_text or "navigated" in output_text
 
@@ -196,7 +196,7 @@ def test_get_state_action(
         result = browser_executor(action)
 
         assert isinstance(result, BrowserObservation)
-        assert result.error is None
+        assert not result.is_error
         assert "Browser Test Page" in get_output_text(result)
 
     def test_get_state_with_screenshot(
@@ -212,7 +212,7 @@ def test_get_state_with_screenshot(
         result = browser_executor(action)
 
         assert isinstance(result, BrowserObservation)
-        assert result.error is None
+        assert not result.is_error
         assert result.screenshot_data is not None
         assert len(result.screenshot_data) > 0
 
@@ -237,7 +237,7 @@ def test_click_action(
         result = browser_executor(click_action)
 
         assert isinstance(result, BrowserObservation)
-        assert result.error is None
+        assert not result.is_error
 
     def test_type_action(self, browser_executor: BrowserToolExecutor, test_server: str):
         """Test typing text into an input field."""
@@ -259,7 +259,7 @@ def test_type_action(self, browser_executor: BrowserToolExecutor, test_server: s
         result = browser_executor(type_action)
 
         assert isinstance(result, BrowserObservation)
-        assert result.error is None
+        assert not result.is_error
 
     def test_scroll_action(
         self, browser_executor: BrowserToolExecutor, test_server: str
@@ -274,14 +274,14 @@ def test_scroll_action(
         result = browser_executor(scroll_action)
 
         assert isinstance(result, BrowserObservation)
-        assert result.error is None
+        assert not result.is_error
 
         # Scroll back up
         scroll_up_action = BrowserScrollAction(direction="up")
         result = browser_executor(scroll_up_action)
 
         assert isinstance(result, BrowserObservation)
-        assert result.error is None
+        assert not result.is_error
 
     def test_get_content_action(
         self, browser_executor: BrowserToolExecutor, test_server: str
@@ -296,7 +296,7 @@ def test_get_content_action(
         result = browser_executor(content_action)
 
         assert isinstance(result, BrowserObservation)
-        assert result.error is None
+        assert not result.is_error
         assert "Browser Test Page" in get_output_text(result)
 
         # Get content with links
@@ -306,7 +306,7 @@ def test_get_content_action(
         result = browser_executor(content_with_links_action)
 
         assert isinstance(result, BrowserObservation)
-        assert result.error is None
+        assert not result.is_error
         assert "Browser Test Page" in get_output_text(result)
 
     def test_navigate_new_tab(
@@ -318,7 +318,7 @@ def test_navigate_new_tab(
         result = browser_executor(action)
 
         assert isinstance(result, BrowserObservation)
-        assert result.error is None
+        assert not result.is_error
 
     def test_list_tabs_action(
         self, browser_executor: BrowserToolExecutor, test_server: str
@@ -333,7 +333,7 @@ def test_list_tabs_action(
         result = browser_executor(list_tabs_action)
 
         assert isinstance(result, BrowserObservation)
-        assert result.error is None
+        assert not result.is_error
         # Should contain tab information
         assert len(get_output_text(result)) > 0
 
@@ -355,7 +355,7 @@ def test_go_back_action(
         result = browser_executor(back_action)
 
         assert isinstance(result, BrowserObservation)
-        assert result.error is None
+        assert not result.is_error
 
     def test_switch_tab_action(
         self, browser_executor: BrowserToolExecutor, test_server: str
@@ -435,24 +435,24 @@ def test_concurrent_actions(
         """Test that multiple actions can be executed in sequence."""
         # Navigate
         navigate_result = browser_executor(BrowserNavigateAction(url=test_server))
-        assert navigate_result.error is None
+        assert not navigate_result.is_error
 
         # Get state
         state_result = browser_executor(BrowserGetStateAction(include_screenshot=False))
-        assert state_result.error is None
+        assert not state_result.is_error
 
         # Scroll
         scroll_result = browser_executor(BrowserScrollAction(direction="down"))
-        assert scroll_result.error is None
+        assert not scroll_result.is_error
 
         # Get content
         content_result = browser_executor(
             BrowserGetContentAction(extract_links=False, start_from_char=0)
         )
-        assert content_result.error is None
+        assert not content_result.is_error
 
         # All actions should complete successfully
         assert all(
-            result.error is None
+            not result.is_error
             for result in [navigate_result, state_result, scroll_result, content_result]
         )

From dfc918f21db1534211d71b305e78356d8c6bc6cd Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 18:03:30 +0000
Subject: [PATCH 40/76] refactor: simplify MCPToolObservation creation with
 single return path

- Remove conditional logic for error vs success cases
- Always return content as list for consistency
- Set is_error flag directly from result.isError
- Update test to check content as list for both success and error cases

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/mcp/definition.py | 26 ++++---------------
 tests/sdk/mcp/test_mcp_tool.py                |  7 +++--
 2 files changed, 10 insertions(+), 23 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/mcp/definition.py b/openhands-sdk/openhands/sdk/mcp/definition.py
index 932cf83652..4a0d3ffc17 100644
--- a/openhands-sdk/openhands/sdk/mcp/definition.py
+++ b/openhands-sdk/openhands/sdk/mcp/definition.py
@@ -80,27 +80,11 @@ def from_call_tool_result(
         # Prepend initial message to content
         content_with_header = [TextContent(text=initial_message)] + converted_content
 
-        # Populate content field and is_error flag based on result status
-        if result.isError:
-            # When there is an error, populate content field with all content
-            # and set is_error=True
-            return cls(
-                content="\n".join(
-                    [initial_message]
-                    + [
-                        c.text if isinstance(c, TextContent) else "[Image]"
-                        for c in converted_content
-                    ]
-                ),
-                is_error=True,
-                tool_name=tool_name,
-            )
-        else:
-            # When success, populate content field only
-            return cls(
-                content=content_with_header,
-                tool_name=tool_name,
-            )
+        return cls(
+            content=content_with_header,
+            is_error=result.isError,
+            tool_name=tool_name,
+        )
 
     @property
     def visualize(self) -> Text:
diff --git a/tests/sdk/mcp/test_mcp_tool.py b/tests/sdk/mcp/test_mcp_tool.py
index 79ec2d05ce..a496332c78 100644
--- a/tests/sdk/mcp/test_mcp_tool.py
+++ b/tests/sdk/mcp/test_mcp_tool.py
@@ -58,8 +58,11 @@ def test_from_call_tool_result_error(self):
 
         assert observation.tool_name == "test_tool"
         assert observation.is_error is True
-        assert "[Tool 'test_tool' executed.]" in observation.content
-        assert "Operation failed" in observation.content
+        assert len(observation.content) == 2
+        assert isinstance(observation.content[0], TextContent)
+        assert observation.content[0].text == "[Tool 'test_tool' executed.]"
+        assert isinstance(observation.content[1], TextContent)
+        assert observation.content[1].text == "Operation failed"
 
     def test_from_call_tool_result_with_image(self):
         """Test creating observation from MCP result with image content."""

From 38841bef556cf8be4db7afecbb55487ddb4b9f48 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 18:07:15 +0000
Subject: [PATCH 41/76] refactor: simplify MCPToolObservation.visualize using
 assert for type narrowing

- Remove unnecessary isinstance checks for str content type
- MCPToolObservation always has content as a list
- Use assert for type narrowing to help the type checker
- Restore simpler implementation from main branch

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/mcp/definition.py | 44 ++++++-------------
 1 file changed, 14 insertions(+), 30 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/mcp/definition.py b/openhands-sdk/openhands/sdk/mcp/definition.py
index 4a0d3ffc17..84b6deee6f 100644
--- a/openhands-sdk/openhands/sdk/mcp/definition.py
+++ b/openhands-sdk/openhands/sdk/mcp/definition.py
@@ -89,38 +89,22 @@ def from_call_tool_result(
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation of this observation."""
+        # MCPToolObservation always has content as a list
+        assert isinstance(self.content, list)
+
         content_obj = Text()
         content_obj.append(f"[MCP Tool '{self.tool_name}' Observation]\n", style="bold")
-
         if self.is_error:
             content_obj.append("[Error during execution]\n", style="bold red")
-            # Handle both str and list types for content
-            if isinstance(self.content, str):
-                content_obj.append(self.content + "\n")
-            else:
-                for block in self.content:
-                    if isinstance(block, TextContent):
-                        content_obj.append(block.text + "\n")
-                    elif isinstance(block, ImageContent):
-                        content_obj.append(
-                            f"[Image with {len(block.image_urls)} URLs]\n"
-                        )
-        elif self.content:
-            # Display all content blocks
-            if isinstance(self.content, str):
-                content_obj.append(self.content + "\n")
-            else:
-                for block in self.content:
-                    if isinstance(block, TextContent):
-                        # Try to parse as JSON for better display
-                        try:
-                            parsed = json.loads(block.text)
-                            content_obj.append(display_dict(parsed))
-                        except (json.JSONDecodeError, TypeError):
-                            content_obj.append(block.text + "\n")
-                    elif isinstance(block, ImageContent):
-                        content_obj.append(
-                            f"[Image with {len(block.image_urls)} URLs]\n"
-                        )
-
+        for block in self.content:
+            if isinstance(block, TextContent):
+                # try to see if block.text is a JSON
+                try:
+                    parsed = json.loads(block.text)
+                    content_obj.append(display_dict(parsed))
+                    continue
+                except (json.JSONDecodeError, TypeError):
+                    content_obj.append(block.text + "\n")
+            elif isinstance(block, ImageContent):
+                content_obj.append(f"[Image with {len(block.image_urls)} URLs]\n")
         return content_obj

From 7b5ff64372f6db637008a019ff7f5bad06d5c02c Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 18:13:07 +0000
Subject: [PATCH 42/76] feat: add configurable error_message_header to
 Observation base class

Add error_message_header field to Observation base class with default value
'Tool Execution Error. ', allowing subclasses to customize the error prefix
when is_error is True.

Updated BrowserObservation and ExecuteBashObservation to use
self.error_message_header instead of hardcoding the error message prefix.

This provides flexibility for tools that need different error message formats
while maintaining backward compatibility with the default behavior.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/tool/schema.py                 | 6 +++++-
 openhands-tools/openhands/tools/browser_use/definition.py  | 2 +-
 openhands-tools/openhands/tools/execute_bash/definition.py | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index e7377812f1..9c596fae04 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -201,6 +201,10 @@ class Observation(Schema, ABC):
     is_error: bool = Field(
         default=False, description="Whether the observation indicates an error"
     )
+    error_message_header: str = Field(
+        default="Tool Execution Error. ",
+        description="Header prepended to content when is_error is True",
+    )
 
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
@@ -212,7 +216,7 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
 
         # If is_error is true, prepend error message
         if self.is_error:
-            llm_content.append(TextContent(text="Tool Execution Error. "))
+            llm_content.append(TextContent(text=self.error_message_header))
 
         # Add content
         if self.content:
diff --git a/openhands-tools/openhands/tools/browser_use/definition.py b/openhands-tools/openhands/tools/browser_use/definition.py
index 63e059d82a..80be172a3c 100644
--- a/openhands-tools/openhands/tools/browser_use/definition.py
+++ b/openhands-tools/openhands/tools/browser_use/definition.py
@@ -37,7 +37,7 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
 
         # If is_error is true, prepend error message
         if self.is_error:
-            llm_content.append(TextContent(text="Tool Execution Error. "))
+            llm_content.append(TextContent(text=self.error_message_header))
 
         # Extract text from content (handle both str and list types)
         if isinstance(self.content, str):
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index dcaa435a97..31319909a6 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -106,7 +106,7 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
 
         # If is_error is true, prepend error message
         if self.is_error:
-            llm_content.append(TextContent(text="Tool Execution Error. "))
+            llm_content.append(TextContent(text=self.error_message_header))
 
         # Handle both str and list types for content
         if isinstance(self.content, str):

From e17c51511d0914f680876ea2f4af7903ddfea87b Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 18:47:03 +0000
Subject: [PATCH 43/76] fix: resolve merge conflicts and simplify observation
 implementations

- Update examples and tests to use 'content' instead of deprecated 'output' field
- Simplify ExecuteBashObservation and BrowserObservation by removing unnecessary isinstance checks
- Add assertions to document that content is always str in these subclasses
- Update PlanningFileEditorObservation to use correct field names (content, is_error)
- Remove field overrides in mock observations to avoid Pydantic narrowing issues

Co-authored-by: openhands <openhands@all-hands.dev>
---
 examples/01_standalone_sdk/02_custom_tools.py |  4 ++--
 .../openhands/tools/browser_use/definition.py | 10 +++------
 .../tools/execute_bash/definition.py          | 22 +++++--------------
 .../tools/planning_file_editor/impl.py        |  5 +++--
 .../sdk/context/test_view_batch_atomicity.py  |  2 +-
 tests/sdk/conversation/test_visualizer.py     |  3 +--
 tests/sdk/event/test_event_serialization.py   |  3 +--
 tests/tools/file_editor/test_memory_usage.py  |  7 +++++-
 8 files changed, 23 insertions(+), 33 deletions(-)

diff --git a/examples/01_standalone_sdk/02_custom_tools.py b/examples/01_standalone_sdk/02_custom_tools.py
index 8d89c9be8a..74f987c2d4 100644
--- a/examples/01_standalone_sdk/02_custom_tools.py
+++ b/examples/01_standalone_sdk/02_custom_tools.py
@@ -93,8 +93,8 @@ def __call__(self, action: GrepAction, conversation=None) -> GrepObservation:  #
         files: set[str] = set()
 
         # grep returns exit code 1 when no matches; treat as empty
-        assert isinstance(result.output, str)
-        output_text = result.output
+        assert isinstance(result.content, str)
+        output_text = result.content
 
         if output_text.strip():
             for line in output_text.strip().splitlines():
diff --git a/openhands-tools/openhands/tools/browser_use/definition.py b/openhands-tools/openhands/tools/browser_use/definition.py
index 80be172a3c..3066fdb6dd 100644
--- a/openhands-tools/openhands/tools/browser_use/definition.py
+++ b/openhands-tools/openhands/tools/browser_use/definition.py
@@ -39,13 +39,9 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.is_error:
             llm_content.append(TextContent(text=self.error_message_header))
 
-        # Extract text from content (handle both str and list types)
-        if isinstance(self.content, str):
-            content_text = self.content
-        else:
-            content_text = "".join(
-                [c.text for c in self.content if isinstance(c, TextContent)]
-            )
+        # BrowserObservation always has content as str
+        assert isinstance(self.content, str)
+        content_text = self.content
 
         llm_content.append(
             TextContent(text=maybe_truncate(content_text, MAX_BROWSER_OUTPUT_SIZE))
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index 31319909a6..d693a24888 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -108,14 +108,9 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.is_error:
             llm_content.append(TextContent(text=self.error_message_header))
 
-        # Handle both str and list types for content
-        if isinstance(self.content, str):
-            content_text = self.content
-        else:
-            first_item = self.content[0] if self.content else None
-            content_text = (
-                first_item.text if isinstance(first_item, TextContent) else ""
-            )
+        # ExecuteBashObservation always has content as str
+        assert isinstance(self.content, str)
+        content_text = self.content
 
         ret = f"{self.metadata.prefix}{content_text}{self.metadata.suffix}"
         if self.metadata.working_dir:
@@ -138,14 +133,9 @@ def visualize(self) -> Text:
             content_obj.append("❌ ", style="red bold")
             content_obj.append("Command execution error\n", style="red")
 
-        # Add command output with proper styling
-        if isinstance(self.content, str):
-            content_text = self.content
-        else:
-            first_item = self.content[0] if self.content else None
-            content_text = (
-                first_item.text if isinstance(first_item, TextContent) else ""
-            )
+        # ExecuteBashObservation always has content as str
+        assert isinstance(self.content, str)
+        content_text = self.content
 
         if content_text:
             # Style the output based on content
diff --git a/openhands-tools/openhands/tools/planning_file_editor/impl.py b/openhands-tools/openhands/tools/planning_file_editor/impl.py
index 1b2f45a43c..20ba166e9d 100644
--- a/openhands-tools/openhands/tools/planning_file_editor/impl.py
+++ b/openhands-tools/openhands/tools/planning_file_editor/impl.py
@@ -60,6 +60,7 @@ def __call__(
         # Convert FileEditorObservation to PlanningFileEditorObservation
         return PlanningFileEditorObservation(
             command=action.command,
-            output=file_editor_obs.output,
-            error=file_editor_obs.error,
+            content=file_editor_obs.content,
+            is_error=file_editor_obs.is_error,
+            path=file_editor_obs.path,
         )
diff --git a/tests/sdk/context/test_view_batch_atomicity.py b/tests/sdk/context/test_view_batch_atomicity.py
index 8144f664c1..ac37464cef 100644
--- a/tests/sdk/context/test_view_batch_atomicity.py
+++ b/tests/sdk/context/test_view_batch_atomicity.py
@@ -56,7 +56,7 @@ def create_observation_event(
 ) -> ObservationEvent:
     """Helper to create an ObservationEvent."""
     observation = MCPToolObservation(
-        output=[TextContent(text=content)],
+        content=[TextContent(text=content)],
         tool_name=tool_name,
     )
     return ObservationEvent(
diff --git a/tests/sdk/conversation/test_visualizer.py b/tests/sdk/conversation/test_visualizer.py
index 7b479c2c0d..209ec6308b 100644
--- a/tests/sdk/conversation/test_visualizer.py
+++ b/tests/sdk/conversation/test_visualizer.py
@@ -153,10 +153,9 @@ def test_observation_event_visualize():
     from openhands.sdk.tool import Observation
 
     class VisualizerMockObservation(Observation):
-        content: str = "Command output"
-
         @property
         def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
+            assert isinstance(self.content, str)
             return [TextContent(text=self.content)]
 
     observation = VisualizerMockObservation(
diff --git a/tests/sdk/event/test_event_serialization.py b/tests/sdk/event/test_event_serialization.py
index fd3a3d17ee..bbb82a799b 100644
--- a/tests/sdk/event/test_event_serialization.py
+++ b/tests/sdk/event/test_event_serialization.py
@@ -38,10 +38,9 @@ def execute(self) -> "EventsSerializationMockObservation":
 class EventsSerializationMockObservation(Observation):
     """Mock observation for testing."""
 
-    content: str
-
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
+        assert isinstance(self.content, str)
         return [TextContent(text=self.content)]
 
 
diff --git a/tests/tools/file_editor/test_memory_usage.py b/tests/tools/file_editor/test_memory_usage.py
index 2a028f296a..e5542dd53d 100644
--- a/tests/tools/file_editor/test_memory_usage.py
+++ b/tests/tools/file_editor/test_memory_usage.py
@@ -187,7 +187,12 @@ def test_file_editor_memory_leak(temp_file):
                     new_str=new_content,
                 )
                 if i == 0:
-                    print(f"First edit result: {result.output[:200]}...")
+                    content_str = (
+                        result.content
+                        if isinstance(result.content, str)
+                        else str(result.content)
+                    )
+                    print(f"First edit result: {content_str[:200]}...")
             except Exception as e:
                 print(f"\nError during edit {i}:")
                 print(f"File size: {os.path.getsize(temp_file) / (1024 * 1024):.2f} MB")

From a4cfa37fd8697246c45666a36eeb4dd23d3f9de9 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 22:20:46 +0000
Subject: [PATCH 44/76] fix: handle both str and list types in
 ExecuteBashObservation content

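ExecuteBashObservation's to_llm_content and visualize methods now accept both
str and list[TextContent | ImageContent] values for the content field,
matching the base Observation class definition. For list content, only the
text of the first item is used (an empty string if the list is empty or the
first item is not a TextContent).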

This fixes test failures where content was passed as a list.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../tools/execute_bash/definition.py          | 24 ++++++++++++++-----
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index 756e65f080..f854ecae8a 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -109,9 +109,15 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.is_error:
             llm_content.append(TextContent(text=self.error_message_header))
 
-        # ExecuteBashObservation always has content as str
-        assert isinstance(self.content, str)
-        content_text = self.content
+        # Extract text from content (handle both str and list types)
+        if isinstance(self.content, str):
+            content_text = self.content
+        else:
+            # It's a list of TextContent | ImageContent
+            first_item = self.content[0] if self.content else None
+            content_text = (
+                first_item.text if isinstance(first_item, TextContent) else ""
+            )
 
         ret = f"{self.metadata.prefix}{content_text}{self.metadata.suffix}"
         if self.metadata.working_dir:
@@ -134,9 +140,15 @@ def visualize(self) -> Text:
             content_obj.append("❌ ", style="red bold")
             content_obj.append("Command execution error\n", style="red")
 
-        # ExecuteBashObservation always has content as str
-        assert isinstance(self.content, str)
-        content_text = self.content
+        # Extract text from content (handle both str and list types)
+        if isinstance(self.content, str):
+            content_text = self.content
+        else:
+            # It's a list of TextContent | ImageContent
+            first_item = self.content[0] if self.content else None
+            content_text = (
+                first_item.text if isinstance(first_item, TextContent) else ""
+            )
 
         if content_text:
             # Style the output based on content

From d63ed09bcdb1ed116bfb376722ccce1b35db8c1e Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 22:32:15 +0000
Subject: [PATCH 45/76] fix: remove incorrect list content in
 ExecuteBashObservation test cases

ExecuteBashObservation content field must be str, not list[TextContent].
Fixed test cases that incorrectly defined content as list.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 tests/cross/test_stuck_detector.py                 | 12 ++++++------
 tests/tools/execute_bash/test_bash_ps1_metadata.py |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/cross/test_stuck_detector.py b/tests/cross/test_stuck_detector.py
index 77d142a61b..f5508baf3a 100644
--- a/tests/cross/test_stuck_detector.py
+++ b/tests/cross/test_stuck_detector.py
@@ -59,7 +59,7 @@ def test_history_too_short():
     observation = ObservationEvent(
         source="environment",
         observation=ExecuteBashObservation(
-            content=[TextContent(text="file1.txt\nfile2.txt")],
+            content="file1.txt\nfile2.txt",
             command="ls",
             exit_code=0,
         ),
@@ -110,7 +110,7 @@ def test_repeating_action_observation_not_stuck_less_than_4_repeats():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                content=[TextContent(text="file1.txt\nfile2.txt")],
+                content="file1.txt\nfile2.txt",
                 command="ls",
                 exit_code=0,
             ),
@@ -161,7 +161,7 @@ def test_repeating_action_observation_stuck():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                content=[TextContent(text="file1.txt\nfile2.txt")],
+                content="file1.txt\nfile2.txt",
                 command="ls",
                 exit_code=0,
             ),
@@ -304,7 +304,7 @@ def test_not_stuck_with_different_actions():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                content=[TextContent(text=f"output from {cmd}")],
+                content=f"output from {cmd}",
                 command=cmd,
                 exit_code=0,
             ),
@@ -355,7 +355,7 @@ def test_reset_after_user_message():
         observation = ObservationEvent(
             source="environment",
             observation=ExecuteBashObservation(
-                content=[TextContent(text="file1.txt\nfile2.txt")],
+                content="file1.txt\nfile2.txt",
                 command="ls",
                 exit_code=0,
             ),
@@ -400,7 +400,7 @@ def test_reset_after_user_message():
     observation = ObservationEvent(
         source="environment",
         observation=ExecuteBashObservation(
-            content=[TextContent(text="/home/user")], command="pwd", exit_code=0
+            content="/home/user", command="pwd", exit_code=0
         ),
         action_id=action.id,
         tool_name="bash",
diff --git a/tests/tools/execute_bash/test_bash_ps1_metadata.py b/tests/tools/execute_bash/test_bash_ps1_metadata.py
index d33831f27a..fb52c415a0 100644
--- a/tests/tools/execute_bash/test_bash_ps1_metadata.py
+++ b/tests/tools/execute_bash/test_bash_ps1_metadata.py
@@ -275,7 +275,7 @@ def test_cmd_output_observation_properties():
     metadata = CmdOutputMetadata(exit_code=0, pid=123)
     obs = ExecuteBashObservation(
         command="ls",
-        content=[TextContent(text="file1\nfile2")],
+        content="file1\nfile2",
         exit_code=0,
         metadata=metadata,
     )

From ccda8ea3a61c68845311f5b1351157e0f182442c Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 22:35:13 +0000
Subject: [PATCH 46/76] fix: correct BrowserObservation content from list to
 str in tests

BrowserObservation content field must be str, not list[TextContent].
Fixed test cases that incorrectly defined content as list.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../browser_use/test_browser_observation.py   | 30 +++++++------------
 1 file changed, 10 insertions(+), 20 deletions(-)

diff --git a/tests/tools/browser_use/test_browser_observation.py b/tests/tools/browser_use/test_browser_observation.py
index c0e434ca9e..cf5971d881 100644
--- a/tests/tools/browser_use/test_browser_observation.py
+++ b/tests/tools/browser_use/test_browser_observation.py
@@ -6,11 +6,9 @@
 
 def test_browser_observation_basic_output():
     """Test basic BrowserObservation creation with output."""
-    observation = BrowserObservation(content=[TextContent(text="Test output")])
+    observation = BrowserObservation(content="Test output")
 
-    assert len(observation.content) == 1
-    assert isinstance(observation.content[0], TextContent)
-    assert observation.content[0].text == "Test output"
+    assert observation.content == "Test output"
     assert observation.is_error is False
     assert observation.screenshot_data is None
 
@@ -28,19 +26,17 @@ def test_browser_observation_with_screenshot():
     """Test BrowserObservation with screenshot data."""
     screenshot_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77zgAAAABJRU5ErkJggg=="  # noqa: E501
     observation = BrowserObservation(
-        content=[TextContent(text="Screenshot taken")], screenshot_data=screenshot_data
+        content="Screenshot taken", screenshot_data=screenshot_data
     )
 
-    assert len(observation.content) == 1
-    assert isinstance(observation.content[0], TextContent)
-    assert observation.content[0].text == "Screenshot taken"
+    assert observation.content == "Screenshot taken"
     assert observation.is_error is False
     assert observation.screenshot_data == screenshot_data
 
 
 def test_browser_observation_to_llm_content_text_only():
     """Test to_llm_content property with text only."""
-    observation = BrowserObservation(content=[TextContent(text="Test output")])
+    observation = BrowserObservation(content="Test output")
     agent_obs = observation.to_llm_content
 
     assert len(agent_obs) == 1
@@ -52,7 +48,7 @@ def test_browser_observation_to_llm_content_with_screenshot():
     """Test to_llm_content property with screenshot."""
     screenshot_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77zgAAAABJRU5ErkJggg=="  # noqa: E501
     observation = BrowserObservation(
-        content=[TextContent(text="Screenshot taken")], screenshot_data=screenshot_data
+        content="Screenshot taken", screenshot_data=screenshot_data
     )
     agent_obs = observation.to_llm_content
 
@@ -81,7 +77,7 @@ def test_browser_observation_output_truncation():
     """Test output truncation for very long outputs."""
     # Create a very long output string
     long_output = "x" * 100000  # 100k characters
-    observation = BrowserObservation(content=[TextContent(text=long_output)])
+    observation = BrowserObservation(content=long_output)
 
     agent_obs = observation.to_llm_content
 
@@ -95,9 +91,7 @@ def test_browser_observation_output_truncation():
 def test_browser_observation_screenshot_data_url_conversion():
     """Test that screenshot data is properly converted to data URL."""
     screenshot_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77zgAAAABJRU5ErkJggg=="  # noqa: E501
-    observation = BrowserObservation(
-        content=[TextContent(text="Test")], screenshot_data=screenshot_data
-    )
+    observation = BrowserObservation(content="Test", screenshot_data=screenshot_data)
 
     agent_obs = observation.to_llm_content
     expected_data_url = f"data:image/png;base64,{screenshot_data}"
@@ -109,14 +103,10 @@ def test_browser_observation_screenshot_data_url_conversion():
 
 def test_browser_observation_empty_screenshot_handling():
     """Test handling of empty or None screenshot data."""
-    observation = BrowserObservation(
-        content=[TextContent(text="Test")], screenshot_data=""
-    )
+    observation = BrowserObservation(content="Test", screenshot_data="")
     agent_obs = observation.to_llm_content
     assert len(agent_obs) == 1  # Only text content, no image
 
-    observation = BrowserObservation(
-        content=[TextContent(text="Test")], screenshot_data=None
-    )
+    observation = BrowserObservation(content="Test", screenshot_data=None)
     agent_obs = observation.to_llm_content
     assert len(agent_obs) == 1  # Only text content, no image

From c0750b0f3f509cc991259f8d1ea711d935eda3f0 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 22:52:14 +0000
Subject: [PATCH 47/76] fix: update MCPToolObservation error handling to use
 list content and fix confirmation mode tests

- Fixed MCPToolObservation to always use list[TextContent] for error messages
- Updated test_confirmation_mode.py to correctly assert ThinkObservation and FinishObservation content
- ThinkObservation should contain 'Your thought has been logged.'
- FinishObservation should contain the finish message in content field

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/mcp/tool.py              |  5 +++--
 .../sdk/conversation/local/test_confirmation_mode.py | 12 ++++++------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/mcp/tool.py b/openhands-sdk/openhands/sdk/mcp/tool.py
index d0bb3254bb..0e28b9eda5 100644
--- a/openhands-sdk/openhands/sdk/mcp/tool.py
+++ b/openhands-sdk/openhands/sdk/mcp/tool.py
@@ -12,6 +12,7 @@
 from litellm import ChatCompletionToolParam
 from pydantic import Field, ValidationError
 
+from openhands.sdk.llm import TextContent
 from openhands.sdk.logger import get_logger
 from openhands.sdk.mcp.client import MCPClient
 from openhands.sdk.mcp.definition import MCPToolAction, MCPToolObservation
@@ -69,7 +70,7 @@ async def call_tool(self, action: MCPToolAction) -> MCPToolObservation:
                 error_msg = f"Error calling MCP tool {self.tool_name}: {str(e)}"
                 logger.error(error_msg, exc_info=True)
                 return MCPToolObservation(
-                    content=error_msg,
+                    content=[TextContent(text=error_msg)],
                     is_error=True,
                     tool_name=self.tool_name,
                 )
@@ -154,7 +155,7 @@ def __call__(
             error_msg = f"Validation error for MCP tool '{self.name}' args: {e}"
             logger.error(error_msg, exc_info=True)
             return MCPToolObservation(
-                content=error_msg,
+                content=[TextContent(text=error_msg)],
                 is_error=True,
                 tool_name=self.name,
             )
diff --git a/tests/sdk/conversation/local/test_confirmation_mode.py b/tests/sdk/conversation/local/test_confirmation_mode.py
index d38556a401..2e87755604 100644
--- a/tests/sdk/conversation/local/test_confirmation_mode.py
+++ b/tests/sdk/conversation/local/test_confirmation_mode.py
@@ -552,8 +552,8 @@ def test_single_finish_action_skips_confirmation_entirely(self):
             e for e in self.conversation.state.events if isinstance(e, ObservationEvent)
         ]
         assert len(obs_events) == 1
-        # FinishObservation should have empty content per base behavior
-        assert len(obs_events[0].observation.content) == 0
+        # FinishObservation should contain the finish message in content
+        assert obs_events[0].observation.content == "Task completed successfully!"
 
     def test_think_and_finish_action_skips_confirmation_entirely(self):
         """First step: ThinkAction (skips confirmation). Second step: FinishAction."""
@@ -595,13 +595,13 @@ def test_think_and_finish_action_skips_confirmation_entirely(self):
         ]
         assert len(obs_events) == 2
 
-        # 1) ThinkAction observation - should have empty content per base behavior
+        # 1) ThinkAction observation - should contain the standard message
         assert hasattr(obs_events[0].observation, "content")
-        assert len(obs_events[0].observation.content) == 0
+        assert obs_events[0].observation.content == "Your thought has been logged."
 
-        # 2) FinishAction observation - should have empty content per base behavior
+        # 2) FinishAction observation - should contain the finish message
         assert hasattr(obs_events[1].observation, "content")
-        assert len(obs_events[1].observation.content) == 0
+        assert obs_events[1].observation.content == "Analysis complete"
 
     def test_pause_during_confirmation_preserves_waiting_status(self):
         """Test that pausing during WAITING_FOR_CONFIRMATION preserves the status.

From 0cf66ad2c171a9c6abe7c1a458d1c0f8775a3ae9 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 4 Nov 2025 23:28:16 +0000
Subject: [PATCH 48/76] refactor: change Observation.content to
 list[TextContent | ImageContent] with utility methods

- Change base Observation.content from str | list[...] to just list[TextContent | ImageContent]
- Add utility methods: from_text(), get_text(), get_text_safe() for easy read/write
- Update all 10 observation subclasses to use from_text() for instantiation
- Update BrowserObservation.to_llm_content to use get_text_safe()
- Update all tests to use the new pattern (browser, delegation, MCP, execute_bash, context, stuck_detector)
- All pre-commit checks passing

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/mcp/tool.py       |  9 +--
 .../openhands/sdk/tool/builtins/finish.py     |  2 +-
 .../openhands/sdk/tool/builtins/think.py      |  2 +-
 openhands-sdk/openhands/sdk/tool/schema.py    | 74 +++++++++++++++----
 .../openhands/tools/browser_use/definition.py | 13 ++--
 .../openhands/tools/browser_use/impl.py       | 12 +--
 .../openhands/tools/delegate/impl.py          | 44 +++++------
 .../tools/execute_bash/definition.py          | 10 +--
 .../openhands/tools/execute_bash/impl.py      | 16 ++--
 .../execute_bash/terminal/terminal_session.py | 23 +++---
 .../openhands/tools/file_editor/editor.py     | 34 ++++-----
 .../openhands/tools/file_editor/impl.py       | 20 ++---
 openhands-tools/openhands/tools/glob/impl.py  | 12 +--
 openhands-tools/openhands/tools/grep/impl.py  | 20 ++---
 tests/cross/test_stuck_detector.py            | 24 +++---
 .../sdk/context/test_view_batch_atomicity.py  |  4 +-
 tests/sdk/mcp/test_mcp_tool.py                | 16 ++--
 tests/tools/browser_use/conftest.py           | 10 +--
 .../browser_use/test_browser_executor.py      |  9 +--
 .../browser_use/test_browser_observation.py   | 32 ++++----
 tests/tools/delegation/test_delegation.py     | 39 +++++-----
 .../execute_bash/test_bash_ps1_metadata.py    |  8 +-
 22 files changed, 239 insertions(+), 194 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/mcp/tool.py b/openhands-sdk/openhands/sdk/mcp/tool.py
index 0e28b9eda5..04b3f10b3b 100644
--- a/openhands-sdk/openhands/sdk/mcp/tool.py
+++ b/openhands-sdk/openhands/sdk/mcp/tool.py
@@ -12,7 +12,6 @@
 from litellm import ChatCompletionToolParam
 from pydantic import Field, ValidationError
 
-from openhands.sdk.llm import TextContent
 from openhands.sdk.logger import get_logger
 from openhands.sdk.mcp.client import MCPClient
 from openhands.sdk.mcp.definition import MCPToolAction, MCPToolObservation
@@ -69,8 +68,8 @@ async def call_tool(self, action: MCPToolAction) -> MCPToolObservation:
             except Exception as e:
                 error_msg = f"Error calling MCP tool {self.tool_name}: {str(e)}"
                 logger.error(error_msg, exc_info=True)
-                return MCPToolObservation(
-                    content=[TextContent(text=error_msg)],
+                return MCPToolObservation.from_text(
+                    text=error_msg,
                     is_error=True,
                     tool_name=self.tool_name,
                 )
@@ -154,8 +153,8 @@ def __call__(
             # Surface validation errors as an observation instead of crashing
             error_msg = f"Validation error for MCP tool '{self.name}' args: {e}"
             logger.error(error_msg, exc_info=True)
-            return MCPToolObservation(
-                content=[TextContent(text=error_msg)],
+            return MCPToolObservation.from_text(
+                text=error_msg,
                 is_error=True,
                 tool_name=self.name,
             )
diff --git a/openhands-sdk/openhands/sdk/tool/builtins/finish.py b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
index aa9e838b7d..0bb12f849b 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/finish.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/finish.py
@@ -63,7 +63,7 @@ def __call__(
         action: FinishAction,
         conversation: "BaseConversation | None" = None,  # noqa: ARG002
     ) -> FinishObservation:
-        return FinishObservation(content=action.message)
+        return FinishObservation.from_text(text=action.message)
 
 
 class FinishTool(ToolDefinition[FinishAction, FinishObservation]):
diff --git a/openhands-sdk/openhands/sdk/tool/builtins/think.py b/openhands-sdk/openhands/sdk/tool/builtins/think.py
index 1fcd24fcf9..ca641ce94d 100644
--- a/openhands-sdk/openhands/sdk/tool/builtins/think.py
+++ b/openhands-sdk/openhands/sdk/tool/builtins/think.py
@@ -75,7 +75,7 @@ def __call__(
         _: ThinkAction,
         conversation: "BaseConversation | None" = None,  # noqa: ARG002
     ) -> ThinkObservation:
-        return ThinkObservation(content="Your thought has been logged.")
+        return ThinkObservation.from_text(text="Your thought has been logged.")
 
 
 class ThinkTool(ToolDefinition[ThinkAction, ThinkObservation]):
diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index 9c596fae04..044e90f218 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -1,6 +1,6 @@
 from abc import ABC
 from collections.abc import Sequence
-from typing import Any, ClassVar, TypeVar
+from typing import TYPE_CHECKING, Any, ClassVar, TypeVar
 
 from pydantic import ConfigDict, Field, create_model
 from rich.text import Text
@@ -13,6 +13,9 @@
 from openhands.sdk.utils.visualize import display_dict
 
 
+if TYPE_CHECKING:
+    from typing import Self
+
 S = TypeVar("S", bound="Schema")
 
 
@@ -190,11 +193,11 @@ def visualize(self) -> Text:
 class Observation(Schema, ABC):
     """Base schema for output observation."""
 
-    content: str | list[TextContent | ImageContent] = Field(
-        default="",
+    content: list[TextContent | ImageContent] = Field(
+        default_factory=list,
         description=(
-            "Content returned from the tool. Can be a simple string for most tools, "
-            "or a list of TextContent/ImageContent for tools that need rich content. "
+            "Content returned from the tool as a list of "
+            "TextContent/ImageContent objects. "
             "When there is an error, it should be written in this field."
         ),
     )
@@ -206,6 +209,57 @@ class Observation(Schema, ABC):
         description="Header prepended to content when is_error is True",
     )
 
+    @classmethod
+    def from_text(
+        cls,
+        text: str,
+        is_error: bool = False,
+        **kwargs: Any,
+    ) -> "Self":
+        """Utility to create an Observation from a simple text string.
+
+        Args:
+            text: The text content to include in the observation.
+            is_error: Whether this observation represents an error.
+            **kwargs: Additional fields for the observation subclass.
+
+        Returns:
+            An Observation instance with the text wrapped in a TextContent.
+        """
+        return cls(content=[TextContent(text=text)], is_error=is_error, **kwargs)
+
+    def get_text(self) -> str:
+        """Extract text when observation contains a single TextContent.
+
+        Returns:
+            The single item's text, or an empty string if content is empty.
+
+        Raises:
+            ValueError: If content has multiple items or non-text content.
+        """
+        if not self.content:
+            return ""
+        if len(self.content) > 1:
+            raise ValueError(
+                "get_text() can only be used when content contains a single item"
+            )
+        item = self.content[0]
+        if not isinstance(item, TextContent):
+            raise ValueError(
+                "get_text() can only be used when content contains TextContent"
+            )
+        return item.text
+
+    def get_text_safe(self) -> str:
+        """Safely extract all text content from the observation.
+
+        Returns:
+            Concatenated text from all TextContent items in content.
+        """
+        return "".join(
+            item.text for item in self.content if isinstance(item, TextContent)
+        )
+
     @property
     def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         """
@@ -218,14 +272,8 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.is_error:
             llm_content.append(TextContent(text=self.error_message_header))
 
-        # Add content
-        if self.content:
-            # Handle both str and list types
-            if isinstance(self.content, str):
-                llm_content.append(TextContent(text=self.content))
-            else:
-                # It's a list of TextContent | ImageContent
-                llm_content.extend(self.content)
+        # Add content (now always a list)
+        llm_content.extend(self.content)
 
         return llm_content
 
diff --git a/openhands-tools/openhands/tools/browser_use/definition.py b/openhands-tools/openhands/tools/browser_use/definition.py
index 3ad172e149..f0c2e78c9b 100644
--- a/openhands-tools/openhands/tools/browser_use/definition.py
+++ b/openhands-tools/openhands/tools/browser_use/definition.py
@@ -40,13 +40,12 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.is_error:
             llm_content.append(TextContent(text=self.error_message_header))
 
-        # BrowserObservation always has content as str
-        assert isinstance(self.content, str)
-        content_text = self.content
-
-        llm_content.append(
-            TextContent(text=maybe_truncate(content_text, MAX_BROWSER_OUTPUT_SIZE))
-        )
+        # Get text content and truncate if needed
+        content_text = self.get_text_safe()
+        if content_text:
+            llm_content.append(
+                TextContent(text=maybe_truncate(content_text, MAX_BROWSER_OUTPUT_SIZE))
+            )
 
         if self.screenshot_data:
             mime_type = "image/png"
diff --git a/openhands-tools/openhands/tools/browser_use/impl.py b/openhands-tools/openhands/tools/browser_use/impl.py
index 3b872a3edb..19bdce7dc8 100644
--- a/openhands-tools/openhands/tools/browser_use/impl.py
+++ b/openhands-tools/openhands/tools/browser_use/impl.py
@@ -225,13 +225,13 @@ async def _execute_action(self, action):
                 result = await self.close_tab(action.tab_id)
             else:
                 error_msg = f"Unsupported action type: {type(action)}"
-                return BrowserObservation(content=error_msg, is_error=True)
+                return BrowserObservation.from_text(text=error_msg, is_error=True)
 
-            return BrowserObservation(content=result)
+            return BrowserObservation.from_text(text=result)
         except Exception as e:
             error_msg = f"Browser operation failed: {str(e)}"
             logger.error(error_msg, exc_info=True)
-            return BrowserObservation(content=error_msg, is_error=True)
+            return BrowserObservation.from_text(text=error_msg, is_error=True)
 
     async def _ensure_initialized(self):
         """Ensure browser session is initialized."""
@@ -281,15 +281,15 @@ async def get_state(self, include_screenshot: bool = False):
 
                 # Return clean JSON + separate screenshot data
                 clean_json = json.dumps(result_data, indent=2)
-                return BrowserObservation(
-                    content=clean_json,
+                return BrowserObservation.from_text(
+                    text=clean_json,
                     screenshot_data=screenshot_data,
                 )
             except json.JSONDecodeError:
                 # If JSON parsing fails, return as-is
                 pass
 
-        return BrowserObservation(content=result_json)
+        return BrowserObservation.from_text(text=result_json)
 
     # Tab Management
     async def list_tabs(self) -> str:
diff --git a/openhands-tools/openhands/tools/delegate/impl.py b/openhands-tools/openhands/tools/delegate/impl.py
index ec83ed4d5a..da68f9400b 100644
--- a/openhands-tools/openhands/tools/delegate/impl.py
+++ b/openhands-tools/openhands/tools/delegate/impl.py
@@ -58,10 +58,12 @@ def __call__(  # type: ignore[override]
         elif action.command == "delegate":
             return self._delegate_tasks(action)
         else:
-            return DelegateObservation(
+            return DelegateObservation.from_text(
+                text=(
+                    f"Unsupported command: {action.command}. "
+                    "Available commands: spawn, delegate"
+                ),
                 command=action.command,
-                content=f"Unsupported command: {action.command}. "
-                "Available commands: spawn, delegate",
                 is_error=True,
             )
 
@@ -76,20 +78,20 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
             DelegateObservation indicating success/failure and which agents were spawned
         """
         if not action.ids:
-            return DelegateObservation(
+            return DelegateObservation.from_text(
+                text="At least one ID is required for spawn action",
                 command=action.command,
-                content="At least one ID is required for spawn action",
                 is_error=True,
             )
 
         if len(self._sub_agents) + len(action.ids) > self._max_children:
-            return DelegateObservation(
-                command=action.command,
-                content=(
+            return DelegateObservation.from_text(
+                text=(
                     f"Cannot spawn {len(action.ids)} agents. "
                     f"Already have {len(self._sub_agents)} agents, "
                     f"maximum is {self._max_children}"
                 ),
+                command=action.command,
                 is_error=True,
             )
 
@@ -119,16 +121,16 @@ def _spawn_agents(self, action: "DelegateAction") -> DelegateObservation:
 
             agent_list = ", ".join(action.ids)
             message = f"Successfully spawned {len(action.ids)} sub-agents: {agent_list}"
-            return DelegateObservation(
+            return DelegateObservation.from_text(
+                text=message,
                 command=action.command,
-                content=message,
             )
 
         except Exception as e:
             logger.error(f"Error: failed to spawn agents: {e}", exc_info=True)
-            return DelegateObservation(
+            return DelegateObservation.from_text(
+                text=f"failed to spawn agents: {str(e)}",
                 command=action.command,
-                content=f"failed to spawn agents: {str(e)}",
                 is_error=True,
             )
 
@@ -144,21 +146,21 @@ def _delegate_tasks(self, action: "DelegateAction") -> "DelegateObservation":
             DelegateObservation with consolidated results from all sub-agents
         """
         if not action.tasks:
-            return DelegateObservation(
+            return DelegateObservation.from_text(
+                text="at least one task is required for delegate action",
                 command=action.command,
-                content="at least one task is required for delegate action",
                 is_error=True,
             )
 
         # Check that all requested agent IDs exist
         missing_agents = set(action.tasks.keys()) - set(self._sub_agents.keys())
         if missing_agents:
-            return DelegateObservation(
-                command=action.command,
-                content=(
+            return DelegateObservation.from_text(
+                text=(
                     f"sub-agents not found: {', '.join(missing_agents)}. "
                     f"Available agents: {', '.join(self._sub_agents.keys())}"
                 ),
+                command=action.command,
                 is_error=True,
             )
 
@@ -228,15 +230,15 @@ def run_task(agent_id: str, conversation: LocalConversation, task: str):
                 )
                 output_text += f"\n\nResults:\n{results_text}"
 
-            return DelegateObservation(
+            return DelegateObservation.from_text(
+                text=output_text,
                 command=action.command,
-                content=output_text,
             )
 
         except Exception as e:
             logger.error(f"Failed to delegate tasks: {e}", exc_info=True)
-            return DelegateObservation(
+            return DelegateObservation.from_text(
+                text=f"failed to delegate tasks: {str(e)}",
                 command=action.command,
-                content=f"failed to delegate tasks: {str(e)}",
                 is_error=True,
             )
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index 756e65f080..8e9459540c 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -109,9 +109,8 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.is_error:
             llm_content.append(TextContent(text=self.error_message_header))
 
-        # ExecuteBashObservation always has content as str
-        assert isinstance(self.content, str)
-        content_text = self.content
+        # Content is expected to be one TextContent; join all text items anyway
+        content_text = self.get_text_safe()
 
         ret = f"{self.metadata.prefix}{content_text}{self.metadata.suffix}"
         if self.metadata.working_dir:
@@ -134,9 +133,8 @@ def visualize(self) -> Text:
             content_obj.append("❌ ", style="red bold")
             content_obj.append("Command execution error\n", style="red")
 
-        # ExecuteBashObservation always has content as str
-        assert isinstance(self.content, str)
-        content_text = self.content
+        # Content is expected to be one TextContent; join all text items anyway
+        content_text = self.get_text_safe()
 
         if content_text:
             # Style the output based on content
diff --git a/openhands-tools/openhands/tools/execute_bash/impl.py b/openhands-tools/openhands/tools/execute_bash/impl.py
index 5741b294a3..5774badfec 100644
--- a/openhands-tools/openhands/tools/execute_bash/impl.py
+++ b/openhands-tools/openhands/tools/execute_bash/impl.py
@@ -112,8 +112,8 @@ def reset(self) -> ExecuteBashObservation:
             f"Terminal session reset successfully with working_dir: {original_work_dir}"
         )
 
-        return ExecuteBashObservation(
-            content=(
+        return ExecuteBashObservation.from_text(
+            text=(
                 "Terminal session has been reset. All previous environment "
                 "variables and session state have been cleared."
             ),
@@ -179,13 +179,7 @@ def __call__(
             observation = self.session.execute(action)
 
         # Apply automatic secrets masking
-        if isinstance(observation.content, str):
-            content_text = observation.content
-        else:
-            first_item = observation.content[0] if observation.content else None
-            content_text = (
-                first_item.text if isinstance(first_item, TextContent) else ""
-            )
+        content_text = observation.get_text_safe()
 
         if content_text and conversation is not None:
             try:
@@ -193,7 +187,9 @@ def __call__(
                 masked_content = secret_registry.mask_secrets_in_output(content_text)
                 if masked_content:
                     data = observation.model_dump(exclude={"content"})
-                    return ExecuteBashObservation(**data, content=masked_content)
+                    return ExecuteBashObservation(
+                        **data, content=[TextContent(text=masked_content)]
+                    )
             except Exception:
                 pass
 
diff --git a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
index 667a1251a4..06dae55159 100644
--- a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
+++ b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
@@ -4,6 +4,7 @@
 import time
 from enum import Enum
 
+from openhands.sdk.llm import TextContent
 from openhands.sdk.logger import get_logger
 from openhands.tools.execute_bash.constants import (
     CMD_OUTPUT_PS1_END,
@@ -188,7 +189,7 @@ def _handle_completed_command(
         self._ready_for_next_command()
         return ExecuteBashObservation(
             command=command,
-            content=command_output,
+            content=[TextContent(text=command_output)],
             metadata=metadata,
         )
 
@@ -222,7 +223,7 @@ def _handle_nochange_timeout_command(
         )
         return ExecuteBashObservation(
             command=command,
-            content=command_output,
+            content=[TextContent(text=command_output)],
             metadata=metadata,
         )
 
@@ -257,7 +258,7 @@ def _handle_hard_timeout_command(
         )
         return ExecuteBashObservation(
             command=command,
-            content=command_output,
+            content=[TextContent(text=command_output)],
             metadata=metadata,
         )
 
@@ -312,15 +313,15 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
             TerminalCommandStatus.HARD_TIMEOUT,
         }:
             if command == "":
-                return ExecuteBashObservation(
+                return ExecuteBashObservation.from_text(
+                    text="No previous running command to retrieve logs from.",
                     command=command,
-                    content="No previous running command to retrieve logs from.",
                     is_error=True,
                 )
             if is_input:
-                return ExecuteBashObservation(
+                return ExecuteBashObservation.from_text(
+                    text="No previous running command to interact with.",
                     command=command,
-                    content="No previous running command to interact with.",
                     is_error=True,
                 )
 
@@ -330,13 +331,13 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
             commands_list = "\n".join(
                 f"({i + 1}) {cmd}" for i, cmd in enumerate(splited_commands)
             )
-            return ExecuteBashObservation(
-                command=command,
-                content=(
+            return ExecuteBashObservation.from_text(
+                text=(
                     "Cannot execute multiple commands at once.\n"
                     "Please run each command separately OR chain them into a single "
                     f"command via && or ;\nProvided commands:\n{commands_list}"
                 ),
+                command=command,
                 is_error=True,
             )
 
@@ -390,7 +391,7 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
             )
             obs = ExecuteBashObservation(
                 command=command,
-                content=command_output,
+                content=[TextContent(text=command_output)],
                 metadata=metadata,
             )
             logger.debug(f"RETURNING OBSERVATION (previous-command): {obs}")
diff --git a/openhands-tools/openhands/tools/file_editor/editor.py b/openhands-tools/openhands/tools/file_editor/editor.py
index 271074aba7..99adb7a409 100644
--- a/openhands-tools/openhands/tools/file_editor/editor.py
+++ b/openhands-tools/openhands/tools/file_editor/editor.py
@@ -108,12 +108,12 @@ def __call__(
                 raise EditorToolParameterMissingError(command, "file_text")
             self.write_file(_path, file_text)
             self._history_manager.add_history(_path, file_text)
-            return FileEditorObservation(
+            return FileEditorObservation.from_text(
+                text=f"File created successfully at: {_path}",
                 command=command,
                 path=str(_path),
                 new_content=file_text,
                 prev_exist=False,
-                content=f"File created successfully at: {_path}",
             )
         elif command == "str_replace":
             if old_str is None:
@@ -253,9 +253,9 @@ def str_replace(
             "Review the changes and make sure they are as expected. Edit the "
             "file again if necessary."
         )
-        return FileEditorObservation(
+        return FileEditorObservation.from_text(
+            text=success_message,
             command="str_replace",
-            content=success_message,
             prev_exist=True,
             path=str(path),
             old_content=file_content,
@@ -294,9 +294,9 @@ def view(
                 truncate_notice=DIRECTORY_CONTENT_TRUNCATED_NOTICE,
             )
             if stderr:
-                return FileEditorObservation(
+                return FileEditorObservation.from_text(
+                    text=stderr,
                     command="view",
-                    content=stderr,
                     is_error=True,
                     path=str(path),
                     prev_exist=True,
@@ -320,9 +320,9 @@ def view(
                     f"are excluded. You can use 'ls -la {path}' to see them."
                 )
             stdout = "\n".join(msg)
-            return FileEditorObservation(
+            return FileEditorObservation.from_text(
+                text=stdout,
                 command="view",
-                content=stdout,
                 path=str(path),
                 prev_exist=True,
             )
@@ -336,9 +336,9 @@ def view(
             file_content = self.read_file(path)
             output = self._make_output(file_content, str(path), start_line)
 
-            return FileEditorObservation(
+            return FileEditorObservation.from_text(
+                text=output,
                 command="view",
-                content=output,
                 path=str(path),
                 prev_exist=True,
             )
@@ -389,10 +389,10 @@ def view(
         if warning_message:
             output = f"NOTE: {warning_message}\n{output}"
 
-        return FileEditorObservation(
+        return FileEditorObservation.from_text(
+            text=output,
             command="view",
             path=str(path),
-            content=output,
             prev_exist=True,
         )
 
@@ -502,9 +502,9 @@ def insert(
             "Review the changes and make sure they are as expected (correct "
             "indentation, no duplicate lines, etc). Edit the file again if necessary."
         )
-        return FileEditorObservation(
+        return FileEditorObservation.from_text(
+            text=success_message,
             command="insert",
-            content=success_message,
             prev_exist=True,
             path=str(path),
             old_content=file_text,
@@ -571,12 +571,12 @@ def undo_edit(self, path: Path) -> FileEditorObservation:
 
         self.write_file(path, old_text)
 
-        return FileEditorObservation(
-            command="undo_edit",
-            content=(
+        return FileEditorObservation.from_text(
+            text=(
                 f"Last edit to {path} undone successfully. "
                 f"{self._make_output(old_text, str(path))}"
             ),
+            command="undo_edit",
             path=str(path),
             prev_exist=True,
             old_content=current_text,
diff --git a/openhands-tools/openhands/tools/file_editor/impl.py b/openhands-tools/openhands/tools/file_editor/impl.py
index f6b8732a0b..a653a6a24a 100644
--- a/openhands-tools/openhands/tools/file_editor/impl.py
+++ b/openhands-tools/openhands/tools/file_editor/impl.py
@@ -43,12 +43,14 @@ def __call__(
         if self.allowed_edits_files is not None and action.command != "view":
             action_path = Path(action.path).resolve()
             if action_path not in self.allowed_edits_files:
-                return FileEditorObservation(
+                return FileEditorObservation.from_text(
+                    text=(
+                        f"Operation '{action.command}' is not allowed "
+                        f"on file '{action_path}'. "
+                        f"Only the following files can be edited: "
+                        f"{sorted(str(p) for p in self.allowed_edits_files)}"
+                    ),
                     command=action.command,
-                    content=f"Operation '{action.command}' is not allowed "
-                    f"on file '{action_path}'. "
-                    f"Only the following files can be edited: "
-                    f"{sorted(str(p) for p in self.allowed_edits_files)}",
                     is_error=True,
                 )
 
@@ -64,8 +66,8 @@ def __call__(
                 insert_line=action.insert_line,
             )
         except ToolError as e:
-            result = FileEditorObservation(
-                command=action.command, content=e.message, is_error=True
+            result = FileEditorObservation.from_text(
+                text=e.message, command=action.command, is_error=True
             )
         assert result is not None, "file_editor should always return a result"
         return result
@@ -98,8 +100,8 @@ def file_editor(
             insert_line=insert_line,
         )
     except ToolError as e:
-        result = FileEditorObservation(
-            command=command, content=e.message, is_error=True
+        result = FileEditorObservation.from_text(
+            text=e.message, command=command, is_error=True
         )
     assert result is not None, "file_editor should always return a result"
     return result
diff --git a/openhands-tools/openhands/tools/glob/impl.py b/openhands-tools/openhands/tools/glob/impl.py
index d0e00bda53..45261cb970 100644
--- a/openhands-tools/openhands/tools/glob/impl.py
+++ b/openhands-tools/openhands/tools/glob/impl.py
@@ -67,11 +67,11 @@ def __call__(
                 )
 
             if not search_path.is_dir():
-                return GlobObservation(
+                return GlobObservation.from_text(
+                    text=f"Search path '{search_path}' is not a valid directory",
                     files=[],
                     pattern=original_pattern,
                     search_path=str(search_path),
-                    content=f"Search path '{search_path}' is not a valid directory",
                     is_error=True,
                 )
 
@@ -98,12 +98,12 @@ def __call__(
                         "Consider using a more specific pattern.]"
                     )
 
-            return GlobObservation(
+            return GlobObservation.from_text(
+                text=content,
                 files=files,
                 pattern=original_pattern,
                 search_path=str(search_path),
                 truncated=truncated,
-                content=content,
             )
 
         except Exception as e:
@@ -116,11 +116,11 @@ def __call__(
             except Exception:
                 error_search_path = "unknown"
 
-            return GlobObservation(
+            return GlobObservation.from_text(
+                text=str(e),
                 files=[],
                 pattern=action.pattern,
                 search_path=error_search_path,
-                content=str(e),
                 is_error=True,
             )
 
diff --git a/openhands-tools/openhands/tools/grep/impl.py b/openhands-tools/openhands/tools/grep/impl.py
index e936725853..e08baf1ee4 100644
--- a/openhands-tools/openhands/tools/grep/impl.py
+++ b/openhands-tools/openhands/tools/grep/impl.py
@@ -55,12 +55,12 @@ def __call__(
             if action.path:
                 search_path = Path(action.path).resolve()
                 if not search_path.is_dir():
-                    return GrepObservation(
+                    return GrepObservation.from_text(
+                        text=f"Search path '{action.path}' is not a valid directory",
                         matches=[],
                         pattern=action.pattern,
                         search_path=str(search_path),
                         include_pattern=action.include,
-                        content=f"Search path '{action.path}' is not a valid directory",
                         is_error=True,
                     )
             else:
@@ -70,12 +70,12 @@ def __call__(
             try:
                 re.compile(action.pattern)
             except re.error as e:
-                return GrepObservation(
+                return GrepObservation.from_text(
+                    text=f"Invalid regex pattern: {e}",
                     matches=[],
                     pattern=action.pattern,
                     search_path=str(search_path),
                     include_pattern=action.include,
-                    content=f"Invalid regex pattern: {e}",
                     is_error=True,
                 )
 
@@ -94,12 +94,12 @@ def __call__(
             except Exception:
                 error_search_path = "unknown"
 
-            return GrepObservation(
+            return GrepObservation.from_text(
+                text=str(e),
                 matches=[],
                 pattern=action.pattern,
                 search_path=error_search_path,
                 include_pattern=action.include,
-                content=str(e),
                 is_error=True,
             )
 
@@ -177,13 +177,13 @@ def _execute_with_ripgrep(
             truncated=truncated,
         )
 
-        return GrepObservation(
+        return GrepObservation.from_text(
+            text=output,
             matches=matches,
             pattern=action.pattern,
             search_path=str(search_path),
             include_pattern=action.include,
             truncated=truncated,
-            content=output,
         )
 
     def _execute_with_grep(
@@ -240,11 +240,11 @@ def _execute_with_grep(
             truncated=truncated,
         )
 
-        return GrepObservation(
+        return GrepObservation.from_text(
+            text=output,
             matches=matches,
             pattern=action.pattern,
             search_path=str(search_path),
             include_pattern=action.include,
             truncated=truncated,
-            content=output,
         )
diff --git a/tests/cross/test_stuck_detector.py b/tests/cross/test_stuck_detector.py
index f5508baf3a..f7e75afa79 100644
--- a/tests/cross/test_stuck_detector.py
+++ b/tests/cross/test_stuck_detector.py
@@ -58,8 +58,8 @@ def test_history_too_short():
 
     observation = ObservationEvent(
         source="environment",
-        observation=ExecuteBashObservation(
-            content="file1.txt\nfile2.txt",
+        observation=ExecuteBashObservation.from_text(
+            text="file1.txt\nfile2.txt",
             command="ls",
             exit_code=0,
         ),
@@ -109,8 +109,8 @@ def test_repeating_action_observation_not_stuck_less_than_4_repeats():
 
         observation = ObservationEvent(
             source="environment",
-            observation=ExecuteBashObservation(
-                content="file1.txt\nfile2.txt",
+            observation=ExecuteBashObservation.from_text(
+                text="file1.txt\nfile2.txt",
                 command="ls",
                 exit_code=0,
             ),
@@ -160,8 +160,8 @@ def test_repeating_action_observation_stuck():
 
         observation = ObservationEvent(
             source="environment",
-            observation=ExecuteBashObservation(
-                content="file1.txt\nfile2.txt",
+            observation=ExecuteBashObservation.from_text(
+                text="file1.txt\nfile2.txt",
                 command="ls",
                 exit_code=0,
             ),
@@ -303,8 +303,8 @@ def test_not_stuck_with_different_actions():
 
         observation = ObservationEvent(
             source="environment",
-            observation=ExecuteBashObservation(
-                content=f"output from {cmd}",
+            observation=ExecuteBashObservation.from_text(
+                text=f"output from {cmd}",
                 command=cmd,
                 exit_code=0,
             ),
@@ -354,8 +354,8 @@ def test_reset_after_user_message():
 
         observation = ObservationEvent(
             source="environment",
-            observation=ExecuteBashObservation(
-                content="file1.txt\nfile2.txt",
+            observation=ExecuteBashObservation.from_text(
+                text="file1.txt\nfile2.txt",
                 command="ls",
                 exit_code=0,
             ),
@@ -399,8 +399,8 @@ def test_reset_after_user_message():
 
     observation = ObservationEvent(
         source="environment",
-        observation=ExecuteBashObservation(
-            content="/home/user", command="pwd", exit_code=0
+        observation=ExecuteBashObservation.from_text(
+            text="/home/user", command="pwd", exit_code=0
         ),
         action_id=action.id,
         tool_name="bash",
diff --git a/tests/sdk/context/test_view_batch_atomicity.py b/tests/sdk/context/test_view_batch_atomicity.py
index ac37464cef..0060b351ce 100644
--- a/tests/sdk/context/test_view_batch_atomicity.py
+++ b/tests/sdk/context/test_view_batch_atomicity.py
@@ -55,8 +55,8 @@ def create_observation_event(
     tool_call_id: str, content: str = "Success", tool_name: str = "test_tool"
 ) -> ObservationEvent:
     """Helper to create an ObservationEvent."""
-    observation = MCPToolObservation(
-        content=[TextContent(text=content)],
+    observation = MCPToolObservation.from_text(
+        text=content,
         tool_name=tool_name,
     )
     return ObservationEvent(
diff --git a/tests/sdk/mcp/test_mcp_tool.py b/tests/sdk/mcp/test_mcp_tool.py
index a496332c78..0ca70c41aa 100644
--- a/tests/sdk/mcp/test_mcp_tool.py
+++ b/tests/sdk/mcp/test_mcp_tool.py
@@ -96,9 +96,9 @@ def test_from_call_tool_result_with_image(self):
 
     def test_to_llm_content_success(self):
         """Test agent observation formatting for success."""
-        observation = MCPToolObservation(
+        observation = MCPToolObservation.from_text(
+            text="[Tool 'test_tool' executed.]\nSuccess result",
             tool_name="test_tool",
-            content=[TextContent(text="[Tool 'test_tool' executed.]\nSuccess result")],
         )
 
         agent_obs = observation.to_llm_content
@@ -110,13 +110,13 @@ def test_to_llm_content_success(self):
 
     def test_to_llm_content_error(self):
         """Test agent observation formatting for error."""
-        observation = MCPToolObservation(
-            tool_name="test_tool",
-            content=(
+        observation = MCPToolObservation.from_text(
+            text=(
                 "[Tool 'test_tool' executed.]\n"
                 "[An error occurred during execution.]\n"
                 "Error occurred"
             ),
+            tool_name="test_tool",
             is_error=True,
         )
 
@@ -202,8 +202,8 @@ def test_call_tool_exception(self):
 
         # Mock call_async_from_sync to return an error observation
         def mock_call_async_from_sync(coro_func, **kwargs):
-            return MCPToolObservation(
-                content="Error calling MCP tool test_tool: Connection failed",
+            return MCPToolObservation.from_text(
+                text="Error calling MCP tool test_tool: Connection failed",
                 tool_name="test_tool",
                 is_error=True,
             )
@@ -216,7 +216,7 @@ def mock_call_async_from_sync(coro_func, **kwargs):
         assert observation.tool_name == "test_tool"
         assert observation.is_error is True
         assert observation.is_error is True
-        assert "Connection failed" in observation.content
+        assert "Connection failed" in observation.get_text()
 
 
 class TestMCPTool:
diff --git a/tests/tools/browser_use/conftest.py b/tests/tools/browser_use/conftest.py
index bd2d0d42cf..eebd394bc9 100644
--- a/tests/tools/browser_use/conftest.py
+++ b/tests/tools/browser_use/conftest.py
@@ -32,12 +32,10 @@ def create_mock_browser_response(
 ):
     """Helper to create mock browser responses."""
     if error:
-        return BrowserObservation(
-            content=error, is_error=True, screenshot_data=screenshot_data
+        return BrowserObservation.from_text(
+            text=error, is_error=True, screenshot_data=screenshot_data
         )
-    return BrowserObservation(
-        content=[TextContent(text=output)], screenshot_data=screenshot_data
-    )
+    return BrowserObservation.from_text(text=output, screenshot_data=screenshot_data)
 
 
 def assert_browser_observation_success(
@@ -63,4 +61,4 @@ def assert_browser_observation_error(
     assert isinstance(observation, BrowserObservation)
     assert observation.is_error is True
     if expected_error:
-        assert expected_error in observation.content
+        assert expected_error in observation.get_text()
diff --git a/tests/tools/browser_use/test_browser_executor.py b/tests/tools/browser_use/test_browser_executor.py
index 35d72b78b9..c83463cab6 100644
--- a/tests/tools/browser_use/test_browser_executor.py
+++ b/tests/tools/browser_use/test_browser_executor.py
@@ -2,7 +2,6 @@
 
 from unittest.mock import AsyncMock, patch
 
-from openhands.sdk.tool.schema import TextContent
 from openhands.tools.browser_use.definition import (
     BrowserClickAction,
     BrowserGetStateAction,
@@ -73,8 +72,8 @@ async def test_browser_executor_action_routing_get_state(
     mock_get_state, mock_browser_executor
 ):
     """Test that get_state actions are routed correctly and return directly."""
-    expected_observation = BrowserObservation(
-        content=[TextContent(text="State retrieved")], screenshot_data="base64data"
+    expected_observation = BrowserObservation.from_text(
+        text="State retrieved", screenshot_data="base64data"
     )
     mock_get_state.return_value = expected_observation
 
@@ -106,7 +105,7 @@ async def test_browser_executor_error_wrapping(mock_navigate, mock_browser_execu
     result = await mock_browser_executor._execute_action(action)
 
     assert_browser_observation_error(result, "Browser operation failed")
-    assert "Browser error occurred" in result.content
+    assert "Browser error occurred" in result.get_text()
 
 
 def test_browser_executor_async_execution(mock_browser_executor):
@@ -114,7 +113,7 @@ def test_browser_executor_async_execution(mock_browser_executor):
     with patch.object(
         mock_browser_executor, "_execute_action", new_callable=AsyncMock
     ) as mock_execute:
-        expected_result = BrowserObservation(content=[TextContent(text="Test result")])
+        expected_result = BrowserObservation.from_text(text="Test result")
         mock_execute.return_value = expected_result
 
         action = BrowserNavigateAction(url="https://example.com")
diff --git a/tests/tools/browser_use/test_browser_observation.py b/tests/tools/browser_use/test_browser_observation.py
index cf5971d881..3174feecbd 100644
--- a/tests/tools/browser_use/test_browser_observation.py
+++ b/tests/tools/browser_use/test_browser_observation.py
@@ -6,18 +6,18 @@
 
 def test_browser_observation_basic_output():
     """Test basic BrowserObservation creation with output."""
-    observation = BrowserObservation(content="Test output")
+    observation = BrowserObservation.from_text(text="Test output")
 
-    assert observation.content == "Test output"
+    assert observation.get_text() == "Test output"
     assert observation.is_error is False
     assert observation.screenshot_data is None
 
 
 def test_browser_observation_with_error():
     """Test BrowserObservation with error."""
-    observation = BrowserObservation(content="Test error", is_error=True)
+    observation = BrowserObservation.from_text(text="Test error", is_error=True)
 
-    assert observation.content == "Test error"
+    assert observation.get_text() == "Test error"
     assert observation.is_error is True
     assert observation.screenshot_data is None
 
@@ -25,18 +25,18 @@ def test_browser_observation_with_error():
 def test_browser_observation_with_screenshot():
     """Test BrowserObservation with screenshot data."""
     screenshot_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77zgAAAABJRU5ErkJggg=="  # noqa: E501
-    observation = BrowserObservation(
-        content="Screenshot taken", screenshot_data=screenshot_data
+    observation = BrowserObservation.from_text(
+        text="Screenshot taken", screenshot_data=screenshot_data
     )
 
-    assert observation.content == "Screenshot taken"
+    assert observation.get_text() == "Screenshot taken"
     assert observation.is_error is False
     assert observation.screenshot_data == screenshot_data
 
 
 def test_browser_observation_to_llm_content_text_only():
     """Test to_llm_content property with text only."""
-    observation = BrowserObservation(content="Test output")
+    observation = BrowserObservation.from_text(text="Test output")
     agent_obs = observation.to_llm_content
 
     assert len(agent_obs) == 1
@@ -47,8 +47,8 @@ def test_browser_observation_to_llm_content_text_only():
 def test_browser_observation_to_llm_content_with_screenshot():
     """Test to_llm_content property with screenshot."""
     screenshot_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77zgAAAABJRU5ErkJggg=="  # noqa: E501
-    observation = BrowserObservation(
-        content="Screenshot taken", screenshot_data=screenshot_data
+    observation = BrowserObservation.from_text(
+        text="Screenshot taken", screenshot_data=screenshot_data
     )
     agent_obs = observation.to_llm_content
 
@@ -63,7 +63,7 @@ def test_browser_observation_to_llm_content_with_screenshot():
 
 def test_browser_observation_to_llm_content_with_error():
     """Test to_llm_content property with error."""
-    observation = BrowserObservation(content="Test error", is_error=True)
+    observation = BrowserObservation.from_text(text="Test error", is_error=True)
     agent_obs = observation.to_llm_content
 
     assert len(agent_obs) == 2
@@ -77,7 +77,7 @@ def test_browser_observation_output_truncation():
     """Test output truncation for very long outputs."""
     # Create a very long output string
     long_output = "x" * 100000  # 100k characters
-    observation = BrowserObservation(content=long_output)
+    observation = BrowserObservation.from_text(text=long_output)
 
     agent_obs = observation.to_llm_content
 
@@ -91,7 +91,9 @@ def test_browser_observation_output_truncation():
 def test_browser_observation_screenshot_data_url_conversion():
     """Test that screenshot data is properly converted to data URL."""
     screenshot_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChAI9jU77zgAAAABJRU5ErkJggg=="  # noqa: E501
-    observation = BrowserObservation(content="Test", screenshot_data=screenshot_data)
+    observation = BrowserObservation.from_text(
+        text="Test", screenshot_data=screenshot_data
+    )
 
     agent_obs = observation.to_llm_content
     expected_data_url = f"data:image/png;base64,{screenshot_data}"
@@ -103,10 +105,10 @@ def test_browser_observation_screenshot_data_url_conversion():
 
 def test_browser_observation_empty_screenshot_handling():
     """Test handling of empty or None screenshot data."""
-    observation = BrowserObservation(content="Test", screenshot_data="")
+    observation = BrowserObservation.from_text(text="Test", screenshot_data="")
     agent_obs = observation.to_llm_content
     assert len(agent_obs) == 1  # Only text content, no image
 
-    observation = BrowserObservation(content="Test", screenshot_data=None)
+    observation = BrowserObservation.from_text(text="Test", screenshot_data=None)
     agent_obs = observation.to_llm_content
     assert len(agent_obs) == 1  # Only text content, no image
diff --git a/tests/tools/delegation/test_delegation.py b/tests/tools/delegation/test_delegation.py
index 3dd30c2d5f..85f19144fa 100644
--- a/tests/tools/delegation/test_delegation.py
+++ b/tests/tools/delegation/test_delegation.py
@@ -66,12 +66,12 @@ def test_delegate_action_creation():
 def test_delegate_observation_creation():
     """Test creating DelegateObservation instances."""
     # Test spawn observation with string output
-    spawn_observation = DelegateObservation(
+    spawn_observation = DelegateObservation.from_text(
+        text="spawn: Sub-agents created successfully",
         command="spawn",
-        content="spawn: Sub-agents created successfully",
     )
-    assert isinstance(spawn_observation.content, str)
-    assert spawn_observation.content == "spawn: Sub-agents created successfully"
+    assert isinstance(spawn_observation.content, list)
+    assert spawn_observation.get_text() == "spawn: Sub-agents created successfully"
     # Verify to_llm_content returns TextContent
     llm_content = spawn_observation.to_llm_content
     assert len(llm_content) == 1
@@ -79,17 +79,17 @@ def test_delegate_observation_creation():
     assert llm_content[0].text == "spawn: Sub-agents created successfully"
 
     # Test delegate observation with string output
-    delegate_observation = DelegateObservation(
-        command="delegate",
-        content=(
+    delegate_observation = DelegateObservation.from_text(
+        text=(
             "delegate: Tasks completed successfully\n\nResults:\n"
             "1. Result 1\n2. Result 2"
         ),
+        command="delegate",
     )
-    assert isinstance(delegate_observation.content, str)
-    assert "Tasks completed successfully" in delegate_observation.content
-    assert "Result 1" in delegate_observation.content
-    assert "Result 2" in delegate_observation.content
+    assert isinstance(delegate_observation.content, list)
+    assert "Tasks completed successfully" in delegate_observation.get_text()
+    assert "Result 1" in delegate_observation.get_text()
+    assert "Result 2" in delegate_observation.get_text()
     # Verify to_llm_content
     llm_content = delegate_observation.to_llm_content
     assert len(llm_content) == 1
@@ -104,8 +104,8 @@ def test_delegate_executor_delegate():
     # First spawn some agents
     spawn_action = DelegateAction(command="spawn", ids=["agent1", "agent2"])
     spawn_observation = executor(spawn_action, parent_conversation)
-    assert isinstance(spawn_observation.content, str)
-    assert "Successfully spawned" in spawn_observation.content
+    assert isinstance(spawn_observation.content, list)
+    assert "Successfully spawned" in spawn_observation.get_text()
 
     # Then delegate tasks to them
     delegate_action = DelegateAction(
@@ -114,22 +114,23 @@ def test_delegate_executor_delegate():
     )
 
     with patch.object(executor, "_delegate_tasks") as mock_delegate:
-        mock_observation = DelegateObservation(
-            command="delegate",
-            content=(
+        mock_observation = DelegateObservation.from_text(
+            text=(
                 "delegate: Tasks completed successfully\n\nResults:\n"
                 "1. Agent agent1: Code analysis complete\n"
                 "2. Agent agent2: Tests written"
             ),
+            command="delegate",
         )
         mock_delegate.return_value = mock_observation
 
         observation = executor(delegate_action, parent_conversation)
 
     assert isinstance(observation, DelegateObservation)
-    assert isinstance(observation.content, str)
-    assert "Agent agent1: Code analysis complete" in observation.content
-    assert "Agent agent2: Tests written" in observation.content
+    assert isinstance(observation.content, list)
+    text_content = observation.get_text()
+    assert "Agent agent1: Code analysis complete" in text_content
+    assert "Agent agent2: Tests written" in text_content
 
 
 def test_delegate_executor_missing_task():
diff --git a/tests/tools/execute_bash/test_bash_ps1_metadata.py b/tests/tools/execute_bash/test_bash_ps1_metadata.py
index fb52c415a0..e34664ed50 100644
--- a/tests/tools/execute_bash/test_bash_ps1_metadata.py
+++ b/tests/tools/execute_bash/test_bash_ps1_metadata.py
@@ -273,9 +273,9 @@ def test_cmd_output_observation_properties():
 
     # Test with successful command
     metadata = CmdOutputMetadata(exit_code=0, pid=123)
-    obs = ExecuteBashObservation(
+    obs = ExecuteBashObservation.from_text(
+        text="file1\nfile2",
         command="ls",
-        content="file1\nfile2",
         exit_code=0,
         metadata=metadata,
     )
@@ -291,10 +291,10 @@ def test_cmd_output_observation_properties():
 
     # Test with failed command
     metadata = CmdOutputMetadata(exit_code=1, pid=456)
-    obs = ExecuteBashObservation(
+    obs = ExecuteBashObservation.from_text(
+        text="Command failed",
         command="invalid",
         exit_code=1,
-        content="Command failed",
         is_error=True,
         metadata=metadata,
     )

From 2645aabaeaf1a4ccda63c892f9c10a944167de20 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 5 Nov 2025 07:28:23 +0000
Subject: [PATCH 49/76] fix: update tests and examples to use list-based
 observation.content

- Fixed tests that incorrectly compared observation.content to strings
- Updated example to use get_text_safe() instead of direct content access
- Removed useless type narrowing assertion in MCPToolObservation.visualize
- All tests and pre-commit hooks now pass

Co-authored-by: openhands <openhands@all-hands.dev>
---
 examples/01_standalone_sdk/02_custom_tools.py    |  3 +--
 openhands-sdk/openhands/sdk/mcp/definition.py    |  3 ---
 .../conversation/local/test_confirmation_mode.py | 10 +++++++---
 tests/sdk/conversation/test_visualizer.py        |  9 ++-------
 tests/sdk/event/test_event_serialization.py      | 16 +++++++---------
 tests/tools/execute_bash/test_bash_session.py    |  2 +-
 .../execute_bash/test_observation_truncation.py  |  2 +-
 7 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/examples/01_standalone_sdk/02_custom_tools.py b/examples/01_standalone_sdk/02_custom_tools.py
index 3c8309875e..8f5750ffb6 100644
--- a/examples/01_standalone_sdk/02_custom_tools.py
+++ b/examples/01_standalone_sdk/02_custom_tools.py
@@ -92,8 +92,7 @@ def __call__(self, action: GrepAction, conversation=None) -> GrepObservation:  #
         files: set[str] = set()
 
         # grep returns exit code 1 when no matches; treat as empty
-        assert isinstance(result.content, str)
-        output_text = result.content
+        output_text = result.get_text_safe()
 
         if output_text.strip():
             for line in output_text.strip().splitlines():
diff --git a/openhands-sdk/openhands/sdk/mcp/definition.py b/openhands-sdk/openhands/sdk/mcp/definition.py
index 84b6deee6f..539783d6df 100644
--- a/openhands-sdk/openhands/sdk/mcp/definition.py
+++ b/openhands-sdk/openhands/sdk/mcp/definition.py
@@ -89,9 +89,6 @@ def from_call_tool_result(
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation of this observation."""
-        # MCPToolObservation always has content as a list
-        assert isinstance(self.content, list)
-
         content_obj = Text()
         content_obj.append(f"[MCP Tool '{self.tool_name}' Observation]\n", style="bold")
         if self.is_error:
diff --git a/tests/sdk/conversation/local/test_confirmation_mode.py b/tests/sdk/conversation/local/test_confirmation_mode.py
index 2e87755604..193d5fe292 100644
--- a/tests/sdk/conversation/local/test_confirmation_mode.py
+++ b/tests/sdk/conversation/local/test_confirmation_mode.py
@@ -553,7 +553,9 @@ def test_single_finish_action_skips_confirmation_entirely(self):
         ]
         assert len(obs_events) == 1
         # FinishObservation should contain the finish message in content
-        assert obs_events[0].observation.content == "Task completed successfully!"
+        assert (
+            obs_events[0].observation.get_text_safe() == "Task completed successfully!"
+        )
 
     def test_think_and_finish_action_skips_confirmation_entirely(self):
         """First step: ThinkAction (skips confirmation). Second step: FinishAction."""
@@ -597,11 +599,13 @@ def test_think_and_finish_action_skips_confirmation_entirely(self):
 
         # 1) ThinkAction observation - should contain the standard message
         assert hasattr(obs_events[0].observation, "content")
-        assert obs_events[0].observation.content == "Your thought has been logged."
+        assert (
+            obs_events[0].observation.get_text_safe() == "Your thought has been logged."
+        )
 
         # 2) FinishAction observation - should contain the finish message
         assert hasattr(obs_events[1].observation, "content")
-        assert obs_events[1].observation.content == "Analysis complete"
+        assert obs_events[1].observation.get_text_safe() == "Analysis complete"
 
     def test_pause_during_confirmation_preserves_waiting_status(self):
         """Test that pausing during WAITING_FOR_CONFIRMATION preserves the status.
diff --git a/tests/sdk/conversation/test_visualizer.py b/tests/sdk/conversation/test_visualizer.py
index 8037fc2dcd..040c9c00f9 100644
--- a/tests/sdk/conversation/test_visualizer.py
+++ b/tests/sdk/conversation/test_visualizer.py
@@ -1,7 +1,6 @@
 """Tests for the conversation visualizer and event visualization."""
 
 import json
-from collections.abc import Sequence
 
 from rich.text import Text
 
@@ -19,7 +18,6 @@
     UserRejectObservation,
 )
 from openhands.sdk.llm import (
-    ImageContent,
     Message,
     MessageToolCall,
     TextContent,
@@ -153,13 +151,10 @@ def test_observation_event_visualize():
     from openhands.sdk.tool import Observation
 
     class VisualizerMockObservation(Observation):
-        @property
-        def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-            assert isinstance(self.content, str)
-            return [TextContent(text=self.content)]
+        pass
 
     observation = VisualizerMockObservation(
-        content="total 4\ndrwxr-xr-x 2 user user 4096 Jan 1 12:00 ."
+        content=[TextContent(text="total 4\ndrwxr-xr-x 2 user user 4096 Jan 1 12:00 .")]
     )
     event = ObservationEvent(
         observation=observation,
diff --git a/tests/sdk/event/test_event_serialization.py b/tests/sdk/event/test_event_serialization.py
index bbb82a799b..e922bfe8ca 100644
--- a/tests/sdk/event/test_event_serialization.py
+++ b/tests/sdk/event/test_event_serialization.py
@@ -1,7 +1,5 @@
 """Comprehensive tests for event serialization and deserialization."""
 
-from collections.abc import Sequence
-
 import pytest
 from pydantic import ValidationError
 
@@ -16,7 +14,6 @@
     SystemPromptEvent,
 )
 from openhands.sdk.llm import (
-    ImageContent,
     Message,
     MessageToolCall,
     TextContent,
@@ -32,16 +29,15 @@ class EventsSerializationMockAction(Action):
     """Mock action for testing."""
 
     def execute(self) -> "EventsSerializationMockObservation":
-        return EventsSerializationMockObservation(content="mock result")
+        return EventsSerializationMockObservation(
+            content=[TextContent(text="mock result")]
+        )
 
 
 class EventsSerializationMockObservation(Observation):
     """Mock observation for testing."""
 
-    @property
-    def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
-        assert isinstance(self.content, str)
-        return [TextContent(text=self.content)]
+    pass
 
 
 def test_event_base_serialization() -> None:
@@ -99,7 +95,9 @@ def test_action_event_serialization() -> None:
 
 def test_observation_event_serialization() -> None:
     """Test ObservationEvent serialization/deserialization."""
-    observation = EventsSerializationMockObservation(content="test result")
+    observation = EventsSerializationMockObservation(
+        content=[TextContent(text="test result")]
+    )
     event = ObservationEvent(
         observation=observation,
         action_id="action_123",
diff --git a/tests/tools/execute_bash/test_bash_session.py b/tests/tools/execute_bash/test_bash_session.py
index bc4ff492c2..c56ae1d7a7 100644
--- a/tests/tools/execute_bash/test_bash_session.py
+++ b/tests/tools/execute_bash/test_bash_session.py
@@ -320,7 +320,7 @@ def test_empty_command_error(terminal_type):
     obs = session.execute(ExecuteBashAction(command=""))
 
     assert obs.is_error is True
-    assert obs.content == "No previous running command to retrieve logs from."
+    assert obs.get_text_safe() == "No previous running command to retrieve logs from."
     assert len(obs.to_llm_content) == 2
     assert isinstance(obs.to_llm_content[0], TextContent)
     assert obs.to_llm_content[0].text == "Tool Execution Error. "
diff --git a/tests/tools/execute_bash/test_observation_truncation.py b/tests/tools/execute_bash/test_observation_truncation.py
index 3db0c8db3a..5137d65a44 100644
--- a/tests/tools/execute_bash/test_observation_truncation.py
+++ b/tests/tools/execute_bash/test_observation_truncation.py
@@ -84,7 +84,7 @@ def test_execute_bash_observation_truncation_with_error():
     )
 
     observation = ExecuteBashObservation(
-        content="Command failed",
+        content=[TextContent(text="Command failed")],
         metadata=metadata,
         is_error=True,
     )

From 249f9fed3cd7e07cbeddaef95579d6aef78899cf Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 5 Nov 2025 09:00:50 +0000
Subject: [PATCH 50/76] fix: update task_tracker and tests to use list-based
 observation.content

- Changed TaskTrackerObservation to use from_text() helper for content initialization
- Updated all test assertions to use get_text_safe() instead of direct content access
- Fixed type errors in task_tracker/definition.py (4 errors)
- Fixed test failures in grep, glob, execute_bash, and cross-tests
- All tests now properly handle observation.content as list[TextContent | ImageContent]

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../tools/task_tracker/definition.py          | 18 +++++++-------
 tests/cross/test_agent_secrets_integration.py | 24 +++++++++----------
 tests/tools/execute_bash/test_bash_session.py |  2 +-
 tests/tools/glob/test_glob_executor.py        |  2 +-
 tests/tools/glob/test_glob_tool.py            |  2 +-
 tests/tools/grep/test_grep_executor.py        |  4 ++--
 tests/tools/grep/test_grep_tool.py            |  4 ++--
 7 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/openhands-tools/openhands/tools/task_tracker/definition.py b/openhands-tools/openhands/tools/task_tracker/definition.py
index 11956806d7..c3eacd3ade 100644
--- a/openhands-tools/openhands/tools/task_tracker/definition.py
+++ b/openhands-tools/openhands/tools/task_tracker/definition.py
@@ -170,8 +170,8 @@ def __call__(
             # Save to file if save_dir is provided
             if self.save_dir:
                 self._save_tasks()
-            return TaskTrackerObservation(
-                content=(
+            return TaskTrackerObservation.from_text(
+                text=(
                     f"Task list has been updated with {len(self._task_list)} item(s)."
                 ),
                 command=action.command,
@@ -180,22 +180,20 @@ def __call__(
         elif action.command == "view":
             # Return the current task list
             if not self._task_list:
-                return TaskTrackerObservation(
-                    content=(
-                        'No task list found. Use the "plan" command to create one.'
-                    ),
+                return TaskTrackerObservation.from_text(
+                    text=('No task list found. Use the "plan" command to create one.'),
                     command=action.command,
                     task_list=[],
                 )
             content = self._format_task_list(self._task_list)
-            return TaskTrackerObservation(
-                content=content,
+            return TaskTrackerObservation.from_text(
+                text=content,
                 command=action.command,
                 task_list=self._task_list,
             )
         else:
-            return TaskTrackerObservation(
-                content=(
+            return TaskTrackerObservation.from_text(
+                text=(
                     f"Unknown command: {action.command}. "
                     'Supported commands are "view" and "plan".'
                 ),
diff --git a/tests/cross/test_agent_secrets_integration.py b/tests/cross/test_agent_secrets_integration.py
index b98d62ee1b..c663f2d0bb 100644
--- a/tests/cross/test_agent_secrets_integration.py
+++ b/tests/cross/test_agent_secrets_integration.py
@@ -234,13 +234,13 @@ def get_value(self):
     try:
         action = ExecuteBashAction(command="echo $API_KEY")
         result = bash_executor(action, conversation=conversation)
-        assert "test-api-key" not in result.content
-        assert "<secret-hidden>" in result.content
+        assert "test-api-key" not in result.get_text_safe()
+        assert "<secret-hidden>" in result.get_text_safe()
 
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "dynamic-secret" not in result.content
-        assert "<secret-hidden>" in result.content
+        assert "dynamic-secret" not in result.get_text_safe()
+        assert "<secret-hidden>" in result.get_text_safe()
 
     finally:
         bash_executor.close()
@@ -265,13 +265,13 @@ def get_value(self):
     try:
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "changing-secret" not in result.content
-        assert "<secret-hidden>" in result.content
+        assert "changing-secret" not in result.get_text_safe()
+        assert "<secret-hidden>" in result.get_text_safe()
 
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "changing-secret" not in result.content
-        assert "<secret-hidden>" in result.content
+        assert "changing-secret" not in result.get_text_safe()
+        assert "<secret-hidden>" in result.get_text_safe()
 
     finally:
         bash_executor.close()
@@ -303,13 +303,13 @@ def get_value(self):
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
         print(result)
-        assert "changing-secret" not in result.content
-        assert "<secret-hidden>" in result.content
+        assert "changing-secret" not in result.get_text_safe()
+        assert "<secret-hidden>" in result.get_text_safe()
 
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "changing-secret" not in result.content
-        assert "<secret-hidden>" in result.content
+        assert "changing-secret" not in result.get_text_safe()
+        assert "<secret-hidden>" in result.get_text_safe()
         assert dynamic_secret.raised_on_second
 
     finally:
diff --git a/tests/tools/execute_bash/test_bash_session.py b/tests/tools/execute_bash/test_bash_session.py
index c56ae1d7a7..7be66f46cc 100644
--- a/tests/tools/execute_bash/test_bash_session.py
+++ b/tests/tools/execute_bash/test_bash_session.py
@@ -720,7 +720,7 @@ def test_multiple_multiline_commands(terminal_type):
             # First test that running multiple commands at once fails
             obs = _run_bash_action(session, joined_cmds)
             assert obs.is_error is True
-            assert "Cannot execute multiple commands at once" in obs.content
+            assert "Cannot execute multiple commands at once" in obs.get_text_safe()
 
             # Now run each command individually and verify they work
             results = []
diff --git a/tests/tools/glob/test_glob_executor.py b/tests/tools/glob/test_glob_executor.py
index a4c89f25ce..7a6fe29f6f 100644
--- a/tests/tools/glob/test_glob_executor.py
+++ b/tests/tools/glob/test_glob_executor.py
@@ -84,7 +84,7 @@ def test_glob_executor_invalid_path():
         observation = executor(action)
 
         assert observation.is_error is True
-        assert "is not a valid directory" in observation.content
+        assert "is not a valid directory" in observation.get_text_safe()
         assert len(observation.files) == 0
 
 
diff --git a/tests/tools/glob/test_glob_tool.py b/tests/tools/glob/test_glob_tool.py
index f3f0e27be7..f04e2d520b 100644
--- a/tests/tools/glob/test_glob_tool.py
+++ b/tests/tools/glob/test_glob_tool.py
@@ -162,7 +162,7 @@ def test_glob_tool_invalid_directory():
         observation = tool.executor(action)
 
         assert observation.is_error is True
-        assert "is not a valid directory" in observation.content
+        assert "is not a valid directory" in observation.get_text_safe()
         assert len(observation.files) == 0
 
 
diff --git a/tests/tools/grep/test_grep_executor.py b/tests/tools/grep/test_grep_executor.py
index 0f7be69c87..d70737d0df 100644
--- a/tests/tools/grep/test_grep_executor.py
+++ b/tests/tools/grep/test_grep_executor.py
@@ -106,7 +106,7 @@ def test_grep_executor_invalid_path():
         observation = executor(action)
 
         assert observation.is_error is True
-        assert "not a valid directory" in observation.content
+        assert "not a valid directory" in observation.get_text_safe()
 
 
 def test_grep_executor_no_matches():
@@ -186,4 +186,4 @@ def test_grep_executor_invalid_regex():
         observation = executor(action)
 
         assert observation.is_error is True
-        assert "Invalid regex pattern" in observation.content
+        assert "Invalid regex pattern" in observation.get_text_safe()
diff --git a/tests/tools/grep/test_grep_tool.py b/tests/tools/grep/test_grep_tool.py
index 14199f8175..44639b50e7 100644
--- a/tests/tools/grep/test_grep_tool.py
+++ b/tests/tools/grep/test_grep_tool.py
@@ -164,7 +164,7 @@ def test_grep_tool_invalid_regex():
         observation = tool.executor(action)
 
         assert observation.is_error is True
-        assert "Invalid regex pattern" in observation.content
+        assert "Invalid regex pattern" in observation.get_text_safe()
 
 
 def test_grep_tool_invalid_directory():
@@ -179,7 +179,7 @@ def test_grep_tool_invalid_directory():
         observation = tool.executor(action)
 
         assert observation.is_error is True
-        assert "not a valid directory" in observation.content
+        assert "not a valid directory" in observation.get_text_safe()
 
 
 def test_grep_tool_hidden_files_excluded():

From 851213b788b41e3fe02201ca2d6428d320218415 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 5 Nov 2025 09:40:56 +0000
Subject: [PATCH 51/76] refactor: simplify Observation API by replacing
 get_text_safe with get_text

- Removed get_text_safe() method and merged its implementation into get_text()
- get_text() now safely extracts all text content without raising errors
- Updated all usages of get_text_safe() to get_text() across codebase
- Simplifies API by having a single, safe text extraction method

Co-authored-by: openhands <openhands@all-hands.dev>
---
 examples/01_standalone_sdk/02_custom_tools.py |  2 +-
 openhands-sdk/openhands/sdk/tool/schema.py    | 24 +------------------
 .../openhands/tools/browser_use/definition.py |  2 +-
 .../tools/execute_bash/definition.py          |  4 ++--
 .../openhands/tools/execute_bash/impl.py      |  2 +-
 tests/cross/test_agent_secrets_integration.py | 24 +++++++++----------
 .../local/test_confirmation_mode.py           | 10 +++-----
 tests/tools/execute_bash/test_bash_session.py |  4 ++--
 tests/tools/glob/test_glob_executor.py        |  2 +-
 tests/tools/glob/test_glob_tool.py            |  2 +-
 tests/tools/grep/test_grep_executor.py        |  4 ++--
 tests/tools/grep/test_grep_tool.py            |  4 ++--
 12 files changed, 29 insertions(+), 55 deletions(-)

diff --git a/examples/01_standalone_sdk/02_custom_tools.py b/examples/01_standalone_sdk/02_custom_tools.py
index 8f5750ffb6..2b75edd295 100644
--- a/examples/01_standalone_sdk/02_custom_tools.py
+++ b/examples/01_standalone_sdk/02_custom_tools.py
@@ -92,7 +92,7 @@ def __call__(self, action: GrepAction, conversation=None) -> GrepObservation:  #
         files: set[str] = set()
 
         # grep returns exit code 1 when no matches; treat as empty
-        output_text = result.get_text_safe()
+        output_text = result.get_text()
 
         if output_text.strip():
             for line in output_text.strip().splitlines():
diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index 044e90f218..382ffb9672 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -229,29 +229,7 @@ def from_text(
         return cls(content=[TextContent(text=text)], is_error=is_error, **kwargs)
 
     def get_text(self) -> str:
-        """Extract text when observation contains a single TextContent.
-
-        Returns:
-            Text from the first TextContent item, or empty string if none.
-
-        Raises:
-            ValueError: If content has multiple items or non-text content.
-        """
-        if not self.content:
-            return ""
-        if len(self.content) > 1:
-            raise ValueError(
-                "get_text() can only be used when content contains a single item"
-            )
-        item = self.content[0]
-        if not isinstance(item, TextContent):
-            raise ValueError(
-                "get_text() can only be used when content contains TextContent"
-            )
-        return item.text
-
-    def get_text_safe(self) -> str:
-        """Safely extract all text content from the observation.
+        """Extract all text content from the observation.
 
         Returns:
             Concatenated text from all TextContent items in content.
diff --git a/openhands-tools/openhands/tools/browser_use/definition.py b/openhands-tools/openhands/tools/browser_use/definition.py
index f0c2e78c9b..9396a580ca 100644
--- a/openhands-tools/openhands/tools/browser_use/definition.py
+++ b/openhands-tools/openhands/tools/browser_use/definition.py
@@ -41,7 +41,7 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
             llm_content.append(TextContent(text=self.error_message_header))
 
         # Get text content and truncate if needed
-        content_text = self.get_text_safe()
+        content_text = self.get_text()
         if content_text:
             llm_content.append(
                 TextContent(text=maybe_truncate(content_text, MAX_BROWSER_OUTPUT_SIZE))
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index 8e9459540c..e9ad28bed9 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -110,7 +110,7 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
             llm_content.append(TextContent(text=self.error_message_header))
 
         # ExecuteBashObservation always has content as a single TextContent
-        content_text = self.get_text_safe()
+        content_text = self.get_text()
 
         ret = f"{self.metadata.prefix}{content_text}{self.metadata.suffix}"
         if self.metadata.working_dir:
@@ -134,7 +134,7 @@ def visualize(self) -> Text:
             content_obj.append("Command execution error\n", style="red")
 
         # ExecuteBashObservation always has content as a single TextContent
-        content_text = self.get_text_safe()
+        content_text = self.get_text()
 
         if content_text:
             # Style the output based on content
diff --git a/openhands-tools/openhands/tools/execute_bash/impl.py b/openhands-tools/openhands/tools/execute_bash/impl.py
index 5774badfec..69580e5be3 100644
--- a/openhands-tools/openhands/tools/execute_bash/impl.py
+++ b/openhands-tools/openhands/tools/execute_bash/impl.py
@@ -179,7 +179,7 @@ def __call__(
             observation = self.session.execute(action)
 
         # Apply automatic secrets masking
-        content_text = observation.get_text_safe()
+        content_text = observation.get_text()
 
         if content_text and conversation is not None:
             try:
diff --git a/tests/cross/test_agent_secrets_integration.py b/tests/cross/test_agent_secrets_integration.py
index c663f2d0bb..cb1cf27b3d 100644
--- a/tests/cross/test_agent_secrets_integration.py
+++ b/tests/cross/test_agent_secrets_integration.py
@@ -234,13 +234,13 @@ def get_value(self):
     try:
         action = ExecuteBashAction(command="echo $API_KEY")
         result = bash_executor(action, conversation=conversation)
-        assert "test-api-key" not in result.get_text_safe()
-        assert "<secret-hidden>" in result.get_text_safe()
+        assert "test-api-key" not in result.get_text()
+        assert "<secret-hidden>" in result.get_text()
 
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "dynamic-secret" not in result.get_text_safe()
-        assert "<secret-hidden>" in result.get_text_safe()
+        assert "dynamic-secret" not in result.get_text()
+        assert "<secret-hidden>" in result.get_text()
 
     finally:
         bash_executor.close()
@@ -265,13 +265,13 @@ def get_value(self):
     try:
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "changing-secret" not in result.get_text_safe()
-        assert "<secret-hidden>" in result.get_text_safe()
+        assert "changing-secret" not in result.get_text()
+        assert "<secret-hidden>" in result.get_text()
 
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "changing-secret" not in result.get_text_safe()
-        assert "<secret-hidden>" in result.get_text_safe()
+        assert "changing-secret" not in result.get_text()
+        assert "<secret-hidden>" in result.get_text()
 
     finally:
         bash_executor.close()
@@ -303,13 +303,13 @@ def get_value(self):
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
         print(result)
-        assert "changing-secret" not in result.get_text_safe()
-        assert "<secret-hidden>" in result.get_text_safe()
+        assert "changing-secret" not in result.get_text()
+        assert "<secret-hidden>" in result.get_text()
 
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "changing-secret" not in result.get_text_safe()
-        assert "<secret-hidden>" in result.get_text_safe()
+        assert "changing-secret" not in result.get_text()
+        assert "<secret-hidden>" in result.get_text()
         assert dynamic_secret.raised_on_second
 
     finally:
diff --git a/tests/sdk/conversation/local/test_confirmation_mode.py b/tests/sdk/conversation/local/test_confirmation_mode.py
index 193d5fe292..343ee87524 100644
--- a/tests/sdk/conversation/local/test_confirmation_mode.py
+++ b/tests/sdk/conversation/local/test_confirmation_mode.py
@@ -553,9 +553,7 @@ def test_single_finish_action_skips_confirmation_entirely(self):
         ]
         assert len(obs_events) == 1
         # FinishObservation should contain the finish message in content
-        assert (
-            obs_events[0].observation.get_text_safe() == "Task completed successfully!"
-        )
+        assert obs_events[0].observation.get_text() == "Task completed successfully!"
 
     def test_think_and_finish_action_skips_confirmation_entirely(self):
         """First step: ThinkAction (skips confirmation). Second step: FinishAction."""
@@ -599,13 +597,11 @@ def test_think_and_finish_action_skips_confirmation_entirely(self):
 
         # 1) ThinkAction observation - should contain the standard message
         assert hasattr(obs_events[0].observation, "content")
-        assert (
-            obs_events[0].observation.get_text_safe() == "Your thought has been logged."
-        )
+        assert obs_events[0].observation.get_text() == "Your thought has been logged."
 
         # 2) FinishAction observation - should contain the finish message
         assert hasattr(obs_events[1].observation, "content")
-        assert obs_events[1].observation.get_text_safe() == "Analysis complete"
+        assert obs_events[1].observation.get_text() == "Analysis complete"
 
     def test_pause_during_confirmation_preserves_waiting_status(self):
         """Test that pausing during WAITING_FOR_CONFIRMATION preserves the status.
diff --git a/tests/tools/execute_bash/test_bash_session.py b/tests/tools/execute_bash/test_bash_session.py
index 7be66f46cc..5d96d70f90 100644
--- a/tests/tools/execute_bash/test_bash_session.py
+++ b/tests/tools/execute_bash/test_bash_session.py
@@ -320,7 +320,7 @@ def test_empty_command_error(terminal_type):
     obs = session.execute(ExecuteBashAction(command=""))
 
     assert obs.is_error is True
-    assert obs.get_text_safe() == "No previous running command to retrieve logs from."
+    assert obs.get_text() == "No previous running command to retrieve logs from."
     assert len(obs.to_llm_content) == 2
     assert isinstance(obs.to_llm_content[0], TextContent)
     assert obs.to_llm_content[0].text == "Tool Execution Error. "
@@ -720,7 +720,7 @@ def test_multiple_multiline_commands(terminal_type):
             # First test that running multiple commands at once fails
             obs = _run_bash_action(session, joined_cmds)
             assert obs.is_error is True
-            assert "Cannot execute multiple commands at once" in obs.get_text_safe()
+            assert "Cannot execute multiple commands at once" in obs.get_text()
 
             # Now run each command individually and verify they work
             results = []
diff --git a/tests/tools/glob/test_glob_executor.py b/tests/tools/glob/test_glob_executor.py
index 7a6fe29f6f..f4d478b7bc 100644
--- a/tests/tools/glob/test_glob_executor.py
+++ b/tests/tools/glob/test_glob_executor.py
@@ -84,7 +84,7 @@ def test_glob_executor_invalid_path():
         observation = executor(action)
 
         assert observation.is_error is True
-        assert "is not a valid directory" in observation.get_text_safe()
+        assert "is not a valid directory" in observation.get_text()
         assert len(observation.files) == 0
 
 
diff --git a/tests/tools/glob/test_glob_tool.py b/tests/tools/glob/test_glob_tool.py
index f04e2d520b..a4fc06ce72 100644
--- a/tests/tools/glob/test_glob_tool.py
+++ b/tests/tools/glob/test_glob_tool.py
@@ -162,7 +162,7 @@ def test_glob_tool_invalid_directory():
         observation = tool.executor(action)
 
         assert observation.is_error is True
-        assert "is not a valid directory" in observation.get_text_safe()
+        assert "is not a valid directory" in observation.get_text()
         assert len(observation.files) == 0
 
 
diff --git a/tests/tools/grep/test_grep_executor.py b/tests/tools/grep/test_grep_executor.py
index d70737d0df..014df969e4 100644
--- a/tests/tools/grep/test_grep_executor.py
+++ b/tests/tools/grep/test_grep_executor.py
@@ -106,7 +106,7 @@ def test_grep_executor_invalid_path():
         observation = executor(action)
 
         assert observation.is_error is True
-        assert "not a valid directory" in observation.get_text_safe()
+        assert "not a valid directory" in observation.get_text()
 
 
 def test_grep_executor_no_matches():
@@ -186,4 +186,4 @@ def test_grep_executor_invalid_regex():
         observation = executor(action)
 
         assert observation.is_error is True
-        assert "Invalid regex pattern" in observation.get_text_safe()
+        assert "Invalid regex pattern" in observation.get_text()
diff --git a/tests/tools/grep/test_grep_tool.py b/tests/tools/grep/test_grep_tool.py
index 44639b50e7..2f4f99ccb8 100644
--- a/tests/tools/grep/test_grep_tool.py
+++ b/tests/tools/grep/test_grep_tool.py
@@ -164,7 +164,7 @@ def test_grep_tool_invalid_regex():
         observation = tool.executor(action)
 
         assert observation.is_error is True
-        assert "Invalid regex pattern" in observation.get_text_safe()
+        assert "Invalid regex pattern" in observation.get_text()
 
 
 def test_grep_tool_invalid_directory():
@@ -179,7 +179,7 @@ def test_grep_tool_invalid_directory():
         observation = tool.executor(action)
 
         assert observation.is_error is True
-        assert "not a valid directory" in observation.get_text_safe()
+        assert "not a valid directory" in observation.get_text()
 
 
 def test_grep_tool_hidden_files_excluded():

From 45bff228b18837d02471714152c8bf98f10f43ff Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Wed, 5 Nov 2025 10:44:51 +0100
Subject: [PATCH 52/76] rename content_obj -> content in MCP visualize

---
 openhands-sdk/openhands/sdk/mcp/definition.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/mcp/definition.py b/openhands-sdk/openhands/sdk/mcp/definition.py
index 539783d6df..2baa1aa9ea 100644
--- a/openhands-sdk/openhands/sdk/mcp/definition.py
+++ b/openhands-sdk/openhands/sdk/mcp/definition.py
@@ -89,19 +89,19 @@ def from_call_tool_result(
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation of this observation."""
-        content_obj = Text()
-        content_obj.append(f"[MCP Tool '{self.tool_name}' Observation]\n", style="bold")
+        content = Text()
+        content.append(f"[MCP Tool '{self.tool_name}' Observation]\n", style="bold")
         if self.is_error:
-            content_obj.append("[Error during execution]\n", style="bold red")
+            content.append("[Error during execution]\n", style="bold red")
         for block in self.content:
             if isinstance(block, TextContent):
                 # try to see if block.text is a JSON
                 try:
                     parsed = json.loads(block.text)
-                    content_obj.append(display_dict(parsed))
+                    content.append(display_dict(parsed))
                     continue
                 except (json.JSONDecodeError, TypeError):
-                    content_obj.append(block.text + "\n")
+                    content.append(block.text + "\n")
             elif isinstance(block, ImageContent):
-                content_obj.append(f"[Image with {len(block.image_urls)} URLs]\n")
-        return content_obj
+                content.append(f"[Image with {len(block.image_urls)} URLs]\n")
+        return content

From 5b4cffd9a7de7f5d5acb1b7474695817c347d4fe Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 5 Nov 2025 10:08:55 +0000
Subject: [PATCH 53/76] refactor: standardize visualize method error handling
 across observations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Standardized on 'text' as the name of the local Rich Text accumulator
  (previously 'content' or 'content_obj') in the visualize methods listed below
- Added consistent error prefix pattern with ❌ emoji and error_message_header
- Applied changes to:
  - Base Observation class in tool/schema.py
  - ExecuteBashObservation
  - FileEditorObservation
  - TaskTrackerObservation
- Ensures consistent error display across all observation types

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/tool/schema.py    | 13 ++++--
 .../tools/execute_bash/definition.py          | 45 ++++++++-----------
 .../openhands/tools/file_editor/definition.py |  9 +++-
 .../tools/task_tracker/definition.py          | 36 ++++++++-------
 4 files changed, 56 insertions(+), 47 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index 382ffb9672..69967ec8b3 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -262,11 +262,16 @@ def visualize(self) -> Text:
         Subclasses can override for custom visualization; by default we show the
         same text that would be sent to the LLM.
         """
-        content = Text()
+        text = Text()
+
+        if self.is_error:
+            text.append("❌ ", style="red bold")
+            text.append(self.error_message_header, style="bold red")
+
         text_parts = content_to_str(self.to_llm_content)
         if text_parts:
             full_content = "".join(text_parts)
-            content.append(full_content)
+            text.append(full_content)
         else:
-            content.append("[no text content]")
-        return content
+            text.append("[no text content]")
+        return text
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index e9ad28bed9..df7d4cf001 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -126,12 +126,11 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation with terminal-style output formatting."""
-        content_obj = Text()
+        text = Text()
 
-        # Add error indicator if present
         if self.is_error:
-            content_obj.append("❌ ", style="red bold")
-            content_obj.append("Command execution error\n", style="red")
+            text.append("❌ ", style="red bold")
+            text.append(self.error_message_header, style="bold red")
 
         # ExecuteBashObservation always has content as a single TextContent
         content_text = self.get_text()
@@ -146,28 +145,28 @@ def visualize(self) -> Text:
                         keyword in line.lower()
                         for keyword in ["error", "failed", "exception", "traceback"]
                     ):
-                        content_obj.append(line, style="red")
+                        text.append(line, style="red")
                     elif any(
                         keyword in line.lower() for keyword in ["warning", "warn"]
                     ):
-                        content_obj.append(line, style="yellow")
+                        text.append(line, style="yellow")
                     elif line.startswith("+ "):  # bash -x output
-                        content_obj.append(line, style="cyan")
+                        text.append(line, style="cyan")
                     else:
-                        content_obj.append(line, style="white")
-                content_obj.append("\n")
+                        text.append(line, style="white")
+                text.append("\n")
 
         # Add metadata with styling
         if hasattr(self, "metadata") and self.metadata:
             if self.metadata.working_dir:
-                content_obj.append("\n📁 ", style="blue")
-                content_obj.append(
+                text.append("\n📁 ", style="blue")
+                text.append(
                     f"Working directory: {self.metadata.working_dir}", style="blue"
                 )
 
             if self.metadata.py_interpreter_path:
-                content_obj.append("\n🐍 ", style="green")
-                content_obj.append(
+                text.append("\n🐍 ", style="green")
+                text.append(
                     f"Python interpreter: {self.metadata.py_interpreter_path}",
                     style="green",
                 )
@@ -177,22 +176,16 @@ def visualize(self) -> Text:
                 and self.metadata.exit_code is not None
             ):
                 if self.metadata.exit_code == 0:
-                    content_obj.append("\n✅ ", style="green")
-                    content_obj.append(
-                        f"Exit code: {self.metadata.exit_code}", style="green"
-                    )
+                    text.append("\n✅ ", style="green")
+                    text.append(f"Exit code: {self.metadata.exit_code}", style="green")
                 elif self.metadata.exit_code == -1:
-                    content_obj.append("\n⏳ ", style="yellow")
-                    content_obj.append(
-                        "Process still running (soft timeout)", style="yellow"
-                    )
+                    text.append("\n⏳ ", style="yellow")
+                    text.append("Process still running (soft timeout)", style="yellow")
                 else:
-                    content_obj.append("\n❌ ", style="red")
-                    content_obj.append(
-                        f"Exit code: {self.metadata.exit_code}", style="red"
-                    )
+                    text.append("\n❌ ", style="red")
+                    text.append(f"Exit code: {self.metadata.exit_code}", style="red")
 
-        return content_obj
+        return text
 
 
 TOOL_DESCRIPTION = """Execute a bash command in the terminal within a persistent shell session.
diff --git a/openhands-tools/openhands/tools/file_editor/definition.py b/openhands-tools/openhands/tools/file_editor/definition.py
index 34dc7b3538..c2b3a69a60 100644
--- a/openhands-tools/openhands/tools/file_editor/definition.py
+++ b/openhands-tools/openhands/tools/file_editor/definition.py
@@ -95,6 +95,11 @@ def visualize(self) -> Text:
         Shows diff visualization for meaningful changes (file creation, successful
         edits), otherwise falls back to agent observation.
         """
+        text = Text()
+
+        if self.is_error:
+            text.append("❌ ", style="red bold")
+            text.append(self.error_message_header, style="bold red")
 
         if not self._has_meaningful_diff:
             return super().visualize
@@ -111,7 +116,9 @@ def visualize(self) -> Text:
                 change_applied=change_applied,
             )
 
-        return self._diff_cache
+        # Combine error prefix with diff visualization
+        text.append(self._diff_cache)
+        return text
 
     @property
     def _has_meaningful_diff(self) -> bool:
diff --git a/openhands-tools/openhands/tools/task_tracker/definition.py b/openhands-tools/openhands/tools/task_tracker/definition.py
index c3eacd3ade..7b9fc66528 100644
--- a/openhands-tools/openhands/tools/task_tracker/definition.py
+++ b/openhands-tools/openhands/tools/task_tracker/definition.py
@@ -81,7 +81,11 @@ class TaskTrackerObservation(Observation):
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation with task list formatting."""
-        content = Text()
+        text = Text()
+
+        if self.is_error:
+            text.append("❌ ", style="red bold")
+            text.append(self.error_message_header, style="bold red")
 
         if self.task_list:
             # Count tasks by status
@@ -93,11 +97,11 @@ def visualize(self) -> Text:
 
             # Show status summary
             if self.command == "plan":
-                content.append("✅ ", style="green")
-                content.append("Task list updated: ", style="green")
+                text.append("✅ ", style="green")
+                text.append("Task list updated: ", style="green")
             else:  # view command
-                content.append("📋 ", style="blue")
-                content.append("Current task list: ", style="blue")
+                text.append("📋 ", style="blue")
+                text.append("Current task list: ", style="blue")
 
             # Status counts
             status_parts = []
@@ -109,33 +113,33 @@ def visualize(self) -> Text:
                 status_parts.append(f"{done_count} done")
 
             if status_parts:
-                content.append(", ".join(status_parts), style="white")
-                content.append("\n\n")
+                text.append(", ".join(status_parts), style="white")
+                text.append("\n\n")
 
             # Show the actual task list
             for i, task in enumerate(self.task_list, 1):
                 # Status icon
                 if task.status == "done":
-                    content.append("✅ ", style="green")
+                    text.append("✅ ", style="green")
                 elif task.status == "in_progress":
-                    content.append("🔄 ", style="yellow")
+                    text.append("🔄 ", style="yellow")
                 else:  # todo
-                    content.append("⏳ ", style="blue")
+                    text.append("⏳ ", style="blue")
 
                 # Task title
-                content.append(f"{i}. {task.title}", style="white")
+                text.append(f"{i}. {task.title}", style="white")
 
                 # NEW: show notes under the title if present
                 if task.notes:
-                    content.append("\n   Notes: " + task.notes, style="italic")
+                    text.append("\n   Notes: " + task.notes, style="italic")
 
                 if i < len(self.task_list):
-                    content.append("\n")
+                    text.append("\n")
         else:
-            content.append("📝 ", style="blue")
-            content.append("Task list is empty")
+            text.append("📝 ", style="blue")
+            text.append("Task list is empty")
 
-        return content
+        return text
 
 
 class TaskTrackerExecutor(ToolExecutor[TaskTrackerAction, TaskTrackerObservation]):

From 5ba09adeadea41beeeeae078adc94fdea47366a6 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 5 Nov 2025 10:19:29 +0000
Subject: [PATCH 54/76] refactor: apply standardized visualize pattern to
 MCPToolObservation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Renamed 'content' variable to 'text'
- Added consistent error prefix pattern with ❌ emoji and error_message_header
- Removed duplicate error message display (now handled by error prefix)
- Ensures consistent error display with other observation types

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-sdk/openhands/sdk/mcp/definition.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/mcp/definition.py b/openhands-sdk/openhands/sdk/mcp/definition.py
index 2baa1aa9ea..b1201151b9 100644
--- a/openhands-sdk/openhands/sdk/mcp/definition.py
+++ b/openhands-sdk/openhands/sdk/mcp/definition.py
@@ -89,19 +89,22 @@ def from_call_tool_result(
     @property
     def visualize(self) -> Text:
         """Return Rich Text representation of this observation."""
-        content = Text()
-        content.append(f"[MCP Tool '{self.tool_name}' Observation]\n", style="bold")
+        text = Text()
+
         if self.is_error:
-            content.append("[Error during execution]\n", style="bold red")
+            text.append("❌ ", style="red bold")
+            text.append(self.error_message_header, style="bold red")
+
+        text.append(f"[MCP Tool '{self.tool_name}' Observation]\n", style="bold")
         for block in self.content:
             if isinstance(block, TextContent):
                 # try to see if block.text is a JSON
                 try:
                     parsed = json.loads(block.text)
-                    content.append(display_dict(parsed))
+                    text.append(display_dict(parsed))
                     continue
                 except (json.JSONDecodeError, TypeError):
-                    content.append(block.text + "\n")
+                    text.append(block.text + "\n")
             elif isinstance(block, ImageContent):
-                content.append(f"[Image with {len(block.image_urls)} URLs]\n")
-        return content
+                text.append(f"[Image with {len(block.image_urls)} URLs]\n")
+        return text

From b7ca7a2ddd62523629ffa5b1f9fbf90805e61e29 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Wed, 5 Nov 2025 11:34:38 +0100
Subject: [PATCH 55/76] improve readability

---
 openhands-sdk/openhands/sdk/mcp/definition.py | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/mcp/definition.py b/openhands-sdk/openhands/sdk/mcp/definition.py
index b1201151b9..81af345a86 100644
--- a/openhands-sdk/openhands/sdk/mcp/definition.py
+++ b/openhands-sdk/openhands/sdk/mcp/definition.py
@@ -57,14 +57,16 @@ def from_call_tool_result(
         cls, tool_name: str, result: mcp.types.CallToolResult
     ) -> "MCPToolObservation":
         """Create an MCPToolObservation from a CallToolResult."""
-        content: list[mcp.types.ContentBlock] = result.content
-        converted_content: list[TextContent | ImageContent] = []
 
-        for block in content:
+        native_content: list[mcp.types.ContentBlock] = result.content
+        content: list[TextContent | ImageContent] = [
+            TextContent(text=f"[Tool '{tool_name}' executed.]")
+        ]
+        for block in native_content:
             if isinstance(block, mcp.types.TextContent):
-                converted_content.append(TextContent(text=block.text))
+                content.append(TextContent(text=block.text))
             elif isinstance(block, mcp.types.ImageContent):
-                converted_content.append(
+                content.append(
                     ImageContent(
                         image_urls=[f"data:{block.mimeType};base64,{block.data}"],
                     )
@@ -74,14 +76,8 @@ def from_call_tool_result(
                     f"Unsupported MCP content block type: {type(block)}. Ignoring."
                 )
 
-        # Build initial message
-        initial_message = f"[Tool '{tool_name}' executed.]"
-
-        # Prepend initial message to content
-        content_with_header = [TextContent(text=initial_message)] + converted_content
-
         return cls(
-            content=content_with_header,
+            content=content,
             is_error=result.isError,
             tool_name=tool_name,
         )

From 98cce063c0e0ddd024a0304db62a4891814bd27e Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Wed, 5 Nov 2025 11:40:55 +0100
Subject: [PATCH 56/76] update doc

---
 openhands-sdk/openhands/sdk/tool/schema.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index 69967ec8b3..c1c94b5001 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -206,7 +206,9 @@ class Observation(Schema, ABC):
     )
     error_message_header: str = Field(
         default="Tool Execution Error. ",
-        description="Header prepended to content when is_error is True",
+        description=(
+            "Header prepended to content and visualization when is_error is True"
+        ),
     )
 
     @classmethod

From 8d65a9bcbea71b468ca129afe73f4bfd467ef40b Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Wed, 5 Nov 2025 11:52:41 +0100
Subject: [PATCH 57/76] update description

---
 openhands-tools/openhands/tools/execute_bash/definition.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index df7d4cf001..d906f822c0 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -82,7 +82,7 @@ class ExecuteBashObservation(Observation):
     """A ToolResult that can be rendered as a CLI output."""
 
     command: str | None = Field(
-        default=None, description="The command that was executed"
+        description="The bash command that was executed. Can be empty string if the observation is from a previous command that hit soft timeout and is not yet finished.",  # noqa
     )
     exit_code: int | None = Field(
         default=None,

From f24ab0a9e4f7c6a86cea8bb742bfb920fc72a65a Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 5 Nov 2025 10:53:38 +0000
Subject: [PATCH 58/76] refactor: use from_text for ExecuteBashObservation
 initialization

- Replaced direct content=[TextContent(text=...)] initialization with from_text
- Updated terminal_session.py: all 4 observation returns now use from_text
- Updated impl.py: secrets masking now uses from_text for consistency
- Removed unused TextContent import from terminal_session.py
- Ensures consistent observation creation across execute_bash tool

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../openhands/tools/execute_bash/impl.py        |  4 +---
 .../execute_bash/terminal/terminal_session.py   | 17 ++++++++---------
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/openhands-tools/openhands/tools/execute_bash/impl.py b/openhands-tools/openhands/tools/execute_bash/impl.py
index 69580e5be3..3bd688c641 100644
--- a/openhands-tools/openhands/tools/execute_bash/impl.py
+++ b/openhands-tools/openhands/tools/execute_bash/impl.py
@@ -187,9 +187,7 @@ def __call__(
                 masked_content = secret_registry.mask_secrets_in_output(content_text)
                 if masked_content:
                     data = observation.model_dump(exclude={"content"})
-                    return ExecuteBashObservation(
-                        **data, content=[TextContent(text=masked_content)]
-                    )
+                    return ExecuteBashObservation.from_text(text=masked_content, **data)
             except Exception:
                 pass
 
diff --git a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
index 06dae55159..01ccdcbb9b 100644
--- a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
+++ b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
@@ -4,7 +4,6 @@
 import time
 from enum import Enum
 
-from openhands.sdk.llm import TextContent
 from openhands.sdk.logger import get_logger
 from openhands.tools.execute_bash.constants import (
     CMD_OUTPUT_PS1_END,
@@ -187,9 +186,9 @@ def _handle_completed_command(
         self.prev_status = TerminalCommandStatus.COMPLETED
         self.prev_output = ""  # Reset previous command output
         self._ready_for_next_command()
-        return ExecuteBashObservation(
+        return ExecuteBashObservation.from_text(
             command=command,
-            content=[TextContent(text=command_output)],
+            text=command_output,
             metadata=metadata,
         )
 
@@ -221,9 +220,9 @@ def _handle_nochange_timeout_command(
             metadata,
             continue_prefix="[Below is the output of the previous command.]\n",
         )
-        return ExecuteBashObservation(
+        return ExecuteBashObservation.from_text(
             command=command,
-            content=[TextContent(text=command_output)],
+            text=command_output,
             metadata=metadata,
         )
 
@@ -256,9 +255,9 @@ def _handle_hard_timeout_command(
             metadata,
             continue_prefix="[Below is the output of the previous command.]\n",
         )
-        return ExecuteBashObservation(
+        return ExecuteBashObservation.from_text(
             command=command,
-            content=[TextContent(text=command_output)],
+            text=command_output,
             metadata=metadata,
         )
 
@@ -389,9 +388,9 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
                 metadata,
                 continue_prefix="[Below is the output of the previous command.]\n",
             )
-            obs = ExecuteBashObservation(
+            obs = ExecuteBashObservation.from_text(
                 command=command,
-                content=[TextContent(text=command_output)],
+                text=command_output,
                 metadata=metadata,
             )
             logger.debug(f"RETURNING OBSERVATION (previous-command): {obs}")

From 49e883242e24046eb8e0ab3e45253c3c532c3d13 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 5 Nov 2025 10:57:36 +0000
Subject: [PATCH 59/76] fix: add required command parameter to
 ExecuteBashObservation tests

- Added command parameter to all ExecuteBashObservation instantiations in tests
- Fixes pyright type checking errors (reportCallIssue)
- Tests still pass with the updated schema

Co-authored-by: openhands <openhands@all-hands.dev>
---
 tests/tools/execute_bash/test_observation_truncation.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/tools/execute_bash/test_observation_truncation.py b/tests/tools/execute_bash/test_observation_truncation.py
index 5137d65a44..97c1bc1366 100644
--- a/tests/tools/execute_bash/test_observation_truncation.py
+++ b/tests/tools/execute_bash/test_observation_truncation.py
@@ -18,6 +18,7 @@ def test_execute_bash_observation_truncation_under_limit():
     )
 
     observation = ExecuteBashObservation(
+        command="echo test",
         content=[TextContent(text="Short output")],
         metadata=metadata,
     )
@@ -51,6 +52,7 @@ def test_execute_bash_observation_truncation_over_limit():
     long_output = "A" * (MAX_CMD_OUTPUT_SIZE + 1000)
 
     observation = ExecuteBashObservation(
+        command="echo test",
         content=[TextContent(text=long_output)],
         metadata=metadata,
     )
@@ -84,6 +86,7 @@ def test_execute_bash_observation_truncation_with_error():
     )
 
     observation = ExecuteBashObservation(
+        command="false",
         content=[TextContent(text="Command failed")],
         metadata=metadata,
         is_error=True,
@@ -126,6 +129,7 @@ def test_execute_bash_observation_truncation_exact_limit():
     exact_output = "C" * exact_output_size
 
     observation = ExecuteBashObservation(
+        command="echo test",
         content=[TextContent(text=exact_output)],
         metadata=metadata,
     )
@@ -155,6 +159,7 @@ def test_execute_bash_observation_truncation_with_prefix_suffix():
     long_output = "D" * (MAX_CMD_OUTPUT_SIZE + 200)
 
     observation = ExecuteBashObservation(
+        command="echo test",
         content=[TextContent(text=long_output)],
         metadata=metadata,
     )

From fd182036d2e2309a3714c724ae32cc007cfdae07 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Wed, 5 Nov 2025 12:14:18 +0100
Subject: [PATCH 60/76] add is_error = True

---
 .../openhands/tools/execute_bash/terminal/terminal_session.py    | 1 +
 1 file changed, 1 insertion(+)

diff --git a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
index 01ccdcbb9b..35042d35ef 100644
--- a/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
+++ b/openhands-tools/openhands/tools/execute_bash/terminal/terminal_session.py
@@ -392,6 +392,7 @@ def execute(self, action: ExecuteBashAction) -> ExecuteBashObservation:
                 command=command,
                 text=command_output,
                 metadata=metadata,
+                is_error=True,
             )
             logger.debug(f"RETURNING OBSERVATION (previous-command): {obs}")
             return obs

From 84da675cdbcaa5def460d14c791c029e59af3022 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 5 Nov 2025 11:23:05 +0000
Subject: [PATCH 61/76] refactor: simplify content extraction in execute_bash
 impl

- Removed legacy str/list type handling since content is always a list
- Simplified reset_text and command_text extraction logic
- Tests pass with the streamlined implementation

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../openhands/tools/execute_bash/impl.py      | 34 ++++++++-----------
 1 file changed, 14 insertions(+), 20 deletions(-)

diff --git a/openhands-tools/openhands/tools/execute_bash/impl.py b/openhands-tools/openhands/tools/execute_bash/impl.py
index 3bd688c641..7178f79f97 100644
--- a/openhands-tools/openhands/tools/execute_bash/impl.py
+++ b/openhands-tools/openhands/tools/execute_bash/impl.py
@@ -143,26 +143,20 @@ def __call__(
                 self._export_envs(command_action, conversation)
                 command_result = self.session.execute(command_action)
 
-                # Extract text from content (handle both str and list types)
-                if isinstance(reset_result.content, str):
-                    reset_text = reset_result.content
-                else:
-                    reset_text = (
-                        reset_result.content[0].text
-                        if reset_result.content
-                        and isinstance(reset_result.content[0], TextContent)
-                        else ""
-                    )
-
-                if isinstance(command_result.content, str):
-                    command_text = command_result.content
-                else:
-                    command_text = (
-                        command_result.content[0].text
-                        if command_result.content
-                        and isinstance(command_result.content[0], TextContent)
-                        else ""
-                    )
+                # Extract text from content
+                reset_text = (
+                    reset_result.content[0].text
+                    if reset_result.content
+                    and isinstance(reset_result.content[0], TextContent)
+                    else ""
+                )
+
+                command_text = (
+                    command_result.content[0].text
+                    if command_result.content
+                    and isinstance(command_result.content[0], TextContent)
+                    else ""
+                )
 
                 observation = command_result.model_copy(
                     update={

From 19e5a7bd2ddcdbc4b4c7b7b596adf1a01e440973 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 5 Nov 2025 11:27:15 +0000
Subject: [PATCH 62/76] refactor: use get_text() method for cleaner content
 extraction

- Replaced manual content[0].text extraction with get_text() method
- Removed unused TextContent import
- Cleaner and more maintainable code using existing API

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .../openhands/tools/execute_bash/impl.py         | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/openhands-tools/openhands/tools/execute_bash/impl.py b/openhands-tools/openhands/tools/execute_bash/impl.py
index 7178f79f97..6b868c876d 100644
--- a/openhands-tools/openhands/tools/execute_bash/impl.py
+++ b/openhands-tools/openhands/tools/execute_bash/impl.py
@@ -3,7 +3,6 @@
 
 from openhands.sdk.logger import get_logger
 from openhands.sdk.tool import ToolExecutor
-from openhands.sdk.tool.schema import TextContent
 
 
 if TYPE_CHECKING:
@@ -144,19 +143,8 @@ def __call__(
                 command_result = self.session.execute(command_action)
 
                 # Extract text from content
-                reset_text = (
-                    reset_result.content[0].text
-                    if reset_result.content
-                    and isinstance(reset_result.content[0], TextContent)
-                    else ""
-                )
-
-                command_text = (
-                    command_result.content[0].text
-                    if command_result.content
-                    and isinstance(command_result.content[0], TextContent)
-                    else ""
-                )
+                reset_text = reset_result.get_text()
+                command_text = command_result.get_text()
 
                 observation = command_result.model_copy(
                     update={

From 8c3753c2e11df33d1688b88369c3a3c7904515e0 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 5 Nov 2025 12:04:40 +0000
Subject: [PATCH 63/76] fix: correctly set content field as list of TextContent
 in ExecuteBash reset operation

Fixed critical type violation where content field was incorrectly set to a
string instead of a list of TextContent objects during reset+command operations.
This caused observation.get_text() to return empty string and broke the type
contract defined in the base Observation class.

Also removed test workaround that was masking this bug in conftest.py.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 openhands-tools/openhands/tools/execute_bash/impl.py | 5 ++++-
 tests/tools/execute_bash/conftest.py                 | 4 +---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/openhands-tools/openhands/tools/execute_bash/impl.py b/openhands-tools/openhands/tools/execute_bash/impl.py
index 6b868c876d..3477bdf61c 100644
--- a/openhands-tools/openhands/tools/execute_bash/impl.py
+++ b/openhands-tools/openhands/tools/execute_bash/impl.py
@@ -1,6 +1,7 @@
 import json
 from typing import TYPE_CHECKING, Literal
 
+from openhands.sdk.llm import TextContent
 from openhands.sdk.logger import get_logger
 from openhands.sdk.tool import ToolExecutor
 
@@ -148,7 +149,9 @@ def __call__(
 
                 observation = command_result.model_copy(
                     update={
-                        "content": f"{reset_text}\n\n{command_text}",
+                        "content": [
+                            TextContent(text=f"{reset_text}\n\n{command_text}")
+                        ],
                         "command": f"[RESET] {action.command}",
                     }
                 )
diff --git a/tests/tools/execute_bash/conftest.py b/tests/tools/execute_bash/conftest.py
index 743eca4da9..f6c08f2907 100644
--- a/tests/tools/execute_bash/conftest.py
+++ b/tests/tools/execute_bash/conftest.py
@@ -16,10 +16,8 @@ def get_output_text(obs: ExecuteBashObservation) -> str:
     """Extract text from observation content field.
 
     This helper handles type-safe extraction of text from the observation's
-    content field, which can be a str or list of Content items.
+    content field, which is a list of Content items.
     """
-    if isinstance(obs.content, str):
-        return obs.content
     if not obs.content:
         return ""
     first_item = obs.content[0]

From e05c35d01ac346e9b1bcc0d7f35e0187f82e9469 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 5 Nov 2025 17:55:57 +0000
Subject: [PATCH 64/76] refactor: make ERROR_MESSAGE_HEADER a class variable
 and rename get_text() to text property

- Changed error_message_header from instance field to ERROR_MESSAGE_HEADER class variable
  This allows child classes to easily override the error message header
- Changed get_text() method to text property for cleaner API
- Updated all references throughout the codebase to use the new names
- Updated all Observation subclasses to use self.ERROR_MESSAGE_HEADER
- Updated all test files to use .text instead of .get_text()

Co-authored-by: openhands <openhands@all-hands.dev>
---
 examples/01_standalone_sdk/02_custom_tools.py |  2 +-
 openhands-sdk/openhands/sdk/mcp/definition.py |  2 +-
 openhands-sdk/openhands/sdk/tool/schema.py    | 15 +++++-------
 .../openhands/tools/browser_use/definition.py |  4 ++--
 .../tools/execute_bash/definition.py          |  8 +++----
 .../openhands/tools/execute_bash/impl.py      |  6 ++---
 .../openhands/tools/file_editor/definition.py |  2 +-
 .../tools/task_tracker/definition.py          |  2 +-
 tests/cross/test_agent_secrets_integration.py | 24 +++++++++----------
 .../local/test_confirmation_mode.py           |  6 ++---
 tests/sdk/mcp/test_mcp_tool.py                |  2 +-
 tests/tools/browser_use/conftest.py           |  2 +-
 .../browser_use/test_browser_executor.py      |  2 +-
 .../browser_use/test_browser_observation.py   |  6 ++---
 tests/tools/delegation/test_delegation.py     | 12 +++++-----
 tests/tools/execute_bash/test_bash_session.py |  4 ++--
 tests/tools/glob/test_glob_executor.py        |  2 +-
 tests/tools/glob/test_glob_tool.py            |  2 +-
 tests/tools/grep/test_grep_executor.py        |  4 ++--
 tests/tools/grep/test_grep_tool.py            |  4 ++--
 20 files changed, 54 insertions(+), 57 deletions(-)

diff --git a/examples/01_standalone_sdk/02_custom_tools.py b/examples/01_standalone_sdk/02_custom_tools.py
index 2b75edd295..cb30ed0443 100644
--- a/examples/01_standalone_sdk/02_custom_tools.py
+++ b/examples/01_standalone_sdk/02_custom_tools.py
@@ -92,7 +92,7 @@ def __call__(self, action: GrepAction, conversation=None) -> GrepObservation:  #
         files: set[str] = set()
 
         # grep returns exit code 1 when no matches; treat as empty
-        output_text = result.get_text()
+        output_text = result.text
 
         if output_text.strip():
             for line in output_text.strip().splitlines():
diff --git a/openhands-sdk/openhands/sdk/mcp/definition.py b/openhands-sdk/openhands/sdk/mcp/definition.py
index 81af345a86..771729e11d 100644
--- a/openhands-sdk/openhands/sdk/mcp/definition.py
+++ b/openhands-sdk/openhands/sdk/mcp/definition.py
@@ -89,7 +89,7 @@ def visualize(self) -> Text:
 
         if self.is_error:
             text.append("❌ ", style="red bold")
-            text.append(self.error_message_header, style="bold red")
+            text.append(self.ERROR_MESSAGE_HEADER, style="bold red")
 
         text.append(f"[MCP Tool '{self.tool_name}' Observation]\n", style="bold")
         for block in self.content:
diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index c1c94b5001..50a3dee15f 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -193,6 +193,8 @@ def visualize(self) -> Text:
 class Observation(Schema, ABC):
     """Base schema for output observation."""
 
+    ERROR_MESSAGE_HEADER: ClassVar[str] = "Tool Execution Error. "
+
     content: list[TextContent | ImageContent] = Field(
         default_factory=list,
         description=(
@@ -204,12 +206,6 @@ class Observation(Schema, ABC):
     is_error: bool = Field(
         default=False, description="Whether the observation indicates an error"
     )
-    error_message_header: str = Field(
-        default="Tool Execution Error. ",
-        description=(
-            "Header prepended to content and visualization when is_error is True"
-        ),
-    )
 
     @classmethod
     def from_text(
@@ -230,7 +226,8 @@ def from_text(
         """
         return cls(content=[TextContent(text=text)], is_error=is_error, **kwargs)
 
-    def get_text(self) -> str:
+    @property
+    def text(self) -> str:
         """Extract all text content from the observation.
 
         Returns:
@@ -250,7 +247,7 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
 
         # If is_error is true, prepend error message
         if self.is_error:
-            llm_content.append(TextContent(text=self.error_message_header))
+            llm_content.append(TextContent(text=self.ERROR_MESSAGE_HEADER))
 
         # Add content (now always a list)
         llm_content.extend(self.content)
@@ -268,7 +265,7 @@ def visualize(self) -> Text:
 
         if self.is_error:
             text.append("❌ ", style="red bold")
-            text.append(self.error_message_header, style="bold red")
+            text.append(self.ERROR_MESSAGE_HEADER, style="bold red")
 
         text_parts = content_to_str(self.to_llm_content)
         if text_parts:
diff --git a/openhands-tools/openhands/tools/browser_use/definition.py b/openhands-tools/openhands/tools/browser_use/definition.py
index 9396a580ca..6a02026b2b 100644
--- a/openhands-tools/openhands/tools/browser_use/definition.py
+++ b/openhands-tools/openhands/tools/browser_use/definition.py
@@ -38,10 +38,10 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
 
         # If is_error is true, prepend error message
         if self.is_error:
-            llm_content.append(TextContent(text=self.error_message_header))
+            llm_content.append(TextContent(text=self.ERROR_MESSAGE_HEADER))
 
         # Get text content and truncate if needed
-        content_text = self.get_text()
+        content_text = self.text
         if content_text:
             llm_content.append(
                 TextContent(text=maybe_truncate(content_text, MAX_BROWSER_OUTPUT_SIZE))
diff --git a/openhands-tools/openhands/tools/execute_bash/definition.py b/openhands-tools/openhands/tools/execute_bash/definition.py
index d906f822c0..32c118691b 100644
--- a/openhands-tools/openhands/tools/execute_bash/definition.py
+++ b/openhands-tools/openhands/tools/execute_bash/definition.py
@@ -107,10 +107,10 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
 
         # If is_error is true, prepend error message
         if self.is_error:
-            llm_content.append(TextContent(text=self.error_message_header))
+            llm_content.append(TextContent(text=self.ERROR_MESSAGE_HEADER))
 
         # ExecuteBashObservation always has content as a single TextContent
-        content_text = self.get_text()
+        content_text = self.text
 
         ret = f"{self.metadata.prefix}{content_text}{self.metadata.suffix}"
         if self.metadata.working_dir:
@@ -130,10 +130,10 @@ def visualize(self) -> Text:
 
         if self.is_error:
             text.append("❌ ", style="red bold")
-            text.append(self.error_message_header, style="bold red")
+            text.append(self.ERROR_MESSAGE_HEADER, style="bold red")
 
         # ExecuteBashObservation always has content as a single TextContent
-        content_text = self.get_text()
+        content_text = self.text
 
         if content_text:
             # Style the output based on content
diff --git a/openhands-tools/openhands/tools/execute_bash/impl.py b/openhands-tools/openhands/tools/execute_bash/impl.py
index 3477bdf61c..79c39d45f8 100644
--- a/openhands-tools/openhands/tools/execute_bash/impl.py
+++ b/openhands-tools/openhands/tools/execute_bash/impl.py
@@ -144,8 +144,8 @@ def __call__(
                 command_result = self.session.execute(command_action)
 
                 # Extract text from content
-                reset_text = reset_result.get_text()
-                command_text = command_result.get_text()
+                reset_text = reset_result.text
+                command_text = command_result.text
 
                 observation = command_result.model_copy(
                     update={
@@ -164,7 +164,7 @@ def __call__(
             observation = self.session.execute(action)
 
         # Apply automatic secrets masking
-        content_text = observation.get_text()
+        content_text = observation.text
 
         if content_text and conversation is not None:
             try:
diff --git a/openhands-tools/openhands/tools/file_editor/definition.py b/openhands-tools/openhands/tools/file_editor/definition.py
index c2b3a69a60..ba6fdd7656 100644
--- a/openhands-tools/openhands/tools/file_editor/definition.py
+++ b/openhands-tools/openhands/tools/file_editor/definition.py
@@ -99,7 +99,7 @@ def visualize(self) -> Text:
 
         if self.is_error:
             text.append("❌ ", style="red bold")
-            text.append(self.error_message_header, style="bold red")
+            text.append(self.ERROR_MESSAGE_HEADER, style="bold red")
 
         if not self._has_meaningful_diff:
             return super().visualize
diff --git a/openhands-tools/openhands/tools/task_tracker/definition.py b/openhands-tools/openhands/tools/task_tracker/definition.py
index 7b9fc66528..3829fd9924 100644
--- a/openhands-tools/openhands/tools/task_tracker/definition.py
+++ b/openhands-tools/openhands/tools/task_tracker/definition.py
@@ -85,7 +85,7 @@ def visualize(self) -> Text:
 
         if self.is_error:
             text.append("❌ ", style="red bold")
-            text.append(self.error_message_header, style="bold red")
+            text.append(self.ERROR_MESSAGE_HEADER, style="bold red")
 
         if self.task_list:
             # Count tasks by status
diff --git a/tests/cross/test_agent_secrets_integration.py b/tests/cross/test_agent_secrets_integration.py
index cb1cf27b3d..c6c3894b30 100644
--- a/tests/cross/test_agent_secrets_integration.py
+++ b/tests/cross/test_agent_secrets_integration.py
@@ -234,13 +234,13 @@ def get_value(self):
     try:
         action = ExecuteBashAction(command="echo $API_KEY")
         result = bash_executor(action, conversation=conversation)
-        assert "test-api-key" not in result.get_text()
-        assert "<secret-hidden>" in result.get_text()
+        assert "test-api-key" not in result.text
+        assert "<secret-hidden>" in result.text
 
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "dynamic-secret" not in result.get_text()
-        assert "<secret-hidden>" in result.get_text()
+        assert "dynamic-secret" not in result.text
+        assert "<secret-hidden>" in result.text
 
     finally:
         bash_executor.close()
@@ -265,13 +265,13 @@ def get_value(self):
     try:
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "changing-secret" not in result.get_text()
-        assert "<secret-hidden>" in result.get_text()
+        assert "changing-secret" not in result.text
+        assert "<secret-hidden>" in result.text
 
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "changing-secret" not in result.get_text()
-        assert "<secret-hidden>" in result.get_text()
+        assert "changing-secret" not in result.text
+        assert "<secret-hidden>" in result.text
 
     finally:
         bash_executor.close()
@@ -303,13 +303,13 @@ def get_value(self):
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
         print(result)
-        assert "changing-secret" not in result.get_text()
-        assert "<secret-hidden>" in result.get_text()
+        assert "changing-secret" not in result.text
+        assert "<secret-hidden>" in result.text
 
         action = ExecuteBashAction(command="echo $DB_PASSWORD")
         result = bash_executor(action, conversation=conversation)
-        assert "changing-secret" not in result.get_text()
-        assert "<secret-hidden>" in result.get_text()
+        assert "changing-secret" not in result.text
+        assert "<secret-hidden>" in result.text
         assert dynamic_secret.raised_on_second
 
     finally:
diff --git a/tests/sdk/conversation/local/test_confirmation_mode.py b/tests/sdk/conversation/local/test_confirmation_mode.py
index 343ee87524..0176aee01d 100644
--- a/tests/sdk/conversation/local/test_confirmation_mode.py
+++ b/tests/sdk/conversation/local/test_confirmation_mode.py
@@ -553,7 +553,7 @@ def test_single_finish_action_skips_confirmation_entirely(self):
         ]
         assert len(obs_events) == 1
         # FinishObservation should contain the finish message in content
-        assert obs_events[0].observation.get_text() == "Task completed successfully!"
+        assert obs_events[0].observation.text == "Task completed successfully!"
 
     def test_think_and_finish_action_skips_confirmation_entirely(self):
         """First step: ThinkAction (skips confirmation). Second step: FinishAction."""
@@ -597,11 +597,11 @@ def test_think_and_finish_action_skips_confirmation_entirely(self):
 
         # 1) ThinkAction observation - should contain the standard message
         assert hasattr(obs_events[0].observation, "content")
-        assert obs_events[0].observation.get_text() == "Your thought has been logged."
+        assert obs_events[0].observation.text == "Your thought has been logged."
 
         # 2) FinishAction observation - should contain the finish message
         assert hasattr(obs_events[1].observation, "content")
-        assert obs_events[1].observation.get_text() == "Analysis complete"
+        assert obs_events[1].observation.text == "Analysis complete"
 
     def test_pause_during_confirmation_preserves_waiting_status(self):
         """Test that pausing during WAITING_FOR_CONFIRMATION preserves the status.
diff --git a/tests/sdk/mcp/test_mcp_tool.py b/tests/sdk/mcp/test_mcp_tool.py
index 0ca70c41aa..927d8a5fd4 100644
--- a/tests/sdk/mcp/test_mcp_tool.py
+++ b/tests/sdk/mcp/test_mcp_tool.py
@@ -216,7 +216,7 @@ def mock_call_async_from_sync(coro_func, **kwargs):
         assert observation.tool_name == "test_tool"
         assert observation.is_error is True
         assert observation.is_error is True
-        assert "Connection failed" in observation.get_text()
+        assert "Connection failed" in observation.text
 
 
 class TestMCPTool:
diff --git a/tests/tools/browser_use/conftest.py b/tests/tools/browser_use/conftest.py
index eebd394bc9..d30f51faf6 100644
--- a/tests/tools/browser_use/conftest.py
+++ b/tests/tools/browser_use/conftest.py
@@ -61,4 +61,4 @@ def assert_browser_observation_error(
     assert isinstance(observation, BrowserObservation)
     assert observation.is_error is True
     if expected_error:
-        assert expected_error in observation.get_text()
+        assert expected_error in observation.text
diff --git a/tests/tools/browser_use/test_browser_executor.py b/tests/tools/browser_use/test_browser_executor.py
index c83463cab6..25377b26da 100644
--- a/tests/tools/browser_use/test_browser_executor.py
+++ b/tests/tools/browser_use/test_browser_executor.py
@@ -105,7 +105,7 @@ async def test_browser_executor_error_wrapping(mock_navigate, mock_browser_execu
     result = await mock_browser_executor._execute_action(action)
 
     assert_browser_observation_error(result, "Browser operation failed")
-    assert "Browser error occurred" in result.get_text()
+    assert "Browser error occurred" in result.text
 
 
 def test_browser_executor_async_execution(mock_browser_executor):
diff --git a/tests/tools/browser_use/test_browser_observation.py b/tests/tools/browser_use/test_browser_observation.py
index 3174feecbd..bfd5fdd66a 100644
--- a/tests/tools/browser_use/test_browser_observation.py
+++ b/tests/tools/browser_use/test_browser_observation.py
@@ -8,7 +8,7 @@ def test_browser_observation_basic_output():
     """Test basic BrowserObservation creation with output."""
     observation = BrowserObservation.from_text(text="Test output")
 
-    assert observation.get_text() == "Test output"
+    assert observation.text == "Test output"
     assert observation.is_error is False
     assert observation.screenshot_data is None
 
@@ -17,7 +17,7 @@ def test_browser_observation_with_error():
     """Test BrowserObservation with error."""
     observation = BrowserObservation.from_text(text="Test error", is_error=True)
 
-    assert observation.get_text() == "Test error"
+    assert observation.text == "Test error"
     assert observation.is_error is True
     assert observation.screenshot_data is None
 
@@ -29,7 +29,7 @@ def test_browser_observation_with_screenshot():
         text="Screenshot taken", screenshot_data=screenshot_data
     )
 
-    assert observation.get_text() == "Screenshot taken"
+    assert observation.text == "Screenshot taken"
     assert observation.is_error is False
     assert observation.screenshot_data == screenshot_data
 
diff --git a/tests/tools/delegation/test_delegation.py b/tests/tools/delegation/test_delegation.py
index 85f19144fa..b8e9d969f3 100644
--- a/tests/tools/delegation/test_delegation.py
+++ b/tests/tools/delegation/test_delegation.py
@@ -71,7 +71,7 @@ def test_delegate_observation_creation():
         command="spawn",
     )
     assert isinstance(spawn_observation.content, list)
-    assert spawn_observation.get_text() == "spawn: Sub-agents created successfully"
+    assert spawn_observation.text == "spawn: Sub-agents created successfully"
     # Verify to_llm_content returns TextContent
     llm_content = spawn_observation.to_llm_content
     assert len(llm_content) == 1
@@ -87,9 +87,9 @@ def test_delegate_observation_creation():
         command="delegate",
     )
     assert isinstance(delegate_observation.content, list)
-    assert "Tasks completed successfully" in delegate_observation.get_text()
-    assert "Result 1" in delegate_observation.get_text()
-    assert "Result 2" in delegate_observation.get_text()
+    assert "Tasks completed successfully" in delegate_observation.text
+    assert "Result 1" in delegate_observation.text
+    assert "Result 2" in delegate_observation.text
     # Verify to_llm_content
     llm_content = delegate_observation.to_llm_content
     assert len(llm_content) == 1
@@ -105,7 +105,7 @@ def test_delegate_executor_delegate():
     spawn_action = DelegateAction(command="spawn", ids=["agent1", "agent2"])
     spawn_observation = executor(spawn_action, parent_conversation)
     assert isinstance(spawn_observation.content, list)
-    assert "Successfully spawned" in spawn_observation.get_text()
+    assert "Successfully spawned" in spawn_observation.text
 
     # Then delegate tasks to them
     delegate_action = DelegateAction(
@@ -128,7 +128,7 @@ def test_delegate_executor_delegate():
 
     assert isinstance(observation, DelegateObservation)
     assert isinstance(observation.content, list)
-    text_content = observation.get_text()
+    text_content = observation.text
     assert "Agent agent1: Code analysis complete" in text_content
     assert "Agent agent2: Tests written" in text_content
 
diff --git a/tests/tools/execute_bash/test_bash_session.py b/tests/tools/execute_bash/test_bash_session.py
index 5d96d70f90..a95cde2f33 100644
--- a/tests/tools/execute_bash/test_bash_session.py
+++ b/tests/tools/execute_bash/test_bash_session.py
@@ -320,7 +320,7 @@ def test_empty_command_error(terminal_type):
     obs = session.execute(ExecuteBashAction(command=""))
 
     assert obs.is_error is True
-    assert obs.get_text() == "No previous running command to retrieve logs from."
+    assert obs.text == "No previous running command to retrieve logs from."
     assert len(obs.to_llm_content) == 2
     assert isinstance(obs.to_llm_content[0], TextContent)
     assert obs.to_llm_content[0].text == "Tool Execution Error. "
@@ -720,7 +720,7 @@ def test_multiple_multiline_commands(terminal_type):
             # First test that running multiple commands at once fails
             obs = _run_bash_action(session, joined_cmds)
             assert obs.is_error is True
-            assert "Cannot execute multiple commands at once" in obs.get_text()
+            assert "Cannot execute multiple commands at once" in obs.text
 
             # Now run each command individually and verify they work
             results = []
diff --git a/tests/tools/glob/test_glob_executor.py b/tests/tools/glob/test_glob_executor.py
index f4d478b7bc..c0f215372e 100644
--- a/tests/tools/glob/test_glob_executor.py
+++ b/tests/tools/glob/test_glob_executor.py
@@ -84,7 +84,7 @@ def test_glob_executor_invalid_path():
         observation = executor(action)
 
         assert observation.is_error is True
-        assert "is not a valid directory" in observation.get_text()
+        assert "is not a valid directory" in observation.text
         assert len(observation.files) == 0
 
 
diff --git a/tests/tools/glob/test_glob_tool.py b/tests/tools/glob/test_glob_tool.py
index a4fc06ce72..b68bc8d5d5 100644
--- a/tests/tools/glob/test_glob_tool.py
+++ b/tests/tools/glob/test_glob_tool.py
@@ -162,7 +162,7 @@ def test_glob_tool_invalid_directory():
         observation = tool.executor(action)
 
         assert observation.is_error is True
-        assert "is not a valid directory" in observation.get_text()
+        assert "is not a valid directory" in observation.text
         assert len(observation.files) == 0
 
 
diff --git a/tests/tools/grep/test_grep_executor.py b/tests/tools/grep/test_grep_executor.py
index 014df969e4..120f2b1da0 100644
--- a/tests/tools/grep/test_grep_executor.py
+++ b/tests/tools/grep/test_grep_executor.py
@@ -106,7 +106,7 @@ def test_grep_executor_invalid_path():
         observation = executor(action)
 
         assert observation.is_error is True
-        assert "not a valid directory" in observation.get_text()
+        assert "not a valid directory" in observation.text
 
 
 def test_grep_executor_no_matches():
@@ -186,4 +186,4 @@ def test_grep_executor_invalid_regex():
         observation = executor(action)
 
         assert observation.is_error is True
-        assert "Invalid regex pattern" in observation.get_text()
+        assert "Invalid regex pattern" in observation.text
diff --git a/tests/tools/grep/test_grep_tool.py b/tests/tools/grep/test_grep_tool.py
index 2f4f99ccb8..43c8e7b7c9 100644
--- a/tests/tools/grep/test_grep_tool.py
+++ b/tests/tools/grep/test_grep_tool.py
@@ -164,7 +164,7 @@ def test_grep_tool_invalid_regex():
         observation = tool.executor(action)
 
         assert observation.is_error is True
-        assert "Invalid regex pattern" in observation.get_text()
+        assert "Invalid regex pattern" in observation.text
 
 
 def test_grep_tool_invalid_directory():
@@ -179,7 +179,7 @@ def test_grep_tool_invalid_directory():
         observation = tool.executor(action)
 
         assert observation.is_error is True
-        assert "not a valid directory" in observation.get_text()
+        assert "not a valid directory" in observation.text
 
 
 def test_grep_tool_hidden_files_excluded():

From e62771b8fbb17aa704251e9ef617f942b1ad9d1c Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 5 Nov 2025 18:35:43 +0000
Subject: [PATCH 65/76] fix: update test to handle observation.content as list

The base Observation class now always has content as a list of TextContent/ImageContent.
Updated test_real_mcp_tool_execution_without_kind_field to properly extract text from
the content list instead of treating it as a string.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 tests/sdk/mcp/test_mcp_tool_kind_field.py | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

diff --git a/tests/sdk/mcp/test_mcp_tool_kind_field.py b/tests/sdk/mcp/test_mcp_tool_kind_field.py
index 101ecbeeb1..af67777c57 100644
--- a/tests/sdk/mcp/test_mcp_tool_kind_field.py
+++ b/tests/sdk/mcp/test_mcp_tool_kind_field.py
@@ -86,20 +86,14 @@ def test_real_mcp_tool_execution_without_kind_field(fetch_tool):
 
     # Verify we got a valid response (not an error about 'kind')
     # Check output if no error, otherwise check error message
-    if observation.is_error:
-        assert observation.is_error is True
-        content_str = observation.content
-    else:
-        assert observation.content is not None
-        # Extract text from content blocks
-        from openhands.sdk.llm import TextContent
-
-        text_parts = [
-            block.text
-            for block in observation.content
-            if isinstance(block, TextContent)
-        ]
-        content_str = " ".join(text_parts)
+    from openhands.sdk.llm import TextContent
+
+    assert observation.content is not None
+    # Extract text from content blocks (content is always a list now)
+    text_parts = [
+        block.text for block in observation.content if isinstance(block, TextContent)
+    ]
+    content_str = " ".join(text_parts)
 
     # Check that the response doesn't contain validation error about 'kind'
     if "error" in content_str.lower():

From 38256bfcbbbf9f0234fb70a98dad8450361e253d Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Thu, 6 Nov 2025 02:39:40 +0800
Subject: [PATCH 66/76] Apply suggestion from @xingyaoww

---
 openhands-sdk/openhands/sdk/tool/schema.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openhands-sdk/openhands/sdk/tool/schema.py b/openhands-sdk/openhands/sdk/tool/schema.py
index 50a3dee15f..1102df7005 100644
--- a/openhands-sdk/openhands/sdk/tool/schema.py
+++ b/openhands-sdk/openhands/sdk/tool/schema.py
@@ -193,7 +193,7 @@ def visualize(self) -> Text:
 class Observation(Schema, ABC):
     """Base schema for output observation."""
 
-    ERROR_MESSAGE_HEADER: ClassVar[str] = "Tool Execution Error. "
+    ERROR_MESSAGE_HEADER: ClassVar[str] = "[An error occurred during execution.]\n"
 
     content: list[TextContent | ImageContent] = Field(
         default_factory=list,

From 85b1f1535df4bc0ef0106e164b47b1cb17742989 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Thu, 6 Nov 2025 02:48:40 +0800
Subject: [PATCH 67/76] Apply suggestion from @xingyaoww

---
 openhands-tools/openhands/tools/browser_use/definition.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/openhands-tools/openhands/tools/browser_use/definition.py b/openhands-tools/openhands/tools/browser_use/definition.py
index 6a02026b2b..0a72d6edc6 100644
--- a/openhands-tools/openhands/tools/browser_use/definition.py
+++ b/openhands-tools/openhands/tools/browser_use/definition.py
@@ -40,11 +40,9 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.is_error:
             llm_content.append(TextContent(text=self.ERROR_MESSAGE_HEADER))
 
-        # Get text content and truncate if needed
-        content_text = self.text
-        if content_text:
+        if self.text:
             llm_content.append(
-                TextContent(text=maybe_truncate(content_text, MAX_BROWSER_OUTPUT_SIZE))
+                TextContent(text=maybe_truncate(self.text, MAX_BROWSER_OUTPUT_SIZE))
             )
 
         if self.screenshot_data:

From a19e4541fdb82ad3b9cb0c0ebc27b6f58843285a Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Wed, 5 Nov 2025 13:57:05 -0500
Subject: [PATCH 68/76] Revert "Apply suggestion from @xingyaoww"

This reverts commit 85b1f1535df4bc0ef0106e164b47b1cb17742989.
---
 openhands-tools/openhands/tools/browser_use/definition.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/openhands-tools/openhands/tools/browser_use/definition.py b/openhands-tools/openhands/tools/browser_use/definition.py
index 0a72d6edc6..6a02026b2b 100644
--- a/openhands-tools/openhands/tools/browser_use/definition.py
+++ b/openhands-tools/openhands/tools/browser_use/definition.py
@@ -40,9 +40,11 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         if self.is_error:
             llm_content.append(TextContent(text=self.ERROR_MESSAGE_HEADER))
 
-        if self.text:
+        # Get text content and truncate if needed
+        content_text = self.text
+        if content_text:
             llm_content.append(
-                TextContent(text=maybe_truncate(self.text, MAX_BROWSER_OUTPUT_SIZE))
+                TextContent(text=maybe_truncate(content_text, MAX_BROWSER_OUTPUT_SIZE))
             )
 
         if self.screenshot_data:

From 4a564b622e03525db2a50bddd2623bbfbb7c4267 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Wed, 5 Nov 2025 20:03:31 +0100
Subject: [PATCH 69/76] fix tests: compare against ERROR_MESSAGE_HEADER

---
 tests/sdk/mcp/test_mcp_tool.py                      | 2 +-
 tests/tools/browser_use/test_browser_observation.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/sdk/mcp/test_mcp_tool.py b/tests/sdk/mcp/test_mcp_tool.py
index 927d8a5fd4..29af980e7c 100644
--- a/tests/sdk/mcp/test_mcp_tool.py
+++ b/tests/sdk/mcp/test_mcp_tool.py
@@ -123,7 +123,7 @@ def test_to_llm_content_error(self):
         agent_obs = observation.to_llm_content
         assert len(agent_obs) == 2
         assert isinstance(agent_obs[0], TextContent)
-        assert agent_obs[0].text == "Tool Execution Error. "
+        assert agent_obs[0].text == MCPToolObservation.ERROR_MESSAGE_HEADER
         assert isinstance(agent_obs[1], TextContent)
         assert "[Tool 'test_tool' executed.]" in agent_obs[1].text
         assert "[An error occurred during execution.]" in agent_obs[1].text
diff --git a/tests/tools/browser_use/test_browser_observation.py b/tests/tools/browser_use/test_browser_observation.py
index bfd5fdd66a..09992781ee 100644
--- a/tests/tools/browser_use/test_browser_observation.py
+++ b/tests/tools/browser_use/test_browser_observation.py
@@ -68,7 +68,7 @@ def test_browser_observation_to_llm_content_with_error():
 
     assert len(agent_obs) == 2
     assert isinstance(agent_obs[0], TextContent)
-    assert agent_obs[0].text == "Tool Execution Error. "
+    assert agent_obs[0].text == BrowserObservation.ERROR_MESSAGE_HEADER
     assert isinstance(agent_obs[1], TextContent)
     assert "Test error" in agent_obs[1].text
 

From 4293eb1d47920634dcd00ad0d9c0d69ae9e9fbe8 Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Wed, 5 Nov 2025 20:05:39 +0100
Subject: [PATCH 70/76] use ERROR_MESSAGE_HEADER

---
 tests/sdk/mcp/test_mcp_tool.py                          | 2 +-
 tests/tools/execute_bash/test_bash_ps1_metadata.py      | 2 +-
 tests/tools/execute_bash/test_bash_session.py           | 7 +++++--
 tests/tools/execute_bash/test_observation_truncation.py | 2 +-
 tests/tools/glob/test_glob_tool.py                      | 2 +-
 tests/tools/grep/test_grep_tool.py                      | 2 +-
 6 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/tests/sdk/mcp/test_mcp_tool.py b/tests/sdk/mcp/test_mcp_tool.py
index 29af980e7c..74c075e7a1 100644
--- a/tests/sdk/mcp/test_mcp_tool.py
+++ b/tests/sdk/mcp/test_mcp_tool.py
@@ -106,7 +106,7 @@ def test_to_llm_content_success(self):
         assert isinstance(agent_obs[0], TextContent)
         assert "[Tool 'test_tool' executed.]" in agent_obs[0].text
         assert "Success result" in agent_obs[0].text
-        assert "Tool Execution Error." not in agent_obs[0].text
+        assert MCPToolObservation.ERROR_MESSAGE_HEADER not in agent_obs[0].text
 
     def test_to_llm_content_error(self):
         """Test agent observation formatting for error."""
diff --git a/tests/tools/execute_bash/test_bash_ps1_metadata.py b/tests/tools/execute_bash/test_bash_ps1_metadata.py
index e34664ed50..9dd94b54e3 100644
--- a/tests/tools/execute_bash/test_bash_ps1_metadata.py
+++ b/tests/tools/execute_bash/test_bash_ps1_metadata.py
@@ -303,7 +303,7 @@ def test_cmd_output_observation_properties():
     assert obs.is_error
     assert len(obs.to_llm_content) == 2
     assert isinstance(obs.to_llm_content[0], TextContent)
-    assert obs.to_llm_content[0].text == "Tool Execution Error. "
+    assert obs.to_llm_content[0].text == ExecuteBashObservation.ERROR_MESSAGE_HEADER
     assert isinstance(obs.to_llm_content[1], TextContent)
     assert "Command failed" in obs.to_llm_content[1].text
 
diff --git a/tests/tools/execute_bash/test_bash_session.py b/tests/tools/execute_bash/test_bash_session.py
index a95cde2f33..30e6c32b89 100644
--- a/tests/tools/execute_bash/test_bash_session.py
+++ b/tests/tools/execute_bash/test_bash_session.py
@@ -17,7 +17,10 @@
 
 from openhands.sdk import TextContent
 from openhands.sdk.logger import get_logger
-from openhands.tools.execute_bash.definition import ExecuteBashAction
+from openhands.tools.execute_bash.definition import (
+    ExecuteBashAction,
+    ExecuteBashObservation,
+)
 from openhands.tools.execute_bash.terminal import (
     TerminalCommandStatus,
     create_terminal_session,
@@ -323,7 +326,7 @@ def test_empty_command_error(terminal_type):
     assert obs.text == "No previous running command to retrieve logs from."
     assert len(obs.to_llm_content) == 2
     assert isinstance(obs.to_llm_content[0], TextContent)
-    assert obs.to_llm_content[0].text == "Tool Execution Error. "
+    assert obs.to_llm_content[0].text == ExecuteBashObservation.ERROR_MESSAGE_HEADER
     assert isinstance(obs.to_llm_content[1], TextContent)
     assert (
         "No previous running command to retrieve logs from."
diff --git a/tests/tools/execute_bash/test_observation_truncation.py b/tests/tools/execute_bash/test_observation_truncation.py
index 97c1bc1366..9a94f3f56d 100644
--- a/tests/tools/execute_bash/test_observation_truncation.py
+++ b/tests/tools/execute_bash/test_observation_truncation.py
@@ -98,7 +98,7 @@ def test_execute_bash_observation_truncation_with_error():
     assert isinstance(result[1], TextContent)
 
     # First part is the error prefix
-    assert result[0].text == "Tool Execution Error. "
+    assert result[0].text == ExecuteBashObservation.ERROR_MESSAGE_HEADER
 
     # Second part includes the error message with metadata
     full_text = result[1].text
diff --git a/tests/tools/glob/test_glob_tool.py b/tests/tools/glob/test_glob_tool.py
index b68bc8d5d5..854daeae9d 100644
--- a/tests/tools/glob/test_glob_tool.py
+++ b/tests/tools/glob/test_glob_tool.py
@@ -288,7 +288,7 @@ def test_glob_tool_to_llm_content_error():
 
         content = observation.to_llm_content
         assert len(content) == 2
-        assert content[0].text == "Tool Execution Error. "
+        assert content[0].text == GlobObservation.ERROR_MESSAGE_HEADER
         text_content = content[1].text
         assert "is not a valid directory" in text_content
 
diff --git a/tests/tools/grep/test_grep_tool.py b/tests/tools/grep/test_grep_tool.py
index 43c8e7b7c9..04cefd668a 100644
--- a/tests/tools/grep/test_grep_tool.py
+++ b/tests/tools/grep/test_grep_tool.py
@@ -270,7 +270,7 @@ def test_grep_tool_to_llm_content_error():
 
         content = observation.to_llm_content
         assert len(content) == 2
-        assert content[0].text == "Tool Execution Error. "
+        assert content[0].text == GrepObservation.ERROR_MESSAGE_HEADER
         text = content[1].text
         assert "Invalid regex pattern" in text
 

From 2fab1ddd08124bf8b343a0326086f5c54dbdf06b Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Wed, 5 Nov 2025 14:46:58 -0500
Subject: [PATCH 71/76] simplify: use observation.text in delegation test

---
 tests/tools/delegation/test_delegation.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/tests/tools/delegation/test_delegation.py b/tests/tools/delegation/test_delegation.py
index b8e9d969f3..37d808a6b9 100644
--- a/tests/tools/delegation/test_delegation.py
+++ b/tests/tools/delegation/test_delegation.py
@@ -146,13 +146,7 @@ def test_delegate_executor_missing_task():
     # Error message should be in the error field
     assert observation.is_error
     assert observation.is_error is True
-    content_text = (
-        observation.content
-        if isinstance(observation.content, str)
-        else "".join(
-            [c.text for c in observation.content if isinstance(c, TextContent)]
-        )
-    )
+    content_text = observation.text
     assert (
         "task is required" in content_text.lower()
         or "at least one task" in content_text.lower()

From 1d16b6f918934148d9b0ff8e0842a3a25aea9200 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Wed, 5 Nov 2025 14:49:49 -0500
Subject: [PATCH 72/76] fix test: check truncated bash error output

---
 .../test_observation_truncation.py            | 25 +++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/tests/tools/execute_bash/test_observation_truncation.py b/tests/tools/execute_bash/test_observation_truncation.py
index 9a94f3f56d..8a6433647e 100644
--- a/tests/tools/execute_bash/test_observation_truncation.py
+++ b/tests/tools/execute_bash/test_observation_truncation.py
@@ -85,9 +85,12 @@ def test_execute_bash_observation_truncation_with_error():
         pid=123,
     )
 
+    # Create output that exceeds the limit
+    long_output = "B" * (MAX_CMD_OUTPUT_SIZE + 500)
+
     observation = ExecuteBashObservation(
         command="false",
-        content=[TextContent(text="Command failed")],
+        content=[TextContent(text=long_output)],
         metadata=metadata,
         is_error=True,
     )
@@ -96,16 +99,18 @@ def test_execute_bash_observation_truncation_with_error():
     assert len(result) == 2
     assert isinstance(result[0], TextContent)
     assert isinstance(result[1], TextContent)
+    result = result[1].text
 
-    # First part is the error prefix
-    assert result[0].text == ExecuteBashObservation.ERROR_MESSAGE_HEADER
-
-    # Second part includes the error message with metadata
-    full_text = result[1].text
-    assert "Command failed" in full_text
-    assert "[Current working directory: /test]" in full_text
-    assert "[Python interpreter: /usr/bin/python]" in full_text
-    assert "[Command finished with exit code 1]" in full_text
+    # The result should be truncated and have error prefix
+    assert result.startswith("[There was an error during command execution.]")
+    assert len(result) < len(long_output) + 300  # Account for metadata and error prefix
+    # With head-and-tail truncation, should end with original content + metadata
+    expected_end = (
+        "B\n[Current working directory: /test]\n[Python interpreter: /usr/bin/python]\n"
+        "[Command finished with exit code 1]"
+    )
+    assert result.endswith(expected_end)
+    assert "<response clipped>" in result  # Should contain truncation notice
 
 
 def test_execute_bash_observation_truncation_exact_limit():

From 2bf41e247a52ed12ade759fa858048c077bc6ee1 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Wed, 5 Nov 2025 14:50:28 -0500
Subject: [PATCH 73/76] simplify: get_output_text returns obs.text

---
 tests/tools/execute_bash/conftest.py | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/tests/tools/execute_bash/conftest.py b/tests/tools/execute_bash/conftest.py
index f6c08f2907..9b68c04daf 100644
--- a/tests/tools/execute_bash/conftest.py
+++ b/tests/tools/execute_bash/conftest.py
@@ -3,7 +3,6 @@
 import tempfile
 
 from openhands.sdk.logger import get_logger
-from openhands.sdk.tool.schema import TextContent
 from openhands.tools.execute_bash.constants import TIMEOUT_MESSAGE_TEMPLATE
 from openhands.tools.execute_bash.definition import ExecuteBashObservation
 from openhands.tools.execute_bash.terminal import create_terminal_session
@@ -13,15 +12,7 @@
 
 
 def get_output_text(obs: ExecuteBashObservation) -> str:
-    """Extract text from observation content field.
-
-    This helper handles type-safe extraction of text from the observation's
-    content field, which is a list of Content items.
-    """
-    if not obs.content:
-        return ""
-    first_item = obs.content[0]
-    return first_item.text if isinstance(first_item, TextContent) else ""
+    return obs.text
 
 
 def get_no_change_timeout_suffix(timeout_seconds):

From ece030edeaf5686e1ee136062a23fb026622aa70 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Wed, 5 Nov 2025 14:53:26 -0500
Subject: [PATCH 74/76] simplify test

---
 tests/tools/file_editor/conftest.py          | 4 +---
 tests/tools/file_editor/test_memory_usage.py | 6 +-----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/tests/tools/file_editor/conftest.py b/tests/tools/file_editor/conftest.py
index 7a778d9427..c8efdaecb2 100644
--- a/tests/tools/file_editor/conftest.py
+++ b/tests/tools/file_editor/conftest.py
@@ -85,6 +85,4 @@ def create_test_file(path: Path, content: str):
 
 def get_output_text(result: FileEditorObservation) -> str:
     """Extract text content from a FileEditorObservation's content."""
-    if isinstance(result.content, str):
-        return result.content
-    return "".join([c.text for c in result.content if isinstance(c, TextContent)])
+    return result.text
diff --git a/tests/tools/file_editor/test_memory_usage.py b/tests/tools/file_editor/test_memory_usage.py
index e5542dd53d..3e2a973437 100644
--- a/tests/tools/file_editor/test_memory_usage.py
+++ b/tests/tools/file_editor/test_memory_usage.py
@@ -187,11 +187,7 @@ def test_file_editor_memory_leak(temp_file):
                     new_str=new_content,
                 )
                 if i == 0:
-                    content_str = (
-                        result.content
-                        if isinstance(result.content, str)
-                        else str(result.content)
-                    )
+                    content_str = result.text
                     print(f"First edit result: {content_str[:200]}...")
             except Exception as e:
                 print(f"\nError during edit {i}:")

From c648e235979142ff3d36dcbe02a05f515fd35483 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Wed, 5 Nov 2025 15:03:44 -0500
Subject: [PATCH 75/76] clean up get_output_text

---
 .../browser_use/test_browser_executor_e2e.py  |  24 +--
 tests/tools/execute_bash/conftest.py          |   5 -
 tests/tools/execute_bash/test_bash_reset.py   |  33 ++-
 tests/tools/execute_bash/test_bash_session.py | 132 ++++++------
 tests/tools/execute_bash/test_bash_tool.py    |   5 +-
 .../test_bash_tool_auto_detection.py          |   5 +-
 .../execute_bash/test_secrets_masking.py      |  11 +-
 tests/tools/file_editor/conftest.py           |   5 -
 .../file_editor/test_basic_operations.py      | 203 ++++++++----------
 .../tools/file_editor/test_error_handling.py  |  31 ++-
 .../file_editor/test_file_editor_tool.py      |  12 +-
 tests/tools/file_editor/test_memory_usage.py  |   4 +-
 .../test_view_supported_binary_files.py       |  14 +-
 .../tools/file_editor/utils/test_encoding.py  |  78 ++-----
 14 files changed, 227 insertions(+), 335 deletions(-)

diff --git a/tests/tools/browser_use/test_browser_executor_e2e.py b/tests/tools/browser_use/test_browser_executor_e2e.py
index 26a6bf48e5..551b5d4bef 100644
--- a/tests/tools/browser_use/test_browser_executor_e2e.py
+++ b/tests/tools/browser_use/test_browser_executor_e2e.py
@@ -6,7 +6,6 @@
 
 import pytest
 
-from openhands.sdk.tool.schema import TextContent
 from openhands.tools.browser_use.definition import (
     BrowserClickAction,
     BrowserCloseTabAction,
@@ -23,13 +22,6 @@
 from openhands.tools.browser_use.impl import BrowserToolExecutor
 
 
-def get_output_text(observation: BrowserObservation) -> str:
-    """Extract text from observation content."""
-    if isinstance(observation.content, str):
-        return observation.content
-    return "".join([c.text for c in observation.content if isinstance(c, TextContent)])
-
-
 # Test HTML content for browser operations
 TEST_HTML = """<!DOCTYPE html>
 <html lang="en">
@@ -180,7 +172,7 @@ def test_navigate_action(
 
         assert isinstance(result, BrowserObservation)
         assert not result.is_error
-        output_text = get_output_text(result).lower()
+        output_text = result.text.lower()
         assert "successfully" in output_text or "navigated" in output_text
 
     def test_get_state_action(
@@ -197,7 +189,7 @@ def test_get_state_action(
 
         assert isinstance(result, BrowserObservation)
         assert not result.is_error
-        assert "Browser Test Page" in get_output_text(result)
+        assert "Browser Test Page" in result.text
 
     def test_get_state_with_screenshot(
         self, browser_executor: BrowserToolExecutor, test_server: str
@@ -230,7 +222,7 @@ def test_click_action(
 
         # Parse the state to find button index
         # The test button should be indexed in the interactive elements
-        assert "Click Me" in get_output_text(state_result)
+        assert "Click Me" in state_result.text
 
         # Try to click the first interactive element (likely the button)
         click_action = BrowserClickAction(index=0)
@@ -250,7 +242,7 @@ def test_type_action(self, browser_executor: BrowserToolExecutor, test_server: s
         state_result = browser_executor(get_state_action)
 
         # Look for input field in the state
-        state_output = get_output_text(state_result)
+        state_output = state_result.text
         assert "test-input" in state_output or "Type here" in state_output
 
         # Find the input field index and type into it
@@ -297,7 +289,7 @@ def test_get_content_action(
 
         assert isinstance(result, BrowserObservation)
         assert not result.is_error
-        assert "Browser Test Page" in get_output_text(result)
+        assert "Browser Test Page" in result.text
 
         # Get content with links
         content_with_links_action = BrowserGetContentAction(
@@ -307,7 +299,7 @@ def test_get_content_action(
 
         assert isinstance(result, BrowserObservation)
         assert not result.is_error
-        assert "Browser Test Page" in get_output_text(result)
+        assert "Browser Test Page" in result.text
 
     def test_navigate_new_tab(
         self, browser_executor: BrowserToolExecutor, test_server: str
@@ -335,7 +327,7 @@ def test_list_tabs_action(
         assert isinstance(result, BrowserObservation)
         assert not result.is_error
         # Should contain tab information
-        assert len(get_output_text(result)) > 0
+        assert len(result.text) > 0
 
     def test_go_back_action(
         self, browser_executor: BrowserToolExecutor, test_server: str
@@ -377,7 +369,7 @@ def test_switch_tab_action(
 
         # Parse tab information to get a tab ID
         # This is a simplified approach - in practice you'd parse the JSON response
-        if "tab" in get_output_text(tabs_result).lower():
+        if "tab" in tabs_result.text.lower():
             # Try to switch to first tab (assuming tab ID format)
             switch_action = BrowserSwitchTabAction(tab_id="0")
             result = browser_executor(switch_action)
diff --git a/tests/tools/execute_bash/conftest.py b/tests/tools/execute_bash/conftest.py
index 9b68c04daf..f566d3fb9b 100644
--- a/tests/tools/execute_bash/conftest.py
+++ b/tests/tools/execute_bash/conftest.py
@@ -4,17 +4,12 @@
 
 from openhands.sdk.logger import get_logger
 from openhands.tools.execute_bash.constants import TIMEOUT_MESSAGE_TEMPLATE
-from openhands.tools.execute_bash.definition import ExecuteBashObservation
 from openhands.tools.execute_bash.terminal import create_terminal_session
 
 
 logger = get_logger(__name__)
 
 
-def get_output_text(obs: ExecuteBashObservation) -> str:
-    return obs.text
-
-
 def get_no_change_timeout_suffix(timeout_seconds):
     """Helper function to generate the expected no-change timeout suffix."""
     return (
diff --git a/tests/tools/execute_bash/test_bash_reset.py b/tests/tools/execute_bash/test_bash_reset.py
index 9e16150222..8335bd0556 100644
--- a/tests/tools/execute_bash/test_bash_reset.py
+++ b/tests/tools/execute_bash/test_bash_reset.py
@@ -15,7 +15,6 @@
     ExecuteBashAction,
     ExecuteBashObservation,
 )
-from tests.tools.execute_bash.conftest import get_output_text
 
 
 def _create_conv_state(working_dir: str) -> ConversationState:
@@ -44,13 +43,13 @@ def test_bash_reset_basic():
         action = ExecuteBashAction(command="echo $TEST_VAR")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert "hello" in get_output_text(result)
+        assert "hello" in result.text
 
         # Reset the terminal
         reset_action = ExecuteBashAction(command="", reset=True)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in get_output_text(reset_result)
+        assert "Terminal session has been reset" in reset_result.text
         assert reset_result.command == "[RESET]"
 
         # Verify the variable is no longer set after reset
@@ -58,7 +57,7 @@ def test_bash_reset_basic():
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
         # The variable should be empty after reset
-        assert get_output_text(result).strip() == ""
+        assert result.text.strip() == ""
 
 
 def test_bash_reset_with_command():
@@ -79,15 +78,15 @@ def test_bash_reset_with_command():
         )
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in get_output_text(reset_result)
-        assert "hello from fresh terminal" in get_output_text(reset_result)
+        assert "Terminal session has been reset" in reset_result.text
+        assert "hello from fresh terminal" in reset_result.text
         assert reset_result.command == "[RESET] echo 'hello from fresh terminal'"
 
         # Verify the variable is no longer set (confirming reset worked)
         action = ExecuteBashAction(command="echo $TEST_VAR")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert get_output_text(result).strip() == ""
+        assert result.text.strip() == ""
 
 
 def test_bash_reset_working_directory():
@@ -100,7 +99,7 @@ def test_bash_reset_working_directory():
         action = ExecuteBashAction(command="pwd")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert temp_dir in get_output_text(result)
+        assert temp_dir in result.text
 
         # Change directory
         action = ExecuteBashAction(command="cd /home")
@@ -111,19 +110,19 @@ def test_bash_reset_working_directory():
         action = ExecuteBashAction(command="pwd")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert "/home" in get_output_text(result)
+        assert "/home" in result.text
 
         # Reset the terminal
         reset_action = ExecuteBashAction(command="", reset=True)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in get_output_text(reset_result)
+        assert "Terminal session has been reset" in reset_result.text
 
         # Verify working directory is back to original
         action = ExecuteBashAction(command="pwd")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert temp_dir in get_output_text(result)
+        assert temp_dir in result.text
 
 
 def test_bash_reset_multiple_times():
@@ -136,25 +135,25 @@ def test_bash_reset_multiple_times():
         reset_action = ExecuteBashAction(command="", reset=True)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in get_output_text(reset_result)
+        assert "Terminal session has been reset" in reset_result.text
 
         # Execute a command after first reset
         action = ExecuteBashAction(command="echo 'after first reset'")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert "after first reset" in get_output_text(result)
+        assert "after first reset" in result.text
 
         # Second reset
         reset_action = ExecuteBashAction(command="", reset=True)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in get_output_text(reset_result)
+        assert "Terminal session has been reset" in reset_result.text
 
         # Execute a command after second reset
         action = ExecuteBashAction(command="echo 'after second reset'")
         result = tool(action)
         assert isinstance(result, ExecuteBashObservation)
-        assert "after second reset" in get_output_text(result)
+        assert "after second reset" in result.text
 
 
 def test_bash_reset_with_timeout():
@@ -167,7 +166,7 @@ def test_bash_reset_with_timeout():
         reset_action = ExecuteBashAction(command="", reset=True, timeout=5.0)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in get_output_text(reset_result)
+        assert "Terminal session has been reset" in reset_result.text
         assert reset_result.command == "[RESET]"
 
 
@@ -197,5 +196,5 @@ def test_bash_reset_only_with_empty_command():
         reset_action = ExecuteBashAction(command="", reset=True)
         reset_result = tool(reset_action)
         assert isinstance(reset_result, ExecuteBashObservation)
-        assert "Terminal session has been reset" in get_output_text(reset_result)
+        assert "Terminal session has been reset" in reset_result.text
         assert reset_result.command == "[RESET]"
diff --git a/tests/tools/execute_bash/test_bash_session.py b/tests/tools/execute_bash/test_bash_session.py
index 30e6c32b89..ee827c9a34 100644
--- a/tests/tools/execute_bash/test_bash_session.py
+++ b/tests/tools/execute_bash/test_bash_session.py
@@ -26,7 +26,7 @@
     create_terminal_session,
 )
 
-from .conftest import get_no_change_timeout_suffix, get_output_text
+from .conftest import get_no_change_timeout_suffix
 
 
 logger = get_logger(__name__)
@@ -46,7 +46,7 @@ def test_session_initialization(terminal_type):
         session.initialize()
         obs = session.execute(ExecuteBashAction(command="pwd"))
 
-        assert temp_dir in get_output_text(obs)
+        assert temp_dir in obs.text
         assert "[The command completed with exit code 0.]" in obs.metadata.suffix
         session.close()
 
@@ -69,7 +69,7 @@ def test_cwd_property(tmp_path, terminal_type):
 
     # For other implementations, just verify the command executed successfully
     obs = session.execute(ExecuteBashAction(command="pwd"))
-    assert str(random_dir) in get_output_text(obs)
+    assert str(random_dir) in obs.text
 
     # Note: CWD tracking may vary between terminal implementations
     # For tmux, it should track properly. For subprocess, it may not.
@@ -87,7 +87,7 @@ def test_basic_command(terminal_type):
     # Test simple command
     obs = session.execute(ExecuteBashAction(command="echo 'hello world'"))
 
-    assert "hello world" in get_output_text(obs)
+    assert "hello world" in obs.text
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
     # Note: prefix may vary between terminal implementations
     assert obs.metadata.exit_code == 0
@@ -98,16 +98,16 @@ def test_basic_command(terminal_type):
 
     # Note: Exit code handling may vary between terminal implementations
     # The important thing is that the error message is captured
-    assert "nonexistent_command: command not found" in get_output_text(obs)
+    assert "nonexistent_command: command not found" in obs.text
     assert session.prev_status == TerminalCommandStatus.COMPLETED
 
     # Test multiple commands in sequence
     obs = session.execute(
         ExecuteBashAction(command='echo "first" && echo "second" && echo "third"')
     )
-    assert "first" in get_output_text(obs)
-    assert "second" in get_output_text(obs)
-    assert "third" in get_output_text(obs)
+    assert "first" in obs.text
+    assert "second" in obs.text
+    assert "third" in obs.text
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
     # Note: prefix may vary between terminal implementations
     assert obs.metadata.exit_code == 0
@@ -128,7 +128,7 @@ def test_environment_variable_persistence(terminal_type):
 
     # Use the environment variable in a subsequent command
     obs = session.execute(ExecuteBashAction(command="echo $TEST_VAR"))
-    assert "hello world" in get_output_text(obs)
+    assert "hello world" in obs.text
     assert obs.metadata.exit_code == 0
 
     session.close()
@@ -154,8 +154,8 @@ def test_environment_variable_inheritance_from_parent(terminal_type):
 
         # Check if the environment variable is available in the terminal
         obs = session.execute(ExecuteBashAction(command=f"echo ${test_var_name}"))
-        assert test_var_value in get_output_text(obs), (
-            f"Expected '{test_var_value}' in output, but got: {get_output_text(obs)}"
+        assert test_var_value in obs.text, (
+            f"Expected '{test_var_value}' in output, but got: {obs.text}"
         )
         assert obs.metadata.exit_code == 0
 
@@ -179,7 +179,7 @@ def test_long_running_command_follow_by_execute():
         ExecuteBashAction(command="echo 1; sleep 3; echo 2; sleep 3; echo 3")
     )
 
-    assert "1" in get_output_text(obs)  # First number should appear before timeout
+    assert "1" in obs.text  # First number should appear before timeout
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
     assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
     assert obs.metadata.suffix == get_no_change_timeout_suffix(2)
@@ -188,7 +188,7 @@ def test_long_running_command_follow_by_execute():
     # Continue watching output
     obs = session.execute(ExecuteBashAction(command="", is_input=True))
 
-    assert "2" in get_output_text(obs)
+    assert "2" in obs.text
     assert obs.metadata.prefix == "[Below is the output of the previous command.]\n"
     assert obs.metadata.suffix == get_no_change_timeout_suffix(2)
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
@@ -197,7 +197,7 @@ def test_long_running_command_follow_by_execute():
     # Test command that produces no output
     obs = session.execute(ExecuteBashAction(command="sleep 15"))
 
-    assert "3" not in get_output_text(obs)
+    assert "3" not in obs.text
     assert obs.metadata.prefix == "[Below is the output of the previous command.]\n"
     assert "The previous command is still running" in obs.metadata.suffix
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
@@ -208,9 +208,7 @@ def test_long_running_command_follow_by_execute():
     # Run it again, this time it should produce output and then start a new command
     obs = session.execute(ExecuteBashAction(command="sleep 15"))
 
-    assert "3" in get_output_text(
-        obs
-    )  # Should see the final output from the previous command
+    assert "3" in obs.text  # Should see the final output from the previous command
     assert obs.metadata.exit_code == -1  # -1 indicates new command is still running
     assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
 
@@ -232,7 +230,7 @@ def test_interactive_command(terminal_type):
         )
     )
 
-    assert "Enter name:" in get_output_text(obs)
+    assert "Enter name:" in obs.text
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
     assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
     assert obs.metadata.suffix == get_no_change_timeout_suffix(3)
@@ -241,7 +239,7 @@ def test_interactive_command(terminal_type):
     # Send input
     obs = session.execute(ExecuteBashAction(command="John", is_input=True))
 
-    assert "Hello John" in get_output_text(obs)
+    assert "Hello John" in obs.text
     assert obs.metadata.exit_code == 0
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
     assert obs.metadata.prefix == ""
@@ -271,7 +269,7 @@ def test_interactive_command(terminal_type):
 
     obs = session.execute(ExecuteBashAction(command="EOF", is_input=True))
 
-    assert "line 1" in get_output_text(obs) and "line 2" in get_output_text(obs)
+    assert "line 1" in obs.text and "line 2" in obs.text
     assert obs.metadata.exit_code == 0
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
     assert obs.metadata.prefix == ""
@@ -292,7 +290,7 @@ def test_ctrl_c(terminal_type):
         ExecuteBashAction(command="while true; do echo 'looping'; sleep 3; done"),
     )
 
-    assert "looping" in get_output_text(obs)
+    assert "looping" in obs.text
     assert obs.metadata.suffix == get_no_change_timeout_suffix(2)
     assert obs.metadata.prefix == ""
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
@@ -362,22 +360,22 @@ def test_command_output_continuation(terminal_type):
     if session.prev_status == TerminalCommandStatus.COMPLETED:
         # If the command completed immediately, verify we got all the output
         logger.info("Command completed immediately", extra={"msg_type": "TEST_INFO"})
-        assert "1" in get_output_text(obs)
-        assert "2" in get_output_text(obs)
-        assert "3" in get_output_text(obs)
-        assert "4" in get_output_text(obs)
-        assert "5" in get_output_text(obs)
+        assert "1" in obs.text
+        assert "2" in obs.text
+        assert "3" in obs.text
+        assert "4" in obs.text
+        assert "5" in obs.text
         assert "[The command completed with exit code 0.]" in obs.metadata.suffix
     else:
         # If the command timed out, verify we got the timeout message
         assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
-        assert "1" in get_output_text(obs)
+        assert "1" in obs.text
         assert "[The command has no new output after 1 seconds." in obs.metadata.suffix
 
         # Continue getting output until we see all numbers
         numbers_seen = set()
         for i in range(1, 6):
-            if str(i) in get_output_text(obs):
+            if str(i) in obs.text:
                 numbers_seen.add(i)
 
         # We need to see numbers 2-5 and then the command completion
@@ -389,7 +387,7 @@ def test_command_output_continuation(terminal_type):
 
             # Check for numbers in the output
             for i in range(1, 6):
-                if str(i) in get_output_text(obs) and i not in numbers_seen:
+                if str(i) in obs.text and i not in numbers_seen:
                     numbers_seen.add(i)
                     logger.info(
                         f"Found number {i} in output", extra={"msg_type": "TEST_INFO"}
@@ -429,8 +427,8 @@ def test_long_output(terminal_type):
         ExecuteBashAction(command='for i in {1..5000}; do echo "Line $i"; done')
     )
 
-    assert "Line 1" in get_output_text(obs)
-    assert "Line 5000" in get_output_text(obs)
+    assert "Line 1" in obs.text
+    assert "Line 5000" in obs.text
     assert obs.metadata.exit_code == 0
     assert obs.metadata.prefix == ""
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
@@ -449,8 +447,8 @@ def test_long_output_exceed_history_limit(terminal_type):
     )
 
     assert "Previous command outputs are truncated" in obs.metadata.prefix
-    assert "Line 40000" in get_output_text(obs)
-    assert "Line 50000" in get_output_text(obs)
+    assert "Line 40000" in obs.text
+    assert "Line 50000" in obs.text
     assert obs.metadata.exit_code == 0
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
 
@@ -470,7 +468,7 @@ def test_multiline_command():
         )
     )
 
-    assert "inside if" in get_output_text(obs)
+    assert "inside if" in obs.text
     assert obs.metadata.exit_code == 0
     assert obs.metadata.prefix == ""
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
@@ -494,21 +492,21 @@ def test_python_interactive_input(terminal_type):
     # Start Python with the interactive script
     obs = session.execute(ExecuteBashAction(command=f'python3 -c "{python_script}"'))
 
-    assert "Enter your name:" in get_output_text(obs)
+    assert "Enter your name:" in obs.text
     assert obs.metadata.exit_code == -1  # -1 indicates command is still running
     assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
 
     # Send first input (name)
     obs = session.execute(ExecuteBashAction(command="Alice", is_input=True))
 
-    assert "Enter your age:" in get_output_text(obs)
+    assert "Enter your age:" in obs.text
     assert obs.metadata.exit_code == -1
     assert session.prev_status == TerminalCommandStatus.NO_CHANGE_TIMEOUT
 
     # Send second input (age)
     obs = session.execute(ExecuteBashAction(command="25", is_input=True))
 
-    assert "Hello Alice, you are 25 years old" in get_output_text(obs)
+    assert "Hello Alice, you are 25 years old" in obs.text
     assert obs.metadata.exit_code == 0
     assert obs.metadata.suffix == "\n[The command completed with exit code 0.]"
     assert session.prev_status == TerminalCommandStatus.COMPLETED
@@ -521,7 +519,7 @@ def _run_bash_action(session, command: str, **kwargs):
     action = ExecuteBashAction(command=command, **kwargs)
     obs = session.execute(action)
     logger.info(f"Command: {command}")
-    output_text = get_output_text(obs) if obs.content else ""
+    output_text = obs.text if obs.content else ""
     logger.info(f"Output: {output_text}")
     logger.info(f"Exit code: {obs.metadata.exit_code}")
     return obs
@@ -542,12 +540,12 @@ def test_bash_server(terminal_type):
                 session, "python -u -m http.server 8081", timeout=1.0
             )
             assert obs.metadata.exit_code == -1
-            assert "Serving HTTP on" in get_output_text(obs)
+            assert "Serving HTTP on" in obs.text
 
             # Send Ctrl+C to interrupt
             obs = _run_bash_action(session, "C-c", is_input=True)
             assert "CTRL+C was sent" in obs.metadata.suffix
-            assert "Keyboard interrupt received, exiting." in get_output_text(obs)
+            assert "Keyboard interrupt received, exiting." in obs.text
 
             # Verify we can run commands after interrupt
             obs = _run_bash_action(session, "ls")
@@ -558,7 +556,7 @@ def test_bash_server(terminal_type):
                 session, "python -u -m http.server 8081", timeout=1.0
             )
             assert obs.metadata.exit_code == -1
-            assert "Serving HTTP on" in get_output_text(obs)
+            assert "Serving HTTP on" in obs.text
 
         finally:
             session.close()
@@ -585,7 +583,7 @@ def test_bash_background_server(terminal_type):
             obs = _run_bash_action(session, f"curl http://localhost:{server_port}")
             assert obs.metadata.exit_code == 0
             # Check for content typical of python http.server directory listing
-            assert "Directory listing for" in get_output_text(obs)
+            assert "Directory listing for" in obs.text
 
             # Kill the server
             obs = _run_bash_action(session, 'pkill -f "http.server"')
@@ -608,17 +606,17 @@ def test_multiline_commands(terminal_type):
             # single multiline command
             obs = _run_bash_action(session, 'echo \\\n -e "foo"')
             assert obs.metadata.exit_code == 0
-            assert "foo" in get_output_text(obs)
+            assert "foo" in obs.text
 
             # test multiline echo
             obs = _run_bash_action(session, 'echo -e "hello\nworld"')
             assert obs.metadata.exit_code == 0
-            assert "hello\nworld" in get_output_text(obs)
+            assert "hello\nworld" in obs.text
 
             # test whitespace
             obs = _run_bash_action(session, 'echo -e "a\\n\\n\\nz"')
             assert obs.metadata.exit_code == 0
-            assert "\n\n\n" in get_output_text(obs)
+            assert "\n\n\n" in obs.text
         finally:
             session.close()
 
@@ -641,7 +639,7 @@ def test_complex_commands(terminal_type):
         try:
             obs = _run_bash_action(session, cmd)
             assert obs.metadata.exit_code == 0
-            assert "Got 3 heads in a row after 3 flips!" in get_output_text(obs)
+            assert "Got 3 heads in a row after 3 flips!" in obs.text
         finally:
             session.close()
 
@@ -658,8 +656,8 @@ def test_no_ps2_in_output(terminal_type):
             obs = _run_bash_action(session, 'echo -e "hello\nworld"')
             assert obs.metadata.exit_code == 0
 
-            assert "hello\nworld" in get_output_text(obs)
-            assert ">" not in get_output_text(obs)
+            assert "hello\nworld" in obs.text
+            assert ">" not in obs.text
         finally:
             session.close()
 
@@ -689,11 +687,11 @@ def test_multiline_command_loop(terminal_type):
         try:
             obs = _run_bash_action(session, init_cmd)
             assert obs.metadata.exit_code == 0
-            assert "created files" in get_output_text(obs)
+            assert "created files" in obs.text
 
             obs = _run_bash_action(session, follow_up_cmd)
             assert obs.metadata.exit_code == 0
-            assert "success" in get_output_text(obs)
+            assert "success" in obs.text
         finally:
             session.close()
 
@@ -730,7 +728,7 @@ def test_multiple_multiline_commands(terminal_type):
             for cmd in cmds:
                 obs = _run_bash_action(session, cmd)
                 assert obs.metadata.exit_code == 0
-                results.append(get_output_text(obs))
+                results.append(obs.text)
 
             # Verify all expected outputs are present
             assert "total 0" in results[0]  # ls -l
@@ -763,21 +761,21 @@ def test_cmd_run(terminal_type):
 
             obs = _run_bash_action(session, "ls -l")
             assert obs.metadata.exit_code == 0
-            assert "total 0" in get_output_text(obs)
+            assert "total 0" in obs.text
 
             obs = _run_bash_action(session, "mkdir test")
             assert obs.metadata.exit_code == 0
 
             obs = _run_bash_action(session, "ls -l")
             assert obs.metadata.exit_code == 0
-            assert "test" in get_output_text(obs)
+            assert "test" in obs.text
 
             obs = _run_bash_action(session, "touch test/foo.txt")
             assert obs.metadata.exit_code == 0
 
             obs = _run_bash_action(session, "ls -l test")
             assert obs.metadata.exit_code == 0
-            assert "foo.txt" in get_output_text(obs)
+            assert "foo.txt" in obs.text
 
             # clean up
             _run_bash_action(session, "rm -rf test")
@@ -799,7 +797,7 @@ def test_run_as_user_correct_home_dir(terminal_type):
             obs = _run_bash_action(session, "cd ~ && pwd")
             assert obs.metadata.exit_code == 0
             home = os.getenv("HOME")
-            assert home and home in get_output_text(obs)
+            assert home and home in obs.text
         finally:
             session.close()
 
@@ -814,8 +812,8 @@ def test_multi_cmd_run_in_single_line(terminal_type):
             # Original Linux version using &&
             obs = _run_bash_action(session, "pwd && ls -l")
             assert obs.metadata.exit_code == 0
-            assert temp_dir in get_output_text(obs)
-            assert "total 0" in get_output_text(obs)
+            assert temp_dir in obs.text
+            assert "total 0" in obs.text
         finally:
             session.close()
 
@@ -838,7 +836,7 @@ def test_stateful_cmd(terminal_type):
 
             obs = _run_bash_action(session, "pwd")
             assert obs.metadata.exit_code == 0
-            assert f"{temp_dir}/test" in get_output_text(obs).strip()
+            assert f"{temp_dir}/test" in obs.text.strip()
         finally:
             session.close()
 
@@ -869,7 +867,7 @@ def test_python_version(terminal_type):
         try:
             obs = _run_bash_action(session, "python --version")
             assert obs.metadata.exit_code == 0
-            assert "Python 3" in get_output_text(obs)
+            assert "Python 3" in obs.text
         finally:
             session.close()
 
@@ -889,7 +887,7 @@ def test_pwd_property(terminal_type):
 
             obs = _run_bash_action(session, "cd random_dir && pwd")
             assert obs.metadata.exit_code == 0
-            assert "random_dir" in get_output_text(obs)
+            assert "random_dir" in obs.text
         finally:
             session.close()
 
@@ -918,10 +916,10 @@ def test_long_output_from_nested_directories(terminal_type):
             assert obs.metadata.exit_code == 0
 
             # Verify output contains expected files
-            assert "folder_1" in get_output_text(obs)
-            assert "file_1.txt" in get_output_text(obs)
-            assert "folder_100" in get_output_text(obs)
-            assert "file_100.txt" in get_output_text(obs)
+            assert "folder_1" in obs.text
+            assert "file_1.txt" in obs.text
+            assert "folder_100" in obs.text
+            assert "file_100.txt" in obs.text
         finally:
             session.close()
 
@@ -955,7 +953,7 @@ def test_command_backslash(terminal_type):
             )
             obs = _run_bash_action(session, cmd)
             assert obs.metadata.exit_code == 0
-            assert "/tmp/test_dir/file_1.txt" in get_output_text(obs)
+            assert "/tmp/test_dir/file_1.txt" in obs.text
         finally:
             session.close()
 
@@ -979,7 +977,7 @@ def test_bash_remove_prefix(terminal_type):
             # Check git remote - same for both platforms
             obs = _run_bash_action(session, "git remote -v")
             assert obs.metadata.exit_code == 0
-            assert "https://github.com/OpenHands/OpenHands" in get_output_text(obs)
-            assert "git remote -v" not in get_output_text(obs)
+            assert "https://github.com/OpenHands/OpenHands" in obs.text
+            assert "git remote -v" not in obs.text
         finally:
             session.close()
diff --git a/tests/tools/execute_bash/test_bash_tool.py b/tests/tools/execute_bash/test_bash_tool.py
index c2acdef888..fd9e088693 100644
--- a/tests/tools/execute_bash/test_bash_tool.py
+++ b/tests/tools/execute_bash/test_bash_tool.py
@@ -14,7 +14,6 @@
     ExecuteBashAction,
     ExecuteBashObservation,
 )
-from tests.tools.execute_bash.conftest import get_output_text
 
 
 def _create_test_conv_state(temp_dir: str) -> ConversationState:
@@ -70,7 +69,7 @@ def test_bash_tool_execution():
         # Check the result
         assert result is not None
         assert isinstance(result, ExecuteBashObservation)
-        assert "Hello, World!" in get_output_text(result)
+        assert "Hello, World!" in result.text
 
 
 def test_bash_tool_working_directory():
@@ -88,7 +87,7 @@ def test_bash_tool_working_directory():
 
         # Check that the working directory is correct
         assert isinstance(result, ExecuteBashObservation)
-        assert temp_dir in get_output_text(result)
+        assert temp_dir in result.text
 
 
 def test_bash_tool_to_openai_tool():
diff --git a/tests/tools/execute_bash/test_bash_tool_auto_detection.py b/tests/tools/execute_bash/test_bash_tool_auto_detection.py
index ab450a6aee..dbc1c36349 100644
--- a/tests/tools/execute_bash/test_bash_tool_auto_detection.py
+++ b/tests/tools/execute_bash/test_bash_tool_auto_detection.py
@@ -18,7 +18,6 @@
     TerminalSession,
     TmuxTerminal,
 )
-from tests.tools.execute_bash.conftest import get_output_text
 
 
 def _create_conv_state(working_dir: str) -> ConversationState:
@@ -53,7 +52,7 @@ def test_default_auto_detection():
         # Test that it works
         action = ExecuteBashAction(command="echo 'Auto-detection test'")
         obs = executor(action)
-        assert "Auto-detection test" in get_output_text(obs)
+        assert "Auto-detection test" in obs.text
 
 
 def test_forced_terminal_types():
@@ -139,7 +138,7 @@ def test_backward_compatibility():
         assert tool.executor is not None
         action = ExecuteBashAction(command="echo 'Backward compatibility test'")
         obs = tool.executor(action)
-        assert "Backward compatibility test" in get_output_text(obs)
+        assert "Backward compatibility test" in obs.text
         assert obs.metadata.exit_code == 0
 
 
diff --git a/tests/tools/execute_bash/test_secrets_masking.py b/tests/tools/execute_bash/test_secrets_masking.py
index e546c211eb..16d5917022 100644
--- a/tests/tools/execute_bash/test_secrets_masking.py
+++ b/tests/tools/execute_bash/test_secrets_masking.py
@@ -11,7 +11,6 @@
 from openhands.sdk.tool.schema import TextContent
 from openhands.tools.execute_bash import ExecuteBashAction, ExecuteBashObservation
 from openhands.tools.execute_bash.impl import BashExecutor
-from tests.tools.execute_bash.conftest import get_output_text
 
 
 def test_bash_executor_without_conversation():
@@ -26,8 +25,8 @@ def test_bash_executor_without_conversation():
             result = executor(action)
 
             # Check that the output is not masked (no conversation provided)
-            assert "secret-value-123" in get_output_text(result)
-            assert "<secret-hidden>" not in get_output_text(result)
+            assert "secret-value-123" in result.text
+            assert "<secret-hidden>" not in result.text
 
         finally:
             executor.close()
@@ -82,10 +81,10 @@ def test_bash_executor_with_conversation_secrets():
             assert mock_session.execute.called
 
             # Check that both secrets were masked in the output
-            assert "secret-value-123" not in get_output_text(result)
-            assert "another-secret-456" not in get_output_text(result)
+            assert "secret-value-123" not in result.text
+            assert "another-secret-456" not in result.text
             # SecretsManager uses <secret-hidden> as the mask
-            assert "<secret-hidden>" in get_output_text(result)
+            assert "<secret-hidden>" in result.text
 
         finally:
             executor.close()
diff --git a/tests/tools/file_editor/conftest.py b/tests/tools/file_editor/conftest.py
index c8efdaecb2..2588541733 100644
--- a/tests/tools/file_editor/conftest.py
+++ b/tests/tools/file_editor/conftest.py
@@ -81,8 +81,3 @@ def create_test_file(path: Path, content: str):
     """Helper to create a test file with given content."""
     path.write_text(content)
     return path
-
-
-def get_output_text(result: FileEditorObservation) -> str:
-    """Extract text content from a FileEditorObservation's content."""
-    return result.text
diff --git a/tests/tools/file_editor/test_basic_operations.py b/tests/tools/file_editor/test_basic_operations.py
index ff3f56d4e5..c26f6c1fcf 100644
--- a/tests/tools/file_editor/test_basic_operations.py
+++ b/tests/tools/file_editor/test_basic_operations.py
@@ -21,7 +21,6 @@
 
 from .conftest import (
     assert_successful_result,
-    get_output_text,
 )
 
 
@@ -63,13 +62,11 @@ def test_file_editor_happy_path(temp_file):
     # Validate the result
     assert_successful_result(result, str(temp_file))
     assert (
-        get_output_text(result) is not None
-        and "The file" in get_output_text(result)
-        and "has been edited" in get_output_text(result)
+        result.text is not None
+        and "The file" in result.text
+        and "has been edited" in result.text
     )
-    assert get_output_text(
-        result
-    ) is not None and "This is a sample file." in get_output_text(result)
+    assert result.text is not None and "This is a sample file." in result.text
     assert result.path == str(temp_file)
     assert result.prev_exist is True
     assert (
@@ -108,19 +105,16 @@ def test_file_editor_view_operation(temp_file):
 
     # Validate the result
     assert_successful_result(result, str(temp_file))
-    assert get_output_text(
-        result
-    ) is not None and "Here's the result of running `cat -n`" in get_output_text(result)
     assert (
-        get_output_text(result) is not None
-        and "This is a file with XML tags parsing logic..." in get_output_text(result)
+        result.text is not None
+        and "Here's the result of running `cat -n`" in result.text
     )
-    assert get_output_text(
-        result
-    ) is not None and "match = re.search(" in get_output_text(result)
-    assert get_output_text(
-        result
-    ) is not None and "...More text here." in get_output_text(result)
+    assert (
+        result.text is not None
+        and "This is a file with XML tags parsing logic..." in result.text
+    )
+    assert result.text is not None and "match = re.search(" in result.text
+    assert result.text is not None and "...More text here." in result.text
 
 
 def test_successful_operations(temp_file):
@@ -136,10 +130,11 @@ def test_successful_operations(temp_file):
         path=str(temp_file),
     )
     assert_successful_result(result)
-    assert get_output_text(
-        result
-    ) is not None and "Here's the result of running `cat -n`" in get_output_text(result)
-    assert get_output_text(result) is not None and "line 1" in get_output_text(result)
+    assert (
+        result.text is not None
+        and "Here's the result of running `cat -n`" in result.text
+    )
+    assert result.text is not None and "line 1" in result.text
 
     # Test str_replace
     result = file_editor(
@@ -149,12 +144,8 @@ def test_successful_operations(temp_file):
         new_str="replaced line",
     )
     assert_successful_result(result)
-    assert get_output_text(result) is not None and "has been edited" in get_output_text(
-        result
-    )
-    assert get_output_text(result) is not None and "replaced line" in get_output_text(
-        result
-    )
+    assert result.text is not None and "has been edited" in result.text
+    assert result.text is not None and "replaced line" in result.text
 
     # Test insert
     result = file_editor(
@@ -164,12 +155,8 @@ def test_successful_operations(temp_file):
         new_str="inserted line",
     )
     assert_successful_result(result)
-    assert get_output_text(result) is not None and "has been edited" in get_output_text(
-        result
-    )
-    assert get_output_text(result) is not None and "inserted line" in get_output_text(
-        result
-    )
+    assert result.text is not None and "has been edited" in result.text
+    assert result.text is not None and "inserted line" in result.text
 
     # Test undo
     result = file_editor(
@@ -177,9 +164,7 @@ def test_successful_operations(temp_file):
         path=str(temp_file),
     )
     assert_successful_result(result)
-    assert get_output_text(
-        result
-    ) is not None and "undone successfully" in get_output_text(result)
+    assert result.text is not None and "undone successfully" in result.text
 
 
 def test_tab_expansion(temp_file):
@@ -196,12 +181,8 @@ def test_tab_expansion(temp_file):
     )
     assert_successful_result(result)
     # Tabs should be preserved in output
-    assert get_output_text(result) is not None and "\tindented" in get_output_text(
-        result
-    )
-    assert get_output_text(
-        result
-    ) is not None and "line\twith\ttabs" in get_output_text(result)
+    assert result.text is not None and "\tindented" in result.text
+    assert result.text is not None and "line\twith\ttabs" in result.text
 
     # Test str_replace with tabs in old_str
     result = file_editor(
@@ -211,9 +192,7 @@ def test_tab_expansion(temp_file):
         new_str="replaced line",
     )
     assert_successful_result(result)
-    assert get_output_text(result) is not None and "replaced line" in get_output_text(
-        result
-    )
+    assert result.text is not None and "replaced line" in result.text
 
     # Test str_replace with tabs in new_str
     result = file_editor(
@@ -223,9 +202,7 @@ def test_tab_expansion(temp_file):
         new_str="new\tline\twith\ttabs",
     )
     assert_successful_result(result)
-    assert get_output_text(
-        result
-    ) is not None and "new\tline\twith\ttabs" in get_output_text(result)
+    assert result.text is not None and "new\tline\twith\ttabs" in result.text
 
     # Test insert with tabs
     result = file_editor(
@@ -235,9 +212,7 @@ def test_tab_expansion(temp_file):
         new_str="\tindented\tline",
     )
     assert_successful_result(result)
-    assert get_output_text(
-        result
-    ) is not None and "\tindented\tline" in get_output_text(result)
+    assert result.text is not None and "\tindented\tline" in result.text
 
 
 def test_create_operation(temp_file):
@@ -254,9 +229,7 @@ def test_create_operation(temp_file):
     )
 
     assert_successful_result(result, str(temp_file))
-    assert get_output_text(
-        result
-    ) is not None and "created successfully" in get_output_text(result)
+    assert result.text is not None and "created successfully" in result.text
     assert result.prev_exist is False
     assert result.new_content == content
 
@@ -285,31 +258,29 @@ def test_view_operation_truncation(temp_file):
     )
 
     assert_successful_result(result)
-    assert get_output_text(result) is not None
+    assert result.text is not None
 
     # Check that truncation notice is present
-    assert TEXT_FILE_CONTENT_TRUNCATED_NOTICE in get_output_text(result)
+    assert TEXT_FILE_CONTENT_TRUNCATED_NOTICE in result.text
 
     # The content should be truncated before line numbers are added
     # So the final output will be longer than MAX_RESPONSE_LEN_CHAR due to formatting
     # but the original content was truncated
-    assert "Here's the result of running `cat -n`" in get_output_text(result)
+    assert "Here's the result of running `cat -n`" in result.text
 
     # With head-and-tail truncation, should contain both start and end content
     # The line numbers will show as "     1\tA..." at start and end with "A"
-    assert "\tA" in get_output_text(result)  # Should have A's with tab formatting
+    assert "\tA" in result.text  # Should have A's with tab formatting
 
 
 def test_view_file(editor):
     editor, test_file = editor
     result = editor(command="view", path=str(test_file))
     assert isinstance(result, FileEditorObservation)
-    assert f"Here's the result of running `cat -n` on {test_file}:" in get_output_text(
-        result
-    )
-    assert "1\tThis is a test file." in get_output_text(result)
-    assert "2\tThis file is for testing purposes." in get_output_text(result)
-    assert "3\t" not in get_output_text(result)  # No extra line
+    assert f"Here's the result of running `cat -n` on {test_file}:" in result.text
+    assert "1\tThis is a test file." in result.text
+    assert "2\tThis file is for testing purposes." in result.text
+    assert "3\t" not in result.text  # No extra line
 
 
 def test_view_directory(editor):
@@ -317,7 +288,7 @@ def test_view_directory(editor):
     parent_dir = test_file.parent
     result = editor(command="view", path=str(parent_dir))
     assert (
-        get_output_text(result)
+        result.text
         == f"""Here's the files and directories up to 2 levels deep in {parent_dir}, excluding hidden items:
 {parent_dir}/
 {parent_dir}/test.txt"""  # noqa: E501
@@ -344,13 +315,11 @@ def test_view_with_a_specific_range(editor):
 
     # View file in range 50-100
     result = editor(command="view", path=str(test_file), view_range=[50, 100])
-    assert f"Here's the result of running `cat -n` on {test_file}:" in get_output_text(
-        result
-    )
-    assert "    49\tLine 49" not in get_output_text(result)
-    assert "    50\tLine 50" in get_output_text(result)
-    assert "   100\tLine 100" in get_output_text(result)
-    assert "101" not in get_output_text(result)
+    assert f"Here's the result of running `cat -n` on {test_file}:" in result.text
+    assert "    49\tLine 49" not in result.text
+    assert "    50\tLine 50" in result.text
+    assert "   100\tLine 100" in result.text
+    assert "101" not in result.text
 
 
 def test_create_file(editor):
@@ -359,7 +328,7 @@ def test_create_file(editor):
     result = editor(command="create", path=str(new_file), file_text="New file content")
     assert new_file.exists()
     assert new_file.read_text() == "New file content"
-    assert "File created successfully" in get_output_text(result)
+    assert "File created successfully" in result.text
 
 
 def test_create_with_empty_string(editor):
@@ -368,14 +337,12 @@ def test_create_with_empty_string(editor):
     result = editor(command="create", path=str(new_file), file_text="")
     assert new_file.exists()
     assert new_file.read_text() == ""
-    assert "File created successfully" in get_output_text(result)
+    assert "File created successfully" in result.text
 
     # Test the view command showing an empty line
     result = editor(command="view", path=str(new_file))
-    assert f"Here's the result of running `cat -n` on {new_file}:" in get_output_text(
-        result
-    )
-    assert "1\t" in get_output_text(result)  # Check for empty line
+    assert f"Here's the result of running `cat -n` on {new_file}:" in result.text
+    assert "1\t" in result.text  # Check for empty line
 
 
 def test_create_with_none_file_text(editor):
@@ -398,7 +365,7 @@ def test_str_replace_no_linting(editor):
 
     # Test str_replace command
     assert (
-        get_output_text(result)
+        result.text
         == f"""The file {test_file} has been edited. Here's the result of running `cat -n` on a snippet of {test_file}:
      1\tThis is a sample file.
      2\tThis file is for testing purposes.
@@ -421,7 +388,7 @@ def test_str_replace_multi_line_no_linting(editor):
 
     # Test str_replace command
     assert (
-        get_output_text(result)
+        result.text
         == f"""The file {test_file} has been edited. Here's the result of running `cat -n` on a snippet of {test_file}:
      1\tThis is a sample file.
      2\tThis file is for testing purposes.
@@ -440,7 +407,7 @@ def test_str_replace_multi_line_with_tabs_no_linting(editor_python_file_with_tab
     assert isinstance(result, FileEditorObservation)
 
     assert (
-        get_output_text(result)
+        result.text
         == f"""The file {test_file} has been edited. Here's the result of running `cat -n` on a snippet of {test_file}:
      1\tdef test():
      2\t\tprint("Hello, Universe!")
@@ -543,7 +510,7 @@ def test_insert_no_linting(editor):
     assert isinstance(result, FileEditorObservation)
     assert "Inserted line" in test_file.read_text()
     assert (
-        get_output_text(result)
+        result.text
         == f"""The file {test_file} has been edited. Here's the result of running `cat -n` on a snippet of the edited file:
      1\tThis is a test file.
      2\tInserted line
@@ -592,7 +559,7 @@ def test_insert_chinese_text_into_english_file(editor):
     assert isinstance(result, FileEditorObservation)
     assert "中文文本" in test_file.read_text()
     assert (
-        get_output_text(result)
+        result.text
         == f"""The file {test_file} has been edited. Here's the result of running `cat -n` on a snippet of the edited file:
      1\t中文文本
      2\tThis is a test file.
@@ -625,7 +592,7 @@ def test_undo_edit(editor):
     # Undo the edit
     result = editor(command="undo_edit", path=str(test_file))
     assert isinstance(result, FileEditorObservation)
-    assert "Last edit to" in get_output_text(result)
+    assert "Last edit to" in result.text
     assert "test file" in test_file.read_text()  # Original content restored
 
 
@@ -648,13 +615,13 @@ def test_multiple_undo_edits(editor):
     # Undo the last edit
     result = editor(command="undo_edit", path=str(test_file))
     assert isinstance(result, FileEditorObservation)
-    assert "Last edit to" in get_output_text(result)
+    assert "Last edit to" in result.text
     assert "sample file v1" in test_file.read_text()  # Previous content restored
 
     # Undo the first edit
     result = editor(command="undo_edit", path=str(test_file))
     assert isinstance(result, FileEditorObservation)
-    assert "Last edit to" in get_output_text(result)
+    assert "Last edit to" in result.text
     assert "test file" in test_file.read_text()  # Original content restored
 
 
@@ -730,17 +697,16 @@ def test_view_directory_with_hidden_files(tmp_path):
 
     # Verify output
     assert isinstance(result, FileEditorObservation)
-    assert str(test_dir) in get_output_text(result)
-    assert "visible.txt" in get_output_text(result)  # Visible file is shown
-    assert "visible_dir" in get_output_text(result)  # Visible directory is shown
-    assert ".hidden1" not in get_output_text(result)  # Hidden files not shown
-    assert ".hidden2" not in get_output_text(result)
-    assert ".hidden_dir" not in get_output_text(result)
+    assert str(test_dir) in result.text
+    assert "visible.txt" in result.text  # Visible file is shown
+    assert "visible_dir" in result.text  # Visible directory is shown
+    assert ".hidden1" not in result.text  # Hidden files not shown
+    assert ".hidden2" not in result.text
+    assert ".hidden_dir" not in result.text
     assert (
-        "3 hidden files/directories in this directory are excluded"
-        in get_output_text(result)
+        "3 hidden files/directories in this directory are excluded" in result.text
     )  # Shows count of hidden items in current dir only
-    assert "ls -la" in get_output_text(result)  # Shows command to view hidden files
+    assert "ls -la" in result.text  # Shows command to view hidden files
 
 
 def test_view_symlinked_directory(tmp_path):
@@ -766,11 +732,11 @@ def test_view_symlinked_directory(tmp_path):
 
     # Verify that all files are listed through the symlink
     assert isinstance(result, FileEditorObservation)
-    assert str(symlink_dir) in get_output_text(result)
-    assert "file1.txt" in get_output_text(result)
-    assert "file2.txt" in get_output_text(result)
-    assert "subdir" in get_output_text(result)
-    assert "file3.txt" in get_output_text(result)
+    assert str(symlink_dir) in result.text
+    assert "file1.txt" in result.text
+    assert "file2.txt" in result.text
+    assert "subdir" in result.text
+    assert "file3.txt" in result.text
 
 
 def test_view_large_directory_with_truncation(editor, tmp_path):
@@ -783,7 +749,7 @@ def test_view_large_directory_with_truncation(editor, tmp_path):
 
     result = editor(command="view", path=str(large_dir))
     assert isinstance(result, FileEditorObservation)
-    assert DIRECTORY_CONTENT_TRUNCATED_NOTICE in get_output_text(result)
+    assert DIRECTORY_CONTENT_TRUNCATED_NOTICE in result.text
 
 
 def test_view_directory_on_hidden_path(tmp_path):
@@ -825,23 +791,22 @@ def test_view_directory_on_hidden_path(tmp_path):
     # Verify output
     assert isinstance(result, FileEditorObservation)
     # Depth 1: Visible files/dirs shown, hidden files/dirs not shown
-    assert "visible1.txt" in get_output_text(result)
-    assert "visible_dir" in get_output_text(result)
-    assert ".hidden1" not in get_output_text(result)
-    assert ".hidden_dir" not in get_output_text(result)
+    assert "visible1.txt" in result.text
+    assert "visible_dir" in result.text
+    assert ".hidden1" not in result.text
+    assert ".hidden_dir" not in result.text
 
     # Depth 2: Files in visible_dir shown
-    assert "visible2.txt" in get_output_text(result)
-    assert ".hidden2" not in get_output_text(result)
+    assert "visible2.txt" in result.text
+    assert ".hidden2" not in result.text
 
     # Depth 2: Files in hidden_dir not shown
-    assert "visible3.txt" not in get_output_text(result)
-    assert ".hidden3" not in get_output_text(result)
+    assert "visible3.txt" not in result.text
+    assert ".hidden3" not in result.text
 
     # Hidden file count only includes depth 1
     assert (
-        "2 hidden files/directories in this directory are excluded"
-        in get_output_text(result)
+        "2 hidden files/directories in this directory are excluded" in result.text
     )  # Only .hidden1 and .hidden_dir at depth 1
 
 
@@ -854,7 +819,7 @@ def test_view_large_file_with_truncation(editor, tmp_path):
 
     result = editor(command="view", path=str(large_file))
     assert isinstance(result, FileEditorObservation)
-    assert TEXT_FILE_CONTENT_TRUNCATED_NOTICE in get_output_text(result)
+    assert TEXT_FILE_CONTENT_TRUNCATED_NOTICE in result.text
 
 
 def test_validate_path_suggests_absolute_path(editor, tmp_path):
@@ -903,8 +868,8 @@ def test_str_replace_and_insert_snippet_output_on_a_large_file(editor):
 
     # View file
     result = editor(command="view", path=str(test_file))
-    assert "     1\tLine 1" in get_output_text(result)
-    assert "   500\tLine 500" in get_output_text(result)
+    assert "     1\tLine 1" in result.text
+    assert "   500\tLine 500" in result.text
 
     # Replace line 500's content with '500 new'
     result = editor(
@@ -913,14 +878,14 @@ def test_str_replace_and_insert_snippet_output_on_a_large_file(editor):
         old_str="Line 500",
         new_str="500 new",
     )
-    assert "   500\t500 new" in get_output_text(result)
+    assert "   500\t500 new" in result.text
 
     # Delete the line '500 new'
     result = editor(
         command="str_replace", path=str(test_file), old_str="500 new\n", new_str=""
     )
-    assert "   499\tLine 499" in get_output_text(result)
-    assert "   500\tLine 501" in get_output_text(result)
+    assert "   499\tLine 499" in result.text
+    assert "   500\tLine 501" in result.text
 
     # Insert content at line 500
     result = editor(
@@ -929,4 +894,4 @@ def test_str_replace_and_insert_snippet_output_on_a_large_file(editor):
         insert_line=499,
         new_str="Inserted line at 500",
     )
-    assert "   500\tInserted line at 500" in get_output_text(result)
+    assert "   500\tInserted line at 500" in result.text
diff --git a/tests/tools/file_editor/test_error_handling.py b/tests/tools/file_editor/test_error_handling.py
index 655daa3263..f705a2a09d 100644
--- a/tests/tools/file_editor/test_error_handling.py
+++ b/tests/tools/file_editor/test_error_handling.py
@@ -2,7 +2,7 @@
 
 from openhands.tools.file_editor.impl import file_editor
 
-from .conftest import assert_error_result, get_output_text
+from .conftest import assert_error_result
 
 
 def test_validation_error_formatting():
@@ -12,7 +12,7 @@ def test_validation_error_formatting():
         path="/nonexistent/file.txt",
     )
     assert_error_result(result)
-    assert result.is_error and "does not exist" in get_output_text(result)
+    assert result.is_error and "does not exist" in result.text
 
     # Test directory validation for non-view commands
     result = file_editor(
@@ -22,10 +22,7 @@ def test_validation_error_formatting():
         new_str="new",
     )
     assert_error_result(result)
-    assert (
-        result.is_error
-        and "directory and only the `view` command" in get_output_text(result)
-    )
+    assert result.is_error and "directory and only the `view` command" in result.text
 
 
 def test_str_replace_error_handling(temp_file):
@@ -43,7 +40,7 @@ def test_str_replace_error_handling(temp_file):
         new_str="something",
     )
     assert_error_result(result)
-    assert result.is_error and "did not appear verbatim" in get_output_text(result)
+    assert result.is_error and "did not appear verbatim" in result.text
 
     # Test multiple occurrences
     with open(temp_file, "w") as f:
@@ -56,8 +53,8 @@ def test_str_replace_error_handling(temp_file):
         new_str="new_line",
     )
     assert_error_result(result)
-    assert result.is_error and "Multiple occurrences" in get_output_text(result)
-    assert result.is_error and "lines [1, 2]" in get_output_text(result)
+    assert result.is_error and "Multiple occurrences" in result.text
+    assert result.is_error and "lines [1, 2]" in result.text
 
 
 def test_view_range_validation(temp_file):
@@ -74,9 +71,7 @@ def test_view_range_validation(temp_file):
         view_range=[1],  # Should be [start, end]
     )
     assert_error_result(result)
-    assert result.is_error and "should be a list of two integers" in get_output_text(
-        result
-    )
+    assert result.is_error and "should be a list of two integers" in result.text
 
     # Test out of bounds range: should clamp to file end and show a warning
     result = file_editor(
@@ -88,7 +83,7 @@ def test_view_range_validation(temp_file):
     assert not result.is_error
     assert (
         "NOTE: We only show up to 3 since there're only 3 lines in this file."
-        in get_output_text(result)
+        in result.text
     )
 
     # Test invalid range order
@@ -98,9 +93,7 @@ def test_view_range_validation(temp_file):
         view_range=[3, 1],  # End before start
     )
     assert_error_result(result)
-    assert result.is_error and "should be greater than or equal to" in get_output_text(
-        result
-    )
+    assert result.is_error and "should be greater than or equal to" in result.text
 
 
 def test_insert_validation(temp_file):
@@ -118,7 +111,7 @@ def test_insert_validation(temp_file):
         new_str="new line",
     )
     assert_error_result(result)
-    assert result.is_error and "should be within the range" in get_output_text(result)
+    assert result.is_error and "should be within the range" in result.text
 
     # Test insert beyond file length
     result = file_editor(
@@ -128,7 +121,7 @@ def test_insert_validation(temp_file):
         new_str="new line",
     )
     assert_error_result(result)
-    assert result.is_error and "should be within the range" in get_output_text(result)
+    assert result.is_error and "should be within the range" in result.text
 
 
 def test_undo_validation(temp_file):
@@ -144,4 +137,4 @@ def test_undo_validation(temp_file):
         path=temp_file,
     )
     assert_error_result(result)
-    assert result.is_error and "No edit history found" in get_output_text(result)
+    assert result.is_error and "No edit history found" in result.text
diff --git a/tests/tools/file_editor/test_file_editor_tool.py b/tests/tools/file_editor/test_file_editor_tool.py
index 4c3a02948e..f5963d34b7 100644
--- a/tests/tools/file_editor/test_file_editor_tool.py
+++ b/tests/tools/file_editor/test_file_editor_tool.py
@@ -16,8 +16,6 @@
     FileEditorTool,
 )
 
-from .conftest import get_output_text
-
 
 def _create_test_conv_state(temp_dir: str) -> ConversationState:
     """Helper to create a test conversation state."""
@@ -97,9 +95,9 @@ def test_file_editor_tool_view_file():
         assert result is not None
         assert isinstance(result, FileEditorObservation)
         assert not result.is_error
-        assert "Line 1" in get_output_text(result)
-        assert "Line 2" in get_output_text(result)
-        assert "Line 3" in get_output_text(result)
+        assert "Line 1" in result.text
+        assert "Line 2" in result.text
+        assert "Line 3" in result.text
 
 
 def test_file_editor_tool_str_replace():
@@ -180,8 +178,8 @@ def test_file_editor_tool_view_directory():
         assert result is not None
         assert isinstance(result, FileEditorObservation)
         assert not result.is_error
-        assert "file1.txt" in get_output_text(result)
-        assert "file2.txt" in get_output_text(result)
+        assert "file1.txt" in result.text
+        assert "file2.txt" in result.text
 
 
 def test_file_editor_tool_includes_working_directory_in_description():
diff --git a/tests/tools/file_editor/test_memory_usage.py b/tests/tools/file_editor/test_memory_usage.py
index 3e2a973437..109d551f7d 100644
--- a/tests/tools/file_editor/test_memory_usage.py
+++ b/tests/tools/file_editor/test_memory_usage.py
@@ -11,7 +11,7 @@
 
 from openhands.tools.file_editor import file_editor
 
-from .conftest import assert_successful_result, get_output_text
+from .conftest import assert_successful_result
 
 
 # Apply the forked marker and serialize execution across workers
@@ -71,7 +71,7 @@ def test_file_read_memory_usage(temp_file):
 
     # Pull output before measuring and drop references to encourage GC
     assert_successful_result(result)
-    content = get_output_text(result)
+    content = result.text
     del result
     gc.collect()
 
diff --git a/tests/tools/file_editor/test_view_supported_binary_files.py b/tests/tools/file_editor/test_view_supported_binary_files.py
index f8938e0622..bbd0dc027c 100644
--- a/tests/tools/file_editor/test_view_supported_binary_files.py
+++ b/tests/tools/file_editor/test_view_supported_binary_files.py
@@ -6,7 +6,7 @@
     FileEditorObservation,
 )
 
-from .conftest import assert_successful_result, get_output_text
+from .conftest import assert_successful_result
 
 
 def test_view_pdf_file():
@@ -74,16 +74,12 @@ def test_view_pdf_file():
 
         assert isinstance(result, FileEditorObservation)
         assert_successful_result(result)
-        assert (
-            f"Here's the result of running `cat -n` on {test_file}"
-            in get_output_text(result)
-        )
+        assert f"Here's the result of running `cat -n` on {test_file}" in result.text
 
         # Check for specific content present in the PDF
-        assert get_output_text(
-            result
-        ) is not None and "Printer-Friendly Caltrain Schedule" in get_output_text(
-            result
+        assert (
+            result.text is not None
+            and "Printer-Friendly Caltrain Schedule" in result.text
         )
     finally:
         # Clean up the temporary file
diff --git a/tests/tools/file_editor/utils/test_encoding.py b/tests/tools/file_editor/utils/test_encoding.py
index 0bfce2c26b..361606fdf1 100644
--- a/tests/tools/file_editor/utils/test_encoding.py
+++ b/tests/tools/file_editor/utils/test_encoding.py
@@ -15,8 +15,6 @@
     with_encoding,
 )
 
-from ..conftest import get_output_text
-
 
 @pytest.fixture
 def temp_file():
@@ -289,15 +287,9 @@ def test_view_non_utf8_file(temp_non_utf8_file):
     # Parse the result - now using direct access
 
     # Verify the content was read correctly
-    assert get_output_text(result) is not None and "Привет, мир!" in get_output_text(
-        result
-    )
-    assert get_output_text(
-        result
-    ) is not None and "Тестовый файл с кириллицей" in get_output_text(result)
-    assert get_output_text(
-        result
-    ) is not None and "Это тестовая строка" in get_output_text(result)
+    assert result.text is not None and "Привет, мир!" in result.text
+    assert result.text is not None and "Тестовый файл с кириллицей" in result.text
+    assert result.text is not None and "Это тестовая строка" in result.text
 
 
 def test_view_range_non_utf8_file(temp_non_utf8_file):
@@ -313,17 +305,11 @@ def test_view_range_non_utf8_file(temp_non_utf8_file):
     # Parse the result - now using direct access
 
     # Verify the content was read correctly
-    assert get_output_text(
-        result
-    ) is not None and "Тестовый файл с кириллицей" in get_output_text(result)
-    assert get_output_text(result) is not None and "Привет, мир!" in get_output_text(
-        result
-    )
+    assert result.text is not None and "Тестовый файл с кириллицей" in result.text
+    assert result.text is not None and "Привет, мир!" in result.text
 
     # Verify that line 6 is not included
-    assert get_output_text(
-        result
-    ) is not None and "Это тестовая строка" not in get_output_text(result)
+    assert result.text is not None and "Это тестовая строка" not in result.text
 
 
 def test_str_replace_non_utf8_file(temp_non_utf8_file):
@@ -340,12 +326,8 @@ def test_str_replace_non_utf8_file(temp_non_utf8_file):
     # Parse the result - now using direct access
 
     # Verify the replacement was successful
-    assert get_output_text(
-        result
-    ) is not None and "Здравствуй, мир!" in get_output_text(result)
-    assert get_output_text(
-        result
-    ) is not None and "Привет, мир!" not in get_output_text(result)
+    assert result.text is not None and "Здравствуй, мир!" in result.text
+    assert result.text is not None and "Привет, мир!" not in result.text
 
     # Verify the file was saved with the correct encoding
     with open(temp_non_utf8_file, "rb") as f:
@@ -372,9 +354,7 @@ def test_insert_non_utf8_file(temp_non_utf8_file):
     # Parse the result - now using direct access
 
     # Verify the insertion was successful
-    assert get_output_text(
-        result
-    ) is not None and "Новая переменная" in get_output_text(result)
+    assert result.text is not None and "Новая переменная" in result.text
 
     # Verify the file was saved with the correct encoding
     with open(temp_non_utf8_file, "rb") as f:
@@ -411,9 +391,7 @@ def test_create_non_utf8_file():
         # Parse the result - now using direct access
 
         # Verify the file was created successfully
-        assert get_output_text(
-            result
-        ) is not None and "File created successfully" in get_output_text(result)
+        assert result.text is not None and "File created successfully" in result.text
 
         # Read the file with cp1251 encoding to verify content
         encoding_manager = EncodingManager()
@@ -453,9 +431,7 @@ def test_undo_edit_non_utf8_file(temp_non_utf8_file):
     # Parse the result - now using direct access
 
     # Verify the undo was successful
-    assert get_output_text(
-        result
-    ) is not None and "undone successfully" in get_output_text(result)
+    assert result.text is not None and "undone successfully" in result.text
 
     # Verify the original content was restored with the correct encoding
     with open(temp_non_utf8_file, "rb") as f:
@@ -477,9 +453,7 @@ def test_complex_workflow_non_utf8_file(temp_non_utf8_file):
         path=str(temp_non_utf8_file),
     )
     # Parse the result - now using direct access
-    assert get_output_text(result) is not None and "Привет, мир!" in get_output_text(
-        result
-    )
+    assert result.text is not None and "Привет, мир!" in result.text
 
     # 2. Replace text
     result = file_editor(
@@ -489,9 +463,7 @@ def test_complex_workflow_non_utf8_file(temp_non_utf8_file):
         new_str="Здравствуй, мир!",
     )
     # Parse the result - now using direct access
-    assert get_output_text(
-        result
-    ) is not None and "Здравствуй, мир!" in get_output_text(result)
+    assert result.text is not None and "Здравствуй, мир!" in result.text
 
     # 3. Insert text
     result = file_editor(
@@ -501,9 +473,7 @@ def test_complex_workflow_non_utf8_file(temp_non_utf8_file):
         new_str="# Добавленная строка\nboolean_var = True",
     )
     # Parse the result - now using direct access
-    assert get_output_text(
-        result
-    ) is not None and "Добавленная строка" in get_output_text(result)
+    assert result.text is not None and "Добавленная строка" in result.text
 
     # 4. View specific range
     result = file_editor(
@@ -512,12 +482,8 @@ def test_complex_workflow_non_utf8_file(temp_non_utf8_file):
         view_range=[5, 7],
     )
     # Parse the result - now using direct access
-    assert get_output_text(
-        result
-    ) is not None and "Добавленная строка" in get_output_text(result)
-    assert get_output_text(
-        result
-    ) is not None and "boolean_var = True" in get_output_text(result)
+    assert result.text is not None and "Добавленная строка" in result.text
+    assert result.text is not None and "boolean_var = True" in result.text
 
     # 5. Undo the last edit
     result = file_editor(
@@ -525,9 +491,7 @@ def test_complex_workflow_non_utf8_file(temp_non_utf8_file):
         path=str(temp_non_utf8_file),
     )
     # Parse the result - now using direct access
-    assert get_output_text(
-        result
-    ) is not None and "undone successfully" in get_output_text(result)
+    assert result.text is not None and "undone successfully" in result.text
 
     # 6. Verify the file content after all operations
     with open(temp_non_utf8_file, "rb") as f:
@@ -566,7 +530,7 @@ def test_mixed_encoding_workflow():
             path=path1,
         )
         # Parse the result - now using direct access
-        assert "Текст в кодировке CP1251" in get_output_text(result1)
+        assert "Текст в кодировке CP1251" in result1.text
 
         # 2. View the UTF-8 file
         result2 = file_editor(
@@ -574,7 +538,7 @@ def test_mixed_encoding_workflow():
             path=path2,
         )
         # Parse the result - now using direct access
-        assert "Текст в кодировке UTF-8" in get_output_text(result2)
+        assert "Текст в кодировке UTF-8" in result2.text
 
         # 3. Edit the cp1251 file
         result3 = file_editor(
@@ -584,7 +548,7 @@ def test_mixed_encoding_workflow():
             new_str="Измененный текст в CP1251",
         )
         # Parse the result - now using direct access
-        assert "Измененный текст в CP1251" in get_output_text(result3)
+        assert "Измененный текст в CP1251" in result3.text
 
         # 4. Edit the UTF-8 file
         result4 = file_editor(
@@ -594,7 +558,7 @@ def test_mixed_encoding_workflow():
             new_str="Измененный текст в UTF-8",
         )
         # Parse the result - now using direct access
-        assert "Измененный текст в UTF-8" in get_output_text(result4)
+        assert "Измененный текст в UTF-8" in result4.text
 
         # 5. Verify both files maintain their original encodings
         with open(path1, "rb") as f:

From 1d0e7f4a95c776211a418d1d2b671ed3317ce155 Mon Sep 17 00:00:00 2001
From: Xingyao Wang <xingyao@all-hands.dev>
Date: Wed, 5 Nov 2025 15:06:34 -0500
Subject: [PATCH 76/76] fix test: assert error header as its own TextContent

---
 tests/tools/execute_bash/test_observation_truncation.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/tools/execute_bash/test_observation_truncation.py b/tests/tools/execute_bash/test_observation_truncation.py
index 8a6433647e..9e61976310 100644
--- a/tests/tools/execute_bash/test_observation_truncation.py
+++ b/tests/tools/execute_bash/test_observation_truncation.py
@@ -98,11 +98,12 @@ def test_execute_bash_observation_truncation_with_error():
     result = observation.to_llm_content
     assert len(result) == 2
     assert isinstance(result[0], TextContent)
+    assert result[0].text == ExecuteBashObservation.ERROR_MESSAGE_HEADER
+
     assert isinstance(result[1], TextContent)
     result = result[1].text
 
-    # The result should be truncated and have error prefix
-    assert result.startswith("[There was an error during command execution.]")
+    # The result should be truncated
     assert len(result) < len(long_output) + 300  # Account for metadata and error prefix
     # With head-and-tail truncation, should end with original content + metadata
     expected_end = (