Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 24 additions & 9 deletions openhands-tools/openhands/tools/browser_use/definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,29 @@
# Maximum output size for browser observations
MAX_BROWSER_OUTPUT_SIZE = 50000

# Known base64 encodings of image file signatures, mapped to MIME types.
BASE64_IMAGE_PREFIXES = {
    "/9j/": "image/jpeg",
    "iVBORw0KGgo": "image/png",
    "R0lGODlh": "image/gif",
    "UklGR": "image/webp",
}


def detect_image_mime_type(base64_data: str) -> str:
    """Infer the MIME type of base64-encoded image bytes.

    Matching is done against the well-known base64 encodings of the
    JPEG, PNG, GIF, and WebP file signatures.

    Args:
        base64_data: Base64-encoded image data

    Returns:
        The MIME type of the first matching signature prefix, or
        "image/png" when no known prefix matches.
    """
    return next(
        (
            mime_type
            for prefix, mime_type in BASE64_IMAGE_PREFIXES.items()
            if base64_data.startswith(prefix)
        ),
        "image/png",
    )


class BrowserObservation(Observation):
"""Base observation for browser operations."""
Expand All @@ -48,15 +71,7 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
)

if self.screenshot_data:
mime_type = "image/png"
if self.screenshot_data.startswith("/9j/"):
mime_type = "image/jpeg"
elif self.screenshot_data.startswith("iVBORw0KGgo"):
mime_type = "image/png"
elif self.screenshot_data.startswith("R0lGODlh"):
mime_type = "image/gif"
elif self.screenshot_data.startswith("UklGR"):
mime_type = "image/webp"
mime_type = detect_image_mime_type(self.screenshot_data)
# Convert base64 to data URL format for ImageContent
data_url = f"data:{mime_type};base64,{self.screenshot_data}"
llm_content.append(ImageContent(image_urls=[data_url]))
Expand Down
19 changes: 18 additions & 1 deletion openhands-tools/openhands/tools/file_editor/definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,11 +209,28 @@ def create(
# Initialize the executor
executor = FileEditorExecutor(workspace_root=conv_state.workspace.working_dir)

# Build the tool description with conditional image viewing support
# Split TOOL_DESCRIPTION to insert image viewing line after the second bullet
description_lines = TOOL_DESCRIPTION.split("\n")
base_description = "\n".join(description_lines[:2]) # First two lines
remaining_description = "\n".join(description_lines[2:]) # Rest of description

# Add image viewing line if LLM supports vision
if conv_state.agent.llm.vision_is_active():
tool_description = (
f"{base_description}\n"
"* If `path` is an image file (.png, .jpg, .jpeg, .gif, .webp, "
".bmp), `view` displays the image content\n"
f"{remaining_description}"
)
else:
tool_description = TOOL_DESCRIPTION

# Add working directory information to the tool description
# to guide the agent to use the correct directory instead of root
working_dir = conv_state.workspace.working_dir
enhanced_description = (
f"{TOOL_DESCRIPTION}\n\n"
f"{tool_description}\n\n"
f"Your current working directory is: {working_dir}\n"
f"When exploring project structure, start with this directory "
f"instead of the root filesystem."
Expand Down
39 changes: 37 additions & 2 deletions openhands-tools/openhands/tools/file_editor/editor.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import base64
import mimetypes
import os
import re
import shutil
Expand All @@ -7,6 +9,7 @@

from binaryornot.check import is_binary

from openhands.sdk import ImageContent, TextContent
from openhands.sdk.logger import get_logger
from openhands.sdk.utils.truncate import maybe_truncate
from openhands.tools.file_editor.definition import (
Expand Down Expand Up @@ -36,6 +39,9 @@

logger = get_logger(__name__)

# Supported image extensions for viewing as base64-encoded content
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"}


class FileEditor:
"""
Expand Down Expand Up @@ -327,6 +333,34 @@ def view(
prev_exist=True,
)

# Check if the file is an image
file_extension = path.suffix.lower()
if file_extension in IMAGE_EXTENSIONS:
# Read image file as base64
try:
with open(path, "rb") as f:
image_bytes = f.read()
image_base64 = base64.b64encode(image_bytes).decode("utf-8")

mime_type, _ = mimetypes.guess_type(str(path))
if not mime_type or not mime_type.startswith("image/"):
mime_type = "image/png"
output_msg = (
f"Image file {path} read successfully. Displaying image content."
)
image_url = f"data:{mime_type};base64,{image_base64}"
return FileEditorObservation(
command="view",
content=[
TextContent(text=output_msg),
ImageContent(image_urls=[image_url]),
],
path=str(path),
prev_exist=True,
)
except Exception as e:
raise ToolError(f"Failed to read image file {path}: {e}") from None

# Validate file and count lines
self.validate_file(path)
num_lines = self._count_lines(path)
Expand Down Expand Up @@ -609,8 +643,9 @@ def validate_file(self, path: Path) -> None:
),
)

# Check file type
if is_binary(str(path)):
# Check file type - allow image files
file_extension = path.suffix.lower()
if is_binary(str(path)) and file_extension not in IMAGE_EXTENSIONS:
raise FileValidationError(
path=str(path),
reason=(
Expand Down
12 changes: 12 additions & 0 deletions tests/integration/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,23 @@
from openhands.sdk.tool import Tool


class SkipTest(Exception):
    """Signal that an integration test should be skipped.

    Raised by tests whose prerequisites (for example a vision-capable
    LLM) are not met by the model under evaluation, so the runner can
    record a skip instead of a failure.
    """


class TestResult(BaseModel):
    """Result of an integration test."""

    # Whether the test passed its verification step.
    success: bool
    # Human-readable explanation on failure or skip; None on plain success.
    reason: str | None = None
    # True when the test was skipped (e.g. a required LLM capability is missing).
    skipped: bool = False


class BaseIntegrationTest(ABC):
Expand Down
24 changes: 22 additions & 2 deletions tests/integration/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from pydantic import BaseModel, ConfigDict

from openhands.sdk.logger import get_logger
from tests.integration.base import BaseIntegrationTest, TestResult
from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult
from tests.integration.schemas import ModelTestResults
from tests.integration.utils.format_costs import format_cost

Expand Down Expand Up @@ -171,6 +171,20 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval
log_file_path=log_file_path,
)

except SkipTest as e:
# Test should be skipped (e.g., LLM doesn't support required capabilities)
logger.info("Test %s skipped: %s", instance.instance_id, str(e))
return EvalOutput(
instance_id=instance.instance_id,
test_result=TestResult(
success=False,
reason=str(e),
skipped=True,
),
llm_model=llm_config.get("model", "unknown"),
cost=0.0,
)

except Exception as e:
logger.error("Error running test %s: %s", instance.instance_id, e)
return EvalOutput(
Expand Down Expand Up @@ -274,11 +288,17 @@ def generate_structured_results(
# Print summary for console output
success_rate = structured_results.success_rate
successful = structured_results.successful_tests
skipped = structured_results.skipped_tests
total = structured_results.total_tests
logger.info("Success rate: %.2f%% (%d/%d)", success_rate * 100, successful, total)
if skipped > 0:
logger.info("Skipped tests: %d", skipped)
logger.info("Evaluation Results:")
for instance in structured_results.test_instances:
status = "✓" if instance.test_result.success else "✗"
if instance.test_result.skipped:
status = "⊘" # Skipped symbol
else:
status = "✓" if instance.test_result.success else "✗"
reason = instance.test_result.reason or "N/A"
logger.info("%s: %s - %s", instance.instance_id, status, reason)
logger.info("Total cost: %s", format_cost(structured_results.total_cost))
Expand Down
5 changes: 5 additions & 0 deletions tests/integration/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class TestResultData(BaseModel):

success: bool
reason: str | None = None
skipped: bool = False


class TestInstanceResult(BaseModel):
Expand All @@ -46,6 +47,7 @@ class ModelTestResults(BaseModel):
# Summary statistics
total_tests: int
successful_tests: int
skipped_tests: int
success_rate: float
total_cost: float

Expand Down Expand Up @@ -75,6 +77,7 @@ def from_eval_outputs(
test_result=TestResultData(
success=output.test_result.success,
reason=output.test_result.reason,
skipped=output.test_result.skipped,
),
cost=output.cost,
error_message=output.error_message,
Expand All @@ -84,6 +87,7 @@ def from_eval_outputs(
# Calculate summary statistics
total_tests = len(test_instances)
successful_tests = sum(1 for t in test_instances if t.test_result.success)
skipped_tests = sum(1 for t in test_instances if t.test_result.skipped)
success_rate = successful_tests / total_tests if total_tests > 0 else 0.0
total_cost = sum(t.cost for t in test_instances)

Expand All @@ -94,6 +98,7 @@ def from_eval_outputs(
test_instances=test_instances,
total_tests=total_tests,
successful_tests=successful_tests,
skipped_tests=skipped_tests,
success_rate=success_rate,
total_cost=total_cost,
eval_note=eval_note,
Expand Down
92 changes: 92 additions & 0 deletions tests/integration/tests/t08_image_file_viewing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""Test that an agent can view and analyze image files using FileEditor."""

import os
import urllib.request

from openhands.sdk import TextContent, get_logger
from openhands.sdk.event.llm_convertible import MessageEvent
from openhands.sdk.tool import Tool, register_tool
from openhands.tools.file_editor import FileEditorTool
from openhands.tools.terminal import TerminalTool
from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult


# Prompt given to the agent; asks it to open logo.png and name its colors.
INSTRUCTION = (
    "Please view the logo.png file in the current directory and tell me what "
    "colors you see in it. Is the logo blue, yellow, or green? Please analyze "
    "the image and provide your answer."
)

# Source of the test fixture image (the OpenHands logo).
IMAGE_URL = "https://github.com/OpenHands/docs/raw/main/openhands/static/img/logo.png"

logger = get_logger(__name__)


class ImageFileViewingTest(BaseIntegrationTest):
    """Test that an agent can view and analyze image files."""

    INSTRUCTION: str = INSTRUCTION

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Where the fixture image is placed inside the agent's workspace.
        self.logo_path: str = os.path.join(self.workspace, "logo.png")

        # Bail out early when the model cannot consume image input.
        if not self.llm.vision_is_active():
            raise SkipTest(
                "This test requires a vision-capable LLM model. "
                "Please use a model that supports image input."
            )

    @property
    def tools(self) -> list[Tool]:
        """List of tools available to the agent."""
        for tool_name, tool_cls in (
            ("TerminalTool", TerminalTool),
            ("FileEditorTool", FileEditorTool),
        ):
            register_tool(tool_name, tool_cls)
        return [Tool(name="TerminalTool"), Tool(name="FileEditorTool")]

    def setup(self) -> None:
        """Download the OpenHands logo for the agent to analyze."""
        try:
            urllib.request.urlretrieve(IMAGE_URL, self.logo_path)
        except Exception as e:
            logger.error(f"Failed to download logo: {e}")
            raise
        else:
            logger.info(f"Downloaded test logo to: {self.logo_path}")

    def verify_result(self) -> TestResult:
        """Verify that the agent identified yellow as one of the logo colors."""
        if not os.path.exists(self.logo_path):
            return TestResult(
                success=False, reason="Logo file not found after agent execution"
            )

        # Gather every assistant-authored text snippet (lowercased)
        # from the collected events of the run.
        snippets = [
            part.text.lower()
            for event in self.collected_events
            if isinstance(event, MessageEvent)
            and event.llm_message.role == "assistant"
            for part in event.llm_message.content
            if isinstance(part, TextContent)
        ]
        combined_response = " ".join(snippets)

        if "yellow" not in combined_response:
            return TestResult(
                success=False,
                reason=(
                    f"Agent did not identify yellow color in the logo. "
                    f"Response: {combined_response[:500]}"
                ),
            )
        return TestResult(
            success=True,
            reason="Agent successfully identified yellow color in the logo",
        )
Loading
Loading