Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 24 additions & 9 deletions openhands-tools/openhands/tools/browser_use/definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,29 @@
# Maximum output size for browser observations
MAX_BROWSER_OUTPUT_SIZE = 50000

# Known base64 encodings of image file signatures, mapped to MIME types.
BASE64_IMAGE_PREFIXES = {
    "/9j/": "image/jpeg",
    "iVBORw0KGgo": "image/png",
    "R0lGODlh": "image/gif",
    "UklGR": "image/webp",
}


def detect_image_mime_type(base64_data: str) -> str:
    """Infer the MIME type of base64-encoded image bytes.

    Matching is done against the well-known base64 encodings of the
    JPEG, PNG, GIF, and WebP file signatures.

    Args:
        base64_data: Base64-encoded image data

    Returns:
        The MIME type of the first matching signature prefix, or
        "image/png" when no known prefix matches.
    """
    return next(
        (
            mime_type
            for prefix, mime_type in BASE64_IMAGE_PREFIXES.items()
            if base64_data.startswith(prefix)
        ),
        "image/png",
    )


class BrowserObservation(Observation):
"""Base observation for browser operations."""
Expand All @@ -48,15 +71,7 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
)

if self.screenshot_data:
mime_type = "image/png"
if self.screenshot_data.startswith("/9j/"):
mime_type = "image/jpeg"
elif self.screenshot_data.startswith("iVBORw0KGgo"):
mime_type = "image/png"
elif self.screenshot_data.startswith("R0lGODlh"):
mime_type = "image/gif"
elif self.screenshot_data.startswith("UklGR"):
mime_type = "image/webp"
mime_type = detect_image_mime_type(self.screenshot_data)
# Convert base64 to data URL format for ImageContent
data_url = f"data:{mime_type};base64,{self.screenshot_data}"
llm_content.append(ImageContent(image_urls=[data_url]))
Expand Down
19 changes: 18 additions & 1 deletion openhands-tools/openhands/tools/file_editor/definition.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,11 +209,28 @@ def create(
# Initialize the executor
executor = FileEditorExecutor(workspace_root=conv_state.workspace.working_dir)

# Build the tool description with conditional image viewing support
# Split TOOL_DESCRIPTION to insert image viewing line after the second bullet
description_lines = TOOL_DESCRIPTION.split("\n")
base_description = "\n".join(description_lines[:2]) # First two lines
remaining_description = "\n".join(description_lines[2:]) # Rest of description

# Add image viewing line if LLM supports vision
if conv_state.agent.llm.vision_is_active():
tool_description = (
f"{base_description}\n"
"* If `path` is an image file (.png, .jpg, .jpeg, .gif, .webp, "
".bmp), `view` displays the image content\n"
f"{remaining_description}"
)
else:
tool_description = TOOL_DESCRIPTION

# Add working directory information to the tool description
# to guide the agent to use the correct directory instead of root
working_dir = conv_state.workspace.working_dir
enhanced_description = (
f"{TOOL_DESCRIPTION}\n\n"
f"{tool_description}\n\n"
f"Your current working directory is: {working_dir}\n"
f"When exploring project structure, start with this directory "
f"instead of the root filesystem."
Expand Down
39 changes: 37 additions & 2 deletions openhands-tools/openhands/tools/file_editor/editor.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import base64
import mimetypes
import os
import re
import shutil
Expand All @@ -7,6 +9,7 @@

from binaryornot.check import is_binary

from openhands.sdk import ImageContent, TextContent
from openhands.sdk.logger import get_logger
from openhands.sdk.utils.truncate import maybe_truncate
from openhands.tools.file_editor.definition import (
Expand Down Expand Up @@ -36,6 +39,9 @@

logger = get_logger(__name__)

# Supported image extensions for viewing as base64-encoded content
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"}


class FileEditor:
"""
Expand Down Expand Up @@ -327,6 +333,34 @@ def view(
prev_exist=True,
)

# Check if the file is an image
file_extension = path.suffix.lower()
if file_extension in IMAGE_EXTENSIONS:
# Read image file as base64
try:
with open(path, "rb") as f:
image_bytes = f.read()
image_base64 = base64.b64encode(image_bytes).decode("utf-8")

mime_type, _ = mimetypes.guess_type(str(path))
if not mime_type or not mime_type.startswith("image/"):
mime_type = "image/png"
output_msg = (
f"Image file {path} read successfully. Displaying image content."
)
image_url = f"data:{mime_type};base64,{image_base64}"
return FileEditorObservation(
command="view",
content=[
TextContent(text=output_msg),
ImageContent(image_urls=[image_url]),
],
path=str(path),
prev_exist=True,
)
except Exception as e:
raise ToolError(f"Failed to read image file {path}: {e}") from None

# Validate file and count lines
self.validate_file(path)
num_lines = self._count_lines(path)
Expand Down Expand Up @@ -609,8 +643,9 @@ def validate_file(self, path: Path) -> None:
),
)

# Check file type
if is_binary(str(path)):
# Check file type - allow image files
file_extension = path.suffix.lower()
if is_binary(str(path)) and file_extension not in IMAGE_EXTENSIONS:
raise FileValidationError(
path=str(path),
reason=(
Expand Down
12 changes: 12 additions & 0 deletions tests/integration/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,23 @@
from openhands.sdk.tool import Tool


class SkipTest(Exception):
    """Signal that an integration test should be skipped.

    Raised by tests whose prerequisites (for example a vision-capable
    LLM) are not met by the model under evaluation, so the runner can
    record a skip instead of a failure.
    """


class TestResult(BaseModel):
    """Result of an integration test."""

    # Whether the test passed its verification step.
    success: bool
    # Human-readable explanation on failure or skip; None on plain success.
    reason: str | None = None
    # True when the test was skipped (e.g. a required LLM capability is missing).
    skipped: bool = False


class BaseIntegrationTest(ABC):
Expand Down
24 changes: 22 additions & 2 deletions tests/integration/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from pydantic import BaseModel, ConfigDict

from openhands.sdk.logger import get_logger
from tests.integration.base import BaseIntegrationTest, TestResult
from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult
from tests.integration.schemas import ModelTestResults
from tests.integration.utils.format_costs import format_cost

Expand Down Expand Up @@ -171,6 +171,20 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval
log_file_path=log_file_path,
)

except SkipTest as e:
# Test should be skipped (e.g., LLM doesn't support required capabilities)
logger.info("Test %s skipped: %s", instance.instance_id, str(e))
return EvalOutput(
instance_id=instance.instance_id,
test_result=TestResult(
success=False,
reason=str(e),
skipped=True,
),
llm_model=llm_config.get("model", "unknown"),
cost=0.0,
)

except Exception as e:
logger.error("Error running test %s: %s", instance.instance_id, e)
return EvalOutput(
Expand Down Expand Up @@ -274,11 +288,17 @@ def generate_structured_results(
# Print summary for console output
success_rate = structured_results.success_rate
successful = structured_results.successful_tests
skipped = structured_results.skipped_tests
total = structured_results.total_tests
logger.info("Success rate: %.2f%% (%d/%d)", success_rate * 100, successful, total)
if skipped > 0:
logger.info("Skipped tests: %d", skipped)
logger.info("Evaluation Results:")
for instance in structured_results.test_instances:
status = "✓" if instance.test_result.success else "✗"
if instance.test_result.skipped:
status = "⊘" # Skipped symbol
else:
status = "✓" if instance.test_result.success else "✗"
reason = instance.test_result.reason or "N/A"
logger.info("%s: %s - %s", instance.instance_id, status, reason)
logger.info("Total cost: %s", format_cost(structured_results.total_cost))
Expand Down
5 changes: 5 additions & 0 deletions tests/integration/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class TestResultData(BaseModel):

success: bool
reason: str | None = None
skipped: bool = False


class TestInstanceResult(BaseModel):
Expand All @@ -46,6 +47,7 @@ class ModelTestResults(BaseModel):
# Summary statistics
total_tests: int
successful_tests: int
skipped_tests: int
success_rate: float
total_cost: float

Expand Down Expand Up @@ -75,6 +77,7 @@ def from_eval_outputs(
test_result=TestResultData(
success=output.test_result.success,
reason=output.test_result.reason,
skipped=output.test_result.skipped,
),
cost=output.cost,
error_message=output.error_message,
Expand All @@ -84,6 +87,7 @@ def from_eval_outputs(
# Calculate summary statistics
total_tests = len(test_instances)
successful_tests = sum(1 for t in test_instances if t.test_result.success)
skipped_tests = sum(1 for t in test_instances if t.test_result.skipped)
success_rate = successful_tests / total_tests if total_tests > 0 else 0.0
total_cost = sum(t.cost for t in test_instances)

Expand All @@ -94,6 +98,7 @@ def from_eval_outputs(
test_instances=test_instances,
total_tests=total_tests,
successful_tests=successful_tests,
skipped_tests=skipped_tests,
success_rate=success_rate,
total_cost=total_cost,
eval_note=eval_note,
Expand Down
92 changes: 92 additions & 0 deletions tests/integration/tests/t08_image_file_viewing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""Test that an agent can view and analyze image files using FileEditor."""

import os
import urllib.request

from openhands.sdk import TextContent, get_logger
from openhands.sdk.event.llm_convertible import MessageEvent
from openhands.sdk.tool import Tool, register_tool
from openhands.tools.file_editor import FileEditorTool
from openhands.tools.terminal import TerminalTool
from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult


# Prompt given to the agent; asks it to open logo.png and name its colors.
INSTRUCTION = (
    "Please view the logo.png file in the current directory and tell me what "
    "colors you see in it. Is the logo blue, yellow, or green? Please analyze "
    "the image and provide your answer."
)

# Source of the test fixture image (the OpenHands logo).
IMAGE_URL = "https://github.com/OpenHands/docs/raw/main/openhands/static/img/logo.png"

logger = get_logger(__name__)


class ImageFileViewingTest(BaseIntegrationTest):
    """Test that an agent can view and analyze image files."""

    INSTRUCTION: str = INSTRUCTION

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Where the fixture image is placed inside the agent's workspace.
        self.logo_path: str = os.path.join(self.workspace, "logo.png")

        # Bail out early when the model cannot consume image input.
        if not self.llm.vision_is_active():
            raise SkipTest(
                "This test requires a vision-capable LLM model. "
                "Please use a model that supports image input."
            )

    @property
    def tools(self) -> list[Tool]:
        """List of tools available to the agent."""
        for tool_name, tool_cls in (
            ("TerminalTool", TerminalTool),
            ("FileEditorTool", FileEditorTool),
        ):
            register_tool(tool_name, tool_cls)
        return [Tool(name="TerminalTool"), Tool(name="FileEditorTool")]

    def setup(self) -> None:
        """Download the OpenHands logo for the agent to analyze."""
        try:
            urllib.request.urlretrieve(IMAGE_URL, self.logo_path)
        except Exception as e:
            logger.error(f"Failed to download logo: {e}")
            raise
        else:
            logger.info(f"Downloaded test logo to: {self.logo_path}")

    def verify_result(self) -> TestResult:
        """Verify that the agent identified yellow as one of the logo colors."""
        if not os.path.exists(self.logo_path):
            return TestResult(
                success=False, reason="Logo file not found after agent execution"
            )

        # Gather every assistant-authored text snippet (lowercased)
        # from the collected events of the run.
        snippets = [
            part.text.lower()
            for event in self.collected_events
            if isinstance(event, MessageEvent)
            and event.llm_message.role == "assistant"
            for part in event.llm_message.content
            if isinstance(part, TextContent)
        ]
        combined_response = " ".join(snippets)

        if "yellow" not in combined_response:
            return TestResult(
                success=False,
                reason=(
                    f"Agent did not identify yellow color in the logo. "
                    f"Response: {combined_response[:500]}"
                ),
            )
        return TestResult(
            success=True,
            reason="Agent successfully identified yellow color in the logo",
        )
Loading
Loading