diff --git a/openhands-tools/openhands/tools/browser_use/definition.py b/openhands-tools/openhands/tools/browser_use/definition.py index 6a02026b2b..b3463af2bd 100644 --- a/openhands-tools/openhands/tools/browser_use/definition.py +++ b/openhands-tools/openhands/tools/browser_use/definition.py @@ -24,6 +24,29 @@ # Maximum output size for browser observations MAX_BROWSER_OUTPUT_SIZE = 50000 +# Mapping of base64 prefixes to MIME types for image detection +BASE64_IMAGE_PREFIXES = { + "/9j/": "image/jpeg", + "iVBORw0KGgo": "image/png", + "R0lGODlh": "image/gif", + "UklGR": "image/webp", +} + + +def detect_image_mime_type(base64_data: str) -> str: + """Detect MIME type from base64-encoded image data. + + Args: + base64_data: Base64-encoded image data + + Returns: + Detected MIME type, defaults to "image/png" if not detected + """ + for prefix, mime_type in BASE64_IMAGE_PREFIXES.items(): + if base64_data.startswith(prefix): + return mime_type + return "image/png" + class BrowserObservation(Observation): """Base observation for browser operations.""" @@ -48,15 +71,7 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]: ) if self.screenshot_data: - mime_type = "image/png" - if self.screenshot_data.startswith("/9j/"): - mime_type = "image/jpeg" - elif self.screenshot_data.startswith("iVBORw0KGgo"): - mime_type = "image/png" - elif self.screenshot_data.startswith("R0lGODlh"): - mime_type = "image/gif" - elif self.screenshot_data.startswith("UklGR"): - mime_type = "image/webp" + mime_type = detect_image_mime_type(self.screenshot_data) # Convert base64 to data URL format for ImageContent data_url = f"data:{mime_type};base64,{self.screenshot_data}" llm_content.append(ImageContent(image_urls=[data_url])) diff --git a/openhands-tools/openhands/tools/file_editor/definition.py b/openhands-tools/openhands/tools/file_editor/definition.py index ba6fdd7656..02743da6ab 100644 --- a/openhands-tools/openhands/tools/file_editor/definition.py +++ b/openhands-tools/openhands/tools/file_editor/definition.py @@ -209,11 +209,28 @@ def create( # Initialize the executor executor = FileEditorExecutor(workspace_root=conv_state.workspace.working_dir) + # Build the tool description with conditional image viewing support + # Split TOOL_DESCRIPTION to insert image viewing line after the second bullet + description_lines = TOOL_DESCRIPTION.split("\n") + base_description = "\n".join(description_lines[:2]) # First two lines + remaining_description = "\n".join(description_lines[2:]) # Rest of description + + # Add image viewing line if LLM supports vision + if conv_state.agent.llm.vision_is_active(): + tool_description = ( + f"{base_description}\n" + "* If `path` is an image file (.png, .jpg, .jpeg, .gif, .webp, " + ".bmp), `view` displays the image content\n" + f"{remaining_description}" + ) + else: + tool_description = TOOL_DESCRIPTION + # Add working directory information to the tool description # to guide the agent to use the correct directory instead of root working_dir = conv_state.workspace.working_dir enhanced_description = ( - f"{TOOL_DESCRIPTION}\n\n" + f"{tool_description}\n\n" f"Your current working directory is: {working_dir}\n" f"When exploring project structure, start with this directory " f"instead of the root filesystem." diff --git a/openhands-tools/openhands/tools/file_editor/editor.py b/openhands-tools/openhands/tools/file_editor/editor.py index 99adb7a409..e4b455014d 100644 --- a/openhands-tools/openhands/tools/file_editor/editor.py +++ b/openhands-tools/openhands/tools/file_editor/editor.py @@ -1,3 +1,5 @@ +import base64 +import mimetypes import os import re import shutil @@ -7,6 +9,7 @@ from binaryornot.check import is_binary +from openhands.sdk import ImageContent, TextContent from openhands.sdk.logger import get_logger from openhands.sdk.utils.truncate import maybe_truncate from openhands.tools.file_editor.definition import ( @@ -36,6 +39,9 @@ logger = get_logger(__name__) +# Supported image extensions for viewing as base64-encoded content +IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp"} + class FileEditor: """ @@ -327,6 +333,34 @@ def view( prev_exist=True, ) + # Check if the file is an image + file_extension = path.suffix.lower() + if file_extension in IMAGE_EXTENSIONS: + # Read image file as base64 + try: + with open(path, "rb") as f: + image_bytes = f.read() + image_base64 = base64.b64encode(image_bytes).decode("utf-8") + + mime_type, _ = mimetypes.guess_type(str(path)) + if not mime_type or not mime_type.startswith("image/"): + mime_type = "image/png" + output_msg = ( + f"Image file {path} read successfully. Displaying image content." + ) + image_url = f"data:{mime_type};base64,{image_base64}" + return FileEditorObservation( + command="view", + content=[ + TextContent(text=output_msg), + ImageContent(image_urls=[image_url]), + ], + path=str(path), + prev_exist=True, + ) + except Exception as e: + raise ToolError(f"Failed to read image file {path}: {e}") from None + # Validate file and count lines self.validate_file(path) num_lines = self._count_lines(path) @@ -609,8 +643,9 @@ def validate_file(self, path: Path) -> None: ), ) - # Check file type - if is_binary(str(path)): + # Check file type - allow image files + file_extension = path.suffix.lower() + if is_binary(str(path)) and file_extension not in IMAGE_EXTENSIONS: raise FileValidationError( path=str(path), reason=( diff --git a/tests/integration/base.py b/tests/integration/base.py index 7bbc0841ae..4c22a65a2f 100644 --- a/tests/integration/base.py +++ b/tests/integration/base.py @@ -26,11 +26,23 @@ from openhands.sdk.tool import Tool +class SkipTest(Exception): + """ + Exception raised to indicate that a test should be skipped. + + This is useful for tests that require specific capabilities (e.g., vision) + that may not be available in all LLMs. + """ + + pass + + class TestResult(BaseModel): """Result of an integration test.""" success: bool reason: str | None = None + skipped: bool = False class BaseIntegrationTest(ABC): diff --git a/tests/integration/run_infer.py b/tests/integration/run_infer.py index 9107ac35cd..e202409845 100755 --- a/tests/integration/run_infer.py +++ b/tests/integration/run_infer.py @@ -17,7 +17,7 @@ from pydantic import BaseModel, ConfigDict from openhands.sdk.logger import get_logger -from tests.integration.base import BaseIntegrationTest, TestResult +from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult from tests.integration.schemas import ModelTestResults from tests.integration.utils.format_costs import format_cost @@ -171,6 +171,20 @@ def process_instance(instance: TestInstance, llm_config: dict[str, Any]) -> Eval log_file_path=log_file_path, ) + except SkipTest as e: + # Test should be skipped (e.g., LLM doesn't support required capabilities) + logger.info("Test %s skipped: %s", instance.instance_id, str(e)) + return EvalOutput( + instance_id=instance.instance_id, + test_result=TestResult( + success=False, + reason=str(e), + skipped=True, + ), + llm_model=llm_config.get("model", "unknown"), + cost=0.0, + ) + except Exception as e: logger.error("Error running test %s: %s", instance.instance_id, e) return EvalOutput( @@ -274,11 +288,17 @@ def generate_structured_results( # Print summary for console output success_rate = structured_results.success_rate successful = structured_results.successful_tests + skipped = structured_results.skipped_tests total = structured_results.total_tests logger.info("Success rate: %.2f%% (%d/%d)", success_rate * 100, successful, total) + if skipped > 0: + logger.info("Skipped tests: %d", skipped) logger.info("Evaluation Results:") for instance in structured_results.test_instances: - status = "✓" if instance.test_result.success else "✗" + if instance.test_result.skipped: + status = "⊘" # Skipped symbol + else: + status = "✓" if instance.test_result.success else "✗" reason = instance.test_result.reason or "N/A" logger.info("%s: %s - %s", instance.instance_id, status, reason) logger.info("Total cost: %s", format_cost(structured_results.total_cost)) diff --git a/tests/integration/schemas.py b/tests/integration/schemas.py index c557d939b5..dd20c354ab 100644 --- a/tests/integration/schemas.py +++ b/tests/integration/schemas.py @@ -20,6 +20,7 @@ class TestResultData(BaseModel): success: bool reason: str | None = None + skipped: bool = False class TestInstanceResult(BaseModel): @@ -46,6 +47,7 @@ class ModelTestResults(BaseModel): # Summary statistics total_tests: int successful_tests: int + skipped_tests: int success_rate: float total_cost: float @@ -75,6 +77,7 @@ def from_eval_outputs( test_result=TestResultData( success=output.test_result.success, reason=output.test_result.reason, + skipped=output.test_result.skipped, ), cost=output.cost, error_message=output.error_message, @@ -84,6 +87,7 @@ def from_eval_outputs( # Calculate summary statistics total_tests = len(test_instances) successful_tests = sum(1 for t in test_instances if t.test_result.success) + skipped_tests = sum(1 for t in test_instances if t.test_result.skipped) success_rate = successful_tests / total_tests if total_tests > 0 else 0.0 total_cost = sum(t.cost for t in test_instances) @@ -94,6 +98,7 @@ def from_eval_outputs( test_instances=test_instances, total_tests=total_tests, successful_tests=successful_tests, + skipped_tests=skipped_tests, success_rate=success_rate, total_cost=total_cost, eval_note=eval_note, diff --git a/tests/integration/tests/t08_image_file_viewing.py b/tests/integration/tests/t08_image_file_viewing.py new file mode 100644 index 0000000000..d48a04ed37 --- /dev/null +++ b/tests/integration/tests/t08_image_file_viewing.py @@ -0,0 +1,92 @@ +"""Test that an agent can view and analyze image files using FileEditor.""" + +import os +import urllib.request + +from openhands.sdk import TextContent, get_logger +from openhands.sdk.event.llm_convertible import MessageEvent +from openhands.sdk.tool import Tool, register_tool +from openhands.tools.file_editor import FileEditorTool +from openhands.tools.terminal import TerminalTool +from tests.integration.base import BaseIntegrationTest, SkipTest, TestResult + + +INSTRUCTION = ( + "Please view the logo.png file in the current directory and tell me what " + "colors you see in it. Is the logo blue, yellow, or green? Please analyze " + "the image and provide your answer." +) + +IMAGE_URL = "https://github.com/OpenHands/docs/raw/main/openhands/static/img/logo.png" + +logger = get_logger(__name__) + + +class ImageFileViewingTest(BaseIntegrationTest): + """Test that an agent can view and analyze image files.""" + + INSTRUCTION: str = INSTRUCTION + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.logo_path: str = os.path.join(self.workspace, "logo.png") + + # Verify that the LLM supports vision + if not self.llm.vision_is_active(): + raise SkipTest( + "This test requires a vision-capable LLM model. " + "Please use a model that supports image input." + ) + + @property + def tools(self) -> list[Tool]: + """List of tools available to the agent.""" + register_tool("TerminalTool", TerminalTool) + register_tool("FileEditorTool", FileEditorTool) + return [ + Tool(name="TerminalTool"), + Tool(name="FileEditorTool"), + ] + + def setup(self) -> None: + """Download the OpenHands logo for the agent to analyze.""" + try: + urllib.request.urlretrieve(IMAGE_URL, self.logo_path) + logger.info(f"Downloaded test logo to: {self.logo_path}") + except Exception as e: + logger.error(f"Failed to download logo: {e}") + raise + + def verify_result(self) -> TestResult: + """Verify that the agent identified yellow as one of the logo colors.""" + if not os.path.exists(self.logo_path): + return TestResult( + success=False, reason="Logo file not found after agent execution" + ) + + # Check the agent's responses in collected events + # Look for messages mentioning yellow color + agent_responses = [] + for event in self.collected_events: + if isinstance(event, MessageEvent): + message = event.llm_message + if message.role == "assistant": + for content_item in message.content: + if isinstance(content_item, TextContent): + agent_responses.append(content_item.text.lower()) + + combined_response = " ".join(agent_responses) + + if "yellow" in combined_response: + return TestResult( + success=True, + reason="Agent successfully identified yellow color in the logo", + ) + else: + return TestResult( + success=False, + reason=( + f"Agent did not identify yellow color in the logo. " + f"Response: {combined_response[:500]}" + ), + ) diff --git a/tests/integration/utils/generate_markdown_report.py b/tests/integration/utils/generate_markdown_report.py index 0cfaf59e28..4747199f2a 100644 --- a/tests/integration/utils/generate_markdown_report.py +++ b/tests/integration/utils/generate_markdown_report.py @@ -18,20 +18,21 @@ def generate_model_summary_table(model_results: list[ModelTestResults]) -> str: """Generate a summary table for all models.""" table_lines = [ - "| Model | Success Rate | Tests Passed | Total Tests | Cost |", - "|-------|--------------|--------------|-------------|------|", + "| Model | Success Rate | Tests Passed | Skipped | Total Tests | Cost |", + "|-------|--------------|--------------|---------|-------------|------|", ] for result in model_results: success_rate = f"{result.success_rate:.1%}" tests_passed = f"{result.successful_tests}/{result.total_tests}" + skipped = f"{result.skipped_tests}" cost = format_cost(result.total_cost) model_name = result.model_name total_tests = result.total_tests row = ( f"| {model_name} | {success_rate} | {tests_passed} | " - f"{total_tests} | {cost} |" + f"{skipped} | {total_tests} | {cost} |" ) table_lines.append(row) @@ -51,11 +52,35 @@ def generate_detailed_results(model_results: list[ModelTestResults]) -> str: f"({result.successful_tests}/{result.total_tests})", f"- **Total Cost**: {format_cost(result.total_cost)}", f"- **Run Suffix**: `{result.run_suffix}`", - "", ] + if result.skipped_tests > 0: + section_lines.append(f"- **Skipped Tests**: {result.skipped_tests}") + + section_lines.append("") + + # Add skipped tests if any + skipped_tests = [t for t in result.test_instances if t.test_result.skipped] + if skipped_tests: + section_lines.extend( + [ + "**Skipped Tests:**", + "", + ] + ) + + for test in skipped_tests: + reason = test.test_result.reason or "No reason provided" + section_lines.append(f"- `{test.instance_id}`: {reason}") + + section_lines.append("") + # Add failed tests if any - failed_tests = [t for t in result.test_instances if not t.test_result.success] + failed_tests = [ + t + for t in result.test_instances + if not t.test_result.success and not t.test_result.skipped + ] if failed_tests: section_lines.extend( [ diff --git a/tests/tools/browser_use/test_browser_observation.py b/tests/tools/browser_use/test_browser_observation.py index 09992781ee..612ad5bb50 100644 --- a/tests/tools/browser_use/test_browser_observation.py +++ b/tests/tools/browser_use/test_browser_observation.py @@ -112,3 +112,41 @@ def test_browser_observation_empty_screenshot_handling(): observation = BrowserObservation.from_text(text="Test", screenshot_data=None) agent_obs = observation.to_llm_content assert len(agent_obs) == 1 # Only text content, no image + + +def test_browser_observation_mime_type_detection(): + """Test MIME type detection for different image formats.""" + test_cases = [ + ( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg==", # noqa: E501 + "image/png", + ), + ( + "/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/2wBDAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQH/wAARCAABAAEDASIAAhEBAxEB/8QAFQABAQAAAAAAAAAAAAAAAAAAAAv/xAAUEAEAAAAAAAAAAAAAAAAAAAAA/8QAFQEBAQAAAAAAAAAAAAAAAAAAAAX/xAAUEQEAAAAAAAAAAAAAAAAAAAAA/9oADAMBAAIRAxEAPwA/", # noqa: E501 + "image/jpeg", + ), + ( + "R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7", + "image/gif", + ), + ( + "UklGRiQAAABXRUJQVlA4IBgAAAAwAQCdASoBAAEAAQAcJaQAA3AA/v3AgAA=", + "image/webp", + ), + ( + "AAAABBBBCCCC", # Unknown format + "image/png", # Falls back to PNG + ), + ] + + for screenshot_data, expected_mime_type in test_cases: + observation = BrowserObservation.from_text( + text="Test", screenshot_data=screenshot_data + ) + agent_obs = observation.to_llm_content + + assert len(agent_obs) == 2 + assert isinstance(agent_obs[1], ImageContent) + assert ( + agent_obs[1].image_urls[0].startswith(f"data:{expected_mime_type};base64,") + ) diff --git a/tests/tools/file_editor/test_file_editor_tool.py b/tests/tools/file_editor/test_file_editor_tool.py index f5963d34b7..e0e6d1186c 100644 --- a/tests/tools/file_editor/test_file_editor_tool.py +++ b/tests/tools/file_editor/test_file_editor_tool.py @@ -222,3 +222,50 @@ def test_file_editor_tool_openai_format_includes_working_directory(): "When exploring project structure, start with this directory " "instead of the root filesystem." ) in description + + +def test_file_editor_tool_image_viewing_line_with_vision_enabled(): + """Test that image viewing line is included when LLM supports vision.""" + with tempfile.TemporaryDirectory() as temp_dir: + # Create LLM with vision support (gpt-4o-mini supports vision) + llm = LLM( + model="gpt-4o-mini", api_key=SecretStr("test-key"), usage_id="test-llm" + ) + agent = Agent(llm=llm, tools=[]) + conv_state = ConversationState.create( + id=uuid4(), + agent=agent, + workspace=LocalWorkspace(working_dir=temp_dir), + ) + + tools = FileEditorTool.create(conv_state) + tool = tools[0] + + # Check that the image viewing line is included in description + assert ( + "If `path` is an image file (.png, .jpg, .jpeg, .gif, .webp, .bmp)" + in tool.description + ) + assert "view` displays the image content" in tool.description + + +def test_file_editor_tool_image_viewing_line_with_vision_disabled(): + """Test that image viewing line is excluded when LLM doesn't support vision.""" + with tempfile.TemporaryDirectory() as temp_dir: + # Create LLM without vision support (gpt-3.5-turbo doesn't support vision) + llm = LLM( + model="gpt-3.5-turbo", api_key=SecretStr("test-key"), usage_id="test-llm" + ) + agent = Agent(llm=llm, tools=[]) + conv_state = ConversationState.create( + id=uuid4(), + agent=agent, + workspace=LocalWorkspace(working_dir=temp_dir), + ) + + tools = FileEditorTool.create(conv_state) + tool = tools[0] + + # Check that the image viewing line is NOT included in description + assert "is an image file" not in tool.description + assert "displays the image content" not in tool.description diff --git a/tests/tools/file_editor/test_file_validation.py b/tests/tools/file_editor/test_file_validation.py index 7f41fe54e3..c86f696e9a 100644 --- a/tests/tools/file_editor/test_file_validation.py +++ b/tests/tools/file_editor/test_file_validation.py @@ -3,6 +3,7 @@ import pytest from binaryornot.check import is_binary +from openhands.sdk import ImageContent from openhands.tools.file_editor.editor import FileEditor from openhands.tools.file_editor.exceptions import ( FileValidationError, @@ -96,8 +97,39 @@ def test_validate_image_file(tmp_path): assert is_binary(str(image_file)) - # Images are not supported and should be detected as binary - with pytest.raises(FileValidationError) as exc_info: - editor.validate_file(image_file) + # Images are not supported, so no exception should be raised + editor.validate_file(image_file) - assert "file appears to be binary" in str(exc_info.value).lower() + +def test_view_image_file_returns_image_content(tmp_path): + """Test that viewing an image file returns ImageContent without error.""" + editor = FileEditor() + image_file = tmp_path / "test.png" + + # Create a minimal valid 1x1 PNG image (red pixel) + # This is a complete, valid PNG file + png_data = ( + b"\x89PNG\r\n\x1a\n" # PNG signature + b"\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01" # IHDR chunk (1x1) + b"\x08\x02\x00\x00\x00\x90wS\xde" # IHDR data + CRC + b"\x00\x00\x00\x0cIDATx\x9cc\xf8\xcf\xc0\x00\x00\x00\x03\x00\x01" # IDAT chunk + b"\x00\x18\xdd\x8d\xb4" # IDAT CRC + b"\x00\x00\x00\x00IEND\xaeB`\x82" # IEND chunk + ) + + with open(image_file, "wb") as f: + f.write(png_data) + + # View the image file - should return ImageContent + result = editor(command="view", path=str(image_file)) + + # Verify result contains ImageContent + assert result is not None + assert hasattr(result, "content") + assert len(result.content) == 2 # TextContent with message + ImageContent + assert any(isinstance(c, ImageContent) for c in result.content) + + # Get the ImageContent and verify it has image_urls + image_content = [c for c in result.content if isinstance(c, ImageContent)][0] + assert len(image_content.image_urls) == 1 + assert image_content.image_urls[0].startswith("data:image/png;base64,")