From 5a08f83858743e77adbbe07ca83dcc978f1c74b8 Mon Sep 17 00:00:00 2001
From: shinbehavior
Date: Thu, 9 Oct 2025 22:22:04 +0200
Subject: [PATCH 01/14] WIP

---
 examples/04_openrouter_quickstart.py |  47 +++
 examples/mcp_sum_server.py           |  23 ++
 hud/agents/__init__.py               |   2 +
 hud/agents/openrouter.py             | 592 +++++++++++++++++++++++
 hud/agents/tests/test_openrouter.py  | 205 ++++++++++
 hud/cli/__init__.py                  |   8 +-
 hud/cli/eval.py                      |  38 +-
 hud/utils/agent_factories.py         |  16 +
 8 files changed, 923 insertions(+), 8 deletions(-)
 create mode 100644 examples/04_openrouter_quickstart.py
 create mode 100644 examples/mcp_sum_server.py
 create mode 100644 hud/agents/openrouter.py
 create mode 100644 hud/agents/tests/test_openrouter.py

diff --git a/examples/04_openrouter_quickstart.py b/examples/04_openrouter_quickstart.py
new file mode 100644
index 00000000..2ac56044
--- /dev/null
+++ b/examples/04_openrouter_quickstart.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+import asyncio
+from pathlib import Path
+
+from hud.agents.openrouter import OpenRouterAgent
+from hud.utils.hud_console import HUDConsole
+
+
+async def main() -> None:
+    hud_console = HUDConsole()
+
+    # Inline FastMCP sum task (no external JSON needed)
+    server_path = Path(__file__).parent / "mcp_sum_server.py"
+    task = {
+        "id": "sum-demo",
+        "prompt": "Call the `sum` tool to add 7 and 5, then reply with the total in natural language.",
+        "mcp_config": {
+            "local": {
+                "command": "python",
+                "args": [str(server_path)],
+            }
+        },
+        "agent_config": {
+            "allowed_tools": ["sum"],
+            "system_prompt": (
+                "You are a concise math assistant. Always call the `sum` tool when asked to add "
+                "numbers, wait for the result, then explain the answer in one sentence."
+            ),
+        },
+    }
+
+    # Instantiate the OpenRouter agent (uses OPENROUTER_API_KEY from env)
+    agent = OpenRouterAgent(model_name="z-ai/glm-4.5v", verbose=True)
+
+    hud_console.info("Running task with OpenRouter agent...")
+    result = await agent.run(task, max_steps=3)
+
+    hud_console.info("\nFinal content:")
+    hud_console.info(result.content or "")
+    hud_console.success(f"Reward: {result.reward}")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
+
+
diff --git a/examples/mcp_sum_server.py b/examples/mcp_sum_server.py
new file mode 100644
index 00000000..7c26d123
--- /dev/null
+++ b/examples/mcp_sum_server.py
@@ -0,0 +1,23 @@
+"""FastMCP server exposing a simple sum tool.
+
+Run with: `python examples/mcp_sum_server.py`.
+"""
+
+from __future__ import annotations
+
+from fastmcp import FastMCP
+
+
+server = FastMCP("SumServer")
+
+
+@server.tool()
+def sum(a: int, b: int) -> dict[str, int]:
+    """Return the sum of two integers."""
+    return {"result": a + b}
+
+
+if __name__ == "__main__":
+    server.run()
+
+
diff --git a/hud/agents/__init__.py b/hud/agents/__init__.py
index 7470adb3..55a531ca 100644
--- a/hud/agents/__init__.py
+++ b/hud/agents/__init__.py
@@ -4,10 +4,12 @@
 from .claude import ClaudeAgent
 from .openai import OperatorAgent
 from .openai_chat_generic import GenericOpenAIChatAgent
+from .openrouter import OpenRouterAgent
 
 __all__ = [
     "ClaudeAgent",
     "GenericOpenAIChatAgent",
     "MCPAgent",
     "OperatorAgent",
+    "OpenRouterAgent",
 ]

diff --git a/hud/agents/openrouter.py b/hud/agents/openrouter.py
new file mode 100644
index 00000000..4306a386
--- /dev/null
+++ b/hud/agents/openrouter.py
@@ -0,0 +1,592 @@
+"""OpenRouter agent that uses the Responses API with prompt caching."""
+
+from __future__ import annotations
+
+import json
+import logging
+import uuid
+from typing import Any, Iterable
+
+import mcp.types as types
+from openai import AsyncOpenAI
+
+from hud import instrument
+from hud.settings import settings
+from hud.types import AgentResponse, MCPToolCall, MCPToolResult
+
+from .openai_chat_generic import GenericOpenAIChatAgent
+
+logger = logging.getLogger(__name__)
+
+_DEFAULT_BASE_URL = "https://openrouter.ai/api/alpha"
+_DEFAULT_HEADERS = {
+    "HTTP-Referer": "https://hud.so",
+    "X-Title": "HUD Python SDK",
+    "Accept": "application/json",
+}
+
+_DEFAULT_COMPLETION_KWARGS: dict[str, Any] = {
+    "temperature": 0.1,
+    "max_output_tokens": 1024,
+}
+
+
+class OpenRouterAgent(GenericOpenAIChatAgent):
+    """MCP-enabled agent that talks to OpenRouter through the Responses API."""
+
+    def __init__(
+        self,
+        *,
+        api_key: str | None = None,
+        base_url: str | None = None,
+        model_name: str = "z-ai/glm-4.5v",
+        default_headers: dict[str, str] | None = None,
+        cache_control: dict[str, Any] | bool | None = True,
+        cacheable_roles: Iterable[str] | None = None,
+        openai_client: AsyncOpenAI | None = None,
+        completion_kwargs: dict[str, Any] | None = None,
+        **agent_kwargs: Any,
+    ) -> None:
+        api_key = api_key or settings.openrouter_api_key
+        if not api_key:
+            raise ValueError(
+                "OpenRouter API key not found. Set OPENROUTER_API_KEY or pass api_key explicitly."
+ ) + + base_url = base_url or _DEFAULT_BASE_URL + + headers: dict[str, str] = dict(_DEFAULT_HEADERS) + if default_headers: + headers.update(default_headers) + + client = openai_client or AsyncOpenAI( + api_key=api_key, + base_url=base_url, + default_headers=headers, + ) + + super().__init__( + openai_client=client, + model_name=model_name, + completion_kwargs=completion_kwargs, + **agent_kwargs, + ) + + self._responses_kwargs = { + "tool_choice": "auto", + **_DEFAULT_COMPLETION_KWARGS, + **dict(self.completion_kwargs), + } + self.completion_kwargs.clear() + + self._cache_control = self._normalize_cache_control(cache_control) + self._cacheable_roles = tuple(cacheable_roles or ("system", "user", "tool")) + + @staticmethod + def _normalize_cache_control( + cache_control: dict[str, Any] | bool | str | None, + ) -> dict[str, Any] | None: + if cache_control is False: + return None + if cache_control is None: + return {"type": "ephemeral"} + if cache_control is True: + return {"type": "ephemeral"} + if isinstance(cache_control, dict): + return cache_control + return {"type": str(cache_control)} + + def _should_cache(self, role: str) -> bool: + return self._cache_control is not None and role in self._cacheable_roles + + def _text_item(self, text: str, role: str) -> dict[str, Any]: + item: dict[str, Any] = {"type": "input_text", "text": text} + if self._should_cache(role): + item["cache_control"] = self._cache_control + return item + + def _image_item(self, image_payload: Any, role: str) -> dict[str, Any]: + url: str | None = None + detail = None + + if isinstance(image_payload, dict): + # Standard OpenAI-style wrapper + if "image_url" in image_payload and isinstance(image_payload["image_url"], dict): + img = image_payload["image_url"] + url = img.get("url") + detail = img.get("detail") or image_payload.get("detail") + # Direct url / data uri + elif image_payload.get("url"): + url = image_payload.get("url") + detail = image_payload.get("detail") + # Raw base64 payload from computer/tool results + elif image_payload.get("data"): + mime = ( + image_payload.get("mimeType") + or image_payload.get("mime_type") + or "image/png" + ) + data = image_payload.get("data") + if data: + url = f"data:{mime};base64,{data}" + detail = image_payload.get("detail") + elif isinstance(image_payload.get("source"), dict): + source = image_payload["source"] + data = source.get("data") + mime = source.get("media_type") or source.get("mime_type") or "image/png" + if data: + url = f"data:{mime};base64,{data}" + detail = source.get("detail") + elif isinstance(image_payload, str): + url = image_payload + + item: dict[str, Any] = {"type": "input_image"} + if url: + item["image_url"] = url + item["detail"] = str(detail or "auto") + if self._should_cache(role): + item["cache_control"] = self._cache_control + return item + + def _convert_message_content(self, role: str, content: Any) -> list[dict[str, Any]]: + if content is None: + return [] + + blocks: list[dict[str, Any]] = [] + if isinstance(content, str): + blocks.append(self._text_item(content, role)) + return blocks + + if isinstance(content, dict): + content = [content] + + if isinstance(content, list): + for entry in content: + if isinstance(entry, str): + blocks.append(self._text_item(entry, role)) + elif isinstance(entry, dict): + entry_copy = dict(entry) + entry_type = entry_copy.get("type") + if entry_type in {"text", "input_text", None}: + text = entry_copy.get("text") or "" + blocks.append(self._text_item(text, role)) + elif entry_type in {"image_url", 
"input_image"}: + payload = entry_copy.get("image_url", entry_copy.get("image")) or entry_copy + blocks.append(self._image_item(payload, role)) + elif entry_type in {"image", "output_image", "rendered"}: + blocks.append(self._image_item(entry_copy, role)) + elif entry_type == "tool_result": + text = entry_copy.get("text", "") + blocks.append(self._text_item(text, role)) + else: + text_value = entry_copy.get("text") or json.dumps(entry_copy) + blocks.append(self._text_item(text_value, role)) + else: + blocks.append(self._text_item(str(entry), role)) + return blocks + + blocks.append(self._text_item(str(content), role)) + return blocks + + def _convert_messages(self, messages: list[Any]) -> list[dict[str, Any]]: + converted: list[dict[str, Any]] = [] + for message in messages: + if not isinstance(message, dict): + logger.debug("Skipping non-dict message: %s", message) + continue + + if "type" in message and "role" not in message: + converted.append(message) + continue + + role = message.get("role") or "user" + + if role == "assistant" and message.get("tool_calls"): + content_items = self._convert_message_content(role, message.get("content")) + if content_items: + converted.append({"role": "assistant", "content": content_items}) + for tool_call in message.get("tool_calls", []): + converted.append(self._convert_tool_call(tool_call)) + continue + + if role == "tool": + converted.extend(self._convert_tool_message(message)) + continue + + payload: dict[str, Any] = {"role": role} + content_items = self._convert_message_content(role, message.get("content")) + if content_items: + payload["content"] = content_items + if message.get("name"): + payload["name"] = message["name"] + if message.get("metadata"): + payload["metadata"] = message["metadata"] + converted.append(payload) + + return converted + + @staticmethod + def _jsonify_schema(value: Any) -> Any: + from pydantic import BaseModel + from pydantic.fields import FieldInfo + + if isinstance(value, (str, int, float, bool)) or value is None: + return value + + if isinstance(value, dict): + return {str(k): OpenRouterAgent._jsonify_schema(v) for k, v in value.items()} + + if isinstance(value, (list, tuple, set)): + return [OpenRouterAgent._jsonify_schema(v) for v in value] + + try: + return json.loads(json.dumps(value)) + except Exception: + if isinstance(value, BaseModel): + return OpenRouterAgent._jsonify_schema(value.model_dump()) + if isinstance(value, FieldInfo): + data: dict[str, Any] = {} + if value.annotation is not None: + data.setdefault( + "type", + getattr(value.annotation, "__name__", str(value.annotation)), + ) + if value.description: + data["description"] = value.description + if value.title: + data["title"] = value.title + if value.default not in (None, Ellipsis): + data["default"] = OpenRouterAgent._jsonify_schema(value.default) + if value.json_schema_extra: + extra = OpenRouterAgent._jsonify_schema(value.json_schema_extra) + if isinstance(extra, dict): + data.update(extra) + return data or str(value) + if hasattr(value, "model_dump"): + return OpenRouterAgent._jsonify_schema(value.model_dump()) + if hasattr(value, "__dict__") and value.__dict__: + return OpenRouterAgent._jsonify_schema( + { + k: v + for k, v in value.__dict__.items() + if not k.startswith("_") + } + ) + return str(value) + + @staticmethod + def _convert_tools_for_responses(tools: list[dict] | None) -> list[dict]: + if not tools: + return [] + + converted: list[dict] = [] + for tool in tools: + if not isinstance(tool, dict): + continue + + if tool.get("type") == 
"function" and isinstance(tool.get("function"), dict): + fn = tool["function"] + name = fn.get("name") + params = fn.get("parameters", {}) + description = fn.get("description", "") + + if not isinstance(name, str) or not name: + logger.debug("Skipping tool with missing name: %s", tool) + continue + + converted.append( + { + "type": "function", + "name": name, + "description": str(description or ""), + "parameters": OpenRouterAgent._jsonify_schema(params), + } + ) + else: + converted.append(OpenRouterAgent._jsonify_schema(tool)) + + return converted + + def _convert_tool_call(self, tool_call: dict[str, Any]) -> dict[str, Any]: + if not isinstance(tool_call, dict): + return {} + + function = tool_call.get("function") or {} + name = function.get("name") or tool_call.get("name") or "tool_call" + raw_arguments = function.get("arguments") + + if isinstance(raw_arguments, dict): + arguments = json.dumps(self._jsonify_schema(raw_arguments)) + elif isinstance(raw_arguments, str): + try: + parsed = json.loads(raw_arguments) + except json.JSONDecodeError: + arguments = raw_arguments + else: + arguments = json.dumps(self._jsonify_schema(parsed)) + elif raw_arguments is None: + arguments = "{}" + else: + arguments = json.dumps(self._jsonify_schema(raw_arguments)) + + call_id = ( + tool_call.get("id") + or function.get("id") + or function.get("call_id") + or f"call_{uuid.uuid4().hex}" + ) + + return { + "type": "function_call", + "id": call_id, + "name": name, + "arguments": arguments or "{}", + } + + def _convert_tool_message(self, message: dict[str, Any]) -> list[dict[str, Any]]: + entries: list[dict[str, Any]] = [] + call_id = message.get("tool_call_id") or message.get("id") or f"call_{uuid.uuid4().hex}" + + text_parts: list[str] = [] + image_payloads: list[Any] = [] + + content = message.get("content") + if isinstance(content, list): + for item in content: + if isinstance(item, dict): + item_type = item.get("type") + if item_type in {"text", "input_text"} and item.get("text"): + text_parts.append(str(item.get("text"))) + elif item_type in {"image", "input_image", "image_url", "output_image", "rendered"}: + image_payloads.append(item) + elif isinstance(item, str): + text_parts.append(item) + elif isinstance(content, str): + text_parts.append(content) + + structured = message.get("structuredContent") + if structured and not text_parts: + try: + text_parts.append(json.dumps(structured)) + except Exception: + text_parts.append(str(structured)) + + output_text = "\n".join(part for part in text_parts if part) or "" + + entries.append( + { + "type": "function_call_output", + "id": message.get("id") or call_id, + "call_id": call_id, + "output": output_text, + } + ) + + for payload in image_payloads: + entries.append( + { + "role": "user", + "content": [self._image_item(payload, "user")], + } + ) + + return entries + + async def format_tool_results( + self, + tool_calls: list[MCPToolCall], + tool_results: list[MCPToolResult], + ) -> list[dict[str, Any]]: + converted: list[dict[str, Any]] = [] + + for call, result in zip(tool_calls, tool_results, strict=False): + call_id = call.id or call.name or f"call_{uuid.uuid4().hex}" + + text_parts: list[str] = [] + image_payloads: list[Any] = [] + + for item in result.content or []: + if isinstance(item, types.TextContent): + text_parts.append(item.text) + elif isinstance(item, types.ImageContent): + image_payloads.append( + { + "mimeType": item.mimeType, + "data": item.data, + "detail": getattr(item, "detail", None), + } + ) + elif isinstance(item, dict): + if 
item.get("type") in {"text", "input_text"}: + text_parts.append(str(item.get("text", ""))) + elif item.get("type") in {"image", "input_image", "image_url", "output_image", "rendered"}: + image_payloads.append(item) + elif isinstance(item, str): + text_parts.append(item) + + if result.structuredContent and not text_parts: + try: + text_parts.append(json.dumps(result.structuredContent)) + except Exception: + text_parts.append(str(result.structuredContent)) + + if getattr(result, "isError", False): + text_parts.append(getattr(result, "error", "Tool execution failed.")) + + output_text = "\n".join(part for part in text_parts if part) or "" + + converted.append( + { + "type": "function_call_output", + "id": call_id, + "call_id": call_id, + "output": output_text, + } + ) + + for payload in image_payloads: + converted.append( + { + "role": "user", + "content": [self._image_item(payload, "user")], + } + ) + + return converted + + @staticmethod + def _parse_arguments(arguments: Any) -> dict[str, Any]: + if isinstance(arguments, dict): + return arguments + if isinstance(arguments, str) and arguments: + try: + parsed = json.loads(arguments) + if isinstance(parsed, dict): + return parsed + except json.JSONDecodeError: + logger.debug("Failed to decode arguments: %s", arguments) + return {} + + def _to_mcp_tool_call(self, payload: dict[str, Any]) -> MCPToolCall: + tool_name = payload.get("name") or payload.get("function", {}).get("name") or "" + call_id = payload.get("id") or payload.get("tool_call_id") or payload.get("call_id") + if not call_id: + call_id = tool_name + arguments = payload.get("arguments") + if not arguments and "function" in payload: + arguments = payload["function"].get("arguments") + parsed_arguments = self._parse_arguments(arguments) + return MCPToolCall(id=call_id, name=tool_name, arguments=parsed_arguments) + + def _coerce_response_payload(self, response: Any) -> dict[str, Any]: + """Convert OpenRouter SDK return types into a plain dictionary.""" + + if response is None: + return {} + + if isinstance(response, dict): + return response + + for attr in ("model_dump", "dict", "to_dict"): + if hasattr(response, attr): + try: + payload = getattr(response, attr)() + except Exception as exc: # pragma: no cover - defensive + logger.debug("Failed to read response via %s: %s", attr, exc) + else: + if isinstance(payload, dict): + return payload + + snapshot = getattr(response, "__dict__", None) + if isinstance(snapshot, dict): + return snapshot + + logger.error("Unexpected response carrier from OpenRouter: %r", response) + raise TypeError("Unexpected response type from OpenRouter") + + def _extract_response(self, response: Any) -> AgentResponse: + data = self._coerce_response_payload(response) + if not isinstance(data, dict): + raise TypeError("Unexpected response type from OpenRouter") + + output = data.get("output", []) + text_parts: list[str] = [] + tool_calls: list[MCPToolCall] = [] + reasoning_parts: list[str] = [] + + for item in output: + item_type = item.get("type") if isinstance(item, dict) else None + if item_type == "message": + contents = item.get("content", []) + if isinstance(contents, list): + for block in contents: + if not isinstance(block, dict): + continue + block_type = block.get("type") + if block_type in {"output_text", "text"}: + text = block.get("text") + if text: + text_parts.append(text) + elif block_type == "reasoning" and block.get("text"): + reasoning_parts.append(block["text"]) + for tc in item.get("tool_calls", []) or []: + if isinstance(tc, dict): + 
tool_calls.append(self._to_mcp_tool_call(tc)) + elif item_type in {"tool_call", "function_call"} and isinstance(item, dict): + tool_calls.append(self._to_mcp_tool_call(item)) + elif item_type == "reasoning" and isinstance(item, dict): + summary = item.get("summary") + if isinstance(summary, list): + for block in summary: + if isinstance(block, dict) and block.get("text"): + reasoning_parts.append(block["text"]) + elif isinstance(summary, str): + reasoning_parts.append(summary) + + merged_text = "\n".join(reasoning_parts + text_parts).strip() + status = data.get("status", "completed") + done = not tool_calls and status != "in_progress" + return AgentResponse( + content=merged_text, + tool_calls=tool_calls, + done=done, + raw=response, + ) + + @instrument( + span_type="agent", + record_args=False, + record_result=True, + ) + async def get_response(self, messages: list[Any]) -> AgentResponse: + converted_messages = self._convert_messages(messages) + tools = self._convert_tools_for_responses(self.get_tool_schemas()) + + protected_keys = {"model", "input", "tools"} + extra = {k: v for k, v in self._responses_kwargs.items() if k not in protected_keys} + # If tools are provided and tool_choice isn't explicitly set, require tool use + if tools and "tool_choice" not in extra: + extra["tool_choice"] = "required" + + try: + payload: dict[str, Any] = { + "model": self.model_name, + "input": converted_messages, + **extra, + } + if tools: + payload["tools"] = tools + + response = await self.oai.responses.create(**payload) + except Exception as exc: + error_content = f"Error getting response {exc}" + logger.exception("OpenRouter call failed: %s", exc) + return AgentResponse( + content=error_content, + tool_calls=[], + done=True, + isError=True, + raw=None, + ) + + return self._extract_response(response) diff --git a/hud/agents/tests/test_openrouter.py b/hud/agents/tests/test_openrouter.py new file mode 100644 index 00000000..d3010e0d --- /dev/null +++ b/hud/agents/tests/test_openrouter.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +import pytest +from unittest.mock import AsyncMock, MagicMock + +import mcp.types as types + +from hud.agents.openrouter import OpenRouterAgent +from hud.settings import settings +from hud.types import MCPToolCall, MCPToolResult + + +@pytest.fixture(autouse=True) +def disable_telemetry(monkeypatch: pytest.MonkeyPatch) -> None: + """Disable HUD telemetry during unit tests.""" + monkeypatch.setattr(settings, "telemetry_enabled", False) + monkeypatch.setattr(settings, "api_key", None) + + +class FakeResponse: + def __init__(self, payload: dict) -> None: + self._payload = payload + + def model_dump(self) -> dict: + return self._payload + + +@pytest.mark.asyncio +async def test_openrouter_agent_builds_cached_messages() -> None: + responses_create = AsyncMock( + return_value=FakeResponse({"output": [{"type": "message", "content": []}], "status": "completed"}) + ) + mock_client = MagicMock() + mock_client.responses.create = responses_create + + agent = OpenRouterAgent( + api_key="test-key", + openai_client=mock_client, + cache_control={"type": "ephemeral"}, + ) + agent._available_tools = [] # mimic initialized agent + + messages = [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, + {"role": "assistant", "content": "Previous reply"}, + ] + + await agent.get_response(messages) + + await_call = responses_create.await_args + assert await_call is not None + kwargs = await_call.kwargs + assert 
kwargs["model"] == agent.model_name + input_payload = kwargs["input"] + + system_block = input_payload[0]["content"][0] + user_block = input_payload[1]["content"][0] + assistant_block = input_payload[2]["content"][0] + + assert system_block["cache_control"] == {"type": "ephemeral"} + assert user_block["cache_control"] == {"type": "ephemeral"} + assert "cache_control" not in assistant_block + + +@pytest.mark.asyncio +async def test_openrouter_agent_parses_tool_calls() -> None: + responses_create = AsyncMock( + return_value=FakeResponse( + { + "output": [ + { + "type": "message", + "content": [{"type": "output_text", "text": "Calling tool"}], + "tool_calls": [ + { + "id": "call_1", + "function": {"name": "search", "arguments": "{\"query\": \"hud\"}"}, + } + ], + } + ], + "status": "requires_action", + } + ) + ) + mock_client = MagicMock() + mock_client.responses.create = responses_create + + agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) + agent._available_tools = [] + + result = await agent.get_response( + [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, + ] + ) + + assert not result.done + assert result.tool_calls[0].name == "search" + assert result.tool_calls[0].arguments == {"query": "hud"} + + +@pytest.mark.asyncio +async def test_openrouter_agent_returns_text_response() -> None: + responses_create = AsyncMock( + return_value=FakeResponse( + { + "output": [ + { + "type": "message", + "content": [{"type": "output_text", "text": "Hi there"}], + } + ], + "status": "completed", + } + ) + ) + mock_client = MagicMock() + mock_client.responses.create = responses_create + + agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) + agent._available_tools = [] + + result = await agent.get_response( + [ + {"role": "system", "content": "You are helpful."}, + {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, + ] + ) + + assert result.done + assert result.content == "Hi there" + assert result.tool_calls == [] + + +def test_openrouter_agent_sanitizes_fieldinfo_in_tools() -> None: + mock_client = MagicMock() + agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) + + from pydantic import Field + + tools = [ + { + "type": "function", + "function": { + "name": "click", + "description": "Click an element", + "parameters": { + "type": "object", + "properties": { + "selector": Field(default="", description="CSS selector"), + }, + "required": ["selector"], + }, + }, + } + ] + + converted = agent._convert_tools_for_responses(tools) + selector_schema = converted[0]["parameters"]["properties"]["selector"] + assert isinstance(selector_schema, dict) + assert selector_schema.get("description") == "CSS selector" + + +def test_openrouter_agent_converts_image_blocks() -> None: + mock_client = MagicMock() + agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) + + content = [ + { + "type": "image", + "mimeType": "image/png", + "data": "dGVzdA==", + "detail": "high", + } + ] + + message_blocks = agent._convert_messages([{"role": "user", "content": content}]) + image_block = message_blocks[0]["content"][0] + assert image_block["type"] == "input_image" + assert image_block["image_url"].startswith("data:image/png;base64,") + assert image_block["detail"] == "high" + + +@pytest.mark.asyncio +async def test_format_tool_results_produces_function_call_output() -> None: + mock_client = MagicMock() + agent = OpenRouterAgent(api_key="test-key", 
openai_client=mock_client) + + tool_call = MCPToolCall(id="call-1", name="playwright", arguments={}) + tool_result = MCPToolResult( + content=[ + types.TextContent(type="text", text="navigation complete"), + types.ImageContent(type="image", data="dGVzdA==", mimeType="image/png"), + ] + ) + + formatted = await agent.format_tool_results([tool_call], [tool_result]) + + assert formatted[0]["type"] == "function_call_output" + assert formatted[0]["call_id"] == "call-1" + assert formatted[1]["role"] == "user" + assert formatted[1]["content"][0]["type"] == "input_image" diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py index 3708cf0e..99771913 100644 --- a/hud/cli/__init__.py +++ b/hud/cli/__init__.py @@ -777,7 +777,8 @@ def eval( agent: str | None = typer.Argument( None, help=( - "Agent backend to use (claude, openai, vllm, or litellm). If not provided, will prompt interactively." # noqa: E501 + "Agent backend to use (claude, openai computer use, openrouter responses, " + "vllm, or litellm). If not provided, will prompt interactively." ), ), full: bool = typer.Option( @@ -893,6 +894,7 @@ def eval( [ {"name": "Claude 4 Sonnet", "value": "claude"}, {"name": "OpenAI Computer Use", "value": "openai"}, + {"name": "OpenRouter (Responses)", "value": "openrouter"}, {"name": "vLLM (Local Server)", "value": "vllm"}, {"name": "LiteLLM (Multi-provider)", "value": "litellm"}, ] @@ -901,7 +903,7 @@ def eval( agent = hud_console.select("Select an agent to use:", choices=choices, default=0) # Handle HUD model selection - if agent and agent not in ["claude", "openai", "vllm", "litellm", "integration_test"]: + if agent and agent not in ["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"]: # Find remote model name model = agent if not vllm_base_url: @@ -922,7 +924,7 @@ def eval( hud_console.info(f"Using HUD model: {model} (trained on {base_model})") # Validate agent choice - valid_agents = ["claude", "openai", "vllm", "litellm", "integration_test"] + valid_agents = ["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"] if agent not in valid_agents: hud_console.error(f"Invalid agent: {agent}. Must be one of: {', '.join(valid_agents)}") raise typer.Exit(1) diff --git a/hud/cli/eval.py b/hud/cli/eval.py index e8afceac..4900ba85 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -113,7 +113,7 @@ def _build_vllm_config( def build_agent( - agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"], + agent_type: Literal["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"], *, model: str | None = None, allowed_tools: list[str] | None = None, @@ -180,6 +180,21 @@ def build_agent( allowed_tools=allowed_tools, verbose=verbose, ) + elif agent_type == "openrouter": + try: + from hud.agents.openrouter import OpenRouterAgent + except ImportError as e: + hud_console.error( + "OpenRouter agent dependencies are not installed. 
" + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + return OpenRouterAgent( + model_name=model or "z-ai/glm-4.6", + allowed_tools=allowed_tools, + verbose=verbose, + ) # Fallback Claude agent (Anthropic) try: @@ -209,7 +224,7 @@ def build_agent( async def run_single_task( source: str, *, - agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude", + agent_type: Literal["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"] = "claude", model: str | None = None, allowed_tools: list[str] | None = None, max_steps: int = 10, @@ -305,6 +320,16 @@ async def run_single_task( } if allowed_tools: agent_config["allowed_tools"] = allowed_tools + elif agent_type == "openrouter": + from hud.agents.openrouter import OpenRouterAgent + + agent_class = OpenRouterAgent + agent_config = { + "model_name": model or "z-ai/glm-4.5v", + "verbose": verbose, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools elif agent_type == "claude": from hud.agents import ClaudeAgent @@ -353,7 +378,7 @@ async def run_single_task( async def run_full_dataset( source: str, *, - agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude", + agent_type: Literal["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"] = "claude", model: str | None = None, allowed_tools: list[str] | None = None, max_concurrent: int = 30, @@ -539,10 +564,13 @@ def eval_command( "--full", help="Run the entire dataset (omit for single-task debug mode)", ), - agent: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = typer.Option( + agent: Literal["claude", "openai", "openrouter", "vllm", "litellm", "integration_test"] = typer.Option( "claude", "--agent", - help="Agent backend to use (claude, openai, vllm for local server, or litellm)", + help=( + "Agent backend to use (claude, openai computer use, openrouter responses, " + "vllm for local server, or litellm)" + ), ), model: str | None = typer.Option( None, diff --git a/hud/utils/agent_factories.py b/hud/utils/agent_factories.py index e15cb240..37b9fa7a 100644 --- a/hud/utils/agent_factories.py +++ b/hud/utils/agent_factories.py @@ -8,6 +8,7 @@ from hud.agents.grounded_openai import GroundedOpenAIChatAgent from hud.agents.openai_chat_generic import GenericOpenAIChatAgent +from hud.agents.openrouter import OpenRouterAgent from hud.tools.grounding import GrounderConfig @@ -82,3 +83,18 @@ def create_grounded_agent(**kwargs: Any) -> GroundedOpenAIChatAgent: return GroundedOpenAIChatAgent( openai_client=openai_client, grounder_config=grounder_config, **kwargs ) + + +def create_openrouter_agent(**kwargs: Any) -> OpenRouterAgent: + """Factory for OpenRouterAgent with run_dataset compatibility.""" + + api_key = kwargs.pop("api_key", None) + base_url = kwargs.pop("base_url", None) + cache_control = kwargs.pop("cache_control", True) + + return OpenRouterAgent( + api_key=api_key, + base_url=base_url, + cache_control=cache_control, + **kwargs, + ) From 8281d6bc948287d295d800f248ae870def754023 Mon Sep 17 00:00:00 2001 From: shinbehavior Date: Sat, 11 Oct 2025 23:06:03 +0200 Subject: [PATCH 02/14] litllm, glm-4.5v cua loop --- examples/04_openrouter_quickstart.py | 47 -- hud/agents/glm45v.py | 820 +++++++++++++++++++++++ hud/agents/openrouter.py | 960 ++++++++++++--------------- hud/agents/tests/test_openrouter.py | 237 ++----- hud/cli/__init__.py | 2 +- hud/cli/eval.py | 4 +- hud/utils/agent_factories.py | 11 +- pyproject.toml | 2 +- 8 files 
changed, 1298 insertions(+), 785 deletions(-) delete mode 100644 examples/04_openrouter_quickstart.py create mode 100644 hud/agents/glm45v.py diff --git a/examples/04_openrouter_quickstart.py b/examples/04_openrouter_quickstart.py deleted file mode 100644 index 2ac56044..00000000 --- a/examples/04_openrouter_quickstart.py +++ /dev/null @@ -1,47 +0,0 @@ -from __future__ import annotations - -import asyncio -from pathlib import Path - -from hud.agents.openrouter import OpenRouterAgent -from hud.utils.hud_console import HUDConsole - - -async def main() -> None: - hud_console = HUDConsole() - - # Inline FastMCP sum task (no external JSON needed) - server_path = Path(__file__).parent / "mcp_sum_server.py" - task = { - "id": "sum-demo", - "prompt": "Call the `sum` tool to add 7 and 5, then reply with the total in natural language.", - "mcp_config": { - "local": { - "command": "python", - "args": [str(server_path)], - } - }, - "agent_config": { - "allowed_tools": ["sum"], - "system_prompt": ( - "You are a concise math assistant. Always call the `sum` tool when asked to add " - "numbers, wait for the result, then explain the answer in one sentence." - ), - }, - } - - # Instantiate the OpenRouter agent (uses OPENROUTER_API_KEY from env) - agent = OpenRouterAgent(model_name="z-ai/glm-4.5v", verbose=True) - - hud_console.info("Running task with OpenRouter agent...") - result = await agent.run(task, max_steps=3) - - hud_console.info("\nFinal content:") - hud_console.info(result.content or "") - hud_console.success(f"Reward: {result.reward}") - - -if __name__ == "__main__": - asyncio.run(main()) - - diff --git a/hud/agents/glm45v.py b/hud/agents/glm45v.py new file mode 100644 index 00000000..e7ff0fdc --- /dev/null +++ b/hud/agents/glm45v.py @@ -0,0 +1,820 @@ +"""glm-4.5v computer-use agent backed by litellm + openrouter.""" + +from __future__ import annotations + +import json +import logging +import re +from typing import Any, ClassVar + +import litellm +import mcp.types as types +from litellm.types.utils import ModelResponse + +from hud.agents.base import MCPAgent +from hud.tools.computer.settings import computer_settings +from hud.types import AgentResponse, MCPToolCall, MCPToolResult +from hud import instrument +from hud.agents.openrouter import ( + _convert_json_action_to_items, + _decode_image_dimensions, + _extract_user_instruction, + _make_click_item, + _make_double_click_item, + _make_drag_item, + _make_failed_tool_call_items, + _make_keypress_item, + _make_output_text_item, + _make_reasoning_item, + _make_screenshot_item, + _make_scroll_item, + _make_type_item, + _make_wait_item, + _parse_json_action_string, + _random_id, + get_last_image_from_messages, +) + +logger = logging.getLogger(__name__) + + +DEFAULT_SYSTEM_PROMPT = """ +You are an autonomous computer-using agent. Follow these guidelines: + +1. Do not ask for permission; act decisively to finish the task. +2. Always ground actions in the latest screenshot and task instructions. +3. Use the provided mouse/keyboard tools precisely (coordinates are 0-999). +4. Keep memory concise—store only facts that matter for later steps. +5. When the task is complete, reply with DONE() and include the final answer. +6. If the task is impossible, reply with FAIL() and explain briefly. 
+""".strip() + + +GLM_ACTION_SPACE = """ +### {left,right,middle}_click + +Call rule: `{left,right,middle}_click(start_box='[x,y]', element_info='')` +{ + 'name': ['left_click', 'right_click', 'middle_click'], + 'description': 'Perform a left/right/middle mouse click at the specified coordinates on the screen.', + 'parameters': { + 'type': 'object', + 'properties': { + 'start_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Coordinates [x,y] where to perform the click, normalized to 0-999 range.' + }, + 'element_info': { + 'type': 'string', + 'description': 'Optional text description of the UI element being clicked.' + } + }, + 'required': ['start_box'] + } +} + +### hover + +Call rule: `hover(start_box='[x,y]', element_info='')` +{ + 'name': 'hover', + 'description': 'Move the mouse pointer to the specified coordinates without performing any click action.', + 'parameters': { + 'type': 'object', + 'properties': { + 'start_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Coordinates [x,y] where to move the mouse pointer, normalized to 0-999 range.' + }, + 'element_info': { + 'type': 'string', + 'description': 'Optional text description of the UI element being hovered over.' + } + }, + 'required': ['start_box'] + } +} + +### left_double_click + +Call rule: `left_double_click(start_box='[x,y]', element_info='')` +{ + 'name': 'left_double_click', + 'description': 'Perform a left mouse double-click at the specified coordinates on the screen.', + 'parameters': { + 'type': 'object', + 'properties': { + 'start_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Coordinates [x,y] where to perform the double-click, normalized to 0-999 range.' + }, + 'element_info': { + 'type': 'string', + 'description': 'Optional text description of the UI element being double-clicked.' + } + }, + 'required': ['start_box'] + } +} + +### left_drag + +Call rule: `left_drag(start_box='[x1,y1]', end_box='[x2,y2]', element_info='')` +{ + 'name': 'left_drag', + 'description': 'Drag the mouse from starting coordinates to ending coordinates while holding the left mouse button.', + 'parameters': { + 'type': 'object', + 'properties': { + 'start_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Starting coordinates [x1,y1] for the drag operation, normalized to 0-999 range.' + }, + 'end_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Ending coordinates [x2,y2] for the drag operation, normalized to 0-999 range.' + }, + 'element_info': { + 'type': 'string', + 'description': 'Optional text description of the UI element being dragged.' + } + }, + 'required': ['start_box', 'end_box'] + } +} + +### key + +Call rule: `key(keys='')` +{ + 'name': 'key', + 'description': 'Simulate pressing a single key or combination of keys on the keyboard.', + 'parameters': { + 'type': 'object', + 'properties': { + 'keys': { + 'type': 'string', + 'description': "The key or key combination to press. Use '+' to separate keys in combinations (e.g., 'ctrl+c', 'alt+tab')." + } + }, + 'required': ['keys'] + } +} + +### type + +Call rule: `type(content='')` +{ + 'name': 'type', + 'description': 'Type text content into the currently focused text input field. 
This action only performs typing and does not handle field activation or clearing.', + 'parameters': { + 'type': 'object', + 'properties': { + 'content': { + 'type': 'string', + 'description': 'The text content to be typed into the active text field.' + } + }, + 'required': ['content'] + } +} + +### scroll + +Call rule: `scroll(start_box='[x,y]', direction='', step=5, element_info='')` +{ + 'name': 'scroll', + 'description': 'Scroll an element at the specified coordinates in the specified direction by a given number of wheel steps.', + 'parameters': { + 'type': 'object', + 'properties': { + 'start_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Coordinates [x,y] of the element or area to scroll, normalized to 0-999 range.' + }, + 'direction': { + 'type': 'string', + 'enum': ['down', 'up'], + 'description': "The direction to scroll: 'down' or 'up'." + }, + 'step': { + 'type': 'integer', + 'default': 5, + 'description': 'Number of wheel steps to scroll, default is 5.' + }, + 'element_info': { + 'type': 'string', + 'description': 'Optional text description of the UI element being scrolled.' + } + }, + 'required': ['start_box', 'direction'] + } +} + +### WAIT + +Call rule: `WAIT()` +{ + 'name': 'WAIT', + 'description': 'Wait for 5 seconds before proceeding to the next action.', + 'parameters': { + 'type': 'object', + 'properties': {}, + 'required': [] + } +} + +### DONE + +Call rule: `DONE()` +{ + 'name': 'DONE', + 'description': 'Indicate that the current task has been completed successfully and no further actions are needed.', + 'parameters': { + 'type': 'object', + 'properties': {}, + 'required': [] + } +} + +### FAIL + +Call rule: `FAIL()` +{ + 'name': 'FAIL', + 'description': 'Indicate that the current task cannot be completed or is impossible to accomplish.', + 'parameters': { + 'type': 'object', + 'properties': {}, + 'required': [] + } +}""" + + + +def convert_responses_items_to_glm45v_pc_prompt( + messages: list[dict[str, Any]], + task: str, + memory: str = "[]", +) -> list[dict[str, Any]]: + action_space = GLM_ACTION_SPACE + head_text = ( + "You are a GUI Agent, and your primary task is to respond accurately to user" + " requests or questions. In addition to directly answering the user's queries," + " you can also use tools or perform GUI operations directly until you fulfill" + " the user's request or provide a correct answer. You should carefully read and" + " understand the images and questions provided by the user, and engage in" + " thinking and reflection when appropriate. The coordinates involved are all" + " represented in thousandths (0-999)." + "\n\n# Task:\n" + f"{task}\n\n# Task Platform\nUbuntu\n\n# Action Space\n{action_space}\n\n" + "# Historical Actions and Current Memory\nHistory:" + ) + + tail_text = ( + "\nMemory:\n" + f"{memory}\n" + "# Output Format\nPlain text explanation with action(param='...')\n" + "Memory:\n[{\"key\": \"value\"}, ...]\n\n# Some Additional Notes\n" + "- I'll give you the most recent 4 history screenshots(shrunked to 50%*50%) along with the historical action steps.\n" + "- You should put the key information you *have to remember* in a seperated memory part and I'll give it to you in the next round." + " The content in this part should be a dict list. If you no longer need some given information, you should remove it from the memory." 
+ " Even if you don't need to remember anything, you should also output an empty list.\n" + "- If elevated privileges are needed, credentials are referenced as .\n" + "- For any mail account interactions, credentials are referenced as .\n\n" + "Current Screenshot:\n" + ) + + history: list[dict[str, Any]] = [] + history_images: list[str] = [] + current_step: list[dict[str, Any]] = [] + step_num = 0 + + for message in messages: + if not isinstance(message, dict): + continue + msg_type = message.get("type") + + if msg_type in {"reasoning", "message", "computer_call", "computer_call_output"}: + current_step.append(message) + + if msg_type == "computer_call_output" and current_step: + step_num += 1 + + bot_thought = "" + action_text = "" + for item in current_step: + if item.get("type") == "message" and item.get("role") == "assistant": + content = item.get("content") or [] + if isinstance(content, list): + for block in content: + if isinstance(block, dict) and block.get("type") == "output_text": + bot_thought = block.get("text", "") + break + if item.get("type") == "computer_call": + action_text = json.dumps(item.get("action", {})) + + history.append({ + "step_num": step_num, + "bot_thought": bot_thought, + "action_text": action_text, + }) + + output = message.get("output") or {} + if isinstance(output, dict) and output.get("type") == "input_image": + url = output.get("image_url") + if isinstance(url, str): + history_images.append(url) + + current_step = [] + + content: list[dict[str, Any]] = [] + current_text = head_text + + total_steps = len(history) + image_tail = min(4, len(history_images)) + + for idx, step in enumerate(history): + step_no = step["step_num"] + bot_thought = step["bot_thought"] + action_text = step["action_text"] + + if idx < total_steps - image_tail: + current_text += ( + f"\nstep {step_no}: Screenshot:(Omitted in context.)" + f" Thought: {bot_thought}\nAction: {action_text}" + ) + else: + current_text += f"\nstep {step_no}: Screenshot:" + content.append({"type": "text", "text": current_text}) + image_idx = idx - (total_steps - image_tail) + if 0 <= image_idx < len(history_images): + content.append({"type": "image_url", "image_url": {"url": history_images[image_idx]}}) + current_text = f" Thought: {bot_thought}\nAction: {action_text}" + + current_text += tail_text + content.append({"type": "text", "text": current_text}) + return content + + +def convert_glm_completion_to_responses_items( + response: ModelResponse, + image_width: int, + image_height: int, + parsed_response: dict[str, str] | None = None, +) -> list[dict[str, Any]]: + items: list[dict[str, Any]] = [] + + if not getattr(response, "choices", None): + return items + + choice = response.choices[0] + message = getattr(choice, "message", None) + if not message: + return items + + content = getattr(message, "content", "") or "" + reasoning_content = getattr(message, "reasoning_content", None) + + if reasoning_content: + items.append(_make_reasoning_item(str(reasoning_content))) + + parsed = parsed_response or parse_glm_response(content) + action = parsed.get("action", "") + action_text = parsed.get("action_text", "") + + if action_text: + clean_text = action_text + if action: + clean_text = clean_text.replace(action, "").strip() + clean_text = re.sub(r"Memory:\s*\[.*?\]\s*$", "", clean_text, flags=re.DOTALL).strip() + if clean_text: + items.append(_make_output_text_item(clean_text)) + + if action: + call_id = _random_id() + handled_json = False + + json_action = _parse_json_action_string(action) + if json_action: + 
json_entries = _convert_json_action_to_items( + json_action, + call_id=call_id, + image_width=image_width, + image_height=image_height, + ) + if json_entries: + items.extend(json_entries) + handled_json = True + + if action.startswith("left_click"): + match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) + if match: + x, y = int(match.group(1)), int(match.group(2)) + actual_x = int((x / 999.0) * image_width) + actual_y = int((y / 999.0) * image_height) + if not handled_json: + items.append(_make_click_item(actual_x, actual_y, call_id=call_id)) + elif action.startswith("right_click"): + match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) + if match: + x, y = int(match.group(1)), int(match.group(2)) + actual_x = int((x / 999.0) * image_width) + actual_y = int((y / 999.0) * image_height) + if not handled_json: + items.append(_make_click_item(actual_x, actual_y, button="right", call_id=call_id)) + elif action.startswith("left_double_click"): + match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) + if match: + x, y = int(match.group(1)), int(match.group(2)) + actual_x = int((x / 999.0) * image_width) + actual_y = int((y / 999.0) * image_height) + if not handled_json: + items.append(_make_double_click_item(actual_x, actual_y, call_id=call_id)) + elif action.startswith("left_drag"): + start_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) + end_match = re.search(r"end_box='?\[(\d+),\s*(\d+)\]'?", action) + if start_match and end_match: + x1, y1 = int(start_match.group(1)), int(start_match.group(2)) + x2, y2 = int(end_match.group(1)), int(end_match.group(2)) + actual_x1 = int((x1 / 999.0) * image_width) + actual_y1 = int((y1 / 999.0) * image_height) + actual_x2 = int((x2 / 999.0) * image_width) + actual_y2 = int((y2 / 999.0) * image_height) + path = [ + {"x": actual_x1, "y": actual_y1}, + {"x": actual_x2, "y": actual_y2}, + ] + if not handled_json: + items.append(_make_drag_item(path, call_id=call_id)) + elif action.startswith("key"): + key_match = re.search(r"keys='([^']+)'", action) + if key_match: + keys = key_match.group(1) + key_list = keys.split("+") if "+" in keys else [keys] + if not handled_json: + items.append(_make_keypress_item(key_list, call_id=call_id)) + elif action.startswith("type"): + content_match = re.search(r"content='([^']*)'", action) + if content_match: + text = content_match.group(1) + if not handled_json: + items.append(_make_type_item(text, call_id=call_id)) + elif action.startswith("scroll"): + coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) + direction_match = re.search(r"direction='([^']+)'", action) + if coord_match and direction_match: + x, y = int(coord_match.group(1)), int(coord_match.group(2)) + direction = direction_match.group(1) + actual_x = int((x / 999.0) * image_width) + actual_y = int((y / 999.0) * image_height) + scroll_x = 0 + scroll_y = 0 + if direction == "up": + scroll_y = -5 + elif direction == "down": + scroll_y = 5 + elif direction == "left": + scroll_x = -5 + elif direction == "right": + scroll_x = 5 + if not handled_json: + items.append(_make_scroll_item(actual_x, actual_y, scroll_x, scroll_y, call_id=call_id)) + elif action == "WAIT()": + if not handled_json: + items.append(_make_wait_item(call_id=call_id)) + + return items + + +def parse_glm_response(response: str) -> dict[str, str]: + pattern = r"<\|begin_of_box\|>(.*?)<\|end_of_box\|>" + match = re.search(pattern, response) + if match: + action = match.group(1).strip() + else: + action_pattern = r"[\w_]+\([^)]*\)" + matches = 
re.findall(action_pattern, response) + action = matches[0] if matches else "" + + memory_pattern = r"Memory:(.*?)$" + memory_match = re.search(memory_pattern, response, re.DOTALL) + memory = memory_match.group(1).strip() if memory_match else "[]" + + action_text_pattern = r"^(.*?)Memory:" + action_text_match = re.search(action_text_pattern, response, re.DOTALL) + action_text = action_text_match.group(1).strip() if action_text_match else response + if action_text: + action_text = action_text.replace("<|begin_of_box|>", "").replace("<|end_of_box|>", "") + + return { + "action": action or "", + "action_text": action_text, + "memory": memory, + } + + + + + + +class Glm45vAgent(MCPAgent): + """LiteLLM-backed GLM-4.5V agent that speaks MCP.""" + + metadata: ClassVar[dict[str, Any]] = { + "display_width": computer_settings.OPENAI_COMPUTER_WIDTH, + "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT, + } + + required_tools: ClassVar[list[str]] = ["openai_computer"] + + def __init__( + self, + *, + model_name: str = "z-ai/glm-4.5v", + completion_kwargs: dict[str, Any] | None = None, + system_prompt: str | None = None, + **agent_kwargs: Any, + ) -> None: + super().__init__(**agent_kwargs) + # Normalize to canonical openrouter// + if not model_name.startswith("openrouter/"): + self.model_name = f"openrouter/{model_name}" + else: + self.model_name = model_name + self.completion_kwargs = completion_kwargs or {} + combined_prompt = DEFAULT_SYSTEM_PROMPT + if system_prompt: + combined_prompt = f"{combined_prompt}\n\n{system_prompt}" + + if self.system_prompt: + self.system_prompt = f"{self.system_prompt}\n\n{combined_prompt}" + else: + self.system_prompt = combined_prompt + self._memory = "[]" + self._last_instruction = "" + self._task_description = "" + + async def get_system_messages(self) -> list[Any]: + return [] + + @instrument(span_type="agent", record_args=False) + async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]: + content_items: list[dict[str, Any]] = [] + text_parts: list[str] = [] + for block in blocks: + if isinstance(block, types.TextContent): + text_parts.append(block.text) + elif isinstance(block, types.ImageContent): + content_items.append( + { + "type": "message", + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:{getattr(block, 'mimeType', 'image/png')};base64,{block.data}", + }, + } + ], + } + ) + + if text_parts: + content_items.insert( + 0, + { + "type": "message", + "role": "user", + "content": [{"type": "input_text", "text": "\n".join(text_parts)}], + }, + ) + + return content_items + + def _glm_tool_call_to_mcp(self, item: dict[str, Any]) -> MCPToolCall: + call_id = item.get("call_id") or _random_id() + action = item.get("action") or {} + action_type = action.get("type", "") + + arguments: dict[str, Any] = {"type": action_type} + for key in ("x", "y", "scroll_x", "scroll_y"): + if key in action: + arguments[key] = action[key] + if "button" in action: + arguments["button"] = action["button"] + if "keys" in action: + arguments["keys"] = action["keys"] + if "text" in action: + arguments["text"] = action["text"] + if "path" in action: + arguments["path"] = action["path"] + + return MCPToolCall(id=call_id, name="openai_computer", arguments=arguments) + + @instrument(span_type="agent", record_args=False) + async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: + instruction = _extract_user_instruction(messages) + if instruction: + self._last_instruction = instruction # type: 
ignore[attr-defined] + self._task_description = instruction + task_instruction = self._task_description or getattr(self, "_last_instruction", "") + + screenshot_b64 = get_last_image_from_messages(messages) + if not screenshot_b64: + call_id = _random_id() + screenshot_call = _make_screenshot_item(call_id) + messages.append(screenshot_call) + logger.debug("glm45v requesting initial screenshot") + tool_call = MCPToolCall( + id=call_id, + name="openai_computer", + arguments={"type": "screenshot"}, + ) + return AgentResponse( + content="capturing initial screenshot", + tool_calls=[tool_call], + done=False, + ) + + self.console.debug(f"glm45v task instruction: {task_instruction}") + self.console.debug(f"glm45v memory (pre-step): {self._memory}") + + prompt_content = convert_responses_items_to_glm45v_pc_prompt( + messages=messages, + task=task_instruction, + memory=self._memory, + ) + prompt_content.append( + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}} + ) + + system_prompt = self.system_prompt or "You are a helpful GUI agent assistant." + litellm_messages = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": prompt_content}, + ] + + api_kwargs = {"model": self.model_name, "messages": litellm_messages} + api_kwargs.update(self.completion_kwargs) + + try: + response = await litellm.acompletion(**api_kwargs) + except Exception as exc: # pragma: no cover - network errors + logger.exception("glm45v completion failed: %s", exc) + return AgentResponse( + content=f"GLM-4.5V request failed: {exc}", + tool_calls=[], + done=True, + isError=True, + ) + + choice = response.choices[0] + message = getattr(choice, "message", None) + response_content = getattr(message, "content", "") if message else "" + parsed = parse_glm_response(response_content or "") if response_content else { + "memory": self._memory, + } + if parsed.get("memory"): + self._memory = parsed["memory"] + logger.debug("glm45v model content: %s", response_content) + trimmed = response_content[:400] if response_content else "" + self.console.debug(f"glm45v model content: {trimmed}") + self.console.debug(f"glm45v parsed response: {parsed}") + + image_width, image_height = _decode_image_dimensions(screenshot_b64) + response_items = convert_glm_completion_to_responses_items( + response, + image_width=image_width, + image_height=image_height, + parsed_response=parsed, + ) + + messages.extend(response_items) + + text_parts: list[str] = [] + reasoning_parts: list[str] = [] + tool_calls: list[MCPToolCall] = [] + + for item in response_items: + if not isinstance(item, dict): + continue + if item.get("type") == "message" and item.get("role") == "assistant": + for block in item.get("content", []) or []: + if isinstance(block, dict) and block.get("type") == "output_text": + text = block.get("text") + if isinstance(text, str): + text_parts.append(text) + elif item.get("type") == "reasoning": + summary = item.get("summary", []) + for block in summary: + if isinstance(block, dict) and block.get("text"): + reasoning_parts.append(block["text"]) + elif item.get("type") == "computer_call": + tool_calls.append(self._glm_tool_call_to_mcp(item)) + + content_text = "\n".join(text_parts).strip() + reasoning_text = "\n".join(reasoning_parts).strip() + + if not tool_calls: + self.console.info_log( + f"glm45v returned no tool calls. 
content='{content_text}' reasoning='{reasoning_text}'" + ) + self.console.info_log(f"glm45v parsed response: {parsed}") + + return AgentResponse( + content=content_text or None, + reasoning=reasoning_text or None, + tool_calls=tool_calls, + done=not tool_calls, + raw=response, + ) + + @instrument(span_type="agent", record_args=False) + async def format_tool_results( + self, + tool_calls: list[MCPToolCall], + tool_results: list[MCPToolResult], + ) -> list[dict[str, Any]]: + rendered: list[dict[str, Any]] = [] + + for call, result in zip(tool_calls, tool_results, strict=False): + call_args = call.arguments or {} + if result.isError: + error_text = "".join( + content.text + for content in result.content + if isinstance(content, types.TextContent) + ) + rendered.extend( + _make_failed_tool_call_items( + tool_name=call_args.get("type", call.name), + tool_kwargs=call_args, + error_message=error_text or "Unknown error", + call_id=call.id, + ) + ) + continue + + screenshot_found = False + for content in result.content: + if isinstance(content, types.ImageContent): + rendered.append( + { + "type": "computer_call_output", + "call_id": call.id, + "output": { + "type": "input_image", + "image_url": f"data:{content.mimeType};base64,{content.data}", + }, + } + ) + screenshot_found = True + break + + text_parts = [ + content.text + for content in result.content + if isinstance(content, types.TextContent) and content.text + ] + if text_parts: + rendered.append( + { + "type": "message", + "role": "user", + "content": [{"type": "input_text", "text": "\n".join(text_parts)}], + } + ) + + if not screenshot_found and not text_parts: + rendered.append( + { + "type": "computer_call_output", + "call_id": call.id, + "output": {"type": "input_text", "text": "Tool executed"}, + } + ) + + return rendered + + +__all__ = ["Glm45vAgent"] diff --git a/hud/agents/openrouter.py b/hud/agents/openrouter.py index 4306a386..c9445258 100644 --- a/hud/agents/openrouter.py +++ b/hud/agents/openrouter.py @@ -1,592 +1,452 @@ -"""OpenRouter agent that uses the Responses API with prompt caching.""" +"""OpenRouter agent facade plus shared tooling helpers.""" from __future__ import annotations +import base64 import json -import logging +import re import uuid -from typing import Any, Iterable +from importlib import import_module +from io import BytesIO +from typing import Any, Dict, Type -import mcp.types as types -from openai import AsyncOpenAI +from PIL import Image -from hud import instrument -from hud.settings import settings -from hud.types import AgentResponse, MCPToolCall, MCPToolResult +from hud.agents.base import MCPAgent +from hud.tools.computer.settings import computer_settings -from .openai_chat_generic import GenericOpenAIChatAgent +# Shared helper utilities for computer-use adapters +def _random_id() -> str: + return f"call_{uuid.uuid4().hex[:8]}" -logger = logging.getLogger(__name__) -_DEFAULT_BASE_URL = "https://openrouter.ai/api/alpha" -_DEFAULT_HEADERS = { - "HTTP-Referer": "https://hud.so", - "X-Title": "HUD Python SDK", - "Accept": "application/json", -} +def _make_reasoning_item(reasoning: str) -> dict[str, Any]: + return { + "id": _random_id(), + "type": "reasoning", + "summary": [{"type": "summary_text", "text": reasoning}], + } -_DEFAULT_COMPLETION_KWARGS: dict[str, Any] = { - "temperature": 0.1, - "max_output_tokens": 1024, -} +def _make_output_text_item(content: str) -> dict[str, Any]: + return { + "id": _random_id(), + "type": "message", + "role": "assistant", + "status": "completed", + "content": [{"type": 
"output_text", "text": content, "annotations": []}], + } -class OpenRouterAgent(GenericOpenAIChatAgent): - """MCP-enabled agent that talks to OpenRouter through the Responses API.""" - - def __init__( - self, - *, - api_key: str | None = None, - base_url: str | None = None, - model_name: str = "z-ai/glm-4.5v", - default_headers: dict[str, str] | None = None, - cache_control: dict[str, Any] | bool | None = True, - cacheable_roles: Iterable[str] | None = None, - openai_client: AsyncOpenAI | None = None, - completion_kwargs: dict[str, Any] | None = None, - **agent_kwargs: Any, - ) -> None: - api_key = api_key or settings.openrouter_api_key - if not api_key: - raise ValueError( - "OpenRouter API key not found. Set OPENROUTER_API_KEY or pass api_key explicitly." - ) - base_url = base_url or _DEFAULT_BASE_URL +def _make_computer_call_item(action: dict[str, Any], call_id: str | None = None) -> dict[str, Any]: + call_id = call_id or _random_id() + return { + "id": _random_id(), + "call_id": call_id, + "type": "computer_call", + "status": "completed", + "pending_safety_checks": [], + "action": action, + } - headers: dict[str, str] = dict(_DEFAULT_HEADERS) - if default_headers: - headers.update(default_headers) - client = openai_client or AsyncOpenAI( - api_key=api_key, - base_url=base_url, - default_headers=headers, - ) +def _make_click_item(x: int, y: int, button: str = "left", call_id: str | None = None) -> dict[str, Any]: + return _make_computer_call_item({"type": "click", "x": x, "y": y, "button": button}, call_id) - super().__init__( - openai_client=client, - model_name=model_name, - completion_kwargs=completion_kwargs, - **agent_kwargs, - ) - self._responses_kwargs = { - "tool_choice": "auto", - **_DEFAULT_COMPLETION_KWARGS, - **dict(self.completion_kwargs), - } - self.completion_kwargs.clear() +def _make_double_click_item(x: int, y: int, call_id: str | None = None) -> dict[str, Any]: + return _make_computer_call_item({"type": "double_click", "x": x, "y": y}, call_id) - self._cache_control = self._normalize_cache_control(cache_control) - self._cacheable_roles = tuple(cacheable_roles or ("system", "user", "tool")) - @staticmethod - def _normalize_cache_control( - cache_control: dict[str, Any] | bool | str | None, - ) -> dict[str, Any] | None: - if cache_control is False: - return None - if cache_control is None: - return {"type": "ephemeral"} - if cache_control is True: - return {"type": "ephemeral"} - if isinstance(cache_control, dict): - return cache_control - return {"type": str(cache_control)} - - def _should_cache(self, role: str) -> bool: - return self._cache_control is not None and role in self._cacheable_roles - - def _text_item(self, text: str, role: str) -> dict[str, Any]: - item: dict[str, Any] = {"type": "input_text", "text": text} - if self._should_cache(role): - item["cache_control"] = self._cache_control - return item - - def _image_item(self, image_payload: Any, role: str) -> dict[str, Any]: - url: str | None = None - detail = None - - if isinstance(image_payload, dict): - # Standard OpenAI-style wrapper - if "image_url" in image_payload and isinstance(image_payload["image_url"], dict): - img = image_payload["image_url"] - url = img.get("url") - detail = img.get("detail") or image_payload.get("detail") - # Direct url / data uri - elif image_payload.get("url"): - url = image_payload.get("url") - detail = image_payload.get("detail") - # Raw base64 payload from computer/tool results - elif image_payload.get("data"): - mime = ( - image_payload.get("mimeType") - or 
image_payload.get("mime_type") - or "image/png" - ) - data = image_payload.get("data") - if data: - url = f"data:{mime};base64,{data}" - detail = image_payload.get("detail") - elif isinstance(image_payload.get("source"), dict): - source = image_payload["source"] - data = source.get("data") - mime = source.get("media_type") or source.get("mime_type") or "image/png" - if data: - url = f"data:{mime};base64,{data}" - detail = source.get("detail") - elif isinstance(image_payload, str): - url = image_payload - - item: dict[str, Any] = {"type": "input_image"} - if url: - item["image_url"] = url - item["detail"] = str(detail or "auto") - if self._should_cache(role): - item["cache_control"] = self._cache_control - return item - - def _convert_message_content(self, role: str, content: Any) -> list[dict[str, Any]]: - if content is None: - return [] - - blocks: list[dict[str, Any]] = [] - if isinstance(content, str): - blocks.append(self._text_item(content, role)) - return blocks - - if isinstance(content, dict): - content = [content] - - if isinstance(content, list): - for entry in content: - if isinstance(entry, str): - blocks.append(self._text_item(entry, role)) - elif isinstance(entry, dict): - entry_copy = dict(entry) - entry_type = entry_copy.get("type") - if entry_type in {"text", "input_text", None}: - text = entry_copy.get("text") or "" - blocks.append(self._text_item(text, role)) - elif entry_type in {"image_url", "input_image"}: - payload = entry_copy.get("image_url", entry_copy.get("image")) or entry_copy - blocks.append(self._image_item(payload, role)) - elif entry_type in {"image", "output_image", "rendered"}: - blocks.append(self._image_item(entry_copy, role)) - elif entry_type == "tool_result": - text = entry_copy.get("text", "") - blocks.append(self._text_item(text, role)) - else: - text_value = entry_copy.get("text") or json.dumps(entry_copy) - blocks.append(self._text_item(text_value, role)) - else: - blocks.append(self._text_item(str(entry), role)) - return blocks - - blocks.append(self._text_item(str(content), role)) - return blocks - - def _convert_messages(self, messages: list[Any]) -> list[dict[str, Any]]: - converted: list[dict[str, Any]] = [] - for message in messages: - if not isinstance(message, dict): - logger.debug("Skipping non-dict message: %s", message) - continue - - if "type" in message and "role" not in message: - converted.append(message) - continue - - role = message.get("role") or "user" - - if role == "assistant" and message.get("tool_calls"): - content_items = self._convert_message_content(role, message.get("content")) - if content_items: - converted.append({"role": "assistant", "content": content_items}) - for tool_call in message.get("tool_calls", []): - converted.append(self._convert_tool_call(tool_call)) - continue - - if role == "tool": - converted.extend(self._convert_tool_message(message)) - continue - - payload: dict[str, Any] = {"role": role} - content_items = self._convert_message_content(role, message.get("content")) - if content_items: - payload["content"] = content_items - if message.get("name"): - payload["name"] = message["name"] - if message.get("metadata"): - payload["metadata"] = message["metadata"] - converted.append(payload) - - return converted +def _make_move_item(x: int, y: int, call_id: str | None = None) -> dict[str, Any]: + return _make_computer_call_item({"type": "move", "x": x, "y": y}, call_id) + + +def _make_drag_item(path: list[dict[str, int]], call_id: str | None = None) -> dict[str, Any]: + return 
_make_computer_call_item({"type": "drag", "path": path}, call_id) + + +def _make_keypress_item(keys: list[str], call_id: str | None = None) -> dict[str, Any]: + return _make_computer_call_item({"type": "keypress", "keys": keys}, call_id) + + +def _make_type_item(text: str, call_id: str | None = None) -> dict[str, Any]: + return _make_computer_call_item({"type": "type", "text": text}, call_id) - @staticmethod - def _jsonify_schema(value: Any) -> Any: - from pydantic import BaseModel - from pydantic.fields import FieldInfo - if isinstance(value, (str, int, float, bool)) or value is None: - return value +def _make_scroll_item( + x: int, + y: int, + scroll_x: int, + scroll_y: int, + call_id: str | None = None, +) -> dict[str, Any]: + action = {"type": "scroll", "x": x, "y": y, "scroll_x": scroll_x, "scroll_y": scroll_y} + return _make_computer_call_item(action, call_id) - if isinstance(value, dict): - return {str(k): OpenRouterAgent._jsonify_schema(v) for k, v in value.items()} - if isinstance(value, (list, tuple, set)): - return [OpenRouterAgent._jsonify_schema(v) for v in value] +def _make_wait_item(call_id: str | None = None) -> dict[str, Any]: + return _make_computer_call_item({"type": "wait"}, call_id) + +def _make_screenshot_item(call_id: str) -> dict[str, Any]: + return _make_computer_call_item({"type": "screenshot"}, call_id) + + +def _make_failed_tool_call_items( + tool_name: str, + tool_kwargs: dict[str, Any], + error_message: str, + call_id: str, +) -> list[dict[str, Any]]: + call = _make_computer_call_item({"type": tool_name, **tool_kwargs}, call_id) + call["status"] = "failed" + failure_text = _make_output_text_item(f"Tool {tool_name} failed: {error_message}") + failure_text["role"] = "assistant" + return [call, failure_text] + + +def _coerce_to_pixel_coordinates( + x_val: Any, + y_val: Any, + *, + width: int, + height: int, +) -> tuple[int, int] | None: + try: + x_float = float(x_val) + y_float = float(y_val) + except (TypeError, ValueError): + return None + + def clamp(value: int, maximum: int) -> int: + return max(0, min(maximum - 1, value)) + + abs_x = abs(x_float) + abs_y = abs(y_float) + if abs_x <= 1.0 and abs_y <= 1.0: + px = int(x_float * width) + py = int(y_float * height) + elif abs_x <= 999.0 and abs_y <= 999.0: + px = int((x_float / 999.0) * width) + py = int((y_float / 999.0) * height) + else: + px = int(x_float) + py = int(y_float) + + return clamp(px, width), clamp(py, height) + + +def _parse_coordinate_box(value: Any) -> tuple[float, float] | None: + if isinstance(value, (list, tuple)) and len(value) >= 2: try: - return json.loads(json.dumps(value)) - except Exception: - if isinstance(value, BaseModel): - return OpenRouterAgent._jsonify_schema(value.model_dump()) - if isinstance(value, FieldInfo): - data: dict[str, Any] = {} - if value.annotation is not None: - data.setdefault( - "type", - getattr(value.annotation, "__name__", str(value.annotation)), - ) - if value.description: - data["description"] = value.description - if value.title: - data["title"] = value.title - if value.default not in (None, Ellipsis): - data["default"] = OpenRouterAgent._jsonify_schema(value.default) - if value.json_schema_extra: - extra = OpenRouterAgent._jsonify_schema(value.json_schema_extra) - if isinstance(extra, dict): - data.update(extra) - return data or str(value) - if hasattr(value, "model_dump"): - return OpenRouterAgent._jsonify_schema(value.model_dump()) - if hasattr(value, "__dict__") and value.__dict__: - return OpenRouterAgent._jsonify_schema( - { - k: v - for k, v in 
value.__dict__.items() - if not k.startswith("_") - } - ) - return str(value) + return float(value[0]), float(value[1]) + except (TypeError, ValueError): + return None - @staticmethod - def _convert_tools_for_responses(tools: list[dict] | None) -> list[dict]: - if not tools: - return [] - - converted: list[dict] = [] - for tool in tools: - if not isinstance(tool, dict): - continue - - if tool.get("type") == "function" and isinstance(tool.get("function"), dict): - fn = tool["function"] - name = fn.get("name") - params = fn.get("parameters", {}) - description = fn.get("description", "") - - if not isinstance(name, str) or not name: - logger.debug("Skipping tool with missing name: %s", tool) - continue - - converted.append( - { - "type": "function", - "name": name, - "description": str(description or ""), - "parameters": OpenRouterAgent._jsonify_schema(params), - } - ) - else: - converted.append(OpenRouterAgent._jsonify_schema(tool)) - - return converted - - def _convert_tool_call(self, tool_call: dict[str, Any]) -> dict[str, Any]: - if not isinstance(tool_call, dict): - return {} - - function = tool_call.get("function") or {} - name = function.get("name") or tool_call.get("name") or "tool_call" - raw_arguments = function.get("arguments") - - if isinstance(raw_arguments, dict): - arguments = json.dumps(self._jsonify_schema(raw_arguments)) - elif isinstance(raw_arguments, str): - try: - parsed = json.loads(raw_arguments) - except json.JSONDecodeError: - arguments = raw_arguments - else: - arguments = json.dumps(self._jsonify_schema(parsed)) - elif raw_arguments is None: - arguments = "{}" + if isinstance(value, str): + stripped = value.strip() + try: + loaded = json.loads(stripped) + except Exception: + matches = re.findall(r"-?\d+(?:\.\d+)?", stripped) + if len(matches) >= 2: + return float(matches[0]), float(matches[1]) else: - arguments = json.dumps(self._jsonify_schema(raw_arguments)) + if isinstance(loaded, (list, tuple)) and len(loaded) >= 2: + try: + return float(loaded[0]), float(loaded[1]) + except (TypeError, ValueError): + return None + return None + + +def _coerce_box_to_pixels( + box: Any, + *, + width: int, + height: int, +) -> tuple[int, int] | None: + coords = _parse_coordinate_box(box) + if not coords: + return None + return _coerce_to_pixel_coordinates(coords[0], coords[1], width=width, height=height) + + +def _parse_json_action_string(action_text: str) -> dict[str, Any] | None: + candidate = action_text.strip() + if not (candidate.startswith("{") and candidate.endswith("}")): + return None + + attempts = [candidate] + if "\\" in candidate: + try: + attempts.append(candidate.encode("utf-8").decode("unicode_escape")) + except Exception: + pass + attempts.append(candidate.replace("\\\"", '"')) - call_id = ( - tool_call.get("id") - or function.get("id") - or function.get("call_id") - or f"call_{uuid.uuid4().hex}" - ) + for attempt in attempts: + try: + return json.loads(attempt) + except Exception: + continue - return { - "type": "function_call", - "id": call_id, - "name": name, - "arguments": arguments or "{}", - } - - def _convert_tool_message(self, message: dict[str, Any]) -> list[dict[str, Any]]: - entries: list[dict[str, Any]] = [] - call_id = message.get("tool_call_id") or message.get("id") or f"call_{uuid.uuid4().hex}" - - text_parts: list[str] = [] - image_payloads: list[Any] = [] - - content = message.get("content") - if isinstance(content, list): - for item in content: - if isinstance(item, dict): - item_type = item.get("type") - if item_type in {"text", "input_text"} 
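The retry ladder in `_parse_json_action_string` exists because the model sometimes emits doubly-escaped JSON; both forms below should parse to the same action (values are illustrative):

    assert _parse_json_action_string('{"type": "click", "start_box": [100, 200]}') == {
        "type": "click",
        "start_box": [100, 200],
    }
    # Backslash-escaped quotes fail the first json.loads but succeed after unescaping.
    assert _parse_json_action_string('{\\"type\\": \\"wait\\"}') == {"type": "wait"}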
and item.get("text"): - text_parts.append(str(item.get("text"))) - elif item_type in {"image", "input_image", "image_url", "output_image", "rendered"}: - image_payloads.append(item) - elif isinstance(item, str): - text_parts.append(item) - elif isinstance(content, str): - text_parts.append(content) - - structured = message.get("structuredContent") - if structured and not text_parts: - try: - text_parts.append(json.dumps(structured)) - except Exception: - text_parts.append(str(structured)) - - output_text = "\n".join(part for part in text_parts if part) or "" - - entries.append( - { - "type": "function_call_output", - "id": message.get("id") or call_id, - "call_id": call_id, - "output": output_text, - } - ) + return None - for payload in image_payloads: - entries.append( - { - "role": "user", - "content": [self._image_item(payload, "user")], - } - ) +def _convert_json_action_to_items( + json_action: dict[str, Any], + *, + call_id: str, + image_width: int, + image_height: int, +) -> list[dict[str, Any]]: + entries: list[dict[str, Any]] = [] + action_type = str(json_action.get("type", "")).lower() + if not action_type: return entries - async def format_tool_results( - self, - tool_calls: list[MCPToolCall], - tool_results: list[MCPToolResult], - ) -> list[dict[str, Any]]: - converted: list[dict[str, Any]] = [] - - for call, result in zip(tool_calls, tool_results, strict=False): - call_id = call.id or call.name or f"call_{uuid.uuid4().hex}" - - text_parts: list[str] = [] - image_payloads: list[Any] = [] - - for item in result.content or []: - if isinstance(item, types.TextContent): - text_parts.append(item.text) - elif isinstance(item, types.ImageContent): - image_payloads.append( - { - "mimeType": item.mimeType, - "data": item.data, - "detail": getattr(item, "detail", None), - } - ) - elif isinstance(item, dict): - if item.get("type") in {"text", "input_text"}: - text_parts.append(str(item.get("text", ""))) - elif item.get("type") in {"image", "input_image", "image_url", "output_image", "rendered"}: - image_payloads.append(item) - elif isinstance(item, str): - text_parts.append(item) - - if result.structuredContent and not text_parts: - try: - text_parts.append(json.dumps(result.structuredContent)) - except Exception: - text_parts.append(str(result.structuredContent)) - - if getattr(result, "isError", False): - text_parts.append(getattr(result, "error", "Tool execution failed.")) - - output_text = "\n".join(part for part in text_parts if part) or "" - - converted.append( - { - "type": "function_call_output", - "id": call_id, - "call_id": call_id, - "output": output_text, - } + if action_type in {"type", "text"}: + text_value = json_action.get("content") or json_action.get("text") or "" + if text_value: + entries.append(_make_type_item(str(text_value), call_id=call_id)) + elif action_type in {"click", "left_click"}: + start_box = ( + json_action.get("start_box") + or json_action.get("startBox") + or json_action.get("position") + ) + coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) + if not coords and json_action.get("x") is not None and json_action.get("y") is not None: + coords = _coerce_to_pixel_coordinates( + json_action.get("x"), + json_action.get("y"), + width=image_width, + height=image_height, + ) + if coords: + button = str(json_action.get("button", "left") or "left").lower() + entries.append(_make_click_item(coords[0], coords[1], button=button, call_id=call_id)) + elif action_type in {"right_click", "middle_click"}: + start_box = 
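Putting the pieces together for the common case, a GLM click on the 0-999 grid comes out as one pixel-space `computer_call` item (the frame size here is an assumption):

    items = _convert_json_action_to_items(
        {"type": "click", "start_box": [500, 500]},
        call_id="call_demo",
        image_width=1920,
        image_height=1080,
    )
    assert items[0]["action"] == {"type": "click", "x": 960, "y": 540, "button": "left"}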
json_action.get("start_box") or json_action.get("startBox") + coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) + if not coords and json_action.get("x") is not None and json_action.get("y") is not None: + coords = _coerce_to_pixel_coordinates( + json_action.get("x"), + json_action.get("y"), + width=image_width, + height=image_height, + ) + if coords: + button = "right" if action_type == "right_click" else "middle" + entries.append(_make_click_item(coords[0], coords[1], button=button, call_id=call_id)) + elif action_type in {"double_click", "left_double_click"}: + start_box = json_action.get("start_box") or json_action.get("startBox") + coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) + if not coords and json_action.get("x") is not None and json_action.get("y") is not None: + coords = _coerce_to_pixel_coordinates( + json_action.get("x"), + json_action.get("y"), + width=image_width, + height=image_height, ) + if coords: + entries.append(_make_double_click_item(coords[0], coords[1], call_id=call_id)) + elif action_type in {"drag", "left_drag"}: + start_box = json_action.get("start_box") or json_action.get("startBox") + end_box = json_action.get("end_box") or json_action.get("endBox") + start_coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) + end_coords = _coerce_box_to_pixels(end_box, width=image_width, height=image_height) + if not start_coords and json_action.get("x") is not None and json_action.get("y") is not None: + start_coords = _coerce_to_pixel_coordinates( + json_action.get("x"), + json_action.get("y"), + width=image_width, + height=image_height, + ) + if start_coords and end_coords: + path = [ + {"x": start_coords[0], "y": start_coords[1]}, + {"x": end_coords[0], "y": end_coords[1]}, + ] + entries.append(_make_drag_item(path, call_id=call_id)) + elif action_type == "scroll": + start_box = json_action.get("start_box") or json_action.get("startBox") + coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) + if not coords and json_action.get("x") is not None and json_action.get("y") is not None: + coords = _coerce_to_pixel_coordinates( + json_action.get("x"), + json_action.get("y"), + width=image_width, + height=image_height, + ) + direction = str(json_action.get("direction", "")).lower() + step = int(json_action.get("step", 5) or 5) + if coords: + scroll_x = 0 + scroll_y = 0 + if direction == "up": + scroll_y = -abs(step) + elif direction == "down": + scroll_y = abs(step) + elif direction == "left": + scroll_x = -abs(step) + elif direction == "right": + scroll_x = abs(step) + entries.append( + _make_scroll_item(coords[0], coords[1], scroll_x, scroll_y, call_id=call_id) + ) + elif action_type in {"hover", "move"}: + target_box = ( + json_action.get("start_box") + or json_action.get("startBox") + or json_action.get("position") + ) + coords = _coerce_box_to_pixels(target_box, width=image_width, height=image_height) + if not coords and json_action.get("x") is not None and json_action.get("y") is not None: + coords = _coerce_to_pixel_coordinates( + json_action.get("x"), + json_action.get("y"), + width=image_width, + height=image_height, + ) + if coords: + entries.append(_make_move_item(coords[0], coords[1], call_id=call_id)) + elif action_type in {"keypress", "key", "key_press"}: + keys = json_action.get("keys") + key_list: list[str] = [] + if isinstance(keys, str): + key_list = [segment.strip() for segment in keys.split("+") if segment.strip()] + elif 
isinstance(keys, list): + key_list = [str(segment).strip() for segment in keys if str(segment).strip()] + if key_list: + entries.append(_make_keypress_item(key_list, call_id=call_id)) + elif action_type == "wait": + entries.append(_make_wait_item(call_id=call_id)) + elif action_type == "screenshot": + entries.append(_make_screenshot_item(call_id)) + + return entries + + +def _decode_image_dimensions(image_b64: str) -> tuple[int, int]: + try: + data = base64.b64decode(image_b64) + with Image.open(BytesIO(data)) as img: + return img.size + except Exception: # pragma: no cover - defensive fallback + return computer_settings.OPENAI_COMPUTER_WIDTH, computer_settings.OPENAI_COMPUTER_HEIGHT + + +def _extract_user_instruction(messages: list[dict[str, Any]]) -> str: + for message in messages: + if not isinstance(message, dict): + continue + if message.get("type") == "message" and message.get("role") == "user": + content = message.get("content") or [] + if isinstance(content, list): + for block in content: + if isinstance(block, dict) and block.get("type") in {"text", "input_text"}: + text = block.get("text") + if isinstance(text, str) and text.strip(): + return text.strip() + return "" + + +def get_last_image_from_messages(messages: list[dict[str, Any]]) -> str | None: + for message in reversed(messages): + if not isinstance(message, dict): + continue + msg_type = message.get("type") + if msg_type == "computer_call_output": + output = message.get("output") or {} + if isinstance(output, dict): + image_url = output.get("image_url") + if isinstance(image_url, str) and image_url.startswith("data:image/"): + return image_url.split(",", 1)[1] + if msg_type == "message" and message.get("role") == "user": + content = message.get("content") + if isinstance(content, list): + for item in reversed(content): + if isinstance(item, dict) and item.get("type") == "image_url": + url_obj = item.get("image_url") + if isinstance(url_obj, dict): + url = url_obj.get("url") + if isinstance(url, str) and url.startswith("data:image/"): + return url.split(",", 1)[1] + return None + +# Adapter dispatch +_ADAPTER_REGISTRY: Dict[str, str] = { + "z-ai/glm-4.5v": "hud.agents.glm45v:Glm45vAgent", +} - for payload in image_payloads: - converted.append( - { - "role": "user", - "content": [self._image_item(payload, "user")], - } - ) - return converted +def _load_adapter(path: str) -> Type[MCPAgent]: + module_name, class_name = path.split(":", 1) + module = import_module(module_name) + return getattr(module, class_name) - @staticmethod - def _parse_arguments(arguments: Any) -> dict[str, Any]: - if isinstance(arguments, dict): - return arguments - if isinstance(arguments, str) and arguments: - try: - parsed = json.loads(arguments) - if isinstance(parsed, dict): - return parsed - except json.JSONDecodeError: - logger.debug("Failed to decode arguments: %s", arguments) - return {} - - def _to_mcp_tool_call(self, payload: dict[str, Any]) -> MCPToolCall: - tool_name = payload.get("name") or payload.get("function", {}).get("name") or "" - call_id = payload.get("id") or payload.get("tool_call_id") or payload.get("call_id") - if not call_id: - call_id = tool_name - arguments = payload.get("arguments") - if not arguments and "function" in payload: - arguments = payload["function"].get("arguments") - parsed_arguments = self._parse_arguments(arguments) - return MCPToolCall(id=call_id, name=tool_name, arguments=parsed_arguments) - - def _coerce_response_payload(self, response: Any) -> dict[str, Any]: - """Convert OpenRouter SDK return types into a 
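`get_last_image_from_messages` walks the transcript backwards and strips the data-URI prefix from the most recent screenshot; a small self-check (the base64 payload is a stand-in):

    messages = [
        {"type": "message", "role": "user",
         "content": [{"type": "input_text", "text": "click the button"}]},
        {"type": "computer_call_output", "call_id": "call_1",
         "output": {"type": "input_image",
                    "image_url": "data:image/png;base64,AAAA"}},
    ]
    assert get_last_image_from_messages(messages) == "AAAA"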
plain dictionary.""" - - if response is None: - return {} - - if isinstance(response, dict): - return response - - for attr in ("model_dump", "dict", "to_dict"): - if hasattr(response, attr): - try: - payload = getattr(response, attr)() - except Exception as exc: # pragma: no cover - defensive - logger.debug("Failed to read response via %s: %s", attr, exc) - else: - if isinstance(payload, dict): - return payload - - snapshot = getattr(response, "__dict__", None) - if isinstance(snapshot, dict): - return snapshot - - logger.error("Unexpected response carrier from OpenRouter: %r", response) - raise TypeError("Unexpected response type from OpenRouter") - - def _extract_response(self, response: Any) -> AgentResponse: - data = self._coerce_response_payload(response) - if not isinstance(data, dict): - raise TypeError("Unexpected response type from OpenRouter") - - output = data.get("output", []) - text_parts: list[str] = [] - tool_calls: list[MCPToolCall] = [] - reasoning_parts: list[str] = [] - - for item in output: - item_type = item.get("type") if isinstance(item, dict) else None - if item_type == "message": - contents = item.get("content", []) - if isinstance(contents, list): - for block in contents: - if not isinstance(block, dict): - continue - block_type = block.get("type") - if block_type in {"output_text", "text"}: - text = block.get("text") - if text: - text_parts.append(text) - elif block_type == "reasoning" and block.get("text"): - reasoning_parts.append(block["text"]) - for tc in item.get("tool_calls", []) or []: - if isinstance(tc, dict): - tool_calls.append(self._to_mcp_tool_call(tc)) - elif item_type in {"tool_call", "function_call"} and isinstance(item, dict): - tool_calls.append(self._to_mcp_tool_call(item)) - elif item_type == "reasoning" and isinstance(item, dict): - summary = item.get("summary") - if isinstance(summary, list): - for block in summary: - if isinstance(block, dict) and block.get("text"): - reasoning_parts.append(block["text"]) - elif isinstance(summary, str): - reasoning_parts.append(summary) - - merged_text = "\n".join(reasoning_parts + text_parts).strip() - status = data.get("status", "completed") - done = not tool_calls and status != "in_progress" - return AgentResponse( - content=merged_text, - tool_calls=tool_calls, - done=done, - raw=response, - ) - @instrument( - span_type="agent", - record_args=False, - record_result=True, - ) - async def get_response(self, messages: list[Any]) -> AgentResponse: - converted_messages = self._convert_messages(messages) - tools = self._convert_tools_for_responses(self.get_tool_schemas()) - - protected_keys = {"model", "input", "tools"} - extra = {k: v for k, v in self._responses_kwargs.items() if k not in protected_keys} - # If tools are provided and tool_choice isn't explicitly set, require tool use - if tools and "tool_choice" not in extra: - extra["tool_choice"] = "required" +class OpenRouterAgent: + """Dispatch wrapper that selects the correct OpenRouter adapter by model.""" + def __init__(self, *, model_name: str = "z-ai/glm-4.5v", **kwargs: Any) -> None: + normalized = self._normalize_model_name(model_name) try: - payload: dict[str, Any] = { - "model": self.model_name, - "input": converted_messages, - **extra, - } - if tools: - payload["tools"] = tools - - response = await self.oai.responses.create(**payload) - except Exception as exc: - error_content = f"Error getting response {exc}" - logger.exception("OpenRouter call failed: %s", exc) - return AgentResponse( - content=error_content, - tool_calls=[], - done=True, - 
isError=True, - raw=None, - ) + adapter_path = _ADAPTER_REGISTRY[normalized] + except KeyError as exc: # pragma: no cover - defensive + raise ValueError(f"Unsupported OpenRouter model: {model_name}") from exc + + adapter_cls = _load_adapter(adapter_path) + canonical_model = f"openrouter/{normalized}" + self.model_name = canonical_model + self._adapter = adapter_cls(model_name=canonical_model, **kwargs) - return self._extract_response(response) + @staticmethod + def _normalize_model_name(raw_model: str | None) -> str: + if not raw_model: + raise ValueError("Model name must be provided for OpenRouterAgent") + key = raw_model.strip() + if key.startswith("openrouter/"): + key = key[len("openrouter/") :] + key = key.lower() + if key in _ADAPTER_REGISTRY: + return key + raise ValueError(f"Unknown OpenRouter model: {raw_model}") + + def __getattr__(self, item: str) -> Any: + return getattr(self._adapter, item) + + def __dir__(self) -> list[str]: + base_dir = set(super().__dir__()) + base_dir.update(self.__dict__.keys()) + base_dir.update(dir(self._adapter)) + return sorted(base_dir) + + +__all__ = [ + "OpenRouterAgent", + "_random_id", + "_make_reasoning_item", + "_make_output_text_item", + "_make_computer_call_item", + "_make_click_item", + "_make_double_click_item", + "_make_drag_item", + "_make_keypress_item", + "_make_type_item", + "_make_scroll_item", + "_make_wait_item", + "_make_screenshot_item", + "_make_failed_tool_call_items", + "_coerce_to_pixel_coordinates", + "_parse_coordinate_box", + "_coerce_box_to_pixels", + "_parse_json_action_string", + "_convert_json_action_to_items", + "_decode_image_dimensions", + "_extract_user_instruction", + "get_last_image_from_messages", +] diff --git a/hud/agents/tests/test_openrouter.py b/hud/agents/tests/test_openrouter.py index d3010e0d..7328586e 100644 --- a/hud/agents/tests/test_openrouter.py +++ b/hud/agents/tests/test_openrouter.py @@ -1,205 +1,94 @@ from __future__ import annotations import pytest -from unittest.mock import AsyncMock, MagicMock -import mcp.types as types +from types import SimpleNamespace +from typing import Any -from hud.agents.openrouter import OpenRouterAgent -from hud.settings import settings -from hud.types import MCPToolCall, MCPToolResult +def _import_agents(): + import mcp.types as types + from hud.agents.glm45v import Glm45vAgent + from hud.agents.openrouter import OpenRouterAgent + from hud.types import MCPToolResult + return Glm45vAgent, OpenRouterAgent, MCPToolResult, types -@pytest.fixture(autouse=True) -def disable_telemetry(monkeypatch: pytest.MonkeyPatch) -> None: - """Disable HUD telemetry during unit tests.""" - monkeypatch.setattr(settings, "telemetry_enabled", False) - monkeypatch.setattr(settings, "api_key", None) +def test_openrouter_agent_defaults_to_glm45v() -> None: + Glm45vAgent, OpenRouterAgent, _, _ = _import_agents() + agent = OpenRouterAgent() + assert isinstance(agent._adapter, Glm45vAgent) + assert agent.model_name == "openrouter/z-ai/glm-4.5v" -class FakeResponse: - def __init__(self, payload: dict) -> None: - self._payload = payload +def test_openrouter_agent_normalizes_alias() -> None: + _, OpenRouterAgent, _, _ = _import_agents() + agent = OpenRouterAgent(model_name="Z-AI/GLM-4.5V") + assert agent.model_name == "openrouter/z-ai/glm-4.5v" - def model_dump(self) -> dict: - return self._payload - -@pytest.mark.asyncio -async def test_openrouter_agent_builds_cached_messages() -> None: - responses_create = AsyncMock( - return_value=FakeResponse({"output": [{"type": "message", "content": []}], 
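Because `__getattr__` forwards anything not defined on the facade to the wrapped adapter, callers keep the full MCPAgent surface while the facade only pins the canonical model name. A short usage sketch, mirroring what the tests below assert:

    agent = OpenRouterAgent(model_name="z-ai/glm-4.5v")
    assert agent.model_name == "openrouter/z-ai/glm-4.5v"  # canonicalized on the facade
    # run(), get_response(), format_tool_results(), ... all resolve on the
    # underlying Glm45vAgent via __getattr__.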
"status": "completed"}) - ) - mock_client = MagicMock() - mock_client.responses.create = responses_create - - agent = OpenRouterAgent( - api_key="test-key", - openai_client=mock_client, - cache_control={"type": "ephemeral"}, - ) - agent._available_tools = [] # mimic initialized agent - - messages = [ - {"role": "system", "content": "You are helpful."}, - {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, - {"role": "assistant", "content": "Previous reply"}, - ] - - await agent.get_response(messages) - - await_call = responses_create.await_args - assert await_call is not None - kwargs = await_call.kwargs - assert kwargs["model"] == agent.model_name - input_payload = kwargs["input"] - - system_block = input_payload[0]["content"][0] - user_block = input_payload[1]["content"][0] - assistant_block = input_payload[2]["content"][0] - - assert system_block["cache_control"] == {"type": "ephemeral"} - assert user_block["cache_control"] == {"type": "ephemeral"} - assert "cache_control" not in assistant_block +def test_openrouter_agent_rejects_unknown_model() -> None: + _, OpenRouterAgent, _, _ = _import_agents() + with pytest.raises(ValueError): + OpenRouterAgent(model_name="unknown/model") @pytest.mark.asyncio -async def test_openrouter_agent_parses_tool_calls() -> None: - responses_create = AsyncMock( - return_value=FakeResponse( - { - "output": [ - { - "type": "message", - "content": [{"type": "output_text", "text": "Calling tool"}], - "tool_calls": [ - { - "id": "call_1", - "function": {"name": "search", "arguments": "{\"query\": \"hud\"}"}, - } - ], - } - ], - "status": "requires_action", - } - ) - ) - mock_client = MagicMock() - mock_client.responses.create = responses_create +async def test_openrouter_agent_parses_tool_calls(monkeypatch: pytest.MonkeyPatch) -> None: + Glm45vAgent, OpenRouterAgent, MCPToolResult, types = _import_agents() + png_base64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO61uFYAAAAASUVORK5CYII=" - agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) - agent._available_tools = [] + async def fake_completion(*_: Any, **__: Any) -> Any: + message = SimpleNamespace(content=( + "I will click the button.\n" + "<|begin_of_box|>{\"type\": \"click\", \"start_box\": [100, 200]}<|end_of_box|>\n" + "Memory:[]" + ), reasoning_content=None) + choice = SimpleNamespace(message=message) + return SimpleNamespace(choices=[choice]) - result = await agent.get_response( - [ - {"role": "system", "content": "You are helpful."}, - {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, - ] - ) + monkeypatch.setattr("hud.agents.glm45v.litellm.acompletion", fake_completion) - assert not result.done - assert result.tool_calls[0].name == "search" - assert result.tool_calls[0].arguments == {"query": "hud"} - - -@pytest.mark.asyncio -async def test_openrouter_agent_returns_text_response() -> None: - responses_create = AsyncMock( - return_value=FakeResponse( - { - "output": [ - { - "type": "message", - "content": [{"type": "output_text", "text": "Hi there"}], - } - ], - "status": "completed", - } - ) - ) - mock_client = MagicMock() - mock_client.responses.create = responses_create + agent = OpenRouterAgent(model_name="z-ai/glm-4.5v") - agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) - agent._available_tools = [] - - result = await agent.get_response( - [ - {"role": "system", "content": "You are helpful."}, - {"role": "user", "content": [{"type": "text", "text": "Hello"}]}, - ] - ) - - assert result.done - 
assert result.content == "Hi there" - assert result.tool_calls == [] - - -def test_openrouter_agent_sanitizes_fieldinfo_in_tools() -> None: - mock_client = MagicMock() - agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) - - from pydantic import Field - - tools = [ + messages: list[dict[str, Any]] = [ { - "type": "function", - "function": { - "name": "click", - "description": "Click an element", - "parameters": { - "type": "object", - "properties": { - "selector": Field(default="", description="CSS selector"), - }, - "required": ["selector"], - }, - }, - } - ] - - converted = agent._convert_tools_for_responses(tools) - selector_schema = converted[0]["parameters"]["properties"]["selector"] - assert isinstance(selector_schema, dict) - assert selector_schema.get("description") == "CSS selector" - - -def test_openrouter_agent_converts_image_blocks() -> None: - mock_client = MagicMock() - agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) - - content = [ + "type": "message", + "role": "user", + "content": [{"type": "input_text", "text": "click the highlighted cell"}], + }, { - "type": "image", - "mimeType": "image/png", - "data": "dGVzdA==", - "detail": "high", - } + "type": "computer_call_output", + "call_id": "initial", + "output": { + "type": "input_image", + "image_url": f"data:image/png;base64,{png_base64}", + }, + }, ] - message_blocks = agent._convert_messages([{"role": "user", "content": content}]) - image_block = message_blocks[0]["content"][0] - assert image_block["type"] == "input_image" - assert image_block["image_url"].startswith("data:image/png;base64,") - assert image_block["detail"] == "high" + response = await agent.get_response(list(messages)) + assert not response.done + assert response.tool_calls, "expected at least one tool call" -@pytest.mark.asyncio -async def test_format_tool_results_produces_function_call_output() -> None: - mock_client = MagicMock() - agent = OpenRouterAgent(api_key="test-key", openai_client=mock_client) + tool_call = response.tool_calls[0] + assert tool_call.name == "openai_computer" + assert tool_call.arguments["type"] == "click" + # coordinates are normalized from the 1x1 PNG back to pixel space -> 0/0 + assert tool_call.arguments["x"] == 0 + assert tool_call.arguments["y"] == 0 - tool_call = MCPToolCall(id="call-1", name="playwright", arguments={}) tool_result = MCPToolResult( content=[ - types.TextContent(type="text", text="navigation complete"), - types.ImageContent(type="image", data="dGVzdA==", mimeType="image/png"), + types.ImageContent(type="image", data=png_base64, mimeType="image/png"), + types.TextContent(type="text", text="button pressed"), ] ) - formatted = await agent.format_tool_results([tool_call], [tool_result]) + rendered = await agent.format_tool_results([tool_call], [tool_result]) - assert formatted[0]["type"] == "function_call_output" - assert formatted[0]["call_id"] == "call-1" - assert formatted[1]["role"] == "user" - assert formatted[1]["content"][0]["type"] == "input_image" + assert any(item.get("type") == "computer_call_output" for item in rendered) + assert any( + item.get("type") == "message" and item.get("role") == "user" + for item in rendered + ) diff --git a/hud/cli/__init__.py b/hud/cli/__init__.py index 99771913..c1701f5c 100644 --- a/hud/cli/__init__.py +++ b/hud/cli/__init__.py @@ -894,7 +894,7 @@ def eval( [ {"name": "Claude 4 Sonnet", "value": "claude"}, {"name": "OpenAI Computer Use", "value": "openai"}, - {"name": "OpenRouter (Responses)", "value": "openrouter"}, + 
{"name": "OpenRouter", "value": "openrouter"}, {"name": "vLLM (Local Server)", "value": "vllm"}, {"name": "LiteLLM (Multi-provider)", "value": "litellm"}, ] diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 4900ba85..2b63222d 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -191,7 +191,7 @@ def build_agent( raise typer.Exit(1) from e return OpenRouterAgent( - model_name=model or "z-ai/glm-4.6", + model_name=model or "z-ai/glm-4.5v", allowed_tools=allowed_tools, verbose=verbose, ) @@ -568,7 +568,7 @@ def eval_command( "claude", "--agent", help=( - "Agent backend to use (claude, openai computer use, openrouter responses, " + "Agent backend to use (claude, openai computer use, openrouter, " "vllm for local server, or litellm)" ), ), diff --git a/hud/utils/agent_factories.py b/hud/utils/agent_factories.py index 37b9fa7a..f42248a4 100644 --- a/hud/utils/agent_factories.py +++ b/hud/utils/agent_factories.py @@ -88,13 +88,4 @@ def create_grounded_agent(**kwargs: Any) -> GroundedOpenAIChatAgent: def create_openrouter_agent(**kwargs: Any) -> OpenRouterAgent: """Factory for OpenRouterAgent with run_dataset compatibility.""" - api_key = kwargs.pop("api_key", None) - base_url = kwargs.pop("base_url", None) - cache_control = kwargs.pop("cache_control", True) - - return OpenRouterAgent( - api_key=api_key, - base_url=base_url, - cache_control=cache_control, - **kwargs, - ) + return OpenRouterAgent(**kwargs) diff --git a/pyproject.toml b/pyproject.toml index dc6c77b4..0cfc9dfa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -135,7 +135,7 @@ dev = [ "langchain", "langchain-openai", "langchain-anthropic", - "litellm>=1.55.0", + "litellm", # Jupyter support "ipykernel", "ipython <9", From 45fe54e9f59f1fe1cc1f1b792eefebe98b115c62 Mon Sep 17 00:00:00 2001 From: shinbehavior Date: Sat, 11 Oct 2025 23:13:49 +0200 Subject: [PATCH 03/14] eval run_full_dataset fix --- hud/cli/eval.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/hud/cli/eval.py b/hud/cli/eval.py index 2b63222d..7719e84d 100644 --- a/hud/cli/eval.py +++ b/hud/cli/eval.py @@ -479,6 +479,39 @@ async def run_full_dataset( if allowed_tools: agent_config["allowed_tools"] = allowed_tools + elif agent_type == "openrouter": + try: + # Use adapter class directly so it satisfies type[MCPAgent] + from hud.agents.openrouter import ( + OpenRouterAgent, + _ADAPTER_REGISTRY, + _load_adapter, + ) + except ImportError as e: + hud_console.error( + "OpenRouter agent dependencies are not installed. 
" + "Please install with: pip install 'hud-python[agent]'" + ) + raise typer.Exit(1) from e + + # Normalize model and resolve adapter + raw_model = model or "z-ai/glm-4.5v" + try: + normalized = OpenRouterAgent._normalize_model_name(raw_model) + adapter_path = _ADAPTER_REGISTRY[normalized] + except Exception as e: + hud_console.error(f"Unsupported OpenRouter model: {raw_model}") + raise typer.Exit(1) from e + + adapter_cls = _load_adapter(adapter_path) + agent_class = adapter_cls + agent_config = { + "model_name": f"openrouter/{normalized}", + "verbose": verbose, + } + if allowed_tools: + agent_config["allowed_tools"] = allowed_tools + else: try: from hud.agents import ClaudeAgent From beb181618fc8a6fcbdf892ba3fdd4a2b28ddaf93 Mon Sep 17 00:00:00 2001 From: ilya <95108691+shfunc@users.noreply.github.com> Date: Mon, 13 Oct 2025 09:07:25 +0200 Subject: [PATCH 04/14] Update pyproject.toml, litellm version fix --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0cfc9dfa..dc6c77b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -135,7 +135,7 @@ dev = [ "langchain", "langchain-openai", "langchain-anthropic", - "litellm", + "litellm>=1.55.0", # Jupyter support "ipykernel", "ipython <9", From b8b149fd54f02d5fe10c41678655973061aaf48f Mon Sep 17 00:00:00 2001 From: shfunc Date: Tue, 14 Oct 2025 13:26:12 +0200 Subject: [PATCH 05/14] refactor openrouter/glm45v, trim actions --- hud/agents/openrouter.py | 113 ++++--- .../openrouter/models/glm45v/action_space.txt | 188 ++++++++++++ .../{ => openrouter/models/glm45v}/glm45v.py | 281 ++---------------- 3 files changed, 267 insertions(+), 315 deletions(-) create mode 100644 hud/agents/openrouter/models/glm45v/action_space.txt rename hud/agents/{ => openrouter/models/glm45v}/glm45v.py (73%) diff --git a/hud/agents/openrouter.py b/hud/agents/openrouter.py index c9445258..5312bd2d 100644 --- a/hud/agents/openrouter.py +++ b/hud/agents/openrouter.py @@ -7,6 +7,8 @@ import re import uuid from importlib import import_module +import importlib.util +from pathlib import Path from io import BytesIO from typing import Any, Dict, Type @@ -19,7 +21,6 @@ def _random_id() -> str: return f"call_{uuid.uuid4().hex[:8]}" - def _make_reasoning_item(reasoning: str) -> dict[str, Any]: return { "id": _random_id(), @@ -27,7 +28,6 @@ def _make_reasoning_item(reasoning: str) -> dict[str, Any]: "summary": [{"type": "summary_text", "text": reasoning}], } - def _make_output_text_item(content: str) -> dict[str, Any]: return { "id": _random_id(), @@ -37,7 +37,6 @@ def _make_output_text_item(content: str) -> dict[str, Any]: "content": [{"type": "output_text", "text": content, "annotations": []}], } - def _make_computer_call_item(action: dict[str, Any], call_id: str | None = None) -> dict[str, Any]: call_id = call_id or _random_id() return { @@ -49,31 +48,21 @@ def _make_computer_call_item(action: dict[str, Any], call_id: str | None = None) "action": action, } - def _make_click_item(x: int, y: int, button: str = "left", call_id: str | None = None) -> dict[str, Any]: return _make_computer_call_item({"type": "click", "x": x, "y": y, "button": button}, call_id) - def _make_double_click_item(x: int, y: int, call_id: str | None = None) -> dict[str, Any]: return _make_computer_call_item({"type": "double_click", "x": x, "y": y}, call_id) - -def _make_move_item(x: int, y: int, call_id: str | None = None) -> dict[str, Any]: - return _make_computer_call_item({"type": "move", "x": x, "y": y}, call_id) - - def 
_make_drag_item(path: list[dict[str, int]], call_id: str | None = None) -> dict[str, Any]: return _make_computer_call_item({"type": "drag", "path": path}, call_id) - def _make_keypress_item(keys: list[str], call_id: str | None = None) -> dict[str, Any]: return _make_computer_call_item({"type": "keypress", "keys": keys}, call_id) - def _make_type_item(text: str, call_id: str | None = None) -> dict[str, Any]: return _make_computer_call_item({"type": "type", "text": text}, call_id) - def _make_scroll_item( x: int, y: int, @@ -84,15 +73,12 @@ def _make_scroll_item( action = {"type": "scroll", "x": x, "y": y, "scroll_x": scroll_x, "scroll_y": scroll_y} return _make_computer_call_item(action, call_id) - def _make_wait_item(call_id: str | None = None) -> dict[str, Any]: return _make_computer_call_item({"type": "wait"}, call_id) - def _make_screenshot_item(call_id: str) -> dict[str, Any]: return _make_computer_call_item({"type": "screenshot"}, call_id) - def _make_failed_tool_call_items( tool_name: str, tool_kwargs: dict[str, Any], @@ -105,7 +91,6 @@ def _make_failed_tool_call_items( failure_text["role"] = "assistant" return [call, failure_text] - def _coerce_to_pixel_coordinates( x_val: Any, y_val: Any, @@ -136,7 +121,6 @@ def clamp(value: int, maximum: int) -> int: return clamp(px, width), clamp(py, height) - def _parse_coordinate_box(value: Any) -> tuple[float, float] | None: if isinstance(value, (list, tuple)) and len(value) >= 2: try: @@ -160,7 +144,6 @@ def _parse_coordinate_box(value: Any) -> tuple[float, float] | None: return None return None - def _coerce_box_to_pixels( box: Any, *, @@ -172,7 +155,6 @@ def _coerce_box_to_pixels( return None return _coerce_to_pixel_coordinates(coords[0], coords[1], width=width, height=height) - def _parse_json_action_string(action_text: str) -> dict[str, Any] | None: candidate = action_text.strip() if not (candidate.startswith("{") and candidate.endswith("}")): @@ -194,7 +176,6 @@ def _parse_json_action_string(action_text: str) -> dict[str, Any] | None: return None - def _convert_json_action_to_items( json_action: dict[str, Any], *, @@ -203,11 +184,32 @@ def _convert_json_action_to_items( image_height: int, ) -> list[dict[str, Any]]: entries: list[dict[str, Any]] = [] - action_type = str(json_action.get("type", "")).lower() + + action_type = str(json_action.get("type", json_action.get("action_type", ""))).lower() if not action_type: return entries - if action_type in {"type", "text"}: + def box2d_center_pixels(box2d: Any) -> tuple[int, int] | None: + try: + if isinstance(box2d, str): + parsed = json.loads(box2d) + else: + parsed = box2d + if isinstance(parsed, list) and len(parsed) >= 1: + first = parsed[0] + if isinstance(first, (list, tuple)) and len(first) >= 4: + xmin, ymin, xmax, ymax = float(first[0]), float(first[1]), float(first[2]), float(first[3]) + cx = (xmin + xmax) / 2.0 + cy = (ymin + ymax) / 2.0 + # interpret as 0-999 normalized + px = int((cx / 999.0) * image_width) + py = int((cy / 999.0) * image_height) + return px, py + except Exception: + return None + return None + + if action_type in {"type", "text", "input_text"}: text_value = json_action.get("content") or json_action.get("text") or "" if text_value: entries.append(_make_type_item(str(text_value), call_id=call_id)) @@ -218,6 +220,8 @@ def _convert_json_action_to_items( or json_action.get("position") ) coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) + if not coords and json_action.get("box_2d") is not None: + coords = 
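`box2d_center_pixels` reads `box_2d` as `[[xmin, ymin, xmax, ymax]]` on the 0-999 grid and returns the midpoint in pixels, so a click can be grounded by bounding box alone (the frame size below is an assumption):

    items = _convert_json_action_to_items(
        {"type": "click", "box_2d": [[400, 300, 600, 500]]},
        call_id="call_demo",
        image_width=1920,
        image_height=1080,
    )
    # centre (500, 400) on the 0-999 grid -> int(500/999*1920), int(400/999*1080)
    assert items[0]["action"] == {"type": "click", "x": 960, "y": 432, "button": "left"}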
box2d_center_pixels(json_action.get("box_2d")) if not coords and json_action.get("x") is not None and json_action.get("y") is not None: coords = _coerce_to_pixel_coordinates( json_action.get("x"), @@ -228,22 +232,11 @@ def _convert_json_action_to_items( if coords: button = str(json_action.get("button", "left") or "left").lower() entries.append(_make_click_item(coords[0], coords[1], button=button, call_id=call_id)) - elif action_type in {"right_click", "middle_click"}: - start_box = json_action.get("start_box") or json_action.get("startBox") - coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) - if not coords and json_action.get("x") is not None and json_action.get("y") is not None: - coords = _coerce_to_pixel_coordinates( - json_action.get("x"), - json_action.get("y"), - width=image_width, - height=image_height, - ) - if coords: - button = "right" if action_type == "right_click" else "middle" - entries.append(_make_click_item(coords[0], coords[1], button=button, call_id=call_id)) elif action_type in {"double_click", "left_double_click"}: start_box = json_action.get("start_box") or json_action.get("startBox") coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) + if not coords and json_action.get("box_2d") is not None: + coords = box2d_center_pixels(json_action.get("box_2d")) if not coords and json_action.get("x") is not None and json_action.get("y") is not None: coords = _coerce_to_pixel_coordinates( json_action.get("x"), @@ -258,6 +251,10 @@ def _convert_json_action_to_items( end_box = json_action.get("end_box") or json_action.get("endBox") start_coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) end_coords = _coerce_box_to_pixels(end_box, width=image_width, height=image_height) + if not start_coords and json_action.get("box_2d") is not None: + start_coords = box2d_center_pixels(json_action.get("box_2d")) + if not end_coords and json_action.get("end_box_2d") is not None: + end_coords = box2d_center_pixels(json_action.get("end_box_2d")) if not start_coords and json_action.get("x") is not None and json_action.get("y") is not None: start_coords = _coerce_to_pixel_coordinates( json_action.get("x"), @@ -274,6 +271,8 @@ def _convert_json_action_to_items( elif action_type == "scroll": start_box = json_action.get("start_box") or json_action.get("startBox") coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) + if not coords and json_action.get("box_2d") is not None: + coords = box2d_center_pixels(json_action.get("box_2d")) if not coords and json_action.get("x") is not None and json_action.get("y") is not None: coords = _coerce_to_pixel_coordinates( json_action.get("x"), @@ -297,22 +296,7 @@ def _convert_json_action_to_items( entries.append( _make_scroll_item(coords[0], coords[1], scroll_x, scroll_y, call_id=call_id) ) - elif action_type in {"hover", "move"}: - target_box = ( - json_action.get("start_box") - or json_action.get("startBox") - or json_action.get("position") - ) - coords = _coerce_box_to_pixels(target_box, width=image_width, height=image_height) - if not coords and json_action.get("x") is not None and json_action.get("y") is not None: - coords = _coerce_to_pixel_coordinates( - json_action.get("x"), - json_action.get("y"), - width=image_width, - height=image_height, - ) - if coords: - entries.append(_make_move_item(coords[0], coords[1], call_id=call_id)) + # hover/move dropped in minimal action surface elif action_type in {"keypress", "key", "key_press"}: keys = 
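Drags resolve both endpoints before emitting a two-point path; with a hypothetical 1000x1000 frame the 0-999 grid maps almost one-to-one:

    items = _convert_json_action_to_items(
        {"type": "drag", "start_box": [100, 100], "end_box": [800, 800]},
        call_id="call_demo",
        image_width=1000,
        image_height=1000,
    )
    assert items[0]["action"] == {
        "type": "drag",
        "path": [{"x": 100, "y": 100}, {"x": 800, "y": 800}],
    }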
json_action.get("keys") key_list: list[str] = [] @@ -324,8 +308,6 @@ def _convert_json_action_to_items( entries.append(_make_keypress_item(key_list, call_id=call_id)) elif action_type == "wait": entries.append(_make_wait_item(call_id=call_id)) - elif action_type == "screenshot": - entries.append(_make_screenshot_item(call_id)) return entries @@ -379,16 +361,29 @@ def get_last_image_from_messages(messages: list[dict[str, Any]]) -> str | None: # Adapter dispatch _ADAPTER_REGISTRY: Dict[str, str] = { - "z-ai/glm-4.5v": "hud.agents.glm45v:Glm45vAgent", + "z-ai/glm-4.5v": "hud.agents.openrouter.models.glm45v.glm45v:Glm45vAgent", } - def _load_adapter(path: str) -> Type[MCPAgent]: module_name, class_name = path.split(":", 1) - module = import_module(module_name) + try: + module = import_module(module_name) + except ModuleNotFoundError: + here = Path(__file__).resolve() + # e.g., models/glm45v/glm45v.py + parts = module_name.split(".models.") + if len(parts) == 2: + rel = parts[1].replace(".", "/") + ".py" + candidate = here.with_name("openrouter") / "models" / Path(rel) + if candidate.exists(): + spec = importlib.util.spec_from_file_location("hud.agents._adapter", str(candidate)) + if spec and spec.loader: + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return getattr(mod, class_name) + raise return getattr(module, class_name) - class OpenRouterAgent: """Dispatch wrapper that selects the correct OpenRouter adapter by model.""" @@ -425,7 +420,6 @@ def __dir__(self) -> list[str]: base_dir.update(dir(self._adapter)) return sorted(base_dir) - __all__ = [ "OpenRouterAgent", "_random_id", @@ -438,7 +432,6 @@ def __dir__(self) -> list[str]: "_make_keypress_item", "_make_type_item", "_make_scroll_item", - "_make_wait_item", "_make_screenshot_item", "_make_failed_tool_call_items", "_coerce_to_pixel_coordinates", diff --git a/hud/agents/openrouter/models/glm45v/action_space.txt b/hud/agents/openrouter/models/glm45v/action_space.txt new file mode 100644 index 00000000..908cd718 --- /dev/null +++ b/hud/agents/openrouter/models/glm45v/action_space.txt @@ -0,0 +1,188 @@ +### left_click + +Call rule: `left_click(start_box='[x,y]')` +{ + 'name': 'left_click', + 'description': 'Perform a left mouse click at the specified coordinates on the screen.', + 'parameters': { + 'type': 'object', + 'properties': { + 'start_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Coordinates [x,y] where to perform the click, normalized to 0-999 range.' + } + }, + 'required': ['start_box'] + } +} + +### left_double_click + +Call rule: `left_double_click(start_box='[x,y]', element_info='')` +{ + 'name': 'left_double_click', + 'description': 'Perform a left mouse double-click at the specified coordinates on the screen.', + 'parameters': { + 'type': 'object', + 'properties': { + 'start_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Coordinates [x,y] where to perform the double-click, normalized to 0-999 range.' + }, + 'element_info': { + 'type': 'string', + 'description': 'Optional text description of the UI element being double-clicked.' 
+ } + }, + 'required': ['start_box'] + } +} + +### left_drag + +Call rule: `left_drag(start_box='[x1,y1]', end_box='[x2,y2]', element_info='')` +{ + 'name': 'left_drag', + 'description': 'Drag the mouse from starting coordinates to ending coordinates while holding the left mouse button.', + 'parameters': { + 'type': 'object', + 'properties': { + 'start_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Starting coordinates [x1,y1] for the drag operation, normalized to 0-999 range.' + }, + 'end_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Ending coordinates [x2,y2] for the drag operation, normalized to 0-999 range.' + }, + 'element_info': { + 'type': 'string', + 'description': 'Optional text description of the UI element being dragged.' + } + }, + 'required': ['start_box', 'end_box'] + } +} + +### key + +Call rule: `key(keys='')` +{ + 'name': 'key', + 'description': 'Simulate pressing a single key or combination of keys on the keyboard.', + 'parameters': { + 'type': 'object', + 'properties': { + 'keys': { + 'type': 'string', + 'description': "The key or key combination to press. Use '+' to separate keys in combinations (e.g., 'ctrl+c', 'alt+tab')." + } + }, + 'required': ['keys'] + } +} + +### type + +Call rule: `type(content='')` +{ + 'name': 'type', + 'description': 'Type text content into the currently focused text input field. This action only performs typing and does not handle field activation or clearing.', + 'parameters': { + 'type': 'object', + 'properties': { + 'content': { + 'type': 'string', + 'description': 'The text content to be typed into the active text field.' + } + }, + 'required': ['content'] + } +} + +### scroll + +Call rule: `scroll(start_box='[x,y]', direction='', step=5, element_info='')` +{ + 'name': 'scroll', + 'description': 'Scroll an element at the specified coordinates in the specified direction by a given number of wheel steps.', + 'parameters': { + 'type': 'object', + 'properties': { + 'start_box': { + 'type': 'array', + 'items': { + 'type': 'integer' + }, + 'description': 'Coordinates [x,y] of the element or area to scroll, normalized to 0-999 range.' + }, + 'direction': { + 'type': 'string', + 'enum': ['down', 'up'], + 'description': "The direction to scroll: 'down' or 'up'." + }, + 'step': { + 'type': 'integer', + 'default': 5, + 'description': 'Number of wheel steps to scroll, default is 5.' + }, + 'element_info': { + 'type': 'string', + 'description': 'Optional text description of the UI element being scrolled.' 
+ } + }, + 'required': ['start_box', 'direction'] + } +} + +### WAIT + +Call rule: `WAIT()` +{ + 'name': 'WAIT', + 'description': 'Wait for 5 seconds before proceeding to the next action.', + 'parameters': { + 'type': 'object', + 'properties': {}, + 'required': [] + } +} + +### DONE + +Call rule: `DONE()` +{ + 'name': 'DONE', + 'description': 'Indicate that the current task has been completed successfully and no further actions are needed.', + 'parameters': { + 'type': 'object', + 'properties': {}, + 'required': [] + } +} + +### FAIL + +Call rule: `FAIL()` +{ + 'name': 'FAIL', + 'description': 'Indicate that the current task cannot be completed or is impossible to accomplish.', + 'parameters': { + 'type': 'object', + 'properties': {}, + 'required': [] + } +} + diff --git a/hud/agents/glm45v.py b/hud/agents/openrouter/models/glm45v/glm45v.py similarity index 73% rename from hud/agents/glm45v.py rename to hud/agents/openrouter/models/glm45v/glm45v.py index e7ff0fdc..1e517d37 100644 --- a/hud/agents/glm45v.py +++ b/hud/agents/openrouter/models/glm45v/glm45v.py @@ -6,12 +6,14 @@ import logging import re from typing import Any, ClassVar +from pathlib import Path import litellm import mcp.types as types from litellm.types.utils import ModelResponse from hud.agents.base import MCPAgent +from hud.settings import settings from hud.tools.computer.settings import computer_settings from hud.types import AgentResponse, MCPToolCall, MCPToolResult from hud import instrument @@ -37,245 +39,26 @@ logger = logging.getLogger(__name__) +def _load_text_resource(path: str | Path) -> str | None: + try: + p = Path(path) + with p.open("r", encoding="utf-8") as f: + return f.read() + except Exception: + return None -DEFAULT_SYSTEM_PROMPT = """ -You are an autonomous computer-using agent. Follow these guidelines: - -1. Do not ask for permission; act decisively to finish the task. -2. Always ground actions in the latest screenshot and task instructions. -3. Use the provided mouse/keyboard tools precisely (coordinates are 0-999). -4. Keep memory concise—store only facts that matter for later steps. -5. When the task is complete, reply with DONE() and include the final answer. -6. If the task is impossible, reply with FAIL() and explain briefly. -""".strip() - - -GLM_ACTION_SPACE = """ -### {left,right,middle}_click - -Call rule: `{left,right,middle}_click(start_box='[x,y]', element_info='')` -{ - 'name': ['left_click', 'right_click', 'middle_click'], - 'description': 'Perform a left/right/middle mouse click at the specified coordinates on the screen.', - 'parameters': { - 'type': 'object', - 'properties': { - 'start_box': { - 'type': 'array', - 'items': { - 'type': 'integer' - }, - 'description': 'Coordinates [x,y] where to perform the click, normalized to 0-999 range.' - }, - 'element_info': { - 'type': 'string', - 'description': 'Optional text description of the UI element being clicked.' - } - }, - 'required': ['start_box'] - } -} - -### hover - -Call rule: `hover(start_box='[x,y]', element_info='')` -{ - 'name': 'hover', - 'description': 'Move the mouse pointer to the specified coordinates without performing any click action.', - 'parameters': { - 'type': 'object', - 'properties': { - 'start_box': { - 'type': 'array', - 'items': { - 'type': 'integer' - }, - 'description': 'Coordinates [x,y] where to move the mouse pointer, normalized to 0-999 range.' - }, - 'element_info': { - 'type': 'string', - 'description': 'Optional text description of the UI element being hovered over.' 
- } - }, - 'required': ['start_box'] - } -} - -### left_double_click - -Call rule: `left_double_click(start_box='[x,y]', element_info='')` -{ - 'name': 'left_double_click', - 'description': 'Perform a left mouse double-click at the specified coordinates on the screen.', - 'parameters': { - 'type': 'object', - 'properties': { - 'start_box': { - 'type': 'array', - 'items': { - 'type': 'integer' - }, - 'description': 'Coordinates [x,y] where to perform the double-click, normalized to 0-999 range.' - }, - 'element_info': { - 'type': 'string', - 'description': 'Optional text description of the UI element being double-clicked.' - } - }, - 'required': ['start_box'] - } -} - -### left_drag - -Call rule: `left_drag(start_box='[x1,y1]', end_box='[x2,y2]', element_info='')` -{ - 'name': 'left_drag', - 'description': 'Drag the mouse from starting coordinates to ending coordinates while holding the left mouse button.', - 'parameters': { - 'type': 'object', - 'properties': { - 'start_box': { - 'type': 'array', - 'items': { - 'type': 'integer' - }, - 'description': 'Starting coordinates [x1,y1] for the drag operation, normalized to 0-999 range.' - }, - 'end_box': { - 'type': 'array', - 'items': { - 'type': 'integer' - }, - 'description': 'Ending coordinates [x2,y2] for the drag operation, normalized to 0-999 range.' - }, - 'element_info': { - 'type': 'string', - 'description': 'Optional text description of the UI element being dragged.' - } - }, - 'required': ['start_box', 'end_box'] - } -} - -### key - -Call rule: `key(keys='')` -{ - 'name': 'key', - 'description': 'Simulate pressing a single key or combination of keys on the keyboard.', - 'parameters': { - 'type': 'object', - 'properties': { - 'keys': { - 'type': 'string', - 'description': "The key or key combination to press. Use '+' to separate keys in combinations (e.g., 'ctrl+c', 'alt+tab')." - } - }, - 'required': ['keys'] - } -} - -### type - -Call rule: `type(content='')` -{ - 'name': 'type', - 'description': 'Type text content into the currently focused text input field. This action only performs typing and does not handle field activation or clearing.', - 'parameters': { - 'type': 'object', - 'properties': { - 'content': { - 'type': 'string', - 'description': 'The text content to be typed into the active text field.' - } - }, - 'required': ['content'] - } -} - -### scroll - -Call rule: `scroll(start_box='[x,y]', direction='', step=5, element_info='')` -{ - 'name': 'scroll', - 'description': 'Scroll an element at the specified coordinates in the specified direction by a given number of wheel steps.', - 'parameters': { - 'type': 'object', - 'properties': { - 'start_box': { - 'type': 'array', - 'items': { - 'type': 'integer' - }, - 'description': 'Coordinates [x,y] of the element or area to scroll, normalized to 0-999 range.' - }, - 'direction': { - 'type': 'string', - 'enum': ['down', 'up'], - 'description': "The direction to scroll: 'down' or 'up'." - }, - 'step': { - 'type': 'integer', - 'default': 5, - 'description': 'Number of wheel steps to scroll, default is 5.' - }, - 'element_info': { - 'type': 'string', - 'description': 'Optional text description of the UI element being scrolled.' 
- } - }, - 'required': ['start_box', 'direction'] - } -} - -### WAIT - -Call rule: `WAIT()` -{ - 'name': 'WAIT', - 'description': 'Wait for 5 seconds before proceeding to the next action.', - 'parameters': { - 'type': 'object', - 'properties': {}, - 'required': [] - } -} - -### DONE - -Call rule: `DONE()` -{ - 'name': 'DONE', - 'description': 'Indicate that the current task has been completed successfully and no further actions are needed.', - 'parameters': { - 'type': 'object', - 'properties': {}, - 'required': [] - } -} - -### FAIL - -Call rule: `FAIL()` -{ - 'name': 'FAIL', - 'description': 'Indicate that the current task cannot be completed or is impossible to accomplish.', - 'parameters': { - 'type': 'object', - 'properties': {}, - 'required': [] - } -}""" - +_BASE_DIR = Path(__file__).resolve().parent +_ACTION_SPACE_PATH = _BASE_DIR / "action_space.txt" +GLM_ACTION_SPACE = _load_text_resource(_ACTION_SPACE_PATH) or "" +if not GLM_ACTION_SPACE.strip(): + raise RuntimeError(f"Missing action space file at {_ACTION_SPACE_PATH}") def convert_responses_items_to_glm45v_pc_prompt( messages: list[dict[str, Any]], task: str, memory: str = "[]", ) -> list[dict[str, Any]]: - action_space = GLM_ACTION_SPACE head_text = ( "You are a GUI Agent, and your primary task is to respond accurately to user" " requests or questions. In addition to directly answering the user's queries," @@ -285,7 +68,7 @@ def convert_responses_items_to_glm45v_pc_prompt( " thinking and reflection when appropriate. The coordinates involved are all" " represented in thousandths (0-999)." "\n\n# Task:\n" - f"{task}\n\n# Task Platform\nUbuntu\n\n# Action Space\n{action_space}\n\n" + f"{task}\n\n# Task Platform\nUbuntu\n\n# Action Space\n{GLM_ACTION_SPACE}\n\n" "# Historical Actions and Current Memory\nHistory:" ) @@ -294,7 +77,7 @@ def convert_responses_items_to_glm45v_pc_prompt( f"{memory}\n" "# Output Format\nPlain text explanation with action(param='...')\n" "Memory:\n[{\"key\": \"value\"}, ...]\n\n# Some Additional Notes\n" - "- I'll give you the most recent 4 history screenshots(shrunked to 50%*50%) along with the historical action steps.\n" + "- I'll give you the most recent history screenshots(shrunked to 50%*50%) along with the historical action steps.\n" "- You should put the key information you *have to remember* in a seperated memory part and I'll give it to you in the next round." " The content in this part should be a dict list. If you no longer need some given information, you should remove it from the memory." 
" Even if you don't need to remember anything, you should also output an empty list.\n" @@ -350,7 +133,7 @@ def convert_responses_items_to_glm45v_pc_prompt( current_text = head_text total_steps = len(history) - image_tail = min(4, len(history_images)) + image_tail = min(2, len(history_images)) for idx, step in enumerate(history): step_no = step["step_num"] @@ -531,11 +314,6 @@ def parse_glm_response(response: str) -> dict[str, str]: "memory": memory, } - - - - - class Glm45vAgent(MCPAgent): """LiteLLM-backed GLM-4.5V agent that speaks MCP.""" @@ -555,20 +333,12 @@ def __init__( **agent_kwargs: Any, ) -> None: super().__init__(**agent_kwargs) - # Normalize to canonical openrouter// - if not model_name.startswith("openrouter/"): - self.model_name = f"openrouter/{model_name}" - else: - self.model_name = model_name + self.model_name = model_name self.completion_kwargs = completion_kwargs or {} - combined_prompt = DEFAULT_SYSTEM_PROMPT if system_prompt: - combined_prompt = f"{combined_prompt}\n\n{system_prompt}" - - if self.system_prompt: - self.system_prompt = f"{self.system_prompt}\n\n{combined_prompt}" + self.system_prompt = system_prompt else: - self.system_prompt = combined_prompt + self.system_prompt = "" self._memory = "[]" self._last_instruction = "" self._task_description = "" @@ -668,13 +438,14 @@ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"}} ) - system_prompt = self.system_prompt or "You are a helpful GUI agent assistant." - litellm_messages = [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": prompt_content}, - ] + litellm_messages: list[dict[str, Any]] = [] + if getattr(self, "system_prompt", None): + litellm_messages.append({"role": "system", "content": self.system_prompt}) + litellm_messages.append({"role": "user", "content": prompt_content}) api_kwargs = {"model": self.model_name, "messages": litellm_messages} + if settings.openrouter_api_key: + api_kwargs["api_key"] = settings.openrouter_api_key api_kwargs.update(self.completion_kwargs) try: From dba903d0b2dd642a87ff556e13c265b65cb2d426 Mon Sep 17 00:00:00 2001 From: shfunc Date: Tue, 14 Oct 2025 14:48:30 +0200 Subject: [PATCH 06/14] latest little refactor --- hud/agents/openrouter.py | 93 ++++------- hud/agents/openrouter/models/glm45v/glm45v.py | 151 +++++++----------- 2 files changed, 90 insertions(+), 154 deletions(-) diff --git a/hud/agents/openrouter.py b/hud/agents/openrouter.py index 5312bd2d..9360ae81 100644 --- a/hud/agents/openrouter.py +++ b/hud/agents/openrouter.py @@ -178,87 +178,62 @@ def _parse_json_action_string(action_text: str) -> dict[str, Any] | None: def _convert_json_action_to_items( json_action: dict[str, Any], - *, call_id: str, image_width: int, image_height: int, ) -> list[dict[str, Any]]: - entries: list[dict[str, Any]] = [] + items: list[dict[str, Any]] = [] action_type = str(json_action.get("type", json_action.get("action_type", ""))).lower() if not action_type: - return entries - - def box2d_center_pixels(box2d: Any) -> tuple[int, int] | None: - try: - if isinstance(box2d, str): - parsed = json.loads(box2d) - else: - parsed = box2d - if isinstance(parsed, list) and len(parsed) >= 1: - first = parsed[0] - if isinstance(first, (list, tuple)) and len(first) >= 4: - xmin, ymin, xmax, ymax = float(first[0]), float(first[1]), float(first[2]), float(first[3]) - cx = (xmin + xmax) / 2.0 - cy = (ymin + ymax) / 2.0 - # interpret as 0-999 normalized - px = 
int((cx / 999.0) * image_width) - py = int((cy / 999.0) * image_height) - return px, py - except Exception: - return None - return None + return items if action_type in {"type", "text", "input_text"}: text_value = json_action.get("content") or json_action.get("text") or "" if text_value: - entries.append(_make_type_item(str(text_value), call_id=call_id)) - elif action_type in {"click", "left_click"}: - start_box = ( - json_action.get("start_box") - or json_action.get("startBox") - or json_action.get("position") - ) + items.append(_make_type_item(str(text_value), call_id=call_id)) + elif action_type in {"click", "left_click", "right_click"}: + # Handle both "start_box" and the new "start_x"/"start_y" format + start_box = json_action.get("start_box") or json_action.get("startBox") coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) - if not coords and json_action.get("box_2d") is not None: - coords = box2d_center_pixels(json_action.get("box_2d")) - if not coords and json_action.get("x") is not None and json_action.get("y") is not None: - coords = _coerce_to_pixel_coordinates( - json_action.get("x"), - json_action.get("y"), + if not coords: + coords = _coerce_to_pixel_coordinates( + json_action.get("start_x") or json_action.get("x"), + json_action.get("start_y") or json_action.get("y"), width=image_width, height=image_height, ) if coords: button = str(json_action.get("button", "left") or "left").lower() - entries.append(_make_click_item(coords[0], coords[1], button=button, call_id=call_id)) + items.append(_make_click_item(coords[0], coords[1], button=button, call_id=call_id)) elif action_type in {"double_click", "left_double_click"}: start_box = json_action.get("start_box") or json_action.get("startBox") coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) - if not coords and json_action.get("box_2d") is not None: - coords = box2d_center_pixels(json_action.get("box_2d")) - if not coords and json_action.get("x") is not None and json_action.get("y") is not None: + if not coords: coords = _coerce_to_pixel_coordinates( - json_action.get("x"), - json_action.get("y"), + json_action.get("start_x") or json_action.get("x"), + json_action.get("start_y") or json_action.get("y"), width=image_width, height=image_height, ) if coords: - entries.append(_make_double_click_item(coords[0], coords[1], call_id=call_id)) + items.append(_make_double_click_item(coords[0], coords[1], call_id=call_id)) elif action_type in {"drag", "left_drag"}: start_box = json_action.get("start_box") or json_action.get("startBox") end_box = json_action.get("end_box") or json_action.get("endBox") start_coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) end_coords = _coerce_box_to_pixels(end_box, width=image_width, height=image_height) - if not start_coords and json_action.get("box_2d") is not None: - start_coords = box2d_center_pixels(json_action.get("box_2d")) - if not end_coords and json_action.get("end_box_2d") is not None: - end_coords = box2d_center_pixels(json_action.get("end_box_2d")) - if not start_coords and json_action.get("x") is not None and json_action.get("y") is not None: + if not start_coords: start_coords = _coerce_to_pixel_coordinates( - json_action.get("x"), - json_action.get("y"), + json_action.get("start_x") or json_action.get("x"), + json_action.get("start_y") or json_action.get("y"), + width=image_width, + height=image_height, + ) + if not end_coords: + end_coords = _coerce_to_pixel_coordinates( + json_action.get("end_x"), 
+ json_action.get("end_y"), width=image_width, height=image_height, ) @@ -267,16 +242,14 @@ def box2d_center_pixels(box2d: Any) -> tuple[int, int] | None: {"x": start_coords[0], "y": start_coords[1]}, {"x": end_coords[0], "y": end_coords[1]}, ] - entries.append(_make_drag_item(path, call_id=call_id)) + items.append(_make_drag_item(path, call_id=call_id)) elif action_type == "scroll": start_box = json_action.get("start_box") or json_action.get("startBox") coords = _coerce_box_to_pixels(start_box, width=image_width, height=image_height) - if not coords and json_action.get("box_2d") is not None: - coords = box2d_center_pixels(json_action.get("box_2d")) - if not coords and json_action.get("x") is not None and json_action.get("y") is not None: + if not coords: coords = _coerce_to_pixel_coordinates( - json_action.get("x"), - json_action.get("y"), + json_action.get("start_x") or json_action.get("x"), + json_action.get("start_y") or json_action.get("y"), width=image_width, height=image_height, ) @@ -293,7 +266,7 @@ def box2d_center_pixels(box2d: Any) -> tuple[int, int] | None: scroll_x = -abs(step) elif direction == "right": scroll_x = abs(step) - entries.append( + items.append( _make_scroll_item(coords[0], coords[1], scroll_x, scroll_y, call_id=call_id) ) # hover/move dropped in minimal action surface @@ -305,11 +278,11 @@ def box2d_center_pixels(box2d: Any) -> tuple[int, int] | None: elif isinstance(keys, list): key_list = [str(segment).strip() for segment in keys if str(segment).strip()] if key_list: - entries.append(_make_keypress_item(key_list, call_id=call_id)) + items.append(_make_keypress_item(key_list, call_id=call_id)) elif action_type == "wait": - entries.append(_make_wait_item(call_id=call_id)) + items.append(_make_wait_item(call_id=call_id)) - return entries + return items def _decode_image_dimensions(image_b64: str) -> tuple[int, int]: diff --git a/hud/agents/openrouter/models/glm45v/glm45v.py b/hud/agents/openrouter/models/glm45v/glm45v.py index 1e517d37..b7e03c04 100644 --- a/hud/agents/openrouter/models/glm45v/glm45v.py +++ b/hud/agents/openrouter/models/glm45v/glm45v.py @@ -21,17 +21,10 @@ _convert_json_action_to_items, _decode_image_dimensions, _extract_user_instruction, - _make_click_item, - _make_double_click_item, - _make_drag_item, _make_failed_tool_call_items, - _make_keypress_item, _make_output_text_item, _make_reasoning_item, _make_screenshot_item, - _make_scroll_item, - _make_type_item, - _make_wait_item, _parse_json_action_string, _random_id, get_last_image_from_messages, @@ -91,7 +84,8 @@ def convert_responses_items_to_glm45v_pc_prompt( current_step: list[dict[str, Any]] = [] step_num = 0 - for message in messages: + # Optimization: Limit history to last 10 messages to improve performance + for message in messages[-10:]: if not isinstance(message, dict): continue msg_type = message.get("type") @@ -157,6 +151,48 @@ def convert_responses_items_to_glm45v_pc_prompt( content.append({"type": "text", "text": current_text}) return content +def _parse_string_action_to_dict(action: str) -> dict[str, Any]: + """Converts GLM's string-based action output to a structured dictionary.""" + if action.startswith("left_click"): + match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) + if match: return {"type": "click", "button": "left", "start_box": [match.group(1), match.group(2)]} + elif action.startswith("right_click"): + match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) + if match: return {"type": "click", "button": "right", "start_box": [match.group(1), 
match.group(2)]} + elif action.startswith("left_double_click"): + match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) + if match: return {"type": "double_click", "start_box": [match.group(1), match.group(2)]} + elif action.startswith("left_drag"): + start_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) + end_match = re.search(r"end_box='?\[(\d+),\s*(\d+)\]'?", action) + if start_match and end_match: + return { + "type": "drag", + "start_box": [start_match.group(1), start_match.group(2)], + "end_box": [end_match.group(1), end_match.group(2)], + } + elif action.startswith("key"): + key_match = re.search(r"keys='([^']+)'", action) + if key_match: + keys = key_match.group(1) + key_list = keys.split("+") if "+" in keys else [keys] + return {"type": "keypress", "keys": key_list} + elif action.startswith("type"): + content_match = re.search(r"content='([^']*)'", action) + if content_match: return {"type": "type", "content": content_match.group(1)} + elif action.startswith("scroll"): + coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) + direction_match = re.search(r"direction='([^']+)'", action) + if coord_match and direction_match: + return { + "type": "scroll", + "start_box": [coord_match.group(1), coord_match.group(2)], + "direction": direction_match.group(1), + } + elif action == "WAIT()": + return {"type": "wait"} + return {} + def convert_glm_completion_to_responses_items( response: ModelResponse, @@ -194,9 +230,11 @@ def convert_glm_completion_to_responses_items( if action: call_id = _random_id() - handled_json = False json_action = _parse_json_action_string(action) + if not json_action: + json_action = _parse_string_action_to_dict(action) + if json_action: json_entries = _convert_json_action_to_items( json_action, @@ -206,97 +244,22 @@ def convert_glm_completion_to_responses_items( ) if json_entries: items.extend(json_entries) - handled_json = True - - if action.startswith("left_click"): - match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) - if match: - x, y = int(match.group(1)), int(match.group(2)) - actual_x = int((x / 999.0) * image_width) - actual_y = int((y / 999.0) * image_height) - if not handled_json: - items.append(_make_click_item(actual_x, actual_y, call_id=call_id)) - elif action.startswith("right_click"): - match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) - if match: - x, y = int(match.group(1)), int(match.group(2)) - actual_x = int((x / 999.0) * image_width) - actual_y = int((y / 999.0) * image_height) - if not handled_json: - items.append(_make_click_item(actual_x, actual_y, button="right", call_id=call_id)) - elif action.startswith("left_double_click"): - match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) - if match: - x, y = int(match.group(1)), int(match.group(2)) - actual_x = int((x / 999.0) * image_width) - actual_y = int((y / 999.0) * image_height) - if not handled_json: - items.append(_make_double_click_item(actual_x, actual_y, call_id=call_id)) - elif action.startswith("left_drag"): - start_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) - end_match = re.search(r"end_box='?\[(\d+),\s*(\d+)\]'?", action) - if start_match and end_match: - x1, y1 = int(start_match.group(1)), int(start_match.group(2)) - x2, y2 = int(end_match.group(1)), int(end_match.group(2)) - actual_x1 = int((x1 / 999.0) * image_width) - actual_y1 = int((y1 / 999.0) * image_height) - actual_x2 = int((x2 / 999.0) * image_width) - actual_y2 = int((y2 / 999.0) * image_height) - path = [ - {"x": actual_x1, 
"y": actual_y1}, - {"x": actual_x2, "y": actual_y2}, - ] - if not handled_json: - items.append(_make_drag_item(path, call_id=call_id)) - elif action.startswith("key"): - key_match = re.search(r"keys='([^']+)'", action) - if key_match: - keys = key_match.group(1) - key_list = keys.split("+") if "+" in keys else [keys] - if not handled_json: - items.append(_make_keypress_item(key_list, call_id=call_id)) - elif action.startswith("type"): - content_match = re.search(r"content='([^']*)'", action) - if content_match: - text = content_match.group(1) - if not handled_json: - items.append(_make_type_item(text, call_id=call_id)) - elif action.startswith("scroll"): - coord_match = re.search(r"start_box='?\[(\d+),\s*(\d+)\]'?", action) - direction_match = re.search(r"direction='([^']+)'", action) - if coord_match and direction_match: - x, y = int(coord_match.group(1)), int(coord_match.group(2)) - direction = direction_match.group(1) - actual_x = int((x / 999.0) * image_width) - actual_y = int((y / 999.0) * image_height) - scroll_x = 0 - scroll_y = 0 - if direction == "up": - scroll_y = -5 - elif direction == "down": - scroll_y = 5 - elif direction == "left": - scroll_x = -5 - elif direction == "right": - scroll_x = 5 - if not handled_json: - items.append(_make_scroll_item(actual_x, actual_y, scroll_x, scroll_y, call_id=call_id)) - elif action == "WAIT()": - if not handled_json: - items.append(_make_wait_item(call_id=call_id)) return items def parse_glm_response(response: str) -> dict[str, str]: - pattern = r"<\|begin_of_box\|>(.*?)<\|end_of_box\|>" - match = re.search(pattern, response) - if match: - action = match.group(1).strip() + json_match = re.search(r'(\{.*\})', response) + if json_match: + action = json_match.group(1).strip() else: - action_pattern = r"[\w_]+\([^)]*\)" - matches = re.findall(action_pattern, response) - action = matches[0] if matches else "" + box_match = re.search(r"<\|begin_of_box\|>(.*?)<\|end_of_box\|>", response) + if box_match: + action = box_match.group(1).strip() + else: + action_pattern = r"[\w_]+\([^)]*\)" + matches = re.findall(action_pattern, response) + action = matches[0] if matches else "" memory_pattern = r"Memory:(.*?)$" memory_match = re.search(memory_pattern, response, re.DOTALL) From c33eb1068c8efa2b74bb3b98745ca6a510ea298a Mon Sep 17 00:00:00 2001 From: shfunc Date: Wed, 15 Oct 2025 16:09:05 +0200 Subject: [PATCH 07/14] UITars adapter, output debug --- hud/agents/openrouter.py | 1 + hud/agents/openrouter/models/uitars/uitars.py | 619 ++++++++++++++++++ 2 files changed, 620 insertions(+) create mode 100644 hud/agents/openrouter/models/uitars/uitars.py diff --git a/hud/agents/openrouter.py b/hud/agents/openrouter.py index 9360ae81..f11a0d8c 100644 --- a/hud/agents/openrouter.py +++ b/hud/agents/openrouter.py @@ -335,6 +335,7 @@ def get_last_image_from_messages(messages: list[dict[str, Any]]) -> str | None: # Adapter dispatch _ADAPTER_REGISTRY: Dict[str, str] = { "z-ai/glm-4.5v": "hud.agents.openrouter.models.glm45v.glm45v:Glm45vAgent", + "huggingface/bytedance-seed/ui-tars-1.5-7b": "hud.agents.openrouter.models.uitars.uitars:UITarsAgent", } def _load_adapter(path: str) -> Type[MCPAgent]: diff --git a/hud/agents/openrouter/models/uitars/uitars.py b/hud/agents/openrouter/models/uitars/uitars.py new file mode 100644 index 00000000..ba529f76 --- /dev/null +++ b/hud/agents/openrouter/models/uitars/uitars.py @@ -0,0 +1,619 @@ +"""UITARS adapter rebuilt using official parser utilities""" + +from __future__ import annotations + +import ast +import base64 +import 
 logging
+import math
+import os
+import re
+from io import BytesIO
+from typing import Any, ClassVar
+
+from PIL import Image
+
+import litellm
+import mcp.types as types
+
+from hud import instrument
+from hud.agents.base import MCPAgent
+from hud.agents.openrouter import (
+    _convert_json_action_to_items,
+    _decode_image_dimensions,
+    _extract_user_instruction,
+    _make_failed_tool_call_items,
+    _make_screenshot_item,
+    _random_id,
+    get_last_image_from_messages,
+)
+from hud.tools.computer.settings import computer_settings
+from hud.types import AgentResponse, MCPToolCall, MCPToolResult
+
+logger = logging.getLogger(__name__)
+
+# Constants from the official UITARS parser
+IMAGE_FACTOR = 28
+MIN_PIXELS = 100 * 28 * 28
+MAX_PIXELS = 16384 * 28 * 28
+MAX_RATIO = 200
+def _resolve_provider_model_name(model_name: str) -> str:
+    key = (model_name or "").strip()
+    if key.startswith("openrouter/"):
+        key = key[len("openrouter/") :]
+    lowered = key.lower()
+    if lowered in {"huggingface/bytedance-seed/ui-tars-1.5-7b", "bytedance-seed/ui-tars-1.5-7b"}:
+        return "ByteDance-Seed/UI-TARS-1.5-7B"
+    return key
+
+COMPUTER_USE_DOUBAO = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
+
+## Output Format
+```
+Thought: ...
+Action: ...
+```
+
+## Action Space
+
+click(point='<point>x1 y1</point>')
+left_double(point='<point>x1 y1</point>')
+right_single(point='<point>x1 y1</point>')
+drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')
+hotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.
+type(content='xxx') # Use escape characters \\', \\\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
+scroll(point='<point>x1 y1</point>', direction='down or up or right or left') # Show more information on the `direction` side.
+wait() #Sleep for 5s and take a screenshot to check for any changes.
+finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
+
+
+## Note
+- Use {language} in `Thought` part.
+- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
+
+## User Instruction
+{instruction}
+"""
+
+def convert_point_to_coordinates(text: str, is_answer: bool = False) -> str:
+    pattern = r"<point>(\d+)\s+(\d+)</point>"
+
+    def replace_match(match: re.Match[str]) -> str:
+        x1, y1 = map(int, match.groups())
+        x = (x1 + x1) // 2
+        y = (y1 + y1) // 2
+        return f"({x},{y})"
+
+    text = re.sub(r"\[EOS\]", "", text)
+    return re.sub(pattern, replace_match, text).strip()
+
+
+def parse_action(action_str: str) -> dict[str, Any] | None:
+    try:
+        node = ast.parse(action_str, mode="eval")
+        if not isinstance(node, ast.Expression):
+            raise ValueError("Not an expression")
+        call = node.body
+        if not isinstance(call, ast.Call):
+            raise ValueError("Not a function call")
+
+        if isinstance(call.func, ast.Name):
+            func_name = call.func.id
+        elif isinstance(call.func, ast.Attribute):
+            func_name = call.func.attr
+        else:
+            func_name = None
+
+        kwargs: dict[str, Any] = {}
+        for kw in call.keywords:
+            key = kw.arg
+            if key is None:
+                # Skip unpacked kwargs like **extra
+                continue
+            if isinstance(kw.value, ast.Constant):
+                value = kw.value.value
+            elif isinstance(kw.value, ast.Str): # compatibility
+                value = kw.value.s
+            else:
+                value = None
+            kwargs[key] = value
+
+        return {"function": func_name, "args": kwargs}
+    except Exception as exc:
+        logger.debug("Failed to parse action '%s': %s", action_str, exc)
+        return None
+
+def escape_single_quotes(text: str) -> str:
+    pattern = r"(?<!\\)'"
+    return re.sub(pattern, r"\\'", text)
+
+
+def round_by_factor(number: int, factor: int) -> int:
+    return round(number / factor) * factor
+
+
+def ceil_by_factor(number: int, factor: int) -> int:
+    return math.ceil(number / factor) * factor
+
+
+def floor_by_factor(number: int, factor: int) -> int:
+    return math.floor(number / factor) * factor
+
+
+def smart_resize(height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS) -> tuple[int, int]:
+    if max(height, width) / min(height, width) > MAX_RATIO:
+        raise ValueError(
+            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)}"
+        )
+    h_bar = max(factor, round_by_factor(height, factor))
+    w_bar = max(factor, round_by_factor(width, factor))
+    if h_bar * w_bar > max_pixels:
+        beta = math.sqrt((height * width) / max_pixels)
+        h_bar = floor_by_factor(int(height / beta), factor)
+        w_bar = floor_by_factor(int(width / beta), factor)
+    elif h_bar * w_bar < min_pixels:
+        beta = math.sqrt(min_pixels / (height * width))
+        h_bar = ceil_by_factor(int(height * beta), factor)
+        w_bar = ceil_by_factor(int(width * beta), factor)
+    return h_bar, w_bar
+
+
+def _preprocess_text_for_parsing(text: str) -> str:
+    if "<point>" in text:
+        text = convert_point_to_coordinates(text)
+    if "start_point=" in text:
+        text = text.replace("start_point=", "start_box=")
+    if "end_point=" in text:
+        text = text.replace("end_point=", "end_box=")
+    if "point=" in text:
+        text = text.replace("point=", "start_box=")
+    return text
+
+
+def parse_action_to_structure_output(
+    text: str,
+    factor: int,
+    origin_resized_height: int,
+    origin_resized_width: int,
+    model_type: str = "qwen25vl",
+    max_pixels: int = MAX_PIXELS,
+    min_pixels: int = MIN_PIXELS,
+) -> list[dict[str, Any]]:
+    text = _preprocess_text_for_parsing(text.strip())
+
+    # Thought/Action extraction
+    if text.startswith("Thought:"):
+        thought_pattern = r"Thought: (.+?)(?=\s*Action: |$)"
+    elif text.startswith("Reflection:"):
+        thought_pattern = r"Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action: |$)"
+    elif text.startswith("Action_Summary:"):
+        thought_pattern = r"Action_Summary: (.+?)(?=\s*Action: |$)"
+    else:
+        thought_pattern
= r"Thought: (.+?)(?=\s*Action: |$)" + + reflection, thought = None, None + thought_match = re.search(thought_pattern, text, re.DOTALL) + if thought_match: + if len(thought_match.groups()) == 1: + thought = thought_match.group(1).strip() + elif len(thought_match.groups()) == 2: + thought = thought_match.group(2).strip() + reflection = thought_match.group(1).strip() + + if "Action:" not in text: + return [] + action_str_full = text.split("Action: ")[-1] + + # Split multiple actions if present (rare; we expect exactly one) + raw_actions: list[str] = [] + for seg in action_str_full.split(")\n\n"): + act = seg.strip() + if not act: + continue + if not act.endswith(")"): + act += ")" + # Handle type(content='...') with quotes inside + if "type(content" in act: + def _unbox(m: re.Match[str]) -> str: + return m.group(1) + pat = r"type\(content='(.*?)'\)" + if re.search(pat, act): + inner = re.sub(pat, _unbox, act) + inner = escape_single_quotes(inner) + act = "type(content='" + inner + "')" + raw_actions.append(act) + + parsed_actions = [parse_action(a.replace("\n", "\\n").lstrip()) for a in raw_actions] + + actions: list[dict[str, Any]] = [] + for action_instance, raw_str in zip(parsed_actions, raw_actions): + if not action_instance: + raise ValueError(f"Action can't parse: {raw_str}") + action_type = action_instance["function"] + params = action_instance["args"] + + action_inputs: dict[str, Any] = {} + for param_name, param in params.items(): + if param == "": + continue + if isinstance(param, str): + param = param.lstrip() + action_inputs[param_name.strip()] = param + + if "start_box" in param_name or "end_box" in param_name: + ori_box = str(param) + numbers = ori_box.replace("(", "").replace(")", "").split(",") + + # qwen25vl branch -> absolute pixel coords relative to processed dims -> normalize to 0..1f + if model_type == "qwen25vl": + float_numbers: list[float] = [] + for idx, num in enumerate(numbers): + val = float(num) + if (idx + 1) % 2 == 0: + float_numbers.append(val / float(origin_resized_height or 1)) + else: + float_numbers.append(val / float(origin_resized_width or 1)) + else: + # Otherwise assume factor-based normalization (e.g., 1000) + float_numbers = [float(num) / float(factor or 1) for num in numbers] + + if len(float_numbers) == 2: + float_numbers = [float_numbers[0], float_numbers[1], float_numbers[0], float_numbers[1]] + action_inputs[param_name.strip()] = str(float_numbers) + + actions.append( + { + "reflection": reflection, + "thought": thought, + "action_type": action_type, + "action_inputs": action_inputs, + "text": text, + } + ) + return actions + +def _pil_to_data_uri(img: Image.Image) -> str: + buf = BytesIO() + img.save(buf, format="PNG") + return "data:image/png;base64," + base64.b64encode(buf.getvalue()).decode("utf-8") + +def _resize_for_model(img: Image.Image) -> tuple[Image.Image, int, int]: + w, h = img.size + new_h, new_w = smart_resize(h, w) + if (new_w, new_h) != (w, h): + img = img.resize((new_w, new_h)) + if img.mode != "RGB": + img = img.convert("RGB") + return img, new_w, new_h + +def _format_action_to_doubao_string(action: dict[str, Any], width: int, height: int) -> str | None: + a_type = (action.get("type") or "").lower() + + if a_type in {"click", "left_click"}: + x = action.get("x", 0) + y = action.get("y", 0) + return f"click(start_box='({x},{y})')" + elif a_type == "double_click": + x = action.get("x", 0) + y = action.get("y", 0) + return f"left_double(start_box='({x},{y})')" + elif a_type == "drag": + path = action.get("path", []) + if 
len(path) >= 2: + sx, sy = path[0].get("x", 0), path[0].get("y", 0) + ex, ey = path[-1].get("x", 0), path[-1].get("y", 0) + return f"drag(start_point='({sx},{sy})', end_point='({ex},{ey})')" + elif a_type == "keypress": + keys = " ".join(action.get("keys", [])) + return f"hotkey(key='{keys}')" + elif a_type == "type": + content = action.get("text", "") + return f"type(content='{content}')" + elif a_type == "scroll": + x = action.get("x", 0) + y = action.get("y", 0) + direction = action.get("scroll_y", 0) + dir_str = "down" if direction > 0 else "up" + return f"scroll(point='({x},{y})', direction='{dir_str}')" + return None + +def _parse_to_json_action(actions: list[dict[str, Any]]) -> dict[str, Any] | None: + if not actions: + return None + act = actions[0] + a_type = (act.get("action_type") or "").lower() + inputs = act.get("action_inputs") or {} + + def _coerce_box(value: Any) -> list[float] | None: + try: + if isinstance(value, str): + nums = re.findall(r"-?\d+(?:\.\d+)?", value) + if len(nums) >= 2: + return [float(nums[0]), float(nums[1])] + elif isinstance(value, (list, tuple)) and len(value) >= 2: + return [float(value[0]), float(value[1])] + except Exception: + return None + return None + + if a_type in {"click", "left_single"}: + sb = _coerce_box(inputs.get("start_box")) + if sb: + return {"type": "click", "button": "left", "start_box": sb} + if a_type in {"left_double", "double_click"}: + sb = _coerce_box(inputs.get("start_box")) + if sb: + return {"type": "double_click", "start_box": sb} + if a_type in {"right_single", "right_click"}: + sb = _coerce_box(inputs.get("start_box")) + if sb: + return {"type": "click", "button": "right", "start_box": sb} + if a_type in {"drag", "select", "left_drag"}: + s = _coerce_box(inputs.get("start_box")) + e = _coerce_box(inputs.get("end_box")) + if s and e: + return {"type": "drag", "start_box": s, "end_box": e} + if a_type in {"hotkey", "key", "keydown", "keypress"}: + key_str = inputs.get("key") or inputs.get("hotkey") or inputs.get("keys") or "" + key_str = str(key_str) + # Normalize arrow aliases and spacing + key_str = key_str.replace("arrowleft", "left").replace("arrowright", "right").replace("arrowup", "up").replace("arrowdown", "down") + keys = [seg for seg in re.split(r"[+\s]+", key_str.strip()) if seg] + if keys: + return {"type": "keypress", "keys": keys} + if a_type == "type": + content = inputs.get("content", "") + return {"type": "type", "content": str(content)} + if a_type == "scroll": + sb = _coerce_box(inputs.get("start_box")) + direction = str(inputs.get("direction") or "down").lower() + if sb: + return {"type": "scroll", "start_box": sb, "direction": direction} + if a_type == "wait": + return {"type": "wait"} + if a_type == "finished": + return {"type": "finished", "content": str(inputs.get("content") or "")} + return None + +class UITarsAgent(MCPAgent): + """UITARS computer-use agent (Doubao-style prompts + official parser).""" + + metadata: ClassVar[dict[str, Any]] = { + "display_width": computer_settings.OPENAI_COMPUTER_WIDTH, + "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT, + } + required_tools: ClassVar[list[str]] = ["openai_computer"] + + def __init__(self, *, model_name: str = "ByteDance-Seed/UI-TARS-1.5-7B", completion_kwargs: dict[str, Any] | None = None, **agent_kwargs: Any) -> None: + super().__init__(**agent_kwargs) + self.model_name = model_name + self._base_completion_kwargs = dict(completion_kwargs or {}) + # Allow configuring a Hugging Face endpoint via environment + env_base = 
os.getenv("HF_ENDPOINT_BASE_URL") + env_token = os.getenv("HF_ENDPOINT_TOKEN") or os.getenv("HF_API_KEY") + env_provider = os.getenv("HF_ENDPOINT_PROVIDER") + if env_base and "api_base" not in self._base_completion_kwargs: + self._base_completion_kwargs["api_base"] = env_base + if env_provider: + self._base_completion_kwargs.setdefault("custom_llm_provider", str(env_provider)) + if env_token and "api_key" not in self._base_completion_kwargs: + self._base_completion_kwargs["api_key"] = env_token + + # If HF endpoint is configured and provider not set, default to huggingface + if os.getenv("HF_ENDPOINT_BASE_URL") and "custom_llm_provider" not in self._base_completion_kwargs: + self._base_completion_kwargs["custom_llm_provider"] = "huggingface" + + self._provider_model = _resolve_provider_model_name(self.model_name) + + async def get_system_messages(self) -> list[Any]: + return [] + + @instrument(span_type="agent", record_args=False) + async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]: + content_items: list[dict[str, Any]] = [] + text_parts: list[str] = [] + for block in blocks: + if isinstance(block, types.TextContent): + if block.text: + text_parts.append(block.text) + elif isinstance(block, types.ImageContent): + content_items.append( + { + "type": "message", + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:{getattr(block, 'mimeType', 'image/png')};base64,{block.data}", + }, + } + ], + } + ) + + if text_parts: + content_items.insert( + 0, + { + "type": "message", + "role": "user", + "content": [{"type": "input_text", "text": "\n".join(text_parts)}], + }, + ) + + return content_items + + def _tool_call(self, item: dict[str, Any]) -> MCPToolCall: + call_id = item.get("call_id") or _random_id() + action = item.get("action") or {} + return MCPToolCall(id=call_id, name="openai_computer", arguments=action) + + @instrument(span_type="agent", record_args=False) + async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: + instruction = _extract_user_instruction(messages) + screenshot_b64 = get_last_image_from_messages(messages) + if not screenshot_b64: + call_id = _random_id() + messages.append(_make_screenshot_item(call_id)) + return AgentResponse( + content="capturing initial screenshot", + tool_calls=[MCPToolCall(id=call_id, name="openai_computer", arguments={"type": "screenshot"})], + done=False, + ) + + # Decode original image dims and make a processed copy for the model + try: + data = base64.b64decode(screenshot_b64.split(",", 1)[1] if screenshot_b64.startswith("data:image") else screenshot_b64) + img = Image.open(BytesIO(data)) + orig_w, orig_h = img.size + except Exception: + orig_w, orig_h = _decode_image_dimensions(screenshot_b64) + img = None + + proc_w, proc_h = orig_w, orig_h + proc_uri = f"data:image/png;base64,{screenshot_b64}" + if img is not None: + img, proc_w, proc_h = _resize_for_model(img) + proc_uri = _pil_to_data_uri(img) + + # Build messages with history: system prompt + previous turns + current screenshot + system_prompt = COMPUTER_USE_DOUBAO.format(language="English", instruction=instruction or "") + + litellm_messages: list[dict[str, Any]] = [] + + # Add history of previous actions and screenshots + for msg in messages[:-1]: # Skip the current screenshot + if not isinstance(msg, dict): + continue + msg_type = msg.get("type") + + if msg_type == "computer_call_output": + output = msg.get("output") or {} + if isinstance(output, dict) and output.get("type") == 
"input_image": + image_url = output.get("image_url") + if image_url: + litellm_messages.append({ + "role": "user", + "content": [{"type": "image_url", "image_url": {"url": image_url}}] + }) + + elif msg_type == "computer_call": + action = msg.get("action") or {} + action_str = _format_action_to_doubao_string(action, proc_w, proc_h) + if action_str: + litellm_messages.append({ + "role": "assistant", + "content": f"Thought: Executing action.\nAction: {action_str}" + }) + + litellm_messages.append({ + "role": "user", + "content": [ + {"type": "text", "text": system_prompt}, + {"type": "image_url", "image_url": {"url": proc_uri}}, + ], + }) + + api_kwargs: dict[str, Any] = { + "model": self._provider_model, + "messages": litellm_messages, + "temperature": 0.1, + "max_tokens": 256, + } + api_kwargs.update(self._base_completion_kwargs) + + try: + response = await litellm.acompletion(**api_kwargs) + except Exception as exc: # pragma: no cover - network errors + logger.exception("uitars completion failed: %s", exc) + return AgentResponse(content=f"UITARS request failed: {exc}", tool_calls=[], done=True, isError=True) + + content = (getattr(response.choices[0], "message", None) or {}).get("content", "") + + if content: + print(f"\n{'='*60}\nUITars output:\n{content}\n{'='*60}\n") + + actions = parse_action_to_structure_output( + content or "", + factor=1000, + origin_resized_height=proc_h, + origin_resized_width=proc_w, + model_type="qwen25vl", + ) + json_action = _parse_to_json_action(actions) + + items: list[dict[str, Any]] = [] + if json_action: + call_id = _random_id() + # Feed original dimensions so normalized coords map to real pixels + items.extend( + _convert_json_action_to_items( + json_action, + call_id=call_id, + image_width=orig_w, + image_height=orig_h, + ) + ) + + # Auto-wait after clicking taskbar/launcher icons (left edge, x < 50) + if items and json_action.get("type") in {"click", "left_click"}: + x_coord = json_action.get("x", 0) + if x_coord < 50: # Likely a launcher icon + logger.info("Detected launcher click at x=%d, adding auto-wait", x_coord) + items.append({ + "type": "computer_call", + "action": {"type": "screenshot"}, + "computer_call_id": _random_id(), + }) + + if not items: + items.append(_make_screenshot_item(call_id)) + else: + call_id = _random_id() + items.append(_make_screenshot_item(call_id)) + + tool_calls = [self._tool_call(i) for i in items if i.get("type") == "computer_call"] + return AgentResponse(content=None, tool_calls=tool_calls, done=not tool_calls, raw=response) + + @instrument(span_type="agent", record_args=False) + async def format_tool_results(self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]) -> list[dict[str, Any]]: + rendered: list[dict[str, Any]] = [] + for call, result in zip(tool_calls, tool_results, strict=False): + call_args = call.arguments or {} + if result.isError: + error_text = "".join(c.text for c in result.content if isinstance(c, types.TextContent)) + rendered.extend( + _make_failed_tool_call_items( + tool_name=call_args.get("type", call.name), + tool_kwargs=call_args, + error_message=error_text or "Unknown error", + call_id=call.id, + ) + ) + continue + + screenshot_found = False + for content in result.content: + if isinstance(content, types.ImageContent): + rendered.append( + { + "type": "computer_call_output", + "call_id": call.id, + "output": {"type": "input_image", "image_url": f"data:{content.mimeType};base64,{content.data}"}, + } + ) + screenshot_found = True + break + + text_parts = [c.text for c in 
result.content if isinstance(c, types.TextContent) and c.text] + if text_parts: + rendered.append({"type": "message", "role": "user", "content": [{"type": "input_text", "text": "\n".join(text_parts)}]}) + + if not screenshot_found and not text_parts: + rendered.append({"type": "computer_call_output", "call_id": call.id, "output": {"type": "input_text", "text": "Tool executed"}}) + + return rendered + + +__all__ = ["UITarsAgent"] From aac9285974fb18f344481c93daff8226cdbcf5d1 Mon Sep 17 00:00:00 2001 From: shinbehavior Date: Wed, 15 Oct 2025 20:26:24 +0200 Subject: [PATCH 08/14] commented debugs lines --- hud/agents/openrouter/models/glm45v/glm45v.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/hud/agents/openrouter/models/glm45v/glm45v.py b/hud/agents/openrouter/models/glm45v/glm45v.py index b7e03c04..e007a4da 100644 --- a/hud/agents/openrouter/models/glm45v/glm45v.py +++ b/hud/agents/openrouter/models/glm45v/glm45v.py @@ -430,10 +430,7 @@ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: } if parsed.get("memory"): self._memory = parsed["memory"] - logger.debug("glm45v model content: %s", response_content) - trimmed = response_content[:400] if response_content else "" - self.console.debug(f"glm45v model content: {trimmed}") - self.console.debug(f"glm45v parsed response: {parsed}") + # logger.debug("glm45v model content: %s", response_content) image_width, image_height = _decode_image_dimensions(screenshot_b64) response_items = convert_glm_completion_to_responses_items( @@ -470,10 +467,11 @@ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: reasoning_text = "\n".join(reasoning_parts).strip() if not tool_calls: - self.console.info_log( - f"glm45v returned no tool calls. content='{content_text}' reasoning='{reasoning_text}'" - ) - self.console.info_log(f"glm45v parsed response: {parsed}") + pass + # self.console.info_log( + # f"glm45v returned no tool calls. 
content='{content_text}' reasoning='{reasoning_text}'" + # ) + # self.console.info_log(f"glm45v parsed response: {parsed}") return AgentResponse( content=content_text or None, From 7ddba455ee64987e9108fe1d54ba5a7b2dbded57 Mon Sep 17 00:00:00 2001 From: shinbehavior Date: Wed, 15 Oct 2025 23:36:47 +0200 Subject: [PATCH 09/14] latest, duplicate fix, base class --- hud/agents/openrouter.py | 105 ++++++++++++++++ hud/agents/openrouter/models/glm45v/glm45v.py | 114 +----------------- hud/agents/openrouter/models/uitars/uitars.py | 84 +------------ 3 files changed, 110 insertions(+), 193 deletions(-) diff --git a/hud/agents/openrouter.py b/hud/agents/openrouter.py index f11a0d8c..42ddfbfc 100644 --- a/hud/agents/openrouter.py +++ b/hud/agents/openrouter.py @@ -12,10 +12,12 @@ from io import BytesIO from typing import Any, Dict, Type +import mcp.types as types from PIL import Image from hud.agents.base import MCPAgent from hud.tools.computer.settings import computer_settings +from hud.types import MCPToolCall, MCPToolResult # Shared helper utilities for computer-use adapters def _random_id() -> str: @@ -332,6 +334,108 @@ def get_last_image_from_messages(messages: list[dict[str, Any]]) -> str | None: return url.split(",", 1)[1] return None +class OpenRouterBaseAgent(MCPAgent): + """Base class for OpenRouter vision-language agents with shared formatting logic.""" + + async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]: + """Format MCP content blocks into message items.""" + content_items: list[dict[str, Any]] = [] + text_parts: list[str] = [] + for block in blocks: + if isinstance(block, types.TextContent): + if block.text: + text_parts.append(block.text) + elif isinstance(block, types.ImageContent): + content_items.append( + { + "type": "message", + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": f"data:{getattr(block, 'mimeType', 'image/png')};base64,{block.data}", + }, + } + ], + } + ) + + if text_parts: + content_items.insert( + 0, + { + "type": "message", + "role": "user", + "content": [{"type": "input_text", "text": "\n".join(text_parts)}], + }, + ) + + return content_items + + async def format_tool_results( + self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult] + ) -> list[dict[str, Any]]: + """Format tool execution results into message items.""" + import mcp.types as types # noqa: PLC0415 + + rendered: list[dict[str, Any]] = [] + for call, result in zip(tool_calls, tool_results, strict=False): + call_args = call.arguments or {} + if result.isError: + error_text = "".join( + c.text for c in result.content if isinstance(c, types.TextContent) + ) + rendered.extend( + _make_failed_tool_call_items( + tool_name=call_args.get("type", call.name), + tool_kwargs=call_args, + error_message=error_text or "Unknown error", + call_id=call.id, + ) + ) + continue + + screenshot_found = False + for content in result.content: + if isinstance(content, types.ImageContent): + rendered.append( + { + "type": "computer_call_output", + "call_id": call.id, + "output": { + "type": "input_image", + "image_url": f"data:{content.mimeType};base64,{content.data}", + }, + } + ) + screenshot_found = True + break + + text_parts = [ + c.text for c in result.content if isinstance(c, types.TextContent) and c.text + ] + if text_parts: + rendered.append( + { + "type": "message", + "role": "user", + "content": [{"type": "input_text", "text": "\n".join(text_parts)}], + } + ) + + if not screenshot_found and not text_parts: + rendered.append( + { 
+ "type": "computer_call_output", + "call_id": call.id, + "output": {"type": "input_text", "text": "Tool executed"}, + } + ) + + return rendered + + # Adapter dispatch _ADAPTER_REGISTRY: Dict[str, str] = { "z-ai/glm-4.5v": "hud.agents.openrouter.models.glm45v.glm45v:Glm45vAgent", @@ -396,6 +500,7 @@ def __dir__(self) -> list[str]: __all__ = [ "OpenRouterAgent", + "OpenRouterBaseAgent", "_random_id", "_make_reasoning_item", "_make_output_text_item", diff --git a/hud/agents/openrouter/models/glm45v/glm45v.py b/hud/agents/openrouter/models/glm45v/glm45v.py index e007a4da..526e91af 100644 --- a/hud/agents/openrouter/models/glm45v/glm45v.py +++ b/hud/agents/openrouter/models/glm45v/glm45v.py @@ -12,12 +12,12 @@ import mcp.types as types from litellm.types.utils import ModelResponse -from hud.agents.base import MCPAgent from hud.settings import settings from hud.tools.computer.settings import computer_settings from hud.types import AgentResponse, MCPToolCall, MCPToolResult from hud import instrument from hud.agents.openrouter import ( + OpenRouterBaseAgent, _convert_json_action_to_items, _decode_image_dimensions, _extract_user_instruction, @@ -277,7 +277,7 @@ def parse_glm_response(response: str) -> dict[str, str]: "memory": memory, } -class Glm45vAgent(MCPAgent): +class Glm45vAgent(OpenRouterBaseAgent): """LiteLLM-backed GLM-4.5V agent that speaks MCP.""" metadata: ClassVar[dict[str, Any]] = { @@ -309,41 +309,6 @@ def __init__( async def get_system_messages(self) -> list[Any]: return [] - @instrument(span_type="agent", record_args=False) - async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]: - content_items: list[dict[str, Any]] = [] - text_parts: list[str] = [] - for block in blocks: - if isinstance(block, types.TextContent): - text_parts.append(block.text) - elif isinstance(block, types.ImageContent): - content_items.append( - { - "type": "message", - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - "url": f"data:{getattr(block, 'mimeType', 'image/png')};base64,{block.data}", - }, - } - ], - } - ) - - if text_parts: - content_items.insert( - 0, - { - "type": "message", - "role": "user", - "content": [{"type": "input_text", "text": "\n".join(text_parts)}], - }, - ) - - return content_items - def _glm_tool_call_to_mcp(self, item: dict[str, Any]) -> MCPToolCall: call_id = item.get("call_id") or _random_id() action = item.get("action") or {} @@ -430,7 +395,6 @@ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: } if parsed.get("memory"): self._memory = parsed["memory"] - # logger.debug("glm45v model content: %s", response_content) image_width, image_height = _decode_image_dimensions(screenshot_b64) response_items = convert_glm_completion_to_responses_items( @@ -466,13 +430,6 @@ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: content_text = "\n".join(text_parts).strip() reasoning_text = "\n".join(reasoning_parts).strip() - if not tool_calls: - pass - # self.console.info_log( - # f"glm45v returned no tool calls. 
content='{content_text}' reasoning='{reasoning_text}'" - # ) - # self.console.info_log(f"glm45v parsed response: {parsed}") - return AgentResponse( content=content_text or None, reasoning=reasoning_text or None, @@ -481,72 +438,5 @@ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: raw=response, ) - @instrument(span_type="agent", record_args=False) - async def format_tool_results( - self, - tool_calls: list[MCPToolCall], - tool_results: list[MCPToolResult], - ) -> list[dict[str, Any]]: - rendered: list[dict[str, Any]] = [] - - for call, result in zip(tool_calls, tool_results, strict=False): - call_args = call.arguments or {} - if result.isError: - error_text = "".join( - content.text - for content in result.content - if isinstance(content, types.TextContent) - ) - rendered.extend( - _make_failed_tool_call_items( - tool_name=call_args.get("type", call.name), - tool_kwargs=call_args, - error_message=error_text or "Unknown error", - call_id=call.id, - ) - ) - continue - - screenshot_found = False - for content in result.content: - if isinstance(content, types.ImageContent): - rendered.append( - { - "type": "computer_call_output", - "call_id": call.id, - "output": { - "type": "input_image", - "image_url": f"data:{content.mimeType};base64,{content.data}", - }, - } - ) - screenshot_found = True - break - - text_parts = [ - content.text - for content in result.content - if isinstance(content, types.TextContent) and content.text - ] - if text_parts: - rendered.append( - { - "type": "message", - "role": "user", - "content": [{"type": "input_text", "text": "\n".join(text_parts)}], - } - ) - - if not screenshot_found and not text_parts: - rendered.append( - { - "type": "computer_call_output", - "call_id": call.id, - "output": {"type": "input_text", "text": "Tool executed"}, - } - ) - - return rendered - __all__ = ["Glm45vAgent"] diff --git a/hud/agents/openrouter/models/uitars/uitars.py b/hud/agents/openrouter/models/uitars/uitars.py index ba529f76..2d551a7d 100644 --- a/hud/agents/openrouter/models/uitars/uitars.py +++ b/hud/agents/openrouter/models/uitars/uitars.py @@ -17,8 +17,8 @@ import mcp.types as types from hud import instrument -from hud.agents.base import MCPAgent from hud.agents.openrouter import ( + OpenRouterBaseAgent, _convert_json_action_to_items, _decode_image_dimensions, _extract_user_instruction, @@ -32,7 +32,6 @@ logger = logging.getLogger(__name__) -# Constants from the official UITARS parser IMAGE_FACTOR = 28 MIN_PIXELS = 100 * 28 * 28 MAX_PIXELS = 16384 * 28 * 28 @@ -375,7 +374,7 @@ def _coerce_box(value: Any) -> list[float] | None: return {"type": "finished", "content": str(inputs.get("content") or "")} return None -class UITarsAgent(MCPAgent): +class UITarsAgent(OpenRouterBaseAgent): """UITARS computer-use agent (Doubao-style prompts + official parser).""" metadata: ClassVar[dict[str, Any]] = { @@ -408,42 +407,6 @@ def __init__(self, *, model_name: str = "ByteDance-Seed/UI-TARS-1.5-7B", complet async def get_system_messages(self) -> list[Any]: return [] - @instrument(span_type="agent", record_args=False) - async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]: - content_items: list[dict[str, Any]] = [] - text_parts: list[str] = [] - for block in blocks: - if isinstance(block, types.TextContent): - if block.text: - text_parts.append(block.text) - elif isinstance(block, types.ImageContent): - content_items.append( - { - "type": "message", - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": { - 
"url": f"data:{getattr(block, 'mimeType', 'image/png')};base64,{block.data}", - }, - } - ], - } - ) - - if text_parts: - content_items.insert( - 0, - { - "type": "message", - "role": "user", - "content": [{"type": "input_text", "text": "\n".join(text_parts)}], - }, - ) - - return content_items - def _tool_call(self, item: dict[str, Any]) -> MCPToolCall: call_id = item.get("call_id") or _random_id() action = item.get("action") or {} @@ -530,9 +493,7 @@ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: return AgentResponse(content=f"UITARS request failed: {exc}", tool_calls=[], done=True, isError=True) content = (getattr(response.choices[0], "message", None) or {}).get("content", "") - - if content: - print(f"\n{'='*60}\nUITars output:\n{content}\n{'='*60}\n") + logger.debug("UITARS model output: %s", content) actions = parse_action_to_structure_output( content or "", @@ -576,44 +537,5 @@ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: tool_calls = [self._tool_call(i) for i in items if i.get("type") == "computer_call"] return AgentResponse(content=None, tool_calls=tool_calls, done=not tool_calls, raw=response) - @instrument(span_type="agent", record_args=False) - async def format_tool_results(self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]) -> list[dict[str, Any]]: - rendered: list[dict[str, Any]] = [] - for call, result in zip(tool_calls, tool_results, strict=False): - call_args = call.arguments or {} - if result.isError: - error_text = "".join(c.text for c in result.content if isinstance(c, types.TextContent)) - rendered.extend( - _make_failed_tool_call_items( - tool_name=call_args.get("type", call.name), - tool_kwargs=call_args, - error_message=error_text or "Unknown error", - call_id=call.id, - ) - ) - continue - - screenshot_found = False - for content in result.content: - if isinstance(content, types.ImageContent): - rendered.append( - { - "type": "computer_call_output", - "call_id": call.id, - "output": {"type": "input_image", "image_url": f"data:{content.mimeType};base64,{content.data}"}, - } - ) - screenshot_found = True - break - - text_parts = [c.text for c in result.content if isinstance(c, types.TextContent) and c.text] - if text_parts: - rendered.append({"type": "message", "role": "user", "content": [{"type": "input_text", "text": "\n".join(text_parts)}]}) - - if not screenshot_found and not text_parts: - rendered.append({"type": "computer_call_output", "call_id": call.id, "output": {"type": "input_text", "text": "Tool executed"}}) - - return rendered - __all__ = ["UITarsAgent"] From b241f696be3d807a68e20f8ed2978fb314db28e8 Mon Sep 17 00:00:00 2001 From: shfunc Date: Thu, 16 Oct 2025 09:32:31 +0200 Subject: [PATCH 10/14] clean up, shared get_response --- hud/agents/openrouter.py | 61 +++++++++++++++++- hud/agents/openrouter/models/glm45v/glm45v.py | 56 ++++------------- hud/agents/openrouter/models/uitars/uitars.py | 62 +++++-------------- 3 files changed, 88 insertions(+), 91 deletions(-) diff --git a/hud/agents/openrouter.py b/hud/agents/openrouter.py index 42ddfbfc..cc08e1ab 100644 --- a/hud/agents/openrouter.py +++ b/hud/agents/openrouter.py @@ -11,13 +11,22 @@ from pathlib import Path from io import BytesIO from typing import Any, Dict, Type +from abc import abstractmethod import mcp.types as types from PIL import Image +import litellm + from hud.agents.base import MCPAgent from hud.tools.computer.settings import computer_settings -from hud.types import MCPToolCall, MCPToolResult +from 
hud.types import MCPToolCall, MCPToolResult, AgentResponse +from hud import instrument +import logging +logger = logging.getLogger(__name__) + +from hud.settings import settings +import os # Shared helper utilities for computer-use adapters def _random_id() -> str: @@ -337,6 +346,10 @@ def get_last_image_from_messages(messages: list[dict[str, Any]]) -> str | None: class OpenRouterBaseAgent(MCPAgent): """Base class for OpenRouter vision-language agents with shared formatting logic.""" + def __init__(self, completion_kwargs: dict[str, Any] | None = None, **kwargs: Any) -> None: + super().__init__(**kwargs) + self.completion_kwargs = completion_kwargs or {} + async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]: """Format MCP content blocks into message items.""" content_items: list[dict[str, Any]] = [] @@ -435,6 +448,52 @@ async def format_tool_results( return rendered + @abstractmethod + async def build_prompt(self, messages: list[dict[str, Any]], instruction: str, screenshot_b64: str) -> list[dict[str, Any]]: + """Subclass hook to build model-specific prompt/messages.""" + pass + + @abstractmethod + async def parse_response(self, response: Any, messages: list[dict[str, Any]], screenshot_b64: str) -> AgentResponse: + """Subclass hook to parse model response into AgentResponse.""" + pass + + async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: + instruction = _extract_user_instruction(messages) + + screenshot_b64 = get_last_image_from_messages(messages) + if not screenshot_b64: + call_id = _random_id() + messages.append(_make_screenshot_item(call_id)) + return AgentResponse( + content="capturing initial screenshot", + tool_calls=[MCPToolCall(id=call_id, name="openai_computer", arguments={"type": "screenshot"})], + done=False, + ) + + litellm_messages = await self.build_prompt(messages, instruction, screenshot_b64) + + api_kwargs: dict[str, Any] = { + "model": self.model_name, + "messages": litellm_messages, + } + if "openrouter" in self.model_name.lower(): + api_kwargs["api_key"] = settings.openrouter_api_key or os.getenv("OPENROUTER_API_KEY") + api_kwargs.update(self.completion_kwargs) + + try: + response = await litellm.acompletion(**api_kwargs) + except Exception as exc: + logger.exception(f"{self.__class__.__name__} completion failed: %s", exc) + return AgentResponse( + content=f"{self.__class__.__name__} request failed: {exc}", + tool_calls=[], + done=True, + isError=True, + ) + + return await self.parse_response(response, messages, screenshot_b64) + # Adapter dispatch _ADAPTER_REGISTRY: Dict[str, str] = { diff --git a/hud/agents/openrouter/models/glm45v/glm45v.py b/hud/agents/openrouter/models/glm45v/glm45v.py index 526e91af..993c06c2 100644 --- a/hud/agents/openrouter/models/glm45v/glm45v.py +++ b/hud/agents/openrouter/models/glm45v/glm45v.py @@ -8,26 +8,19 @@ from typing import Any, ClassVar from pathlib import Path -import litellm -import mcp.types as types from litellm.types.utils import ModelResponse from hud.settings import settings from hud.tools.computer.settings import computer_settings -from hud.types import AgentResponse, MCPToolCall, MCPToolResult -from hud import instrument +from hud.types import AgentResponse, MCPToolCall from hud.agents.openrouter import ( OpenRouterBaseAgent, _convert_json_action_to_items, _decode_image_dimensions, - _extract_user_instruction, - _make_failed_tool_call_items, _make_output_text_item, _make_reasoning_item, - _make_screenshot_item, _parse_json_action_string, _random_id, - 
get_last_image_from_messages, ) logger = logging.getLogger(__name__) @@ -329,31 +322,13 @@ def _glm_tool_call_to_mcp(self, item: dict[str, Any]) -> MCPToolCall: return MCPToolCall(id=call_id, name="openai_computer", arguments=arguments) - @instrument(span_type="agent", record_args=False) - async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: - instruction = _extract_user_instruction(messages) + async def build_prompt(self, messages: list[dict[str, Any]], instruction: str, screenshot_b64: str) -> list[dict[str, Any]]: + # Original prompt building logic from get_response if instruction: - self._last_instruction = instruction # type: ignore[attr-defined] + self._last_instruction = instruction self._task_description = instruction task_instruction = self._task_description or getattr(self, "_last_instruction", "") - screenshot_b64 = get_last_image_from_messages(messages) - if not screenshot_b64: - call_id = _random_id() - screenshot_call = _make_screenshot_item(call_id) - messages.append(screenshot_call) - logger.debug("glm45v requesting initial screenshot") - tool_call = MCPToolCall( - id=call_id, - name="openai_computer", - arguments={"type": "screenshot"}, - ) - return AgentResponse( - content="capturing initial screenshot", - tool_calls=[tool_call], - done=False, - ) - self.console.debug(f"glm45v task instruction: {task_instruction}") self.console.debug(f"glm45v memory (pre-step): {self._memory}") @@ -370,23 +345,11 @@ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: if getattr(self, "system_prompt", None): litellm_messages.append({"role": "system", "content": self.system_prompt}) litellm_messages.append({"role": "user", "content": prompt_content}) + + return litellm_messages - api_kwargs = {"model": self.model_name, "messages": litellm_messages} - if settings.openrouter_api_key: - api_kwargs["api_key"] = settings.openrouter_api_key - api_kwargs.update(self.completion_kwargs) - - try: - response = await litellm.acompletion(**api_kwargs) - except Exception as exc: # pragma: no cover - network errors - logger.exception("glm45v completion failed: %s", exc) - return AgentResponse( - content=f"GLM-4.5V request failed: {exc}", - tool_calls=[], - done=True, - isError=True, - ) - + async def parse_response(self, response: Any, messages: list[dict[str, Any]], screenshot_b64: str) -> AgentResponse: + # Original parsing logic from get_response choice = response.choices[0] message = getattr(choice, "message", None) response_content = getattr(message, "content", "") if message else "" @@ -438,5 +401,8 @@ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: raw=response, ) + async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: + return await super().get_response(messages) + __all__ = ["Glm45vAgent"] diff --git a/hud/agents/openrouter/models/uitars/uitars.py b/hud/agents/openrouter/models/uitars/uitars.py index 2d551a7d..6097e9c5 100644 --- a/hud/agents/openrouter/models/uitars/uitars.py +++ b/hud/agents/openrouter/models/uitars/uitars.py @@ -13,22 +13,15 @@ from PIL import Image -import litellm -import mcp.types as types - -from hud import instrument from hud.agents.openrouter import ( OpenRouterBaseAgent, _convert_json_action_to_items, _decode_image_dimensions, - _extract_user_instruction, - _make_failed_tool_call_items, _make_screenshot_item, _random_id, - get_last_image_from_messages, ) from hud.tools.computer.settings import computer_settings -from hud.types import AgentResponse, MCPToolCall, 
MCPToolResult +from hud.types import AgentResponse, MCPToolCall logger = logging.getLogger(__name__) @@ -403,6 +396,8 @@ def __init__(self, *, model_name: str = "ByteDance-Seed/UI-TARS-1.5-7B", complet self._base_completion_kwargs["custom_llm_provider"] = "huggingface" self._provider_model = _resolve_provider_model_name(self.model_name) + self.completion_kwargs = self._base_completion_kwargs + self.model_name = self._provider_model async def get_system_messages(self) -> list[Any]: return [] @@ -412,20 +407,7 @@ def _tool_call(self, item: dict[str, Any]) -> MCPToolCall: action = item.get("action") or {} return MCPToolCall(id=call_id, name="openai_computer", arguments=action) - @instrument(span_type="agent", record_args=False) - async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: - instruction = _extract_user_instruction(messages) - screenshot_b64 = get_last_image_from_messages(messages) - if not screenshot_b64: - call_id = _random_id() - messages.append(_make_screenshot_item(call_id)) - return AgentResponse( - content="capturing initial screenshot", - tool_calls=[MCPToolCall(id=call_id, name="openai_computer", arguments={"type": "screenshot"})], - done=False, - ) - - # Decode original image dims and make a processed copy for the model + async def build_prompt(self, messages: list[dict[str, Any]], instruction: str, screenshot_b64: str) -> list[dict[str, Any]]: try: data = base64.b64decode(screenshot_b64.split(",", 1)[1] if screenshot_b64.startswith("data:image") else screenshot_b64) img = Image.open(BytesIO(data)) @@ -440,13 +422,11 @@ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: img, proc_w, proc_h = _resize_for_model(img) proc_uri = _pil_to_data_uri(img) - # Build messages with history: system prompt + previous turns + current screenshot system_prompt = COMPUTER_USE_DOUBAO.format(language="English", instruction=instruction or "") - + litellm_messages: list[dict[str, Any]] = [] - # Add history of previous actions and screenshots - for msg in messages[:-1]: # Skip the current screenshot + for msg in messages[:-1]: if not isinstance(msg, dict): continue msg_type = msg.get("type") @@ -477,29 +457,20 @@ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: {"type": "image_url", "image_url": {"url": proc_uri}}, ], }) + + return litellm_messages - api_kwargs: dict[str, Any] = { - "model": self._provider_model, - "messages": litellm_messages, - "temperature": 0.1, - "max_tokens": 256, - } - api_kwargs.update(self._base_completion_kwargs) - - try: - response = await litellm.acompletion(**api_kwargs) - except Exception as exc: # pragma: no cover - network errors - logger.exception("uitars completion failed: %s", exc) - return AgentResponse(content=f"UITARS request failed: {exc}", tool_calls=[], done=True, isError=True) - + async def parse_response(self, response: Any, messages: list[dict[str, Any]], screenshot_b64: str) -> AgentResponse: content = (getattr(response.choices[0], "message", None) or {}).get("content", "") logger.debug("UITARS model output: %s", content) + orig_w, orig_h = _decode_image_dimensions(screenshot_b64) + actions = parse_action_to_structure_output( content or "", factor=1000, - origin_resized_height=proc_h, - origin_resized_width=proc_w, + origin_resized_height=orig_h, + origin_resized_width=orig_w, model_type="qwen25vl", ) json_action = _parse_to_json_action(actions) @@ -507,7 +478,6 @@ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: items: list[dict[str, Any]] 
= [] if json_action: call_id = _random_id() - # Feed original dimensions so normalized coords map to real pixels items.extend( _convert_json_action_to_items( json_action, @@ -517,10 +487,9 @@ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: ) ) - # Auto-wait after clicking taskbar/launcher icons (left edge, x < 50) if items and json_action.get("type") in {"click", "left_click"}: x_coord = json_action.get("x", 0) - if x_coord < 50: # Likely a launcher icon + if x_coord < 50: logger.info("Detected launcher click at x=%d, adding auto-wait", x_coord) items.append({ "type": "computer_call", @@ -537,5 +506,8 @@ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: tool_calls = [self._tool_call(i) for i in items if i.get("type") == "computer_call"] return AgentResponse(content=None, tool_calls=tool_calls, done=not tool_calls, raw=response) + async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: + return await super().get_response(messages) + __all__ = ["UITarsAgent"] From 045dce8f1e0000cb63c84dca1c8aa3c58db20d59 Mon Sep 17 00:00:00 2001 From: shfunc Date: Thu, 16 Oct 2025 09:44:00 +0200 Subject: [PATCH 11/14] click action bug fix --- hud/agents/openrouter/models/uitars/uitars.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hud/agents/openrouter/models/uitars/uitars.py b/hud/agents/openrouter/models/uitars/uitars.py index 6097e9c5..d077df92 100644 --- a/hud/agents/openrouter/models/uitars/uitars.py +++ b/hud/agents/openrouter/models/uitars/uitars.py @@ -488,7 +488,8 @@ async def parse_response(self, response: Any, messages: list[dict[str, Any]], sc ) if items and json_action.get("type") in {"click", "left_click"}: - x_coord = json_action.get("x", 0) + converted_action = items[0].get("action", {}) if items else {} + x_coord = converted_action.get("x", 0) if x_coord < 50: logger.info("Detected launcher click at x=%d, adding auto-wait", x_coord) items.append({ From bd7f56567f936d162c4589db767dc77355ab3fa3 Mon Sep 17 00:00:00 2001 From: shfunc Date: Thu, 16 Oct 2025 09:49:53 +0200 Subject: [PATCH 12/14] midpoint calc fix --- hud/agents/openrouter/models/uitars/uitars.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hud/agents/openrouter/models/uitars/uitars.py b/hud/agents/openrouter/models/uitars/uitars.py index d077df92..ba68e50b 100644 --- a/hud/agents/openrouter/models/uitars/uitars.py +++ b/hud/agents/openrouter/models/uitars/uitars.py @@ -72,9 +72,7 @@ def convert_point_to_coordinates(text: str, is_answer: bool = False) -> str: def replace_match(match: re.Match[str]) -> str: x1, y1 = map(int, match.groups()) - x = (x1 + x1) // 2 - y = (y1 + y1) // 2 - return f"({x},{y})" + return f"({x1},{y1})" text = re.sub(r"\[EOS\]", "", text) return re.sub(pattern, replace_match, text).strip() From d823528540401bb18b53cd7751be61d02b210fa3 Mon Sep 17 00:00:00 2001 From: ilya <95108691+shfunc@users.noreply.github.com> Date: Thu, 16 Oct 2025 10:12:24 +0200 Subject: [PATCH 13/14] Delete examples/mcp_sum_server.py --- examples/mcp_sum_server.py | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100644 examples/mcp_sum_server.py diff --git a/examples/mcp_sum_server.py b/examples/mcp_sum_server.py deleted file mode 100644 index 7c26d123..00000000 --- a/examples/mcp_sum_server.py +++ /dev/null @@ -1,23 +0,0 @@ -"""FastMCP server exposing a simple sum tool. - -Run with: `python examples/mcp_sum_server.py`. 
-""" - -from __future__ import annotations - -from fastmcp import FastMCP - - -server = FastMCP("SumServer") - - -@server.tool() -def sum(a: int, b: int) -> dict[str, int]: - """Return the sum of two integers.""" - return {"result": a + b} - - -if __name__ == "__main__": - server.run() - - From 1279555c3e4b84b46abbbf6c978b83c4eb161dd4 Mon Sep 17 00:00:00 2001 From: shinbehavior Date: Fri, 17 Oct 2025 20:25:26 +0200 Subject: [PATCH 14/14] instument fix --- hud/agents/openrouter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hud/agents/openrouter.py b/hud/agents/openrouter.py index cc08e1ab..58bc9019 100644 --- a/hud/agents/openrouter.py +++ b/hud/agents/openrouter.py @@ -458,6 +458,12 @@ async def parse_response(self, response: Any, messages: list[dict[str, Any]], sc """Subclass hook to parse model response into AgentResponse.""" pass + @instrument( + span_type="agent", + record_args=False, + record_result=True, + ) + async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse: instruction = _extract_user_instruction(messages)