trycua/cua # codebase.md

This is page 13 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .all-contributorsrc
├── .cursorignore
├── .devcontainer
│   ├── devcontainer.json
│   ├── post-install.sh
│   └── README.md
├── .dockerignore
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── scripts
│   │   ├── get_pyproject_version.py
│   │   └── tests
│   │       ├── __init__.py
│   │       ├── README.md
│   │       └── test_get_pyproject_version.py
│   └── workflows
│       ├── ci-lume.yml
│       ├── docker-publish-kasm.yml
│       ├── docker-publish-xfce.yml
│       ├── docker-reusable-publish.yml
│       ├── npm-publish-computer.yml
│       ├── npm-publish-core.yml
│       ├── publish-lume.yml
│       ├── pypi-publish-agent.yml
│       ├── pypi-publish-computer-server.yml
│       ├── pypi-publish-computer.yml
│       ├── pypi-publish-core.yml
│       ├── pypi-publish-mcp-server.yml
│       ├── pypi-publish-pylume.yml
│       ├── pypi-publish-som.yml
│       ├── pypi-reusable-publish.yml
│       └── test-validation-script.yml
├── .gitignore
├── .vscode
│   ├── docs.code-workspace
│   ├── launch.json
│   ├── libs-ts.code-workspace
│   ├── lume.code-workspace
│   ├── lumier.code-workspace
│   ├── py.code-workspace
│   └── settings.json
├── blog
│   ├── app-use.md
│   ├── assets
│   │   ├── composite-agents.png
│   │   ├── docker-ubuntu-support.png
│   │   ├── hack-booth.png
│   │   ├── hack-closing-ceremony.jpg
│   │   ├── hack-cua-ollama-hud.jpeg
│   │   ├── hack-leaderboard.png
│   │   ├── hack-the-north.png
│   │   ├── hack-winners.jpeg
│   │   ├── hack-workshop.jpeg
│   │   ├── hud-agent-evals.png
│   │   └── trajectory-viewer.jpeg
│   ├── bringing-computer-use-to-the-web.md
│   ├── build-your-own-operator-on-macos-1.md
│   ├── build-your-own-operator-on-macos-2.md
│   ├── composite-agents.md
│   ├── cua-hackathon.md
│   ├── hack-the-north.md
│   ├── hud-agent-evals.md
│   ├── human-in-the-loop.md
│   ├── introducing-cua-cloud-containers.md
│   ├── lume-to-containerization.md
│   ├── sandboxed-python-execution.md
│   ├── training-computer-use-models-trajectories-1.md
│   ├── trajectory-viewer.md
│   ├── ubuntu-docker-support.md
│   └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│   ├── .gitignore
│   ├── .prettierrc
│   ├── content
│   │   └── docs
│   │       ├── agent-sdk
│   │       │   ├── agent-loops.mdx
│   │       │   ├── benchmarks
│   │       │   │   ├── index.mdx
│   │       │   │   ├── interactive.mdx
│   │       │   │   ├── introduction.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── osworld-verified.mdx
│   │       │   │   ├── screenspot-pro.mdx
│   │       │   │   └── screenspot-v2.mdx
│   │       │   ├── callbacks
│   │       │   │   ├── agent-lifecycle.mdx
│   │       │   │   ├── cost-saving.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── logging.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── pii-anonymization.mdx
│   │       │   │   └── trajectories.mdx
│   │       │   ├── chat-history.mdx
│   │       │   ├── custom-computer-handlers.mdx
│   │       │   ├── custom-tools.mdx
│   │       │   ├── customizing-computeragent.mdx
│   │       │   ├── integrations
│   │       │   │   ├── hud.mdx
│   │       │   │   └── meta.json
│   │       │   ├── message-format.mdx
│   │       │   ├── meta.json
│   │       │   ├── migration-guide.mdx
│   │       │   ├── prompt-caching.mdx
│   │       │   ├── supported-agents
│   │       │   │   ├── composed-agents.mdx
│   │       │   │   ├── computer-use-agents.mdx
│   │       │   │   ├── grounding-models.mdx
│   │       │   │   ├── human-in-the-loop.mdx
│   │       │   │   └── meta.json
│   │       │   ├── supported-model-providers
│   │       │   │   ├── index.mdx
│   │       │   │   └── local-models.mdx
│   │       │   └── usage-tracking.mdx
│   │       ├── computer-sdk
│   │       │   ├── cloud-vm-management.mdx
│   │       │   ├── commands.mdx
│   │       │   ├── computer-ui.mdx
│   │       │   ├── computers.mdx
│   │       │   ├── meta.json
│   │       │   └── sandboxed-python.mdx
│   │       ├── index.mdx
│   │       ├── libraries
│   │       │   ├── agent
│   │       │   │   └── index.mdx
│   │       │   ├── computer
│   │       │   │   └── index.mdx
│   │       │   ├── computer-server
│   │       │   │   ├── Commands.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── REST-API.mdx
│   │       │   │   └── WebSocket-API.mdx
│   │       │   ├── core
│   │       │   │   └── index.mdx
│   │       │   ├── lume
│   │       │   │   ├── cli-reference.mdx
│   │       │   │   ├── faq.md
│   │       │   │   ├── http-api.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── meta.json
│   │       │   │   └── prebuilt-images.mdx
│   │       │   ├── lumier
│   │       │   │   ├── building-lumier.mdx
│   │       │   │   ├── docker-compose.mdx
│   │       │   │   ├── docker.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   └── meta.json
│   │       │   ├── mcp-server
│   │       │   │   ├── client-integrations.mdx
│   │       │   │   ├── configuration.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── llm-integrations.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── tools.mdx
│   │       │   │   └── usage.mdx
│   │       │   └── som
│   │       │       ├── configuration.mdx
│   │       │       └── index.mdx
│   │       ├── meta.json
│   │       ├── quickstart-cli.mdx
│   │       ├── quickstart-devs.mdx
│   │       └── telemetry.mdx
│   ├── next.config.mjs
│   ├── package-lock.json
│   ├── package.json
│   ├── pnpm-lock.yaml
│   ├── postcss.config.mjs
│   ├── public
│   │   └── img
│   │       ├── agent_gradio_ui.png
│   │       ├── agent.png
│   │       ├── cli.png
│   │       ├── computer.png
│   │       ├── som_box_threshold.png
│   │       └── som_iou_threshold.png
│   ├── README.md
│   ├── source.config.ts
│   ├── src
│   │   ├── app
│   │   │   ├── (home)
│   │   │   │   ├── [[...slug]]
│   │   │   │   │   └── page.tsx
│   │   │   │   └── layout.tsx
│   │   │   ├── api
│   │   │   │   └── search
│   │   │   │       └── route.ts
│   │   │   ├── favicon.ico
│   │   │   ├── global.css
│   │   │   ├── layout.config.tsx
│   │   │   ├── layout.tsx
│   │   │   ├── llms.mdx
│   │   │   │   └── [[...slug]]
│   │   │   │       └── route.ts
│   │   │   └── llms.txt
│   │   │       └── route.ts
│   │   ├── assets
│   │   │   ├── discord-black.svg
│   │   │   ├── discord-white.svg
│   │   │   ├── logo-black.svg
│   │   │   └── logo-white.svg
│   │   ├── components
│   │   │   ├── iou.tsx
│   │   │   └── mermaid.tsx
│   │   ├── lib
│   │   │   ├── llms.ts
│   │   │   └── source.ts
│   │   └── mdx-components.tsx
│   └── tsconfig.json
├── examples
│   ├── agent_examples.py
│   ├── agent_ui_examples.py
│   ├── cloud_api_examples.py
│   ├── computer_examples_windows.py
│   ├── computer_examples.py
│   ├── computer_ui_examples.py
│   ├── computer-example-ts
│   │   ├── .env.example
│   │   ├── .gitignore
│   │   ├── .prettierrc
│   │   ├── package-lock.json
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── README.md
│   │   ├── src
│   │   │   ├── helpers.ts
│   │   │   └── index.ts
│   │   └── tsconfig.json
│   ├── docker_examples.py
│   ├── evals
│   │   ├── hud_eval_examples.py
│   │   └── wikipedia_most_linked.txt
│   ├── pylume_examples.py
│   ├── sandboxed_functions_examples.py
│   ├── som_examples.py
│   ├── utils.py
│   └── winsandbox_example.py
├── img
│   ├── agent_gradio_ui.png
│   ├── agent.png
│   ├── cli.png
│   ├── computer.png
│   ├── logo_black.png
│   └── logo_white.png
├── libs
│   ├── kasm
│   │   ├── Dockerfile
│   │   ├── LICENSE
│   │   ├── README.md
│   │   └── src
│   │       └── ubuntu
│   │           └── install
│   │               └── firefox
│   │                   ├── custom_startup.sh
│   │                   ├── firefox.desktop
│   │                   └── install_firefox.sh
│   ├── lume
│   │   ├── .cursorignore
│   │   ├── CONTRIBUTING.md
│   │   ├── Development.md
│   │   ├── img
│   │   │   └── cli.png
│   │   ├── Package.resolved
│   │   ├── Package.swift
│   │   ├── README.md
│   │   ├── resources
│   │   │   └── lume.entitlements
│   │   ├── scripts
│   │   │   ├── build
│   │   │   │   ├── build-debug.sh
│   │   │   │   ├── build-release-notarized.sh
│   │   │   │   └── build-release.sh
│   │   │   └── install.sh
│   │   ├── src
│   │   │   ├── Commands
│   │   │   │   ├── Clone.swift
│   │   │   │   ├── Config.swift
│   │   │   │   ├── Create.swift
│   │   │   │   ├── Delete.swift
│   │   │   │   ├── Get.swift
│   │   │   │   ├── Images.swift
│   │   │   │   ├── IPSW.swift
│   │   │   │   ├── List.swift
│   │   │   │   ├── Logs.swift
│   │   │   │   ├── Options
│   │   │   │   │   └── FormatOption.swift
│   │   │   │   ├── Prune.swift
│   │   │   │   ├── Pull.swift
│   │   │   │   ├── Push.swift
│   │   │   │   ├── Run.swift
│   │   │   │   ├── Serve.swift
│   │   │   │   ├── Set.swift
│   │   │   │   └── Stop.swift
│   │   │   ├── ContainerRegistry
│   │   │   │   ├── ImageContainerRegistry.swift
│   │   │   │   ├── ImageList.swift
│   │   │   │   └── ImagesPrinter.swift
│   │   │   ├── Errors
│   │   │   │   └── Errors.swift
│   │   │   ├── FileSystem
│   │   │   │   ├── Home.swift
│   │   │   │   ├── Settings.swift
│   │   │   │   ├── VMConfig.swift
│   │   │   │   ├── VMDirectory.swift
│   │   │   │   └── VMLocation.swift
│   │   │   ├── LumeController.swift
│   │   │   ├── Main.swift
│   │   │   ├── Server
│   │   │   │   ├── Handlers.swift
│   │   │   │   ├── HTTP.swift
│   │   │   │   ├── Requests.swift
│   │   │   │   ├── Responses.swift
│   │   │   │   └── Server.swift
│   │   │   ├── Utils
│   │   │   │   ├── CommandRegistry.swift
│   │   │   │   ├── CommandUtils.swift
│   │   │   │   ├── Logger.swift
│   │   │   │   ├── NetworkUtils.swift
│   │   │   │   ├── Path.swift
│   │   │   │   ├── ProcessRunner.swift
│   │   │   │   ├── ProgressLogger.swift
│   │   │   │   ├── String.swift
│   │   │   │   └── Utils.swift
│   │   │   ├── Virtualization
│   │   │   │   ├── DarwinImageLoader.swift
│   │   │   │   ├── DHCPLeaseParser.swift
│   │   │   │   ├── ImageLoaderFactory.swift
│   │   │   │   └── VMVirtualizationService.swift
│   │   │   ├── VM
│   │   │   │   ├── DarwinVM.swift
│   │   │   │   ├── LinuxVM.swift
│   │   │   │   ├── VM.swift
│   │   │   │   ├── VMDetails.swift
│   │   │   │   ├── VMDetailsPrinter.swift
│   │   │   │   ├── VMDisplayResolution.swift
│   │   │   │   └── VMFactory.swift
│   │   │   └── VNC
│   │   │       ├── PassphraseGenerator.swift
│   │   │       └── VNCService.swift
│   │   └── tests
│   │       ├── Mocks
│   │       │   ├── MockVM.swift
│   │       │   ├── MockVMVirtualizationService.swift
│   │       │   └── MockVNCService.swift
│   │       ├── VM
│   │       │   └── VMDetailsPrinterTests.swift
│   │       ├── VMTests.swift
│   │       ├── VMVirtualizationServiceTests.swift
│   │       └── VNCServiceTests.swift
│   ├── lumier
│   │   ├── .dockerignore
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   └── src
│   │       ├── bin
│   │       │   └── entry.sh
│   │       ├── config
│   │       │   └── constants.sh
│   │       ├── hooks
│   │       │   └── on-logon.sh
│   │       └── lib
│   │           ├── utils.sh
│   │           └── vm.sh
│   ├── python
│   │   ├── agent
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── agent
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── adapters
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── huggingfacelocal_adapter.py
│   │   │   │   │   ├── human_adapter.py
│   │   │   │   │   ├── mlxvlm_adapter.py
│   │   │   │   │   └── models
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── generic.py
│   │   │   │   │       ├── internvl.py
│   │   │   │   │       ├── opencua.py
│   │   │   │   │       └── qwen2_5_vl.py
│   │   │   │   ├── agent.py
│   │   │   │   ├── callbacks
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── budget_manager.py
│   │   │   │   │   ├── image_retention.py
│   │   │   │   │   ├── logging.py
│   │   │   │   │   ├── operator_validator.py
│   │   │   │   │   ├── pii_anonymization.py
│   │   │   │   │   ├── prompt_instructions.py
│   │   │   │   │   ├── telemetry.py
│   │   │   │   │   └── trajectory_saver.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── computers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cua.py
│   │   │   │   │   └── custom.py
│   │   │   │   ├── decorators.py
│   │   │   │   ├── human_tool
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   ├── server.py
│   │   │   │   │   └── ui.py
│   │   │   │   ├── integrations
│   │   │   │   │   └── hud
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── agent.py
│   │   │   │   │       └── proxy.py
│   │   │   │   ├── loops
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── anthropic.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── composed_grounded.py
│   │   │   │   │   ├── gemini.py
│   │   │   │   │   ├── glm45v.py
│   │   │   │   │   ├── gta1.py
│   │   │   │   │   ├── holo.py
│   │   │   │   │   ├── internvl.py
│   │   │   │   │   ├── model_types.csv
│   │   │   │   │   ├── moondream3.py
│   │   │   │   │   ├── omniparser.py
│   │   │   │   │   ├── openai.py
│   │   │   │   │   ├── opencua.py
│   │   │   │   │   └── uitars.py
│   │   │   │   ├── proxy
│   │   │   │   │   ├── examples.py
│   │   │   │   │   └── handlers.py
│   │   │   │   ├── responses.py
│   │   │   │   ├── types.py
│   │   │   │   └── ui
│   │   │   │       ├── __init__.py
│   │   │   │       ├── __main__.py
│   │   │   │       └── gradio
│   │   │   │           ├── __init__.py
│   │   │   │           ├── app.py
│   │   │   │           └── ui_components.py
│   │   │   ├── benchmarks
│   │   │   │   ├── .gitignore
│   │   │   │   ├── contrib.md
│   │   │   │   ├── interactive.py
│   │   │   │   ├── models
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   └── gta1.py
│   │   │   │   ├── README.md
│   │   │   │   ├── ss-pro.py
│   │   │   │   ├── ss-v2.py
│   │   │   │   └── utils.py
│   │   │   ├── example.py
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── computer
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── computer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── computer.py
│   │   │   │   ├── diorama_computer.py
│   │   │   │   ├── helpers.py
│   │   │   │   ├── interface
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   ├── models.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── logger.py
│   │   │   │   ├── models.py
│   │   │   │   ├── providers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cloud
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── docker
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── lume
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── lume_api.py
│   │   │   │   │   ├── lumier
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── types.py
│   │   │   │   │   └── winsandbox
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── provider.py
│   │   │   │   │       └── setup_script.ps1
│   │   │   │   ├── ui
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   └── gradio
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       └── app.py
│   │   │   │   └── utils.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── computer-server
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── computer_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── diorama
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── diorama_computer.py
│   │   │   │   │   ├── diorama.py
│   │   │   │   │   ├── draw.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── safezone.py
│   │   │   │   ├── handlers
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── main.py
│   │   │   │   ├── server.py
│   │   │   │   └── watchdog.py
│   │   │   ├── examples
│   │   │   │   ├── __init__.py
│   │   │   │   └── usage_example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   ├── run_server.py
│   │   │   └── test_connection.py
│   │   ├── core
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── core
│   │   │   │   ├── __init__.py
│   │   │   │   └── telemetry
│   │   │   │       ├── __init__.py
│   │   │   │       └── posthog.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── mcp-server
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── CONCURRENT_SESSIONS.md
│   │   │   ├── mcp_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── server.py
│   │   │   │   └── session_manager.py
│   │   │   ├── pdm.lock
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── scripts
│   │   │       ├── install_mcp_server.sh
│   │   │       └── start_mcp_server.sh
│   │   ├── pylume
│   │   │   ├── __init__.py
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── pylume
│   │   │   │   ├── __init__.py
│   │   │   │   ├── client.py
│   │   │   │   ├── exceptions.py
│   │   │   │   ├── lume
│   │   │   │   ├── models.py
│   │   │   │   ├── pylume.py
│   │   │   │   └── server.py
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   └── som
│   │       ├── .bumpversion.cfg
│   │       ├── LICENSE
│   │       ├── poetry.toml
│   │       ├── pyproject.toml
│   │       ├── README.md
│   │       ├── som
│   │       │   ├── __init__.py
│   │       │   ├── detect.py
│   │       │   ├── detection.py
│   │       │   ├── models.py
│   │       │   ├── ocr.py
│   │       │   ├── util
│   │       │   │   └── utils.py
│   │       │   └── visualization.py
│   │       └── tests
│   │           └── test_omniparser.py
│   ├── typescript
│   │   ├── .gitignore
│   │   ├── .nvmrc
│   │   ├── agent
│   │   │   ├── examples
│   │   │   │   ├── playground-example.html
│   │   │   │   └── README.md
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── client.ts
│   │   │   │   ├── index.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   └── client.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── biome.json
│   │   ├── computer
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── computer
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── providers
│   │   │   │   │   │   ├── base.ts
│   │   │   │   │   │   ├── cloud.ts
│   │   │   │   │   │   └── index.ts
│   │   │   │   │   └── types.ts
│   │   │   │   ├── index.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── base.ts
│   │   │   │   │   ├── factory.ts
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── linux.ts
│   │   │   │   │   ├── macos.ts
│   │   │   │   │   └── windows.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   ├── computer
│   │   │   │   │   └── cloud.test.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── factory.test.ts
│   │   │   │   │   ├── index.test.ts
│   │   │   │   │   ├── linux.test.ts
│   │   │   │   │   ├── macos.test.ts
│   │   │   │   │   └── windows.test.ts
│   │   │   │   └── setup.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── core
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── index.ts
│   │   │   │   └── telemetry
│   │   │   │       ├── clients
│   │   │   │       │   ├── index.ts
│   │   │   │       │   └── posthog.ts
│   │   │   │       └── index.ts
│   │   │   ├── tests
│   │   │   │   └── telemetry.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── pnpm-workspace.yaml
│   │   └── README.md
│   └── xfce
│       ├── .dockerignore
│       ├── .gitignore
│       ├── Dockerfile
│       ├── README.md
│       └── src
│           ├── scripts
│           │   ├── resize-display.sh
│           │   ├── start-computer-server.sh
│           │   ├── start-novnc.sh
│           │   ├── start-vnc.sh
│           │   └── xstartup.sh
│           ├── supervisor
│           │   └── supervisord.conf
│           └── xfce-config
│               ├── helpers.rc
│               ├── xfce4-power-manager.xml
│               └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│   ├── agent_nb.ipynb
│   ├── blog
│   │   ├── build-your-own-operator-on-macos-1.ipynb
│   │   └── build-your-own-operator-on-macos-2.ipynb
│   ├── composite_agents_docker_nb.ipynb
│   ├── computer_nb.ipynb
│   ├── computer_server_nb.ipynb
│   ├── customizing_computeragent.ipynb
│   ├── eval_osworld.ipynb
│   ├── ollama_nb.ipynb
│   ├── pylume_nb.ipynb
│   ├── README.md
│   ├── sota_hackathon_cloud.ipynb
│   └── sota_hackathon.ipynb
├── pdm.lock
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── samples
│   └── community
│       ├── global-online
│       │   └── README.md
│       └── hack-the-north
│           └── README.md
├── scripts
│   ├── build-uv.sh
│   ├── build.ps1
│   ├── build.sh
│   ├── cleanup.sh
│   ├── playground-docker.sh
│   ├── playground.sh
│   └── run-docker-dev.sh
└── tests
    ├── pytest.ini
    ├── shell_cmd.py
    ├── test_files.py
    ├── test_mcp_server_session_management.py
    ├── test_mcp_server_streaming.py
    ├── test_shell_bash.py
    ├── test_telemetry.py
    ├── test_venv.py
    └── test_watchdog.py
```

# Files

--------------------------------------------------------------------------------
/examples/som_examples.py:
--------------------------------------------------------------------------------

```python
  1 | #!/usr/bin/env python3
  2 | """
  3 | Example script demonstrating the usage of OmniParser's UI element detection functionality.
  4 | This script shows how to:
  5 | 1. Initialize the OmniParser
  6 | 2. Load and process images
  7 | 3. Visualize detection results
  8 | 4. Compare performance between CPU and MPS (Apple Silicon)
  9 | """
 10 | 
 11 | import argparse
 12 | import logging
 13 | import sys
 14 | from pathlib import Path
 15 | import time
 16 | from PIL import Image
 17 | from typing import Dict, Any, List, Optional
 18 | import numpy as np
 19 | import io
 20 | import base64
 21 | import glob
 22 | import os
 23 | 
 24 | # Load environment variables from .env file
 25 | project_root = Path(__file__).parent.parent
 26 | env_file = project_root / ".env"
 27 | print(f"Loading environment from: {env_file}")
 28 | from dotenv import load_dotenv
 29 | 
 30 | load_dotenv(env_file)
 31 | 
 32 | # Add paths to sys.path if needed
 33 | pythonpath = os.environ.get("PYTHONPATH", "")
 34 | for path in pythonpath.split(":"):
 35 |     if path and path not in sys.path:
 36 |         sys.path.append(path)
 37 |         print(f"Added to sys.path: {path}")
 38 | 
 39 | # Add the libs directory to the path to find som
 40 | libs_path = project_root / "libs"
 41 | if str(libs_path) not in sys.path:
 42 |     sys.path.append(str(libs_path))
 43 |     print(f"Added to sys.path: {libs_path}")
 44 | 
 45 | from som import OmniParser, ParseResult, IconElement, TextElement
 46 | from som.models import UIElement, ParserMetadata, BoundingBox
 47 | 
 48 | # Configure logging
 49 | logging.basicConfig(
 50 |     level=logging.INFO,
 51 |     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
 52 |     datefmt="%Y-%m-%d %H:%M:%S",
 53 | )
 54 | logger = logging.getLogger(__name__)
 55 | 
 56 | 
 57 | def setup_logging():
 58 |     """Configure logging with a nice format."""
 59 |     logging.basicConfig(
 60 |         level=logging.INFO,
 61 |         format="%(asctime)s - %(levelname)s - %(message)s",
 62 |         datefmt="%Y-%m-%d %H:%M:%S",
 63 |     )
 64 | 
 65 | 
 66 | class Timer:
 67 |     """Enhanced context manager for timing code blocks."""
 68 | 
 69 |     def __init__(self, name: str, logger):
 70 |         self.name = name
 71 |         self.logger = logger
 72 |         self.start_time: float = 0.0
 73 |         self.elapsed_time: float = 0.0
 74 | 
 75 |     def __enter__(self):
 76 |         self.start_time = time.time()
 77 |         return self
 78 | 
 79 |     def __exit__(self, *args):
 80 |         self.elapsed_time = time.time() - self.start_time
 81 |         self.logger.info(f"{self.name}: {self.elapsed_time:.3f}s")
 82 |         return False
 83 | 
 84 | 
 85 | def image_to_bytes(image: Image.Image) -> bytes:
 86 |     """Convert PIL Image to PNG bytes."""
 87 |     buf = io.BytesIO()
 88 |     image.save(buf, format="PNG")
 89 |     return buf.getvalue()
 90 | 
 91 | 
 92 | def process_image(
 93 |     parser: OmniParser, image_path: str, output_dir: Path, use_ocr: bool = False
 94 | ) -> None:
 95 |     """Process a single image and save the result."""
 96 |     try:
 97 |         # Load image
 98 |         logger.info(f"Processing image: {image_path}")
 99 |         image = Image.open(image_path).convert("RGB")
100 |         logger.info(f"Image loaded successfully, size: {image.size}")
101 | 
102 |         # Create output filename
103 |         input_filename = Path(image_path).stem
104 |         output_path = output_dir / f"{input_filename}_analyzed.png"
105 | 
106 |         # Convert image to PNG bytes
107 |         image_bytes = image_to_bytes(image)
108 | 
109 |         # Process image
110 |         with Timer(f"Processing {input_filename}", logger):
111 |             result = parser.parse(image_bytes, use_ocr=use_ocr)
112 |             logger.info(
113 |                 f"Found {result.metadata.num_icons} icons and {result.metadata.num_text} text elements"
114 |             )
115 | 
116 |             # Save the annotated image
117 |             logger.info(f"Saving annotated image to: {output_path}")
118 |             try:
119 |                 # Save image from base64
120 |                 img_data = base64.b64decode(result.annotated_image_base64)
121 |                 img = Image.open(io.BytesIO(img_data))
122 |                 img.save(output_path)
123 | 
124 |                 # Print detailed results
125 |                 logger.info("\nDetected Elements:")
126 |                 for elem in result.elements:
127 |                     if isinstance(elem, IconElement):
128 |                         logger.info(
129 |                             f"Icon: confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}"
130 |                         )
131 |                     elif isinstance(elem, TextElement):
132 |                         logger.info(
133 |                             f"Text: '{elem.content}', confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}"
134 |                         )
135 | 
136 |                 # Verify file exists and log size
137 |                 if output_path.exists():
138 |                     logger.info(
139 |                         f"Successfully saved image. File size: {output_path.stat().st_size} bytes"
140 |                     )
141 |                 else:
142 |                     logger.error(f"Failed to verify file at {output_path}")
143 |             except Exception as e:
144 |                 logger.error(f"Error saving image: {str(e)}", exc_info=True)
145 | 
146 |     except Exception as e:
147 |         logger.error(f"Error processing image {image_path}: {str(e)}", exc_info=True)
148 | 
149 | 
150 | def run_detection_benchmark(
151 |     input_path: str,
152 |     output_dir: Path,
153 |     use_ocr: bool = False,
154 |     box_threshold: float = 0.01,
155 |     iou_threshold: float = 0.1,
156 | ):
157 |     """Run detection benchmark on images."""
158 |     logger.info(
159 |         f"Starting benchmark with OCR enabled: {use_ocr}, box_threshold: {box_threshold}, iou_threshold: {iou_threshold}"
160 |     )
161 | 
162 |     try:
163 |         # Initialize parser
164 |         logger.info("Initializing OmniParser...")
165 |         parser = OmniParser()
166 | 
167 |         # Create output directory
168 |         output_dir.mkdir(parents=True, exist_ok=True)
169 |         logger.info(f"Output directory created at: {output_dir}")
170 | 
171 |         # Get list of PNG files
172 |         if os.path.isdir(input_path):
173 |             image_files = glob.glob(os.path.join(input_path, "*.png"))
174 |         else:
175 |             image_files = [input_path]
176 | 
177 |         logger.info(f"Found {len(image_files)} images to process")
178 | 
179 |         # Process each image with specified thresholds
180 |         for image_path in image_files:
181 |             try:
182 |                 # Load image
183 |                 logger.info(f"Processing image: {image_path}")
184 |                 image = Image.open(image_path).convert("RGB")
185 |                 logger.info(f"Image loaded successfully, size: {image.size}")
186 | 
187 |                 # Create output filename
188 |                 input_filename = Path(image_path).stem
189 |                 output_path = output_dir / f"{input_filename}_analyzed.png"
190 | 
191 |                 # Convert image to PNG bytes
192 |                 image_bytes = image_to_bytes(image)
193 | 
194 |                 # Process image with specified thresholds
195 |                 with Timer(f"Processing {input_filename}", logger):
196 |                     result = parser.parse(
197 |                         image_bytes,
198 |                         use_ocr=use_ocr,
199 |                         box_threshold=box_threshold,
200 |                         iou_threshold=iou_threshold,
201 |                     )
202 |                     logger.info(
203 |                         f"Found {result.metadata.num_icons} icons and {result.metadata.num_text} text elements"
204 |                     )
205 | 
206 |                     # Save the annotated image
207 |                     logger.info(f"Saving annotated image to: {output_path}")
208 |                     try:
209 |                         # Save image from base64
210 |                         img_data = base64.b64decode(result.annotated_image_base64)
211 |                         img = Image.open(io.BytesIO(img_data))
212 |                         img.save(output_path)
213 | 
214 |                         # Print detailed results
215 |                         logger.info("\nDetected Elements:")
216 |                         for elem in result.elements:
217 |                             if isinstance(elem, IconElement):
218 |                                 logger.info(
219 |                                     f"Icon: confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}"
220 |                                 )
221 |                             elif isinstance(elem, TextElement):
222 |                                 logger.info(
223 |                                     f"Text: '{elem.content}', confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}"
224 |                                 )
225 | 
226 |                         # Verify file exists and log size
227 |                         if output_path.exists():
228 |                             logger.info(
229 |                                 f"Successfully saved image. File size: {output_path.stat().st_size} bytes"
230 |                             )
231 |                         else:
232 |                             logger.error(f"Failed to verify file at {output_path}")
233 |                     except Exception as e:
234 |                         logger.error(f"Error saving image: {str(e)}", exc_info=True)
235 | 
236 |             except Exception as e:
237 |                 logger.error(f"Error processing image {image_path}: {str(e)}", exc_info=True)
238 | 
239 |     except Exception as e:
240 |         logger.error(f"Benchmark failed: {str(e)}", exc_info=True)
241 |         raise
242 | 
243 | 
244 | def run_experiments(input_path: str, output_dir: Path, use_ocr: bool = False):
245 |     """Run experiments with different threshold combinations."""
246 |     # Define threshold values to test
247 |     box_thresholds = [0.01, 0.05, 0.1, 0.3]
248 |     iou_thresholds = [0.05, 0.1, 0.2, 0.5]
249 | 
250 |     logger.info("Starting threshold experiments...")
251 |     logger.info("Box thresholds to test: %s", box_thresholds)
252 |     logger.info("IOU thresholds to test: %s", iou_thresholds)
253 | 
254 |     # Create results directory for this experiment
255 |     timestamp = time.strftime("%Y%m%d-%H%M%S")
256 |     ocr_suffix = "_ocr" if use_ocr else "_no_ocr"
257 |     exp_dir = output_dir / f"experiment_{timestamp}{ocr_suffix}"
258 |     exp_dir.mkdir(parents=True, exist_ok=True)
259 | 
260 |     # Create a summary file
261 |     summary_file = exp_dir / "results_summary.txt"
262 |     with open(summary_file, "w") as f:
263 |         f.write("Threshold Experiments Results\n")
264 |         f.write("==========================\n\n")
265 |         f.write(f"Input: {input_path}\n")
266 |         f.write(f"OCR Enabled: {use_ocr}\n")
267 |         f.write(f"Date: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
268 |         f.write("Results:\n")
269 |         f.write("-" * 80 + "\n")
270 |         f.write(
271 |             f"{'Box Thresh':^10} | {'IOU Thresh':^10} | {'Num Icons':^10} | {'Num Text':^10} | {'Time (s)':^10}\n"
272 |         )
273 |         f.write("-" * 80 + "\n")
274 | 
275 |         # Initialize parser once for all experiments
276 |         parser = OmniParser()
277 | 
278 |         # Run experiments with each combination
279 |         for box_thresh in box_thresholds:
280 |             for iou_thresh in iou_thresholds:
281 |                 logger.info(f"\nTesting box_threshold={box_thresh}, iou_threshold={iou_thresh}")
282 | 
283 |                 # Create directory for this combination
284 |                 combo_dir = exp_dir / f"box_{box_thresh}_iou_{iou_thresh}"
285 |                 combo_dir.mkdir(exist_ok=True)
286 | 
287 |                 try:
288 |                     # Process each image
289 |                     if os.path.isdir(input_path):
290 |                         image_files = glob.glob(os.path.join(input_path, "*.png"))
291 |                     else:
292 |                         image_files = [input_path]
293 | 
294 |                     total_icons = 0
295 |                     total_text = 0
296 |                     total_time = 0
297 | 
298 |                     for image_path in image_files:
299 |                         # Load and process image
300 |                         image = Image.open(image_path).convert("RGB")
301 |                         image_bytes = image_to_bytes(image)
302 | 
303 |                         # Process with current thresholds
304 |                         with Timer(f"Processing {Path(image_path).stem}", logger) as t:
305 |                             result = parser.parse(
306 |                                 image_bytes,
307 |                                 use_ocr=use_ocr,
308 |                                 box_threshold=box_thresh,
309 |                                 iou_threshold=iou_thresh,
310 |                             )
311 | 
312 |                             # Save annotated image
313 |                             output_path = combo_dir / f"{Path(image_path).stem}_analyzed.png"
314 |                             img_data = base64.b64decode(result.annotated_image_base64)
315 |                             img = Image.open(io.BytesIO(img_data))
316 |                             img.save(output_path)
317 | 
318 |                             # Update totals
319 |                             total_icons += result.metadata.num_icons
320 |                             total_text += result.metadata.num_text
321 | 
322 |                             # Log detailed results
323 |                             detail_file = combo_dir / f"{Path(image_path).stem}_details.txt"
324 |                             with open(detail_file, "w") as detail_f:
325 |                                 detail_f.write(f"Results for {Path(image_path).name}\n")
326 |                                 detail_f.write("-" * 40 + "\n")
327 |                                 detail_f.write(f"Number of icons: {result.metadata.num_icons}\n")
328 |                                 detail_f.write(
329 |                                     f"Number of text elements: {result.metadata.num_text}\n\n"
330 |                                 )
331 | 
332 |                                 detail_f.write("Icon Detections:\n")
333 |                                 icon_count = 1
334 |                                 text_count = (
335 |                                     result.metadata.num_icons + 1
336 |                                 )  # Text boxes start after icons
337 | 
338 |                                 # First list all icons
339 |                                 for elem in result.elements:
340 |                                     if isinstance(elem, IconElement):
341 |                                         detail_f.write(f"Box #{icon_count}: Icon\n")
342 |                                         detail_f.write(f"  - Confidence: {elem.confidence:.3f}\n")
343 |                                         detail_f.write(
344 |                                             f"  - Coordinates: {elem.bbox.coordinates}\n"
345 |                                         )
346 |                                         icon_count += 1
347 | 
348 |                                 if use_ocr:
349 |                                     detail_f.write("\nText Detections:\n")
350 |                                     for elem in result.elements:
351 |                                         if isinstance(elem, TextElement):
352 |                                             detail_f.write(f"Box #{text_count}: Text\n")
353 |                                             detail_f.write(f"  - Content: '{elem.content}'\n")
354 |                                             detail_f.write(
355 |                                                 f"  - Confidence: {elem.confidence:.3f}\n"
356 |                                             )
357 |                                             detail_f.write(
358 |                                                 f"  - Coordinates: {elem.bbox.coordinates}\n"
359 |                                             )
360 |                                             text_count += 1
361 | 
362 |                         # Update timing totals
363 |                         total_time += t.elapsed_time
364 |                         
365 |                     # Write summary for this combination
366 |                     avg_time = total_time / len(image_files)
367 |                     f.write(
368 |                         f"{box_thresh:^10.3f} | {iou_thresh:^10.3f} | {total_icons:^10d} | {total_text:^10d} | {avg_time:^10.3f}\n"
369 |                     )
370 | 
371 |                 except Exception as e:
372 |                     logger.error(
373 |                         f"Error in experiment box={box_thresh}, iou={iou_thresh}: {str(e)}"
374 |                     )
375 |                     f.write(
376 |                         f"{box_thresh:^10.3f} | {iou_thresh:^10.3f} | {'ERROR':^10s} | {'ERROR':^10s} | {'ERROR':^10s}\n"
377 |                     )
378 | 
379 |         # Write summary footer
380 |         f.write("-" * 80 + "\n")
381 |         f.write("\nExperiment completed successfully!\n")
382 | 
383 |     logger.info(f"\nExperiment results saved to {exp_dir}")
384 |     logger.info(f"Summary file: {summary_file}")
385 | 
386 | 
387 | def main():
388 |     """Main entry point."""
389 |     parser = argparse.ArgumentParser(description="Run OmniParser benchmark")
390 |     parser.add_argument("input_path", help="Path to input image or directory containing images")
391 |     parser.add_argument(
392 |         "--output-dir", default="examples/output", help="Output directory for annotated images"
393 |     )
394 |     parser.add_argument(
395 |         "--ocr",
396 |         choices=["none", "easyocr"],
397 |         default="none",
398 |         help="OCR engine to use (default: none)",
399 |     )
400 |     parser.add_argument(
401 |         "--mode",
402 |         choices=["single", "experiment"],
403 |         default="single",
404 |         help="Run mode: single run or threshold experiments (default: single)",
405 |     )
406 |     parser.add_argument(
407 |         "--box-threshold",
408 |         type=float,
409 |         default=0.01,
410 |         help="Confidence threshold for detection (default: 0.01)",
411 |     )
412 |     parser.add_argument(
413 |         "--iou-threshold",
414 |         type=float,
415 |         default=0.1,
416 |         help="IOU threshold for Non-Maximum Suppression (default: 0.1)",
417 |     )
418 |     args = parser.parse_args()
419 | 
420 |     logger.info(f"Starting OmniParser with arguments: {args}")
421 |     use_ocr = args.ocr != "none"
422 |     output_dir = Path(args.output_dir)
423 | 
424 |     try:
425 |         if args.mode == "experiment":
426 |             run_experiments(args.input_path, output_dir, use_ocr)
427 |         else:
428 |             run_detection_benchmark(
429 |                 args.input_path, output_dir, use_ocr, args.box_threshold, args.iou_threshold
430 |             )
431 |     except Exception as e:
432 |         logger.error(f"Process failed: {str(e)}", exc_info=True)
433 |         return 1
434 | 
435 |     return 0
436 | 
437 | 
438 | if __name__ == "__main__":
439 |     sys.exit(main())
440 | 
```

--------------------------------------------------------------------------------
/libs/python/som/som/detect.py:
--------------------------------------------------------------------------------

```python
  1 | from pathlib import Path
  2 | from typing import Union, List, Dict, Any, Tuple, Optional, cast
  3 | import logging
  4 | import torch
  5 | import torchvision.ops
  6 | import cv2
  7 | import numpy as np
  8 | import time
  9 | import torchvision.transforms as T
 10 | from PIL import Image
 11 | import io
 12 | import base64
 13 | import argparse
 14 | import signal
 15 | from contextlib import contextmanager
 16 | 
 17 | from ultralytics import YOLO
 18 | from huggingface_hub import hf_hub_download
 19 | import supervision as sv
 20 | from supervision.detection.core import Detections
 21 | 
 22 | from .detection import DetectionProcessor
 23 | from .ocr import OCRProcessor
 24 | from .visualization import BoxAnnotator
 25 | from .models import BoundingBox, UIElement, IconElement, TextElement, ParserMetadata, ParseResult
 26 | 
 27 | logger = logging.getLogger(__name__)
 28 | 
 29 | 
 30 | class TimeoutException(Exception):
 31 |     pass
 32 | 
 33 | 
 34 | @contextmanager
 35 | def timeout(seconds: int):
 36 |     def timeout_handler(signum, frame):
 37 |         raise TimeoutException("OCR process timed out")
 38 | 
 39 |     # Register the signal handler
 40 |     original_handler = signal.signal(signal.SIGALRM, timeout_handler)
 41 |     signal.alarm(seconds)
 42 | 
 43 |     try:
 44 |         yield
 45 |     finally:
 46 |         signal.alarm(0)
 47 |         signal.signal(signal.SIGALRM, original_handler)
 48 | 
 49 | 
 50 | def process_text_box(box, image):
 51 |     """Process a single text box with OCR."""
 52 |     try:
 53 |         import easyocr
 54 |         from typing import List, Tuple, Any, Sequence
 55 | 
 56 |         x1 = int(min(point[0] for point in box))
 57 |         y1 = int(min(point[1] for point in box))
 58 |         x2 = int(max(point[0] for point in box))
 59 |         y2 = int(max(point[1] for point in box))
 60 | 
 61 |         # Add padding
 62 |         pad = 2
 63 |         x1 = max(0, x1 - pad)
 64 |         y1 = max(0, y1 - pad)
 65 |         x2 = min(image.shape[1], x2 + pad)
 66 |         y2 = min(image.shape[0], y2 + pad)
 67 | 
 68 |         region = image[y1:y2, x1:x2]
 69 |         if region.size > 0:
 70 |             reader = easyocr.Reader(["en"])
 71 |             results = reader.readtext(region)
 72 |             if results and len(results) > 0:
 73 |                 # EasyOCR returns a list of tuples (bbox, text, confidence)
 74 |                 first_result = results[0]
 75 |                 if isinstance(first_result, (list, tuple)) and len(first_result) >= 3:
 76 |                     text = str(first_result[1])
 77 |                     confidence = float(first_result[2])
 78 |                     if confidence > 0.5:
 79 |                         return text, [x1, y1, x2, y2], confidence
 80 |     except Exception:
 81 |         pass
 82 |     return None
 83 | 
 84 | 
 85 | def check_ocr_box(image_path: Union[str, Path]) -> Tuple[List[str], List[List[float]]]:
 86 |     """Check OCR box using EasyOCR."""
 87 |     # Read image once
 88 |     if isinstance(image_path, str):
 89 |         image_path = Path(image_path)
 90 | 
 91 |     # Read image into memory
 92 |     image_cv = cv2.imread(str(image_path))
 93 |     if image_cv is None:
 94 |         logger.error(f"Failed to read image: {image_path}")
 95 |         return [], []
 96 | 
 97 |     # Get image dimensions
 98 |     img_height, img_width = image_cv.shape[:2]
 99 |     confidence_threshold = 0.5
100 | 
101 |     # Use EasyOCR
102 |     import ssl
103 |     import easyocr
104 | 
105 |     # Create unverified SSL context for development
106 |     ssl._create_default_https_context = ssl._create_unverified_context
107 |     try:
108 |         reader = easyocr.Reader(["en"])
109 |         with timeout(5):  # 5 second timeout for EasyOCR
110 |             results = reader.readtext(image_cv, paragraph=False, text_threshold=0.5)
111 |     except TimeoutException:
112 |         logger.warning("EasyOCR timed out, returning no results")
113 |         return [], []
114 |     except Exception as e:
115 |         logger.warning(f"EasyOCR failed: {str(e)}")
116 |         return [], []
117 |     finally:
118 |         # Restore default SSL context
119 |         ssl._create_default_https_context = ssl.create_default_context
120 | 
121 |     texts = []
122 |     boxes = []
123 | 
124 |     for box, text, conf in results:
125 |         # Convert box format to [x1, y1, x2, y2]
126 |         x1 = min(point[0] for point in box)
127 |         y1 = min(point[1] for point in box)
128 |         x2 = max(point[0] for point in box)
129 |         y2 = max(point[1] for point in box)
130 | 
131 |         if float(conf) > 0.5:  # Only keep higher confidence detections
132 |             texts.append(text)
133 |             boxes.append([x1, y1, x2, y2])
134 | 
135 |     return texts, boxes
136 | 
137 | 
138 | class OmniParser:
139 |     """Enhanced UI parser using computer vision and OCR for detecting interactive elements."""
140 | 
141 |     def __init__(
142 |         self,
143 |         model_path: Optional[Union[str, Path]] = None,
144 |         cache_dir: Optional[Union[str, Path]] = None,
145 |         force_device: Optional[str] = None,
146 |     ):
147 |         """Initialize the OmniParser.
148 | 
149 |         Args:
150 |             model_path: Optional path to the YOLO model
151 |             cache_dir: Optional directory to cache model files
152 |             force_device: Force specific device (cpu/cuda/mps)
153 |         """
154 |         self.detector = DetectionProcessor(
155 |             model_path=Path(model_path) if model_path else None,
156 |             cache_dir=Path(cache_dir) if cache_dir else None,
157 |             force_device=force_device,
158 |         )
159 |         self.ocr = OCRProcessor()
160 |         self.visualizer = BoxAnnotator()
161 | 
162 |     def process_image(
163 |         self,
164 |         image: Image.Image,
165 |         box_threshold: float = 0.3,
166 |         iou_threshold: float = 0.1,
167 |         use_ocr: bool = True,
168 |     ) -> Tuple[Image.Image, List[UIElement]]:
169 |         """Process an image to detect UI elements and optionally text.
170 | 
171 |         Args:
172 |             image: Input PIL Image
173 |             box_threshold: Confidence threshold for detection
174 |             iou_threshold: IOU threshold for NMS
175 |             use_ocr: Whether to enable OCR processing
176 | 
177 |         Returns:
178 |             Tuple of (annotated image, list of detections)
179 |         """
180 |         try:
181 |             logger.info("Starting UI element detection...")
182 | 
183 |             # Detect icons
184 |             icon_detections = self.detector.detect_icons(
185 |                 image=image, box_threshold=box_threshold, iou_threshold=iou_threshold
186 |             )
187 |             logger.info(f"Found {len(icon_detections)} interactive elements")
188 | 
189 |             # Convert icon detections to typed objects
190 |             elements: List[UIElement] = cast(
191 |                 List[UIElement],
192 |                 [
193 |                     IconElement(
194 |                         id=i + 1,
195 |                         bbox=BoundingBox(
196 |                             x1=det["bbox"][0],
197 |                             y1=det["bbox"][1],
198 |                             x2=det["bbox"][2],
199 |                             y2=det["bbox"][3],
200 |                         ),
201 |                         confidence=det["confidence"],
202 |                         scale=det.get("scale"),
203 |                     )
204 |                     for i, det in enumerate(icon_detections)
205 |                 ],
206 |             )
207 | 
208 |             # Run OCR if enabled
209 |             if use_ocr:
210 |                 logger.info("Running OCR detection...")
211 |                 text_detections = self.ocr.detect_text(image=image, confidence_threshold=0.5)
212 |                 if text_detections is None:
213 |                     text_detections = []
214 |                 logger.info(f"Found {len(text_detections)} text regions")
215 | 
216 |                 # Convert text detections to typed objects
217 |                 text_elements = cast(
218 |                     List[UIElement],
219 |                     [
220 |                         TextElement(
221 |                             id=len(elements) + i + 1,
222 |                             bbox=BoundingBox(
223 |                                 x1=det["bbox"][0],
224 |                                 y1=det["bbox"][1],
225 |                                 x2=det["bbox"][2],
226 |                                 y2=det["bbox"][3],
227 |                             ),
228 |                             content=det["content"],
229 |                             confidence=det["confidence"],
230 |                         )
231 |                         for i, det in enumerate(text_detections)
232 |                     ],
233 |                 )
234 |                 
235 |                 if elements and text_elements:
236 |                     # Filter out non-OCR elements that have OCR elements with center points colliding with them
237 |                     filtered_elements = []
238 |                     for elem in elements:  # elements at this point contains only non-OCR elements
239 |                         should_keep = True
240 |                         for text_elem in text_elements:
241 |                             # Calculate center point of the text element
242 |                             center_x = (text_elem.bbox.x1 + text_elem.bbox.x2) / 2
243 |                             center_y = (text_elem.bbox.y1 + text_elem.bbox.y2) / 2
244 |                             
245 |                             # Check if this center point is inside the non-OCR element
246 |                             if (center_x >= elem.bbox.x1 and center_x <= elem.bbox.x2 and 
247 |                                 center_y >= elem.bbox.y1 and center_y <= elem.bbox.y2):
248 |                                 should_keep = False
249 |                                 break
250 |                         
251 |                         if should_keep:
252 |                             filtered_elements.append(elem)
253 |                     elements = filtered_elements
254 |                     
255 |                     # Merge detections using NMS
256 |                     all_elements = elements + text_elements
257 |                     boxes = torch.tensor([elem.bbox.coordinates for elem in all_elements])
258 |                     scores = torch.tensor([elem.confidence for elem in all_elements])
259 |                     keep_indices = torchvision.ops.nms(boxes, scores, iou_threshold)
260 |                     elements = [all_elements[i] for i in keep_indices]
261 |                 else:
262 |                     # Just add text elements to the list if IOU doesn't need to be applied
263 |                     elements.extend(text_elements)
264 | 
265 |             # Calculate drawing parameters based on image size
266 |             box_overlay_ratio = max(image.size) / 3200
267 |             draw_config = {
268 |                 "font_size": int(12 * box_overlay_ratio),
269 |                 "box_thickness": max(int(2 * box_overlay_ratio), 1),
270 |                 "text_padding": max(int(3 * box_overlay_ratio), 1),
271 |             }
272 | 
273 |             # Convert elements back to dict format for visualization
274 |             detection_dicts = [
275 |                 {
276 |                     "type": elem.type,
277 |                     "bbox": elem.bbox.coordinates,
278 |                     "confidence": elem.confidence,
279 |                     "content": elem.content if isinstance(elem, TextElement) else None,
280 |                 }
281 |                 for elem in elements
282 |             ]
283 | 
284 |             # Create visualization
285 |             logger.info("Creating visualization...")
286 |             annotated_image = self.visualizer.draw_boxes(
287 |                 image=image.copy(), detections=detection_dicts, draw_config=draw_config
288 |             )
289 |             logger.info("Visualization complete")
290 | 
291 |             return annotated_image, elements
292 | 
293 |         except Exception as e:
294 |             logger.error(f"Error in process_image: {str(e)}")
295 |             import traceback
296 | 
297 |             logger.error(traceback.format_exc())
298 |             raise
299 | 
300 |     def parse(
301 |         self,
302 |         screenshot_data: Union[bytes, str],
303 |         box_threshold: float = 0.3,
304 |         iou_threshold: float = 0.1,
305 |         use_ocr: bool = True,
306 |     ) -> ParseResult:
307 |         """Parse a UI screenshot to detect interactive elements and text.
308 | 
309 |         Args:
310 |             screenshot_data: Raw bytes or base64 string of the screenshot
311 |             box_threshold: Confidence threshold for detection
312 |             iou_threshold: IOU threshold for NMS
313 |             use_ocr: Whether to enable OCR processing
314 | 
315 |         Returns:
316 |             ParseResult object containing elements, annotated image, and metadata
317 |         """
318 |         try:
319 |             start_time = time.time()
320 | 
321 |             # Convert input to PIL Image
322 |             if isinstance(screenshot_data, str):
323 |                 screenshot_data = base64.b64decode(screenshot_data)
324 |             image = Image.open(io.BytesIO(screenshot_data)).convert("RGB")
325 | 
326 |             # Process image
327 |             annotated_image, elements = self.process_image(
328 |                 image=image,
329 |                 box_threshold=box_threshold,
330 |                 iou_threshold=iou_threshold,
331 |                 use_ocr=use_ocr,
332 |             )
333 | 
334 |             # Convert annotated image to base64
335 |             buffered = io.BytesIO()
336 |             annotated_image.save(buffered, format="PNG")
337 |             annotated_image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
338 | 
339 |             # Generate screen info text
340 |             screen_info = []
341 |             parsed_content_list = []
342 | 
343 |             # Set element IDs and generate human-readable descriptions
344 |             for i, elem in enumerate(elements):
345 |                 # Set the ID (1-indexed)
346 |                 elem.id = i + 1
347 | 
348 |                 if isinstance(elem, IconElement):
349 |                     screen_info.append(
350 |                         f"Box #{i+1}: Icon (confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates})"
351 |                     )
352 |                     parsed_content_list.append(
353 |                         {
354 |                             "id": i + 1,
355 |                             "type": "icon",
356 |                             "bbox": elem.bbox.coordinates,
357 |                             "confidence": elem.confidence,
358 |                             "content": None,
359 |                         }
360 |                     )
361 |                 elif isinstance(elem, TextElement):
362 |                     screen_info.append(
363 |                         f"Box #{i+1}: Text '{elem.content}' (confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates})"
364 |                     )
365 |                     parsed_content_list.append(
366 |                         {
367 |                             "id": i + 1,
368 |                             "type": "text",
369 |                             "bbox": elem.bbox.coordinates,
370 |                             "confidence": elem.confidence,
371 |                             "content": elem.content,
372 |                         }
373 |                     )
374 | 
375 |             # Calculate metadata
376 |             latency = time.time() - start_time
377 |             width, height = image.size
378 | 
379 |             # Create ParseResult object with enhanced properties
380 |             result = ParseResult(
381 |                 elements=elements,
382 |                 annotated_image_base64=annotated_image_base64,
383 |                 screen_info=screen_info,
384 |                 parsed_content_list=parsed_content_list,
385 |                 metadata=ParserMetadata(
386 |                     image_size=(width, height),
387 |                     num_icons=len([e for e in elements if isinstance(e, IconElement)]),
388 |                     num_text=len([e for e in elements if isinstance(e, TextElement)]),
389 |                     device=self.detector.device,
390 |                     ocr_enabled=use_ocr,
391 |                     latency=latency,
392 |                 ),
393 |             )
394 | 
395 |             # Return the ParseResult object directly
396 |             return result
397 | 
398 |         except Exception as e:
399 |             logger.error(f"Error in parse: {str(e)}")
400 |             import traceback
401 | 
402 |             logger.error(traceback.format_exc())
403 |             raise
404 | 
405 | 
406 | def main():
407 |     """Command line interface for UI element detection."""
408 |     parser = argparse.ArgumentParser(description="Detect UI elements and text in images")
409 |     parser.add_argument("image_path", help="Path to the input image")
410 |     parser.add_argument("--model-path", help="Path to YOLO model")
411 |     parser.add_argument(
412 |         "--box-threshold", type=float, default=0.3, help="Box confidence threshold (default: 0.3)"
413 |     )
414 |     parser.add_argument(
415 |         "--iou-threshold", type=float, default=0.1, help="IOU threshold (default: 0.1)"
416 |     )
417 |     parser.add_argument(
418 |         "--ocr", action="store_true", default=True, help="Enable OCR processing (default: True)"
419 |     )
420 |     parser.add_argument("--output", help="Output path for annotated image")
421 |     args = parser.parse_args()
422 | 
423 |     # Setup logging
424 |     logging.basicConfig(level=logging.INFO)
425 | 
426 |     try:
427 |         # Initialize parser
428 |         parser = OmniParser(model_path=args.model_path)
429 | 
430 |         # Load and process image
431 |         logger.info(f"Loading image from: {args.image_path}")
432 |         image = Image.open(args.image_path).convert("RGB")
433 |         logger.info(f"Image loaded successfully, size: {image.size}")
434 | 
435 |         # Process image
436 |         annotated_image, elements = parser.process_image(
437 |             image=image,
438 |             box_threshold=args.box_threshold,
439 |             iou_threshold=args.iou_threshold,
440 |             use_ocr=args.ocr,
441 |         )
442 | 
443 |         # Save output image
444 |         output_path = args.output or str(
445 |             Path(args.image_path).parent
446 |             / f"{Path(args.image_path).stem}_analyzed{Path(args.image_path).suffix}"
447 |         )
448 |         logger.info(f"Saving annotated image to: {output_path}")
449 | 
450 |         Path(output_path).parent.mkdir(parents=True, exist_ok=True)
451 |         annotated_image.save(output_path)
452 |         logger.info(f"Image saved successfully to {output_path}")
453 | 
454 |         # Print detections
455 |         logger.info("\nDetections:")
456 |         for i, elem in enumerate(elements):
457 |             if isinstance(elem, IconElement):
458 |                 logger.info(
459 |                     f"Interactive element {i}: confidence={elem.confidence:.3f}, bbox={elem.bbox.coordinates}"
460 |                 )
461 |             elif isinstance(elem, TextElement):
462 |                 logger.info(f"Text {i}: '{elem.content}', bbox={elem.bbox.coordinates}")
463 | 
464 |     except Exception as e:
465 |         logger.error(f"Error processing image: {str(e)}")
466 |         import traceback
467 | 
468 |         logger.error(traceback.format_exc())
469 |         return 1
470 | 
471 |     return 0
472 | 
473 | 
474 | if __name__ == "__main__":
475 |     import sys
476 | 
477 |     sys.exit(main())
478 | 
```

--------------------------------------------------------------------------------
/libs/python/agent/agent/cli.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | CLI chat interface for agent - Computer Use Agent
  3 | 
  4 | Usage:
  5 |     python -m agent.cli <model_string>
  6 |     
  7 | Examples:
  8 |     python -m agent.cli openai/computer-use-preview
  9 |     python -m agent.cli anthropic/claude-3-5-sonnet-20241022
 10 |     python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
 11 | """
 12 | 
 13 | try:
 14 |     import asyncio
 15 |     import argparse
 16 |     import os
 17 |     import sys
 18 |     import json
 19 |     from typing import List, Dict, Any
 20 |     import dotenv
 21 |     import base64
 22 |     import time
 23 |     import platform
 24 |     from pathlib import Path
 25 |     try:
 26 |         from PIL import Image, ImageDraw
 27 |         PIL_AVAILABLE = True
 28 |     except Exception:
 29 |         PIL_AVAILABLE = False
 30 |     from yaspin import yaspin
 31 | except ImportError:
 32 |     if __name__ == "__main__":
 33 |         raise ImportError(
 34 |             "CLI dependencies not found. "
 35 |             "Please install with: pip install \"cua-agent[cli]\""
 36 |         )
 37 | 
 38 | # Load environment variables
 39 | dotenv.load_dotenv()
 40 | 
 41 | # Color codes for terminal output
 42 | class Colors:
 43 |     RESET = '\033[0m'
 44 |     BOLD = '\033[1m'
 45 |     DIM = '\033[2m'
 46 |     
 47 |     # Text colors
 48 |     RED = '\033[31m'
 49 |     GREEN = '\033[32m'
 50 |     YELLOW = '\033[33m'
 51 |     BLUE = '\033[34m'
 52 |     MAGENTA = '\033[35m'
 53 |     CYAN = '\033[36m'
 54 |     WHITE = '\033[37m'
 55 |     GRAY = '\033[90m'
 56 |     
 57 |     # Background colors
 58 |     BG_RED = '\033[41m'
 59 |     BG_GREEN = '\033[42m'
 60 |     BG_YELLOW = '\033[43m'
 61 |     BG_BLUE = '\033[44m'
 62 | 
 63 | def print_colored(text: str, color: str = "", bold: bool = False, dim: bool = False, end: str = "\n", right: str = ""):
 64 |     """Print colored text to terminal with optional right-aligned text."""
 65 |     prefix = ""
 66 |     if bold:
 67 |         prefix += Colors.BOLD
 68 |     if dim:
 69 |         prefix += Colors.DIM
 70 |     if color:
 71 |         prefix += color
 72 |     
 73 |     if right:
 74 |         # Get terminal width (default to 80 if unable to determine)
 75 |         try:
 76 |             import shutil
 77 |             terminal_width = shutil.get_terminal_size().columns
 78 |         except:
 79 |             terminal_width = 80
 80 | 
 81 |         # Add right margin
 82 |         terminal_width -= 1
 83 |         
 84 |         # Calculate padding needed
 85 |         # Account for ANSI escape codes not taking visual space
 86 |         visible_left_len = len(text)
 87 |         visible_right_len = len(right)
 88 |         padding = terminal_width - visible_left_len - visible_right_len
 89 |         
 90 |         if padding > 0:
 91 |             output = f"{prefix}{text}{' ' * padding}{right}{Colors.RESET}"
 92 |         else:
 93 |             # If not enough space, just put a single space between
 94 |             output = f"{prefix}{text} {right}{Colors.RESET}"
 95 |     else:
 96 |         output = f"{prefix}{text}{Colors.RESET}"
 97 |     
 98 |     print(output, end=end)
 99 | 
100 | 
101 | def print_action(action_type: str, details: Dict[str, Any], total_cost: float):
102 |     """Print computer action with nice formatting."""
103 |     # Format action details
104 |     args_str = ""
105 |     if action_type == "click" and "x" in details and "y" in details:
106 |         args_str = f"_{details.get('button', 'left')}({details['x']}, {details['y']})"
107 |     elif action_type == "type" and "text" in details:
108 |         text = details["text"]
109 |         if len(text) > 50:
110 |             text = text[:47] + "..."
111 |         args_str = f'("{text}")'
112 |     elif action_type == "key" and "text" in details:
113 |         args_str = f"('{details['text']}')"
114 |     elif action_type == "scroll" and "x" in details and "y" in details:
115 |         args_str = f"({details['x']}, {details['y']})"
116 |     
117 |     if total_cost > 0:
118 |         print_colored(f"🛠️  {action_type}{args_str}", dim=True, right=f"💸 ${total_cost:.2f}")
119 |     else:
120 |         print_colored(f"🛠️  {action_type}{args_str}", dim=True)
121 | 
122 | def print_welcome(model: str, agent_loop: str, container_name: str):
123 |     """Print welcome message."""
124 |     print_colored(f"Connected to {container_name} ({model}, {agent_loop})")
125 |     print_colored("Type 'exit' to quit.", dim=True)
126 | 
127 | async def ainput(prompt: str = ""):
128 |     return await asyncio.to_thread(input, prompt)
129 | 
130 | async def chat_loop(agent, model: str, container_name: str, initial_prompt: str = "", show_usage: bool = True):
131 |     """Main chat loop with the agent."""
132 |     print_welcome(model, agent.agent_config_info.agent_class.__name__, container_name)
133 |     
134 |     history = []
135 |     
136 |     if initial_prompt:
137 |         history.append({"role": "user", "content": initial_prompt})
138 |     
139 |     total_cost = 0
140 | 
141 |     while True:
142 |         if len(history) == 0 or history[-1].get("role") != "user":
143 |             # Get user input with prompt
144 |             print_colored("> ", end="")
145 |             user_input = await ainput()
146 |             
147 |             if user_input.lower() in ['exit', 'quit', 'q']:
148 |                 print_colored("\n👋 Goodbye!")
149 |                 break
150 |                 
151 |             if not user_input:
152 |                 continue
153 |                 
154 |             # Add user message to history
155 |             history.append({"role": "user", "content": user_input})
156 |         
157 |         # Stream responses from the agent with spinner
158 |         with yaspin(text="Thinking...", spinner="line", attrs=["dark"]) as spinner:
159 |             spinner.hide()
160 |             
161 |             async for result in agent.run(history):
162 |                 # Add agent responses to history
163 |                 history.extend(result.get("output", []))
164 | 
165 |                 if show_usage:
166 |                     total_cost += result.get("usage", {}).get("response_cost", 0)
167 |                 
168 |                 # Process and display the output
169 |                 for item in result.get("output", []):
170 |                     if item.get("type") == "message" and item.get("role") == "assistant":
171 |                         # Display agent text response
172 |                         content = item.get("content", [])
173 |                         for content_part in content:
174 |                             if content_part.get("text"):
175 |                                 text = content_part.get("text", "").strip()
176 |                                 if text:
177 |                                     spinner.hide()
178 |                                     print_colored(text)
179 |                     
180 |                     elif item.get("type") == "computer_call":
181 |                         # Display computer action
182 |                         action = item.get("action", {})
183 |                         action_type = action.get("type", "")
184 |                         if action_type:
185 |                             spinner.hide()
186 |                             print_action(action_type, action, total_cost)
187 |                             spinner.text = f"Performing {action_type}..."
188 |                             spinner.show()
189 |                     
190 |                     elif item.get("type") == "function_call":
191 |                         # Display function call
192 |                         function_name = item.get("name", "")
193 |                         spinner.hide()
194 |                         print_colored(f"🔧 Calling function: {function_name}", dim=True)
195 |                         spinner.text = f"Calling {function_name}..."
196 |                         spinner.show()
197 |                     
198 |                     elif item.get("type") == "function_call_output":
199 |                         # Display function output (dimmed)
200 |                         output = item.get("output", "")
201 |                         if output and len(output.strip()) > 0:
202 |                             spinner.hide()
203 |                             print_colored(f"📤 {output}", dim=True)
204 |             
205 |             spinner.hide()
206 |             if show_usage and total_cost > 0:
207 |                 print_colored(f"Total cost: ${total_cost:.2f}", dim=True)
208 |         
209 | 
210 | async def main():
211 |     """Main CLI function."""
212 |     parser = argparse.ArgumentParser(
213 |         description="CUA Agent CLI - Interactive computer use assistant",
214 |         formatter_class=argparse.RawDescriptionHelpFormatter,
215 |         epilog="""
216 | Examples:
217 |   python -m agent.cli openai/computer-use-preview
218 |   python -m agent.cli anthropic/claude-3-5-sonnet-20241022
219 |   python -m agent.cli omniparser+anthropic/claude-3-5-sonnet-20241022
220 |   python -m agent.cli huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B
221 |         """
222 |     )
223 |     
224 |     parser.add_argument(
225 |         "model",
226 |         help="Model string (e.g., 'openai/computer-use-preview', 'anthropic/claude-3-5-sonnet-20241022')"
227 |     )
228 |     
229 |     parser.add_argument(
230 |         "--provider",
231 |         choices=["cloud", "lume", "winsandbox", "docker"],
232 |         default="cloud",
233 |         help="Computer provider to use: cloud (default), lume, winsandbox, or docker"
234 |     )
235 |     
236 |     parser.add_argument(
237 |         "--images",
238 |         type=int,
239 |         default=3,
240 |         help="Number of recent images to keep in context (default: 3)"
241 |     )
242 |     
243 |     parser.add_argument(
244 |         "--trajectory",
245 |         action="store_true",
246 |         help="Save trajectory for debugging"
247 |     )
248 |     
249 |     parser.add_argument(
250 |         "--budget",
251 |         type=float,
252 |         help="Maximum budget for the session (in dollars)"
253 |     )
254 |     
255 |     parser.add_argument(
256 |         "--verbose",
257 |         action="store_true",
258 |         help="Enable verbose logging"
259 |     )
260 | 
261 |     parser.add_argument(
262 |         "-p", "--prompt",
263 |         type=str,
264 |         help="Initial prompt to send to the agent. Leave blank for interactive mode."
265 |     )
266 | 
267 |     parser.add_argument(
268 |         "--prompt-file",
269 |         type=Path,
270 |         help="Path to a UTF-8 text file whose contents will be used as the initial prompt. If provided, overrides --prompt."
271 |     )
272 | 
273 |     parser.add_argument(
274 |         "--predict-click",
275 |         dest="predict_click",
276 |         type=str,
277 |         help="Instruction for click prediction. If set, runs predict_click, draws crosshair on a fresh screenshot, saves and opens it."
278 |     )
279 | 
280 |     parser.add_argument(
281 |         "-c", "--cache",
282 |         action="store_true",
283 |         help="Tell the API to enable caching"
284 |     )
285 | 
286 |     parser.add_argument(
287 |         "-u", "--usage",
288 |         action="store_true",
289 |         help="Show total cost of the agent runs"
290 |     )
291 | 
292 |     parser.add_argument(
293 |         "-r", "--max-retries",
294 |         type=int,
295 |         default=3,
296 |         help="Maximum number of retries for the LLM API calls"
297 |     )
298 |     
299 |     args = parser.parse_args()
300 |     
301 |     # Check for required environment variables
302 |     container_name = os.getenv("CUA_CONTAINER_NAME")
303 |     cua_api_key = os.getenv("CUA_API_KEY")
304 |     
305 |     # Prompt for missing environment variables (container name always required)
306 |     if not container_name:
307 |         if args.provider == "cloud":
308 |             print_colored("CUA_CONTAINER_NAME not set.", dim=True)
309 |             print_colored("You can get a CUA container at https://www.trycua.com/", dim=True)
310 |             container_name = input("Enter your CUA container name: ").strip()
311 |             if not container_name:
312 |                 print_colored("❌ Container name is required.")
313 |                 sys.exit(1)
314 |         else:
315 |             container_name = "cli-sandbox"
316 | 
317 |     # Only require API key for cloud provider
318 |     if args.provider == "cloud" and not cua_api_key:
319 |         print_colored("CUA_API_KEY not set.", dim=True)
320 |         cua_api_key = input("Enter your CUA API key: ").strip()
321 |         if not cua_api_key:
322 |             print_colored("❌ API key is required for cloud provider.")
323 |             sys.exit(1)
324 |     
325 |     # Check for provider-specific API keys based on model
326 |     provider_api_keys = {
327 |         "openai/": "OPENAI_API_KEY",
328 |         "anthropic/": "ANTHROPIC_API_KEY",
329 |     }
330 |     
331 |     # Find matching provider and check for API key
332 |     for prefix, env_var in provider_api_keys.items():
333 |         if prefix in args.model:
334 |             if not os.getenv(env_var):
335 |                 print_colored(f"{env_var} not set.", dim=True)
336 |                 api_key = input(f"Enter your {env_var.replace('_', ' ').title()}: ").strip()
337 |                 if not api_key:
338 |                     print_colored(f"❌ {env_var.replace('_', ' ').title()} is required.")
339 |                     sys.exit(1)
340 |                 # Set the environment variable for the session
341 |                 os.environ[env_var] = api_key
342 |             break
343 |     
344 |     # Import here to avoid import errors if dependencies are missing
345 |     try:
346 |         from agent import ComputerAgent
347 |         from computer import Computer
348 |     except ImportError as e:
349 |         print_colored(f"❌ Import error: {e}", Colors.RED, bold=True)
350 |         print_colored("Make sure agent and computer libraries are installed.", Colors.YELLOW)
351 |         sys.exit(1)
352 |     
353 |     # Resolve provider -> os_type, provider_type, api key requirement
354 |     provider_map = {
355 |         "cloud": ("linux", "cloud", True),
356 |         "lume": ("macos", "lume", False),
357 |         "winsandbox": ("windows", "winsandbox", False),
358 |         "docker": ("linux", "docker", False),
359 |     }
360 |     os_type, provider_type, needs_api_key = provider_map[args.provider]
361 | 
362 |     computer_kwargs = {
363 |         "os_type": os_type,
364 |         "provider_type": provider_type,
365 |         "name": container_name,
366 |     }
367 |     if needs_api_key:
368 |         computer_kwargs["api_key"] = cua_api_key # type: ignore
369 | 
370 |     # Create computer instance
371 |     async with Computer(**computer_kwargs) as computer: # type: ignore
372 |         
373 |         # Create agent
374 |         agent_kwargs = {
375 |             "model": args.model,
376 |             "tools": [computer],
377 |             "trust_remote_code": True, # needed for some local models (e.g., InternVL, OpenCUA)
378 |             "verbosity": 20 if args.verbose else 30,  # DEBUG vs WARNING
379 |             "max_retries": args.max_retries
380 |         }
381 | 
382 |         if args.images > 0:
383 |             agent_kwargs["only_n_most_recent_images"] = args.images
384 |         
385 |         if args.trajectory:
386 |             agent_kwargs["trajectory_dir"] = "trajectories"
387 |         
388 |         if args.budget:
389 |             agent_kwargs["max_trajectory_budget"] = {
390 |                 "max_budget": args.budget,
391 |                 "raise_error": True,
392 |                 "reset_after_each_run": False
393 |             }
394 | 
395 |         if args.cache:
396 |             agent_kwargs["use_prompt_caching"] = True
397 |         
398 |         agent = ComputerAgent(**agent_kwargs)
399 |         
400 |         # If predict-click mode is requested, run once and exit
401 |         if args.predict_click:
402 |             if not PIL_AVAILABLE:
403 |                 print_colored("❌ Pillow (PIL) is required for --predict-click visualization. Install with: pip install pillow", Colors.RED, bold=True)
404 |                 sys.exit(1)
405 | 
406 |             instruction = args.predict_click
407 |             print_colored(f"Predicting click for: '{instruction}'", Colors.CYAN)
408 | 
409 |             # Take a fresh screenshot FIRST
410 |             try:
411 |                 img_bytes = await computer.interface.screenshot()
412 |             except Exception as e:
413 |                 print_colored(f"❌ Failed to take screenshot: {e}", Colors.RED, bold=True)
414 |                 sys.exit(1)
415 | 
416 |             # Encode screenshot to base64 for predict_click
417 |             try:
418 |                 image_b64 = base64.b64encode(img_bytes).decode("utf-8")
419 |             except Exception as e:
420 |                 print_colored(f"❌ Failed to encode screenshot: {e}", Colors.RED, bold=True)
421 |                 sys.exit(1)
422 | 
423 |             try:
424 |                 coords = await agent.predict_click(instruction, image_b64=image_b64)
425 |             except Exception as e:
426 |                 print_colored(f"❌ predict_click failed: {e}", Colors.RED, bold=True)
427 |                 sys.exit(1)
428 | 
429 |             if not coords:
430 |                 print_colored("⚠️  No coordinates returned.", Colors.YELLOW)
431 |                 sys.exit(2)
432 | 
433 |             x, y = coords
434 |             print_colored(f"✅ Predicted coordinates: ({x}, {y})", Colors.GREEN)
435 | 
436 |             try:
437 |                 from io import BytesIO
438 |                 with Image.open(BytesIO(img_bytes)) as img:
439 |                     img = img.convert("RGB")
440 |                     draw = ImageDraw.Draw(img)
441 |                     # Draw crosshair
442 |                     size = 12
443 |                     color = (255, 0, 0)
444 |                     draw.line([(x - size, y), (x + size, y)], fill=color, width=3)
445 |                     draw.line([(x, y - size), (x, y + size)], fill=color, width=3)
446 |                     # Optional small circle
447 |                     r = 6
448 |                     draw.ellipse([(x - r, y - r), (x + r, y + r)], outline=color, width=2)
449 | 
450 |                     out_path = Path.cwd() / f"predict_click_{int(time.time())}.png"
451 |                     img.save(out_path)
452 |                     print_colored(f"🖼️  Saved to {out_path}")
453 | 
454 |                     # Open the image with default viewer
455 |                     try:
456 |                         system = platform.system().lower()
457 |                         if system == "windows":
458 |                             os.startfile(str(out_path))  # type: ignore[attr-defined]
459 |                         elif system == "darwin":
460 |                             os.system(f"open \"{out_path}\"")
461 |                         else:
462 |                             os.system(f"xdg-open \"{out_path}\"")
463 |                     except Exception:
464 |                         pass
465 |             except Exception as e:
466 |                 print_colored(f"❌ Failed to render/save screenshot: {e}", Colors.RED, bold=True)
467 |                 sys.exit(1)
468 | 
469 |             # Done
470 |             sys.exit(0)
471 | 
472 |         # Resolve initial prompt from --prompt-file or --prompt
473 |         initial_prompt = args.prompt or ""
474 |         if args.prompt_file:
475 |             try:
476 |                 initial_prompt = args.prompt_file.read_text(encoding="utf-8")
477 |             except Exception as e:
478 |                 print_colored(f"❌ Failed to read --prompt-file: {e}", Colors.RED, bold=True)
479 |                 sys.exit(1)
480 | 
481 |         # Start chat loop (default interactive mode)
482 |         await chat_loop(agent, args.model, container_name, initial_prompt, args.usage)
483 | 
484 | 
485 | 
486 | if __name__ == "__main__":
487 |     try:
488 |         asyncio.run(main())
489 |     except (KeyboardInterrupt, EOFError) as _:
490 |         print_colored("\n\n👋 Goodbye!")
```

--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/moondream3.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | Moondream3+ composed-grounded agent loop implementation.
  3 | Grounding is handled by a local Moondream3 preview model via Transformers.
  4 | Thinking is delegated to the trailing LLM in the composed model string: "moondream3+<thinking_model>".
  5 | 
  6 | Differences from composed_grounded:
  7 | - Provides a singleton Moondream3 client outside the class.
  8 | - predict_click uses model.point(image, instruction, settings={"max_objects": 1}) and returns pixel coordinates.
  9 | - If the last image was a screenshot (or we take one), run model.detect(image, "all form ui") to get bboxes, then
 10 |   run model.caption on each cropped bbox to label it. Overlay labels on the screenshot and emit via _on_screenshot.
 11 | - Add a user message listing all detected form UI names so the thinker can reference them.
 12 | - If the thinking model doesn't support vision, filter out image content before calling litellm.
 13 | """
 14 | 
 15 | from __future__ import annotations
 16 | 
 17 | import uuid
 18 | import base64
 19 | import io
 20 | from typing import Dict, List, Any, Optional, Tuple, Any
 21 | 
 22 | from PIL import Image, ImageDraw, ImageFont
 23 | import torch
 24 | from transformers import AutoModelForCausalLM
 25 | import litellm
 26 | 
 27 | from ..decorators import register_agent
 28 | from ..types import AgentCapability
 29 | from ..loops.base import AsyncAgentConfig
 30 | from ..responses import (
 31 |     convert_computer_calls_xy2desc,
 32 |     convert_responses_items_to_completion_messages,
 33 |     convert_completion_messages_to_responses_items,
 34 |     convert_computer_calls_desc2xy,
 35 |     get_all_element_descriptions,
 36 | )
 37 | 
 38 | _MOONDREAM_SINGLETON = None
 39 | 
 40 | def get_moondream_model() -> Any:
 41 |     """Get a singleton instance of the Moondream3 preview model."""
 42 |     global _MOONDREAM_SINGLETON
 43 |     if _MOONDREAM_SINGLETON is None:
 44 |         _MOONDREAM_SINGLETON = AutoModelForCausalLM.from_pretrained(
 45 |             "moondream/moondream3-preview",
 46 |             trust_remote_code=True,
 47 |             torch_dtype=torch.bfloat16,
 48 |             device_map="cuda",
 49 |         )
 50 |     return _MOONDREAM_SINGLETON
 51 | 
 52 | 
 53 | def _decode_image_b64(image_b64: str) -> Image.Image:
 54 |     data = base64.b64decode(image_b64)
 55 |     return Image.open(io.BytesIO(data)).convert("RGB")
 56 | 
 57 | 
 58 | def _image_to_b64(img: Image.Image) -> str:
 59 |     buf = io.BytesIO()
 60 |     img.save(buf, format="PNG")
 61 |     return base64.b64encode(buf.getvalue()).decode("utf-8")
 62 | 
 63 | 
 64 | def _supports_vision(model: str) -> bool:
 65 |     """Heuristic vision support detection for thinking model."""
 66 |     m = model.lower()
 67 |     vision_markers = [
 68 |         "gpt-4o",
 69 |         "gpt-4.1",
 70 |         "o1",
 71 |         "o3",
 72 |         "claude-3",
 73 |         "claude-3.5",
 74 |         "sonnet",
 75 |         "haiku",
 76 |         "opus",
 77 |         "gemini-1.5",
 78 |         "llava",
 79 |     ]
 80 |     return any(v in m for v in vision_markers)
 81 | 
 82 | 
 83 | def _filter_images_from_completion_messages(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
 84 |     filtered: List[Dict[str, Any]] = []
 85 |     for msg in messages:
 86 |         msg_copy = {**msg}
 87 |         content = msg_copy.get("content")
 88 |         if isinstance(content, list):
 89 |             msg_copy["content"] = [c for c in content if c.get("type") != "image_url"]
 90 |         filtered.append(msg_copy)
 91 |     return filtered
 92 | 
 93 | def _annotate_detect_and_label_ui(base_img: Image.Image, model_md) -> Tuple[str, List[str]]:
 94 |     """Detect UI elements with Moondream, caption each, draw labels with backgrounds.
 95 | 
 96 |     Args:
 97 |         base_img: PIL image of the screenshot (RGB or RGBA). Will be copied/converted internally.
 98 |         model_md: Moondream model instance with .detect() and .query() methods.
 99 | 
100 |     Returns:
101 |         A tuple of (annotated_image_base64_png, detected_names)
102 |     """
103 |     # Ensure RGBA for semi-transparent fills
104 |     if base_img.mode != "RGBA":
105 |         base_img = base_img.convert("RGBA")
106 |     W, H = base_img.width, base_img.height
107 | 
108 |     # Detect objects
109 |     try:
110 |         detect_result = model_md.detect(base_img, "all ui elements")
111 |         objects = detect_result.get("objects", []) if isinstance(detect_result, dict) else []
112 |     except Exception:
113 |         objects = []
114 | 
115 |     draw = ImageDraw.Draw(base_img)
116 |     try:
117 |         font = ImageFont.load_default()
118 |     except Exception:
119 |         font = None
120 | 
121 |     detected_names: List[str] = []
122 | 
123 |     for i, obj in enumerate(objects):
124 |         try:
125 |             # Clamp normalized coords and crop
126 |             x_min = max(0.0, min(1.0, float(obj.get("x_min", 0.0))))
127 |             y_min = max(0.0, min(1.0, float(obj.get("y_min", 0.0))))
128 |             x_max = max(0.0, min(1.0, float(obj.get("x_max", 0.0))))
129 |             y_max = max(0.0, min(1.0, float(obj.get("y_max", 0.0))))
130 |             left, top, right, bottom = int(x_min * W), int(y_min * H), int(x_max * W), int(y_max * H)
131 |             left, top = max(0, left), max(0, top)
132 |             right, bottom = min(W - 1, right), min(H - 1, bottom)
133 |             crop = base_img.crop((left, top, right, bottom))
134 | 
135 |             # Prompted short caption
136 |             try:
137 |                 result = model_md.query(crop, "Caption this UI element in few words.")
138 |                 caption_text = (result or {}).get("answer", "")
139 |             except Exception:
140 |                 caption_text = ""
141 | 
142 |             name = (caption_text or "").strip() or f"element_{i+1}"
143 |             detected_names.append(name)
144 | 
145 |             # Draw bbox
146 |             draw.rectangle([left, top, right, bottom], outline=(255, 215, 0, 255), width=2)
147 | 
148 |             # Label background with padding and rounded corners
149 |             label = f"{i+1}. {name}"
150 |             padding = 3
151 |             if font:
152 |                 text_bbox = draw.textbbox((0, 0), label, font=font)
153 |             else:
154 |                 text_bbox = draw.textbbox((0, 0), label)
155 |             text_w = text_bbox[2] - text_bbox[0]
156 |             text_h = text_bbox[3] - text_bbox[1]
157 | 
158 |             tx = left + 3
159 |             ty = top - (text_h + 2 * padding + 4)
160 |             if ty < 0:
161 |                 ty = top + 3
162 | 
163 |             bg_left = tx - padding
164 |             bg_top = ty - padding
165 |             bg_right = tx + text_w + padding
166 |             bg_bottom = ty + text_h + padding
167 |             try:
168 |                 draw.rounded_rectangle(
169 |                     [bg_left, bg_top, bg_right, bg_bottom],
170 |                     radius=4,
171 |                     fill=(0, 0, 0, 160),
172 |                     outline=(255, 215, 0, 200),
173 |                     width=1,
174 |                 )
175 |             except Exception:
176 |                 draw.rectangle(
177 |                     [bg_left, bg_top, bg_right, bg_bottom],
178 |                     fill=(0, 0, 0, 160),
179 |                     outline=(255, 215, 0, 200),
180 |                     width=1,
181 |                 )
182 | 
183 |             text_fill = (255, 255, 255, 255)
184 |             if font:
185 |                 draw.text((tx, ty), label, fill=text_fill, font=font)
186 |             else:
187 |                 draw.text((tx, ty), label, fill=text_fill)
188 |         except Exception:
189 |             continue
190 | 
191 |     # Encode PNG base64
192 |     annotated = base_img
193 |     if annotated.mode not in ("RGBA", "RGB"):
194 |         annotated = annotated.convert("RGBA")
195 |     annotated_b64 = _image_to_b64(annotated)
196 |     return annotated_b64, detected_names
197 | 
198 | GROUNDED_COMPUTER_TOOL_SCHEMA = {
199 |     "type": "function",
200 |     "function": {
201 |         "name": "computer",
202 |         "description": (
203 |             "Control a computer by taking screenshots and interacting with UI elements. "
204 |             "The screenshot action will include a list of detected form UI element names when available. "
205 |             "Use element descriptions to locate and interact with UI elements on the screen."
206 |         ),
207 |         "parameters": {
208 |             "type": "object",
209 |             "properties": {
210 |                 "action": {
211 |                     "type": "string",
212 |                     "enum": [
213 |                         "screenshot",
214 |                         "click",
215 |                         "double_click",
216 |                         "drag",
217 |                         "type",
218 |                         "keypress",
219 |                         "scroll",
220 |                         "move",
221 |                         "wait",
222 |                         "get_current_url",
223 |                         "get_dimensions",
224 |                         "get_environment",
225 |                     ],
226 |                     "description": "The action to perform (required for all actions)",
227 |                 },
228 |                 "element_description": {
229 |                     "type": "string",
230 |                     "description": "Description of the element to interact with (required for click/double_click/move/scroll)",
231 |                 },
232 |                 "start_element_description": {
233 |                     "type": "string",
234 |                     "description": "Description of the element to start dragging from (required for drag)",
235 |                 },
236 |                 "end_element_description": {
237 |                     "type": "string",
238 |                     "description": "Description of the element to drag to (required for drag)",
239 |                 },
240 |                 "text": {
241 |                     "type": "string",
242 |                     "description": "The text to type (required for type)",
243 |                 },
244 |                 "keys": {
245 |                     "type": "array",
246 |                     "items": {"type": "string"},
247 |                     "description": "Key(s) to press (required for keypress)",
248 |                 },
249 |                 "button": {
250 |                     "type": "string",
251 |                     "enum": ["left", "right", "wheel", "back", "forward"],
252 |                     "description": "The mouse button to use for click/double_click",
253 |                 },
254 |                 "scroll_x": {
255 |                     "type": "integer",
256 |                     "description": "Horizontal scroll amount (required for scroll)",
257 |                 },
258 |                 "scroll_y": {
259 |                     "type": "integer",
260 |                     "description": "Vertical scroll amount (required for scroll)",
261 |                 },
262 |             },
263 |             "required": ["action"],
264 |         },
265 |     },
266 | }
267 | 
268 | @register_agent(r"moondream3\+.*", priority=2)
269 | class Moondream3PlusConfig(AsyncAgentConfig):
270 |     def __init__(self):
271 |         self.desc2xy: Dict[str, Tuple[float, float]] = {}
272 | 
273 |     async def predict_step(
274 |         self,
275 |         messages: List[Dict[str, Any]],
276 |         model: str,
277 |         tools: Optional[List[Dict[str, Any]]] = None,
278 |         max_retries: Optional[int] = None,
279 |         stream: bool = False,
280 |         computer_handler=None,
281 |         use_prompt_caching: Optional[bool] = False,
282 |         _on_api_start=None,
283 |         _on_api_end=None,
284 |         _on_usage=None,
285 |         _on_screenshot=None,
286 |         **kwargs,
287 |     ) -> Dict[str, Any]:
288 |         # Parse composed model: moondream3+<thinking_model>
289 |         if "+" not in model:
290 |             raise ValueError(f"Composed model must be 'moondream3+<thinking_model>', got: {model}")
291 |         _, thinking_model = model.split("+", 1)
292 | 
293 |         pre_output_items: List[Dict[str, Any]] = []
294 | 
295 |         # Acquire last screenshot; if missing, take one
296 |         last_image_b64: Optional[str] = None
297 |         for message in reversed(messages):
298 |             if (
299 |                 isinstance(message, dict)
300 |                 and message.get("type") == "computer_call_output"
301 |                 and isinstance(message.get("output"), dict)
302 |                 and message["output"].get("type") == "input_image"
303 |             ):
304 |                 image_url = message["output"].get("image_url", "")
305 |                 if image_url.startswith("data:image/png;base64,"):
306 |                     last_image_b64 = image_url.split(",", 1)[1]
307 |                     break
308 | 
309 |         if last_image_b64 is None and computer_handler is not None:
310 |             # Take a screenshot
311 |             screenshot_b64 = await computer_handler.screenshot()  # type: ignore
312 |             if screenshot_b64:
313 |                 call_id = uuid.uuid4().hex
314 |                 pre_output_items += [
315 |                     {
316 |                         "type": "message",
317 |                         "role": "assistant",
318 |                         "content": [
319 |                             {"type": "output_text", "text": "Taking a screenshot to analyze the current screen."}
320 |                         ],
321 |                     },
322 |                     {"type": "computer_call", "call_id": call_id, "status": "completed", "action": {"type": "screenshot"}},
323 |                     {
324 |                         "type": "computer_call_output",
325 |                         "call_id": call_id,
326 |                         "output": {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_b64}"},
327 |                     },
328 |                 ]
329 |                 last_image_b64 = screenshot_b64
330 |                 if _on_screenshot:
331 |                     await _on_screenshot(screenshot_b64)
332 | 
333 |         # If we have a last screenshot, run Moondream detection and labeling
334 |         detected_names: List[str] = []
335 |         if last_image_b64 is not None:
336 |             base_img = _decode_image_b64(last_image_b64)
337 |             model_md = get_moondream_model()
338 |             annotated_b64, detected_names = _annotate_detect_and_label_ui(base_img, model_md)
339 |             if _on_screenshot:
340 |                 await _on_screenshot(annotated_b64, "annotated_form_ui")
341 | 
342 |             # Also push a user message listing all detected names
343 |             if detected_names:
344 |                 names_text = "\n".join(f"- {n}" for n in detected_names)
345 |                 pre_output_items.append(
346 |                     {
347 |                         "type": "message",
348 |                         "role": "user",
349 |                         "content": [
350 |                             {"type": "input_text", "text": "Detected form UI elements on screen:"},
351 |                             {"type": "input_text", "text": names_text},
352 |                             {"type": "input_text", "text": "Please continue with the next action needed to perform your task."}
353 |                         ],
354 |                     }
355 |                 )
356 | 
357 |         tool_schemas = []
358 |         for schema in (tools or []):
359 |             if schema.get("type") == "computer":
360 |                 tool_schemas.append(GROUNDED_COMPUTER_TOOL_SCHEMA)
361 |             else:
362 |                 tool_schemas.append(schema)
363 | 
364 |         # Step 1: Convert computer calls from xy to descriptions
365 |         input_messages = messages + pre_output_items
366 |         messages_with_descriptions = convert_computer_calls_xy2desc(input_messages, self.desc2xy)
367 | 
368 |         # Step 2: Convert responses items to completion messages
369 |         completion_messages = convert_responses_items_to_completion_messages(
370 |             messages_with_descriptions,
371 |             allow_images_in_tool_results=False,
372 |         )
373 | 
374 |         # Optionally filter images if model lacks vision
375 |         if not _supports_vision(thinking_model):
376 |             completion_messages = _filter_images_from_completion_messages(completion_messages)
377 | 
378 |         # Step 3: Call thinking model with litellm.acompletion
379 |         api_kwargs = {
380 |             "model": thinking_model,
381 |             "messages": completion_messages,
382 |             "tools": tool_schemas,
383 |             "max_retries": max_retries,
384 |             "stream": stream,
385 |             **kwargs,
386 |         }
387 |         if use_prompt_caching:
388 |             api_kwargs["use_prompt_caching"] = use_prompt_caching
389 | 
390 |         if _on_api_start:
391 |             await _on_api_start(api_kwargs)
392 | 
393 |         response = await litellm.acompletion(**api_kwargs)
394 | 
395 |         if _on_api_end:
396 |             await _on_api_end(api_kwargs, response)
397 | 
398 |         usage = {
399 |             **response.usage.model_dump(),  # type: ignore
400 |             "response_cost": response._hidden_params.get("response_cost", 0.0),
401 |         }
402 |         if _on_usage:
403 |             await _on_usage(usage)
404 | 
405 |         # Step 4: Convert completion messages back to responses items format
406 |         response_dict = response.model_dump()  # type: ignore
407 |         choice_messages = [choice["message"] for choice in response_dict["choices"]]
408 |         thinking_output_items: List[Dict[str, Any]] = []
409 |         for choice_message in choice_messages:
410 |             thinking_output_items.extend(
411 |                 convert_completion_messages_to_responses_items([choice_message])
412 |             )
413 | 
414 |         # Step 5: Use Moondream to get coordinates for each description
415 |         element_descriptions = get_all_element_descriptions(thinking_output_items)
416 |         if element_descriptions and last_image_b64:
417 |             for desc in element_descriptions:
418 |                 for _ in range(3):  # try 3 times
419 |                     coords = await self.predict_click(
420 |                         model=model,
421 |                         image_b64=last_image_b64,
422 |                         instruction=desc,
423 |                     )
424 |                     if coords:
425 |                         self.desc2xy[desc] = coords
426 |                         break
427 | 
428 |         # Step 6: Convert computer calls from descriptions back to xy coordinates
429 |         final_output_items = convert_computer_calls_desc2xy(thinking_output_items, self.desc2xy)
430 | 
431 |         # Step 7: Return output and usage
432 |         return {"output": pre_output_items + final_output_items, "usage": usage}
433 | 
434 |     async def predict_click(
435 |         self,
436 |         model: str,
437 |         image_b64: str,
438 |         instruction: str,
439 |         **kwargs,
440 |     ) -> Optional[Tuple[float, float]]:
441 |         """Predict click coordinates using Moondream3's point API.
442 | 
443 |         Returns pixel coordinates (x, y) as floats.
444 |         """
445 |         img = _decode_image_b64(image_b64)
446 |         W, H = img.width, img.height
447 |         model_md = get_moondream_model()
448 |         try:
449 |             result = model_md.point(img, instruction, settings={"max_objects": 1})
450 |         except Exception:
451 |             return None
452 | 
453 |         try:
454 |             pt = (result or {}).get("points", [])[0]
455 |             x_norm = float(pt.get("x", 0.0))
456 |             y_norm = float(pt.get("y", 0.0))
457 |             x_px = max(0.0, min(float(W - 1), x_norm * W))
458 |             y_px = max(0.0, min(float(H - 1), y_norm * H))
459 |             return (x_px, y_px)
460 |         except Exception:
461 |             return None
462 | 
463 |     def get_capabilities(self) -> List[AgentCapability]:
464 |         return ["click", "step"]
465 | 
```

--------------------------------------------------------------------------------
/libs/typescript/computer/src/interface/macos.ts:
--------------------------------------------------------------------------------

```typescript
  1 | /**
  2 |  * macOS computer interface implementation.
  3 |  */
  4 | 
  5 | import type { ScreenSize } from '../types';
  6 | import { BaseComputerInterface } from './base';
  7 | import type { AccessibilityNode, CursorPosition, MouseButton } from './base';
  8 | 
  9 | export class MacOSComputerInterface extends BaseComputerInterface {
 10 |   // Mouse Actions
 11 |   /**
 12 |    * Press and hold a mouse button at the specified coordinates.
 13 |    * @param {number} [x] - X coordinate for the mouse action
 14 |    * @param {number} [y] - Y coordinate for the mouse action
 15 |    * @param {MouseButton} [button='left'] - Mouse button to press down
 16 |    * @returns {Promise<void>}
 17 |    */
 18 |   async mouseDown(
 19 |     x?: number,
 20 |     y?: number,
 21 |     button: MouseButton = 'left'
 22 |   ): Promise<void> {
 23 |     await this.sendCommand('mouse_down', { x, y, button });
 24 |   }
 25 | 
 26 |   /**
 27 |    * Release a mouse button at the specified coordinates.
 28 |    * @param {number} [x] - X coordinate for the mouse action
 29 |    * @param {number} [y] - Y coordinate for the mouse action
 30 |    * @param {MouseButton} [button='left'] - Mouse button to release
 31 |    * @returns {Promise<void>}
 32 |    */
 33 |   async mouseUp(
 34 |     x?: number,
 35 |     y?: number,
 36 |     button: MouseButton = 'left'
 37 |   ): Promise<void> {
 38 |     await this.sendCommand('mouse_up', { x, y, button });
 39 |   }
 40 | 
 41 |   /**
 42 |    * Perform a left mouse click at the specified coordinates.
 43 |    * @param {number} [x] - X coordinate for the click
 44 |    * @param {number} [y] - Y coordinate for the click
 45 |    * @returns {Promise<void>}
 46 |    */
 47 |   async leftClick(x?: number, y?: number): Promise<void> {
 48 |     await this.sendCommand('left_click', { x, y });
 49 |   }
 50 | 
 51 |   /**
 52 |    * Perform a right mouse click at the specified coordinates.
 53 |    * @param {number} [x] - X coordinate for the click
 54 |    * @param {number} [y] - Y coordinate for the click
 55 |    * @returns {Promise<void>}
 56 |    */
 57 |   async rightClick(x?: number, y?: number): Promise<void> {
 58 |     await this.sendCommand('right_click', { x, y });
 59 |   }
 60 | 
 61 |   /**
 62 |    * Perform a double click at the specified coordinates.
 63 |    * @param {number} [x] - X coordinate for the double click
 64 |    * @param {number} [y] - Y coordinate for the double click
 65 |    * @returns {Promise<void>}
 66 |    */
 67 |   async doubleClick(x?: number, y?: number): Promise<void> {
 68 |     await this.sendCommand('double_click', { x, y });
 69 |   }
 70 | 
 71 |   /**
 72 |    * Move the cursor to the specified coordinates.
 73 |    * @param {number} x - X coordinate to move to
 74 |    * @param {number} y - Y coordinate to move to
 75 |    * @returns {Promise<void>}
 76 |    */
 77 |   async moveCursor(x: number, y: number): Promise<void> {
 78 |     await this.sendCommand('move_cursor', { x, y });
 79 |   }
 80 | 
 81 |   /**
 82 |    * Drag from current position to the specified coordinates.
 83 |    * @param {number} x - X coordinate to drag to
 84 |    * @param {number} y - Y coordinate to drag to
 85 |    * @param {MouseButton} [button='left'] - Mouse button to use for dragging
 86 |    * @param {number} [duration=0.5] - Duration of the drag operation in seconds
 87 |    * @returns {Promise<void>}
 88 |    */
 89 |   async dragTo(
 90 |     x: number,
 91 |     y: number,
 92 |     button: MouseButton = 'left',
 93 |     duration = 0.5
 94 |   ): Promise<void> {
 95 |     await this.sendCommand('drag_to', { x, y, button, duration });
 96 |   }
 97 | 
 98 |   /**
 99 |    * Drag along a path of coordinates.
100 |    * @param {Array<[number, number]>} path - Array of [x, y] coordinate pairs to drag through
101 |    * @param {MouseButton} [button='left'] - Mouse button to use for dragging
102 |    * @param {number} [duration=0.5] - Duration of the drag operation in seconds
103 |    * @returns {Promise<void>}
104 |    */
105 |   async drag(
106 |     path: Array<[number, number]>,
107 |     button: MouseButton = 'left',
108 |     duration = 0.5
109 |   ): Promise<void> {
110 |     await this.sendCommand('drag', { path, button, duration });
111 |   }
112 | 
113 |   // Keyboard Actions
114 |   /**
115 |    * Press and hold a key.
116 |    * @param {string} key - Key to press down
117 |    * @returns {Promise<void>}
118 |    */
119 |   async keyDown(key: string): Promise<void> {
120 |     await this.sendCommand('key_down', { key });
121 |   }
122 | 
123 |   /**
124 |    * Release a key.
125 |    * @param {string} key - Key to release
126 |    * @returns {Promise<void>}
127 |    */
128 |   async keyUp(key: string): Promise<void> {
129 |     await this.sendCommand('key_up', { key });
130 |   }
131 | 
132 |   /**
133 |    * Type text as if entered from keyboard.
134 |    * @param {string} text - Text to type
135 |    * @returns {Promise<void>}
136 |    */
137 |   async typeText(text: string): Promise<void> {
138 |     await this.sendCommand('type_text', { text });
139 |   }
140 | 
141 |   /**
142 |    * Press and release a key.
143 |    * @param {string} key - Key to press
144 |    * @returns {Promise<void>}
145 |    */
146 |   async pressKey(key: string): Promise<void> {
147 |     await this.sendCommand('press_key', { key });
148 |   }
149 | 
150 |   /**
151 |    * Press multiple keys simultaneously as a hotkey combination.
152 |    * @param {...string} keys - Keys to press together
153 |    * @returns {Promise<void>}
154 |    */
155 |   async hotkey(...keys: string[]): Promise<void> {
156 |     await this.sendCommand('hotkey', { keys });
157 |   }
158 | 
159 |   // Scrolling Actions
160 |   /**
161 |    * Scroll by the specified amount in x and y directions.
162 |    * @param {number} x - Horizontal scroll amount
163 |    * @param {number} y - Vertical scroll amount
164 |    * @returns {Promise<void>}
165 |    */
166 |   async scroll(x: number, y: number): Promise<void> {
167 |     await this.sendCommand('scroll', { x, y });
168 |   }
169 | 
170 |   /**
171 |    * Scroll down by the specified number of clicks.
172 |    * @param {number} [clicks=1] - Number of scroll clicks
173 |    * @returns {Promise<void>}
174 |    */
175 |   async scrollDown(clicks = 1): Promise<void> {
176 |     await this.sendCommand('scroll_down', { clicks });
177 |   }
178 | 
179 |   /**
180 |    * Scroll up by the specified number of clicks.
181 |    * @param {number} [clicks=1] - Number of scroll clicks
182 |    * @returns {Promise<void>}
183 |    */
184 |   async scrollUp(clicks = 1): Promise<void> {
185 |     await this.sendCommand('scroll_up', { clicks });
186 |   }
187 | 
188 |   // Screen Actions
189 |   /**
190 |    * Take a screenshot of the screen.
191 |    * @returns {Promise<Buffer>} Screenshot image data as a Buffer
192 |    * @throws {Error} If screenshot fails
193 |    */
194 |   async screenshot(): Promise<Buffer> {
195 |     const response = await this.sendCommand('screenshot');
196 |     if (!response.image_data) {
197 |       throw new Error('Failed to take screenshot');
198 |     }
199 |     return Buffer.from(response.image_data as string, 'base64');
200 |   }
201 | 
202 |   /**
203 |    * Get the current screen size.
204 |    * @returns {Promise<ScreenSize>} Screen dimensions
205 |    * @throws {Error} If unable to get screen size
206 |    */
207 |   async getScreenSize(): Promise<ScreenSize> {
208 |     const response = await this.sendCommand('get_screen_size');
209 |     if (!response.success || !response.size) {
210 |       throw new Error('Failed to get screen size');
211 |     }
212 |     return response.size as ScreenSize;
213 |   }
214 | 
215 |   /**
216 |    * Get the current cursor position.
217 |    * @returns {Promise<CursorPosition>} Current cursor coordinates
218 |    * @throws {Error} If unable to get cursor position
219 |    */
220 |   async getCursorPosition(): Promise<CursorPosition> {
221 |     const response = await this.sendCommand('get_cursor_position');
222 |     if (!response.success || !response.position) {
223 |       throw new Error('Failed to get cursor position');
224 |     }
225 |     return response.position as CursorPosition;
226 |   }
227 | 
228 |   // Clipboard Actions
229 |   /**
230 |    * Copy current selection to clipboard and return the content.
231 |    * @returns {Promise<string>} Clipboard content
232 |    * @throws {Error} If unable to get clipboard content
233 |    */
234 |   async copyToClipboard(): Promise<string> {
235 |     const response = await this.sendCommand('copy_to_clipboard');
236 |     if (!response.success || !response.content) {
237 |       throw new Error('Failed to get clipboard content');
238 |     }
239 |     return response.content as string;
240 |   }
241 | 
242 |   /**
243 |    * Set the clipboard content to the specified text.
244 |    * @param {string} text - Text to set in clipboard
245 |    * @returns {Promise<void>}
246 |    */
247 |   async setClipboard(text: string): Promise<void> {
248 |     await this.sendCommand('set_clipboard', { text });
249 |   }
250 | 
251 |   // File System Actions
252 |   /**
253 |    * Check if a file exists at the specified path.
254 |    * @param {string} path - Path to the file
255 |    * @returns {Promise<boolean>} True if file exists, false otherwise
256 |    */
257 |   async fileExists(path: string): Promise<boolean> {
258 |     const response = await this.sendCommand('file_exists', { path });
259 |     return (response.exists as boolean) || false;
260 |   }
261 | 
262 |   /**
263 |    * Check if a directory exists at the specified path.
264 |    * @param {string} path - Path to the directory
265 |    * @returns {Promise<boolean>} True if directory exists, false otherwise
266 |    */
267 |   async directoryExists(path: string): Promise<boolean> {
268 |     const response = await this.sendCommand('directory_exists', { path });
269 |     return (response.exists as boolean) || false;
270 |   }
271 | 
272 |   /**
273 |    * List the contents of a directory.
274 |    * @param {string} path - Path to the directory
275 |    * @returns {Promise<string[]>} Array of file and directory names
276 |    * @throws {Error} If unable to list directory
277 |    */
278 |   async listDir(path: string): Promise<string[]> {
279 |     const response = await this.sendCommand('list_dir', { path });
280 |     if (!response.success) {
281 |       throw new Error((response.error as string) || 'Failed to list directory');
282 |     }
283 |     return (response.files as string[]) || [];
284 |   }
285 | 
286 |   /**
287 |    * Get the size of a file in bytes.
288 |    * @param {string} path - Path to the file
289 |    * @returns {Promise<number>} File size in bytes
290 |    * @throws {Error} If unable to get file size
291 |    */
292 |   async getFileSize(path: string): Promise<number> {
293 |     const response = await this.sendCommand('get_file_size', { path });
294 |     if (!response.success) {
295 |       throw new Error((response.error as string) || 'Failed to get file size');
296 |     }
297 |     return (response.size as number) || 0;
298 |   }
299 | 
300 |   /**
301 |    * Read file content in chunks for large files.
302 |    * @private
303 |    * @param {string} path - Path to the file
304 |    * @param {number} offset - Starting byte offset
305 |    * @param {number} totalLength - Total number of bytes to read
306 |    * @param {number} [chunkSize=1048576] - Size of each chunk in bytes
307 |    * @returns {Promise<Buffer>} File content as Buffer
308 |    * @throws {Error} If unable to read file chunk
309 |    */
310 |   private async readBytesChunked(
311 |     path: string,
312 |     offset: number,
313 |     totalLength: number,
314 |     chunkSize: number = 1024 * 1024
315 |   ): Promise<Buffer> {
316 |     const chunks: Buffer[] = [];
317 |     let currentOffset = offset;
318 |     let remaining = totalLength;
319 | 
320 |     while (remaining > 0) {
321 |       const readSize = Math.min(chunkSize, remaining);
322 |       const response = await this.sendCommand('read_bytes', {
323 |         path,
324 |         offset: currentOffset,
325 |         length: readSize,
326 |       });
327 | 
328 |       if (!response.success) {
329 |         throw new Error(
330 |           (response.error as string) || 'Failed to read file chunk'
331 |         );
332 |       }
333 | 
334 |       const chunkData = Buffer.from(response.content_b64 as string, 'base64');
335 |       chunks.push(chunkData);
336 | 
337 |       currentOffset += readSize;
338 |       remaining -= readSize;
339 |     }
340 | 
341 |     return Buffer.concat(chunks);
342 |   }
343 | 
344 |   /**
345 |    * Write file content in chunks for large files.
346 |    * @private
347 |    * @param {string} path - Path to the file
348 |    * @param {Buffer} content - Content to write
349 |    * @param {boolean} [append=false] - Whether to append to existing file
350 |    * @param {number} [chunkSize=1048576] - Size of each chunk in bytes
351 |    * @returns {Promise<void>}
352 |    * @throws {Error} If unable to write file chunk
353 |    */
354 |   private async writeBytesChunked(
355 |     path: string,
356 |     content: Buffer,
357 |     append: boolean = false,
358 |     chunkSize: number = 1024 * 1024
359 |   ): Promise<void> {
360 |     const totalSize = content.length;
361 |     let currentOffset = 0;
362 | 
363 |     while (currentOffset < totalSize) {
364 |       const chunkEnd = Math.min(currentOffset + chunkSize, totalSize);
365 |       const chunkData = content.subarray(currentOffset, chunkEnd);
366 | 
367 |       // First chunk uses the original append flag, subsequent chunks always append
368 |       const chunkAppend = currentOffset === 0 ? append : true;
369 | 
370 |       const response = await this.sendCommand('write_bytes', {
371 |         path,
372 |         content_b64: chunkData.toString('base64'),
373 |         append: chunkAppend,
374 |       });
375 | 
376 |       if (!response.success) {
377 |         throw new Error(
378 |           (response.error as string) || 'Failed to write file chunk'
379 |         );
380 |       }
381 | 
382 |       currentOffset = chunkEnd;
383 |     }
384 |   }
385 | 
386 |   /**
387 |    * Read text from a file with specified encoding.
388 |    * @param {string} path - Path to the file to read
389 |    * @param {BufferEncoding} [encoding='utf8'] - Text encoding to use
390 |    * @returns {Promise<string>} The decoded text content of the file
391 |    */
392 |   async readText(path: string, encoding: BufferEncoding = 'utf8'): Promise<string> {
393 |     const contentBytes = await this.readBytes(path);
394 |     return contentBytes.toString(encoding);
395 |   }
396 | 
397 |   /**
398 |    * Write text to a file with specified encoding.
399 |    * @param {string} path - Path to the file to write
400 |    * @param {string} content - Text content to write
401 |    * @param {BufferEncoding} [encoding='utf8'] - Text encoding to use
402 |    * @param {boolean} [append=false] - Whether to append to the file instead of overwriting
403 |    * @returns {Promise<void>}
404 |    */
405 |   async writeText(
406 |     path: string,
407 |     content: string,
408 |     encoding: BufferEncoding = 'utf8',
409 |     append: boolean = false
410 |   ): Promise<void> {
411 |     const contentBytes = Buffer.from(content, encoding);
412 |     await this.writeBytes(path, contentBytes, append);
413 |   }
414 | 
415 |   /**
416 |    * Read bytes from a file, with optional offset and length.
417 |    * @param {string} path - Path to the file
418 |    * @param {number} [offset=0] - Starting byte offset
419 |    * @param {number} [length] - Number of bytes to read (reads entire file if not specified)
420 |    * @returns {Promise<Buffer>} File content as Buffer
421 |    * @throws {Error} If unable to read file
422 |    */
423 |   async readBytes(path: string, offset: number = 0, length?: number): Promise<Buffer> {
424 |     // For large files, use chunked reading
425 |     if (length === undefined) {
426 |       // Get file size first to determine if we need chunking
427 |       const fileSize = await this.getFileSize(path);
428 |       // If file is larger than 5MB, read in chunks
429 |       if (fileSize > 5 * 1024 * 1024) {
430 |         const readLength = offset > 0 ? fileSize - offset : fileSize;
431 |         return await this.readBytesChunked(path, offset, readLength);
432 |       }
433 |     }
434 | 
435 |     const response = await this.sendCommand('read_bytes', {
436 |       path,
437 |       offset,
438 |       length,
439 |     });
440 |     if (!response.success) {
441 |       throw new Error((response.error as string) || 'Failed to read file');
442 |     }
443 |     return Buffer.from(response.content_b64 as string, 'base64');
444 |   }
445 | 
446 |   /**
447 |    * Write bytes to a file.
448 |    * @param {string} path - Path to the file
449 |    * @param {Buffer} content - Content to write as Buffer
450 |    * @param {boolean} [append=false] - Whether to append to existing file
451 |    * @returns {Promise<void>}
452 |    * @throws {Error} If unable to write file
453 |    */
454 |   async writeBytes(path: string, content: Buffer, append: boolean = false): Promise<void> {
455 |     // For large files, use chunked writing
456 |     if (content.length > 5 * 1024 * 1024) {
457 |       // 5MB threshold
458 |       await this.writeBytesChunked(path, content, append);
459 |       return;
460 |     }
461 | 
462 |     const response = await this.sendCommand('write_bytes', {
463 |       path,
464 |       content_b64: content.toString('base64'),
465 |       append,
466 |     });
467 |     if (!response.success) {
468 |       throw new Error((response.error as string) || 'Failed to write file');
469 |     }
470 |   }
471 | 
472 |   /**
473 |    * Delete a file at the specified path.
474 |    * @param {string} path - Path to the file to delete
475 |    * @returns {Promise<void>}
476 |    * @throws {Error} If unable to delete file
477 |    */
478 |   async deleteFile(path: string): Promise<void> {
479 |     const response = await this.sendCommand('delete_file', { path });
480 |     if (!response.success) {
481 |       throw new Error((response.error as string) || 'Failed to delete file');
482 |     }
483 |   }
484 | 
485 |   /**
486 |    * Create a directory at the specified path.
487 |    * @param {string} path - Path where to create the directory
488 |    * @returns {Promise<void>}
489 |    * @throws {Error} If unable to create directory
490 |    */
491 |   async createDir(path: string): Promise<void> {
492 |     const response = await this.sendCommand('create_dir', { path });
493 |     if (!response.success) {
494 |       throw new Error(
495 |         (response.error as string) || 'Failed to create directory'
496 |       );
497 |     }
498 |   }
499 | 
500 |   /**
501 |    * Delete a directory at the specified path.
502 |    * @param {string} path - Path to the directory to delete
503 |    * @returns {Promise<void>}
504 |    * @throws {Error} If unable to delete directory
505 |    */
506 |   async deleteDir(path: string): Promise<void> {
507 |     const response = await this.sendCommand('delete_dir', { path });
508 |     if (!response.success) {
509 |       throw new Error(
510 |         (response.error as string) || 'Failed to delete directory'
511 |       );
512 |     }
513 |   }
514 | 
515 |   /**
516 |    * Execute a shell command and return stdout and stderr.
517 |    * @param {string} command - Command to execute
518 |    * @returns {Promise<[string, string]>} Tuple of [stdout, stderr]
519 |    * @throws {Error} If command execution fails
520 |    */
521 |   async runCommand(command: string): Promise<[string, string]> {
522 |     const response = await this.sendCommand('run_command', { command });
523 |     if (!response.success) {
524 |       throw new Error((response.error as string) || 'Failed to run command');
525 |     }
526 |     return [
527 |       (response.stdout as string) || '',
528 |       (response.stderr as string) || '',
529 |     ];
530 |   }
531 | 
532 |   // Accessibility Actions
533 |   /**
534 |    * Get the accessibility tree of the current screen.
535 |    * @returns {Promise<AccessibilityNode>} Root accessibility node
536 |    * @throws {Error} If unable to get accessibility tree
537 |    */
538 |   async getAccessibilityTree(): Promise<AccessibilityNode> {
539 |     const response = await this.sendCommand('get_accessibility_tree');
540 |     if (!response.success) {
541 |       throw new Error(
542 |         (response.error as string) || 'Failed to get accessibility tree'
543 |       );
544 |     }
545 |     return response as unknown as AccessibilityNode;
546 |   }
547 | 
548 |   /**
549 |    * Convert coordinates to screen coordinates.
550 |    * @param {number} x - X coordinate to convert
551 |    * @param {number} y - Y coordinate to convert
552 |    * @returns {Promise<[number, number]>} Converted screen coordinates as [x, y]
553 |    * @throws {Error} If coordinate conversion fails
554 |    */
555 |   async toScreenCoordinates(x: number, y: number): Promise<[number, number]> {
556 |     const response = await this.sendCommand('to_screen_coordinates', { x, y });
557 |     if (!response.success || !response.coordinates) {
558 |       throw new Error('Failed to convert to screen coordinates');
559 |     }
560 |     return response.coordinates as [number, number];
561 |   }
562 | 
563 |   /**
564 |    * Convert coordinates to screenshot coordinates.
565 |    * @param {number} x - X coordinate to convert
566 |    * @param {number} y - Y coordinate to convert
567 |    * @returns {Promise<[number, number]>} Converted screenshot coordinates as [x, y]
568 |    * @throws {Error} If coordinate conversion fails
569 |    */
570 |   async toScreenshotCoordinates(
571 |     x: number,
572 |     y: number
573 |   ): Promise<[number, number]> {
574 |     const response = await this.sendCommand('to_screenshot_coordinates', {
575 |       x,
576 |       y,
577 |     });
578 |     if (!response.success || !response.coordinates) {
579 |       throw new Error('Failed to convert to screenshot coordinates');
580 |     }
581 |     return response.coordinates as [number, number];
582 |   }
583 | }
584 | 
```

--------------------------------------------------------------------------------
/libs/lume/src/Virtualization/VMVirtualizationService.swift:
--------------------------------------------------------------------------------

```swift
  1 | import Foundation
  2 | import Virtualization
  3 | 
  4 | /// Framework-agnostic VM configuration
  5 | struct VMVirtualizationServiceContext {
  6 |     let cpuCount: Int
  7 |     let memorySize: UInt64
  8 |     let display: String
  9 |     let sharedDirectories: [SharedDirectory]?
 10 |     let mount: Path?
 11 |     let hardwareModel: Data?
 12 |     let machineIdentifier: Data?
 13 |     let macAddress: String
 14 |     let diskPath: Path
 15 |     let nvramPath: Path
 16 |     let recoveryMode: Bool
 17 |     let usbMassStoragePaths: [Path]?
 18 | }
 19 | 
 20 | /// Protocol defining the interface for virtualization operations
 21 | @MainActor
 22 | protocol VMVirtualizationService {
 23 |     var state: VZVirtualMachine.State { get }
 24 |     func start() async throws
 25 |     func stop() async throws
 26 |     func pause() async throws
 27 |     func resume() async throws
 28 |     func getVirtualMachine() -> Any
 29 | }
 30 | 
 31 | /// Base implementation of VMVirtualizationService using VZVirtualMachine
 32 | @MainActor
 33 | class BaseVirtualizationService: VMVirtualizationService {
 34 |     let virtualMachine: VZVirtualMachine
 35 |     let recoveryMode: Bool  // Store whether we should start in recovery mode
 36 | 
 37 |     var state: VZVirtualMachine.State {
 38 |         virtualMachine.state
 39 |     }
 40 | 
 41 |     init(virtualMachine: VZVirtualMachine, recoveryMode: Bool = false) {
 42 |         self.virtualMachine = virtualMachine
 43 |         self.recoveryMode = recoveryMode
 44 |     }
 45 | 
 46 |     func start() async throws {
 47 |         try await withCheckedThrowingContinuation {
 48 |             (continuation: CheckedContinuation<Void, Error>) in
 49 |             Task { @MainActor in
 50 |                 if #available(macOS 13, *) {
 51 |                     let startOptions = VZMacOSVirtualMachineStartOptions()
 52 |                     startOptions.startUpFromMacOSRecovery = recoveryMode
 53 |                     if recoveryMode {
 54 |                         Logger.info("Starting VM in recovery mode")
 55 |                     }
 56 |                     virtualMachine.start(options: startOptions) { error in
 57 |                         if let error = error {
 58 |                             continuation.resume(throwing: error)
 59 |                         } else {
 60 |                             continuation.resume()
 61 |                         }
 62 |                     }
 63 |                 } else {
 64 |                     Logger.info("Starting VM in normal mode")
 65 |                     virtualMachine.start { result in
 66 |                         switch result {
 67 |                         case .success:
 68 |                             continuation.resume()
 69 |                         case .failure(let error):
 70 |                             continuation.resume(throwing: error)
 71 |                         }
 72 |                     }
 73 |                 }
 74 |             }
 75 |         }
 76 |     }
 77 | 
 78 |     func stop() async throws {
 79 |         try await withCheckedThrowingContinuation {
 80 |             (continuation: CheckedContinuation<Void, Error>) in
 81 |             virtualMachine.stop { error in
 82 |                 if let error = error {
 83 |                     continuation.resume(throwing: error)
 84 |                 } else {
 85 |                     continuation.resume()
 86 |                 }
 87 |             }
 88 |         }
 89 |     }
 90 | 
 91 |     func pause() async throws {
 92 |         try await withCheckedThrowingContinuation {
 93 |             (continuation: CheckedContinuation<Void, Error>) in
 94 |             virtualMachine.start { result in
 95 |                 switch result {
 96 |                 case .success:
 97 |                     continuation.resume()
 98 |                 case .failure(let error):
 99 |                     continuation.resume(throwing: error)
100 |                 }
101 |             }
102 |         }
103 |     }
104 | 
105 |     func resume() async throws {
106 |         try await withCheckedThrowingContinuation {
107 |             (continuation: CheckedContinuation<Void, Error>) in
108 |             virtualMachine.start { result in
109 |                 switch result {
110 |                 case .success:
111 |                     continuation.resume()
112 |                 case .failure(let error):
113 |                     continuation.resume(throwing: error)
114 |                 }
115 |             }
116 |         }
117 |     }
118 | 
119 |     func getVirtualMachine() -> Any {
120 |         return virtualMachine
121 |     }
122 | 
123 |     // Helper methods for creating common configurations
124 |     static func createStorageDeviceConfiguration(diskPath: Path, readOnly: Bool = false) throws
125 |         -> VZStorageDeviceConfiguration
126 |     {
127 |         return VZVirtioBlockDeviceConfiguration(
128 |             attachment: try VZDiskImageStorageDeviceAttachment(
129 |                 url: diskPath.url,
130 |                 readOnly: readOnly,
131 |                 cachingMode: VZDiskImageCachingMode.automatic,
132 |                 synchronizationMode: VZDiskImageSynchronizationMode.fsync
133 |             )
134 |         )
135 |     }
136 | 
137 |     static func createUSBMassStorageDeviceConfiguration(diskPath: Path, readOnly: Bool = false)
138 |         throws
139 |         -> VZStorageDeviceConfiguration
140 |     {
141 |         if #available(macOS 15.0, *) {
142 |             return VZUSBMassStorageDeviceConfiguration(
143 |                 attachment: try VZDiskImageStorageDeviceAttachment(
144 |                     url: diskPath.url,
145 |                     readOnly: readOnly,
146 |                     cachingMode: VZDiskImageCachingMode.automatic,
147 |                     synchronizationMode: VZDiskImageSynchronizationMode.fsync
148 |                 )
149 |             )
150 |         } else {
151 |             // Fallback to normal storage device if USB mass storage not available
152 |             return try createStorageDeviceConfiguration(diskPath: diskPath, readOnly: readOnly)
153 |         }
154 |     }
155 | 
156 |     static func createNetworkDeviceConfiguration(macAddress: String) throws
157 |         -> VZNetworkDeviceConfiguration
158 |     {
159 |         let network = VZVirtioNetworkDeviceConfiguration()
160 |         guard let vzMacAddress = VZMACAddress(string: macAddress) else {
161 |             throw VMConfigError.invalidMachineIdentifier
162 |         }
163 |         network.attachment = VZNATNetworkDeviceAttachment()
164 |         network.macAddress = vzMacAddress
165 |         return network
166 |     }
167 | 
168 |     static func createDirectorySharingDevices(sharedDirectories: [SharedDirectory]?)
169 |         -> [VZDirectorySharingDeviceConfiguration]
170 |     {
171 |         return sharedDirectories?.map { sharedDir in
172 |             let device = VZVirtioFileSystemDeviceConfiguration(tag: sharedDir.tag)
173 |             let url = URL(fileURLWithPath: sharedDir.hostPath)
174 |             device.share = VZSingleDirectoryShare(
175 |                 directory: VZSharedDirectory(url: url, readOnly: sharedDir.readOnly))
176 |             return device
177 |         } ?? []
178 |     }
179 | }
180 | 
181 | /// macOS-specific virtualization service
182 | @MainActor
183 | final class DarwinVirtualizationService: BaseVirtualizationService {
184 |     static func createConfiguration(_ config: VMVirtualizationServiceContext) throws
185 |         -> VZVirtualMachineConfiguration
186 |     {
187 |         let vzConfig = VZVirtualMachineConfiguration()
188 |         vzConfig.cpuCount = config.cpuCount
189 |         vzConfig.memorySize = config.memorySize
190 | 
191 |         // Platform configuration
192 |         guard let machineIdentifier = config.machineIdentifier else {
193 |             throw VMConfigError.emptyMachineIdentifier
194 |         }
195 | 
196 |         guard let hardwareModel = config.hardwareModel else {
197 |             throw VMConfigError.emptyHardwareModel
198 |         }
199 | 
200 |         let platform = VZMacPlatformConfiguration()
201 |         platform.auxiliaryStorage = VZMacAuxiliaryStorage(url: config.nvramPath.url)
202 |         Logger.info("Pre-VZMacHardwareModel: hardwareModel=\(hardwareModel)")
203 |         guard let vzHardwareModel = VZMacHardwareModel(dataRepresentation: hardwareModel) else {
204 |             throw VMConfigError.invalidHardwareModel
205 |         }
206 |         platform.hardwareModel = vzHardwareModel
207 |         guard
208 |             let vzMachineIdentifier = VZMacMachineIdentifier(dataRepresentation: machineIdentifier)
209 |         else {
210 |             throw VMConfigError.invalidMachineIdentifier
211 |         }
212 |         platform.machineIdentifier = vzMachineIdentifier
213 |         vzConfig.platform = platform
214 |         vzConfig.bootLoader = VZMacOSBootLoader()
215 | 
216 |         // Graphics configuration
217 |         let display = VMDisplayResolution(string: config.display)!
218 |         let graphics = VZMacGraphicsDeviceConfiguration()
219 |         graphics.displays = [
220 |             VZMacGraphicsDisplayConfiguration(
221 |                 widthInPixels: display.width,
222 |                 heightInPixels: display.height,
223 |                 pixelsPerInch: 220  // Retina display density
224 |             )
225 |         ]
226 |         vzConfig.graphicsDevices = [graphics]
227 | 
228 |         // Common configurations
229 |         vzConfig.keyboards = [VZUSBKeyboardConfiguration()]
230 |         vzConfig.pointingDevices = [VZUSBScreenCoordinatePointingDeviceConfiguration()]
231 |         var storageDevices = [try createStorageDeviceConfiguration(diskPath: config.diskPath)]
232 |         if let mount = config.mount {
233 |             storageDevices.append(
234 |                 try createStorageDeviceConfiguration(diskPath: mount, readOnly: true))
235 |         }
236 |         // Add USB mass storage devices if specified
237 |         if #available(macOS 15.0, *), let usbPaths = config.usbMassStoragePaths, !usbPaths.isEmpty {
238 |             for usbPath in usbPaths {
239 |                 storageDevices.append(
240 |                     try createUSBMassStorageDeviceConfiguration(diskPath: usbPath, readOnly: true))
241 |             }
242 |         }
243 |         vzConfig.storageDevices = storageDevices
244 |         vzConfig.networkDevices = [
245 |             try createNetworkDeviceConfiguration(macAddress: config.macAddress)
246 |         ]
247 |         vzConfig.memoryBalloonDevices = [VZVirtioTraditionalMemoryBalloonDeviceConfiguration()]
248 |         vzConfig.entropyDevices = [VZVirtioEntropyDeviceConfiguration()]
249 |         
250 |         // Audio configuration
251 |         let soundDeviceConfiguration = VZVirtioSoundDeviceConfiguration()
252 |         let inputAudioStreamConfiguration = VZVirtioSoundDeviceInputStreamConfiguration()
253 |         let outputAudioStreamConfiguration = VZVirtioSoundDeviceOutputStreamConfiguration()
254 |         
255 |         inputAudioStreamConfiguration.source = VZHostAudioInputStreamSource()
256 |         outputAudioStreamConfiguration.sink = VZHostAudioOutputStreamSink()
257 |         
258 |         soundDeviceConfiguration.streams = [inputAudioStreamConfiguration, outputAudioStreamConfiguration]
259 |         vzConfig.audioDevices = [soundDeviceConfiguration]
260 |         
261 |         // Clipboard sharing via Spice agent
262 |         let spiceAgentConsoleDevice = VZVirtioConsoleDeviceConfiguration()
263 |         let spiceAgentPort = VZVirtioConsolePortConfiguration()
264 |         spiceAgentPort.name = VZSpiceAgentPortAttachment.spiceAgentPortName
265 |         let spiceAgentPortAttachment = VZSpiceAgentPortAttachment()
266 |         spiceAgentPortAttachment.sharesClipboard = true
267 |         spiceAgentPort.attachment = spiceAgentPortAttachment
268 |         spiceAgentConsoleDevice.ports[0] = spiceAgentPort
269 |         vzConfig.consoleDevices.append(spiceAgentConsoleDevice)
270 | 
271 |         // Directory sharing
272 |         let directorySharingDevices = createDirectorySharingDevices(
273 |             sharedDirectories: config.sharedDirectories)
274 |         if !directorySharingDevices.isEmpty {
275 |             vzConfig.directorySharingDevices = directorySharingDevices
276 |         }
277 | 
278 |         // USB Controller configuration
279 |         if #available(macOS 15.0, *) {
280 |             let usbControllerConfiguration = VZXHCIControllerConfiguration()
281 |             vzConfig.usbControllers = [usbControllerConfiguration]
282 |         }
283 | 
284 |         try vzConfig.validate()
285 |         return vzConfig
286 |     }
287 | 
288 |     static func generateMacAddress() -> String {
289 |         VZMACAddress.randomLocallyAdministered().string
290 |     }
291 | 
292 |     static func generateMachineIdentifier() -> Data {
293 |         VZMacMachineIdentifier().dataRepresentation
294 |     }
295 | 
296 |     func createAuxiliaryStorage(at path: Path, hardwareModel: Data) throws {
297 |         guard let vzHardwareModel = VZMacHardwareModel(dataRepresentation: hardwareModel) else {
298 |             throw VMConfigError.invalidHardwareModel
299 |         }
300 |         _ = try VZMacAuxiliaryStorage(creatingStorageAt: path.url, hardwareModel: vzHardwareModel)
301 |     }
302 | 
303 |     init(configuration: VMVirtualizationServiceContext) throws {
304 |         let vzConfig = try Self.createConfiguration(configuration)
305 |         super.init(
306 |             virtualMachine: VZVirtualMachine(configuration: vzConfig),
307 |             recoveryMode: configuration.recoveryMode)
308 |     }
309 | 
310 |     func installMacOS(imagePath: Path, progressHandler: (@Sendable (Double) -> Void)?) async throws
311 |     {
312 |         var observers: [NSKeyValueObservation] = []  // must hold observer references during installation to print process
313 |         try await withCheckedThrowingContinuation {
314 |             (continuation: CheckedContinuation<Void, Error>) in
315 |             Task {
316 |                 let installer = VZMacOSInstaller(
317 |                     virtualMachine: virtualMachine, restoringFromImageAt: imagePath.url)
318 |                 Logger.info("Starting macOS installation")
319 | 
320 |                 if let progressHandler = progressHandler {
321 |                     let observer = installer.progress.observe(
322 |                         \.fractionCompleted, options: [.initial, .new]
323 |                     ) { (progress, change) in
324 |                         if let newValue = change.newValue {
325 |                             progressHandler(newValue)
326 |                         }
327 |                     }
328 |                     observers.append(observer)
329 |                 }
330 | 
331 |                 installer.install { result in
332 |                     switch result {
333 |                     case .success:
334 |                         continuation.resume()
335 |                     case .failure(let error):
336 |                         Logger.error("Failed to install, error=\(error))")
337 |                         continuation.resume(throwing: error)
338 |                     }
339 |                 }
340 |             }
341 |         }
342 |         Logger.info("macOS installation finished")
343 |     }
344 | }
345 | 
346 | /// Linux-specific virtualization service
347 | @MainActor
348 | final class LinuxVirtualizationService: BaseVirtualizationService {
349 |     static func createConfiguration(_ config: VMVirtualizationServiceContext) throws
350 |         -> VZVirtualMachineConfiguration
351 |     {
352 |         let vzConfig = VZVirtualMachineConfiguration()
353 |         vzConfig.cpuCount = config.cpuCount
354 |         vzConfig.memorySize = config.memorySize
355 | 
356 |         // Platform configuration
357 |         let platform = VZGenericPlatformConfiguration()
358 |         if #available(macOS 15, *) {
359 |             platform.isNestedVirtualizationEnabled =
360 |                 VZGenericPlatformConfiguration.isNestedVirtualizationSupported
361 |         }
362 |         vzConfig.platform = platform
363 | 
364 |         let bootLoader = VZEFIBootLoader()
365 |         bootLoader.variableStore = VZEFIVariableStore(url: config.nvramPath.url)
366 |         vzConfig.bootLoader = bootLoader
367 | 
368 |         // Graphics configuration
369 |         let display = VMDisplayResolution(string: config.display)!
370 |         let graphics = VZVirtioGraphicsDeviceConfiguration()
371 |         graphics.scanouts = [
372 |             VZVirtioGraphicsScanoutConfiguration(
373 |                 widthInPixels: display.width,
374 |                 heightInPixels: display.height
375 |             )
376 |         ]
377 |         vzConfig.graphicsDevices = [graphics]
378 | 
379 |         // Common configurations
380 |         vzConfig.keyboards = [VZUSBKeyboardConfiguration()]
381 |         vzConfig.pointingDevices = [VZUSBScreenCoordinatePointingDeviceConfiguration()]
382 |         var storageDevices = [try createStorageDeviceConfiguration(diskPath: config.diskPath)]
383 |         if let mount = config.mount {
384 |             storageDevices.append(
385 |                 try createStorageDeviceConfiguration(diskPath: mount, readOnly: true))
386 |         }
387 |         // Add USB mass storage devices if specified
388 |         if #available(macOS 15.0, *), let usbPaths = config.usbMassStoragePaths, !usbPaths.isEmpty {
389 |             for usbPath in usbPaths {
390 |                 storageDevices.append(
391 |                     try createUSBMassStorageDeviceConfiguration(diskPath: usbPath, readOnly: true))
392 |             }
393 |         }
394 |         vzConfig.storageDevices = storageDevices
395 |         vzConfig.networkDevices = [
396 |             try createNetworkDeviceConfiguration(macAddress: config.macAddress)
397 |         ]
398 |         vzConfig.memoryBalloonDevices = [VZVirtioTraditionalMemoryBalloonDeviceConfiguration()]
399 |         vzConfig.entropyDevices = [VZVirtioEntropyDeviceConfiguration()]
400 |         
401 |         // Audio configuration
402 |         let soundDeviceConfiguration = VZVirtioSoundDeviceConfiguration()
403 |         let inputAudioStreamConfiguration = VZVirtioSoundDeviceInputStreamConfiguration()
404 |         let outputAudioStreamConfiguration = VZVirtioSoundDeviceOutputStreamConfiguration()
405 |         
406 |         inputAudioStreamConfiguration.source = VZHostAudioInputStreamSource()
407 |         outputAudioStreamConfiguration.sink = VZHostAudioOutputStreamSink()
408 |         
409 |         soundDeviceConfiguration.streams = [inputAudioStreamConfiguration, outputAudioStreamConfiguration]
410 |         vzConfig.audioDevices = [soundDeviceConfiguration]
411 | 
412 |         // Clipboard sharing via Spice agent
413 |         let spiceAgentConsoleDevice = VZVirtioConsoleDeviceConfiguration()
414 |         let spiceAgentPort = VZVirtioConsolePortConfiguration()
415 |         spiceAgentPort.name = VZSpiceAgentPortAttachment.spiceAgentPortName
416 |         let spiceAgentPortAttachment = VZSpiceAgentPortAttachment()
417 |         spiceAgentPortAttachment.sharesClipboard = true
418 |         spiceAgentPort.attachment = spiceAgentPortAttachment
419 |         spiceAgentConsoleDevice.ports[0] = spiceAgentPort
420 |         vzConfig.consoleDevices.append(spiceAgentConsoleDevice)
421 | 
422 |         // Directory sharing
423 |         var directorySharingDevices = createDirectorySharingDevices(
424 |             sharedDirectories: config.sharedDirectories)
425 | 
426 |         // Add Rosetta support if available
427 |         if #available(macOS 13.0, *) {
428 |             if VZLinuxRosettaDirectoryShare.availability == .installed {
429 |                 do {
430 |                     let rosettaShare = try VZLinuxRosettaDirectoryShare()
431 |                     let rosettaDevice = VZVirtioFileSystemDeviceConfiguration(tag: "rosetta")
432 |                     rosettaDevice.share = rosettaShare
433 |                     directorySharingDevices.append(rosettaDevice)
434 |                     Logger.info("Added Rosetta support to Linux VM")
435 |                 } catch {
436 |                     Logger.info("Failed to add Rosetta support: \(error.localizedDescription)")
437 |                 }
438 |             } else {
439 |                 Logger.info("Rosetta not installed, skipping Rosetta support")
440 |             }
441 |         }
442 | 
443 |         if !directorySharingDevices.isEmpty {
444 |             vzConfig.directorySharingDevices = directorySharingDevices
445 |         }
446 | 
447 |         // USB Controller configuration
448 |         if #available(macOS 15.0, *) {
449 |             let usbControllerConfiguration = VZXHCIControllerConfiguration()
450 |             vzConfig.usbControllers = [usbControllerConfiguration]
451 |         }
452 | 
453 |         try vzConfig.validate()
454 |         return vzConfig
455 |     }
456 | 
457 |     func generateMacAddress() -> String {
458 |         VZMACAddress.randomLocallyAdministered().string
459 |     }
460 | 
461 |     func createNVRAM(at path: Path) throws {
462 |         _ = try VZEFIVariableStore(creatingVariableStoreAt: path.url)
463 |     }
464 | 
465 |     init(configuration: VMVirtualizationServiceContext) throws {
466 |         let vzConfig = try Self.createConfiguration(configuration)
467 |         super.init(virtualMachine: VZVirtualMachine(configuration: vzConfig))
468 |     }
469 | }
470 | 
```

--------------------------------------------------------------------------------
/libs/python/computer/computer/providers/lume_api.py:
--------------------------------------------------------------------------------

```python
  1 | """Shared API utilities for Lume and Lumier providers.
  2 | 
  3 | This module contains shared functions for interacting with the Lume API,
  4 | used by both the LumeProvider and LumierProvider classes.
  5 | """
  6 | 
  7 | import logging
  8 | import json
  9 | import subprocess
 10 | import urllib.parse
 11 | from typing import Dict, List, Optional, Any
 12 | 
 13 | # Setup logging
 14 | logger = logging.getLogger(__name__)
 15 | 
 16 | # Check if curl is available
 17 | try:
 18 |     subprocess.run(["curl", "--version"], capture_output=True, check=True)
 19 |     HAS_CURL = True
 20 | except (subprocess.SubprocessError, FileNotFoundError):
 21 |     HAS_CURL = False
 22 | 
 23 | 
 24 | def lume_api_get(
 25 |     vm_name: str,
 26 |     host: str,
 27 |     port: int,
 28 |     storage: Optional[str] = None,
 29 |     debug: bool = False,
 30 |     verbose: bool = False
 31 | ) -> Dict[str, Any]:
 32 |     """Use curl to get VM information from Lume API.
 33 |     
 34 |     Args:
 35 |         vm_name: Name of the VM to get info for
 36 |         host: API host
 37 |         port: API port
 38 |         storage: Storage path for the VM
 39 |         debug: Whether to show debug output
 40 |         verbose: Enable verbose logging
 41 |         
 42 |     Returns:
 43 |         Dictionary with VM status information parsed from JSON response
 44 |     """
 45 |     # URL encode the storage parameter for the query
 46 |     encoded_storage = ""
 47 |     storage_param = ""
 48 |     
 49 |     if storage:
 50 |         # First encode the storage path properly
 51 |         encoded_storage = urllib.parse.quote(storage, safe='')
 52 |         storage_param = f"?storage={encoded_storage}"
 53 |         
 54 |     # Construct API URL with encoded storage parameter if needed
 55 |     api_url = f"http://{host}:{port}/lume/vms/{vm_name}{storage_param}"
 56 |         
 57 |     # Construct the curl command with increased timeouts for more reliability
 58 |     # --connect-timeout: Time to establish connection (15 seconds)
 59 |     # --max-time: Maximum time for the whole operation (20 seconds)
 60 |     # -f: Fail silently (no output at all) on server errors
 61 |     # Add single quotes around URL to ensure special characters are handled correctly
 62 |     cmd = ["curl", "--connect-timeout", "15", "--max-time", "20", "-s", "-f", f"'{api_url}'"]
 63 |     
 64 |     # For logging and display, show the properly escaped URL
 65 |     display_cmd = ["curl", "--connect-timeout", "15", "--max-time", "20", "-s", "-f", api_url]
 66 |     
 67 |     # Only print the curl command when debug is enabled
 68 |     display_curl_string = ' '.join(display_cmd)
 69 |     logger.debug(f"Executing API request: {display_curl_string}")
 70 |     
 71 |     # Execute the command - for execution we need to use shell=True to handle URLs with special characters
 72 |     try:
 73 |         # Use a single string with shell=True for proper URL handling
 74 |         shell_cmd = ' '.join(cmd)
 75 |         result = subprocess.run(shell_cmd, shell=True, capture_output=True, text=True)
 76 |         
 77 |         # Handle curl exit codes
 78 |         if result.returncode != 0:
 79 |             curl_error = "Unknown error"
 80 |             
 81 |             # Map common curl error codes to helpful messages
 82 |             if result.returncode == 7:
 83 |                 curl_error = "Failed to connect to the API server - it might still be starting up"
 84 |             elif result.returncode == 22:
 85 |                 curl_error = "HTTP error returned from API server"
 86 |             elif result.returncode == 28:
 87 |                 curl_error = "Operation timeout - the API server is taking too long to respond"
 88 |             elif result.returncode == 52:
 89 |                 curl_error = "Empty reply from server - the API server is starting but not fully ready yet"
 90 |             elif result.returncode == 56:
 91 |                 curl_error = "Network problem during data transfer - check container networking"
 92 |                 
 93 |             # Only log at debug level to reduce noise during retries
 94 |             logger.debug(f"API request failed with code {result.returncode}: {curl_error}")
 95 |             
 96 |             # Return a more useful error message
 97 |             return {
 98 |                 "error": f"API request failed: {curl_error}",
 99 |                 "curl_code": result.returncode,
100 |                 "vm_name": vm_name,
101 |                 "status": "unknown"  # We don't know the actual status due to API error
102 |             }
103 |             
104 |         # Try to parse the response as JSON
105 |         if result.stdout and result.stdout.strip():
106 |             try:
107 |                 vm_status = json.loads(result.stdout)
108 |                 if debug or verbose:
109 |                     logger.info(f"Successfully parsed VM status: {vm_status.get('status', 'unknown')}")
110 |                 return vm_status
111 |             except json.JSONDecodeError as e:
112 |                 # Return the raw response if it's not valid JSON
113 |                 logger.warning(f"Invalid JSON response: {e}")
114 |                 if "Virtual machine not found" in result.stdout:
115 |                     return {"status": "not_found", "message": "VM not found in Lume API"}
116 |                 
117 |                 return {"error": f"Invalid JSON response: {result.stdout[:100]}...", "status": "unknown"}
118 |         else:
119 |             return {"error": "Empty response from API", "status": "unknown"}
120 |     except subprocess.SubprocessError as e:
121 |         logger.error(f"Failed to execute API request: {e}")
122 |         return {"error": f"Failed to execute API request: {str(e)}", "status": "unknown"}
123 | 
124 | 
125 | def lume_api_run(
126 |     vm_name: str,
127 |     host: str,
128 |     port: int,
129 |     run_opts: Dict[str, Any],
130 |     storage: Optional[str] = None,
131 |     debug: bool = False,
132 |     verbose: bool = False
133 | ) -> Dict[str, Any]:
134 |     """Run a VM using curl.
135 |     
136 |     Args:
137 |         vm_name: Name of the VM to run
138 |         host: API host
139 |         port: API port
140 |         run_opts: Dictionary of run options
141 |         storage: Storage path for the VM
142 |         debug: Whether to show debug output
143 |         verbose: Enable verbose logging
144 |         
145 |     Returns:
146 |         Dictionary with API response or error information
147 |     """
148 |     # Construct API URL
149 |     api_url = f"http://{host}:{port}/lume/vms/{vm_name}/run"
150 |     
151 |     # Prepare JSON payload with required parameters
152 |     payload = {}
153 |     
154 |     # Add CPU cores if specified
155 |     if "cpu" in run_opts:
156 |         payload["cpu"] = run_opts["cpu"]
157 |         
158 |     # Add memory if specified
159 |     if "memory" in run_opts:
160 |         payload["memory"] = run_opts["memory"]
161 |     
162 |     # Add storage parameter if specified
163 |     if storage:
164 |         payload["storage"] = storage
165 |     elif "storage" in run_opts:
166 |         payload["storage"] = run_opts["storage"]
167 |         
168 |     # Add shared directories if specified
169 |     if "shared_directories" in run_opts and run_opts["shared_directories"]:
170 |         payload["sharedDirectories"] = run_opts["shared_directories"]
171 |         
172 |     # Log the payload for debugging
173 |     logger.debug(f"API payload: {json.dumps(payload, indent=2)}")
174 |     
175 |     # Construct the curl command
176 |     cmd = [
177 |         "curl", "--connect-timeout", "30", "--max-time", "30",
178 |         "-s", "-X", "POST", "-H", "Content-Type: application/json",
179 |         "-d", json.dumps(payload),
180 |         api_url
181 |     ]
182 |     
183 |     # Execute the command
184 |     try:
185 |         result = subprocess.run(cmd, capture_output=True, text=True)
186 |         
187 |         if result.returncode != 0:
188 |             logger.warning(f"API request failed with code {result.returncode}: {result.stderr}")
189 |             return {"error": f"API request failed: {result.stderr}"}
190 |             
191 |         # Try to parse the response as JSON
192 |         if result.stdout and result.stdout.strip():
193 |             try:
194 |                 response = json.loads(result.stdout)
195 |                 return response
196 |             except json.JSONDecodeError:
197 |                 # Return the raw response if it's not valid JSON
198 |                 return {"success": True, "message": "VM started successfully", "raw_response": result.stdout}
199 |         else:
200 |             return {"success": True, "message": "VM started successfully"}
201 |     except subprocess.SubprocessError as e:
202 |         logger.error(f"Failed to execute run request: {e}")
203 |         return {"error": f"Failed to execute run request: {str(e)}"}
204 | 
205 | 
206 | def lume_api_stop(
207 |     vm_name: str,
208 |     host: str,
209 |     port: int,
210 |     storage: Optional[str] = None,
211 |     debug: bool = False,
212 |     verbose: bool = False
213 | ) -> Dict[str, Any]:
214 |     """Stop a VM using curl.
215 |     
216 |     Args:
217 |         vm_name: Name of the VM to stop
218 |         host: API host
219 |         port: API port
220 |         storage: Storage path for the VM
221 |         debug: Whether to show debug output
222 |         verbose: Enable verbose logging
223 |         
224 |     Returns:
225 |         Dictionary with API response or error information
226 |     """
227 |     # Construct API URL
228 |     api_url = f"http://{host}:{port}/lume/vms/{vm_name}/stop"
229 |     
230 |     # Prepare JSON payload with required parameters
231 |     payload = {}
232 |     
233 |     # Add storage path if specified
234 |     if storage:
235 |         payload["storage"] = storage
236 |         
237 |     # Construct the curl command
238 |     cmd = [
239 |         "curl", "--connect-timeout", "15", "--max-time", "20",
240 |         "-s", "-X", "POST", "-H", "Content-Type: application/json",
241 |         "-d", json.dumps(payload),
242 |         api_url
243 |     ]
244 |     
245 |     # Execute the command
246 |     try:
247 |         if debug or verbose:
248 |             logger.info(f"Executing: {' '.join(cmd)}")
249 |             
250 |         result = subprocess.run(cmd, capture_output=True, text=True)
251 |         
252 |         if result.returncode != 0:
253 |             logger.warning(f"API request failed with code {result.returncode}: {result.stderr}")
254 |             return {"error": f"API request failed: {result.stderr}"}
255 |             
256 |         # Try to parse the response as JSON
257 |         if result.stdout and result.stdout.strip():
258 |             try:
259 |                 response = json.loads(result.stdout)
260 |                 return response
261 |             except json.JSONDecodeError:
262 |                 # Return the raw response if it's not valid JSON
263 |                 return {"success": True, "message": "VM stopped successfully", "raw_response": result.stdout}
264 |         else:
265 |             return {"success": True, "message": "VM stopped successfully"}
266 |     except subprocess.SubprocessError as e:
267 |         logger.error(f"Failed to execute stop request: {e}")
268 |         return {"error": f"Failed to execute stop request: {str(e)}"}
269 | 
270 | 
271 | def lume_api_update(
272 |     vm_name: str,
273 |     host: str,
274 |     port: int,
275 |     update_opts: Dict[str, Any],
276 |     storage: Optional[str] = None,
277 |     debug: bool = False,
278 |     verbose: bool = False
279 | ) -> Dict[str, Any]:
280 |     """Update VM settings using curl.
281 |     
282 |     Args:
283 |         vm_name: Name of the VM to update
284 |         host: API host
285 |         port: API port
286 |         update_opts: Dictionary of update options
287 |         storage: Storage path for the VM
288 |         debug: Whether to show debug output
289 |         verbose: Enable verbose logging
290 |         
291 |     Returns:
292 |         Dictionary with API response or error information
293 |     """
294 |     # Construct API URL
295 |     api_url = f"http://{host}:{port}/lume/vms/{vm_name}/update"
296 |     
297 |     # Prepare JSON payload with required parameters
298 |     payload = {}
299 |     
300 |     # Add CPU cores if specified
301 |     if "cpu" in update_opts:
302 |         payload["cpu"] = update_opts["cpu"]
303 |         
304 |     # Add memory if specified
305 |     if "memory" in update_opts:
306 |         payload["memory"] = update_opts["memory"]
307 |     
308 |     # Add storage path if specified
309 |     if storage:
310 |         payload["storage"] = storage
311 |         
312 |     # Construct the curl command
313 |     cmd = [
314 |         "curl", "--connect-timeout", "15", "--max-time", "20",
315 |         "-s", "-X", "POST", "-H", "Content-Type: application/json",
316 |         "-d", json.dumps(payload),
317 |         api_url
318 |     ]
319 |     
320 |     # Execute the command
321 |     try:
322 |         if debug:
323 |             logger.info(f"Executing: {' '.join(cmd)}")
324 |             
325 |         result = subprocess.run(cmd, capture_output=True, text=True)
326 |         
327 |         if result.returncode != 0:
328 |             logger.warning(f"API request failed with code {result.returncode}: {result.stderr}")
329 |             return {"error": f"API request failed: {result.stderr}"}
330 |             
331 |         # Try to parse the response as JSON
332 |         if result.stdout and result.stdout.strip():
333 |             try:
334 |                 response = json.loads(result.stdout)
335 |                 return response
336 |             except json.JSONDecodeError:
337 |                 # Return the raw response if it's not valid JSON
338 |                 return {"success": True, "message": "VM updated successfully", "raw_response": result.stdout}
339 |         else:
340 |             return {"success": True, "message": "VM updated successfully"}
341 |     except subprocess.SubprocessError as e:
342 |         logger.error(f"Failed to execute update request: {e}")
343 |         return {"error": f"Failed to execute update request: {str(e)}"}
344 | 
345 | 
346 | def lume_api_pull(
347 |     image: str,
348 |     name: str,
349 |     host: str,
350 |     port: int,
351 |     storage: Optional[str] = None,
352 |     registry: str = "ghcr.io",
353 |     organization: str = "trycua",
354 |     debug: bool = False,
355 |     verbose: bool = False
356 | ) -> Dict[str, Any]:
357 |     """Pull a VM image from a registry using curl.
358 |     
359 |     Args:
360 |         image: Name/tag of the image to pull
361 |         name: Name to give the VM after pulling
362 |         host: API host
363 |         port: API port
364 |         storage: Storage path for the VM
365 |         registry: Registry to pull from (default: ghcr.io)
366 |         organization: Organization in registry (default: trycua)
367 |         debug: Whether to show debug output
368 |         verbose: Enable verbose logging
369 |         
370 |     Returns:
371 |         Dictionary with pull status and information
372 |     """
373 |     # Prepare pull request payload
374 |     pull_payload = {
375 |         "image": image,  # Use provided image name
376 |         "name": name, # Always use name as the target VM name
377 |         "registry": registry,
378 |         "organization": organization
379 |     }
380 |     
381 |     if storage:
382 |         pull_payload["storage"] = storage
383 |     
384 |     # Construct pull command with proper JSON payload
385 |     pull_cmd = [
386 |         "curl"
387 |     ]
388 |     
389 |     if not verbose:
390 |         pull_cmd.append("-s")
391 |     
392 |     pull_cmd.extend([
393 |         "-X", "POST",
394 |         "-H", "Content-Type: application/json",
395 |         "-d", json.dumps(pull_payload),
396 |         f"http://{host}:{port}/lume/pull"
397 |     ])
398 |     
399 |     logger.debug(f"Executing API request: {' '.join(pull_cmd)}")
400 |     
401 |     try:
402 |         # Execute pull command
403 |         result = subprocess.run(pull_cmd, capture_output=True, text=True)
404 |         
405 |         if result.returncode != 0:
406 |             error_msg = f"Failed to pull VM {name}: {result.stderr}"
407 |             logger.error(error_msg)
408 |             return {"error": error_msg}
409 |         
410 |         try:
411 |             response = json.loads(result.stdout)
412 |             logger.info(f"Successfully initiated pull for VM {name}")
413 |             return response
414 |         except json.JSONDecodeError:
415 |             if result.stdout:
416 |                 logger.info(f"Pull response: {result.stdout}")
417 |             return {"success": True, "message": f"Successfully initiated pull for VM {name}"}
418 |             
419 |     except subprocess.SubprocessError as e:
420 |         error_msg = f"Failed to execute pull command: {str(e)}"
421 |         logger.error(error_msg)
422 |         return {"error": error_msg}
423 | 
424 | 
425 | def lume_api_delete(
426 |     vm_name: str,
427 |     host: str,
428 |     port: int,
429 |     storage: Optional[str] = None,
430 |     debug: bool = False,
431 |     verbose: bool = False
432 | ) -> Dict[str, Any]:
433 |     """Delete a VM using curl.
434 |     
435 |     Args:
436 |         vm_name: Name of the VM to delete
437 |         host: API host
438 |         port: API port
439 |         storage: Storage path for the VM
440 |         debug: Whether to show debug output
441 |         verbose: Enable verbose logging
442 |         
443 |     Returns:
444 |         Dictionary with API response or error information
445 |     """
446 |     # URL encode the storage parameter for the query
447 |     encoded_storage = ""
448 |     storage_param = ""
449 |     
450 |     if storage:
451 |         # First encode the storage path properly
452 |         encoded_storage = urllib.parse.quote(storage, safe='')
453 |         storage_param = f"?storage={encoded_storage}"
454 |         
455 |     # Construct API URL with encoded storage parameter if needed
456 |     api_url = f"http://{host}:{port}/lume/vms/{vm_name}{storage_param}"
457 |         
458 |     # Construct the curl command for DELETE operation - using much longer timeouts matching shell implementation
459 |     cmd = ["curl", "--connect-timeout", "6000", "--max-time", "5000", "-s", "-X", "DELETE", f"'{api_url}'"]
460 |     
461 |     # For logging and display, show the properly escaped URL
462 |     display_cmd = ["curl", "--connect-timeout", "6000", "--max-time", "5000", "-s", "-X", "DELETE", api_url]
463 |     
464 |     # Only print the curl command when debug is enabled
465 |     display_curl_string = ' '.join(display_cmd)
466 |     logger.debug(f"Executing API request: {display_curl_string}")
467 |     
468 |     # Execute the command - for execution we need to use shell=True to handle URLs with special characters
469 |     try:
470 |         # Use a single string with shell=True for proper URL handling
471 |         shell_cmd = ' '.join(cmd)
472 |         result = subprocess.run(shell_cmd, shell=True, capture_output=True, text=True)
473 |         
474 |         # Handle curl exit codes
475 |         if result.returncode != 0:
476 |             curl_error = "Unknown error"
477 |             
478 |             # Map common curl error codes to helpful messages
479 |             if result.returncode == 7:
480 |                 curl_error = "Failed to connect to the API server - it might still be starting up"
481 |             elif result.returncode == 22:
482 |                 curl_error = "HTTP error returned from API server"
483 |             elif result.returncode == 28:
484 |                 curl_error = "Operation timeout - the API server is taking too long to respond"
485 |             elif result.returncode == 52:
486 |                 curl_error = "Empty reply from server - the API server is starting but not fully ready yet"
487 |             elif result.returncode == 56:
488 |                 curl_error = "Network problem during data transfer - check container networking"
489 |                 
490 |             # Only log at debug level to reduce noise during retries
491 |             logger.debug(f"API request failed with code {result.returncode}: {curl_error}")
492 |             
493 |             # Return a more useful error message
494 |             return {
495 |                 "error": f"API request failed: {curl_error}",
496 |                 "curl_code": result.returncode,
497 |                 "vm_name": vm_name,
498 |                 "storage": storage
499 |             }
500 |             
501 |         # Try to parse the response as JSON
502 |         if result.stdout and result.stdout.strip():
503 |             try:
504 |                 response = json.loads(result.stdout)
505 |                 return response
506 |             except json.JSONDecodeError:
507 |                 # Return the raw response if it's not valid JSON
508 |                 return {"success": True, "message": "VM deleted successfully", "raw_response": result.stdout}
509 |         else:
510 |             return {"success": True, "message": "VM deleted successfully"}
511 |     except subprocess.SubprocessError as e:
512 |         logger.error(f"Failed to execute delete request: {e}")
513 |         return {"error": f"Failed to execute delete request: {str(e)}"}
514 | 
515 | 
516 | def parse_memory(memory_str: str) -> int:
517 |     """Parse memory string to MB integer.
518 |     
519 |     Examples:
520 |         "8GB" -> 8192
521 |         "1024MB" -> 1024
522 |         "512" -> 512
523 |         
524 |     Returns:
525 |         Memory value in MB
526 |     """
527 |     if isinstance(memory_str, int):
528 |         return memory_str
529 |         
530 |     if isinstance(memory_str, str):
531 |         # Extract number and unit
532 |         import re
533 |         match = re.match(r"(\d+)([A-Za-z]*)", memory_str)
534 |         if match:
535 |             value, unit = match.groups()
536 |             value = int(value)
537 |             unit = unit.upper()
538 |             
539 |             if unit == "GB" or unit == "G":
540 |                 return value * 1024
541 |             elif unit == "MB" or unit == "M" or unit == "":
542 |                 return value
543 |                 
544 |     # Default fallback
545 |     logger.warning(f"Could not parse memory string '{memory_str}', using 8GB default")
546 |     return 8192  # Default to 8GB
547 | 
```

--------------------------------------------------------------------------------
/libs/python/pylume/pylume/server.py:
--------------------------------------------------------------------------------

```python
  1 | import os
  2 | import time
  3 | import asyncio
  4 | import subprocess
  5 | import tempfile
  6 | import logging
  7 | import socket
  8 | from typing import Optional
  9 | import sys
 10 | from .exceptions import LumeConnectionError
 11 | import signal
 12 | import json
 13 | import shlex
 14 | import random
 15 | from logging import getLogger
 16 | 
 17 | 
 18 | class LumeServer:
 19 |     def __init__(
 20 |         self,
 21 |         debug: bool = False,
 22 |         server_start_timeout: int = 60,
 23 |         port: Optional[int] = None,
 24 |         use_existing_server: bool = False,
 25 |         host: str = "localhost",
 26 |     ):
 27 |         """Initialize the LumeServer.
 28 | 
 29 |         Args:
 30 |             debug: Enable debug logging
 31 |             server_start_timeout: Timeout in seconds to wait for server to start
 32 |             port: Specific port to use for the server
 33 |             use_existing_server: If True, will try to connect to an existing server
 34 |                                instead of starting a new one
 35 |             host: Host to use for connections (e.g., "localhost", "127.0.0.1", "host.docker.internal")
 36 |         """
 37 |         self.debug = debug
 38 |         self.server_start_timeout = server_start_timeout
 39 |         self.server_process = None
 40 |         self.output_file = None
 41 |         self.requested_port = port
 42 |         self.port = None
 43 |         self.base_url = None
 44 |         self.use_existing_server = use_existing_server
 45 |         self.host = host
 46 | 
 47 |         # Configure logging
 48 |         self.logger = getLogger("pylume.server")
 49 |         if not self.logger.handlers:
 50 |             handler = logging.StreamHandler()
 51 |             formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 52 |             handler.setFormatter(formatter)
 53 |             self.logger.addHandler(handler)
 54 |             self.logger.setLevel(logging.DEBUG if debug else logging.INFO)
 55 | 
 56 |         self.logger.debug(f"Server initialized with host: {self.host}")
 57 | 
 58 |     def _check_port_available(self, port: int) -> bool:
 59 |         """Check if a port is available."""
 60 |         try:
 61 |             with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
 62 |                 s.settimeout(0.5)
 63 |                 result = s.connect_ex(("127.0.0.1", port))
 64 |                 if result == 0:  # Port is in use on localhost
 65 |                     return False
 66 |         except:
 67 |             pass
 68 | 
 69 |         # Check the specified host (e.g., "host.docker.internal") if it's not a localhost alias
 70 |         if self.host not in ["localhost", "127.0.0.1"]:
 71 |             try:
 72 |                 with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
 73 |                     s.settimeout(0.5)
 74 |                     result = s.connect_ex((self.host, port))
 75 |                     if result == 0:  # Port is in use on host
 76 |                         return False
 77 |             except:
 78 |                 pass
 79 | 
 80 |         return True
 81 | 
 82 |     def _get_server_port(self) -> int:
 83 |         """Get an available port for the server."""
 84 |         # Use requested port if specified
 85 |         if self.requested_port is not None:
 86 |             if not self._check_port_available(self.requested_port):
 87 |                 raise RuntimeError(f"Requested port {self.requested_port} is not available")
 88 |             return self.requested_port
 89 | 
 90 |         # Find a free port
 91 |         for _ in range(10):  # Try up to 10 times
 92 |             port = random.randint(49152, 65535)
 93 |             if self._check_port_available(port):
 94 |                 return port
 95 | 
 96 |         raise RuntimeError("Could not find an available port")
 97 | 
 98 |     async def _ensure_server_running(self) -> None:
 99 |         """Ensure the lume server is running, start it if it's not."""
100 |         try:
101 |             self.logger.debug("Checking if lume server is running...")
102 |             # Try to connect to the server with a short timeout
103 |             cmd = ["curl", "-s", "-w", "%{http_code}", "-m", "5", f"{self.base_url}/vms"]
104 |             process = await asyncio.create_subprocess_exec(
105 |                 *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
106 |             )
107 |             stdout, stderr = await process.communicate()
108 | 
109 |             if process.returncode == 0:
110 |                 response = stdout.decode()
111 |                 status_code = int(response[-3:])
112 |                 if status_code == 200:
113 |                     self.logger.debug("PyLume server is running")
114 |                     return
115 | 
116 |             self.logger.debug("PyLume server not running, attempting to start it")
117 |             # Server not running, try to start it
118 |             lume_path = os.path.join(os.path.dirname(__file__), "lume")
119 |             if not os.path.exists(lume_path):
120 |                 raise RuntimeError(f"Could not find lume binary at {lume_path}")
121 | 
122 |             # Make sure the file is executable
123 |             os.chmod(lume_path, 0o755)
124 | 
125 |             # Create a temporary file for server output
126 |             self.output_file = tempfile.NamedTemporaryFile(mode="w+", delete=False)
127 |             self.logger.debug(f"Using temporary file for server output: {self.output_file.name}")
128 | 
129 |             # Start the server
130 |             self.logger.debug(f"Starting lume server with: {lume_path} serve --port {self.port}")
131 | 
132 |             # Start server in background using subprocess.Popen
133 |             try:
134 |                 self.server_process = subprocess.Popen(
135 |                     [lume_path, "serve", "--port", str(self.port)],
136 |                     stdout=self.output_file,
137 |                     stderr=self.output_file,
138 |                     cwd=os.path.dirname(lume_path),
139 |                     start_new_session=True,  # Run in new session to avoid blocking
140 |                 )
141 |             except Exception as e:
142 |                 self.output_file.close()
143 |                 os.unlink(self.output_file.name)
144 |                 raise RuntimeError(f"Failed to start lume server process: {str(e)}")
145 | 
146 |             # Wait for server to start
147 |             self.logger.debug(
148 |                 f"Waiting up to {self.server_start_timeout} seconds for server to start..."
149 |             )
150 |             start_time = time.time()
151 |             server_ready = False
152 |             last_size = 0
153 | 
154 |             while time.time() - start_time < self.server_start_timeout:
155 |                 if self.server_process.poll() is not None:
156 |                     # Process has terminated
157 |                     self.output_file.seek(0)
158 |                     output = self.output_file.read()
159 |                     self.output_file.close()
160 |                     os.unlink(self.output_file.name)
161 |                     error_msg = (
162 |                         f"Server process terminated unexpectedly.\n"
163 |                         f"Exit code: {self.server_process.returncode}\n"
164 |                         f"Output: {output}"
165 |                     )
166 |                     raise RuntimeError(error_msg)
167 | 
168 |                 # Check output file for server ready message
169 |                 self.output_file.seek(0, os.SEEK_END)
170 |                 size = self.output_file.tell()
171 |                 if size > last_size:  # Only read if there's new content
172 |                     self.output_file.seek(last_size)
173 |                     new_output = self.output_file.read()
174 |                     if new_output.strip():  # Only log non-empty output
175 |                         self.logger.debug(f"Server output: {new_output.strip()}")
176 |                     last_size = size
177 | 
178 |                     if "Server started" in new_output:
179 |                         server_ready = True
180 |                         self.logger.debug("Server startup detected")
181 |                         break
182 | 
183 |                 # Try to connect to the server periodically
184 |                 try:
185 |                     cmd = ["curl", "-s", "-w", "%{http_code}", "-m", "5", f"{self.base_url}/vms"]
186 |                     process = await asyncio.create_subprocess_exec(
187 |                         *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
188 |                     )
189 |                     stdout, stderr = await process.communicate()
190 | 
191 |                     if process.returncode == 0:
192 |                         response = stdout.decode()
193 |                         status_code = int(response[-3:])
194 |                         if status_code == 200:
195 |                             server_ready = True
196 |                             self.logger.debug("Server is responding to requests")
197 |                             break
198 |                 except:
199 |                     pass  # Server not ready yet
200 | 
201 |                 await asyncio.sleep(1.0)
202 | 
203 |             if not server_ready:
204 |                 # Cleanup if server didn't start
205 |                 if self.server_process:
206 |                     self.server_process.terminate()
207 |                     try:
208 |                         self.server_process.wait(timeout=5)
209 |                     except subprocess.TimeoutExpired:
210 |                         self.server_process.kill()
211 |                 self.output_file.close()
212 |                 os.unlink(self.output_file.name)
213 |                 raise RuntimeError(
214 |                     f"Failed to start lume server after {self.server_start_timeout} seconds. "
215 |                     "Check the debug output for more details."
216 |                 )
217 | 
218 |             # Give the server a moment to fully initialize
219 |             await asyncio.sleep(2.0)
220 | 
221 |             # Verify server is responding
222 |             try:
223 |                 cmd = ["curl", "-s", "-w", "%{http_code}", "-m", "10", f"{self.base_url}/vms"]
224 |                 process = await asyncio.create_subprocess_exec(
225 |                     *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
226 |                 )
227 |                 stdout, stderr = await process.communicate()
228 | 
229 |                 if process.returncode != 0:
230 |                     raise RuntimeError(f"Curl command failed: {stderr.decode()}")
231 | 
232 |                 response = stdout.decode()
233 |                 status_code = int(response[-3:])
234 | 
235 |                 if status_code != 200:
236 |                     raise RuntimeError(f"Server returned status code {status_code}")
237 | 
238 |                 self.logger.debug("PyLume server started successfully")
239 |             except Exception as e:
240 |                 self.logger.debug(f"Server verification failed: {str(e)}")
241 |                 if self.server_process:
242 |                     self.server_process.terminate()
243 |                     try:
244 |                         self.server_process.wait(timeout=5)
245 |                     except subprocess.TimeoutExpired:
246 |                         self.server_process.kill()
247 |                 self.output_file.close()
248 |                 os.unlink(self.output_file.name)
249 |                 raise RuntimeError(f"Server started but is not responding: {str(e)}")
250 | 
251 |             self.logger.debug("Server startup completed successfully")
252 | 
253 |         except Exception as e:
254 |             raise RuntimeError(f"Failed to start lume server: {str(e)}")
255 | 
256 |     async def _start_server(self) -> None:
257 |         """Start the lume server using the lume executable."""
258 |         self.logger.debug("Starting PyLume server")
259 | 
260 |         # Get absolute path to lume executable in the same directory as this file
261 |         lume_path = os.path.join(os.path.dirname(__file__), "lume")
262 |         if not os.path.exists(lume_path):
263 |             raise RuntimeError(f"Could not find lume binary at {lume_path}")
264 | 
265 |         try:
266 |             # Make executable
267 |             os.chmod(lume_path, 0o755)
268 | 
269 |             # Get and validate port
270 |             self.port = self._get_server_port()
271 |             self.base_url = f"http://{self.host}:{self.port}/lume"
272 | 
273 |             # Set up output handling
274 |             self.output_file = tempfile.NamedTemporaryFile(mode="w+", delete=False)
275 | 
276 |             # Start the server process with the lume executable
277 |             env = os.environ.copy()
278 |             env["RUST_BACKTRACE"] = "1"  # Enable backtrace for better error reporting
279 | 
280 |             # Specify the host to bind to (0.0.0.0 to allow external connections)
281 |             self.server_process = subprocess.Popen(
282 |                 [lume_path, "serve", "--port", str(self.port)],
283 |                 stdout=self.output_file,
284 |                 stderr=subprocess.STDOUT,
285 |                 cwd=os.path.dirname(lume_path),  # Run from same directory as executable
286 |                 env=env,
287 |             )
288 | 
289 |             # Wait for server to initialize
290 |             await asyncio.sleep(2)
291 |             await self._wait_for_server()
292 | 
293 |         except Exception as e:
294 |             await self._cleanup()
295 |             raise RuntimeError(f"Failed to start lume server process: {str(e)}")
296 | 
297 |     async def _tail_log(self) -> None:
298 |         """Read and display server log output in debug mode."""
299 |         while True:
300 |             try:
301 |                 self.output_file.seek(0, os.SEEK_END)  # type: ignore[attr-defined]
302 |                 line = self.output_file.readline()  # type: ignore[attr-defined]
303 |                 if line:
304 |                     line = line.strip()
305 |                     if line:
306 |                         print(f"SERVER: {line}")
307 |                 if self.server_process.poll() is not None:  # type: ignore[attr-defined]
308 |                     print("Server process ended")
309 |                     break
310 |                 await asyncio.sleep(0.1)
311 |             except Exception as e:
312 |                 print(f"Error reading log: {e}")
313 |                 await asyncio.sleep(0.1)
314 | 
315 |     async def _wait_for_server(self) -> None:
316 |         """Wait for server to start and become responsive with increased timeout."""
317 |         start_time = time.time()
318 |         while time.time() - start_time < self.server_start_timeout:
319 |             if self.server_process.poll() is not None:  # type: ignore[attr-defined]
320 |                 error_msg = await self._get_error_output()
321 |                 await self._cleanup()
322 |                 raise RuntimeError(error_msg)
323 | 
324 |             try:
325 |                 await self._verify_server()
326 |                 self.logger.debug("Server is now responsive")
327 |                 return
328 |             except Exception as e:
329 |                 self.logger.debug(f"Server not ready yet: {str(e)}")
330 |                 await asyncio.sleep(1.0)
331 | 
332 |         await self._cleanup()
333 |         raise RuntimeError(f"Server failed to start after {self.server_start_timeout} seconds")
334 | 
335 |     async def _verify_server(self) -> None:
336 |         """Verify server is responding to requests."""
337 |         try:
338 |             cmd = [
339 |                 "curl",
340 |                 "-s",
341 |                 "-w",
342 |                 "%{http_code}",
343 |                 "-m",
344 |                 "10",
345 |                 f"http://{self.host}:{self.port}/lume/vms",
346 |             ]
347 |             process = await asyncio.create_subprocess_exec(
348 |                 *cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
349 |             )
350 |             stdout, stderr = await process.communicate()
351 | 
352 |             if process.returncode != 0:
353 |                 raise RuntimeError(f"Curl command failed: {stderr.decode()}")
354 | 
355 |             response = stdout.decode()
356 |             status_code = int(response[-3:])
357 | 
358 |             if status_code != 200:
359 |                 raise RuntimeError(f"Server returned status code {status_code}")
360 | 
361 |             self.logger.debug("PyLume server started successfully")
362 |         except Exception as e:
363 |             raise RuntimeError(f"Server not responding: {str(e)}")
364 | 
365 |     async def _get_error_output(self) -> str:
366 |         """Get error output from the server process."""
367 |         if not self.output_file:
368 |             return "No output available"
369 |         self.output_file.seek(0)
370 |         output = self.output_file.read()
371 |         return (
372 |             f"Server process terminated unexpectedly.\n"
373 |             f"Exit code: {self.server_process.returncode}\n"  # type: ignore[attr-defined]
374 |             f"Output: {output}"
375 |         )
376 | 
377 |     async def _cleanup(self) -> None:
378 |         """Clean up all server resources."""
379 |         if self.server_process:
380 |             try:
381 |                 self.server_process.terminate()
382 |                 try:
383 |                     self.server_process.wait(timeout=5)
384 |                 except subprocess.TimeoutExpired:
385 |                     self.server_process.kill()
386 |             except:
387 |                 pass
388 |             self.server_process = None
389 | 
390 |         # Clean up output file
391 |         if self.output_file:
392 |             try:
393 |                 self.output_file.close()
394 |                 os.unlink(self.output_file.name)
395 |             except Exception as e:
396 |                 self.logger.debug(f"Error cleaning up output file: {e}")
397 |             self.output_file = None
398 | 
399 |     async def ensure_running(self) -> None:
400 |         """Ensure the server is running.
401 | 
402 |         If use_existing_server is True, will only try to connect to an existing server.
403 |         Otherwise will:
404 |           1. Try to connect to an existing server on the specified port
405 |           2. If that fails and not in Docker, start a new server
406 |           3. If in Docker and no existing server is found, raise an error
407 |         """
408 |         # First check if we're in Docker
409 |         in_docker = os.path.exists("/.dockerenv") or (
410 |             os.path.exists("/proc/1/cgroup") and "docker" in open("/proc/1/cgroup", "r").read()
411 |         )
412 | 
413 |         # If using a non-localhost host like host.docker.internal, set up the connection details
414 |         if self.host not in ["localhost", "127.0.0.1"]:
415 |             if self.requested_port is None:
416 |                 raise RuntimeError("Port must be specified when using a remote host")
417 | 
418 |             self.port = self.requested_port
419 |             self.base_url = f"http://{self.host}:{self.port}/lume"
420 |             self.logger.debug(f"Using remote host server at {self.base_url}")
421 | 
422 |             # Try to verify the server is accessible
423 |             try:
424 |                 await self._verify_server()
425 |                 self.logger.debug("Successfully connected to remote server")
426 |                 return
427 |             except Exception as e:
428 |                 if self.use_existing_server or in_docker:
429 |                     # If explicitly requesting an existing server or in Docker, we can't start a new one
430 |                     raise RuntimeError(
431 |                         f"Failed to connect to remote server at {self.base_url}: {str(e)}"
432 |                     )
433 |                 else:
434 |                     self.logger.debug(f"Remote server not available at {self.base_url}: {str(e)}")
435 |                     # Fall back to localhost for starting a new server
436 |                     self.host = "localhost"
437 | 
438 |         # If explicitly using an existing server, verify it's running
439 |         if self.use_existing_server:
440 |             if self.requested_port is None:
441 |                 raise RuntimeError("Port must be specified when using an existing server")
442 | 
443 |             self.port = self.requested_port
444 |             self.base_url = f"http://{self.host}:{self.port}/lume"
445 | 
446 |             try:
447 |                 await self._verify_server()
448 |                 self.logger.debug("Successfully connected to existing server")
449 |             except Exception as e:
450 |                 raise RuntimeError(
451 |                     f"Failed to connect to existing server at {self.base_url}: {str(e)}"
452 |                 )
453 |         else:
454 |             # Try to connect to an existing server first
455 |             if self.requested_port is not None:
456 |                 self.port = self.requested_port
457 |                 self.base_url = f"http://{self.host}:{self.port}/lume"
458 | 
459 |                 try:
460 |                     await self._verify_server()
461 |                     self.logger.debug("Successfully connected to existing server")
462 |                     return
463 |                 except Exception:
464 |                     self.logger.debug(f"No existing server found at {self.base_url}")
465 | 
466 |                     # If in Docker and can't connect to existing server, raise an error
467 |                     if in_docker:
468 |                         raise RuntimeError(
469 |                             f"Failed to connect to server at {self.base_url} and cannot start a new server in Docker"
470 |                         )
471 | 
472 |             # Start a new server
473 |             self.logger.debug("Starting a new server instance")
474 |             await self._start_server()
475 | 
476 |     async def stop(self) -> None:
477 |         """Stop the server if we're managing it."""
478 |         if not self.use_existing_server:
479 |             self.logger.debug("Stopping lume server...")
480 |             await self._cleanup()
481 | 
```