This is page 7 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context. # Directory Structure ``` ├── .all-contributorsrc ├── .cursorignore ├── .devcontainer │ ├── devcontainer.json │ ├── post-install.sh │ └── README.md ├── .dockerignore ├── .gitattributes ├── .github │ ├── FUNDING.yml │ ├── scripts │ │ ├── get_pyproject_version.py │ │ └── tests │ │ ├── __init__.py │ │ ├── README.md │ │ └── test_get_pyproject_version.py │ └── workflows │ ├── ci-lume.yml │ ├── docker-publish-kasm.yml │ ├── docker-publish-xfce.yml │ ├── docker-reusable-publish.yml │ ├── npm-publish-computer.yml │ ├── npm-publish-core.yml │ ├── publish-lume.yml │ ├── pypi-publish-agent.yml │ ├── pypi-publish-computer-server.yml │ ├── pypi-publish-computer.yml │ ├── pypi-publish-core.yml │ ├── pypi-publish-mcp-server.yml │ ├── pypi-publish-pylume.yml │ ├── pypi-publish-som.yml │ ├── pypi-reusable-publish.yml │ └── test-validation-script.yml ├── .gitignore ├── .vscode │ ├── docs.code-workspace │ ├── launch.json │ ├── libs-ts.code-workspace │ ├── lume.code-workspace │ ├── lumier.code-workspace │ ├── py.code-workspace │ └── settings.json ├── blog │ ├── app-use.md │ ├── assets │ │ ├── composite-agents.png │ │ ├── docker-ubuntu-support.png │ │ ├── hack-booth.png │ │ ├── hack-closing-ceremony.jpg │ │ ├── hack-cua-ollama-hud.jpeg │ │ ├── hack-leaderboard.png │ │ ├── hack-the-north.png │ │ ├── hack-winners.jpeg │ │ ├── hack-workshop.jpeg │ │ ├── hud-agent-evals.png │ │ └── trajectory-viewer.jpeg │ ├── bringing-computer-use-to-the-web.md │ ├── build-your-own-operator-on-macos-1.md │ ├── build-your-own-operator-on-macos-2.md │ ├── composite-agents.md │ ├── cua-hackathon.md │ ├── hack-the-north.md │ ├── hud-agent-evals.md │ ├── human-in-the-loop.md │ ├── introducing-cua-cloud-containers.md │ ├── lume-to-containerization.md │ ├── sandboxed-python-execution.md │ ├── training-computer-use-models-trajectories-1.md │ ├── trajectory-viewer.md │ ├── ubuntu-docker-support.md │ └── 
windows-sandbox.md ├── CONTRIBUTING.md ├── Development.md ├── Dockerfile ├── docs │ ├── .gitignore │ ├── .prettierrc │ ├── content │ │ └── docs │ │ ├── agent-sdk │ │ │ ├── agent-loops.mdx │ │ │ ├── benchmarks │ │ │ │ ├── index.mdx │ │ │ │ ├── interactive.mdx │ │ │ │ ├── introduction.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── osworld-verified.mdx │ │ │ │ ├── screenspot-pro.mdx │ │ │ │ └── screenspot-v2.mdx │ │ │ ├── callbacks │ │ │ │ ├── agent-lifecycle.mdx │ │ │ │ ├── cost-saving.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── logging.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── pii-anonymization.mdx │ │ │ │ └── trajectories.mdx │ │ │ ├── chat-history.mdx │ │ │ ├── custom-computer-handlers.mdx │ │ │ ├── custom-tools.mdx │ │ │ ├── customizing-computeragent.mdx │ │ │ ├── integrations │ │ │ │ ├── hud.mdx │ │ │ │ └── meta.json │ │ │ ├── message-format.mdx │ │ │ ├── meta.json │ │ │ ├── migration-guide.mdx │ │ │ ├── prompt-caching.mdx │ │ │ ├── supported-agents │ │ │ │ ├── composed-agents.mdx │ │ │ │ ├── computer-use-agents.mdx │ │ │ │ ├── grounding-models.mdx │ │ │ │ ├── human-in-the-loop.mdx │ │ │ │ └── meta.json │ │ │ ├── supported-model-providers │ │ │ │ ├── index.mdx │ │ │ │ └── local-models.mdx │ │ │ └── usage-tracking.mdx │ │ ├── computer-sdk │ │ │ ├── cloud-vm-management.mdx │ │ │ ├── commands.mdx │ │ │ ├── computer-ui.mdx │ │ │ ├── computers.mdx │ │ │ ├── meta.json │ │ │ └── sandboxed-python.mdx │ │ ├── index.mdx │ │ ├── libraries │ │ │ ├── agent │ │ │ │ └── index.mdx │ │ │ ├── computer │ │ │ │ └── index.mdx │ │ │ ├── computer-server │ │ │ │ ├── Commands.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── REST-API.mdx │ │ │ │ └── WebSocket-API.mdx │ │ │ ├── core │ │ │ │ └── index.mdx │ │ │ ├── lume │ │ │ │ ├── cli-reference.mdx │ │ │ │ ├── faq.md │ │ │ │ ├── http-api.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── meta.json │ │ │ │ └── prebuilt-images.mdx │ │ │ ├── lumier │ │ │ │ ├── building-lumier.mdx │ │ │ │ ├── docker-compose.mdx │ │ │ │ ├── docker.mdx │ │ │ │ ├── index.mdx 
│ │ │ │ ├── installation.mdx │ │ │ │ └── meta.json │ │ │ ├── mcp-server │ │ │ │ ├── client-integrations.mdx │ │ │ │ ├── configuration.mdx │ │ │ │ ├── index.mdx │ │ │ │ ├── installation.mdx │ │ │ │ ├── llm-integrations.mdx │ │ │ │ ├── meta.json │ │ │ │ ├── tools.mdx │ │ │ │ └── usage.mdx │ │ │ └── som │ │ │ ├── configuration.mdx │ │ │ └── index.mdx │ │ ├── meta.json │ │ ├── quickstart-cli.mdx │ │ ├── quickstart-devs.mdx │ │ └── telemetry.mdx │ ├── next.config.mjs │ ├── package-lock.json │ ├── package.json │ ├── pnpm-lock.yaml │ ├── postcss.config.mjs │ ├── public │ │ └── img │ │ ├── agent_gradio_ui.png │ │ ├── agent.png │ │ ├── cli.png │ │ ├── computer.png │ │ ├── som_box_threshold.png │ │ └── som_iou_threshold.png │ ├── README.md │ ├── source.config.ts │ ├── src │ │ ├── app │ │ │ ├── (home) │ │ │ │ ├── [[...slug]] │ │ │ │ │ └── page.tsx │ │ │ │ └── layout.tsx │ │ │ ├── api │ │ │ │ └── search │ │ │ │ └── route.ts │ │ │ ├── favicon.ico │ │ │ ├── global.css │ │ │ ├── layout.config.tsx │ │ │ ├── layout.tsx │ │ │ ├── llms.mdx │ │ │ │ └── [[...slug]] │ │ │ │ └── route.ts │ │ │ └── llms.txt │ │ │ └── route.ts │ │ ├── assets │ │ │ ├── discord-black.svg │ │ │ ├── discord-white.svg │ │ │ ├── logo-black.svg │ │ │ └── logo-white.svg │ │ ├── components │ │ │ ├── iou.tsx │ │ │ └── mermaid.tsx │ │ ├── lib │ │ │ ├── llms.ts │ │ │ └── source.ts │ │ └── mdx-components.tsx │ └── tsconfig.json ├── examples │ ├── agent_examples.py │ ├── agent_ui_examples.py │ ├── cloud_api_examples.py │ ├── computer_examples_windows.py │ ├── computer_examples.py │ ├── computer_ui_examples.py │ ├── computer-example-ts │ │ ├── .env.example │ │ ├── .gitignore │ │ ├── .prettierrc │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── pnpm-lock.yaml │ │ ├── README.md │ │ ├── src │ │ │ ├── helpers.ts │ │ │ └── index.ts │ │ └── tsconfig.json │ ├── docker_examples.py │ ├── evals │ │ ├── hud_eval_examples.py │ │ └── wikipedia_most_linked.txt │ ├── pylume_examples.py │ ├── sandboxed_functions_examples.py │ ├── 
som_examples.py │ ├── utils.py │ └── winsandbox_example.py ├── img │ ├── agent_gradio_ui.png │ ├── agent.png │ ├── cli.png │ ├── computer.png │ ├── logo_black.png │ └── logo_white.png ├── libs │ ├── kasm │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ └── src │ │ └── ubuntu │ │ └── install │ │ └── firefox │ │ ├── custom_startup.sh │ │ ├── firefox.desktop │ │ └── install_firefox.sh │ ├── lume │ │ ├── .cursorignore │ │ ├── CONTRIBUTING.md │ │ ├── Development.md │ │ ├── img │ │ │ └── cli.png │ │ ├── Package.resolved │ │ ├── Package.swift │ │ ├── README.md │ │ ├── resources │ │ │ └── lume.entitlements │ │ ├── scripts │ │ │ ├── build │ │ │ │ ├── build-debug.sh │ │ │ │ ├── build-release-notarized.sh │ │ │ │ └── build-release.sh │ │ │ └── install.sh │ │ ├── src │ │ │ ├── Commands │ │ │ │ ├── Clone.swift │ │ │ │ ├── Config.swift │ │ │ │ ├── Create.swift │ │ │ │ ├── Delete.swift │ │ │ │ ├── Get.swift │ │ │ │ ├── Images.swift │ │ │ │ ├── IPSW.swift │ │ │ │ ├── List.swift │ │ │ │ ├── Logs.swift │ │ │ │ ├── Options │ │ │ │ │ └── FormatOption.swift │ │ │ │ ├── Prune.swift │ │ │ │ ├── Pull.swift │ │ │ │ ├── Push.swift │ │ │ │ ├── Run.swift │ │ │ │ ├── Serve.swift │ │ │ │ ├── Set.swift │ │ │ │ └── Stop.swift │ │ │ ├── ContainerRegistry │ │ │ │ ├── ImageContainerRegistry.swift │ │ │ │ ├── ImageList.swift │ │ │ │ └── ImagesPrinter.swift │ │ │ ├── Errors │ │ │ │ └── Errors.swift │ │ │ ├── FileSystem │ │ │ │ ├── Home.swift │ │ │ │ ├── Settings.swift │ │ │ │ ├── VMConfig.swift │ │ │ │ ├── VMDirectory.swift │ │ │ │ └── VMLocation.swift │ │ │ ├── LumeController.swift │ │ │ ├── Main.swift │ │ │ ├── Server │ │ │ │ ├── Handlers.swift │ │ │ │ ├── HTTP.swift │ │ │ │ ├── Requests.swift │ │ │ │ ├── Responses.swift │ │ │ │ └── Server.swift │ │ │ ├── Utils │ │ │ │ ├── CommandRegistry.swift │ │ │ │ ├── CommandUtils.swift │ │ │ │ ├── Logger.swift │ │ │ │ ├── NetworkUtils.swift │ │ │ │ ├── Path.swift │ │ │ │ ├── ProcessRunner.swift │ │ │ │ ├── ProgressLogger.swift │ │ │ │ ├── String.swift 
│ │ │ │ └── Utils.swift │ │ │ ├── Virtualization │ │ │ │ ├── DarwinImageLoader.swift │ │ │ │ ├── DHCPLeaseParser.swift │ │ │ │ ├── ImageLoaderFactory.swift │ │ │ │ └── VMVirtualizationService.swift │ │ │ ├── VM │ │ │ │ ├── DarwinVM.swift │ │ │ │ ├── LinuxVM.swift │ │ │ │ ├── VM.swift │ │ │ │ ├── VMDetails.swift │ │ │ │ ├── VMDetailsPrinter.swift │ │ │ │ ├── VMDisplayResolution.swift │ │ │ │ └── VMFactory.swift │ │ │ └── VNC │ │ │ ├── PassphraseGenerator.swift │ │ │ └── VNCService.swift │ │ └── tests │ │ ├── Mocks │ │ │ ├── MockVM.swift │ │ │ ├── MockVMVirtualizationService.swift │ │ │ └── MockVNCService.swift │ │ ├── VM │ │ │ └── VMDetailsPrinterTests.swift │ │ ├── VMTests.swift │ │ ├── VMVirtualizationServiceTests.swift │ │ └── VNCServiceTests.swift │ ├── lumier │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── README.md │ │ └── src │ │ ├── bin │ │ │ └── entry.sh │ │ ├── config │ │ │ └── constants.sh │ │ ├── hooks │ │ │ └── on-logon.sh │ │ └── lib │ │ ├── utils.sh │ │ └── vm.sh │ ├── python │ │ ├── agent │ │ │ ├── .bumpversion.cfg │ │ │ ├── agent │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── adapters │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── huggingfacelocal_adapter.py │ │ │ │ │ ├── human_adapter.py │ │ │ │ │ ├── mlxvlm_adapter.py │ │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── qwen2_5_vl.py │ │ │ │ ├── agent.py │ │ │ │ ├── callbacks │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── budget_manager.py │ │ │ │ │ ├── image_retention.py │ │ │ │ │ ├── logging.py │ │ │ │ │ ├── operator_validator.py │ │ │ │ │ ├── pii_anonymization.py │ │ │ │ │ ├── prompt_instructions.py │ │ │ │ │ ├── telemetry.py │ │ │ │ │ └── trajectory_saver.py │ │ │ │ ├── cli.py │ │ │ │ ├── computers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cua.py │ │ │ │ │ └── custom.py │ │ │ │ ├── decorators.py │ │ │ │ ├── human_tool │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py 
│ │ │ │ │ ├── server.py │ │ │ │ │ └── ui.py │ │ │ │ ├── integrations │ │ │ │ │ └── hud │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── agent.py │ │ │ │ │ └── proxy.py │ │ │ │ ├── loops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── anthropic.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── composed_grounded.py │ │ │ │ │ ├── gemini.py │ │ │ │ │ ├── glm45v.py │ │ │ │ │ ├── gta1.py │ │ │ │ │ ├── holo.py │ │ │ │ │ ├── internvl.py │ │ │ │ │ ├── model_types.csv │ │ │ │ │ ├── moondream3.py │ │ │ │ │ ├── omniparser.py │ │ │ │ │ ├── openai.py │ │ │ │ │ ├── opencua.py │ │ │ │ │ └── uitars.py │ │ │ │ ├── proxy │ │ │ │ │ ├── examples.py │ │ │ │ │ └── handlers.py │ │ │ │ ├── responses.py │ │ │ │ ├── types.py │ │ │ │ └── ui │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ └── gradio │ │ │ │ ├── __init__.py │ │ │ │ ├── app.py │ │ │ │ └── ui_components.py │ │ │ ├── benchmarks │ │ │ │ ├── .gitignore │ │ │ │ ├── contrib.md │ │ │ │ ├── interactive.py │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ └── gta1.py │ │ │ │ ├── README.md │ │ │ │ ├── ss-pro.py │ │ │ │ ├── ss-v2.py │ │ │ │ └── utils.py │ │ │ ├── example.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer │ │ │ │ ├── __init__.py │ │ │ │ ├── computer.py │ │ │ │ ├── diorama_computer.py │ │ │ │ ├── helpers.py │ │ │ │ ├── interface │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ ├── models.py │ │ │ │ │ └── windows.py │ │ │ │ ├── logger.py │ │ │ │ ├── models.py │ │ │ │ ├── providers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cloud │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── docker │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── lume │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── provider.py │ │ │ │ │ ├── lume_api.py │ │ │ │ │ ├── lumier │ │ │ │ │ │ ├── __init__.py │ │ │ │ 
│ │ └── provider.py │ │ │ │ │ ├── types.py │ │ │ │ │ └── winsandbox │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── provider.py │ │ │ │ │ └── setup_script.ps1 │ │ │ │ ├── ui │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── __main__.py │ │ │ │ │ └── gradio │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── app.py │ │ │ │ └── utils.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── computer-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── computer_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── cli.py │ │ │ │ ├── diorama │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── diorama_computer.py │ │ │ │ │ ├── diorama.py │ │ │ │ │ ├── draw.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── safezone.py │ │ │ │ ├── handlers │ │ │ │ │ ├── base.py │ │ │ │ │ ├── factory.py │ │ │ │ │ ├── generic.py │ │ │ │ │ ├── linux.py │ │ │ │ │ ├── macos.py │ │ │ │ │ └── windows.py │ │ │ │ ├── main.py │ │ │ │ ├── server.py │ │ │ │ └── watchdog.py │ │ │ ├── examples │ │ │ │ ├── __init__.py │ │ │ │ └── usage_example.py │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ ├── run_server.py │ │ │ └── test_connection.py │ │ ├── core │ │ │ ├── .bumpversion.cfg │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ └── telemetry │ │ │ │ ├── __init__.py │ │ │ │ └── posthog.py │ │ │ ├── poetry.toml │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ ├── mcp-server │ │ │ ├── .bumpversion.cfg │ │ │ ├── CONCURRENT_SESSIONS.md │ │ │ ├── mcp_server │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── server.py │ │ │ │ └── session_manager.py │ │ │ ├── pdm.lock │ │ │ ├── pyproject.toml │ │ │ ├── README.md │ │ │ └── scripts │ │ │ ├── install_mcp_server.sh │ │ │ └── start_mcp_server.sh │ │ ├── pylume │ │ │ ├── __init__.py │ │ │ ├── .bumpversion.cfg │ │ │ ├── pylume │ │ │ │ ├── __init__.py │ │ │ │ ├── client.py │ │ │ │ ├── exceptions.py │ │ │ │ ├── lume │ │ │ │ ├── models.py │ │ │ │ ├── pylume.py │ │ │ │ └── server.py │ │ │ ├── pyproject.toml │ │ │ └── README.md │ │ └── som │ │ ├── .bumpversion.cfg │ │ 
├── LICENSE │ │ ├── poetry.toml │ │ ├── pyproject.toml │ │ ├── README.md │ │ ├── som │ │ │ ├── __init__.py │ │ │ ├── detect.py │ │ │ ├── detection.py │ │ │ ├── models.py │ │ │ ├── ocr.py │ │ │ ├── util │ │ │ │ └── utils.py │ │ │ └── visualization.py │ │ └── tests │ │ └── test_omniparser.py │ ├── typescript │ │ ├── .gitignore │ │ ├── .nvmrc │ │ ├── agent │ │ │ ├── examples │ │ │ │ ├── playground-example.html │ │ │ │ └── README.md │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── client.ts │ │ │ │ ├── index.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ └── client.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── biome.json │ │ ├── computer │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── computer │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── providers │ │ │ │ │ │ ├── base.ts │ │ │ │ │ │ ├── cloud.ts │ │ │ │ │ │ └── index.ts │ │ │ │ │ └── types.ts │ │ │ │ ├── index.ts │ │ │ │ ├── interface │ │ │ │ │ ├── base.ts │ │ │ │ │ ├── factory.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── linux.ts │ │ │ │ │ ├── macos.ts │ │ │ │ │ └── windows.ts │ │ │ │ └── types.ts │ │ │ ├── tests │ │ │ │ ├── computer │ │ │ │ │ └── cloud.test.ts │ │ │ │ ├── interface │ │ │ │ │ ├── factory.test.ts │ │ │ │ │ ├── index.test.ts │ │ │ │ │ ├── linux.test.ts │ │ │ │ │ ├── macos.test.ts │ │ │ │ │ └── windows.test.ts │ │ │ │ └── setup.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── core │ │ │ ├── .editorconfig │ │ │ ├── .gitattributes │ │ │ ├── .gitignore │ │ │ ├── LICENSE │ │ │ ├── package.json │ │ │ ├── README.md │ │ │ ├── src │ │ │ │ ├── index.ts │ │ │ │ └── telemetry │ │ │ │ ├── clients │ │ │ │ │ ├── index.ts │ │ │ │ │ └── posthog.ts │ │ │ │ └── index.ts │ │ │ ├── tests │ │ │ │ └── telemetry.test.ts │ │ │ ├── tsconfig.json │ │ │ ├── tsdown.config.ts │ │ │ └── vitest.config.ts │ │ ├── package.json │ │ ├── 
pnpm-lock.yaml │ │ ├── pnpm-workspace.yaml │ │ └── README.md │ └── xfce │ ├── .dockerignore │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ └── src │ ├── scripts │ │ ├── resize-display.sh │ │ ├── start-computer-server.sh │ │ ├── start-novnc.sh │ │ ├── start-vnc.sh │ │ └── xstartup.sh │ ├── supervisor │ │ └── supervisord.conf │ └── xfce-config │ ├── helpers.rc │ ├── xfce4-power-manager.xml │ └── xfce4-session.xml ├── LICENSE.md ├── Makefile ├── notebooks │ ├── agent_nb.ipynb │ ├── blog │ │ ├── build-your-own-operator-on-macos-1.ipynb │ │ └── build-your-own-operator-on-macos-2.ipynb │ ├── composite_agents_docker_nb.ipynb │ ├── computer_nb.ipynb │ ├── computer_server_nb.ipynb │ ├── customizing_computeragent.ipynb │ ├── eval_osworld.ipynb │ ├── ollama_nb.ipynb │ ├── pylume_nb.ipynb │ ├── README.md │ ├── sota_hackathon_cloud.ipynb │ └── sota_hackathon.ipynb ├── pdm.lock ├── pyproject.toml ├── pyrightconfig.json ├── README.md ├── samples │ └── community │ ├── global-online │ │ └── README.md │ └── hack-the-north │ └── README.md ├── scripts │ ├── build-uv.sh │ ├── build.ps1 │ ├── build.sh │ ├── cleanup.sh │ ├── playground-docker.sh │ ├── playground.sh │ └── run-docker-dev.sh └── tests ├── pytest.ini ├── shell_cmd.py ├── test_files.py ├── test_mcp_server_session_management.py ├── test_mcp_server_streaming.py ├── test_shell_bash.py ├── test_telemetry.py ├── test_venv.py └── test_watchdog.py ``` # Files -------------------------------------------------------------------------------- /docs/content/docs/telemetry.mdx: -------------------------------------------------------------------------------- ```markdown 1 | --- 2 | title: Telemetry 3 | description: This document explains how telemetry works in CUA libraries and how you can control it. 4 | icon: RadioTower 5 | --- 6 | 7 | # Telemetry in CUA 8 | 9 | CUA tracks anonymized usage and error report statistics; we ascribe to Posthog's approach as detailed [here](https://posthog.com/blog/open-source-telemetry-ethical). 
If you would like to opt out of sending anonymized info, you can set `telemetry_enabled` to false. 10 | 11 | ## What telemetry data we collect 12 | 13 | CUA libraries collect usage data to help improve our software. We have two categories of telemetry: 14 | 15 | ### Opt-Out Telemetry (Enabled by Default) 16 | 17 | Basic performance metrics and system information that help us understand usage patterns: 18 | 19 | - **System Information**: Operating system, OS version, Python version 20 | - **Module Initialization**: When modules are imported and their versions 21 | - **Performance Metrics**: Agent run durations, step counts, token usage, and API costs 22 | - **Session Tracking**: Anonymous session IDs and run IDs for performance analysis 23 | 24 | ### Opt-In Telemetry (Disabled by Default) 25 | 26 | **Conversation Trajectory Logging**: Full conversation history including: 27 | - User messages and agent responses 28 | - Computer actions and their outputs 29 | - Reasoning traces from the agent 30 | 31 | **Important**: Trajectory logging is **opt-in only** and must be explicitly enabled. 32 | 33 | ### We do NOT collect: 34 | 35 | - Personal information or user identifiers 36 | - API keys or credentials 37 | - File contents or application data 38 | - Information about files being accessed 39 | - Actual screenshots or screen contents (unless trajectory logging is enabled) 40 | - Specific text being typed, including user inputs, model outputs, computer outputs, or tool call outputs (unless trajectory logging is enabled) 41 | 42 | ## Controlling Telemetry 43 | 44 | We are committed to transparency and user control over telemetry. There are two ways to control telemetry: 45 | 46 | ### 1. Environment Variable (Global Control) 47 | 48 | Telemetry is enabled by default. 
To disable telemetry, set the `CUA_TELEMETRY_ENABLED` environment variable to a falsy value (`0`, `false`, `no`, or `off`): 49 | 50 | ```bash 51 | # Disable telemetry before running your script 52 | export CUA_TELEMETRY_ENABLED=false 53 | 54 | # Or as part of the command 55 | CUA_TELEMETRY_ENABLED=false python your_script.py 56 | 57 | ``` 58 | 59 | Or from Python: 60 | 61 | ```python 62 | import os 63 | os.environ["CUA_TELEMETRY_ENABLED"] = "false" 64 | ``` 65 | 66 | ### 2. Instance-Level Control 67 | 68 | #### Computer SDK 69 | 70 | ```python 71 | from computer import Computer 72 | 73 | # Enable telemetry (default) 74 | computer = Computer(telemetry_enabled=True) 75 | 76 | # Disable telemetry 77 | computer = Computer(telemetry_enabled=False) 78 | ``` 79 | 80 | #### Agent SDK 81 | 82 | ```python 83 | from agent import ComputerAgent 84 | import os 85 | 86 | # Basic telemetry - performance metrics only (opt-out, enabled by default) 87 | agent = ComputerAgent( 88 | model="claude-3-5-sonnet-20241022", 89 | telemetry_enabled=True # Default is True 90 | ) 91 | 92 | # Enable telemetry with full conversation trajectory logging (opt-in) 93 | agent = ComputerAgent( 94 | model="claude-3-5-sonnet-20241022", 95 | telemetry_enabled={ 96 | "log_trajectory": True # Logs full conversation items 97 | } 98 | ) 99 | 100 | # Disable telemetry completely 101 | agent = ComputerAgent( 102 | model="claude-3-5-sonnet-20241022", 103 | telemetry_enabled=False 104 | ) 105 | 106 | # Disable telemetry completely using environment variables 107 | os.environ["CUA_TELEMETRY_ENABLED"] = "false" 108 | agent = ComputerAgent( 109 | model="claude-3-5-sonnet-20241022" 110 | ) 111 | ``` 112 | 113 | You can check if telemetry is enabled for an instance: 114 | 115 | ```python 116 | print(computer.telemetry_enabled) # Will print True or False 117 | print(agent.telemetry_enabled) # Will print True, False, or dict 118 | ``` 119 | 120 | Note that telemetry settings must be configured during initialization and 
cannot be changed after the object is created. 121 | 122 | ## Detailed Telemetry Events 123 | 124 | ### Computer SDK Events 125 | 126 | | Event Name | Data Collected | Trigger Notes | 127 | |------------|----------------|---------------| 128 | | **computer_initialized** | • `os`: Operating system (e.g., 'windows', 'darwin', 'linux')<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when a Computer instance is created | 129 | | **module_init** | • `module`: "computer"<br />• `version`: Package version<br />• `python_version`: Full Python version string | Triggered once when the computer package is imported for the first time | 130 | 131 | ### Agent SDK Events 132 | 133 | | Event Name | Data Collected | Trigger Notes | 134 | |------------|----------------|---------------| 135 | | **module_init** | • `module`: "agent"<br />• `version`: Package version<br />• `python_version`: Full Python version string | Triggered once when the agent package is imported for the first time | 136 | | **agent_session_start** | • `session_id`: Unique UUID for this agent instance<br />• `agent_type`: Class name (e.g., "ComputerAgent")<br />• `model`: Model name (e.g., "claude-3-5-sonnet")<br />• `os`: Operating system<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when TelemetryCallback is initialized (agent instantiation) | 137 | | **agent_run_start** | • `session_id`: Agent session UUID<br />• `run_id`: Unique UUID for this run<br />• `start_time`: Unix timestamp<br />• `input_context_size`: Character count of input messages<br />• `num_existing_messages`: Count of existing messages<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the start of each agent.run() call | 138 | | **agent_run_end** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `end_time`: Unix timestamp<br />• `duration_seconds`: Total run duration<br />• `num_steps`: Total steps taken in this run<br />• 
`total_usage`: Accumulated token usage and costs<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the end of each agent.run() call | 139 | | **agent_step** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `step`: Step number (incremental)<br />• `timestamp`: Unix timestamp<br />• `duration_seconds`: Duration of previous step | Triggered on each agent response/step during a run | 140 | | **agent_usage** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `step`: Current step number<br />• `prompt_tokens`: Tokens in prompt<br />• `completion_tokens`: Tokens in response<br />• `total_tokens`: Total tokens used<br />• `response_cost`: Cost of this API call | Triggered whenever usage information is received from LLM API | 141 | 142 | ## Transparency 143 | 144 | We believe in being transparent about the data we collect. If you have any questions about our telemetry practices, please open an issue on our GitHub repository. 145 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/callbacks/operator_validator.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | OperatorValidatorCallback 3 | 4 | Ensures agent output actions conform to expected schemas by fixing common issues: 5 | - click: add default button='left' if missing 6 | - keypress: wrap keys string into a list 7 | - etc. 8 | 9 | This runs in on_llm_end, which receives the output array (AgentMessage[] as dicts). 10 | The purpose is to avoid spending another LLM call to fix broken computer call syntax when possible. 
11 | """ 12 | from __future__ import annotations 13 | 14 | from typing import Any, Dict, List 15 | 16 | from .base import AsyncCallbackHandler 17 | 18 | 19 | class OperatorNormalizerCallback(AsyncCallbackHandler): 20 | """Normalizes common computer call hallucinations / errors in computer call syntax.""" 21 | 22 | async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 23 | # Mutate in-place as requested, but still return the list for chaining 24 | for item in output or []: 25 | if item.get("type") != "computer_call": 26 | continue 27 | action = item.get("action") 28 | if not isinstance(action, dict): 29 | continue 30 | 31 | # rename mouse click actions to "click" 32 | for mouse_btn in ["left", "right", "wheel", "back", "forward"]: 33 | if action.get("type", "") == f"{mouse_btn}_click": 34 | action["type"] = "click" 35 | action["button"] = mouse_btn 36 | # rename hotkey actions to "keypress" 37 | for alias in ["hotkey", "key", "press", "key_press"]: 38 | if action.get("type", "") == alias: 39 | action["type"] = "keypress" 40 | # assume click actions 41 | if "button" in action and "type" not in action: 42 | action["type"] = "click" 43 | if "click" in action and "type" not in action: 44 | action["type"] = "click" 45 | if ("scroll_x" in action or "scroll_y" in action) and "type" not in action: 46 | action["type"] = "scroll" 47 | if "text" in action and "type" not in action: 48 | action["type"] = "type" 49 | 50 | action_type = action.get("type") 51 | def _keep_keys(action: Dict[str, Any], keys_to_keep: List[str]): 52 | """Keep only the provided keys on action; delete everything else. 53 | Always ensures required 'type' is present if listed in keys_to_keep. 
54 | """ 55 | for key in list(action.keys()): 56 | if key not in keys_to_keep: 57 | del action[key] 58 | # rename "coordinate" to "x", "y" 59 | if "coordinate" in action: 60 | action["x"] = action["coordinate"][0] 61 | action["y"] = action["coordinate"][1] 62 | del action["coordinate"] 63 | if action_type == "click": 64 | # convert "click" to "button" 65 | if "button" not in action and "click" in action: 66 | action["button"] = action["click"] 67 | del action["click"] 68 | # default button to "left" 69 | action["button"] = action.get("button", "left") 70 | # add default scroll x, y if missing 71 | if action_type == "scroll": 72 | action["scroll_x"] = action.get("scroll_x", 0) 73 | action["scroll_y"] = action.get("scroll_y", 0) 74 | # ensure keys arg is a list (normalize aliases first) 75 | if action_type == "keypress": 76 | keys = action.get("keys") 77 | for keys_alias in ["keypress", "key", "press", "key_press", "text"]: 78 | if keys_alias in action: 79 | action["keys"] = action[keys_alias] 80 | del action[keys_alias] 81 | keys = action.get("keys") 82 | if isinstance(keys, str): 83 | action["keys"] = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys] 84 | required_keys_by_type = { 85 | # OpenAI actions 86 | "click": ["type", "button", "x", "y"], 87 | "double_click": ["type", "x", "y"], 88 | "drag": ["type", "path"], 89 | "keypress": ["type", "keys"], 90 | "move": ["type", "x", "y"], 91 | "screenshot": ["type"], 92 | "scroll": ["type", "scroll_x", "scroll_y", "x", "y"], 93 | "type": ["type", "text"], 94 | "wait": ["type"], 95 | # Anthropic actions 96 | "left_mouse_down": ["type", "x", "y"], 97 | "left_mouse_up": ["type", "x", "y"], 98 | "triple_click": ["type", "button", "x", "y"], 99 | } 100 | keep = required_keys_by_type.get(action_type or "") 101 | if keep: 102 | _keep_keys(action, keep) 103 | 104 | 105 | # # Second pass: if an assistant message is immediately followed by a computer_call, 106 | # # replace the assistant message itself with a 
name: Reusable Docker Publish Workflow

on:
  workflow_call:
    inputs:
      image_name:
        description: "Name of the Docker image (e.g. cua-ubuntu, cua-xfce)"
        required: true
        type: string
      context_dir:
        description: "Directory containing the Dockerfile relative to workspace root (e.g. libs/kasm, libs/xfce)"
        required: true
        type: string
      dockerfile_path:
        description: "Path to Dockerfile relative to context_dir (e.g. Dockerfile)"
        required: false
        type: string
        default: "Dockerfile"
      tag_prefix:
        description: "Prefix for semantic version tags (e.g. docker-kasm-v, docker-xfce-v)"
        required: true
        type: string
      docker_hub_org:
        description: "Docker Hub organization name"
        required: false
        type: string
        default: "trycua"
    secrets:
      DOCKER_HUB_TOKEN:
        required: true

jobs:
  build-and-push:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        platform:
          - linux/amd64
          - linux/arm64
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Prepare platform tag
        id: platform
        run: |
          # Convert platform (e.g., linux/amd64) to a valid tag suffix (e.g., linux-amd64)
          PLATFORM_TAG=$(echo "${{ matrix.platform }}" | sed 's/\//-/g')
          echo "tag=${PLATFORM_TAG}" >> $GITHUB_OUTPUT

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ inputs.docker_hub_org }}
          password: ${{ secrets.DOCKER_HUB_TOKEN }}

      # NOTE: docker/metadata-action only computes tags/labels; it does NOT
      # expose a `digest` output. The image digest is produced by the
      # docker/build-push-action steps below, which therefore carry ids so the
      # reporting steps at the bottom can read `steps.build-*.outputs.digest`.
      - name: Extract metadata (PR)
        if: github.event_name == 'pull_request'
        id: meta-pr
        uses: docker/metadata-action@v5
        with:
          images: ${{ inputs.docker_hub_org }}/${{ inputs.image_name }}
          tags: |
            type=raw,value=${{ github.sha }}

      - name: Extract metadata (main branch)
        if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
        id: meta-main
        uses: docker/metadata-action@v5
        with:
          images: ${{ inputs.docker_hub_org }}/${{ inputs.image_name }}
          tags: |
            type=raw,value=latest

      - name: Extract metadata (semantic version tag)
        if: startsWith(github.ref, format('refs/tags/{0}', inputs.tag_prefix))
        id: meta-semver
        uses: docker/metadata-action@v5
        with:
          images: ${{ inputs.docker_hub_org }}/${{ inputs.image_name }}
          tags: |
            type=semver,pattern={{version}},prefix=${{ inputs.tag_prefix }}
            type=semver,pattern={{major}}.{{minor}},prefix=${{ inputs.tag_prefix }}
            type=semver,pattern={{major}},prefix=${{ inputs.tag_prefix }}
            type=raw,value=latest

      - name: Build and push Docker image (PR)
        if: github.event_name == 'pull_request'
        id: build-pr
        uses: docker/build-push-action@v5
        with:
          context: ./${{ inputs.context_dir }}
          file: ./${{ inputs.context_dir }}/${{ inputs.dockerfile_path }}
          push: true
          tags: ${{ steps.meta-pr.outputs.tags }}
          labels: ${{ steps.meta-pr.outputs.labels }}
          platforms: ${{ matrix.platform }}
          cache-from: |
            type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }}
            type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:latest
          cache-to: type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }},mode=max

      - name: Build and push Docker image (main branch)
        if: github.event_name != 'pull_request' && github.ref == 'refs/heads/main'
        id: build-main
        uses: docker/build-push-action@v5
        with:
          context: ./${{ inputs.context_dir }}
          file: ./${{ inputs.context_dir }}/${{ inputs.dockerfile_path }}
          push: true
          tags: ${{ steps.meta-main.outputs.tags }}
          labels: ${{ steps.meta-main.outputs.labels }}
          platforms: ${{ matrix.platform }}
          cache-from: |
            type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }}
            type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:latest
          cache-to: type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }},mode=max

      - name: Build and push Docker image (semantic version tag)
        if: startsWith(github.ref, format('refs/tags/{0}', inputs.tag_prefix))
        id: build-semver
        uses: docker/build-push-action@v5
        with:
          context: ./${{ inputs.context_dir }}
          file: ./${{ inputs.context_dir }}/${{ inputs.dockerfile_path }}
          push: true
          tags: ${{ steps.meta-semver.outputs.tags }}
          labels: ${{ steps.meta-semver.outputs.labels }}
          platforms: ${{ matrix.platform }}
          cache-from: |
            type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }}
            type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:latest
          cache-to: type=registry,ref=${{ inputs.docker_hub_org }}/${{ inputs.image_name }}:buildcache-${{ steps.platform.outputs.tag }},mode=max

      - name: Image digest
        if: github.event_name == 'pull_request' || github.ref == 'refs/heads/main' || startsWith(github.ref, format('refs/tags/{0}', inputs.tag_prefix))
        run: |
          # FIX: read the digest from the build-push steps; the metadata steps
          # have no `digest` output, so the previous references printed empty.
          if [ "${{ github.event_name }}" == "pull_request" ]; then
            echo "Image pushed with digest ${{ steps.build-pr.outputs.digest }}"
          elif [[ "${{ github.ref }}" == refs/tags/${{ inputs.tag_prefix }}* ]]; then
            echo "Image pushed with digest ${{ steps.build-semver.outputs.digest }}"
          else
            echo "Image pushed with digest ${{ steps.build-main.outputs.digest }}"
          fi

      - name: print image tags
        # Same gate as the digest step: skip instead of printing empty output
        # on refs where no metadata/build step ran.
        if: github.event_name == 'pull_request' || github.ref == 'refs/heads/main' || startsWith(github.ref, format('refs/tags/{0}', inputs.tag_prefix))
        run: |
          if [ "${{ github.event_name }}" == "pull_request" ]; then
            echo "Image tags: ${{ steps.meta-pr.outputs.tags }}"
          elif [[ "${{ github.ref }}" == refs/tags/${{ inputs.tag_prefix }}* ]]; then
            echo "Image tags: ${{ steps.meta-semver.outputs.tags }}"
          else
            echo "Image tags: ${{ steps.meta-main.outputs.tags }}"
          fi
implementation for click prediction using litellm.acompletion. 3 | 4 | Implements the ScreenSpot InternVL grounding baseline behavior: 5 | - Uses the exact grounding prompt format with <image> and <ref> tags 6 | - Expects coordinates in 0-1000 normalized range in formats [[x1,y1,x2,y2]] or [[x,y]] 7 | - Converts to pixel coordinates relative to the original screenshot size 8 | 9 | Note: We do NOT manually load the InternVL model; acompletions (via HuggingFaceLocalAdapter) 10 | will handle loading based on the provided model name. 11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | import base64 16 | import math 17 | import re 18 | from io import BytesIO 19 | from typing import Any, Dict, List, Optional, Tuple 20 | 21 | from PIL import Image 22 | import litellm 23 | 24 | from ..decorators import register_agent 25 | from .composed_grounded import ComposedGroundedConfig 26 | from ..types import AgentCapability 27 | 28 | 29 | # Regex patterns for extracting coordinates 30 | # Accept optional whitespace and optional decimal fractions 31 | _NUM = r"(\d+(?:\.\d+)?)" 32 | _POINT_PATTERN = re.compile(r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]") 33 | _BBOX_PATTERN = re.compile( 34 | r"\[\[\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*,\s*" + _NUM + r"\s*\]\]" 35 | ) 36 | 37 | 38 | def _extract_first_point(text: str) -> Optional[Tuple[float, float]]: 39 | """Extract the first [[x,y]] as normalized (0-1000) floats.""" 40 | m = _POINT_PATTERN.search(text) 41 | if not m: 42 | return None 43 | try: 44 | x = float(m.group(1)) 45 | y = float(m.group(2)) 46 | return x, y 47 | except Exception: 48 | return None 49 | 50 | 51 | def _extract_last_bbox(text: str) -> Optional[Tuple[float, float, float, float]]: 52 | """Extract the last [[x1,y1,x2,y2]] as normalized (0-1000) floats.""" 53 | matches = list(_BBOX_PATTERN.finditer(text)) 54 | if not matches: 55 | return None 56 | m = matches[-1] 57 | try: 58 | x1 = float(m.group(1)) 59 | y1 = float(m.group(2)) 
60 | x2 = float(m.group(3)) 61 | y2 = float(m.group(4)) 62 | return x1, y1, x2, y2 63 | except Exception: 64 | return None 65 | 66 | 67 | def _scale_norm_to_pixels(x_norm: float, y_norm: float, width: int, height: int) -> Tuple[int, int]: 68 | """Scale 0-1000 normalized coordinates to pixel coordinates for given image size.""" 69 | x_px = int(math.floor((x_norm / 1000.0) * width)) 70 | y_px = int(math.floor((y_norm / 1000.0) * height)) 71 | # Clamp to image bounds just in case 72 | x_px = max(0, min(width - 1, x_px)) 73 | y_px = max(0, min(height - 1, y_px)) 74 | return x_px, y_px 75 | 76 | 77 | @register_agent(models=r"(?i).*InternVL.*") 78 | class InternVLConfig(ComposedGroundedConfig): 79 | """InternVL agent configuration reusing ComposedGroundedConfig for steps and 80 | overriding predict_click to implement ScreenSpot InternVL grounding baseline.""" 81 | 82 | async def predict_step( 83 | self, 84 | messages: List[Dict[str, Any]], 85 | model: str, 86 | tools: Optional[List[Dict[str, Any]]] = None, 87 | max_retries: Optional[int] = None, 88 | stream: bool = False, 89 | computer_handler=None, 90 | _on_api_start=None, 91 | _on_api_end=None, 92 | _on_usage=None, 93 | _on_screenshot=None, 94 | **kwargs 95 | ) -> Dict[str, Any]: 96 | """Fallback to a self-composed model""" 97 | return await super().predict_step( 98 | messages=messages, 99 | model=f"{model}+{model}", 100 | tools=tools, 101 | max_retries=max_retries, 102 | stream=stream, 103 | computer_handler=computer_handler, 104 | _on_api_start=_on_api_start, 105 | _on_api_end=_on_api_end, 106 | _on_usage=_on_usage, 107 | _on_screenshot=_on_screenshot, 108 | **kwargs 109 | ) 110 | 111 | async def predict_click( 112 | self, 113 | model: str, 114 | image_b64: str, 115 | instruction: str, 116 | **kwargs 117 | ) -> Optional[Tuple[int, int]]: 118 | """ 119 | Predict click coordinates using InternVL via litellm.acompletion. 
120 | 121 | Behavior mirrors the ScreenSpot InternVL baseline: 122 | - Prompt: "<image>\nPlease provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. Answer in the format of [[x1, y1, x2, y2]]" 123 | - Parse either [[x,y]] point or [[x1,y1,x2,y2]] bbox, using bbox center if point missing 124 | - Coordinates are 0-1000 normalized; convert to pixel coordinates for the original screenshot 125 | """ 126 | try: 127 | # Decode image dimensions to scale the normalized outputs 128 | img_bytes = base64.b64decode(image_b64) 129 | image = Image.open(BytesIO(img_bytes)) 130 | width, height = image.size 131 | except Exception: 132 | # If decoding fails, proceed with a safe default size to avoid crash 133 | width, height = 1920, 1080 134 | 135 | # Build grounding prompt exactly like the baseline 136 | grounding_prompt = ( 137 | f"Please provide the bounding box coordinate of the UI element this user instruction describes: <ref>{instruction}</ref>. 
" 138 | f"Answer in the format of [[x1, y1, x2, y2]]" 139 | ) 140 | 141 | # Prepare messages for LiteLLM 142 | messages = [ 143 | { 144 | "role": "user", 145 | "content": [ 146 | { 147 | "type": "image_url", 148 | "image_url": {"url": f"data:image/png;base64,{image_b64}"}, 149 | }, 150 | {"type": "text", "text": grounding_prompt}, 151 | ], 152 | } 153 | ] 154 | 155 | # Call acompletion; HuggingFaceLocalAdapter/model handler will handle InternVL loading 156 | api_kwargs = { 157 | "model": model, 158 | "messages": messages, 159 | # Conservative generation params akin to baseline (deterministic) 160 | "max_tokens": kwargs.get("max_tokens", 256), 161 | "temperature": kwargs.get("temperature", 0.0), 162 | } 163 | 164 | response = await litellm.acompletion(**api_kwargs) 165 | output_text = (response.choices[0].message.content or "").strip() # type: ignore 166 | 167 | # print(f"InternVL output: {output_text}") 168 | 169 | # Try to parse a point first; if absent, parse bbox and take center 170 | point = _extract_first_point(output_text) 171 | if point is None: 172 | bbox = _extract_last_bbox(output_text) 173 | if bbox is None: 174 | return None 175 | x1, y1, x2, y2 = bbox 176 | cx = (x1 + x2) / 2.0 177 | cy = (y1 + y2) / 2.0 178 | point = (cx, cy) 179 | 180 | x_norm, y_norm = point 181 | x_px, y_px = _scale_norm_to_pixels(x_norm, y_norm, width, height) 182 | return (x_px, y_px) 183 | 184 | def get_capabilities(self) -> List[AgentCapability]: 185 | return ["click", "step"] 186 | ``` -------------------------------------------------------------------------------- /libs/python/computer/computer/providers/factory.py: -------------------------------------------------------------------------------- ```python 1 | """Factory for creating VM providers.""" 2 | 3 | import logging 4 | from typing import Dict, Optional, Any, Type, Union 5 | 6 | from .base import BaseVMProvider, VMProviderType 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class VMProviderFactory: 12 | 
"""Factory for creating VM providers based on provider type.""" 13 | 14 | @staticmethod 15 | def create_provider( 16 | provider_type: Union[str, VMProviderType], 17 | port: int = 7777, 18 | host: str = "localhost", 19 | bin_path: Optional[str] = None, 20 | storage: Optional[str] = None, 21 | shared_path: Optional[str] = None, 22 | image: Optional[str] = None, 23 | verbose: bool = False, 24 | ephemeral: bool = False, 25 | noVNC_port: Optional[int] = None, 26 | **kwargs, 27 | ) -> BaseVMProvider: 28 | """Create a VM provider of the specified type. 29 | 30 | Args: 31 | provider_type: Type of VM provider to create 32 | port: Port for the API server 33 | host: Hostname for the API server 34 | bin_path: Path to provider binary if needed 35 | storage: Path for persistent VM storage 36 | shared_path: Path for shared folder between host and VM 37 | image: VM image to use (for Lumier provider) 38 | verbose: Enable verbose logging 39 | ephemeral: Use ephemeral (temporary) storage 40 | noVNC_port: Specific port for noVNC interface (for Lumier provider) 41 | 42 | Returns: 43 | An instance of the requested VM provider 44 | 45 | Raises: 46 | ImportError: If the required dependencies for the provider are not installed 47 | ValueError: If the provider type is not supported 48 | """ 49 | # Convert string to enum if needed 50 | if isinstance(provider_type, str): 51 | try: 52 | provider_type = VMProviderType(provider_type.lower()) 53 | except ValueError: 54 | provider_type = VMProviderType.UNKNOWN 55 | 56 | if provider_type == VMProviderType.LUME: 57 | try: 58 | from .lume import LumeProvider, HAS_LUME 59 | if not HAS_LUME: 60 | raise ImportError( 61 | "The pylume package is required for LumeProvider. 
" 62 | "Please install it with 'pip install cua-computer[lume]'" 63 | ) 64 | return LumeProvider( 65 | port=port, 66 | host=host, 67 | storage=storage, 68 | verbose=verbose, 69 | ephemeral=ephemeral 70 | ) 71 | except ImportError as e: 72 | logger.error(f"Failed to import LumeProvider: {e}") 73 | raise ImportError( 74 | "The pylume package is required for LumeProvider. " 75 | "Please install it with 'pip install cua-computer[lume]'" 76 | ) from e 77 | elif provider_type == VMProviderType.LUMIER: 78 | try: 79 | from .lumier import LumierProvider, HAS_LUMIER 80 | if not HAS_LUMIER: 81 | raise ImportError( 82 | "Docker is required for LumierProvider. " 83 | "Please install Docker for Apple Silicon and Lume CLI before using this provider." 84 | ) 85 | return LumierProvider( 86 | port=port, 87 | host=host, 88 | storage=storage, 89 | shared_path=shared_path, 90 | image=image or "macos-sequoia-cua:latest", 91 | verbose=verbose, 92 | ephemeral=ephemeral, 93 | noVNC_port=noVNC_port 94 | ) 95 | except ImportError as e: 96 | logger.error(f"Failed to import LumierProvider: {e}") 97 | raise ImportError( 98 | "Docker and Lume CLI are required for LumierProvider. " 99 | "Please install Docker for Apple Silicon and run the Lume installer script." 100 | ) from e 101 | 102 | elif provider_type == VMProviderType.CLOUD: 103 | try: 104 | from .cloud import CloudProvider 105 | return CloudProvider( 106 | verbose=verbose, 107 | **kwargs, 108 | ) 109 | except ImportError as e: 110 | logger.error(f"Failed to import CloudProvider: {e}") 111 | raise ImportError( 112 | "The CloudProvider is not fully implemented yet. " 113 | "Please use LUME or LUMIER provider instead." 114 | ) from e 115 | elif provider_type == VMProviderType.WINSANDBOX: 116 | try: 117 | from .winsandbox import WinSandboxProvider, HAS_WINSANDBOX 118 | if not HAS_WINSANDBOX: 119 | raise ImportError( 120 | "pywinsandbox is required for WinSandboxProvider. 
" 121 | "Please install it with 'pip install -U git+https://github.com/karkason/pywinsandbox.git'" 122 | ) 123 | return WinSandboxProvider( 124 | port=port, 125 | host=host, 126 | storage=storage, 127 | verbose=verbose, 128 | ephemeral=ephemeral, 129 | **kwargs 130 | ) 131 | except ImportError as e: 132 | logger.error(f"Failed to import WinSandboxProvider: {e}") 133 | raise ImportError( 134 | "pywinsandbox is required for WinSandboxProvider. " 135 | "Please install it with 'pip install -U git+https://github.com/karkason/pywinsandbox.git'" 136 | ) from e 137 | elif provider_type == VMProviderType.DOCKER: 138 | try: 139 | from .docker import DockerProvider, HAS_DOCKER 140 | if not HAS_DOCKER: 141 | raise ImportError( 142 | "Docker is required for DockerProvider. " 143 | "Please install Docker and ensure it is running." 144 | ) 145 | return DockerProvider( 146 | port=port, 147 | host=host, 148 | storage=storage, 149 | shared_path=shared_path, 150 | image=image or "trycua/cua-ubuntu:latest", 151 | verbose=verbose, 152 | ephemeral=ephemeral, 153 | vnc_port=noVNC_port 154 | ) 155 | except ImportError as e: 156 | logger.error(f"Failed to import DockerProvider: {e}") 157 | raise ImportError( 158 | "Docker is required for DockerProvider. " 159 | "Please install Docker and ensure it is running." 160 | ) from e 161 | else: 162 | raise ValueError(f"Unsupported provider type: {provider_type}") 163 | ``` -------------------------------------------------------------------------------- /libs/python/agent/benchmarks/interactive.py: -------------------------------------------------------------------------------- ```python 1 | #!/usr/bin/env python3 2 | """ 3 | Interactive Click Prediction Tool 4 | 5 | Takes screenshots and allows testing multiple models interactively. 6 | Models are loaded/unloaded one at a time to avoid memory issues. 
7 | """ 8 | 9 | import asyncio 10 | import os 11 | from datetime import datetime 12 | from typing import List, Dict, Any 13 | 14 | from utils import ( 15 | ModelWrapper, 16 | take_screenshot, 17 | save_prediction_visualization, 18 | get_available_models 19 | ) 20 | 21 | 22 | async def predict_with_all_models(image, instruction: str, models) -> List[Dict[str, Any]]: 23 | """ 24 | Predict click coordinates with all models sequentially. 25 | 26 | Args: 27 | image: PIL Image to analyze 28 | instruction: Instruction text 29 | models: List of model instances 30 | 31 | Returns: 32 | List of prediction results 33 | """ 34 | predictions = [] 35 | 36 | for model in models: 37 | model_wrapper = ModelWrapper(model) 38 | print(f"\n🔄 Loading {model_wrapper.model_name}...") 39 | 40 | try: 41 | # Load model 42 | await model_wrapper.load_model() 43 | 44 | # Predict 45 | coords = await model_wrapper.predict_click(image, instruction) 46 | 47 | predictions.append({ 48 | 'model_name': model_wrapper.model_name, 49 | 'coords': coords, 50 | 'error': None 51 | }) 52 | 53 | if coords: 54 | print(f"✅ {model_wrapper.model_name}: ({coords[0]}, {coords[1]})") 55 | else: 56 | print(f"❌ {model_wrapper.model_name}: No prediction") 57 | 58 | except Exception as e: 59 | print(f"❌ {model_wrapper.model_name}: ERROR - {str(e)}") 60 | predictions.append({ 61 | 'model_name': model_wrapper.model_name, 62 | 'coords': None, 63 | 'error': str(e) 64 | }) 65 | 66 | finally: 67 | # Always unload model to free memory 68 | try: 69 | await model_wrapper.unload_model() 70 | print(f"🗑️ Unloaded {model_wrapper.model_name}") 71 | except Exception as e: 72 | print(f"⚠️ Error unloading {model_wrapper.model_name}: {e}") 73 | 74 | return predictions 75 | 76 | 77 | def print_header(): 78 | """Print the interactive tool header.""" 79 | print("=" * 60) 80 | print("🖱️ Interactive Click Prediction Tool") 81 | print("=" * 60) 82 | print("Commands:") 83 | print(" • Type an instruction to test models on last screenshot") 84 | 
print(" • 'screenshot' - Take a new screenshot") 85 | print(" • 'models' - List available models") 86 | print(" • 'quit' or 'exit' - Exit the tool") 87 | print("=" * 60) 88 | print("💡 Tip: Take a screenshot first, then send instructions to test models!") 89 | 90 | 91 | def print_models(models): 92 | """Print available models.""" 93 | print("\n📋 Available Models:") 94 | for i, model in enumerate(models, 1): 95 | if isinstance(model, str): 96 | print(f" {i}. {model}") 97 | else: 98 | print(f" {i}. models.{model.__class__.__name__}") 99 | 100 | 101 | async def main(): 102 | """ 103 | Main interactive loop. 104 | """ 105 | print_header() 106 | 107 | # Get available models 108 | models = get_available_models() 109 | print_models(models) 110 | 111 | # Create output directory for visualizations 112 | output_dir = "interactive_output" 113 | os.makedirs(output_dir, exist_ok=True) 114 | 115 | session_count = 0 116 | last_screenshot = None 117 | screenshot_timestamp = None 118 | 119 | while True: 120 | try: 121 | # Get user input 122 | print(f"\n{'='*40}") 123 | user_input = input("🎯 Enter instruction (or command): ").strip() 124 | 125 | if not user_input: 126 | continue 127 | 128 | # Handle commands 129 | if user_input.lower() in ['quit', 'exit', 'q']: 130 | print("👋 Goodbye!") 131 | break 132 | 133 | elif user_input.lower() == 'models': 134 | print_models(models) 135 | continue 136 | 137 | elif user_input.lower() == 'screenshot': 138 | print("📸 Taking screenshot...") 139 | try: 140 | last_screenshot = take_screenshot() 141 | screenshot_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 142 | screenshot_path = os.path.join(output_dir, f"screenshot_{screenshot_timestamp}.png") 143 | last_screenshot.save(screenshot_path) 144 | print(f"✅ Screenshot captured and saved to: {screenshot_path}") 145 | print(f"📝 Ready for instructions! 
Screenshot size: {last_screenshot.size}") 146 | except Exception as e: 147 | print(f"❌ Error taking screenshot: {e}") 148 | continue 149 | 150 | # Handle instruction input 151 | if last_screenshot is None: 152 | print("⚠️ No screenshot available! Please take a screenshot first using 'screenshot' command.") 153 | continue 154 | 155 | session_count += 1 156 | print(f"\n🎯 Session {session_count}: '{user_input}'") 157 | print(f"📷 Using screenshot from: {screenshot_timestamp}") 158 | 159 | # Predict with all models using last screenshot 160 | print(f"\n🤖 Testing {len(models)} models on screenshot...") 161 | predictions = await predict_with_all_models(last_screenshot, user_input, models) 162 | 163 | # Display results summary 164 | print(f"\n📊 Results Summary:") 165 | print("-" * 50) 166 | for pred in predictions: 167 | if pred['coords']: 168 | print(f"✅ {pred['model_name']}: ({pred['coords'][0]}, {pred['coords'][1]})") 169 | elif pred['error']: 170 | print(f"❌ {pred['model_name']}: ERROR - {pred['error']}") 171 | else: 172 | print(f"❌ {pred['model_name']}: No prediction") 173 | 174 | # Save visualization 175 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 176 | vis_filename = f"session_{session_count:03d}_{timestamp}.png" 177 | vis_path = os.path.join(output_dir, vis_filename) 178 | 179 | try: 180 | save_prediction_visualization(last_screenshot, user_input, predictions, vis_path) 181 | print(f"\n💾 Visualization saved to: {vis_path}") 182 | except Exception as e: 183 | print(f"⚠️ Error saving visualization: {e}") 184 | 185 | print(f"\n✨ Session {session_count} completed!") 186 | 187 | except KeyboardInterrupt: 188 | print("\n\n👋 Interrupted by user. 
Goodbye!") 189 | break 190 | except Exception as e: 191 | print(f"\n❌ Unexpected error: {e}") 192 | print("Continuing...") 193 | 194 | 195 | if __name__ == "__main__": 196 | try: 197 | asyncio.run(main()) 198 | except KeyboardInterrupt: 199 | print("\n👋 Goodbye!") 200 | except Exception as e: 201 | print(f"❌ Fatal error: {e}") 202 | ``` -------------------------------------------------------------------------------- /tests/test_venv.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Virtual Environment Testing Module 3 | This module tests the ability to execute python code in a virtual environment within Cua Containers. 4 | 5 | Required environment variables: 6 | - CUA_API_KEY: API key for Cua cloud provider 7 | - CUA_CONTAINER_NAME: Name of the container to use 8 | """ 9 | 10 | import os 11 | import asyncio 12 | import pytest 13 | from pathlib import Path 14 | import sys 15 | import traceback 16 | 17 | # Load environment variables from .env file 18 | project_root = Path(__file__).parent.parent 19 | env_file = project_root / ".env" 20 | print(f"Loading environment from: {env_file}") 21 | from dotenv import load_dotenv 22 | 23 | load_dotenv(env_file) 24 | 25 | # Add paths to sys.path if needed 26 | pythonpath = os.environ.get("PYTHONPATH", "") 27 | for path in pythonpath.split(":"): 28 | if path and path not in sys.path: 29 | sys.path.insert(0, path) # Insert at beginning to prioritize 30 | print(f"Added to sys.path: {path}") 31 | 32 | from computer import Computer, VMProviderType 33 | from computer.helpers import sandboxed, set_default_computer 34 | 35 | 36 | @pytest.fixture(scope="session") 37 | async def computer(): 38 | """Shared Computer instance for all test cases.""" 39 | # Create a remote Linux computer with Cua 40 | computer = Computer( 41 | os_type="linux", 42 | api_key=os.getenv("CUA_API_KEY"), 43 | name=str(os.getenv("CUA_CONTAINER_NAME")), 44 | provider_type=VMProviderType.CLOUD, 45 | ) 46 | 47 | # 
@pytest.mark.asyncio(loop_scope="session")
async def test_venv_install(computer):
    """Test virtual environment creation and package installation."""
    # Install `requests` into the shared test venv (created on demand).
    stdout, _ = await computer.venv_install("test_env", ["requests"])

    # Both a fresh install and an already-satisfied requirement count as success.
    success_markers = ("Successfully installed", "Requirement already satisfied")
    assert any(marker in stdout for marker in success_markers)

@pytest.mark.asyncio(loop_scope="session")
async def test_venv_cmd(computer):
    """Test executing shell commands in virtual environment."""
    # `python --version` prints e.g. "Python 3.11.9" on stdout.
    version_output, _ = await computer.venv_cmd("test_env", "python --version")

    assert "Python" in version_output
@pytest.mark.asyncio(loop_scope="session")
async def test_venv_exec_with_package(computer):
    """Test executing Python functions that use installed packages."""
    def report_requests_version():
        import requests
        return f"requests version: {requests.__version__}"

    result = await computer.venv_exec("test_env", report_requests_version)

    assert "requests version:" in result

@pytest.mark.asyncio(loop_scope="session")
async def test_venv_exec_error_handling(computer):
    """Test error handling in venv_exec."""
    def raise_test_error():
        raise ValueError("This is a test error")

    # Remote exceptions must be re-raised locally with their message intact.
    with pytest.raises(ValueError, match="This is a test error"):
        await computer.venv_exec("test_env", raise_test_error)

@pytest.mark.asyncio(loop_scope="session")
async def test_venv_exec_with_args_kwargs(computer):
    """Test executing Python functions with args and kwargs that return an object."""
    def build_profile(name, age, *hobbies, **metadata):
        return {
            "name": name,
            "age": age,
            "hobbies": list(hobbies),
            "metadata": metadata,
            "status": "active",
        }

    result = await computer.venv_exec(
        "test_env",
        build_profile,
        "Alice",
        25,
        "reading",
        "coding",
        location="New York",
        department="Engineering",
    )

    assert result["name"] == "Alice"
    assert result["age"] == 25
    assert result["hobbies"] == ["reading", "coding"]
    assert result["metadata"]["location"] == "New York"
    assert result["status"] == "active"
@pytest.mark.asyncio(loop_scope="session")
async def test_remote_decorator(computer):
    """Test the remote decorator from computer.helpers module."""
    # Register the shared computer so @sandboxed can resolve it implicitly.
    set_default_computer(computer)

    @sandboxed("test_env")
    def collect_runtime_info():
        import sys
        import platform
        return {
            "python_version": sys.version,
            "platform": platform.platform(),
            "success": True,
        }

    # Calling the decorated function executes it inside the sandboxed venv.
    info = await collect_runtime_info()

    assert "python_version" in info
    assert "platform" in info
    assert info["success"] == True
should have a different working directory 199 | # than the current test process 200 | assert result["working_directory"] != os.getcwd() 201 | 202 | if __name__ == "__main__": 203 | # Run tests directly 204 | pytest.main([__file__, "-v"]) 205 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/adapters/huggingfacelocal_adapter.py: -------------------------------------------------------------------------------- ```python 1 | import asyncio 2 | import functools 3 | import warnings 4 | from concurrent.futures import ThreadPoolExecutor 5 | from typing import Iterator, AsyncIterator, Dict, List, Any, Optional 6 | from litellm.types.utils import GenericStreamingChunk, ModelResponse 7 | from litellm.llms.custom_llm import CustomLLM 8 | from litellm import completion, acompletion 9 | 10 | # Try to import HuggingFace dependencies 11 | try: 12 | import torch 13 | from transformers import AutoModelForImageTextToText, AutoProcessor 14 | HF_AVAILABLE = True 15 | except ImportError: 16 | HF_AVAILABLE = False 17 | 18 | from .models import load_model as load_model_handler 19 | 20 | class HuggingFaceLocalAdapter(CustomLLM): 21 | """HuggingFace Local Adapter for running vision-language models locally.""" 22 | 23 | def __init__(self, device: str = "auto", trust_remote_code: bool = False, **kwargs): 24 | """Initialize the adapter. 25 | 26 | Args: 27 | device: Device to load model on ("auto", "cuda", "cpu", etc.) 
28 | trust_remote_code: Whether to trust remote code 29 | **kwargs: Additional arguments 30 | """ 31 | super().__init__() 32 | self.device = device 33 | self.trust_remote_code = trust_remote_code 34 | # Cache for model handlers keyed by model_name 35 | self._handlers: Dict[str, Any] = {} 36 | self._executor = ThreadPoolExecutor(max_workers=1) # Single thread pool 37 | 38 | def _get_handler(self, model_name: str): 39 | """Get or create a model handler for the given model name.""" 40 | if model_name not in self._handlers: 41 | self._handlers[model_name] = load_model_handler(model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code) 42 | return self._handlers[model_name] 43 | 44 | def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 45 | """Convert OpenAI format messages to HuggingFace format. 46 | 47 | Args: 48 | messages: Messages in OpenAI format 49 | 50 | Returns: 51 | Messages in HuggingFace format 52 | """ 53 | converted_messages = [] 54 | 55 | for message in messages: 56 | converted_message = { 57 | "role": message["role"], 58 | "content": [] 59 | } 60 | 61 | content = message.get("content", []) 62 | if isinstance(content, str): 63 | # Simple text content 64 | converted_message["content"].append({ 65 | "type": "text", 66 | "text": content 67 | }) 68 | elif isinstance(content, list): 69 | # Multi-modal content 70 | for item in content: 71 | if item.get("type") == "text": 72 | converted_message["content"].append({ 73 | "type": "text", 74 | "text": item.get("text", "") 75 | }) 76 | elif item.get("type") == "image_url": 77 | # Convert image_url format to image format 78 | image_url = item.get("image_url", {}).get("url", "") 79 | converted_message["content"].append({ 80 | "type": "image", 81 | "image": image_url 82 | }) 83 | 84 | converted_messages.append(converted_message) 85 | 86 | return converted_messages 87 | 88 | def _generate(self, **kwargs) -> str: 89 | """Generate response using the local HuggingFace 
model. 90 | 91 | Args: 92 | **kwargs: Keyword arguments containing messages and model info 93 | 94 | Returns: 95 | Generated text response 96 | """ 97 | if not HF_AVAILABLE: 98 | raise ImportError( 99 | "HuggingFace transformers dependencies not found. " 100 | "Please install with: pip install \"cua-agent[uitars-hf]\"" 101 | ) 102 | 103 | # Extract messages and model from kwargs 104 | messages = kwargs.get('messages', []) 105 | model_name = kwargs.get('model', 'ByteDance-Seed/UI-TARS-1.5-7B') 106 | max_new_tokens = kwargs.get('max_tokens', 128) 107 | 108 | # Warn about ignored kwargs 109 | ignored_kwargs = set(kwargs.keys()) - {'messages', 'model', 'max_tokens'} 110 | if ignored_kwargs: 111 | warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}") 112 | 113 | # Convert messages to HuggingFace format 114 | hf_messages = self._convert_messages(messages) 115 | 116 | # Delegate to model handler 117 | handler = self._get_handler(model_name) 118 | generated_text = handler.generate(hf_messages, max_new_tokens=max_new_tokens) 119 | return generated_text 120 | 121 | def completion(self, *args, **kwargs) -> ModelResponse: 122 | """Synchronous completion method. 123 | 124 | Returns: 125 | ModelResponse with generated text 126 | """ 127 | generated_text = self._generate(**kwargs) 128 | 129 | return completion( 130 | model=f"huggingface-local/{kwargs['model']}", 131 | mock_response=generated_text, 132 | ) 133 | 134 | async def acompletion(self, *args, **kwargs) -> ModelResponse: 135 | """Asynchronous completion method. 
136 | 137 | Returns: 138 | ModelResponse with generated text 139 | """ 140 | # Run _generate in thread pool to avoid blocking 141 | loop = asyncio.get_event_loop() 142 | generated_text = await loop.run_in_executor( 143 | self._executor, 144 | functools.partial(self._generate, **kwargs) 145 | ) 146 | 147 | return await acompletion( 148 | model=f"huggingface-local/{kwargs['model']}", 149 | mock_response=generated_text, 150 | ) 151 | 152 | def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]: 153 | """Synchronous streaming method. 154 | 155 | Returns: 156 | Iterator of GenericStreamingChunk 157 | """ 158 | generated_text = self._generate(**kwargs) 159 | 160 | generic_streaming_chunk: GenericStreamingChunk = { 161 | "finish_reason": "stop", 162 | "index": 0, 163 | "is_finished": True, 164 | "text": generated_text, 165 | "tool_use": None, 166 | "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0}, 167 | } 168 | 169 | yield generic_streaming_chunk 170 | 171 | async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]: 172 | """Asynchronous streaming method. 
173 | 174 | Returns: 175 | AsyncIterator of GenericStreamingChunk 176 | """ 177 | # Run _generate in thread pool to avoid blocking 178 | loop = asyncio.get_event_loop() 179 | generated_text = await loop.run_in_executor( 180 | self._executor, 181 | functools.partial(self._generate, **kwargs) 182 | ) 183 | 184 | generic_streaming_chunk: GenericStreamingChunk = { 185 | "finish_reason": "stop", 186 | "index": 0, 187 | "is_finished": True, 188 | "text": generated_text, 189 | "tool_use": None, 190 | "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0}, 191 | } 192 | 193 | yield generic_streaming_chunk ``` -------------------------------------------------------------------------------- /libs/python/som/som/util/utils.py: -------------------------------------------------------------------------------- ```python 1 | import easyocr 2 | import cv2 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | from PIL import Image 6 | from typing import Union, List, Tuple, Any, Optional, cast, Sequence 7 | import time 8 | import signal 9 | from contextlib import contextmanager 10 | import logging 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class TimeoutException(Exception): 16 | pass 17 | 18 | 19 | @contextmanager 20 | def timeout(seconds): 21 | def timeout_handler(signum, frame): 22 | logger.warning(f"OCR process timed out after {seconds} seconds") 23 | raise TimeoutException("OCR processing timed out") 24 | 25 | # Register the signal handler 26 | original_handler = signal.signal(signal.SIGALRM, timeout_handler) 27 | signal.alarm(seconds) 28 | 29 | try: 30 | yield 31 | finally: 32 | signal.alarm(0) 33 | signal.signal(signal.SIGALRM, original_handler) 34 | 35 | 36 | # Initialize EasyOCR with optimized settings 37 | logger.info("Initializing EasyOCR with optimized settings...") 38 | reader = easyocr.Reader( 39 | ["en"], 40 | gpu=True, # Use GPU if available 41 | model_storage_directory=None, # Use default directory 42 | 
download_enabled=True, 43 | detector=True, # Enable text detection 44 | recognizer=True, # Enable text recognition 45 | verbose=False, # Disable verbose output 46 | quantize=True, # Enable quantization for faster inference 47 | cudnn_benchmark=True, # Enable cuDNN benchmarking 48 | ) 49 | logger.info("EasyOCR initialization complete") 50 | 51 | 52 | def check_ocr_box( 53 | image_source: Union[str, Image.Image], 54 | display_img=True, 55 | output_bb_format="xywh", 56 | goal_filtering=None, 57 | easyocr_args=None, 58 | use_paddleocr=False, 59 | ) -> Tuple[Tuple[List[str], List[Tuple[float, float, float, float]]], Optional[Any]]: 60 | """Check OCR box using EasyOCR with optimized settings. 61 | 62 | Args: 63 | image_source: Either a file path or PIL Image 64 | display_img: Whether to display the annotated image 65 | output_bb_format: Format for bounding boxes ('xywh' or 'xyxy') 66 | goal_filtering: Optional filtering of results 67 | easyocr_args: Arguments for EasyOCR 68 | use_paddleocr: Ignored (kept for backward compatibility) 69 | 70 | Returns: 71 | Tuple containing: 72 | - Tuple of (text_list, bounding_boxes) 73 | - goal_filtering value 74 | """ 75 | logger.info("Starting OCR processing...") 76 | start_time = time.time() 77 | 78 | if isinstance(image_source, str): 79 | logger.info(f"Loading image from path: {image_source}") 80 | image_source = Image.open(image_source) 81 | if image_source.mode == "RGBA": 82 | logger.info("Converting RGBA image to RGB") 83 | image_source = image_source.convert("RGB") 84 | image_np = np.array(image_source) 85 | w, h = image_source.size 86 | logger.info(f"Image size: {w}x{h}") 87 | 88 | # Default EasyOCR arguments optimized for speed 89 | default_args = { 90 | "paragraph": False, # Disable paragraph detection 91 | "text_threshold": 0.5, # Confidence threshold 92 | "link_threshold": 0.4, # Text link threshold 93 | "canvas_size": 2560, # Max image size 94 | "mag_ratio": 1.0, # Magnification ratio 95 | "slope_ths": 0.1, # Slope 
threshold 96 | "ycenter_ths": 0.5, # Y-center threshold 97 | "height_ths": 0.5, # Height threshold 98 | "width_ths": 0.5, # Width threshold 99 | "add_margin": 0.1, # Margin around text 100 | "min_size": 20, # Minimum text size 101 | } 102 | 103 | # Update with user-provided arguments 104 | if easyocr_args: 105 | logger.info(f"Using custom EasyOCR arguments: {easyocr_args}") 106 | default_args.update(easyocr_args) 107 | 108 | try: 109 | # Use EasyOCR with timeout 110 | logger.info("Starting EasyOCR detection with 5 second timeout...") 111 | with timeout(5): # 5 second timeout 112 | # EasyOCR's readtext returns a list of tuples, where each tuple is (bbox, text, confidence) 113 | raw_result = reader.readtext(image_np, **default_args) 114 | result = cast(Sequence[Tuple[List[Tuple[float, float]], str, float]], raw_result) 115 | coord = [item[0] for item in result] # item[0] is the bbox coordinates 116 | text = [item[1] for item in result] # item[1] is the text content 117 | logger.info(f"OCR completed successfully. 
Found {len(text)} text regions") 118 | logger.info(f"Detected text: {text}") 119 | 120 | except TimeoutException: 121 | logger.error("OCR processing timed out after 5 seconds") 122 | coord = [] 123 | text = [] 124 | except Exception as e: 125 | logger.error(f"OCR processing failed with error: {str(e)}") 126 | coord = [] 127 | text = [] 128 | 129 | processing_time = time.time() - start_time 130 | logger.info(f"Total OCR processing time: {processing_time:.2f} seconds") 131 | 132 | if display_img: 133 | logger.info("Creating visualization of OCR results...") 134 | opencv_img = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR) 135 | bb = [] 136 | for item in coord: 137 | x, y, a, b = get_xywh(item) 138 | bb.append((x, y, a, b)) 139 | # Convert float coordinates to integers for cv2.rectangle 140 | x_val = cast(float, x) 141 | y_val = cast(float, y) 142 | a_val = cast(float, a) 143 | b_val = cast(float, b) 144 | x_int, y_int = int(x_val), int(y_val) 145 | a_int, b_int = int(a_val), int(b_val) 146 | cv2.rectangle( 147 | opencv_img, (x_int, y_int), (x_int + a_int, y_int + b_int), (0, 255, 0), 2 148 | ) 149 | plt.imshow(cv2.cvtColor(opencv_img, cv2.COLOR_BGR2RGB)) 150 | else: 151 | if output_bb_format == "xywh": 152 | bb = [get_xywh(item) for item in coord] 153 | elif output_bb_format == "xyxy": 154 | bb = [get_xyxy(item) for item in coord] 155 | 156 | # Cast the bounding boxes to the expected type 157 | bb = cast(List[Tuple[float, float, float, float]], bb) 158 | 159 | logger.info("OCR processing complete") 160 | return (text, bb), goal_filtering 161 | 162 | 163 | def get_xywh(box): 164 | """ 165 | Convert a bounding box to xywh format (x, y, width, height). 
166 | 167 | Args: 168 | box: Bounding box coordinates (various formats supported) 169 | 170 | Returns: 171 | Tuple of (x, y, width, height) 172 | """ 173 | # Handle different input formats 174 | if len(box) == 4: 175 | # If already in xywh format or xyxy format 176 | if isinstance(box[0], (int, float)) and isinstance(box[2], (int, float)): 177 | if box[2] < box[0] or box[3] < box[1]: 178 | # Already xyxy format, convert to xywh 179 | x1, y1, x2, y2 = box 180 | return x1, y1, x2 - x1, y2 - y1 181 | else: 182 | # Already in xywh format 183 | return box 184 | elif len(box) == 2: 185 | # Format like [[x1,y1],[x2,y2]] from some OCR engines 186 | (x1, y1), (x2, y2) = box 187 | return x1, y1, x2 - x1, y2 - y1 188 | 189 | # Default case - try to convert assuming it's a list of points 190 | x_coords = [p[0] for p in box] 191 | y_coords = [p[1] for p in box] 192 | x1, y1 = min(x_coords), min(y_coords) 193 | width, height = max(x_coords) - x1, max(y_coords) - y1 194 | return x1, y1, width, height 195 | 196 | 197 | def get_xyxy(box): 198 | """ 199 | Convert a bounding box to xyxy format (x1, y1, x2, y2). 200 | 201 | Args: 202 | box: Bounding box coordinates (various formats supported) 203 | 204 | Returns: 205 | Tuple of (x1, y1, x2, y2) 206 | """ 207 | # Get xywh first, then convert to xyxy 208 | x, y, w, h = get_xywh(box) 209 | return x, y, x + w, y + h 210 | ``` -------------------------------------------------------------------------------- /libs/python/agent/benchmarks/ss-v2.py: -------------------------------------------------------------------------------- ```python 1 | #!/usr/bin/env python3 2 | """ 3 | ScreenSpot-v2 Benchmark Script 4 | 5 | Evaluates models on the ScreenSpot-v2 dataset for click prediction accuracy. 6 | Supports both ComputerAgent model strings and custom model classes. 
7 | """ 8 | 9 | import argparse 10 | import asyncio 11 | import random 12 | import statistics 13 | import time 14 | from typing import Optional 15 | 16 | from datasets import load_dataset 17 | from tqdm import tqdm 18 | 19 | from utils import ( 20 | ModelWrapper, 21 | is_click_in_bbox, 22 | save_results_to_markdown, 23 | save_visualizations, 24 | get_available_models, 25 | get_gpu_memory 26 | ) 27 | 28 | 29 | async def evaluate_model(model_wrapper: ModelWrapper, samples, max_samples: Optional[int] = None) -> dict: 30 | """ 31 | Evaluate a model on any iterable of samples. 32 | 33 | Args: 34 | model_wrapper: ModelWrapper instance 35 | samples: Iterable of dicts with keys: image, bbox, instruction 36 | max_samples: Maximum number of samples to evaluate (None for all) 37 | 38 | Returns: 39 | Dictionary with evaluation results 40 | """ 41 | print(f"\nEvaluating model: {model_wrapper.model_name}") 42 | 43 | # Load model 44 | await model_wrapper.load_model() 45 | 46 | # Convert to list if needed and limit samples 47 | if hasattr(samples, '__len__'): 48 | total_samples = len(samples) 49 | if max_samples is not None: 50 | total_samples = min(max_samples, total_samples) 51 | sample_list = list(samples)[:total_samples] 52 | else: 53 | # For iterators, take max_samples or all 54 | sample_list = list(samples) 55 | if max_samples is not None: 56 | sample_list = sample_list[:max_samples] 57 | total_samples = len(sample_list) 58 | 59 | correct_predictions = 0 60 | error_predictions = 0 61 | results = [] 62 | 63 | for i, sample in enumerate(tqdm(sample_list, desc=f"Evaluating {model_wrapper.model_name}")): 64 | # Extract required data (only these 3 keys matter) 65 | image = sample['image'] 66 | instruction = sample['instruction'] 67 | bbox = sample['bbox'] # [x1, y1, x2, y2] 68 | 69 | # Predict click coordinates with timing 70 | start_time = time.time() 71 | click_coords = await model_wrapper.predict_click(image, instruction) 72 | prediction_time = time.time() - start_time 73 | 74 
| # Check if prediction is correct 75 | is_correct = is_click_in_bbox(click_coords, bbox) 76 | 77 | if is_correct: 78 | correct_predictions += 1 79 | 80 | results.append({ 81 | 'sample_idx': i, 82 | 'instruction': instruction, 83 | 'bbox': bbox, 84 | 'predicted_coords': click_coords, 85 | 'is_correct': is_correct, 86 | 'failed': False, 87 | 'prediction_time': prediction_time 88 | }) 89 | 90 | # Unload model 91 | await model_wrapper.unload_model() 92 | 93 | # Calculate metrics 94 | accuracy = correct_predictions / total_samples if total_samples > 0 else 0.0 95 | error_rate = error_predictions / total_samples if total_samples > 0 else 0.0 96 | 97 | # Calculate timing statistics 98 | successful_times = [r['prediction_time'] for r in results if not r['failed']] 99 | avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0 100 | median_prediction_time = statistics.median(successful_times) if successful_times else 0.0 101 | min_prediction_time = min(successful_times) if successful_times else 0.0 102 | max_prediction_time = max(successful_times) if successful_times else 0.0 103 | 104 | # Get VRAM statistics 105 | vram_stats = model_wrapper.get_vram_stats() 106 | 107 | return { 108 | 'model_name': model_wrapper.model_name, 109 | 'total_samples': total_samples, 110 | 'correct_predictions': correct_predictions, 111 | 'failed_predictions': error_predictions, 112 | 'accuracy': accuracy, 113 | 'failure_rate': error_rate, 114 | 'avg_prediction_time': avg_prediction_time, 115 | 'median_prediction_time': median_prediction_time, 116 | 'min_prediction_time': min_prediction_time, 117 | 'max_prediction_time': max_prediction_time, 118 | 'vram_max_mb': vram_stats['max_mb'], 119 | 'vram_avg_mb': vram_stats['avg_mb'], 120 | 'results': results 121 | } 122 | 123 | 124 | async def main(): 125 | """ 126 | Main function to run the benchmark. 
127 | """ 128 | # Parse command line arguments 129 | parser = argparse.ArgumentParser(description='ScreenSpot-v2 Benchmark Script') 130 | parser.add_argument('--samples', type=int, default=500, 131 | help='Number of samples to evaluate (default: 500)') 132 | parser.add_argument('--seed', type=int, default=42, 133 | help='Random seed for shuffling (default: 42)') 134 | args = parser.parse_args() 135 | 136 | # Set random seed 137 | random.seed(args.seed) 138 | 139 | # Load dataset 140 | print("Loading ScreenSpot-v2 dataset...") 141 | ds = load_dataset("lmms-lab/ScreenSpot-v2") 142 | dataset = ds['train'] # type: ignore 143 | # Convert to simple list of dicts with only required keys 144 | samples = [] 145 | for item in dataset: 146 | # Convert dataset item to dict if needed 147 | item_dict = dict(item) if hasattr(item, 'keys') else item 148 | 149 | # Convert ScreenSpot-v2 bbox format [x, y, w, h] to [x1, y1, x2, y2] 150 | bbox_xywh = item_dict['bbox'] # type: ignore 151 | x, y, w, h = bbox_xywh 152 | bbox_xyxy = [x, y, x + w, y + h] 153 | 154 | samples.append({ 155 | 'image': item_dict['image'], # type: ignore 156 | 'instruction': item_dict['instruction'], # type: ignore 157 | 'bbox': bbox_xyxy 158 | }) 159 | print(f"Dataset loaded: {len(samples)} samples") 160 | 161 | # Shuffle samples with seed 162 | random.shuffle(samples) 163 | print(f"Samples shuffled with seed {args.seed}") 164 | 165 | # Get available models 166 | models = get_available_models() 167 | 168 | # Evaluation settings 169 | max_samples = args.samples # Use command line argument 170 | 171 | # Run evaluations 172 | all_results = [] 173 | 174 | for model in models: 175 | model_wrapper = ModelWrapper(model) 176 | result = await evaluate_model(model_wrapper, samples, max_samples) 177 | all_results.append(result) 178 | 179 | # Print summary 180 | print(f"\n{result['model_name']} Results:") 181 | print(f" Accuracy: {result['accuracy']*100:.2f}%") 182 | print(f" Correct: 
{result['correct_predictions']}/{result['total_samples']}") 183 | print(f" Errors: {result['failed_predictions']}") 184 | print(f" Error Rate: {result['failure_rate']*100:.2f}%") 185 | print(f" Avg Time: {result['avg_prediction_time']:.2f}s") 186 | print(f" Median Time: {result['median_prediction_time']:.2f}s") 187 | print(f" Time Range: {result['min_prediction_time']:.2f}s - {result['max_prediction_time']:.2f}s") 188 | print(f" VRAM Max: {result['vram_max_mb']:.1f}MB") 189 | print(f" VRAM Avg: {result['vram_avg_mb']:.1f}MB") 190 | 191 | # Print GPU memory info 192 | gpu_memory = get_gpu_memory() 193 | if gpu_memory and gpu_memory[0] > 0: 194 | print(f" GPU Free Memory: {gpu_memory[0]:.1f}MB") 195 | 196 | # Save results 197 | if all_results: 198 | save_results_to_markdown(all_results, "screenspot_v2_results.md", title="ScreenSpot-v2 Benchmark Results") 199 | save_visualizations(all_results, samples) 200 | print("\nBenchmark completed successfully!") 201 | else: 202 | print("\nNo successful evaluations completed.") 203 | 204 | 205 | if __name__ == "__main__": 206 | asyncio.run(main()) ``` -------------------------------------------------------------------------------- /blog/ubuntu-docker-support.md: -------------------------------------------------------------------------------- ```markdown 1 | # Ubuntu Docker Support in Cua with Kasm 2 | 3 | *Published Aug 26, 2025 by Francesco Bonacci* 4 | 5 | Today we’re shipping **Ubuntu Docker support** in Cua. You get a full Linux desktop inside a Docker container, viewable right in your browser—no VM spin-up, no extra clients. It behaves the same on macOS, Windows, and Linux. 
6 | 7 | <img src="./assets/docker-ubuntu-support.png" alt="Cua + KasmVNC Ubuntu container desktop"> 8 | 9 | ## Why we did this 10 | 11 | If you build automation or RL workflows with Cua, you’ve probably run into the usual platform walls: macOS VMs (via Lume) are Apple-Silicon only; Windows Sandbox needs Pro/Enterprise; giving agents your host desktop is… exciting, but risky; and little OS quirks make “build once, run anywhere” harder than it should be. 12 | 13 | We wanted something lightweight, isolated, and identical across machines. So we put a desktop in a container. 14 | 15 | ## Why we didn’t use QEMU/KVM 16 | 17 | Short answer: **portability, startup time, and ops friction.** 18 | 19 | * **Runs everywhere, no hypervisor drama.** KVM needs Linux; Hyper-V/Virtualization.Framework setups vary by host and policy. Docker is ubiquitous across macOS/Windows/Linux and allowed in most CI runners—so your GUI env actually runs where your team works. 20 | * **Faster boot & smaller footprints.** Containers cold-start in seconds and images are GB-scale; VMs tend to be minutes and tens of GB. That matters for parallel agents, CI, and local iteration. 21 | * **Lower ops overhead.** No nested virt, kernel modules, or privileged host tweaks that many orgs (and cloud runners) block. Pull → run → browser. 22 | * **Same image, everywhere.** One Docker image gives you an identical desktop on every dev laptop and in CI. 23 | * **Web-first access out of the box.** KasmVNC serves the desktop over HTTP—no extra VNC/RDP clients or SPICE config. 24 | 25 | **When we *do* reach for QEMU/KVM:** 26 | 27 | * You need **true OS isolation** or to run **non-Linux** guests. 28 | * You want **kernel-level features** or **device/GPU passthrough** (VFIO). 29 | * You’re optimizing for **hardware realism** over startup speed and density. 30 | 31 | For this release, the goal was a **cross-platform Linux desktop that feels instant and identical** across local dev and CI. 
Containers + KasmVNC hit that sweet spot. 32 | 33 | ## What we built 34 | 35 | Under the hood it’s **KasmVNC + Ubuntu 22.04 (Xfce) in Docker**, pre-configured for computer-use automation. You get a proper GUI desktop served over HTTP (no VNC/RDP client), accessible from any modern browser. Cua’s Computer server boots automatically so your agents can connect immediately. 36 | 37 | ### How it works (at a glance) 38 | 39 | ``` 40 | Your System 41 | └─ Docker Container 42 | └─ Xfce Desktop + KasmVNC → open in your browser 43 | ``` 44 | 45 | --- 46 | 47 | ## Quick start 48 | 49 | 1. **Install Docker** — Docker Desktop (macOS/Windows) or Docker Engine (Linux). 50 | 51 | 2. **Pull or build the image** 52 | 53 | ```bash 54 | # Pull (recommended) 55 | docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest 56 | 57 | # Or build locally 58 | cd libs/kasm 59 | docker build -t cua-ubuntu:latest . 60 | ``` 61 | 62 | 3. **Run with Cua’s Computer SDK** 63 | 64 | ```python 65 | from computer import Computer 66 | 67 | computer = Computer( 68 | os_type="linux", 69 | provider_type="docker", 70 | image="trycua/cua-ubuntu:latest", 71 | name="my-automation-container" 72 | ) 73 | 74 | await computer.run() 75 | ``` 76 | 77 | ### Make an agent that drives this desktop 78 | 79 | ```python 80 | from agent import ComputerAgent 81 | 82 | # assumes `computer` is the instance created above 83 | agent = ComputerAgent("openrouter/z-ai/glm-4.5v", tools=[computer]) 84 | 85 | async for _ in agent.run("Click on the search bar and type 'hello world'"): 86 | pass 87 | ``` 88 | 89 | > Use any VLM with tool use; just make sure your OpenRouter creds are set. 90 | 91 | By default you land on **Ubuntu 22.04 + Xfce** with a browser and desktop basics, the **Computer server** is running, the **web viewer** is available at `http://localhost:8006`, and common automation tools are preinstalled. 
92 | 93 | --- 94 | 95 | ## What’s inside (in plain English) 96 | 97 | A tidy Linux desktop with web access through **KasmVNC**, Python 3.11 and dev tools, plus utilities you’ll actually use for automation—`wmctrl` for windows, `xclip` for clipboard, `ffmpeg` for media, screenshot helpers, and so on. It starts as a **non-root `kasm-user`**, lives in an **isolated filesystem** (unless you mount volumes), and ships with **SSL off for local dev** so you terminate TLS upstream when you deploy. 98 | 99 | --- 100 | 101 | ## How it compares 102 | 103 | | Feature | KasmVNC Docker | Lume (macOS VM) | Windows Sandbox | 104 | | ---------------- | --------------------- | --------------------- | ---------------------- | 105 | | Platform support | macOS, Windows, Linux | macOS (Apple Silicon) | Windows Pro/Enterprise | 106 | | Resource usage | Low (container) | Medium (full VM) | Medium (full VM) | 107 | | Setup time | \~30s | 2–5 min | 1–2 min | 108 | | GUI desktop | Linux | macOS | Windows | 109 | | Web access | Browser (no client) | Typically VNC client | Typically RDP client | 110 | | Consistency | Same everywhere | Hardware-dependent | OS-dependent | 111 | 112 | **Use KasmVNC Docker when…** you want the **same GUI env across devs/CI/platforms**, you’re doing **RL or end-to-end GUI tests**, or you need **many isolated desktops on one machine**. 113 | **Use alternatives when…** you need native **macOS** (→ Lume) or native **Windows** (→ Windows Sandbox). 
114 | 115 | --- 116 | 117 | ## Using the Agent Framework (parallel example) 118 | 119 | A compact pattern for running multiple desktops and agents side-by-side: 120 | 121 | ```python 122 | import asyncio 123 | from computer import Computer 124 | from agent import ComputerAgent 125 | 126 | # Create multiple computer instances (each gets its own desktop) 127 | computers = [] 128 | for i in range(3): 129 | c = Computer( 130 | os_type="linux", 131 | provider_type="docker", 132 | image="trycua/cua-ubuntu:latest", 133 | name=f"parallel-desktop-{i}" 134 | ) 135 | computers.append(c) 136 | await c.run() 137 | 138 | # Pair each desktop with a task 139 | tasks = [ 140 | "open github and search for 'trycua/cua'", 141 | "open a text editor and write 'hello world'", 142 | "open the browser and go to google.com", 143 | ] 144 | 145 | agents = [ 146 | ComputerAgent(model="openrouter/z-ai/glm-4.5v", tools=[c]) 147 | for c in computers 148 | ] 149 | 150 | async def run_agent(agent, task): 151 | async for _ in agent.run(task): 152 | pass 153 | 154 | await asyncio.gather(*[run_agent(a, t) for a, t in zip(agents, tasks)]) 155 | ``` 156 | 157 | --- 158 | 159 | ## What’s next 160 | 161 | We’re polishing a **CLI to push/scale these containers on Cua Cloud**, exploring **GPU acceleration** for in-container inference, and publishing **prebuilt images** for Playwright, Selenium, and friends. 
162 | 163 | --- 164 | 165 | ## Try it 166 | 167 | ```python 168 | from computer import Computer 169 | computer = Computer(os_type="linux", provider_type="docker", image="trycua/cua-ubuntu:latest") 170 | await computer.run() 171 | ``` 172 | 173 | --- 174 | 175 | ## Links 176 | 177 | * **Docker Provider Docs:** [https://docs.trycua.com/computers/docker](https://docs.trycua.com/computers/docker) 178 | * **KasmVNC:** [https://github.com/kasmtech/KasmVNC](https://github.com/kasmtech/KasmVNC) 179 | * **Container Source:** [https://github.com/trycua/cua/tree/main/libs/kasm](https://github.com/trycua/cua/tree/main/libs/kasm) 180 | * **Computer SDK:** [https://docs.trycua.com/docs/computer-sdk/computers](https://docs.trycua.com/docs/computer-sdk/computers) 181 | * **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai) 182 | 183 | Questions or weird edge cases? Ping us on Discord—we’re curious to see what you build. ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/callbacks/telemetry.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Telemetry callback handler for Computer-Use Agent (cua-agent) 3 | """ 4 | 5 | import time 6 | import uuid 7 | from typing import List, Dict, Any, Optional, Union 8 | 9 | from .base import AsyncCallbackHandler 10 | from core.telemetry import ( 11 | record_event, 12 | is_telemetry_enabled, 13 | ) 14 | 15 | import platform 16 | 17 | SYSTEM_INFO = { 18 | "os": platform.system().lower(), 19 | "os_version": platform.release(), 20 | "python_version": platform.python_version(), 21 | } 22 | 23 | class TelemetryCallback(AsyncCallbackHandler): 24 | """ 25 | Telemetry callback handler for Computer-Use Agent (cua-agent) 26 | 27 | Tracks agent usage, performance metrics, and optionally trajectory data. 
28 | """ 29 | 30 | def __init__( 31 | self, 32 | agent, 33 | log_trajectory: bool = False 34 | ): 35 | """ 36 | Initialize telemetry callback. 37 | 38 | Args: 39 | agent: The ComputerAgent instance 40 | log_trajectory: Whether to log full trajectory items (opt-in) 41 | """ 42 | self.agent = agent 43 | self.log_trajectory = log_trajectory 44 | 45 | # Generate session/run IDs 46 | self.session_id = str(uuid.uuid4()) 47 | self.run_id = None 48 | 49 | # Track timing and metrics 50 | self.run_start_time = None 51 | self.step_count = 0 52 | self.step_start_time = None 53 | self.total_usage = { 54 | "prompt_tokens": 0, 55 | "completion_tokens": 0, 56 | "total_tokens": 0, 57 | "response_cost": 0.0 58 | } 59 | 60 | # Record agent initialization 61 | if is_telemetry_enabled(): 62 | self._record_agent_initialization() 63 | 64 | def _record_agent_initialization(self) -> None: 65 | """Record agent type/model and session initialization.""" 66 | agent_info = { 67 | "session_id": self.session_id, 68 | "agent_type": self.agent.agent_loop.__name__ if hasattr(self.agent, 'agent_loop') else 'unknown', 69 | "model": getattr(self.agent, 'model', 'unknown'), 70 | **SYSTEM_INFO 71 | } 72 | 73 | record_event("agent_session_start", agent_info) 74 | 75 | async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None: 76 | """Called at the start of an agent run loop.""" 77 | if not is_telemetry_enabled(): 78 | return 79 | 80 | self.run_id = str(uuid.uuid4()) 81 | self.run_start_time = time.time() 82 | self.step_count = 0 83 | 84 | # Calculate input context size 85 | input_context_size = self._calculate_context_size(old_items) 86 | 87 | run_data = { 88 | "session_id": self.session_id, 89 | "run_id": self.run_id, 90 | "start_time": self.run_start_time, 91 | "input_context_size": input_context_size, 92 | "num_existing_messages": len(old_items) 93 | } 94 | 95 | # Log trajectory if opted in 96 | if self.log_trajectory: 97 | trajectory = 
self._extract_trajectory(old_items) 98 | if trajectory: 99 | run_data["uploaded_trajectory"] = trajectory 100 | 101 | record_event("agent_run_start", run_data) 102 | 103 | async def on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None: 104 | """Called at the end of an agent run loop.""" 105 | if not is_telemetry_enabled() or not self.run_start_time: 106 | return 107 | 108 | run_duration = time.time() - self.run_start_time 109 | 110 | run_data = { 111 | "session_id": self.session_id, 112 | "run_id": self.run_id, 113 | "end_time": time.time(), 114 | "duration_seconds": run_duration, 115 | "num_steps": self.step_count, 116 | "total_usage": self.total_usage.copy() 117 | } 118 | 119 | # Log trajectory if opted in 120 | if self.log_trajectory: 121 | trajectory = self._extract_trajectory(new_items) 122 | if trajectory: 123 | run_data["uploaded_trajectory"] = trajectory 124 | 125 | record_event("agent_run_end", run_data) 126 | 127 | async def on_usage(self, usage: Dict[str, Any]) -> None: 128 | """Called when usage information is received.""" 129 | if not is_telemetry_enabled(): 130 | return 131 | 132 | # Accumulate usage stats 133 | self.total_usage["prompt_tokens"] += usage.get("prompt_tokens", 0) 134 | self.total_usage["completion_tokens"] += usage.get("completion_tokens", 0) 135 | self.total_usage["total_tokens"] += usage.get("total_tokens", 0) 136 | self.total_usage["response_cost"] += usage.get("response_cost", 0.0) 137 | 138 | # Record individual usage event 139 | usage_data = { 140 | "session_id": self.session_id, 141 | "run_id": self.run_id, 142 | "step": self.step_count, 143 | **usage 144 | } 145 | 146 | record_event("agent_usage", usage_data) 147 | 148 | async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None: 149 | """Called when responses are received.""" 150 | if not is_telemetry_enabled(): 151 | return 152 | 153 | self.step_count += 1 154 | step_duration = None 155 
| 156 | if self.step_start_time: 157 | step_duration = time.time() - self.step_start_time 158 | 159 | self.step_start_time = time.time() 160 | 161 | step_data = { 162 | "session_id": self.session_id, 163 | "run_id": self.run_id, 164 | "step": self.step_count, 165 | "timestamp": self.step_start_time 166 | } 167 | 168 | if step_duration is not None: 169 | step_data["duration_seconds"] = step_duration 170 | 171 | record_event("agent_step", step_data) 172 | 173 | def _calculate_context_size(self, items: List[Dict[str, Any]]) -> int: 174 | """Calculate approximate context size in tokens/characters.""" 175 | total_size = 0 176 | 177 | for item in items: 178 | if item.get("type") == "message" and "content" in item: 179 | content = item["content"] 180 | if isinstance(content, str): 181 | total_size += len(content) 182 | elif isinstance(content, list): 183 | for part in content: 184 | if isinstance(part, dict) and "text" in part: 185 | total_size += len(part["text"]) 186 | elif "content" in item and isinstance(item["content"], str): 187 | total_size += len(item["content"]) 188 | 189 | return total_size 190 | 191 | def _extract_trajectory(self, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 192 | """Extract trajectory items that should be logged.""" 193 | trajectory = [] 194 | 195 | for item in items: 196 | # Include user messages, assistant messages, reasoning, computer calls, and computer outputs 197 | if ( 198 | item.get("role") == "user" or # User inputs 199 | (item.get("type") == "message" and item.get("role") == "assistant") or # Model outputs 200 | item.get("type") == "reasoning" or # Reasoning traces 201 | item.get("type") == "computer_call" or # Computer actions 202 | item.get("type") == "computer_call_output" # Computer outputs 203 | ): 204 | # Create a copy of the item with timestamp 205 | trajectory_item = item.copy() 206 | trajectory_item["logged_at"] = time.time() 207 | trajectory.append(trajectory_item) 208 | 209 | return trajectory ``` 
-------------------------------------------------------------------------------- /libs/python/computer-server/computer_server/handlers/base.py: -------------------------------------------------------------------------------- ```python 1 | from abc import ABC, abstractmethod 2 | from typing import Optional, Dict, Any, List, Tuple 3 | 4 | class BaseAccessibilityHandler(ABC): 5 | """Abstract base class for OS-specific accessibility handlers.""" 6 | 7 | @abstractmethod 8 | async def get_accessibility_tree(self) -> Dict[str, Any]: 9 | """Get the accessibility tree of the current window.""" 10 | pass 11 | 12 | @abstractmethod 13 | async def find_element(self, role: Optional[str] = None, 14 | title: Optional[str] = None, 15 | value: Optional[str] = None) -> Dict[str, Any]: 16 | """Find an element in the accessibility tree by criteria.""" 17 | pass 18 | 19 | class BaseFileHandler(ABC): 20 | """Abstract base class for OS-specific file handlers.""" 21 | 22 | @abstractmethod 23 | async def file_exists(self, path: str) -> Dict[str, Any]: 24 | """Check if a file exists at the specified path.""" 25 | pass 26 | 27 | @abstractmethod 28 | async def directory_exists(self, path: str) -> Dict[str, Any]: 29 | """Check if a directory exists at the specified path.""" 30 | pass 31 | 32 | @abstractmethod 33 | async def list_dir(self, path: str) -> Dict[str, Any]: 34 | """List the contents of a directory.""" 35 | pass 36 | 37 | @abstractmethod 38 | async def read_text(self, path: str) -> Dict[str, Any]: 39 | """Read the text contents of a file.""" 40 | pass 41 | 42 | @abstractmethod 43 | async def write_text(self, path: str, content: str) -> Dict[str, Any]: 44 | """Write text content to a file.""" 45 | pass 46 | 47 | @abstractmethod 48 | async def write_bytes(self, path: str, content_b64: str) -> Dict[str, Any]: 49 | """Write binary content to a file. 
Sent over the websocket as a base64 string.""" 50 | pass 51 | 52 | @abstractmethod 53 | async def delete_file(self, path: str) -> Dict[str, Any]: 54 | """Delete a file.""" 55 | pass 56 | 57 | @abstractmethod 58 | async def create_dir(self, path: str) -> Dict[str, Any]: 59 | """Create a directory.""" 60 | pass 61 | 62 | @abstractmethod 63 | async def delete_dir(self, path: str) -> Dict[str, Any]: 64 | """Delete a directory.""" 65 | pass 66 | 67 | @abstractmethod 68 | async def read_bytes(self, path: str, offset: int = 0, length: Optional[int] = None) -> Dict[str, Any]: 69 | """Read the binary contents of a file. Sent over the websocket as a base64 string. 70 | 71 | Args: 72 | path: Path to the file 73 | offset: Byte offset to start reading from (default: 0) 74 | length: Number of bytes to read (default: None for entire file) 75 | """ 76 | pass 77 | 78 | @abstractmethod 79 | async def get_file_size(self, path: str) -> Dict[str, Any]: 80 | """Get the size of a file in bytes.""" 81 | pass 82 | 83 | class BaseAutomationHandler(ABC): 84 | """Abstract base class for OS-specific automation handlers. 
85 | 86 | Categories: 87 | - Mouse Actions: Methods for mouse control 88 | - Keyboard Actions: Methods for keyboard input 89 | - Scrolling Actions: Methods for scrolling 90 | - Screen Actions: Methods for screen interaction 91 | - Clipboard Actions: Methods for clipboard operations 92 | """ 93 | 94 | # Mouse Actions 95 | @abstractmethod 96 | async def mouse_down(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> Dict[str, Any]: 97 | """Perform a mouse down at the current or specified position.""" 98 | pass 99 | 100 | @abstractmethod 101 | async def mouse_up(self, x: Optional[int] = None, y: Optional[int] = None, button: str = "left") -> Dict[str, Any]: 102 | """Perform a mouse up at the current or specified position.""" 103 | pass 104 | 105 | @abstractmethod 106 | async def left_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]: 107 | """Perform a left click at the current or specified position.""" 108 | pass 109 | 110 | @abstractmethod 111 | async def right_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]: 112 | """Perform a right click at the current or specified position.""" 113 | pass 114 | 115 | @abstractmethod 116 | async def double_click(self, x: Optional[int] = None, y: Optional[int] = None) -> Dict[str, Any]: 117 | """Perform a double click at the current or specified position.""" 118 | pass 119 | 120 | @abstractmethod 121 | async def move_cursor(self, x: int, y: int) -> Dict[str, Any]: 122 | """Move the cursor to the specified position.""" 123 | pass 124 | 125 | @abstractmethod 126 | async def drag_to(self, x: int, y: int, button: str = "left", duration: float = 0.5) -> Dict[str, Any]: 127 | """Drag the cursor from current position to specified coordinates. 
128 | 129 | Args: 130 | x: The x coordinate to drag to 131 | y: The y coordinate to drag to 132 | button: The mouse button to use ('left', 'middle', 'right') 133 | duration: How long the drag should take in seconds 134 | """ 135 | pass 136 | 137 | @abstractmethod 138 | async def drag(self, path: List[Tuple[int, int]], button: str = "left", duration: float = 0.5) -> Dict[str, Any]: 139 | """Drag the cursor from current position to specified coordinates. 140 | 141 | Args: 142 | path: A list of tuples of x and y coordinates to drag to 143 | button: The mouse button to use ('left', 'middle', 'right') 144 | duration: How long the drag should take in seconds 145 | """ 146 | pass 147 | 148 | # Keyboard Actions 149 | @abstractmethod 150 | async def key_down(self, key: str) -> Dict[str, Any]: 151 | """Press and hold the specified key.""" 152 | pass 153 | 154 | @abstractmethod 155 | async def key_up(self, key: str) -> Dict[str, Any]: 156 | """Release the specified key.""" 157 | pass 158 | 159 | @abstractmethod 160 | async def type_text(self, text: str) -> Dict[str, Any]: 161 | """Type the specified text.""" 162 | pass 163 | 164 | @abstractmethod 165 | async def press_key(self, key: str) -> Dict[str, Any]: 166 | """Press the specified key.""" 167 | pass 168 | 169 | @abstractmethod 170 | async def hotkey(self, keys: List[str]) -> Dict[str, Any]: 171 | """Press a combination of keys together.""" 172 | pass 173 | 174 | # Scrolling Actions 175 | @abstractmethod 176 | async def scroll(self, x: int, y: int) -> Dict[str, Any]: 177 | """Scroll the specified amount.""" 178 | pass 179 | 180 | @abstractmethod 181 | async def scroll_down(self, clicks: int = 1) -> Dict[str, Any]: 182 | """Scroll down by the specified number of clicks.""" 183 | pass 184 | 185 | @abstractmethod 186 | async def scroll_up(self, clicks: int = 1) -> Dict[str, Any]: 187 | """Scroll up by the specified number of clicks.""" 188 | pass 189 | 190 | # Screen Actions 191 | @abstractmethod 192 | async def 
screenshot(self) -> Dict[str, Any]: 193 | """Take a screenshot and return base64 encoded image data.""" 194 | pass 195 | 196 | @abstractmethod 197 | async def get_screen_size(self) -> Dict[str, Any]: 198 | """Get the screen size of the VM.""" 199 | pass 200 | 201 | @abstractmethod 202 | async def get_cursor_position(self) -> Dict[str, Any]: 203 | """Get the current cursor position.""" 204 | pass 205 | 206 | # Clipboard Actions 207 | @abstractmethod 208 | async def copy_to_clipboard(self) -> Dict[str, Any]: 209 | """Get the current clipboard content.""" 210 | pass 211 | 212 | @abstractmethod 213 | async def set_clipboard(self, text: str) -> Dict[str, Any]: 214 | """Set the clipboard content.""" 215 | pass 216 | 217 | @abstractmethod 218 | async def run_command(self, command: str) -> Dict[str, Any]: 219 | """Run a command and return the output.""" 220 | pass ``` -------------------------------------------------------------------------------- /Development.md: -------------------------------------------------------------------------------- ```markdown 1 | # Getting Started 2 | 3 | ## Project Structure 4 | 5 | The project is organized as a monorepo with these main packages: 6 | 7 | - `libs/core/` - Base package with telemetry support 8 | - `libs/computer/` - Computer-use interface (CUI) library 9 | - `libs/agent/` - AI agent library with multi-provider support 10 | - `libs/som/` - Set-of-Mark parser 11 | - `libs/computer-server/` - Server component for VM 12 | - `libs/lume/` - Lume CLI 13 | - `libs/pylume/` - Python bindings for Lume 14 | 15 | Each package has its own virtual environment and dependencies, managed through PDM. 16 | 17 | ## Local Development Setup 18 | 19 | 1. Install Lume CLI: 20 | 21 | ```bash 22 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)" 23 | ``` 24 | 25 | 2. Clone the repository: 26 | 27 | ```bash 28 | git clone https://github.com/trycua/cua.git 29 | cd cua 30 | ``` 31 | 32 | 3. 
Create a `.env.local` file in the root directory with your API keys: 33 | 34 | ```bash 35 | # Required for Anthropic provider 36 | ANTHROPIC_API_KEY=your_anthropic_key_here 37 | 38 | # Required for OpenAI provider 39 | OPENAI_API_KEY=your_openai_key_here 40 | ``` 41 | 42 | 4. Open the workspace in VSCode or Cursor: 43 | 44 | ```bash 45 | # For Cua Python development 46 | code .vscode/py.code-workspace 47 | 48 | # For Lume (Swift) development 49 | code .vscode/lume.code-workspace 50 | ``` 51 | 52 | Using the workspace file is strongly recommended as it: 53 | 54 | - Sets up correct Python environments for each package 55 | - Configures proper import paths 56 | - Enables debugging configurations 57 | - Maintains consistent settings across packages 58 | 59 | ## Lume Development 60 | 61 | Refer to the [Lume README](./libs/lume/Development.md) for instructions on how to develop the Lume CLI. 62 | 63 | ## Python Development 64 | 65 | There are two ways to set up the Python development environment: 66 | 67 | ### Run the build script 68 | 69 | Run the build script to set up all packages: 70 | 71 | ```bash 72 | ./scripts/build.sh 73 | ``` 74 | 75 | The build script creates a shared virtual environment for all packages. The workspace configuration automatically handles import paths with the correct Python path settings. 76 | 77 | This will: 78 | 79 | - Create a virtual environment for the project 80 | - Install all packages in development mode 81 | - Set up the correct Python path 82 | - Install development tools 83 | 84 | ### Install with PDM 85 | 86 | If PDM is not already installed, you can follow the installation instructions [here](https://pdm-project.org/en/latest/#installation). 87 | 88 | To install with PDM, simply run: 89 | 90 | ```console 91 | pdm install -G:all 92 | ``` 93 | 94 | This installs all the dependencies for development, testing, and building the docs.
If you'd only like development dependencies, you can run: 95 | 96 | ```console 97 | pdm install -d 98 | ``` 99 | 100 | ## Running Examples 101 | 102 | The Python workspace includes launch configurations for all packages: 103 | 104 | - "Run Computer Examples" - Runs computer examples 105 | - "Run Agent Examples" - Runs agent examples 106 | - "SOM" configurations - Various settings for running SOM 107 | 108 | To run examples from VSCode / Cursor: 109 | 110 | 1. Press F5 or use the Run/Debug view 111 | 2. Select the desired configuration 112 | 113 | The workspace also includes compound launch configurations: 114 | 115 | - "Run Computer Examples + Server" - Runs both the Computer Examples and Server simultaneously 116 | 117 | ## Docker Development Environment 118 | 119 | As an alternative to installing directly on your host machine, you can use Docker for development. This approach has several advantages: 120 | 121 | ### Prerequisites 122 | 123 | - Docker installed on your machine 124 | - Lume server running on your host (port 7777): `lume serve` 125 | 126 | ### Setup and Usage 127 | 128 | 1. Build the development Docker image: 129 | 130 | ```bash 131 | ./scripts/run-docker-dev.sh build 132 | ``` 133 | 134 | 2. Run an example in the container: 135 | 136 | ```bash 137 | ./scripts/run-docker-dev.sh run computer_examples.py 138 | ``` 139 | 140 | 3. Get an interactive shell in the container: 141 | 142 | ```bash 143 | ./scripts/run-docker-dev.sh run --interactive 144 | ``` 145 | 146 | 4. 
Stop any running containers: 147 | 148 | ```bash 149 | ./scripts/run-docker-dev.sh stop 150 | ``` 151 | 152 | ### How it Works 153 | 154 | The Docker development environment: 155 | 156 | - Installs all required Python dependencies in the container 157 | - Mounts your source code from the host at runtime 158 | - Automatically configures the connection to use host.docker.internal:7777 for accessing the Lume server on your host machine 159 | - Preserves your code changes without requiring rebuilds (source code is mounted as a volume) 160 | 161 | > **Note**: The Docker container doesn't include the macOS-specific Lume executable. Instead, it connects to the Lume server running on your host machine via host.docker.internal:7777. Make sure to start the Lume server on your host before running examples in the container. 162 | 163 | ## Cleanup and Reset 164 | 165 | If you need to clean up the environment (non-docker) and start fresh: 166 | 167 | ```bash 168 | ./scripts/cleanup.sh 169 | ``` 170 | 171 | This will: 172 | 173 | - Remove all virtual environments 174 | - Clean Python cache files and directories 175 | - Remove build artifacts 176 | - Clean PDM-related files 177 | - Reset environment configurations 178 | 179 | ## Code Formatting Standards 180 | 181 | The cua project follows strict code formatting standards to ensure consistency across all packages. 182 | 183 | ### Python Code Formatting 184 | 185 | #### Tools 186 | 187 | The project uses the following tools for code formatting and linting: 188 | 189 | - **[Black](https://black.readthedocs.io/)**: Code formatter 190 | - **[Ruff](https://beta.ruff.rs/docs/)**: Fast linter and formatter 191 | - **[MyPy](https://mypy.readthedocs.io/)**: Static type checker 192 | 193 | These tools are automatically installed when you set up the development environment using the `./scripts/build.sh` script. 
194 | 195 | #### Configuration 196 | 197 | The formatting configuration is defined in the root `pyproject.toml` file: 198 | 199 | ```toml 200 | [tool.black] 201 | line-length = 100 202 | target-version = ["py311"] 203 | 204 | [tool.ruff] 205 | line-length = 100 206 | target-version = "py311" 207 | select = ["E", "F", "B", "I"] 208 | fix = true 209 | 210 | [tool.ruff.format] 211 | docstring-code-format = true 212 | 213 | [tool.mypy] 214 | strict = true 215 | python_version = "3.11" 216 | ignore_missing_imports = true 217 | disallow_untyped_defs = true 218 | check_untyped_defs = true 219 | warn_return_any = true 220 | show_error_codes = true 221 | warn_unused_ignores = false 222 | ``` 223 | 224 | #### Key Formatting Rules 225 | 226 | - **Line Length**: Maximum of 100 characters 227 | - **Python Version**: Code should be compatible with Python 3.11+ 228 | - **Imports**: Automatically sorted (using Ruff's "I" rule) 229 | - **Type Hints**: Required for all function definitions (strict mypy mode) 230 | 231 | #### IDE Integration 232 | 233 | The repository includes VSCode workspace configurations that enable automatic formatting. When you open the workspace files (as recommended in the setup instructions), the correct formatting settings are automatically applied. 234 | 235 | Python-specific settings in the workspace files: 236 | 237 | ```json 238 | "[python]": { 239 | "editor.formatOnSave": true, 240 | "editor.defaultFormatter": "ms-python.black-formatter", 241 | "editor.codeActionsOnSave": { 242 | "source.organizeImports": "explicit" 243 | } 244 | } 245 | ``` 246 | 247 | Recommended VS Code extensions: 248 | 249 | - Black Formatter (ms-python.black-formatter) 250 | - Ruff (charliermarsh.ruff) 251 | - Pylance (ms-python.vscode-pylance) 252 | 253 | #### Manual Formatting 254 | 255 | To manually format code: 256 | 257 | ```bash 258 | # Format all Python files using Black 259 | pdm run black . 260 | 261 | # Run Ruff linter with auto-fix 262 | pdm run ruff check --fix . 
263 | 264 | # Run type checking with MyPy 265 | pdm run mypy . 266 | ``` 267 | 268 | #### Pre-commit Validation 269 | 270 | Before submitting a pull request, ensure your code passes all formatting checks: 271 | 272 | ```bash 273 | # Run all checks 274 | pdm run black --check . 275 | pdm run ruff check . 276 | pdm run mypy . 277 | ``` 278 | 279 | ### Swift Code (Lume) 280 | 281 | For Swift code in the `libs/lume` directory: 282 | 283 | - Follow the [Swift API Design Guidelines](https://www.swift.org/documentation/api-design-guidelines/) 284 | - Use SwiftFormat for consistent formatting 285 | - Code will be automatically formatted on save when using the lume workspace 286 | ``` -------------------------------------------------------------------------------- /libs/python/agent/agent/loops/holo.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Holo 1.5 agent loop implementation for click prediction using litellm.acompletion. 3 | 4 | Implements the Holo1.5 grounding behavior: 5 | - Prompt asks for absolute pixel coordinates in JSON: {"action":"click_absolute","x":int,"y":int} 6 | - Optionally resizes the image using Qwen2-VL smart_resize parameters (via transformers AutoProcessor) 7 | - If resized, maps predicted coordinates back to the original screenshot resolution 8 | 9 | Note: We do NOT manually load the model; acompletions (via HuggingFaceLocalAdapter) 10 | will handle loading based on the provided model name. 
11 | """ 12 | 13 | from __future__ import annotations 14 | 15 | import base64 16 | import json 17 | from io import BytesIO 18 | from typing import Any, Dict, List, Optional, Tuple 19 | 20 | import litellm 21 | from PIL import Image 22 | 23 | from ..decorators import register_agent 24 | from .base import AsyncAgentConfig 25 | from ..types import AgentCapability 26 | 27 | 28 | def _strip_hf_prefix(model: str) -> str: 29 | """Strip provider prefixes like 'huggingface-local/' from model names for HF processor load.""" 30 | if "/" in model and model.lower().startswith("huggingface-local/"): 31 | return model.split("/", 1)[1] 32 | return model 33 | 34 | 35 | def _maybe_smart_resize(image: Image.Image, model: str) -> Tuple[Image.Image, Tuple[int, int]]: 36 | """ 37 | Try to compute Qwen2-VL smart_resize output size using transformers AutoProcessor. 38 | 39 | Returns (processed_image, (orig_w, orig_h)). If transformers or processor unavailable, 40 | returns the original image and size without resizing. 
41 | """ 42 | orig_w, orig_h = image.size 43 | try: 44 | # Import lazily to avoid hard dependency if not installed 45 | from transformers import AutoProcessor # type: ignore 46 | from transformers.models.qwen2_vl.image_processing_qwen2_vl import ( # type: ignore 47 | smart_resize, 48 | ) 49 | 50 | processor_name = _strip_hf_prefix(model) 51 | processor = AutoProcessor.from_pretrained(processor_name) 52 | image_processor = getattr(processor, "image_processor", None) 53 | if image_processor is None: 54 | return image, (orig_w, orig_h) 55 | 56 | factor = getattr(image_processor, "patch_size", 14) * getattr(image_processor, "merge_size", 1) 57 | min_pixels = getattr(image_processor, "min_pixels", 256 * 256) 58 | max_pixels = getattr(image_processor, "max_pixels", 1536 * 1536) 59 | 60 | resized_h, resized_w = smart_resize( 61 | orig_h, 62 | orig_w, 63 | factor=factor, 64 | min_pixels=min_pixels, 65 | max_pixels=max_pixels, 66 | ) 67 | 68 | if (resized_w, resized_h) == (orig_w, orig_h): 69 | return image, (orig_w, orig_h) 70 | 71 | processed = image.resize((resized_w, resized_h), resample=Image.Resampling.LANCZOS) 72 | return processed, (orig_w, orig_h) 73 | except Exception: 74 | # If any failure (no transformers, processor load error), fall back to original 75 | return image, (orig_w, orig_h) 76 | 77 | 78 | def _build_holo_prompt(instruction: str) -> str: 79 | """Construct the Holo1.5 grounding prompt.""" 80 | # Keep it close to the cookbook while avoiding heavy schema generation 81 | schema_hint = '{"action": "click_absolute", "x": <int>, "y": <int>}' 82 | return ( 83 | "Localize an element on the GUI image according to the provided target and output a click position. " 84 | f"You must output a valid JSON following the format: {schema_hint} " 85 | f"Your target is: {instruction}" 86 | ) 87 | 88 | 89 | def _parse_click_json(output_text: str) -> Optional[Tuple[int, int]]: 90 | """ 91 | Parse JSON from model output and extract x, y ints. 
92 | Tries to find the first JSON object substring if extra text is present. 93 | """ 94 | try: 95 | # Fast path: direct JSON 96 | data = json.loads(output_text) 97 | except Exception: 98 | # Try to locate a JSON object within the text 99 | start = output_text.find("{") 100 | end = output_text.rfind("}") 101 | if start == -1 or end == -1 or end <= start: 102 | return None 103 | try: 104 | data = json.loads(output_text[start : end + 1]) 105 | except Exception: 106 | return None 107 | 108 | try: 109 | x = int(data.get("x")) 110 | y = int(data.get("y")) 111 | return x, y 112 | except Exception: 113 | return None 114 | 115 | 116 | @register_agent(models=r"(?i).*(Holo1\.5|Hcompany/Holo1\.5).*") 117 | class HoloConfig(AsyncAgentConfig): 118 | """Holo is a family of UI grounding models from H Company""" 119 | 120 | async def predict_step( 121 | self, 122 | messages: List[Dict[str, Any]], 123 | model: str, 124 | tools: Optional[List[Dict[str, Any]]] = None, 125 | max_retries: Optional[int] = None, 126 | stream: bool = False, 127 | computer_handler=None, 128 | _on_api_start=None, 129 | _on_api_end=None, 130 | _on_usage=None, 131 | _on_screenshot=None, 132 | **kwargs, 133 | ) -> Dict[str, Any]: 134 | # Holo models are only trained on UI localization tasks, not all-in-one agent 135 | raise NotImplementedError() 136 | 137 | async def predict_click( 138 | self, 139 | model: str, 140 | image_b64: str, 141 | instruction: str, 142 | **kwargs, 143 | ) -> Optional[Tuple[int, int]]: 144 | """ 145 | Predict click coordinates using Holo1.5 via litellm.acompletion. 
146 | 147 | - Optionally smart-resizes the image using Qwen2-VL rules if transformers are available 148 | - Prompts for JSON with absolute pixel coordinates 149 | - Parses x,y and maps back to original screenshot size if resized 150 | """ 151 | try: 152 | img_bytes = base64.b64decode(image_b64) 153 | original_img = Image.open(BytesIO(img_bytes)) 154 | except Exception: 155 | return None 156 | 157 | # Optional preprocessing 158 | processed_img, (orig_w, orig_h) = _maybe_smart_resize(original_img, model) 159 | 160 | # If we resized, send the resized image; otherwise send original 161 | img_to_send = processed_img 162 | buf = BytesIO() 163 | img_to_send.save(buf, format="PNG") 164 | processed_b64 = base64.b64encode(buf.getvalue()).decode("utf-8") 165 | 166 | prompt = _build_holo_prompt(instruction) 167 | 168 | messages = [ 169 | { 170 | "role": "user", 171 | "content": [ 172 | { 173 | "type": "image_url", 174 | "image_url": {"url": f"data:image/png;base64,{processed_b64}"}, 175 | }, 176 | {"type": "text", "text": prompt}, 177 | ], 178 | } 179 | ] 180 | 181 | api_kwargs = { 182 | "model": model, 183 | "messages": messages, 184 | # Deterministic, small output 185 | "max_tokens": kwargs.get("max_tokens", 256), 186 | "temperature": kwargs.get("temperature", 0.0), 187 | } 188 | 189 | response = await litellm.acompletion(**api_kwargs) 190 | output_text = (response.choices[0].message.content or "").strip() # type: ignore 191 | 192 | coords = _parse_click_json(output_text) 193 | if coords is None: 194 | return None 195 | 196 | x, y = coords 197 | 198 | # Map back to original size if we resized 199 | proc_w, proc_h = img_to_send.size 200 | if (proc_w, proc_h) != (orig_w, orig_h): 201 | try: 202 | sx = orig_w / float(proc_w) 203 | sy = orig_h / float(proc_h) 204 | x = int(round(x * sx)) 205 | y = int(round(y * sy)) 206 | except Exception: 207 | # Fallback: clamp within original bounds 208 | pass 209 | 210 | # Clamp to original image bounds 211 | x = max(0, min(orig_w - 1, x)) 
212 | y = max(0, min(orig_h - 1, y)) 213 | return x, y 214 | 215 | def get_capabilities(self) -> List[AgentCapability]: 216 | return ["click"] 217 | ``` -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- ```json 1 | { 2 | "configurations": [ 3 | { 4 | "name": "Agent UI", 5 | "type": "debugpy", 6 | "request": "launch", 7 | "program": "examples/agent_ui_examples.py", 8 | "console": "integratedTerminal", 9 | "justMyCode": false, 10 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 11 | "cwd": "${workspaceFolder:cua-root}", 12 | "env": { 13 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 14 | } 15 | }, 16 | { 17 | "name": "Computer UI", 18 | "type": "debugpy", 19 | "request": "launch", 20 | "program": "examples/computer_ui_examples.py", 21 | "console": "integratedTerminal", 22 | "justMyCode": false, 23 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 24 | "cwd": "${workspaceFolder:cua-root}", 25 | "env": { 26 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 27 | } 28 | }, 29 | { 30 | "name": "Run Computer Examples", 31 | "type": "debugpy", 32 | "request": "launch", 33 | "program": "examples/computer_examples.py", 34 | "console": "integratedTerminal", 35 | "justMyCode": true, 36 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 37 | "cwd": "${workspaceFolder:cua-root}", 38 | "env": { 39 | "PYTHONPATH": 
"${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 40 | } 41 | }, 42 | { 43 | "name": "Run Agent Examples", 44 | "type": "debugpy", 45 | "request": "launch", 46 | "program": "examples/agent_examples.py", 47 | "console": "integratedTerminal", 48 | "justMyCode": false, 49 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 50 | "cwd": "${workspaceFolder:cua-root}", 51 | "env": { 52 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 53 | } 54 | }, 55 | { 56 | "name": "Run PyLume Examples", 57 | "type": "debugpy", 58 | "request": "launch", 59 | "program": "examples/pylume_examples.py", 60 | "console": "integratedTerminal", 61 | "justMyCode": true, 62 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 63 | "cwd": "${workspaceFolder:cua-root}", 64 | "env": { 65 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 66 | } 67 | }, 68 | { 69 | "name": "SOM: Run Experiments (No OCR)", 70 | "type": "debugpy", 71 | "request": "launch", 72 | "program": "examples/som_examples.py", 73 | "args": [ 74 | "examples/test_data", 75 | "--output-dir", 76 | "examples/output", 77 | "--ocr", 78 | "none", 79 | "--mode", 80 | "experiment" 81 | ], 82 | "console": "integratedTerminal", 83 | "justMyCode": false, 84 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 85 | "cwd": "${workspaceFolder:cua-root}", 86 | "env": { 87 | "PYTHONPATH": 
"${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 88 | } 89 | }, 90 | { 91 | "name": "SOM: Run Experiments (EasyOCR)", 92 | "type": "debugpy", 93 | "request": "launch", 94 | "program": "examples/som_examples.py", 95 | "args": [ 96 | "examples/test_data", 97 | "--output-dir", 98 | "examples/output", 99 | "--ocr", 100 | "easyocr", 101 | "--mode", 102 | "experiment" 103 | ], 104 | "console": "integratedTerminal", 105 | "justMyCode": false, 106 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 107 | "cwd": "${workspaceFolder:cua-root}", 108 | "env": { 109 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 110 | } 111 | }, 112 | { 113 | "name": "Run Computer Server", 114 | "type": "debugpy", 115 | "request": "launch", 116 | "program": "${workspaceFolder}/libs/python/computer-server/run_server.py", 117 | "console": "integratedTerminal", 118 | "justMyCode": true, 119 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 120 | "cwd": "${workspaceFolder:cua-root}", 121 | "env": { 122 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer:${workspaceFolder:cua-root}/libs/python/agent:${workspaceFolder:cua-root}/libs/python/som:${workspaceFolder:cua-root}/libs/python/pylume" 123 | } 124 | }, 125 | { 126 | "name": "Run Computer Server with Args", 127 | "type": "debugpy", 128 | "request": "launch", 129 | "program": "${workspaceFolder}/libs/python/computer-server/run_server.py", 130 | "args": [ 131 | "--host", 132 | "0.0.0.0", 133 | "--port", 134 | "8000", 135 | "--log-level", 136 | "debug" 137 | ], 138 | "console": 
"integratedTerminal", 139 | "justMyCode": false, 140 | "python": "${workspaceFolder:cua-root}/.venv/bin/python", 141 | "cwd": "${workspaceFolder:cua-root}", 142 | "env": { 143 | "PYTHONPATH": "${workspaceFolder:cua-root}/libs/python/core:${workspaceFolder:cua-root}/libs/python/computer-server" 144 | } 145 | }, 146 | { 147 | "type": "lldb", 148 | "request": "launch", 149 | "args": [], 150 | "cwd": "${workspaceFolder:cua-root}/libs/lume", 151 | "name": "Debug lume (libs/lume)", 152 | "program": "${workspaceFolder:cua-root}/libs/lume/.build/debug/lume", 153 | "preLaunchTask": "swift: Build Debug lume (libs/lume)" 154 | }, 155 | { 156 | "type": "lldb", 157 | "request": "launch", 158 | "args": [], 159 | "cwd": "${workspaceFolder:cua-root}/libs/lume", 160 | "name": "Release lume (libs/lume)", 161 | "program": "${workspaceFolder:cua-root}/libs/lume/.build/release/lume", 162 | "preLaunchTask": "swift: Build Release lume (libs/lume)" 163 | } 164 | ] 165 | } ``` -------------------------------------------------------------------------------- /libs/lume/src/Commands/Config.swift: -------------------------------------------------------------------------------- ```swift 1 | import ArgumentParser 2 | import Foundation 3 | 4 | struct Config: ParsableCommand { 5 | static let configuration = CommandConfiguration( 6 | commandName: "config", 7 | abstract: "Get or set lume configuration", 8 | subcommands: [Get.self, Storage.self, Cache.self, Caching.self], 9 | defaultSubcommand: Get.self 10 | ) 11 | 12 | // MARK: - Basic Configuration Subcommands 13 | 14 | struct Get: ParsableCommand { 15 | static let configuration = CommandConfiguration( 16 | commandName: "get", 17 | abstract: "Get current configuration" 18 | ) 19 | 20 | func run() throws { 21 | let controller = LumeController() 22 | let settings = controller.getSettings() 23 | 24 | // Display default location 25 | print( 26 | "Default VM storage: \(settings.defaultLocationName) (\(settings.defaultLocation?.path ?? 
"not set"))" 27 | ) 28 | 29 | // Display cache directory 30 | print("Cache directory: \(settings.cacheDirectory)") 31 | 32 | // Display caching enabled status 33 | print("Caching enabled: \(settings.cachingEnabled)") 34 | 35 | // Display all locations 36 | if !settings.vmLocations.isEmpty { 37 | print("\nConfigured VM storage locations:") 38 | for location in settings.sortedLocations { 39 | let isDefault = location.name == settings.defaultLocationName 40 | let defaultMark = isDefault ? " (default)" : "" 41 | print(" - \(location.name): \(location.path)\(defaultMark)") 42 | } 43 | } 44 | } 45 | } 46 | 47 | // MARK: - Debug Command 48 | 49 | struct Debug: ParsableCommand { 50 | static let configuration = CommandConfiguration( 51 | commandName: "debug", 52 | abstract: "Output detailed debug information about current configuration", 53 | shouldDisplay: false 54 | ) 55 | 56 | func run() throws { 57 | let debugInfo = SettingsManager.shared.debugSettings() 58 | print(debugInfo) 59 | } 60 | } 61 | 62 | // MARK: - Caching Management Subcommands 63 | 64 | struct Caching: ParsableCommand { 65 | static let configuration = CommandConfiguration( 66 | commandName: "caching", 67 | abstract: "Manage image caching settings", 68 | subcommands: [GetCaching.self, SetCaching.self] 69 | ) 70 | 71 | struct GetCaching: ParsableCommand { 72 | static let configuration = CommandConfiguration( 73 | commandName: "get", 74 | abstract: "Show current caching status" 75 | ) 76 | 77 | func run() throws { 78 | let controller = LumeController() 79 | let cachingEnabled = controller.isCachingEnabled() 80 | print("Caching enabled: \(cachingEnabled)") 81 | } 82 | } 83 | 84 | struct SetCaching: ParsableCommand { 85 | static let configuration = CommandConfiguration( 86 | commandName: "set", 87 | abstract: "Enable or disable image caching" 88 | ) 89 | 90 | @Argument(help: "Enable or disable caching (true/false)") 91 | var enabled: Bool 92 | 93 | func run() throws { 94 | let controller = LumeController() 95 | 
try controller.setCachingEnabled(enabled) 96 | print("Caching \(enabled ? "enabled" : "disabled")") 97 | } 98 | } 99 | } 100 | 101 | // MARK: - Cache Management Subcommands 102 | 103 | struct Cache: ParsableCommand { 104 | static let configuration = CommandConfiguration( 105 | commandName: "cache", 106 | abstract: "Manage cache settings", 107 | subcommands: [GetCache.self, SetCache.self] 108 | ) 109 | 110 | struct GetCache: ParsableCommand { 111 | static let configuration = CommandConfiguration( 112 | commandName: "get", 113 | abstract: "Get current cache directory" 114 | ) 115 | 116 | func run() throws { 117 | let controller = LumeController() 118 | let cacheDir = controller.getCacheDirectory() 119 | print("Cache directory: \(cacheDir)") 120 | } 121 | } 122 | 123 | struct SetCache: ParsableCommand { 124 | static let configuration = CommandConfiguration( 125 | commandName: "set", 126 | abstract: "Set cache directory" 127 | ) 128 | 129 | @Argument(help: "Path to cache directory") 130 | var path: String 131 | 132 | func run() throws { 133 | let controller = LumeController() 134 | try controller.setCacheDirectory(path: path) 135 | print("Cache directory set to: \(path)") 136 | } 137 | } 138 | } 139 | 140 | // MARK: - Storage Management Subcommands 141 | 142 | struct Storage: ParsableCommand { 143 | static let configuration = CommandConfiguration( 144 | commandName: "storage", 145 | abstract: "Manage VM storage locations", 146 | subcommands: [Add.self, Remove.self, List.self, Default.self] 147 | ) 148 | 149 | struct Add: ParsableCommand { 150 | static let configuration = CommandConfiguration( 151 | commandName: "add", 152 | abstract: "Add a new VM storage location" 153 | ) 154 | 155 | @Argument(help: "Storage name (alphanumeric with dashes/underscores)") 156 | var name: String 157 | 158 | @Argument(help: "Path to VM storage directory") 159 | var path: String 160 | 161 | func run() throws { 162 | let controller = LumeController() 163 | try controller.addLocation(name: 
name, path: path) 164 | print("Added VM storage location: \(name) at \(path)") 165 | } 166 | } 167 | 168 | struct Remove: ParsableCommand { 169 | static let configuration = CommandConfiguration( 170 | commandName: "remove", 171 | abstract: "Remove a VM storage location" 172 | ) 173 | 174 | @Argument(help: "Storage name to remove") 175 | var name: String 176 | 177 | func run() throws { 178 | let controller = LumeController() 179 | try controller.removeLocation(name: name) 180 | print("Removed VM storage location: \(name)") 181 | } 182 | } 183 | 184 | struct List: ParsableCommand { 185 | static let configuration = CommandConfiguration( 186 | commandName: "list", 187 | abstract: "List all VM storage locations" 188 | ) 189 | 190 | func run() throws { 191 | let controller = LumeController() 192 | let settings = controller.getSettings() 193 | 194 | if settings.vmLocations.isEmpty { 195 | print("No VM storage locations configured") 196 | return 197 | } 198 | 199 | print("VM Storage Locations:") 200 | for location in settings.sortedLocations { 201 | let isDefault = location.name == settings.defaultLocationName 202 | let defaultMark = isDefault ? 
" (default)" : "" 203 | print(" - \(location.name): \(location.path)\(defaultMark)") 204 | } 205 | } 206 | } 207 | 208 | struct Default: ParsableCommand { 209 | static let configuration = CommandConfiguration( 210 | commandName: "default", 211 | abstract: "Set the default VM storage location" 212 | ) 213 | 214 | @Argument(help: "Storage name to set as default") 215 | var name: String 216 | 217 | func run() throws { 218 | let controller = LumeController() 219 | try controller.setDefaultLocation(name: name) 220 | print("Set default VM storage location to: \(name)") 221 | } 222 | } 223 | } 224 | } 225 | ``` -------------------------------------------------------------------------------- /libs/python/computer-server/computer_server/handlers/generic.py: -------------------------------------------------------------------------------- ```python 1 | """ 2 | Generic handlers for all OSes. 3 | 4 | Includes: 5 | - FileHandler 6 | 7 | """ 8 | 9 | from pathlib import Path 10 | from typing import Dict, Any, Optional 11 | from .base import BaseFileHandler 12 | import base64 13 | 14 | def resolve_path(path: str) -> Path: 15 | """Resolve a path to its absolute path. Expand ~ to the user's home directory. 16 | 17 | Args: 18 | path: The file or directory path to resolve 19 | 20 | Returns: 21 | Path: The resolved absolute path 22 | """ 23 | return Path(path).expanduser().resolve() 24 | 25 | class GenericFileHandler(BaseFileHandler): 26 | """ 27 | Generic file handler that provides file system operations for all operating systems. 28 | 29 | This class implements the BaseFileHandler interface and provides methods for 30 | file and directory operations including reading, writing, creating, and deleting 31 | files and directories. 32 | """ 33 | 34 | async def file_exists(self, path: str) -> Dict[str, Any]: 35 | """ 36 | Check if a file exists at the specified path. 
37 | 38 | Args: 39 | path: The file path to check 40 | 41 | Returns: 42 | Dict containing 'success' boolean and either 'exists' boolean or 'error' string 43 | """ 44 | try: 45 | return {"success": True, "exists": resolve_path(path).is_file()} 46 | except Exception as e: 47 | return {"success": False, "error": str(e)} 48 | 49 | async def directory_exists(self, path: str) -> Dict[str, Any]: 50 | """ 51 | Check if a directory exists at the specified path. 52 | 53 | Args: 54 | path: The directory path to check 55 | 56 | Returns: 57 | Dict containing 'success' boolean and either 'exists' boolean or 'error' string 58 | """ 59 | try: 60 | return {"success": True, "exists": resolve_path(path).is_dir()} 61 | except Exception as e: 62 | return {"success": False, "error": str(e)} 63 | 64 | async def list_dir(self, path: str) -> Dict[str, Any]: 65 | """ 66 | List all files and directories in the specified directory. 67 | 68 | Args: 69 | path: The directory path to list 70 | 71 | Returns: 72 | Dict containing 'success' boolean and either 'files' list of names or 'error' string 73 | """ 74 | try: 75 | return {"success": True, "files": [p.name for p in resolve_path(path).iterdir() if p.is_file() or p.is_dir()]} 76 | except Exception as e: 77 | return {"success": False, "error": str(e)} 78 | 79 | async def read_text(self, path: str) -> Dict[str, Any]: 80 | """ 81 | Read the contents of a text file. 82 | 83 | Args: 84 | path: The file path to read from 85 | 86 | Returns: 87 | Dict containing 'success' boolean and either 'content' string or 'error' string 88 | """ 89 | try: 90 | return {"success": True, "content": resolve_path(path).read_text()} 91 | except Exception as e: 92 | return {"success": False, "error": str(e)} 93 | 94 | async def write_text(self, path: str, content: str) -> Dict[str, Any]: 95 | """ 96 | Write text content to a file. 
97 | 98 | Args: 99 | path: The file path to write to 100 | content: The text content to write 101 | 102 | Returns: 103 | Dict containing 'success' boolean and optionally 'error' string 104 | """ 105 | try: 106 | resolve_path(path).write_text(content) 107 | return {"success": True} 108 | except Exception as e: 109 | return {"success": False, "error": str(e)} 110 | 111 | async def write_bytes(self, path: str, content_b64: str, append: bool = False) -> Dict[str, Any]: 112 | """ 113 | Write binary content to a file from base64 encoded string. 114 | 115 | Args: 116 | path: The file path to write to 117 | content_b64: Base64 encoded binary content 118 | append: If True, append to existing file; if False, overwrite 119 | 120 | Returns: 121 | Dict containing 'success' boolean and optionally 'error' string 122 | """ 123 | try: 124 | mode = 'ab' if append else 'wb' 125 | with open(resolve_path(path), mode) as f: 126 | f.write(base64.b64decode(content_b64)) 127 | return {"success": True} 128 | except Exception as e: 129 | return {"success": False, "error": str(e)} 130 | 131 | async def read_bytes(self, path: str, offset: int = 0, length: Optional[int] = None) -> Dict[str, Any]: 132 | """ 133 | Read binary content from a file and return as base64 encoded string. 
134 | 135 | Args: 136 | path: The file path to read from 137 | offset: Byte offset to start reading from 138 | length: Number of bytes to read; if None, read entire file from offset 139 | 140 | Returns: 141 | Dict containing 'success' boolean and either 'content_b64' string or 'error' string 142 | """ 143 | try: 144 | file_path = resolve_path(path) 145 | with open(file_path, 'rb') as f: 146 | if offset > 0: 147 | f.seek(offset) 148 | 149 | if length is not None: 150 | content = f.read(length) 151 | else: 152 | content = f.read() 153 | 154 | return {"success": True, "content_b64": base64.b64encode(content).decode('utf-8')} 155 | except Exception as e: 156 | return {"success": False, "error": str(e)} 157 | 158 | async def get_file_size(self, path: str) -> Dict[str, Any]: 159 | """ 160 | Get the size of a file in bytes. 161 | 162 | Args: 163 | path: The file path to get size for 164 | 165 | Returns: 166 | Dict containing 'success' boolean and either 'size' integer or 'error' string 167 | """ 168 | try: 169 | file_path = resolve_path(path) 170 | size = file_path.stat().st_size 171 | return {"success": True, "size": size} 172 | except Exception as e: 173 | return {"success": False, "error": str(e)} 174 | 175 | async def delete_file(self, path: str) -> Dict[str, Any]: 176 | """ 177 | Delete a file at the specified path. 178 | 179 | Args: 180 | path: The file path to delete 181 | 182 | Returns: 183 | Dict containing 'success' boolean and optionally 'error' string 184 | """ 185 | try: 186 | resolve_path(path).unlink() 187 | return {"success": True} 188 | except Exception as e: 189 | return {"success": False, "error": str(e)} 190 | 191 | async def create_dir(self, path: str) -> Dict[str, Any]: 192 | """ 193 | Create a directory at the specified path. 194 | 195 | Creates parent directories if they don't exist and doesn't raise an error 196 | if the directory already exists. 
197 | 198 | Args: 199 | path: The directory path to create 200 | 201 | Returns: 202 | Dict containing 'success' boolean and optionally 'error' string 203 | """ 204 | try: 205 | resolve_path(path).mkdir(parents=True, exist_ok=True) 206 | return {"success": True} 207 | except Exception as e: 208 | return {"success": False, "error": str(e)} 209 | 210 | async def delete_dir(self, path: str) -> Dict[str, Any]: 211 | """ 212 | Delete an empty directory at the specified path. 213 | 214 | Args: 215 | path: The directory path to delete 216 | 217 | Returns: 218 | Dict containing 'success' boolean and optionally 'error' string 219 | """ 220 | try: 221 | resolve_path(path).rmdir() 222 | return {"success": True} 223 | except Exception as e: 224 | return {"success": False, "error": str(e)} 225 | ``` -------------------------------------------------------------------------------- /libs/python/pylume/pylume/models.py: -------------------------------------------------------------------------------- ```python 1 | from typing import Optional, List, Literal, Dict, Any 2 | import re 3 | from pydantic import BaseModel, Field, computed_field, validator, ConfigDict, RootModel 4 | 5 | class DiskInfo(BaseModel): 6 | """Information about disk storage allocation. 7 | 8 | Attributes: 9 | total: Total disk space in bytes 10 | allocated: Currently allocated disk space in bytes 11 | """ 12 | total: int 13 | allocated: int 14 | 15 | class VMConfig(BaseModel): 16 | """Configuration for creating a new VM. 
17 | 18 | Note: Memory and disk sizes should be specified with units (e.g., "4GB", "64GB") 19 | 20 | Attributes: 21 | name: Name of the virtual machine 22 | os: Operating system type, either "macOS" or "linux" 23 | cpu: Number of CPU cores to allocate 24 | memory: Amount of memory to allocate with units 25 | disk_size: Size of the disk to create with units 26 | display: Display resolution in format "widthxheight" 27 | ipsw: IPSW path or 'latest' for macOS VMs, None for other OS types 28 | """ 29 | name: str 30 | os: Literal["macOS", "linux"] = "macOS" 31 | cpu: int = Field(default=2, ge=1) 32 | memory: str = "4GB" 33 | disk_size: str = Field(default="64GB", alias="diskSize") 34 | display: str = "1024x768" 35 | ipsw: Optional[str] = Field(default=None, description="IPSW path or 'latest', for macOS VMs") 36 | 37 | class Config: 38 | populate_by_alias = True 39 | 40 | class SharedDirectory(BaseModel): 41 | """Configuration for a shared directory. 42 | 43 | Attributes: 44 | host_path: Path to the directory on the host system 45 | read_only: Whether the directory should be mounted as read-only 46 | """ 47 | host_path: str = Field(..., alias="hostPath") # Allow host_path but serialize as hostPath 48 | read_only: bool = False 49 | 50 | class Config: 51 | populate_by_name = True # Allow both alias and original name 52 | alias_generator = lambda s: ''.join(word.capitalize() if i else word for i, word in enumerate(s.split('_'))) 53 | 54 | class VMRunOpts(BaseModel): 55 | """Configuration for running a VM. 
56 | 57 | Attributes: 58 | no_display: If True, do not display the VNC client 59 | shared_directories: List of directories to share with the VM 60 | """ 61 | no_display: bool = Field(default=False, alias="noDisplay") 62 | shared_directories: Optional[list[SharedDirectory]] = Field( 63 | default=None, 64 | alias="sharedDirectories" 65 | ) 66 | 67 | model_config = ConfigDict( 68 | populate_by_name=True, 69 | alias_generator=lambda s: ''.join(word.capitalize() if i else word for i, word in enumerate(s.split('_'))) 70 | ) 71 | 72 | def model_dump(self, **kwargs): 73 | """Export model data with proper field name conversion. 74 | 75 | Converts shared directory fields to match API expectations when using aliases. 76 | 77 | Args: 78 | **kwargs: Keyword arguments passed to parent model_dump method 79 | 80 | Returns: 81 | dict: Model data with properly formatted field names 82 | """ 83 | data = super().model_dump(**kwargs) 84 | # Convert shared directory fields to match API expectations 85 | if self.shared_directories and "by_alias" in kwargs and kwargs["by_alias"]: 86 | data["sharedDirectories"] = [ 87 | { 88 | "hostPath": d.host_path, 89 | "readOnly": d.read_only 90 | } 91 | for d in self.shared_directories 92 | ] 93 | # Remove the snake_case version if it exists 94 | data.pop("shared_directories", None) 95 | return data 96 | 97 | class VMStatus(BaseModel): 98 | """Status information for a virtual machine.
99 | 100 | Attributes: 101 | name: Name of the virtual machine 102 | status: Current status of the VM 103 | os: Operating system type 104 | cpu_count: Number of CPU cores allocated 105 | memory_size: Amount of memory allocated in bytes 106 | disk_size: Disk storage information 107 | vnc_url: URL for VNC connection if available 108 | ip_address: IP address of the VM if available 109 | """ 110 | name: str 111 | status: str 112 | os: Literal["macOS", "linux"] 113 | cpu_count: int = Field(alias="cpuCount") 114 | memory_size: int = Field(alias="memorySize") # API returns memory size in bytes 115 | disk_size: DiskInfo = Field(alias="diskSize") 116 | vnc_url: Optional[str] = Field(default=None, alias="vncUrl") 117 | ip_address: Optional[str] = Field(default=None, alias="ipAddress") 118 | 119 | class Config: 120 | populate_by_alias = True 121 | 122 | @computed_field 123 | @property 124 | def state(self) -> str: 125 | """Get the current state of the VM. 126 | 127 | Returns: 128 | str: Current VM status 129 | """ 130 | return self.status 131 | 132 | @computed_field 133 | @property 134 | def cpu(self) -> int: 135 | """Get the number of CPU cores. 136 | 137 | Returns: 138 | int: Number of CPU cores allocated to the VM 139 | """ 140 | return self.cpu_count 141 | 142 | @computed_field 143 | @property 144 | def memory(self) -> str: 145 | """Get memory allocation in human-readable format. 146 | 147 | Returns: 148 | str: Memory size formatted as "{size}GB" 149 | """ 150 | # Convert bytes to GB 151 | gb = self.memory_size / (1024 * 1024 * 1024) 152 | return f"{int(gb)}GB" 153 | 154 | class VMUpdateOpts(BaseModel): 155 | """Options for updating VM configuration. 
156 | 157 | Attributes: 158 | cpu: Number of CPU cores to update to 159 | memory: Amount of memory to update to with units 160 | disk_size: Size of disk to update to with units 161 | """ 162 | cpu: Optional[int] = None 163 | memory: Optional[str] = None 164 | disk_size: Optional[str] = None 165 | 166 | class ImageRef(BaseModel): 167 | """Reference to a VM image. 168 | 169 | Attributes: 170 | image: Name of the image 171 | tag: Tag version of the image 172 | registry: Registry hostname where image is stored 173 | organization: Organization or namespace in the registry 174 | """ 175 | image: str 176 | tag: str = "latest" 177 | registry: Optional[str] = "ghcr.io" 178 | organization: Optional[str] = "trycua" 179 | 180 | def model_dump(self, **kwargs): 181 | """Override model_dump to return just the image:tag format. 182 | 183 | Args: 184 | **kwargs: Keyword arguments (ignored) 185 | 186 | Returns: 187 | str: Image reference in "image:tag" format 188 | """ 189 | return f"{self.image}:{self.tag}" 190 | 191 | class CloneSpec(BaseModel): 192 | """Specification for cloning a VM. 193 | 194 | Attributes: 195 | name: Name of the source VM to clone 196 | new_name: Name for the new cloned VM 197 | """ 198 | name: str 199 | new_name: str = Field(alias="newName") 200 | 201 | class Config: 202 | populate_by_alias = True 203 | 204 | class ImageInfo(BaseModel): 205 | """Model for individual image information. 206 | 207 | Attributes: 208 | imageId: Unique identifier for the image 209 | """ 210 | imageId: str 211 | 212 | class ImageList(RootModel): 213 | """Response model for the images endpoint. 214 | 215 | A list-like container for ImageInfo objects that provides 216 | iteration and indexing capabilities. 217 | """ 218 | root: List[ImageInfo] 219 | 220 | def __iter__(self): 221 | """Iterate over the image list. 
222 | 223 | Returns: 224 | Iterator over ImageInfo objects 225 | """ 226 | return iter(self.root) 227 | 228 | def __getitem__(self, item): 229 | """Get an item from the image list by index. 230 | 231 | Args: 232 | item: Index or slice to retrieve 233 | 234 | Returns: 235 | ImageInfo or list of ImageInfo objects 236 | """ 237 | return self.root[item] 238 | 239 | def __len__(self): 240 | """Get the number of images in the list. 241 | 242 | Returns: 243 | int: Number of images in the list 244 | """ 245 | return len(self.root) ```