This is page 9 of 28. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github
│ ├── FUNDING.yml
│ ├── scripts
│ │ ├── get_pyproject_version.py
│ │ └── tests
│ │ ├── __init__.py
│ │ ├── README.md
│ │ └── test_get_pyproject_version.py
│ └── workflows
│ ├── bump-version.yml
│ ├── ci-lume.yml
│ ├── docker-publish-cua-linux.yml
│ ├── docker-publish-cua-windows.yml
│ ├── docker-publish-kasm.yml
│ ├── docker-publish-xfce.yml
│ ├── docker-reusable-publish.yml
│ ├── link-check.yml
│ ├── lint.yml
│ ├── npm-publish-cli.yml
│ ├── npm-publish-computer.yml
│ ├── npm-publish-core.yml
│ ├── publish-lume.yml
│ ├── pypi-publish-agent.yml
│ ├── pypi-publish-computer-server.yml
│ ├── pypi-publish-computer.yml
│ ├── pypi-publish-core.yml
│ ├── pypi-publish-mcp-server.yml
│ ├── pypi-publish-som.yml
│ ├── pypi-reusable-publish.yml
│ ├── python-tests.yml
│ ├── test-cua-models.yml
│ └── test-validation-script.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .prettierrc.yaml
├── .vscode
│ ├── docs.code-workspace
│ ├── extensions.json
│ ├── launch.json
│ ├── libs-ts.code-workspace
│ ├── lume.code-workspace
│ ├── lumier.code-workspace
│ ├── py.code-workspace
│ └── settings.json
├── blog
│ ├── app-use.md
│ ├── assets
│ │ ├── composite-agents.png
│ │ ├── docker-ubuntu-support.png
│ │ ├── hack-booth.png
│ │ ├── hack-closing-ceremony.jpg
│ │ ├── hack-cua-ollama-hud.jpeg
│ │ ├── hack-leaderboard.png
│ │ ├── hack-the-north.png
│ │ ├── hack-winners.jpeg
│ │ ├── hack-workshop.jpeg
│ │ ├── hud-agent-evals.png
│ │ └── trajectory-viewer.jpeg
│ ├── bringing-computer-use-to-the-web.md
│ ├── build-your-own-operator-on-macos-1.md
│ ├── build-your-own-operator-on-macos-2.md
│ ├── cloud-windows-ga-macos-preview.md
│ ├── composite-agents.md
│ ├── computer-use-agents-for-growth-hacking.md
│ ├── cua-hackathon.md
│ ├── cua-playground-preview.md
│ ├── cua-vlm-router.md
│ ├── hack-the-north.md
│ ├── hud-agent-evals.md
│ ├── human-in-the-loop.md
│ ├── introducing-cua-cli.md
│ ├── introducing-cua-cloud-containers.md
│ ├── lume-to-containerization.md
│ ├── neurips-2025-cua-papers.md
│ ├── sandboxed-python-execution.md
│ ├── training-computer-use-models-trajectories-1.md
│ ├── trajectory-viewer.md
│ ├── ubuntu-docker-support.md
│ └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│ ├── .env.example
│ ├── .gitignore
│ ├── content
│ │ └── docs
│ │ ├── agent-sdk
│ │ │ ├── agent-loops.mdx
│ │ │ ├── benchmarks
│ │ │ │ ├── index.mdx
│ │ │ │ ├── interactive.mdx
│ │ │ │ ├── introduction.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── osworld-verified.mdx
│ │ │ │ ├── screenspot-pro.mdx
│ │ │ │ └── screenspot-v2.mdx
│ │ │ ├── callbacks
│ │ │ │ ├── agent-lifecycle.mdx
│ │ │ │ ├── cost-saving.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── logging.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── pii-anonymization.mdx
│ │ │ │ └── trajectories.mdx
│ │ │ ├── chat-history.mdx
│ │ │ ├── custom-tools.mdx
│ │ │ ├── customizing-computeragent.mdx
│ │ │ ├── integrations
│ │ │ │ ├── hud.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── observability.mdx
│ │ │ ├── mcp-server
│ │ │ │ ├── client-integrations.mdx
│ │ │ │ ├── configuration.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── llm-integrations.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── tools.mdx
│ │ │ │ └── usage.mdx
│ │ │ ├── message-format.mdx
│ │ │ ├── meta.json
│ │ │ ├── migration-guide.mdx
│ │ │ ├── prompt-caching.mdx
│ │ │ ├── supported-agents
│ │ │ │ ├── composed-agents.mdx
│ │ │ │ ├── computer-use-agents.mdx
│ │ │ │ ├── grounding-models.mdx
│ │ │ │ ├── human-in-the-loop.mdx
│ │ │ │ └── meta.json
│ │ │ ├── supported-model-providers
│ │ │ │ ├── cua-vlm-router.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ └── local-models.mdx
│ │ │ ├── telemetry.mdx
│ │ │ └── usage-tracking.mdx
│ │ ├── cli-playbook
│ │ │ ├── commands.mdx
│ │ │ ├── index.mdx
│ │ │ └── meta.json
│ │ ├── computer-sdk
│ │ │ ├── cloud-vm-management.mdx
│ │ │ ├── commands.mdx
│ │ │ ├── computer-server
│ │ │ │ ├── Commands.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── REST-API.mdx
│ │ │ │ └── WebSocket-API.mdx
│ │ │ ├── computer-ui.mdx
│ │ │ ├── computers.mdx
│ │ │ ├── custom-computer-handlers.mdx
│ │ │ ├── meta.json
│ │ │ ├── sandboxed-python.mdx
│ │ │ └── tracing-api.mdx
│ │ ├── example-usecases
│ │ │ ├── form-filling.mdx
│ │ │ ├── gemini-complex-ui-navigation.mdx
│ │ │ ├── meta.json
│ │ │ ├── post-event-contact-export.mdx
│ │ │ └── windows-app-behind-vpn.mdx
│ │ ├── get-started
│ │ │ ├── meta.json
│ │ │ └── quickstart.mdx
│ │ ├── index.mdx
│ │ ├── macos-vm-cli-playbook
│ │ │ ├── lume
│ │ │ │ ├── cli-reference.mdx
│ │ │ │ ├── faq.md
│ │ │ │ ├── http-api.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── prebuilt-images.mdx
│ │ │ ├── lumier
│ │ │ │ ├── building-lumier.mdx
│ │ │ │ ├── docker-compose.mdx
│ │ │ │ ├── docker.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ └── meta.json
│ │ │ └── meta.json
│ │ └── meta.json
│ ├── next.config.mjs
│ ├── package-lock.json
│ ├── package.json
│ ├── pnpm-lock.yaml
│ ├── postcss.config.mjs
│ ├── public
│ │ └── img
│ │ ├── agent_gradio_ui.png
│ │ ├── agent.png
│ │ ├── bg-dark.jpg
│ │ ├── bg-light.jpg
│ │ ├── cli.png
│ │ ├── computer.png
│ │ ├── grounding-with-gemini3.gif
│ │ ├── hero.png
│ │ ├── laminar_trace_example.png
│ │ ├── som_box_threshold.png
│ │ └── som_iou_threshold.png
│ ├── README.md
│ ├── source.config.ts
│ ├── src
│ │ ├── app
│ │ │ ├── (home)
│ │ │ │ ├── [[...slug]]
│ │ │ │ │ └── page.tsx
│ │ │ │ └── layout.tsx
│ │ │ ├── api
│ │ │ │ ├── posthog
│ │ │ │ │ └── [...path]
│ │ │ │ │ └── route.ts
│ │ │ │ └── search
│ │ │ │ └── route.ts
│ │ │ ├── favicon.ico
│ │ │ ├── global.css
│ │ │ ├── layout.config.tsx
│ │ │ ├── layout.tsx
│ │ │ ├── llms.mdx
│ │ │ │ └── [[...slug]]
│ │ │ │ └── route.ts
│ │ │ ├── llms.txt
│ │ │ │ └── route.ts
│ │ │ ├── robots.ts
│ │ │ └── sitemap.ts
│ │ ├── assets
│ │ │ ├── discord-black.svg
│ │ │ ├── discord-white.svg
│ │ │ ├── logo-black.svg
│ │ │ └── logo-white.svg
│ │ ├── components
│ │ │ ├── analytics-tracker.tsx
│ │ │ ├── cookie-consent.tsx
│ │ │ ├── doc-actions-menu.tsx
│ │ │ ├── editable-code-block.tsx
│ │ │ ├── footer.tsx
│ │ │ ├── hero.tsx
│ │ │ ├── iou.tsx
│ │ │ ├── mermaid.tsx
│ │ │ └── page-feedback.tsx
│ │ ├── lib
│ │ │ ├── llms.ts
│ │ │ └── source.ts
│ │ ├── mdx-components.tsx
│ │ └── providers
│ │ └── posthog-provider.tsx
│ └── tsconfig.json
├── examples
│ ├── agent_examples.py
│ ├── agent_ui_examples.py
│ ├── browser_tool_example.py
│ ├── cloud_api_examples.py
│ ├── computer_examples_windows.py
│ ├── computer_examples.py
│ ├── computer_ui_examples.py
│ ├── computer-example-ts
│ │ ├── .env.example
│ │ ├── .gitignore
│ │ ├── package-lock.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── README.md
│ │ ├── src
│ │ │ ├── helpers.ts
│ │ │ └── index.ts
│ │ └── tsconfig.json
│ ├── docker_examples.py
│ ├── evals
│ │ ├── hud_eval_examples.py
│ │ └── wikipedia_most_linked.txt
│ ├── pylume_examples.py
│ ├── sandboxed_functions_examples.py
│ ├── som_examples.py
│ ├── tracing_examples.py
│ ├── utils.py
│ └── winsandbox_example.py
├── img
│ ├── agent_gradio_ui.png
│ ├── agent.png
│ ├── cli.png
│ ├── computer.png
│ ├── logo_black.png
│ └── logo_white.png
├── libs
│ ├── kasm
│ │ ├── Dockerfile
│ │ ├── LICENSE
│ │ ├── README.md
│ │ └── src
│ │ └── ubuntu
│ │ └── install
│ │ └── firefox
│ │ ├── custom_startup.sh
│ │ ├── firefox.desktop
│ │ └── install_firefox.sh
│ ├── lume
│ │ ├── .cursorignore
│ │ ├── CONTRIBUTING.md
│ │ ├── Development.md
│ │ ├── img
│ │ │ └── cli.png
│ │ ├── Package.resolved
│ │ ├── Package.swift
│ │ ├── README.md
│ │ ├── resources
│ │ │ └── lume.entitlements
│ │ ├── scripts
│ │ │ ├── build
│ │ │ │ ├── build-debug.sh
│ │ │ │ ├── build-release-notarized.sh
│ │ │ │ └── build-release.sh
│ │ │ └── install.sh
│ │ ├── src
│ │ │ ├── Commands
│ │ │ │ ├── Clone.swift
│ │ │ │ ├── Config.swift
│ │ │ │ ├── Create.swift
│ │ │ │ ├── Delete.swift
│ │ │ │ ├── Get.swift
│ │ │ │ ├── Images.swift
│ │ │ │ ├── IPSW.swift
│ │ │ │ ├── List.swift
│ │ │ │ ├── Logs.swift
│ │ │ │ ├── Options
│ │ │ │ │ └── FormatOption.swift
│ │ │ │ ├── Prune.swift
│ │ │ │ ├── Pull.swift
│ │ │ │ ├── Push.swift
│ │ │ │ ├── Run.swift
│ │ │ │ ├── Serve.swift
│ │ │ │ ├── Set.swift
│ │ │ │ └── Stop.swift
│ │ │ ├── ContainerRegistry
│ │ │ │ ├── ImageContainerRegistry.swift
│ │ │ │ ├── ImageList.swift
│ │ │ │ └── ImagesPrinter.swift
│ │ │ ├── Errors
│ │ │ │ └── Errors.swift
│ │ │ ├── FileSystem
│ │ │ │ ├── Home.swift
│ │ │ │ ├── Settings.swift
│ │ │ │ ├── VMConfig.swift
│ │ │ │ ├── VMDirectory.swift
│ │ │ │ └── VMLocation.swift
│ │ │ ├── LumeController.swift
│ │ │ ├── Main.swift
│ │ │ ├── Server
│ │ │ │ ├── Handlers.swift
│ │ │ │ ├── HTTP.swift
│ │ │ │ ├── Requests.swift
│ │ │ │ ├── Responses.swift
│ │ │ │ └── Server.swift
│ │ │ ├── Utils
│ │ │ │ ├── CommandRegistry.swift
│ │ │ │ ├── CommandUtils.swift
│ │ │ │ ├── Logger.swift
│ │ │ │ ├── NetworkUtils.swift
│ │ │ │ ├── Path.swift
│ │ │ │ ├── ProcessRunner.swift
│ │ │ │ ├── ProgressLogger.swift
│ │ │ │ ├── String.swift
│ │ │ │ └── Utils.swift
│ │ │ ├── Virtualization
│ │ │ │ ├── DarwinImageLoader.swift
│ │ │ │ ├── DHCPLeaseParser.swift
│ │ │ │ ├── ImageLoaderFactory.swift
│ │ │ │ └── VMVirtualizationService.swift
│ │ │ ├── VM
│ │ │ │ ├── DarwinVM.swift
│ │ │ │ ├── LinuxVM.swift
│ │ │ │ ├── VM.swift
│ │ │ │ ├── VMDetails.swift
│ │ │ │ ├── VMDetailsPrinter.swift
│ │ │ │ ├── VMDisplayResolution.swift
│ │ │ │ └── VMFactory.swift
│ │ │ └── VNC
│ │ │ ├── PassphraseGenerator.swift
│ │ │ └── VNCService.swift
│ │ └── tests
│ │ ├── Mocks
│ │ │ ├── MockVM.swift
│ │ │ ├── MockVMVirtualizationService.swift
│ │ │ └── MockVNCService.swift
│ │ ├── VM
│ │ │ └── VMDetailsPrinterTests.swift
│ │ ├── VMTests.swift
│ │ ├── VMVirtualizationServiceTests.swift
│ │ └── VNCServiceTests.swift
│ ├── lumier
│ │ ├── .dockerignore
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── bin
│ │ │ └── entry.sh
│ │ ├── config
│ │ │ └── constants.sh
│ │ ├── hooks
│ │ │ └── on-logon.sh
│ │ └── lib
│ │ ├── utils.sh
│ │ └── vm.sh
│ ├── python
│ │ ├── agent
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── agent
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── adapters
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cua_adapter.py
│ │ │ │ │ ├── huggingfacelocal_adapter.py
│ │ │ │ │ ├── human_adapter.py
│ │ │ │ │ ├── mlxvlm_adapter.py
│ │ │ │ │ └── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ └── qwen2_5_vl.py
│ │ │ │ ├── agent.py
│ │ │ │ ├── callbacks
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── budget_manager.py
│ │ │ │ │ ├── image_retention.py
│ │ │ │ │ ├── logging.py
│ │ │ │ │ ├── operator_validator.py
│ │ │ │ │ ├── pii_anonymization.py
│ │ │ │ │ ├── prompt_instructions.py
│ │ │ │ │ ├── telemetry.py
│ │ │ │ │ └── trajectory_saver.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── computers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cua.py
│ │ │ │ │ └── custom.py
│ │ │ │ ├── decorators.py
│ │ │ │ ├── human_tool
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ ├── server.py
│ │ │ │ │ └── ui.py
│ │ │ │ ├── integrations
│ │ │ │ │ └── hud
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── agent.py
│ │ │ │ │ └── proxy.py
│ │ │ │ ├── loops
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── anthropic.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── composed_grounded.py
│ │ │ │ │ ├── gelato.py
│ │ │ │ │ ├── gemini.py
│ │ │ │ │ ├── generic_vlm.py
│ │ │ │ │ ├── glm45v.py
│ │ │ │ │ ├── gta1.py
│ │ │ │ │ ├── holo.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── model_types.csv
│ │ │ │ │ ├── moondream3.py
│ │ │ │ │ ├── omniparser.py
│ │ │ │ │ ├── openai.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ ├── uiins.py
│ │ │ │ │ ├── uitars.py
│ │ │ │ │ └── uitars2.py
│ │ │ │ ├── proxy
│ │ │ │ │ ├── examples.py
│ │ │ │ │ └── handlers.py
│ │ │ │ ├── responses.py
│ │ │ │ ├── tools
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── browser_tool.py
│ │ │ │ ├── types.py
│ │ │ │ └── ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ └── gradio
│ │ │ │ ├── __init__.py
│ │ │ │ ├── app.py
│ │ │ │ └── ui_components.py
│ │ │ ├── benchmarks
│ │ │ │ ├── .gitignore
│ │ │ │ ├── contrib.md
│ │ │ │ ├── interactive.py
│ │ │ │ ├── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ └── gta1.py
│ │ │ │ ├── README.md
│ │ │ │ ├── ss-pro.py
│ │ │ │ ├── ss-v2.py
│ │ │ │ └── utils.py
│ │ │ ├── example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_computer_agent.py
│ │ ├── bench-ui
│ │ │ ├── bench_ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── api.py
│ │ │ │ └── child.py
│ │ │ ├── examples
│ │ │ │ ├── folder_example.py
│ │ │ │ ├── gui
│ │ │ │ │ ├── index.html
│ │ │ │ │ ├── logo.svg
│ │ │ │ │ └── styles.css
│ │ │ │ ├── output_overlay.png
│ │ │ │ └── simple_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ └── test_port_detection.py
│ │ ├── computer
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer
│ │ │ │ ├── __init__.py
│ │ │ │ ├── computer.py
│ │ │ │ ├── diorama_computer.py
│ │ │ │ ├── helpers.py
│ │ │ │ ├── interface
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ ├── models.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── logger.py
│ │ │ │ ├── models.py
│ │ │ │ ├── providers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cloud
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── docker
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── lume
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── lume_api.py
│ │ │ │ │ ├── lumier
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── types.py
│ │ │ │ │ └── winsandbox
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── provider.py
│ │ │ │ │ └── setup_script.ps1
│ │ │ │ ├── tracing_wrapper.py
│ │ │ │ ├── tracing.py
│ │ │ │ ├── ui
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ └── gradio
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── app.py
│ │ │ │ └── utils.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_computer.py
│ │ ├── computer-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── browser.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── diorama
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── diorama_computer.py
│ │ │ │ │ ├── diorama.py
│ │ │ │ │ ├── draw.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── safezone.py
│ │ │ │ ├── handlers
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── main.py
│ │ │ │ ├── server.py
│ │ │ │ ├── utils
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── wallpaper.py
│ │ │ │ └── watchdog.py
│ │ │ ├── examples
│ │ │ │ ├── __init__.py
│ │ │ │ └── usage_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ ├── run_server.py
│ │ │ ├── test_connection.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_server.py
│ │ ├── core
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── core
│ │ │ │ ├── __init__.py
│ │ │ │ └── telemetry
│ │ │ │ ├── __init__.py
│ │ │ │ └── posthog.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_telemetry.py
│ │ ├── mcp-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── build-extension.py
│ │ │ ├── CONCURRENT_SESSIONS.md
│ │ │ ├── desktop-extension
│ │ │ │ ├── cua-extension.mcpb
│ │ │ │ ├── desktop_extension.png
│ │ │ │ ├── manifest.json
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── run_server.sh
│ │ │ │ └── setup.py
│ │ │ ├── mcp_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── server.py
│ │ │ │ └── session_manager.py
│ │ │ ├── pdm.lock
│ │ │ ├── pyproject.toml
│ │ │ ├── QUICK_TEST_COMMANDS.sh
│ │ │ ├── quick_test_local_option.py
│ │ │ ├── README.md
│ │ │ ├── scripts
│ │ │ │ ├── install_mcp_server.sh
│ │ │ │ └── start_mcp_server.sh
│ │ │ ├── test_mcp_server_local_option.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_mcp_server.py
│ │ ├── pylume
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_pylume.py
│ │ └── som
│ │ ├── .bumpversion.cfg
│ │ ├── LICENSE
│ │ ├── poetry.toml
│ │ ├── pyproject.toml
│ │ ├── README.md
│ │ ├── som
│ │ │ ├── __init__.py
│ │ │ ├── detect.py
│ │ │ ├── detection.py
│ │ │ ├── models.py
│ │ │ ├── ocr.py
│ │ │ ├── util
│ │ │ │ └── utils.py
│ │ │ └── visualization.py
│ │ └── tests
│ │ ├── conftest.py
│ │ └── test_omniparser.py
│ ├── qemu-docker
│ │ ├── linux
│ │ │ ├── Dockerfile
│ │ │ ├── README.md
│ │ │ └── src
│ │ │ ├── entry.sh
│ │ │ └── vm
│ │ │ ├── image
│ │ │ │ └── README.md
│ │ │ └── setup
│ │ │ ├── install.sh
│ │ │ ├── setup-cua-server.sh
│ │ │ └── setup.sh
│ │ ├── README.md
│ │ └── windows
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── entry.sh
│ │ └── vm
│ │ ├── image
│ │ │ └── README.md
│ │ └── setup
│ │ ├── install.bat
│ │ ├── on-logon.ps1
│ │ ├── setup-cua-server.ps1
│ │ ├── setup-utils.psm1
│ │ └── setup.ps1
│ ├── typescript
│ │ ├── .gitignore
│ │ ├── .nvmrc
│ │ ├── agent
│ │ │ ├── examples
│ │ │ │ ├── playground-example.html
│ │ │ │ └── README.md
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── client.ts
│ │ │ │ ├── index.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ └── client.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── computer
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── computer
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── providers
│ │ │ │ │ │ ├── base.ts
│ │ │ │ │ │ ├── cloud.ts
│ │ │ │ │ │ └── index.ts
│ │ │ │ │ └── types.ts
│ │ │ │ ├── index.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── base.ts
│ │ │ │ │ ├── factory.ts
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── linux.ts
│ │ │ │ │ ├── macos.ts
│ │ │ │ │ └── windows.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ ├── computer
│ │ │ │ │ └── cloud.test.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── factory.test.ts
│ │ │ │ │ ├── index.test.ts
│ │ │ │ │ ├── linux.test.ts
│ │ │ │ │ ├── macos.test.ts
│ │ │ │ │ └── windows.test.ts
│ │ │ │ └── setup.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── core
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── index.ts
│ │ │ │ └── telemetry
│ │ │ │ ├── clients
│ │ │ │ │ ├── index.ts
│ │ │ │ │ └── posthog.ts
│ │ │ │ └── index.ts
│ │ │ ├── tests
│ │ │ │ └── telemetry.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── cua-cli
│ │ │ ├── .gitignore
│ │ │ ├── .prettierrc
│ │ │ ├── bun.lock
│ │ │ ├── CLAUDE.md
│ │ │ ├── index.ts
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── auth.ts
│ │ │ │ ├── cli.ts
│ │ │ │ ├── commands
│ │ │ │ │ ├── auth.ts
│ │ │ │ │ └── sandbox.ts
│ │ │ │ ├── config.ts
│ │ │ │ ├── http.ts
│ │ │ │ ├── storage.ts
│ │ │ │ └── util.ts
│ │ │ └── tsconfig.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── pnpm-workspace.yaml
│ │ └── README.md
│ └── xfce
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Development.md
│ ├── Dockerfile
│ ├── Dockerfile.dev
│ ├── README.md
│ └── src
│ ├── scripts
│ │ ├── resize-display.sh
│ │ ├── start-computer-server.sh
│ │ ├── start-novnc.sh
│ │ ├── start-vnc.sh
│ │ └── xstartup.sh
│ ├── supervisor
│ │ └── supervisord.conf
│ └── xfce-config
│ ├── helpers.rc
│ ├── xfce4-power-manager.xml
│ └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│ ├── agent_nb.ipynb
│ ├── blog
│ │ ├── build-your-own-operator-on-macos-1.ipynb
│ │ └── build-your-own-operator-on-macos-2.ipynb
│ ├── composite_agents_docker_nb.ipynb
│ ├── computer_nb.ipynb
│ ├── computer_server_nb.ipynb
│ ├── customizing_computeragent.ipynb
│ ├── eval_osworld.ipynb
│ ├── ollama_nb.ipynb
│ ├── README.md
│ ├── sota_hackathon_cloud.ipynb
│ └── sota_hackathon.ipynb
├── package-lock.json
├── package.json
├── pnpm-lock.yaml
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── scripts
│ ├── install-cli.ps1
│ ├── install-cli.sh
│ ├── playground-docker.sh
│ ├── playground.sh
│ ├── run-docker-dev.sh
│ └── typescript-typecheck.js
├── TESTING.md
├── tests
│ ├── agent_loop_testing
│ │ ├── agent_test.py
│ │ └── README.md
│ ├── pytest.ini
│ ├── shell_cmd.py
│ ├── test_files.py
│ ├── test_mcp_server_session_management.py
│ ├── test_mcp_server_streaming.py
│ ├── test_shell_bash.py
│ ├── test_telemetry.py
│ ├── test_tracing.py
│ ├── test_venv.py
│ └── test_watchdog.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/libs/python/agent/agent/adapters/huggingfacelocal_adapter.py:
--------------------------------------------------------------------------------
```python
1 | import asyncio
2 | import functools
3 | import warnings
4 | from concurrent.futures import ThreadPoolExecutor
5 | from typing import Any, AsyncIterator, Dict, Iterator, List, Optional
6 |
7 | from litellm import acompletion, completion
8 | from litellm.llms.custom_llm import CustomLLM
9 | from litellm.types.utils import GenericStreamingChunk, ModelResponse
10 |
11 | # Try to import HuggingFace dependencies
12 | try:
13 | import torch
14 | from transformers import AutoModelForImageTextToText, AutoProcessor
15 |
16 | HF_AVAILABLE = True
17 | except ImportError:
18 | HF_AVAILABLE = False
19 |
20 | from .models import load_model as load_model_handler
21 |
22 |
class HuggingFaceLocalAdapter(CustomLLM):
    """HuggingFace Local Adapter for running vision-language models locally.

    LiteLLM custom-provider adapter: model handlers are created lazily per
    model name and cached, and generation runs on a single-worker thread pool
    so the async entry points never block the event loop.
    """

    # Fallback model used when the caller does not pass ``model=...``.
    DEFAULT_MODEL = "ByteDance-Seed/UI-TARS-1.5-7B"

    def __init__(self, device: str = "auto", trust_remote_code: bool = False, **kwargs):
        """Initialize the adapter.

        Args:
            device: Device to load model on ("auto", "cuda", "cpu", etc.)
            trust_remote_code: Whether to trust remote code when loading models
            **kwargs: Additional arguments (currently unused)
        """
        super().__init__()
        self.device = device
        self.trust_remote_code = trust_remote_code
        # Cache for model handlers keyed by model_name so each model is only
        # loaded once per adapter instance.
        self._handlers: Dict[str, Any] = {}
        # Single worker: generation requests are serialized to avoid
        # concurrent use of one local model instance.
        self._executor = ThreadPoolExecutor(max_workers=1)

    def _get_handler(self, model_name: str):
        """Get or create (and cache) a model handler for *model_name*."""
        if model_name not in self._handlers:
            self._handlers[model_name] = load_model_handler(
                model_name=model_name, device=self.device, trust_remote_code=self.trust_remote_code
            )
        return self._handlers[model_name]

    def _convert_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Convert OpenAI format messages to HuggingFace format.

        Args:
            messages: Messages in OpenAI format (content either a plain string
                or a list of ``text`` / ``image_url`` parts)

        Returns:
            Messages in HuggingFace format (``text`` / ``image`` parts)
        """
        converted_messages = []

        for message in messages:
            converted_message = {"role": message["role"], "content": []}

            content = message.get("content", [])
            if isinstance(content, str):
                # Simple text content
                converted_message["content"].append({"type": "text", "text": content})
            elif isinstance(content, list):
                # Multi-modal content
                for item in content:
                    if item.get("type") == "text":
                        converted_message["content"].append(
                            {"type": "text", "text": item.get("text", "")}
                        )
                    elif item.get("type") == "image_url":
                        # Convert image_url format to image format
                        image_url = item.get("image_url", {}).get("url", "")
                        converted_message["content"].append({"type": "image", "image": image_url})

            converted_messages.append(converted_message)

        return converted_messages

    def _generate(self, **kwargs) -> str:
        """Generate a response using the local HuggingFace model.

        Args:
            **kwargs: Keyword arguments containing ``messages``, ``model``,
                and ``max_tokens``; any other keys are ignored with a warning.

        Returns:
            Generated text response

        Raises:
            ImportError: If the HuggingFace transformers extras are not installed.
        """
        if not HF_AVAILABLE:
            raise ImportError(
                "HuggingFace transformers dependencies not found. "
                'Please install with: pip install "cua-agent[uitars-hf]"'
            )

        # Extract messages and model from kwargs
        messages = kwargs.get("messages", [])
        model_name = kwargs.get("model", self.DEFAULT_MODEL)
        max_new_tokens = kwargs.get("max_tokens", 128)

        # Warn about ignored kwargs so callers notice silently dropped options
        ignored_kwargs = set(kwargs.keys()) - {"messages", "model", "max_tokens"}
        if ignored_kwargs:
            warnings.warn(f"Ignoring unsupported kwargs: {ignored_kwargs}")

        # Convert messages to HuggingFace format
        hf_messages = self._convert_messages(messages)

        # Delegate to model handler
        handler = self._get_handler(model_name)
        generated_text = handler.generate(hf_messages, max_new_tokens=max_new_tokens)
        return generated_text

    @staticmethod
    def _final_chunk(text: str) -> GenericStreamingChunk:
        """Build the single terminal streaming chunk carrying *text*."""
        return {
            "finish_reason": "stop",
            "index": 0,
            "is_finished": True,
            "text": text,
            "tool_use": None,
            "usage": {"completion_tokens": 0, "prompt_tokens": 0, "total_tokens": 0},
        }

    def completion(self, *args, **kwargs) -> ModelResponse:
        """Synchronous completion method.

        Returns:
            ModelResponse with generated text
        """
        generated_text = self._generate(**kwargs)

        # Use the same default as _generate so a missing "model" key cannot
        # raise KeyError here after generation already succeeded.
        return completion(
            model=f"huggingface-local/{kwargs.get('model', self.DEFAULT_MODEL)}",
            mock_response=generated_text,
        )

    async def acompletion(self, *args, **kwargs) -> ModelResponse:
        """Asynchronous completion method.

        Returns:
            ModelResponse with generated text
        """
        # Run _generate in the thread pool to avoid blocking the event loop.
        # get_running_loop() is the supported API inside coroutines
        # (get_event_loop() is deprecated here since Python 3.10).
        loop = asyncio.get_running_loop()
        generated_text = await loop.run_in_executor(
            self._executor, functools.partial(self._generate, **kwargs)
        )

        return await acompletion(
            model=f"huggingface-local/{kwargs.get('model', self.DEFAULT_MODEL)}",
            mock_response=generated_text,
        )

    def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
        """Synchronous streaming method.

        Generation is not incremental: the full response is produced first and
        yielded as one terminal chunk.

        Returns:
            Iterator of GenericStreamingChunk
        """
        yield self._final_chunk(self._generate(**kwargs))

    async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
        """Asynchronous streaming method.

        Generation is not incremental: the full response is produced first and
        yielded as one terminal chunk.

        Returns:
            AsyncIterator of GenericStreamingChunk
        """
        # Run _generate in the thread pool to avoid blocking the event loop.
        loop = asyncio.get_running_loop()
        generated_text = await loop.run_in_executor(
            self._executor, functools.partial(self._generate, **kwargs)
        )

        yield self._final_chunk(generated_text)
```
--------------------------------------------------------------------------------
/blog/cua-vlm-router.md:
--------------------------------------------------------------------------------
```markdown
1 | # Cua VLM Router: One Provider for All Your Computer-Use Models
2 |
3 | If you've been building computer-use agents, you know the reality: every model provider has its own specification and deployment process. Anthropic has one API format, OpenAI another, Google something else entirely. Want to try a Hugging Face model? That's a completely different setup. Self-hosting? Even more complexity. Each provider requires learning their specific API, managing their credentials, and adapting your code to their particular requirements.
4 |
5 | Today we're launching the **Cua VLM Router**: a managed inference API that gives you unified access to multiple vision-language model providers through a single API key. We're starting with Anthropic's Claude models (Sonnet 4.5 and Haiku 4.5)—some of the most loved and widely-used computer-use models in the Cua ecosystem—with more providers coming soon.
6 |
7 | 
8 |
9 | ## What You Get
10 |
11 | The Cua VLM Router handles the infrastructure so you can focus on building:
12 |
13 | **Single API Key**
14 |
15 | - One key for all model providers (no juggling multiple credentials)
16 | - Works for both model inference and sandbox access
17 | - Manage everything from one dashboard at cua.ai
18 |
19 | **Smart Routing**
20 |
21 | - Automatic provider selection for optimal availability and performance
22 | - For Anthropic models, we route to the best provider (Anthropic, AWS Bedrock, or Microsoft Foundry)
23 | - No configuration needed—just specify the model and we handle the rest
24 |
25 | **Cost Tracking & Optimization**
26 |
27 | - Unified usage dashboard across all models
28 | - Real-time credit balance tracking
29 | - Detailed cost breakdown per request (gateway cost + upstream cost)
30 |
31 | **Production-Ready**
32 |
33 | - OpenAI-compatible API (drop-in replacement for existing code)
34 | - Full streaming support with Server-Sent Events
35 | - Metadata about routing decisions in every response
36 |
37 | ## Available Models (Launch)
38 |
39 | We're starting with Anthropic's latest Claude models:
40 |
41 | | Model | Best For |
42 | | --------------------------------- | ---------------------------------- |
43 | | `cua/anthropic/claude-sonnet-4.5` | General-purpose tasks, recommended |
44 | | `cua/anthropic/claude-haiku-4.5` | Fast responses, cost-effective |
45 |
46 | ## How It Works
47 |
48 | When you request an Anthropic model through Cua, we automatically route to the best available provider—whether that's Anthropic directly, AWS Bedrock, or Microsoft Foundry. You just specify `cua/anthropic/claude-sonnet-4.5`, and we handle the provider selection, failover, and optimization behind the scenes. No need to manage multiple accounts or implement fallback logic yourself.
49 |
50 | ## Getting Started
51 |
52 | Sign up at [cua.ai/signin](https://cua.ai/signin) and create your API key from **Dashboard > API Keys > New API Key** (save it immediately—you won't see it again).
53 |
54 | Use it with the Agent SDK (make sure to set your environment variable):
55 |
56 | ```python
57 | import asyncio
58 | from agent import ComputerAgent
59 | from computer import Computer
60 |
61 | async def main():
62 | # Initialize cloud computer
63 | computer = Computer(
64 | os_type="linux",
65 | provider_type="cloud",
66 | name="your-container-name",
67 | api_key="your-cua-api-key"
68 | )
69 |
70 | # Initialize agent with Claude Sonnet 4.5
71 | agent = ComputerAgent(
72 | tools=[computer],
73 | model="cua/anthropic/claude-sonnet-4.5",
74 | api_key="your-cua-api-key",
75 | instructions="You are a helpful assistant that can control computers",
76 | only_n_most_recent_images=3
77 | )
78 |
79 | # Run a task
80 | async for result in agent.run("Open a browser and search for Python tutorials"):
81 | print(result)
82 |
83 | if __name__ == "__main__":
84 | asyncio.run(main())
85 | ```
86 |
87 | ## Migration is Simple
88 |
89 | Already using Anthropic directly? Just add the `cua/` prefix:
90 |
91 | **Before:**
92 |
93 | ```python
94 | export ANTHROPIC_API_KEY="sk-ant-..."
95 | agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
96 | ```
97 |
98 | **After:**
99 |
100 | ```python
101 | export CUA_API_KEY="sk_cua-api01_..."
102 | agent = ComputerAgent(model="cua/anthropic/claude-sonnet-4.5")
103 | ```
104 |
105 | Same code structure. No other changes needed.
106 |
107 | ## Direct API Access
108 |
109 | The router exposes an OpenAI-compatible API at `https://inference.cua.ai/v1`:
110 |
111 | ```bash
112 | curl -X POST https://inference.cua.ai/v1/chat/completions \
113 | -H "Authorization: Bearer ${CUA_API_KEY}" \
114 | -H "Content-Type: application/json" \
115 | -d '{
116 | "model": "anthropic/claude-sonnet-4.5",
117 | "messages": [{"role": "user", "content": "Hello!"}],
118 | "stream": true
119 | }'
120 | ```
121 |
122 | Works with any OpenAI-compatible client library.
123 |
124 | ## FAQs
125 |
126 | <details>
127 | <summary><strong>Do I still need provider API keys?</strong></summary>
128 |
129 | No. Cua manages all provider API keys and infrastructure. You only need one Cua API key for everything—model inference and sandbox access.
130 |
131 | </details>
132 |
133 | <details>
134 | <summary><strong>How does pricing work?</strong></summary>
135 |
136 | Requests are billed in credits, deducted from your Cua account balance. Every response includes both the Cua gateway cost and the actual upstream API cost for transparency.
137 |
138 | </details>
139 |
140 | <details>
141 | <summary><strong>Can I still use my own Anthropic key (BYOK)?</strong></summary>
142 |
143 | Yes. The agent SDK still supports direct provider access. Just use `anthropic/claude-sonnet-4-5-20250929` instead of the `cua/` prefix and set your `ANTHROPIC_API_KEY`. See [Supported Model Providers](https://cua.ai/docs/agent-sdk/supported-model-providers/) for details.
144 |
145 | </details>
146 |
147 | <details>
148 | <summary><strong>What about other providers?</strong></summary>
149 |
150 | We're starting with Anthropic and adding more providers based on what people actually use. Request access to specific models in [Discord](https://discord.gg/cua-ai).
151 |
152 | </details>
153 |
154 | <details>
155 | <summary><strong>Does streaming work?</strong></summary>
156 |
157 | Yes. Set `"stream": true` in your request to receive Server-Sent Events. Works identically to OpenAI's streaming API.
158 |
159 | </details>
160 |
161 | ## What's Next
162 |
163 | This is just the beginning. We're actively iterating based on feedback:
164 |
165 | - Additional model providers
166 | - Custom model routing rules
167 | - Usage alerts and budget controls
168 | - Team collaboration features
169 |
170 | If there's a model or feature you need, let us know in [Discord](https://discord.gg/cua-ai).
171 |
172 | ## Need Help?
173 |
174 | - **Documentation**: [cua.ai/docs/agent-sdk/supported-model-providers/cua-vlm-router](https://cua.ai/docs/agent-sdk/supported-model-providers/cua-vlm-router)
175 | - **Quickstart Guide**: [cua.ai/docs/get-started/quickstart](https://cua.ai/docs/get-started/quickstart)
176 | - **Discord Community**: [discord.gg/cua-ai](https://discord.gg/cua-ai)
177 |
178 | ---
179 |
180 | Get started at [cua.ai](https://cua.ai) or check out the [VLM Router docs](https://cua.ai/docs/agent-sdk/supported-model-providers/cua-vlm-router).
181 |
```
--------------------------------------------------------------------------------
/tests/test_venv.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Virtual Environment Testing Module
3 | This module tests the ability to execute python code in a virtual environment within Cua Containers.
4 |
5 | Required environment variables:
6 | - CUA_API_KEY: API key for Cua cloud provider
7 | - CUA_CONTAINER_NAME: Name of the container to use
8 | """
9 |
10 | import asyncio
11 | import os
12 | import sys
13 | import traceback
14 | from pathlib import Path
15 |
16 | import pytest
17 |
# Load environment variables from .env at the repo root (tests/ -> parent.parent)
project_root = Path(__file__).parent.parent
env_file = project_root / ".env"
print(f"Loading environment from: {env_file}")
from dotenv import load_dotenv

load_dotenv(env_file)

# Add paths to sys.path if needed.
# PYTHONPATH is split on ":" (POSIX convention); each entry is prepended so
# workspace packages take precedence over any globally installed copies.
pythonpath = os.environ.get("PYTHONPATH", "")
for path in pythonpath.split(":"):
    if path and path not in sys.path:
        sys.path.insert(0, path)  # Insert at beginning to prioritize
        print(f"Added to sys.path: {path}")
32 |
33 | from computer import Computer, VMProviderType
34 | from computer.helpers import sandboxed, set_default_computer
35 |
36 |
@pytest.fixture(scope="session")
async def computer():
    """Session-wide Computer connected to a Cua cloud Linux container.

    Credentials come from CUA_API_KEY / CUA_CONTAINER_NAME (loaded from .env
    at import time). The connection is torn down when the session ends.
    """
    cloud_vm = Computer(
        os_type="linux",
        api_key=os.getenv("CUA_API_KEY"),
        name=str(os.getenv("CUA_CONTAINER_NAME")),
        provider_type=VMProviderType.CLOUD,
    )

    # Alternative: run against a local macOS computer with `Computer()`.

    try:
        await cloud_vm.run()
        yield cloud_vm
    finally:
        # Always disconnect, even if a test errored mid-session.
        await cloud_vm.disconnect()
56 |
57 |
# Sample test cases
@pytest.mark.asyncio(loop_scope="session")
async def test_venv_install(computer):
    """Creating a venv and installing a package into it should succeed."""
    out, _ = await computer.venv_install("test_env", ["requests"])

    # A fresh install or an already-present package both count as success.
    success_markers = ("Successfully installed", "Requirement already satisfied")
    assert any(marker in out for marker in success_markers)
67 |
68 |
@pytest.mark.asyncio(loop_scope="session")
async def test_venv_cmd(computer):
    """Shell commands should execute inside the virtual environment."""
    # `python --version` prints e.g. "Python 3.x.y" on the venv interpreter.
    version_output, _ = await computer.venv_cmd("test_env", "python --version")
    assert "Python" in version_output
76 |
77 |
@pytest.mark.asyncio(loop_scope="session")
async def test_venv_exec(computer):
    """Arbitrary Python callables should execute inside the venv."""

    def probe(message="Hello World"):
        import sys

        return f"Python {sys.version_info.major}.{sys.version_info.minor}: {message}"

    # Keyword arguments are forwarded to the remotely executed callable.
    reply = await computer.venv_exec("test_env", probe, message="Test successful!")

    assert "Python" in reply
    assert "Test successful!" in reply
91 |
92 |
@pytest.mark.asyncio(loop_scope="session")
async def test_venv_exec_with_package(computer):
    """Remotely executed functions should see packages installed in the venv."""

    def report_requests_version():
        import requests

        return f"requests version: {requests.__version__}"

    reply = await computer.venv_exec("test_env", report_requests_version)
    assert "requests version:" in reply
105 |
106 |
@pytest.mark.asyncio(loop_scope="session")
async def test_venv_exec_error_handling(computer):
    """Exceptions raised in the remote venv should propagate to the caller."""

    def boom():
        raise ValueError("This is a test error")

    # The original exception type and message must survive the round trip.
    with pytest.raises(ValueError, match="This is a test error"):
        await computer.venv_exec("test_env", boom)
116 |
117 |
@pytest.mark.asyncio(loop_scope="session")
async def test_venv_exec_with_args_kwargs(computer):
    """Positional args, varargs and keyword args must all reach the callable."""

    def create_data_object(name, age, *hobbies, **metadata):
        return {
            "name": name,
            "age": age,
            "hobbies": list(hobbies),
            "metadata": metadata,
            "status": "active",
        }

    outcome = await computer.venv_exec(
        "test_env",
        create_data_object,
        "Alice",
        25,
        "reading",
        "coding",
        location="New York",
        department="Engineering",
    )

    assert outcome["name"] == "Alice"
    assert outcome["age"] == 25
    assert outcome["hobbies"] == ["reading", "coding"]
    assert outcome["metadata"]["location"] == "New York"
    assert outcome["status"] == "active"
141 |
142 |
@pytest.mark.asyncio(loop_scope="session")
async def test_venv_exec_stdout_capture(computer, capfd):
    """print() output from the remote function should be mirrored locally."""

    def hello_world_function():
        print("Hello World!")
        return "Function completed"

    outcome = await computer.venv_exec("test_env", hello_world_function)

    captured_out, _ = capfd.readouterr()

    # NOTE(review): the expected value has a double newline — presumably the
    # remote runner appends a trailing newline when mirroring stdout; confirm.
    assert captured_out == "Hello World!\n\n"
    assert outcome == "Function completed"
160 |
161 |
@pytest.mark.asyncio(loop_scope="session")
async def test_remote_decorator(computer):
    """Test the @sandboxed decorator using the process-wide default computer.

    Registers the session fixture as the default target, then runs a
    decorated function and checks it reports success from inside the venv.
    """
    # Route subsequent @sandboxed calls (with no explicit computer) here.
    set_default_computer(computer)

    @sandboxed("test_env")
    def get_package_version():
        import platform
        import sys

        return {"python_version": sys.version, "platform": platform.platform(), "success": True}

    # The decorator makes the call awaitable and runs it remotely.
    result = await get_package_version()

    # Verify the function executed in the virtual environment
    assert "python_version" in result
    assert "platform" in result
    assert result["success"] is True  # identity check, not `== True` (PEP 8 / E712)
183 |
184 |
@pytest.mark.asyncio(loop_scope="session")
async def test_remote_decorator_with_custom_computer(computer):
    """@sandboxed should honor an explicitly supplied computer instance."""

    @sandboxed("test_env", computer=computer)
    def get_system_info():
        import os
        import sys

        return {
            "python_version": sys.version,
            "environment_vars": dict(os.environ),
            "working_directory": os.getcwd(),
        }

    info = await get_system_info()

    # All three keys must come back from the sandboxed execution.
    for key in ("python_version", "environment_vars", "working_directory"):
        assert key in info
    # The sandboxed interpreter runs elsewhere, so its cwd should differ
    # from this test process's cwd.
    assert info["working_directory"] != os.getcwd()
211 |
212 |
if __name__ == "__main__":
    # Allow `python test_venv.py`: hand off to pytest's CLI in verbose mode.
    pytest.main([__file__, "-v"])
216 |
```
--------------------------------------------------------------------------------
/libs/python/mcp-server/quick_test_local_option.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | Quick test to verify the local desktop option logic without full setup.
4 |
5 | This script tests the environment variable parsing and logic flow
6 | without requiring VMs, computer-server, or MCP clients to be running.
7 | """
8 |
9 | import os
10 | import sys
11 |
12 |
def test_env_var_parsing():
    """Check CUA_USE_HOST_COMPUTER_SERVER parsing against a truth table.

    Prints one PASS/FAIL line per case and returns True only when every
    case produced the expected boolean. Leaves the variable unset on exit.
    """
    print("Testing CUA_USE_HOST_COMPUTER_SERVER environment variable parsing...")
    print("-" * 60)

    test_cases = [
        # (env_value, expected_result, description)
        ("true", True, "lowercase 'true'"),
        ("True", True, "capitalized 'True'"),
        ("TRUE", True, "uppercase 'TRUE'"),
        ("1", True, "numeric '1'"),
        ("yes", True, "lowercase 'yes'"),
        ("Yes", True, "capitalized 'Yes'"),
        ("false", False, "lowercase 'false'"),
        ("False", False, "capitalized 'False'"),
        ("FALSE", False, "uppercase 'FALSE'"),
        ("0", False, "numeric '0'"),
        ("no", False, "lowercase 'no'"),
        ("", False, "empty string"),
        ("random", False, "random value"),
        (None, False, "not set (None)"),
    ]

    passed = 0
    failed = 0

    for env_value, expected, description in test_cases:
        # Simulate the logic from session_manager.py line 59
        if env_value is None:
            # BUGFIX: actually unset the variable for the "not set" case.
            # Previously this branch read whatever value the preceding
            # iteration had left behind, so an unset environment was never
            # really exercised (it only passed because the leftover value
            # happened to be falsy).
            os.environ.pop("CUA_USE_HOST_COMPUTER_SERVER", None)
        else:
            os.environ["CUA_USE_HOST_COMPUTER_SERVER"] = env_value
        actual = os.getenv("CUA_USE_HOST_COMPUTER_SERVER", "false").lower() in (
            "true",
            "1",
            "yes",
        )

        status = "✓ PASS" if actual == expected else "✗ FAIL"
        if actual == expected:
            passed += 1
        else:
            failed += 1

        print(
            f"{status} | Value: {env_value!r:15} | Expected: {expected!s:5} | Got: {actual!s:5} | {description}"
        )

    # Clean up
    os.environ.pop("CUA_USE_HOST_COMPUTER_SERVER", None)

    print("-" * 60)
    print(f"Results: {passed} passed, {failed} failed")
    return failed == 0
71 |
72 |
def test_session_manager_logic():
    """Statically verify the expected logic exists in session_manager.py.

    Reads the module's source text and checks for the env-var lookup, the
    parameter pass-through, and the Computer() instantiation. Returns True
    only when all checks pass; returns False (with a message) if the file
    cannot be found.
    """
    print("\nTesting session_manager.py logic flow...")
    print("-" * 60)

    import pathlib

    # session_manager.py lives alongside this script inside the mcp-server
    # package: <this dir>/mcp_server/session_manager.py.
    # BUGFIX: the old path went up two levels and re-appended
    # libs/python/mcp-server/..., which only resolved if the script lived in
    # <repo>/tests — TODO(review): confirm against the actual repo layout.
    session_manager_path = pathlib.Path(__file__).parent / "mcp_server" / "session_manager.py"

    if not session_manager_path.exists():
        print(f"✗ FAIL | session_manager.py not found at {session_manager_path}")
        return False

    content = session_manager_path.read_text()

    # Check for the key logic
    checks = [
        ('os.getenv("CUA_USE_HOST_COMPUTER_SERVER"', "Environment variable check present"),
        ("use_host_computer_server=use_host", "use_host_computer_server parameter passed"),
        ("Computer(", "Computer instantiation present"),
    ]

    all_checks_passed = True
    for check_str, description in checks:
        if check_str in content:
            print(f"✓ PASS | {description}")
        else:
            print(f"✗ FAIL | {description} - not found")
            all_checks_passed = False

    print("-" * 60)
    return all_checks_passed
113 |
114 |
def test_documentation_consistency():
    """Verify the mcp-server docs mention the local-desktop feature.

    Checks each doc file exists and contains its expected marker string.
    Returns True only when every doc passes; missing files count as failures.
    """
    print("\nTesting documentation consistency...")
    print("-" * 60)

    import pathlib

    docs_to_check = [
        ("configuration.mdx", "CUA_USE_HOST_COMPUTER_SERVER"),
        ("usage.mdx", "Targeting Your Local Desktop"),
    ]

    # Resolve the repository root: this script sits in libs/python/mcp-server,
    # so the root is three directories up; docs/ hangs off the root.
    # BUGFIX: the old parent.parent only resolved when the script lived in
    # <repo>/tests — TODO(review): confirm against the actual repo layout.
    script_dir = pathlib.Path(__file__).resolve().parent
    repo_root = script_dir.parents[2] if len(script_dir.parents) > 2 else script_dir
    docs_path = repo_root / "docs" / "content" / "docs" / "libraries" / "mcp-server"

    all_docs_ok = True
    for doc_file, expected_content in docs_to_check:
        doc_path = docs_path / doc_file
        if not doc_path.exists():
            print(f"✗ FAIL | {doc_file} not found")
            all_docs_ok = False
            continue

        content = doc_path.read_text()
        if expected_content in content:
            print(f"✓ PASS | {doc_file} contains '{expected_content}'")
        else:
            print(f"✗ FAIL | {doc_file} missing '{expected_content}'")
            all_docs_ok = False

    print("-" * 60)
    return all_docs_ok
153 |
154 |
def print_usage_examples():
    """Show copy-pasteable MCP client configs for both execution modes."""
    banner = "=" * 60
    rule = "-" * 60

    print("\n" + banner)
    print("USAGE EXAMPLES")
    print(banner)

    print("\n1. DEFAULT MODE (VM):")
    print(rule)
    print(
        """
{
  "mcpServers": {
    "cua-agent": {
      "command": "/bin/bash",
      "args": ["~/.cua/start_mcp_server.sh"],
      "env": {
        "CUA_MODEL_NAME": "anthropic/claude-sonnet-4-5-20250929"
      }
    }
  }
}

Note: CUA_USE_HOST_COMPUTER_SERVER is not set, so VM mode is used (safe).
"""
    )

    print("\n2. LOCAL DESKTOP MODE:")
    print(rule)
    print(
        """
Step 1: Start computer-server locally:
  python -m computer_server

Step 2: Configure MCP client:
{
  "mcpServers": {
    "cua-agent": {
      "command": "/bin/bash",
      "args": ["~/.cua/start_mcp_server.sh"],
      "env": {
        "CUA_MODEL_NAME": "anthropic/claude-sonnet-4-5-20250929",
        "CUA_USE_HOST_COMPUTER_SERVER": "true"
      }
    }
  }
}

⚠️ WARNING: AI will have direct access to your desktop!
"""
    )
205 |
206 |
def main():
    """Run all quick tests and return a process exit status (0 = success)."""
    banner = "=" * 60
    print(banner)
    print("QUICK TEST: MCP Server Local Desktop Option")
    print(banner)
    print()

    # Each entry pairs a display name with the boolean result of one check.
    results = [
        ("Environment Variable Parsing", test_env_var_parsing()),
        ("Session Manager Logic", test_session_manager_logic()),
        ("Documentation Consistency", test_documentation_consistency()),
    ]

    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    for test_name, ok in results:
        print(f"{'✓ PASSED' if ok else '✗ FAILED'} | {test_name}")

    # Guard clause: bail out with a nonzero status on any failure.
    if not all(ok for _, ok in results):
        print("\n❌ Some tests failed. Please review the output above.")
        return 1

    print("\n🎉 All quick tests passed!")
    print_usage_examples()
    print("\nNext steps:")
    print("1. Run full automated tests: pytest tests/test_mcp_server_local_option.py")
    print("2. Follow manual testing guide: tests/MANUAL_TEST_LOCAL_OPTION.md")
    return 0
241 |
242 |
if __name__ == "__main__":
    # Propagate main()'s status (0 = all checks passed) to the shell.
    sys.exit(main())
245 |
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/agent-loops.mdx:
--------------------------------------------------------------------------------
```markdown
1 | ---
2 | title: Agent Loops
3 | description: Supported computer-using agent loops and models
4 | ---
5 |
6 | <Callout>
7 | A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/agent_nb.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.
8 | </Callout>
9 |
10 | An agent can be thought of as a loop - it generates actions, executes them, and repeats until done:
11 |
12 | 1. **Generate**: Your `model` generates `output_text`, `computer_call`, `function_call`
13 | 2. **Execute**: The `computer` safely executes those items
14 | 3. **Complete**: If the model has no more calls, it's done!
15 |
16 | To run an agent loop simply do:
17 |
18 | ```python
19 | from agent import ComputerAgent
20 | import asyncio
21 | from computer import Computer
22 |
23 |
24 | async def take_screenshot():
25 | async with Computer(
26 | os_type="linux",
27 | provider_type="cloud",
28 | name="your-sandbox-name",
29 | api_key="your-api-key"
30 | ) as computer:
31 |
32 | agent = ComputerAgent(
33 | model="anthropic/claude-sonnet-4-5-20250929",
34 | tools=[computer],
35 | max_trajectory_budget=5.0
36 | )
37 |
38 | messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
39 |
40 | async for result in agent.run(messages):
41 | for item in result["output"]:
42 | if item["type"] == "message":
43 | print(item["content"][0]["text"])
44 |
45 |
46 | if __name__ == "__main__":
47 | asyncio.run(take_screenshot())
48 | ```
49 |
50 | For a list of supported models and configurations, see the [Supported Agents](./supported-agents/computer-use-agents) page.
51 |
52 | ### Response Format
53 |
54 | ```python
55 | {
56 | "output": [
57 | {
58 | "type": "message",
59 | "role": "assistant",
60 | "content": [{"type": "output_text", "text": "I can see..."}]
61 | },
62 | {
63 | "type": "computer_call",
64 | "action": {"type": "screenshot"},
65 | "call_id": "call_123"
66 | },
67 | {
68 | "type": "computer_call_output",
69 | "call_id": "call_123",
70 | "output": {"image_url": "data:image/png;base64,..."}
71 | }
72 | ],
73 | "usage": {
74 | "prompt_tokens": 150,
75 | "completion_tokens": 75,
76 | "total_tokens": 225,
77 | "response_cost": 0.01,
78 | }
79 | }
80 | ```
81 |
82 | ### Environment Variables
83 |
84 | Use the following environment variables to configure the agent and its access to cloud computers and LLM providers:
85 |
86 | ```bash
87 | # Computer instance (cloud)
88 | export CUA_SANDBOX_NAME="your-sandbox-name"
89 | export CUA_API_KEY="your-cua-api-key"
90 |
91 | # LLM API keys
92 | export ANTHROPIC_API_KEY="your-anthropic-key"
93 | export OPENAI_API_KEY="your-openai-key"
94 | ```
95 |
96 | ### Input and output
97 |
98 | The input prompt passed to `Agent.run` can either be a string or a list of message dictionaries:
99 |
100 | ```python
101 | messages = [
102 | {
103 | "role": "user",
104 | "content": "Take a screenshot and describe what you see"
105 | },
106 | {
107 | "role": "assistant",
108 | "content": "I'll take a screenshot for you."
109 | }
110 | ]
111 | ```
112 |
113 | The output is an AsyncGenerator that yields response chunks.
114 |
115 | ### Parameters
116 |
117 | The `ComputerAgent` constructor provides a wide range of options for customizing agent behavior, tool integration, callbacks, resource management, and more.
118 |
119 | - `model` (`str`): Default: **required**
120 | The LLM or agent model to use. Determines which agent loop is selected unless `custom_loop` is provided. (e.g., "claude-sonnet-4-5-20250929", "computer-use-preview", "omni+vertex_ai/gemini-pro")
121 | - `tools` (`List[Any]`):
122 | List of tools the agent can use (e.g., `Computer`, sandboxed Python functions, etc.).
123 | - `custom_loop` (`Callable`):
124 | Optional custom agent loop function. If provided, overrides automatic loop selection.
125 | - `only_n_most_recent_images` (`int`):
126 | If set, only the N most recent images are kept in the message history. Useful for limiting memory usage. Automatically adds `ImageRetentionCallback`.
127 | - `callbacks` (`List[Any]`):
128 | List of callback instances for advanced preprocessing, postprocessing, logging, or custom hooks. See [Callbacks & Extensibility](#callbacks--extensibility).
129 | - `verbosity` (`int`):
130 | Logging level (e.g., `logging.INFO`). If set, adds a logging callback.
131 | - `trajectory_dir` (`str`):
132 | Directory path to save full trajectory data, including screenshots and responses. Adds `TrajectorySaverCallback`.
133 | - `max_retries` (`int`): Default: `3`
134 | Maximum number of retries for failed API calls (default: 3).
135 | - `screenshot_delay` (`float` | `int`): Default: `0.5`
136 | Delay (in seconds) before taking screenshots (default: 0.5).
137 | - `use_prompt_caching` (`bool`): Default: `False`
138 | Enables prompt caching for repeated prompts (mainly for Anthropic models).
139 | - `max_trajectory_budget` (`float` | `dict`):
140 | If set (float or dict), adds a budget manager callback that tracks usage costs and stops execution if the budget is exceeded. Dict allows advanced options (e.g., `{ "max_budget": 5.0, "raise_error": True }`).
141 | - `instructions` (`str` | `list[str]`):
142 | System instructions for the agent. Can be a single string or multiple strings in a tuple/list for readability; they are concatenated into one system prompt.
143 | - `api_key` (`str`):
144 | Optional API key override for the model provider.
145 | - `api_base` (`str`):
146 | Optional API base URL override for the model provider.
147 | - `**additional_generation_kwargs` (`any`):
148 | Any additional keyword arguments are passed through to the agent loop or model provider.
149 |
150 | **Example with advanced options:**
151 |
152 | ```python
153 | from agent import ComputerAgent
154 | from computer import Computer
155 | from agent.callbacks import ImageRetentionCallback
156 |
157 | agent = ComputerAgent(
158 | model="anthropic/claude-sonnet-4-5-20250929",
159 | tools=[Computer(...)],
160 | only_n_most_recent_images=3,
161 | callbacks=[ImageRetentionCallback(only_n_most_recent_images=3)],
162 | verbosity=logging.INFO,
163 | trajectory_dir="trajectories",
164 | max_retries=5,
165 | screenshot_delay=1.0,
166 | use_prompt_caching=True,
167 | max_trajectory_budget={"max_budget": 5.0, "raise_error": True},
168 | instructions=(
169 |         "You are a helpful computer-using agent. "
170 |         "Output computer calls until you complete the given task."
171 | ),
172 | api_key="your-api-key",
173 | api_base="https://your-api-base.com/v1",
174 | )
175 | ```
176 |
177 | ### Streaming Responses
178 |
179 | ```python
180 | async for result in agent.run(messages, stream=True):
181 | # Process streaming chunks
182 | for item in result["output"]:
183 | if item["type"] == "message":
184 | print(item["content"][0]["text"], end="", flush=True)
185 | elif item["type"] == "computer_call":
186 | action = item["action"]
187 | print(f"\n[Action: {action['type']}]")
188 | ```
189 |
190 | ### Error Handling
191 |
192 | ```python
193 | try:
194 | async for result in agent.run(messages):
195 | # Process results
196 | pass
197 | except BudgetExceededException:
198 | print("Budget limit exceeded")
199 | except Exception as e:
200 | print(f"Agent error: {e}")
201 | ```
202 |
```
--------------------------------------------------------------------------------
/scripts/install-cli.sh:
--------------------------------------------------------------------------------
```bash
1 | #!/bin/bash
2 | set -e
3 |
4 | # CUA CLI Installation Script for macOS/Linux
5 | echo "🚀 Installing CUA CLI..."
6 |
# Print a post-install summary.
#   $1 = path to the installed `cua` binary
#   $2 = installed version string
#   $3 = shell config file whose PATH was updated
print_success() {
    local bin_path="$1"
    local version="$2"
    local config_file="$3"

    # \033[32m = green, \033[90m = dim gray, \033[0m = reset.
    printf "\033[32m✅ CUA CLI %s was installed successfully to %s\033[0m\n" "$version" "$bin_path"
    printf "\033[90mAdded \"%s\" to \$PATH in \"%s\"\033[0m\n" "$bin_path" "$config_file"
    printf "\n\033[90mTo get started, run:\033[0m\n"
    printf "  source %s\n" "$config_file"
    printf "  cua --help\n"
    printf "\033[90m📚 For more help, visit: https://docs.cua.ai/libraries/cua-cli\033[0m\n"
}
20 |
# Fallback installer: bootstrap Bun if needed, then install the CLI from npm
# (via `bun add -g`, falling back to `npm install -g`). Terminates the whole
# script: exit 0 on success, exit 1 on unrecoverable failure.
install_with_bun() {
    echo "📦 Installing CUA CLI using Bun..."

    # Check if bun is already installed
    if ! command -v bun &> /dev/null; then
        echo "📦 Installing Bun..."
        curl -fsSL https://bun.sh/install | bash

        # Source the shell profile to make bun available
        if [ -f "$HOME/.bashrc" ]; then
            source "$HOME/.bashrc"
        elif [ -f "$HOME/.zshrc" ]; then
            source "$HOME/.zshrc"
        fi

        # Add bun to PATH for this session
        export PATH="$HOME/.bun/bin:$PATH"
    fi

    # Verify bun installation
    if ! command -v bun &> /dev/null; then
        echo "❌ Failed to install Bun. Please install manually from https://bun.sh"
        exit 1
    fi

    echo "📦 Installing CUA CLI..."
    if ! bun add -g @trycua/cli; then
        echo "❌ Failed to install with Bun, trying npm..."
        if ! npm install -g @trycua/cli; then
            echo "❌ Installation failed. Please try installing manually:"
            echo "   npm install -g @trycua/cli"
            exit 1
        fi
    fi

    # Verify installation
    if command -v cua &> /dev/null; then
        # Determine which config file was updated
        local config_file="$HOME/.bashrc"
        if [ -f "$HOME/.zshrc" ]; then
            config_file="$HOME/.zshrc"
        elif [ -f "$HOME/.profile" ]; then
            config_file="$HOME/.profile"
        fi
        # Determine installed version via npm registry (fallback to unknown).
        # NOTE(review): this queries the registry's latest version, which may
        # differ from what was just installed — confirm acceptable.
        local VERSION_BUN
        VERSION_BUN=$(npm view @trycua/cli version 2>/dev/null || echo "unknown")
        # Write version file to ~/.cua/bin/.version
        local INSTALL_DIR="$HOME/.cua/bin"
        mkdir -p "$INSTALL_DIR"
        echo "$VERSION_BUN" > "$INSTALL_DIR/.version"
        # Print success and exit
        print_success "$(command -v cua)" "$VERSION_BUN" "$config_file"
        exit 0
    else
        echo "❌ Installation failed. Please try installing manually:"
        echo "   npm install -g @trycua/cli"
        exit 1
    fi
}
82 |
# --- Main flow: detect platform, download a release binary, else fall back ---

# Determine OS and architecture (normalized to release-asset naming)
OS=$(uname -s | tr '[:upper:]' '[:lower:]')
ARCH=$(uname -m)

# Map architecture to the format used in release assets
case "$ARCH" in
    x86_64) ARCH="x64" ;;
    aarch64) ARCH="arm64" ;;
    arm64) ARCH="arm64" ;;
    *) ARCH="$ARCH" ;;
esac

# Determine the binary name; only the three listed platforms ship binaries.
BINARY_NAME="cua-${OS}-${ARCH}"
if [ "$OS" = "darwin" ] && [ "$ARCH" = "arm64" ]; then
    BINARY_NAME="cua-darwin-arm64"
elif [ "$OS" = "darwin" ] && [ "$ARCH" = "x64" ]; then
    BINARY_NAME="cua-darwin-x64"
elif [ "$OS" = "linux" ] && [ "$ARCH" = "x64" ]; then
    BINARY_NAME="cua-linux-x64"
else
    echo "⚠️ Pre-built binary not available for ${OS}-${ARCH}, falling back to Bun installation"
    install_with_bun
    exit 0
fi

# Get the latest release version from the GitHub API
LATEST_RELEASE=$(curl -s https://api.github.com/repos/trycua/cua/releases/latest)
if [ -z "$LATEST_RELEASE" ]; then
    echo "⚠️ Could not fetch latest release, falling back to Bun installation"
    install_with_bun
    exit 0
fi

# Extract version number (remove 'cua-v' prefix)
# NOTE(review): grep/cut JSON parsing is fragile if GitHub changes the
# response formatting; jq would be sturdier if it can be assumed present.
TAG_NAME=$(echo "$LATEST_RELEASE" | grep 'tag_name' | cut -d '"' -f 4)
VERSION=${TAG_NAME#cua-v}

# Find the binary URL in the release assets
BINARY_URL=$(echo "$LATEST_RELEASE" | grep -o 'https://.*/download/[^"]*/'${BINARY_NAME}'"' | head -1)
BINARY_URL="${BINARY_URL%\"}"
printf "\033[90mBINARY_URL: %s\033[0m\n" "$BINARY_URL"

if [ -z "$BINARY_URL" ]; then
    echo "⚠️ Could not find ${BINARY_NAME} in release assets, falling back to Bun installation"
    install_with_bun
    exit 0
fi

# Create ~/.cua/bin directory if it doesn't exist
INSTALL_DIR="$HOME/.cua/bin"
mkdir -p "$INSTALL_DIR"

# Download the binary
echo "📥 Downloading CUA CLI $VERSION for ${OS}-${ARCH}..."
echo "📍 Downloading from: $BINARY_URL"

# Download with progress bar and proper error handling
if ! curl -L --progress-bar --fail "$BINARY_URL" -o "$INSTALL_DIR/cua"; then
    echo "❌ Failed to download pre-built binary from $BINARY_URL"
    echo "⚠️ Falling back to Bun installation"
    install_with_bun
    exit 0
fi

# Verify the downloaded file exists and has content
if [ ! -f "$INSTALL_DIR/cua" ] || [ ! -s "$INSTALL_DIR/cua" ]; then
    echo "❌ Downloaded file is missing or empty"
    echo "⚠️ Falling back to Bun installation"
    rm -f "$INSTALL_DIR/cua"
    install_with_bun
    exit 0
fi

# Check if the downloaded file looks like a binary (not HTML error page)
if file "$INSTALL_DIR/cua" | grep -q "HTML\|text"; then
    echo "❌ Downloaded file appears to be corrupted (HTML/text instead of binary)"
    echo "⚠️ Falling back to Bun installation"
    rm -f "$INSTALL_DIR/cua"
    install_with_bun
    exit 0
fi

# Make the binary executable
chmod +x "$INSTALL_DIR/cua"

# Write version file
echo "$VERSION" > "$INSTALL_DIR/.version"

# Add ~/.cua/bin to PATH if not already in PATH
if [[ ":$PATH:" != *":$INSTALL_DIR:"* ]]; then
    # Add to .bashrc, .zshrc, or .profile (whichever exist)
    if [ -f "$HOME/.bashrc" ]; then
        echo "export PATH=\"$INSTALL_DIR:\$PATH\"" >> "$HOME/.bashrc"
        echo "Added $INSTALL_DIR to PATH in ~/.bashrc"
    fi

    if [ -f "$HOME/.zshrc" ]; then
        echo "export PATH=\"$INSTALL_DIR:\$PATH\"" >> "$HOME/.zshrc"
        echo "Added $INSTALL_DIR to PATH in ~/.zshrc"
    fi

    if [ -f "$HOME/.profile" ] && [ ! -f "$HOME/.bashrc" ] && [ ! -f "$HOME/.zshrc" ]; then
        echo "export PATH=\"$INSTALL_DIR:\$PATH\"" >> "$HOME/.profile"
        echo "Added $INSTALL_DIR to PATH in ~/.profile"
    fi

    # Add to current session
    export PATH="$INSTALL_DIR:$PATH"
fi

# Verify installation
if command -v cua &> /dev/null; then
    # Determine which config file was updated
    # NOTE(review): this preference order may not match the file actually
    # appended above when both .bashrc and .zshrc exist — confirm intended.
    config_file="$HOME/.bashrc"
    if [ -f "$HOME/.zshrc" ]; then
        config_file="$HOME/.zshrc"
    elif [ -f "$HOME/.profile" ]; then
        config_file="$HOME/.profile"
    fi

    print_success "$(which cua)" "$VERSION" "$config_file"
    exit 0
else
    echo "❌ Installation failed. Please try installing manually:"
    echo "   curl -fsSL https://cua.ai/install.sh | sh"
    exit 1
fi
211 |
```
--------------------------------------------------------------------------------
/libs/qemu-docker/windows/src/vm/setup/setup-cua-server.ps1:
--------------------------------------------------------------------------------
```
# Setup CUA Computer Server on Windows 11
# Creates a scheduled task to run computer server in background
#
# Flow: ensure Python 3.12 (via Chocolatey) -> create a venv under
# %USERPROFILE%\.cua-server -> pip-install cua-computer-server -> open TCP 5000
# -> generate a self-restarting start script plus a hidden VBScript launcher
# -> register a logon-triggered scheduled task that runs the launcher.

Set-StrictMode -Version Latest
# Continue on non-terminating errors: every step below logs its own failures
# so a single broken step does not abort the whole provisioning run.
$ErrorActionPreference = 'Continue'

# Import shared utilities (provides Write-Log and Resolve-ChocoPath)
$scriptFolder = "C:\OEM"
Import-Module (Join-Path $scriptFolder -ChildPath "setup-utils.psm1")

# --- Logging ---
# One log file per run, keyed by timestamp + PID so repeated runs don't clobber
# each other.
$LogDir = "C:\Windows\Temp"
if (!(Test-Path $LogDir)) { New-Item -ItemType Directory -Force -Path $LogDir | Out-Null }
$RunId = (Get-Date -Format 'yyyyMMdd_HHmmss') + "_" + $PID
$script:LogFile = Join-Path $LogDir ("setup_cua_server_" + $RunId + ".log")

Write-Log -LogFile $script:LogFile -Message "=== Installing CUA Computer Server ==="

# Ensure Chocolatey and Python 3.12 are present.
# Both failure modes are tolerated: the venv creation below uses the `py`
# launcher, which may already resolve a suitable Python.
try {
    $ChocoExe = Resolve-ChocoPath
    if ($ChocoExe) {
        Write-Log -LogFile $script:LogFile -Message "Installing Python 3.12 via Chocolatey"
        try {
            & $ChocoExe install -y python312 | Out-Null
        } catch {
            Write-Log -LogFile $script:LogFile -Message "Python 3.12 install warning: $($_.Exception.Message)"
        }
    } else {
        Write-Log -LogFile $script:LogFile -Message "Chocolatey not available; skipping python312 install"
    }
} catch {
    Write-Log -LogFile $script:LogFile -Message "Chocolatey bootstrap warning: $($_.Exception.Message)"
}

# Create venv (idempotent: an existing venv with a python.exe is reused)
$HomeDir = $env:USERPROFILE
$CuaDir = Join-Path $HomeDir '.cua-server'
$VenvDir = Join-Path $CuaDir 'venv'
New-Item -ItemType Directory -Force -Path $CuaDir | Out-Null

Write-Log -LogFile $script:LogFile -Message "Creating Python virtual environment at $VenvDir"
$ExistingVenvPython = Join-Path $VenvDir 'Scripts\python.exe'
if (Test-Path -LiteralPath $ExistingVenvPython) {
    Write-Log -LogFile $script:LogFile -Message "Existing venv detected; skipping creation"
} else {
    try {
        & py -m venv $VenvDir
        Write-Log -LogFile $script:LogFile -Message "Virtual environment created successfully"
    } catch {
        # A missing venv is fatal: everything below depends on it.
        Write-Log -LogFile $script:LogFile -Message "venv creation error: $($_.Exception.Message)"
        throw
    }
}

$PyExe = Join-Path $VenvDir 'Scripts\python.exe'
$PipExe = Join-Path $VenvDir 'Scripts\pip.exe'
$ActivateScript = Join-Path $VenvDir 'Scripts\Activate.ps1'

# Activation only affects this session; the absolute $PipExe/$PyExe paths are
# what actually pin the commands to the venv.
Write-Log -LogFile $script:LogFile -Message "Activating virtual environment"
& $ActivateScript

Write-Log -LogFile $script:LogFile -Message "Upgrading pip, setuptools, and wheel"
try {
    & $PipExe install --upgrade pip setuptools wheel 2>&1 | Tee-Object -FilePath $script:LogFile -Append | Out-Null
} catch {
    Write-Log -LogFile $script:LogFile -Message "pip bootstrap warning: $($_.Exception.Message)"
}

Write-Log -LogFile $script:LogFile -Message "Installing cua-computer-server"
try {
    & $PipExe install --upgrade cua-computer-server 2>&1 | Tee-Object -FilePath $script:LogFile -Append | Out-Null
    Write-Log -LogFile $script:LogFile -Message "cua-computer-server installed successfully"
} catch {
    # Fatal: without the server package the scheduled task would be useless.
    Write-Log -LogFile $script:LogFile -Message "Server install error: $($_.Exception.Message)"
    throw
}

# Open firewall for port 5000 (the port the server listens on below)
Write-Log -LogFile $script:LogFile -Message "Opening firewall for port 5000"
try {
    netsh advfirewall firewall add rule name="CUA Computer Server 5000" dir=in action=allow protocol=TCP localport=5000 | Out-Null
    Write-Log -LogFile $script:LogFile -Message "Firewall rule added successfully"
} catch {
    Write-Log -LogFile $script:LogFile -Message "Firewall rule warning: $($_.Exception.Message)"
}

# Create start script with auto-restart.
# In this double-quoted heredoc, bare $vars ($CuaDir, $ActivateScript, ...)
# are expanded NOW (baked into the generated file), while backtick-escaped
# `$vars survive as literal $ and are evaluated when the generated script runs.
# NOTE(review): the generated loop reads $LASTEXITCODE after Start-Server
# rather than the function's return value — confirm this reflects the server
# process's exit code as intended.
$StartScript = Join-Path $CuaDir 'start-server.ps1'
$StartScriptContent = @"
param()

`$env:PYTHONUNBUFFERED = '1'

`$LogFile = Join-Path '$CuaDir' 'server.log'
`$ActivateScript = '$ActivateScript'
`$PipExe = '$PipExe'
`$Python = '$PyExe'

function Start-Server {
    Write-Output "Activating virtual environment and updating cua-computer-server..." | Out-File -FilePath `$LogFile -Append
    & `$ActivateScript
    & `$PipExe install --upgrade cua-computer-server 2>&1 | Out-File -FilePath `$LogFile -Append

    Write-Output "Starting CUA Computer Server on port 5000..." | Out-File -FilePath `$LogFile -Append
    & `$Python -m computer_server --port 5000 2>&1 | Out-File -FilePath `$LogFile -Append
    return `$LASTEXITCODE
}

while (`$true) {
    Start-Server
    `$code = `$LASTEXITCODE
    Write-Output "Server exited with code: `$code. Restarting in 5s..." | Out-File -FilePath `$LogFile -Append
    Start-Sleep -Seconds 5
}
"@

Set-Content -Path $StartScript -Value $StartScriptContent -Encoding UTF8
Write-Log -LogFile $script:LogFile -Message "Start script created at $StartScript"

# Create VBScript wrapper to launch PowerShell hidden: objShell.Run's second
# argument (0) is the window style, so no console window flashes at logon.
$VbsWrapper = Join-Path $CuaDir 'start-server-hidden.vbs'
$VbsContent = @"
Set objShell = CreateObject("WScript.Shell")
objShell.Run "powershell.exe -NoProfile -ExecutionPolicy Bypass -File ""$StartScript""", 0, False
"@
Set-Content -Path $VbsWrapper -Value $VbsContent -Encoding ASCII
Write-Log -LogFile $script:LogFile -Message "VBScript wrapper created at $VbsWrapper"

# Create scheduled task to run at logon
try {
    $TaskName = 'CUA-Computer-Server'
    $Username = 'Docker' # Default user for Dockur Windows

    # Remove existing task if present (re-registering over a live task can
    # fail, so unregister first to keep this script re-runnable)
    $existingTask = Get-ScheduledTask -TaskName $TaskName -ErrorAction SilentlyContinue
    if ($existingTask) {
        Write-Log -LogFile $script:LogFile -Message "Removing existing scheduled task: $TaskName"
        Unregister-ScheduledTask -TaskName $TaskName -Confirm:$false
    }

    # Create action to run VBScript wrapper (hidden)
    $Action = New-ScheduledTaskAction -Execute 'wscript.exe' -Argument "`"$VbsWrapper`""

    # Trigger: At logon of user
    $UserId = "$env:COMPUTERNAME\$Username"
    $Trigger = New-ScheduledTaskTrigger -AtLogOn -User $UserId

    # Principal: Run in background without window (S4U = Service For User)
    $Principal = New-ScheduledTaskPrincipal -UserId $UserId -LogonType S4U -RunLevel Highest

    # Task settings - hide window, and restart aggressively so the server
    # effectively runs forever (the start script already self-restarts too)
    $Settings = New-ScheduledTaskSettingsSet `
        -AllowStartIfOnBatteries `
        -DontStopIfGoingOnBatteries `
        -StartWhenAvailable `
        -RestartCount 999 `
        -RestartInterval (New-TimeSpan -Minutes 1) `
        -ExecutionTimeLimit (New-TimeSpan -Days 365) `
        -Hidden

    # Register the task
    Write-Log -LogFile $script:LogFile -Message "Registering scheduled task '$TaskName' to run as $Username at logon (hidden)"
    Register-ScheduledTask `
        -TaskName $TaskName `
        -Action $Action `
        -Trigger $Trigger `
        -Principal $Principal `
        -Settings $Settings `
        -Force | Out-Null

    Write-Log -LogFile $script:LogFile -Message "Scheduled task '$TaskName' registered successfully (runs hidden in background)"

} catch {
    Write-Log -LogFile $script:LogFile -Message "Scheduled task setup error: $($_.Exception.Message)"
    throw
}

Write-Log -LogFile $script:LogFile -Message "=== CUA Computer Server setup completed ==="
exit 0

```
--------------------------------------------------------------------------------
/blog/introducing-cua-cli.md:
--------------------------------------------------------------------------------
```markdown
1 | # Introducing the Cua CLI: Manage Cloud Sandboxes from Your Terminal
2 |
3 | If you've been using our Cloud Sandboxes, you've probably been managing them through the web dashboard - clicking through forms to create instances, copying credentials, manually starting and stopping sandboxes. It works, but it's not exactly built for power users like yourself.
4 |
5 | Today we're launching the **Cua CLI**: a command-line interface that brings the full power of our Cloud Sandbox platform to your terminal. Create, manage, and connect to Linux, Windows, or macOS sandboxes in seconds—all from a single command.
6 |
7 | 
8 |
9 | ## What You Can Do
10 |
11 | The Cua CLI handles everything you need to work with Cloud Sandboxes:
12 |
13 | **Authentication**
14 |
15 | - Browser-based OAuth login with automatic credential storage
16 | - Direct API key support for CI/CD pipelines
17 | - Export credentials to `.env` files for SDK integration
18 |
19 | **Sandbox Management**
20 |
21 | - Create sandboxes with your choice of OS, size, and region
22 | - List all your sandboxes with status and connection details
23 | - Start, stop, restart, and delete sandboxes
24 | - Open remote desktop (VNC) connections directly in your browser
25 |
26 | **Two Command Styles**
27 | The CLI supports both flat and grouped command structures—use whichever fits your workflow:
28 |
29 | ```bash
30 | # Grouped style (explicit & clear)
31 | cua sb ls
32 | cua sb create --os linux --size small --region north-america
33 | cua sb vnc my-sandbox
34 |
35 | # Flat style (quick & concise)
36 | cua ls
37 | cua create --os linux --size small --region north-america
38 | cua vnc my-sandbox
39 | ```
40 |
41 | Both styles work identically. The CLI shows grouped commands in help by default, but all flat commands remain available for backwards compatibility.
42 |
43 | ## Installation
44 |
45 | One command installs everything (includes Bun runtime + Cua CLI):
46 |
47 | ```bash
48 | # macOS/Linux
49 | curl -LsSf https://cua.ai/cli/install.sh | sh
50 |
51 | # Windows
52 | powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
53 | ```
54 |
55 | Or install via npm if you prefer:
56 |
57 | ```bash
58 | npm install -g @trycua/cli
59 | ```
60 |
61 | ## Getting Started
62 |
63 | Authenticate with your Cua account:
64 |
65 | ```bash
66 | # Interactive browser login (recommended)
67 | cua auth login
68 |
69 | # Or provide your API key directly
70 | cua auth login --api-key sk-your-api-key-here
71 | ```
72 |
73 | Create a sandbox:
74 |
75 | ```bash
76 | cua sb create --os linux --size small --region north-america
77 | # Sandbox created and ready: my-sandbox-abc123
78 | # Password: secure-password-here
79 | # Host: my-sandbox-abc123.sandbox.cua.ai
80 | ```
81 |
82 | List your sandboxes:
83 |
84 | ```bash
85 | cua sb list
86 | # NAME STATUS HOST
87 | # my-sandbox-abc123 running my-sandbox-abc123.sandbox.cua.ai
88 | # test-windows-456 stopped test-windows-456.sandbox.cua.ai
89 | ```
90 |
91 | Open a remote desktop:
92 |
93 | ```bash
94 | cua sb vnc my-sandbox-abc123
95 | # Opens your browser to the VNC interface with password pre-filled
96 | ```
97 |
98 | ## SDK Integration
99 |
100 | Export your API key to a `.env` file for seamless SDK integration:
101 |
102 | ```bash
103 | cd my-project
104 | cua auth env
105 | # Wrote /path/to/my-project/.env
106 | ```
107 |
108 | Then use it with our Python or TypeScript SDKs:
109 |
110 | ```python
111 | from computer import Computer
112 |
113 | computer = Computer(
114 | os_type="linux",
115 | provider_type="cloud",
116 | name="my-sandbox-abc123",
117 | api_key="your-api-key" # Or load from .env
118 | )
119 |
120 | await computer.run()
121 | ```
122 |
123 | ## Sandbox Sizes & Regions
124 |
125 | Create sandboxes in the size and region that fits your needs:
126 |
127 | **Sizes:**
128 |
129 | - `small` - 2 cores, 8 GB RAM, 128 GB SSD
130 | - `medium` - 4 cores, 16 GB RAM, 128 GB SSD
131 | - `large` - 8 cores, 32 GB RAM, 256 GB SSD
132 |
133 | **Regions:**
134 |
135 | - `north-america`
136 | - `europe`
137 | - `asia-pacific`
138 | - `south-america`
139 |
140 | **OS Options:**
141 |
142 | - `linux` - Ubuntu with XFCE desktop
143 | - `windows` - Windows 11 with Edge and Python
144 | - `macos` - macOS (preview access)
145 |
146 | ## Example Workflows
147 |
148 | **Quick Testing Environment**
149 |
150 | ```bash
151 | # Spin up a sandbox, test something, tear it down
152 | cua sb create --os linux --size small --region north-america
153 | # ... do your testing ...
154 | cua sb delete my-sandbox-abc123
155 | ```
156 |
157 | **Persistent Development Sandbox**
158 |
159 | ```bash
160 | # Create a sandbox for long-term use
161 | cua sb create --os linux --size medium --region north-america
162 |
163 | # Stop it when not in use (data persists)
164 | cua sb stop my-sandbox-abc123
165 |
166 | # Start it again when needed
167 | cua sb start my-sandbox-abc123
168 | ```
169 |
170 | **CI/CD Integration**
171 |
172 | ```bash
173 | # Provision sandboxes in your pipeline
174 | export CUA_API_KEY="sk-your-api-key"
175 | cua auth login --api-key "$CUA_API_KEY"
176 | cua sb create --os linux --size large --region north-america
177 |
178 | # Run your tests with the Cua Computer SDK
179 | python run_tests.py
180 |
181 | # Clean up
182 | cua sb delete my-test-sandbox
183 | ```
184 |
185 | ## Command Aliases
186 |
187 | We've added aliases for common commands to speed up your workflow:
188 |
189 | ```bash
190 | # List aliases
191 | cua list # or: cua ls, cua ps, cua sb list
192 |
193 | # VNC aliases
194 | cua vnc # or: cua open, cua sb vnc
195 | ```
196 |
197 | ## FAQs
198 |
199 | <details>
200 | <summary><strong>Can I use this in scripts and CI/CD?</strong></summary>
201 |
202 | Yes. All commands support non-interactive mode with `--api-key` flags, and the CLI exits with proper status codes for scripting. The flat command style (`cua list`, `cua create`) is particularly useful for quick scripts.
203 |
204 | </details>
205 |
206 | <details>
207 | <summary><strong>Where are my credentials stored?</strong></summary>
208 |
209 | API keys are stored in `~/.cua/cli.sqlite` using a local SQLite database. They never leave your machine. Use `cua auth logout` to clear stored credentials.
210 |
211 | </details>
212 |
213 | <details>
214 | <summary><strong>What happens to passwords in the output?</strong></summary>
215 |
216 | Passwords are hidden by default in `cua list` for security. Use `cua list --show-passwords` to display them when needed.
217 |
218 | </details>
219 |
220 | <details>
221 | <summary><strong>Can I manage sandboxes created through the web dashboard?</strong></summary>
222 |
223 | Yes. The CLI and dashboard share the same API. Any sandbox you create in the dashboard will show up in `cua list`, and vice versa.
224 |
225 | </details>
226 |
227 | <details>
228 | <summary><strong>How do I update the CLI?</strong></summary>
229 |
230 | If you installed via script:
231 |
232 | ```bash
233 | curl -LsSf https://cua.ai/cli/install.sh | sh
234 | ```
235 |
236 | If you installed via npm:
237 |
238 | ```bash
239 | npm install -g @trycua/cli@latest
240 | ```
241 |
242 | </details>
243 |
244 | ## What's Next
245 |
246 | We're actively iterating based on feedback. Planned features include:
247 |
248 | - SSH key management for secure sandbox access
249 | - Template-based sandbox creation
250 | - Batch operations (start/stop multiple sandboxes)
251 | - Custom sandbox configurations
252 | - Snapshot management
253 |
254 | If there's a feature you need, let us know in [Discord](https://discord.gg/cua-ai).
255 |
256 | ## Need Help?
257 |
258 | - **Documentation**: [https://cua.ai/docs/cli-playbook/commands](https://cua.ai/docs/cli-playbook/commands)
259 | - **Installation Guide**: [https://cua.ai/docs/cli-playbook](https://cua.ai/docs/cli-playbook)
260 | - **Discord Community**: [https://discord.gg/cua-ai](https://discord.gg/cua-ai)
261 |
262 | ---
263 |
264 | Get started at [cua.ai](https://cua.ai) or check out the [quickstart guide](https://cua.ai/docs/get-started/quickstart).
265 |
```
--------------------------------------------------------------------------------
/libs/python/computer/computer/providers/factory.py:
--------------------------------------------------------------------------------
```python
1 | """Factory for creating VM providers."""
2 |
3 | import logging
4 | from typing import Any, Dict, Optional, Type, Union
5 |
6 | from .base import BaseVMProvider, VMProviderType
7 |
8 | logger = logging.getLogger(__name__)
9 |
10 |
class VMProviderFactory:
    """Factory for creating VM providers based on provider type."""

    @staticmethod
    def create_provider(
        provider_type: Union[str, VMProviderType],
        provider_port: int = 7777,
        host: str = "localhost",
        bin_path: Optional[str] = None,
        storage: Optional[str] = None,
        shared_path: Optional[str] = None,
        image: Optional[str] = None,
        verbose: bool = False,
        ephemeral: bool = False,
        noVNC_port: Optional[int] = None,
        api_port: Optional[int] = None,
        **kwargs,
    ) -> BaseVMProvider:
        """Create a VM provider of the specified type.

        Args:
            provider_type: Type of VM provider to create
            provider_port: Port for the provider's API server
            host: Hostname for the API server
            bin_path: Path to provider binary if needed
            storage: Path for persistent VM storage
            shared_path: Path for shared folder between host and VM
            image: VM image to use (for Lumier provider)
            verbose: Enable verbose logging
            ephemeral: Use ephemeral (temporary) storage
            noVNC_port: Specific port for noVNC interface (for Lumier and Docker provider)
            api_port: Specific port for Computer API server (for Docker provider)

        Returns:
            An instance of the requested VM provider

        Raises:
            ImportError: If the required dependencies for the provider are not installed
            ValueError: If the provider type is not supported
        """
        # Remember exactly what the caller passed so the "unsupported" error
        # below can echo it back; otherwise an unrecognized string would be
        # coerced to VMProviderType.UNKNOWN and the original input would be
        # lost from the message.
        requested = provider_type

        # Convert string to enum if needed
        if isinstance(provider_type, str):
            try:
                provider_type = VMProviderType(provider_type.lower())
            except ValueError:
                provider_type = VMProviderType.UNKNOWN

        if provider_type == VMProviderType.LUME:
            try:
                from .lume import HAS_LUME, LumeProvider

                if not HAS_LUME:
                    raise ImportError(
                        "The pylume package is required for LumeProvider. "
                        "Please install it with 'pip install cua-computer[lume]'"
                    )
                return LumeProvider(
                    provider_port=provider_port,
                    host=host,
                    storage=storage,
                    verbose=verbose,
                    ephemeral=ephemeral,
                )
            except ImportError as e:
                logger.error(f"Failed to import LumeProvider: {e}")
                raise ImportError(
                    "The pylume package is required for LumeProvider. "
                    "Please install it with 'pip install cua-computer[lume]'"
                ) from e
        elif provider_type == VMProviderType.LUMIER:
            try:
                from .lumier import HAS_LUMIER, LumierProvider

                if not HAS_LUMIER:
                    raise ImportError(
                        "Docker is required for LumierProvider. "
                        "Please install Docker for Apple Silicon and Lume CLI before using this provider."
                    )
                return LumierProvider(
                    provider_port=provider_port,
                    host=host,
                    storage=storage,
                    shared_path=shared_path,
                    image=image or "macos-sequoia-cua:latest",
                    verbose=verbose,
                    ephemeral=ephemeral,
                    noVNC_port=noVNC_port,
                )
            except ImportError as e:
                logger.error(f"Failed to import LumierProvider: {e}")
                raise ImportError(
                    "Docker and Lume CLI are required for LumierProvider. "
                    "Please install Docker for Apple Silicon and run the Lume installer script."
                ) from e

        elif provider_type == VMProviderType.CLOUD:
            try:
                from .cloud import CloudProvider

                # Cloud provider forwards extra keyword arguments (e.g. api_key,
                # sandbox name) straight through to its constructor.
                return CloudProvider(
                    verbose=verbose,
                    **kwargs,
                )
            except ImportError as e:
                logger.error(f"Failed to import CloudProvider: {e}")
                raise ImportError(
                    "The CloudProvider is not fully implemented yet. "
                    "Please use LUME or LUMIER provider instead."
                ) from e
        elif provider_type == VMProviderType.WINSANDBOX:
            try:
                from .winsandbox import HAS_WINSANDBOX, WinSandboxProvider

                if not HAS_WINSANDBOX:
                    raise ImportError(
                        "pywinsandbox is required for WinSandboxProvider. "
                        "Please install it with 'pip install -U git+https://github.com/karkason/pywinsandbox.git'"
                    )
                return WinSandboxProvider(
                    host=host,
                    storage=storage,
                    verbose=verbose,
                    ephemeral=ephemeral,
                    **kwargs,
                )
            except ImportError as e:
                logger.error(f"Failed to import WinSandboxProvider: {e}")
                raise ImportError(
                    "pywinsandbox is required for WinSandboxProvider. "
                    "Please install it with 'pip install -U git+https://github.com/karkason/pywinsandbox.git'"
                ) from e
        elif provider_type == VMProviderType.DOCKER:
            try:
                from .docker import HAS_DOCKER, DockerProvider

                if not HAS_DOCKER:
                    raise ImportError(
                        "Docker is required for DockerProvider. "
                        "Please install Docker and ensure it is running."
                    )
                return DockerProvider(
                    host=host,
                    storage=storage,
                    shared_path=shared_path,
                    image=image or "trycua/cua-ubuntu:latest",
                    verbose=verbose,
                    ephemeral=ephemeral,
                    vnc_port=noVNC_port,
                    api_port=api_port,
                )
            except ImportError as e:
                logger.error(f"Failed to import DockerProvider: {e}")
                raise ImportError(
                    "Docker is required for DockerProvider. "
                    "Please install Docker and ensure it is running."
                ) from e
        else:
            # Report the caller's original input, not the coerced UNKNOWN enum.
            raise ValueError(f"Unsupported provider type: {requested!r}")
169 |
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/telemetry.mdx:
--------------------------------------------------------------------------------
```markdown
1 | ---
2 | title: Telemetry
3 | description: How telemetry works in Cua and how to control it
4 | ---
5 |
6 | # Telemetry
7 |
8 | Cua collects anonymized usage and error statistics. We follow [PostHog's ethical telemetry approach](https://posthog.com/blog/open-source-telemetry-ethical). To opt out, set `telemetry_enabled` to false.
9 |
10 | ## What we collect
11 |
12 | ### Enabled by default (opt-out)
13 |
14 | - System info: OS, OS version, Python version
15 | - Module initialization: When modules are imported and their versions
16 | - Performance: Agent run durations, step counts, token usage, API costs
17 | - Session tracking: Anonymous session IDs and run IDs
18 |
19 | ### Disabled by default (opt-in)
20 |
21 | **Trajectory logging** captures full conversation history:
22 |
23 | - User messages and agent responses
24 | - Computer actions and outputs
25 | - Agent reasoning traces
26 |
27 | Must be explicitly enabled.
28 |
29 | ### We don't collect
30 |
31 | - Personal information or user identifiers
32 | - API keys or credentials
33 | - File contents or application data
34 | - Files being accessed
35 | - Screenshots or screen contents (unless trajectory logging is enabled)
36 | - Text being typed, user inputs, model outputs, computer outputs, or tool call outputs (unless trajectory logging is enabled)
37 |
38 | ## How to disable
39 |
40 | ### Environment variable (global)
41 |
42 | Set `CUA_TELEMETRY_ENABLED` to a falsy value (`0`, `false`, `no`, or `off`):
43 |
44 | ```bash
45 | export CUA_TELEMETRY_ENABLED=false
46 | ```
47 |
48 | Or in Python:
49 |
50 | ```python
51 | import os
52 | os.environ["CUA_TELEMETRY_ENABLED"] = "false"
53 | ```
54 |
55 | <Callout type="info">
56 | **Deprecated environment variables:** The environment variables `CUA_TELEMETRY` and
57 | `CUA_TELEMETRY_DISABLED` are deprecated and no longer have any effect. Use `CUA_TELEMETRY_ENABLED`
58 | instead.
59 | </Callout>
60 |
61 | ### Per instance
62 |
63 | **Computer SDK:**
64 |
65 | ```python
66 | from computer import Computer
67 |
68 | computer = Computer(telemetry_enabled=False)
69 | ```
70 |
71 | **Agent SDK:**
72 |
73 | ```python
74 | from agent import ComputerAgent
75 | import os
76 |
77 | # Basic telemetry - performance metrics only (opt-out, enabled by default)
78 | agent = ComputerAgent(
79 | model="claude-sonnet-4-5-20250929",
80 | telemetry_enabled=True # Default is True
81 | )
82 |
83 | # Enable telemetry with full conversation trajectory logging (opt-in)
84 | agent = ComputerAgent(
85 | model="claude-sonnet-4-5-20250929",
86 | telemetry_enabled={
87 | "log_trajectory": True # Logs full conversation items
88 | }
89 | )
90 |
91 | # Disable completely
92 | agent = ComputerAgent(
93 | model="claude-sonnet-4-5-20250929",
94 | telemetry_enabled=False
95 | )
96 |
97 | # Enable trajectory logging (opt-in)
98 | agent = ComputerAgent(
99 | model="claude-sonnet-4-5-20250929",
100 | telemetry_enabled={"log_trajectory": True}
101 | )
102 | ```
103 |
104 | Check status:
105 |
106 | ```python
107 | print(computer.telemetry_enabled) # True or False
108 | print(agent.telemetry_enabled) # True, False, or dict
109 | ```
110 |
111 | Telemetry settings are configured at initialization and can't be changed afterward.
112 |
113 | ## Events collected
114 |
115 | ### Computer SDK
116 |
117 | | Event Name | Data Collected | Trigger Notes |
118 | | ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------- |
119 | | **computer_initialized** | • `os`: Operating system (e.g., 'windows', 'darwin', 'linux')<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when a Computer instance is created |
120 | | **module_init** | • `module`: "computer"<br />• `version`: Package version<br />• `python_version`: Full Python version string | Triggered once when the computer package is imported for the first time |
121 |
122 | ### Agent SDK
123 |
124 | | Event Name | Data Collected | Trigger Notes |
125 | | ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------- |
126 | | **module_init** | • `module`: "agent"<br />• `version`: Package version<br />• `python_version`: Full Python version string | Triggered once when the agent package is imported for the first time |
127 | | **agent_session_start** | • `session_id`: Unique UUID for this agent instance<br />• `agent_type`: Class name (e.g., "ComputerAgent")<br />• `model`: Model name (e.g., "claude-sonnet-4-5")<br />• `os`: Operating system<br />• `os_version`: OS version<br />• `python_version`: Python version | Triggered when TelemetryCallback is initialized (agent instantiation) |
128 | | **agent_run_start** | • `session_id`: Agent session UUID<br />• `run_id`: Unique UUID for this run<br />• `start_time`: Unix timestamp<br />• `input_context_size`: Character count of input messages<br />• `num_existing_messages`: Count of existing messages<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the start of each agent.run() call |
129 | | **agent_run_end** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `end_time`: Unix timestamp<br />• `duration_seconds`: Total run duration<br />• `num_steps`: Total steps taken in this run<br />• `total_usage`: Accumulated token usage and costs<br />• `uploaded_trajectory`: Full conversation items (opt-in) | Triggered at the end of each agent.run() call |
130 | | **agent_step** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `step`: Step number (incremental)<br />• `timestamp`: Unix timestamp<br />• `duration_seconds`: Duration of previous step | Triggered on each agent response/step during a run |
131 | | **agent_usage** | • `session_id`: Agent session UUID<br />• `run_id`: Run UUID<br />• `step`: Current step number<br />• `prompt_tokens`: Tokens in prompt<br />• `completion_tokens`: Tokens in response<br />• `total_tokens`: Total tokens used<br />• `response_cost`: Cost of this API call | Triggered whenever usage information is received from LLM API |
132 |
133 | ## Questions
134 |
135 | Questions about telemetry? Open an issue on our [GitHub repository](https://github.com/trycua/cua).
136 |
```
--------------------------------------------------------------------------------
/libs/python/som/som/util/utils.py:
--------------------------------------------------------------------------------
```python
1 | import logging
2 | import signal
3 | import time
4 | from contextlib import contextmanager
5 | from typing import Any, List, Optional, Sequence, Tuple, Union, cast
6 |
7 | import cv2
8 | import easyocr
9 | import matplotlib.pyplot as plt
10 | import numpy as np
11 | from PIL import Image
12 |
13 | logger = logging.getLogger(__name__)
14 |
15 |
class TimeoutException(Exception):
    """Raised when an OCR operation exceeds its allotted time."""

    pass


@contextmanager
def timeout(seconds):
    """Abort the enclosed block with TimeoutException after ``seconds``.

    Implemented with SIGALRM, so this is Unix-only and must be used from the
    main thread. The previous SIGALRM handler is always restored on exit.
    """

    def _on_alarm(signum, frame):
        logger.warning(f"OCR process timed out after {seconds} seconds")
        raise TimeoutException("OCR processing timed out")

    # Install our handler, remembering the one we displaced.
    previous_handler = signal.signal(signal.SIGALRM, _on_alarm)
    signal.alarm(seconds)

    try:
        yield
    finally:
        # Cancel any pending alarm, then put the original handler back.
        signal.alarm(0)
        signal.signal(signal.SIGALRM, previous_handler)
35 |
36 |
# Module-level OCR engine: built once at import time and shared by all callers
# of check_ocr_box, since constructing an easyocr.Reader is expensive.
# NOTE(review): with download_enabled=True this may fetch detection/recognition
# models on first import if they are not cached — confirm import-time network
# access is acceptable for consumers of this module.
logger.info("Initializing EasyOCR with optimized settings...")
reader = easyocr.Reader(
    ["en"],  # English-only recognition
    gpu=True,  # Use GPU if available
    model_storage_directory=None,  # Use default directory
    download_enabled=True,
    detector=True,  # Enable text detection
    recognizer=True,  # Enable text recognition
    verbose=False,  # Disable verbose output
    quantize=True,  # Enable quantization for faster inference
    cudnn_benchmark=True,  # Enable cuDNN benchmarking
)
logger.info("EasyOCR initialization complete")
51 |
52 |
def check_ocr_box(
    image_source: Union[str, Image.Image],
    display_img=True,
    output_bb_format="xywh",
    goal_filtering=None,
    easyocr_args=None,
    use_paddleocr=False,
) -> Tuple[Tuple[List[str], List[Tuple[float, float, float, float]]], Optional[Any]]:
    """Check OCR box using EasyOCR with optimized settings.

    Args:
        image_source: Either a file path or PIL Image
        display_img: Whether to display the annotated image
        output_bb_format: Format for bounding boxes ('xywh' or 'xyxy')
        goal_filtering: Optional filtering of results (returned unchanged)
        easyocr_args: Optional overrides merged over the default readtext arguments
        use_paddleocr: Ignored (kept for backward compatibility)

    Returns:
        Tuple containing:
        - Tuple of (text_list, bounding_boxes)
        - goal_filtering value
    """
    logger.info("Starting OCR processing...")
    start_time = time.time()

    if isinstance(image_source, str):
        logger.info(f"Loading image from path: {image_source}")
        image_source = Image.open(image_source)
    if image_source.mode == "RGBA":
        logger.info("Converting RGBA image to RGB")
        image_source = image_source.convert("RGB")
    image_np = np.array(image_source)
    w, h = image_source.size
    logger.info(f"Image size: {w}x{h}")

    # Default EasyOCR arguments optimized for speed
    default_args = {
        "paragraph": False,  # Disable paragraph detection
        "text_threshold": 0.5,  # Confidence threshold
        "link_threshold": 0.4,  # Text link threshold
        "canvas_size": 2560,  # Max image size
        "mag_ratio": 1.0,  # Magnification ratio
        "slope_ths": 0.1,  # Slope threshold
        "ycenter_ths": 0.5,  # Y-center threshold
        "height_ths": 0.5,  # Height threshold
        "width_ths": 0.5,  # Width threshold
        "add_margin": 0.1,  # Margin around text
        "min_size": 20,  # Minimum text size
    }

    # Update with user-provided arguments
    if easyocr_args:
        logger.info(f"Using custom EasyOCR arguments: {easyocr_args}")
        default_args.update(easyocr_args)

    try:
        # Use EasyOCR with timeout
        logger.info("Starting EasyOCR detection with 5 second timeout...")
        with timeout(5):  # 5 second timeout
            # EasyOCR's readtext returns a list of tuples, where each tuple is (bbox, text, confidence)
            raw_result = reader.readtext(image_np, **default_args)
            result = cast(Sequence[Tuple[List[Tuple[float, float]], str, float]], raw_result)
            coord = [item[0] for item in result]  # item[0] is the bbox coordinates
            text = [item[1] for item in result]  # item[1] is the text content
            logger.info(f"OCR completed successfully. Found {len(text)} text regions")
            logger.info(f"Detected text: {text}")

    except TimeoutException:
        logger.error("OCR processing timed out after 5 seconds")
        coord = []
        text = []
    except Exception as e:
        logger.error(f"OCR processing failed with error: {str(e)}")
        coord = []
        text = []

    processing_time = time.time() - start_time
    logger.info(f"Total OCR processing time: {processing_time:.2f} seconds")

    if display_img:
        logger.info("Creating visualization of OCR results...")
        opencv_img = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
        bb = []
        for item in coord:
            x, y, a, b = get_xywh(item)
            bb.append((x, y, a, b))
            # get_xywh may return floats; cv2.rectangle needs integer pixel coords
            x_int, y_int = int(x), int(y)
            a_int, b_int = int(a), int(b)
            cv2.rectangle(
                opencv_img, (x_int, y_int), (x_int + a_int, y_int + b_int), (0, 255, 0), 2
            )
        plt.imshow(cv2.cvtColor(opencv_img, cv2.COLOR_BGR2RGB))
    elif output_bb_format == "xywh":
        bb = [get_xywh(item) for item in coord]
    elif output_bb_format == "xyxy":
        bb = [get_xyxy(item) for item in coord]
    else:
        # BUG FIX: an unrecognized output_bb_format previously left `bb` unbound,
        # raising NameError at the cast below; fall back to no boxes with a warning.
        logger.warning(f"Unknown output_bb_format: {output_bb_format!r}; returning no boxes")
        bb = []

    # Cast the bounding boxes to the expected type
    bb = cast(List[Tuple[float, float, float, float]], bb)

    logger.info("OCR processing complete")
    return (text, bb), goal_filtering
162 |
163 |
def get_xywh(box):
    """
    Convert a bounding box to xywh format (x, y, width, height).

    Args:
        box: Bounding box coordinates (various formats supported)

    Returns:
        Tuple of (x, y, width, height)
    """
    n = len(box)
    if n == 4 and isinstance(box[0], (int, float)) and isinstance(box[2], (int, float)):
        # Flat 4-number box. NOTE(review): a smaller third/fourth value is taken
        # as a sign of xyxy input; for such input this yields negative
        # width/height — confirm this heuristic matches callers' expectations.
        left, top, third, fourth = box
        if third < left or fourth < top:
            return left, top, third - left, fourth - top
        return box
    if n == 2:
        # Corner-pair format [[x1, y1], [x2, y2]] emitted by some OCR engines
        (ax, ay), (bx, by) = box
        return ax, ay, bx - ax, by - ay

    # Fallback: treat the input as a polygon (list of points) and return its
    # axis-aligned bounding box.
    xs = [pt[0] for pt in box]
    ys = [pt[1] for pt in box]
    left, top = min(xs), min(ys)
    return left, top, max(xs) - left, max(ys) - top
196 |
197 |
def get_xyxy(box):
    """
    Convert a bounding box to xyxy format (x1, y1, x2, y2).

    Args:
        box: Bounding box coordinates (various formats supported)

    Returns:
        Tuple of (x1, y1, x2, y2)
    """
    # Normalize through xywh, then add width/height back onto the origin corner
    left, top, width, height = get_xywh(box)
    return left, top, left + width, top + height
211 |
```
--------------------------------------------------------------------------------
/libs/python/agent/benchmarks/ss-v2.py:
--------------------------------------------------------------------------------
```python
1 | #!/usr/bin/env python3
2 | """
3 | ScreenSpot-v2 Benchmark Script
4 |
5 | Evaluates models on the ScreenSpot-v2 dataset for click prediction accuracy.
6 | Supports both ComputerAgent model strings and custom model classes.
7 | """
8 |
9 | import argparse
10 | import asyncio
11 | import random
12 | import statistics
13 | import time
14 | from typing import Optional
15 |
16 | from datasets import load_dataset
17 | from tqdm import tqdm
18 | from utils import (
19 | ModelWrapper,
20 | get_available_models,
21 | get_gpu_memory,
22 | is_click_in_bbox,
23 | save_results_to_markdown,
24 | save_visualizations,
25 | )
26 |
27 |
async def evaluate_model(
    model_wrapper: ModelWrapper, samples, max_samples: Optional[int] = None
) -> dict:
    """
    Evaluate a model on any iterable of samples.

    Args:
        model_wrapper: ModelWrapper instance
        samples: Iterable of dicts with keys: image, bbox, instruction
        max_samples: Maximum number of samples to evaluate (None for all)

    Returns:
        Dictionary with evaluation results (accuracy, failure rate, timing
        statistics, VRAM statistics, and per-sample results)
    """
    print(f"\nEvaluating model: {model_wrapper.model_name}")

    # Load model
    await model_wrapper.load_model()

    # Convert to list if needed and limit samples
    if hasattr(samples, "__len__"):
        total_samples = len(samples)
        if max_samples is not None:
            total_samples = min(max_samples, total_samples)
        sample_list = list(samples)[:total_samples]
    else:
        # For iterators, take max_samples or all
        sample_list = list(samples)
        if max_samples is not None:
            sample_list = sample_list[:max_samples]
        total_samples = len(sample_list)

    correct_predictions = 0
    error_predictions = 0
    results = []

    for i, sample in enumerate(tqdm(sample_list, desc=f"Evaluating {model_wrapper.model_name}")):
        # Extract required data (only these 3 keys matter)
        image = sample["image"]
        instruction = sample["instruction"]
        bbox = sample["bbox"]  # [x1, y1, x2, y2]

        # Predict click coordinates with timing.
        # BUG FIX: a prediction error previously aborted the whole evaluation and
        # `error_predictions` stayed 0 while every result was hard-coded to
        # "failed": False; now failures are recorded per sample so the
        # failure_rate metric is meaningful.
        start_time = time.time()
        failed = False
        click_coords = None
        try:
            click_coords = await model_wrapper.predict_click(image, instruction)
        except Exception as e:
            failed = True
            error_predictions += 1
            print(f"Prediction failed for sample {i}: {e}")
        prediction_time = time.time() - start_time

        # A failed prediction can never be correct
        is_correct = (not failed) and is_click_in_bbox(click_coords, bbox)

        if is_correct:
            correct_predictions += 1

        results.append(
            {
                "sample_idx": i,
                "instruction": instruction,
                "bbox": bbox,
                "predicted_coords": click_coords,
                "is_correct": is_correct,
                "failed": failed,
                "prediction_time": prediction_time,
            }
        )

    # Unload model
    await model_wrapper.unload_model()

    # Calculate metrics
    accuracy = correct_predictions / total_samples if total_samples > 0 else 0.0
    error_rate = error_predictions / total_samples if total_samples > 0 else 0.0

    # Timing statistics over successful predictions only
    successful_times = [r["prediction_time"] for r in results if not r["failed"]]
    avg_prediction_time = sum(successful_times) / len(successful_times) if successful_times else 0.0
    median_prediction_time = statistics.median(successful_times) if successful_times else 0.0
    min_prediction_time = min(successful_times) if successful_times else 0.0
    max_prediction_time = max(successful_times) if successful_times else 0.0

    # Get VRAM statistics
    vram_stats = model_wrapper.get_vram_stats()

    return {
        "model_name": model_wrapper.model_name,
        "total_samples": total_samples,
        "correct_predictions": correct_predictions,
        "failed_predictions": error_predictions,
        "accuracy": accuracy,
        "failure_rate": error_rate,
        "avg_prediction_time": avg_prediction_time,
        "median_prediction_time": median_prediction_time,
        "min_prediction_time": min_prediction_time,
        "max_prediction_time": max_prediction_time,
        "vram_max_mb": vram_stats["max_mb"],
        "vram_avg_mb": vram_stats["avg_mb"],
        "results": results,
    }
125 |
126 |
async def main():
    """
    Entry point: load ScreenSpot-v2, evaluate every available model on a
    shuffled subset, print per-model summaries, and save reports.
    """
    # Command line arguments
    parser = argparse.ArgumentParser(description="ScreenSpot-v2 Benchmark Script")
    parser.add_argument(
        "--samples", type=int, default=500, help="Number of samples to evaluate (default: 500)"
    )
    parser.add_argument(
        "--seed", type=int, default=42, help="Random seed for shuffling (default: 42)"
    )
    args = parser.parse_args()

    # Seed the RNG so the shuffled sample order is reproducible
    random.seed(args.seed)

    # Load dataset
    print("Loading ScreenSpot-v2 dataset...")
    dataset_dict = load_dataset("lmms-lab/ScreenSpot-v2")
    train_split = dataset_dict["train"]  # type: ignore

    # Keep only the three fields the evaluator needs, converting the
    # ScreenSpot-v2 bbox format [x, y, w, h] to [x1, y1, x2, y2] on the way.
    samples = []
    for record in train_split:
        record_dict = dict(record) if hasattr(record, "keys") else record
        left, top, width, height = record_dict["bbox"]  # type: ignore
        samples.append(
            {
                "image": record_dict["image"],  # type: ignore
                "instruction": record_dict["instruction"],  # type: ignore
                "bbox": [left, top, left + width, top + height],
            }
        )
    print(f"Dataset loaded: {len(samples)} samples")

    random.shuffle(samples)
    print(f"Samples shuffled with seed {args.seed}")

    # Evaluate each available model in turn
    max_samples = args.samples
    all_results = []
    for model in get_available_models():
        wrapper = ModelWrapper(model)
        outcome = await evaluate_model(wrapper, samples, max_samples)
        all_results.append(outcome)

        # Per-model summary
        print(f"\n{outcome['model_name']} Results:")
        print(f"  Accuracy: {outcome['accuracy']*100:.2f}%")
        print(f"  Correct: {outcome['correct_predictions']}/{outcome['total_samples']}")
        print(f"  Errors: {outcome['failed_predictions']}")
        print(f"  Error Rate: {outcome['failure_rate']*100:.2f}%")
        print(f"  Avg Time: {outcome['avg_prediction_time']:.2f}s")
        print(f"  Median Time: {outcome['median_prediction_time']:.2f}s")
        print(
            f"  Time Range: {outcome['min_prediction_time']:.2f}s - {outcome['max_prediction_time']:.2f}s"
        )
        print(f"  VRAM Max: {outcome['vram_max_mb']:.1f}MB")
        print(f"  VRAM Avg: {outcome['vram_avg_mb']:.1f}MB")

        # Free-GPU-memory readout, when available
        gpu_memory = get_gpu_memory()
        if gpu_memory and gpu_memory[0] > 0:
            print(f"  GPU Free Memory: {gpu_memory[0]:.1f}MB")

    # Persist reports
    if all_results:
        save_results_to_markdown(
            all_results, "screenspot_v2_results.md", title="ScreenSpot-v2 Benchmark Results"
        )
        save_visualizations(all_results, samples)
        print("\nBenchmark completed successfully!")
    else:
        print("\nNo successful evaluations completed.")
214 |
215 |
# Script entry point: drive the async benchmark via asyncio's event loop.
if __name__ == "__main__":
    asyncio.run(main())
218 |
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/callbacks/telemetry.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Telemetry callback handler for Computer-Use Agent (cua-agent)
3 | """
4 |
5 | import platform
6 | import time
7 | import uuid
8 | from typing import Any, Dict, List, Optional, Union
9 |
10 | from core.telemetry import (
11 | is_telemetry_enabled,
12 | record_event,
13 | )
14 |
15 | from .base import AsyncCallbackHandler
16 |
# Static host details attached to every telemetry event; captured once at
# import time since they cannot change within the process lifetime.
SYSTEM_INFO = {
    "os": platform.system().lower(),
    "os_version": platform.release(),
    "python_version": platform.python_version(),
}
22 |
23 |
class TelemetryCallback(AsyncCallbackHandler):
    """
    Telemetry callback handler for Computer-Use Agent (cua-agent)

    Tracks agent usage, performance metrics, and optionally trajectory data.
    """

    def __init__(self, agent, log_trajectory: bool = False):
        """
        Initialize telemetry callback.

        Args:
            agent: The ComputerAgent instance
            log_trajectory: Whether to log full trajectory items (opt-in)
        """
        self.agent = agent
        self.log_trajectory = log_trajectory

        # The session id is fixed for the lifetime of this callback; a fresh
        # run id is minted on every on_run_start.
        self.session_id = str(uuid.uuid4())
        self.run_id = None

        # Timing state and accumulated per-run metrics
        self.run_start_time = None
        self.step_count = 0
        self.step_start_time = None
        self.total_usage = {
            "prompt_tokens": 0,
            "completion_tokens": 0,
            "total_tokens": 0,
            "response_cost": 0.0,
        }

        # Emit the session-start event only when telemetry is switched on
        if is_telemetry_enabled():
            self._record_agent_initialization()

    def _record_agent_initialization(self) -> None:
        """Record agent type/model and session initialization."""
        # The agent loop's class name identifies which loop implementation runs
        loop = getattr(self.agent, "agent_loop", None)
        agent_type = type(loop).__name__ if loop is not None else "unknown"

        record_event(
            "agent_session_start",
            {
                "session_id": self.session_id,
                "agent_type": agent_type,
                "model": getattr(self.agent, "model", "unknown"),
                **SYSTEM_INFO,
            },
        )

    async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
        """Called at the start of an agent run loop."""
        if not is_telemetry_enabled():
            return

        self.run_id = str(uuid.uuid4())
        self.run_start_time = time.time()
        self.step_count = 0

        payload = {
            "session_id": self.session_id,
            "run_id": self.run_id,
            "start_time": self.run_start_time,
            "input_context_size": self._calculate_context_size(old_items),
            "num_existing_messages": len(old_items),
        }

        # Attach the incoming conversation when trajectory logging is opted in
        if self.log_trajectory:
            uploaded = self._extract_trajectory(old_items)
            if uploaded:
                payload["uploaded_trajectory"] = uploaded

        record_event("agent_run_start", payload)

    async def on_run_end(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> None:
        """Called at the end of an agent run loop."""
        # Without a recorded start there is nothing meaningful to report
        if not is_telemetry_enabled() or not self.run_start_time:
            return

        elapsed = time.time() - self.run_start_time

        payload = {
            "session_id": self.session_id,
            "run_id": self.run_id,
            "end_time": time.time(),
            "duration_seconds": elapsed,
            "num_steps": self.step_count,
            "total_usage": self.total_usage.copy(),
        }

        # Attach the produced conversation when trajectory logging is opted in
        if self.log_trajectory:
            uploaded = self._extract_trajectory(new_items)
            if uploaded:
                payload["uploaded_trajectory"] = uploaded

        record_event("agent_run_end", payload)

    async def on_usage(self, usage: Dict[str, Any]) -> None:
        """Called when usage information is received."""
        if not is_telemetry_enabled():
            return

        # Fold this step's usage into the run totals
        for key in ("prompt_tokens", "completion_tokens", "total_tokens"):
            self.total_usage[key] += usage.get(key, 0)
        self.total_usage["response_cost"] += usage.get("response_cost", 0.0)

        # Also emit the raw usage as its own event
        record_event(
            "agent_usage",
            {
                "session_id": self.session_id,
                "run_id": self.run_id,
                "step": self.step_count,
                **usage,
            },
        )

    async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
        """Called when responses are received."""
        if not is_telemetry_enabled():
            return

        self.step_count += 1

        # Duration of the previous step, if one was in flight
        took = None
        if self.step_start_time:
            took = time.time() - self.step_start_time
        self.step_start_time = time.time()

        payload = {
            "session_id": self.session_id,
            "run_id": self.run_id,
            "step": self.step_count,
            "timestamp": self.step_start_time,
        }
        if took is not None:
            payload["duration_seconds"] = took

        record_event("agent_step", payload)

    def _calculate_context_size(self, items: List[Dict[str, Any]]) -> int:
        """Calculate approximate context size in tokens/characters."""
        total = 0

        for entry in items:
            if entry.get("type") == "message" and "content" in entry:
                content = entry["content"]
                if isinstance(content, str):
                    total += len(content)
                elif isinstance(content, list):
                    # Multi-part content: count only the text parts
                    total += sum(
                        len(part["text"])
                        for part in content
                        if isinstance(part, dict) and "text" in part
                    )
            elif "content" in entry and isinstance(entry["content"], str):
                total += len(entry["content"])

        return total

    def _extract_trajectory(self, items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Extract trajectory items that should be logged."""
        # Reasoning traces, computer actions, and computer outputs are logged
        logged_types = {"reasoning", "computer_call", "computer_call_output"}
        trajectory = []

        for entry in items:
            is_user_input = entry.get("role") == "user"
            is_model_output = (
                entry.get("type") == "message" and entry.get("role") == "assistant"
            )
            if is_user_input or is_model_output or entry.get("type") in logged_types:
                # Copy so the timestamp never mutates the caller's item
                stamped = entry.copy()
                stamped["logged_at"] = time.time()
                trajectory.append(stamped)

        return trajectory
219 |
```
--------------------------------------------------------------------------------
/blog/computer-use-agents-for-growth-hacking.md:
--------------------------------------------------------------------------------
```markdown
1 | # Computer Use Agents for Growth Hacking: The Cua-la Strategy
2 |
3 | _Published on January 16, 2025 by Sarina Li_
4 |
5 | <img src="./assets/esther-and-sarina.JPG" alt="Esther and Sarina at DevFest Toronto">
6 |
7 | Growing a developer-focused product is hard. Traditional marketing doesn't work. Booth rentals cost thousands. Sponsorships cost tens of thousands.
8 |
9 | So we tried something different at Google DevFest Toronto: show up with backpacks full of cute cua-la keychains and see what happens.
10 |
11 | This is the story of how two new hires, a growth engineer and a designer/artist, guerrilla marketed their way through a major tech conference with $200 worth of merch and a post-event automation pipeline.
12 |
13 | ## Meet the Team
14 |
15 | **Sarina** (Growth Engineering): Built the post-event automation pipeline that extracts LinkedIn connections and generates personalized messages while you sleep.
16 |
17 | **Esther** (Design + Art): Hand-crafted every piece of artwork, giving life to Cua through illustrations, branding, and yes, extremely cute cua-la keychains.
18 |
19 | The thesis: what if we could draw people in with irresistible physical merch, then use computer use agents to handle all the tedious follow-up work?
20 |
21 | ## The cua-la Strategy
22 |
23 | <img src="./assets/cua-at-devfest.JPG" alt="Guerrilla marketing at DevFest Toronto">
24 |
25 | Google DevFest Toronto brought together hundreds of developers and AI enthusiasts. We didn't have a booth. We didn't have demos. We showed up with backpacks full of cua-la keychains with the cua.ai logo and started handing them out.
26 |
27 | That's it. Pure guerrilla marketing — and the cua-las were absurdly effective.
28 |
29 | People would literally crowd around us, not because they were interested in computer use (at first), but because they wanted a cua-la. We'd pitch Cua while handing out keychains, and suddenly we had an engaged audience!
30 |
31 | <img src="./assets/devfest-image.JPG" alt="DevFest crowd">
32 |
33 | ### The Magic Moment
34 |
35 | A few people stuck the cua-las on their bags immediately. Then, throughout the event, we started getting approached:
36 |
37 | "Wait, are you the Cua girls?"
38 |
39 | They'd seen the cua-las on someone's bag, asked about it, and tracked us down! The keychains became walking advertisements.
40 |
41 | <img src="./assets/htn-at-devfest.JPG" alt="Hack the North recognition at DevFest">
42 |
43 | Even better: two attendees recognized Cua from Hack the North. Our previous event marketing was actually working. People remembered us.
44 |
45 | ## Part 2: The Automation (Try It Yourself)
46 |
47 | After DevFest, we had 20+ new LinkedIn connections. Normally, this means hours of:
48 |
49 | - Manually copying names, roles, companies
50 | - Opening each profile to find contact info
51 | - Crafting personalized follow-up messages
52 | - Updating your CRM
53 |
54 | Sarina had a better idea: build the automation we wish existed, then open source it.
55 |
56 | **The automation is live**: [Post-Event Contact Export cookbook](https://cua.ai/docs/example-usecases/post-event-contact-export)
57 |
58 | ### How It Works
59 |
60 | <video controls width="100%">
61 | <source src="./assets/linkedin-scraping.mp4" type="video/mp4">
62 | LinkedIn scraping automation in action
63 | </video>
64 |
65 | The agent navigates LinkedIn like a human would: click profile, extract info, navigate back, repeat. But it does it overnight while you sleep.
66 |
67 | The secret sauce: **VM session persistence**. By logging into LinkedIn once through Cua's VM, the session stays alive. No captchas, no bot detection, just smooth automation.
68 |
69 | <video controls width="100%">
70 | <source src="./assets/adding-row-csv.mp4" type="video/mp4">
71 | Automatic CSV generation
72 | </video>
73 |
74 | Wake up to a clean CSV with:
75 |
76 | - First name, last name
77 | - Current role and company
78 | - LinkedIn profile URLs
79 | - Pre-generated messaging links
80 |
81 | Then use that data to craft personalized messages. Sarina wrote unique follow-ups for each person, mentioning specific conversations from DevFest.
82 |
83 | **Works for any platform**: LinkedIn, X/Twitter, or wherever your connections are. The cookbook includes full setup instructions and customizable code.
84 |
85 | ## The Results
86 |
87 | **Cost Breakdown**
88 |
89 | - Booth rental: $0 (didn't have one)
90 | - Sponsorship: $0 (didn't buy one)
91 | - cua-la keychains: ~$200
92 | - Automation: Built by Sarina in a few hours post-event
93 | - **Total spend: $200**
94 |
95 | **What We Got**
96 |
97 | - People crowding around us for cua-las
98 | - Walking advertisements on bags throughout the event
99 | - Instant brand recognition ("Are you the Cua girls?")
100 | - Two people who remembered us from Hack the North
101 | - 20+ quality connections extracted and messaged within 24 hours
102 | - Several demo requests from personalized follow-ups
103 |
104 | **ROI**
105 | Traditional event marketing at this scale: $5-10K minimum for booth + sponsorship.
106 |
107 | Our approach: $200 + scrappy execution.
108 |
109 | The automation is reusable and will save hours of manual work, and the cua-las created more organic conversations than any booth could have.
110 |
111 | ## What Didn't Work (Yet)
112 |
113 | **cua-la Distribution**
114 | We ran out faster than expected! Next time: bigger bag, or limit to one per person.
115 |
116 | **Automation Setup**
117 | The VM login step added friction. "Log in manually first, then run the script" confused some people who wanted to try it themselves. Need better first-run UX.
118 |
119 | **Message Personalization**
120 | While the extraction was automated, I still wrote each follow-up message manually. We're looking for ways to enrich messages with more context from the event, which is hard to automate.
121 |
122 | ## What's Next: NeurIPS 2025
123 |
124 | NeurIPS is the biggest AI conference of the year. Thousands of researchers, hundreds of companies.
125 |
126 | **The good news**: We still have one giant bag of cua-las left. They're already packed and ready.
127 |
128 | **The better news**: We're upgrading the automation.
129 |
130 | ### The Hypothesis
131 |
132 | The cua-las get people interested. The automation ensures we actually follow through.
133 |
134 | Most event marketing fails at the follow-up stage. You collect business cards, connect on LinkedIn, and then... nothing. The moment passes. People forget.
135 |
136 | With Cua handling the mechanical work (data organization, connection tracking, follow-up scheduling), we can focus on the human part: genuine conversations, valuable introductions, and actually helping people.
137 |
138 | ## The Framework: Cute Merch + Smart Automation
139 |
140 | Traditional event marketing: show up, pitch, collect cards.
141 |
142 | Our approach: combine two forces that shouldn't work together but do.
143 |
144 | **The Physical Hook**
145 |
146 | - Make something people actually want (not another branded pen)
147 | - Hand-crafted, memorable, Instagram-worthy
148 | - Turns attendees into walking billboards
149 | - Creates natural conversation starters
150 |
151 | **The Digital Follow-Through**
152 |
153 | - Automate the tedious post-event work
154 | - Extract connections while you sleep
155 | - Personalize follow-ups with real context
156 | - Actually close the loop before the moment passes
157 |
158 | **Why It Works**
159 | The cua-las get you in the door. The automation ensures you don't waste the opportunity.
160 |
161 | Most companies nail one or the other:
162 |
163 | - Great merch, terrible follow-up → missed opportunities
164 | - Amazing automation, boring presence → no one cares
165 |
166 | Do both, and you create a flywheel: each event builds brand recognition for the next, while automation ensures maximum value from every connection.
167 |
168 | See you at NeurIPS 2025!
169 |
170 | ---
171 |
172 | _Want to build your own growth hacking automations? Check out [Cua on GitHub](https://github.com/trycua/cua) or join our [Discord](https://discord.gg/cua) to share your experiments. cua-las not included (yet)._
173 |
```
--------------------------------------------------------------------------------
/blog/ubuntu-docker-support.md:
--------------------------------------------------------------------------------
```markdown
1 | # Ubuntu Docker Support in Cua with Kasm
2 |
3 | _Published Aug 26, 2025 by Francesco Bonacci_
4 |
5 | Today we’re shipping **Ubuntu Docker support** in Cua. You get a full Linux desktop inside a Docker container, viewable right in your browser—no VM spin-up, no extra clients. It behaves the same on macOS, Windows, and Linux.
6 |
7 | <img src="./assets/docker-ubuntu-support.png" alt="Cua + KasmVNC Ubuntu container desktop">
8 |
9 | ## Why we did this
10 |
11 | If you build automation or RL workflows with Cua, you’ve probably run into the usual platform walls: macOS VMs (via Lume) are Apple-Silicon only; Windows Sandbox needs Pro/Enterprise; giving agents your host desktop is… exciting, but risky; and little OS quirks make “build once, run anywhere” harder than it should be.
12 |
13 | We wanted something lightweight, isolated, and identical across machines. So we put a desktop in a container.
14 |
15 | ## Why we didn’t use QEMU/KVM
16 |
17 | Short answer: **portability, startup time, and ops friction.**
18 |
19 | - **Runs everywhere, no hypervisor drama.** KVM needs Linux; Hyper-V/Virtualization.Framework setups vary by host and policy. Docker is ubiquitous across macOS/Windows/Linux and allowed in most CI runners—so your GUI env actually runs where your team works.
20 | - **Faster boot & smaller footprints.** Containers cold-start in seconds and images are GB-scale; VMs tend to be minutes and tens of GB. That matters for parallel agents, CI, and local iteration.
21 | - **Lower ops overhead.** No nested virt, kernel modules, or privileged host tweaks that many orgs (and cloud runners) block. Pull → run → browser.
22 | - **Same image, everywhere.** One Docker image gives you an identical desktop on every dev laptop and in CI.
23 | - **Web-first access out of the box.** KasmVNC serves the desktop over HTTP—no extra VNC/RDP clients or SPICE config.
24 |
25 | **When we _do_ reach for QEMU/KVM:**
26 |
27 | - You need **true OS isolation** or to run **non-Linux** guests.
28 | - You want **kernel-level features** or **device/GPU passthrough** (VFIO).
29 | - You’re optimizing for **hardware realism** over startup speed and density.
30 |
31 | For this release, the goal was a **cross-platform Linux desktop that feels instant and identical** across local dev and CI. Containers + KasmVNC hit that sweet spot.
32 |
33 | ## What we built
34 |
35 | Under the hood it’s **KasmVNC + Ubuntu 22.04 (Xfce) in Docker**, pre-configured for computer-use automation. You get a proper GUI desktop served over HTTP (no VNC/RDP client), accessible from any modern browser. Cua’s Computer server boots automatically so your agents can connect immediately.
36 |
37 | ### How it works (at a glance)
38 |
39 | ```
40 | Your System
41 | └─ Docker Container
42 | └─ Xfce Desktop + KasmVNC → open in your browser
43 | ```
44 |
45 | ---
46 |
47 | ## Quick start
48 |
49 | 1. **Install Docker** — Docker Desktop (macOS/Windows) or Docker Engine (Linux).
50 |
51 | 2. **Pull or build the image**
52 |
53 | ```bash
54 | # Pull (recommended)
55 | docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest
56 |
57 | # Or build locally
58 | cd libs/kasm
59 | docker build -t cua-ubuntu:latest .
60 | ```
61 |
62 | 3. **Run with Cua’s Computer SDK**
63 |
64 | ```python
65 | from computer import Computer
66 |
67 | computer = Computer(
68 | os_type="linux",
69 | provider_type="docker",
70 | image="trycua/cua-ubuntu:latest",
71 | name="my-automation-container"
72 | )
73 |
74 | await computer.run()
75 | ```
76 |
77 | ### Make an agent that drives this desktop
78 |
79 | ```python
80 | from agent import ComputerAgent
81 |
82 | # assumes `computer` is the instance created above
83 | agent = ComputerAgent("openrouter/z-ai/glm-4.5v", tools=[computer])
84 |
85 | async for _ in agent.run("Click on the search bar and type 'hello world'"):
86 | pass
87 | ```
88 |
89 | > Use any VLM with tool use; just make sure your OpenRouter creds are set.
90 |
91 | By default you land on **Ubuntu 22.04 + Xfce** with a browser and desktop basics, the **Computer server** is running, the **web viewer** is available at `http://localhost:8006`, and common automation tools are preinstalled.
92 |
93 | ---
94 |
95 | ## What’s inside (in plain English)
96 |
97 | A tidy Linux desktop with web access through **KasmVNC**, Python 3.11 and dev tools, plus utilities you’ll actually use for automation—`wmctrl` for windows, `xclip` for clipboard, `ffmpeg` for media, screenshot helpers, and so on. It starts as a **non-root `kasm-user`**, lives in an **isolated filesystem** (unless you mount volumes), and ships with **SSL off for local dev** so you terminate TLS upstream when you deploy.
98 |
99 | ---
100 |
101 | ## How it compares
102 |
103 | | Feature | KasmVNC Docker | Lume (macOS VM) | Windows Sandbox |
104 | | ---------------- | --------------------- | --------------------- | ---------------------- |
105 | | Platform support | macOS, Windows, Linux | macOS (Apple Silicon) | Windows Pro/Enterprise |
106 | | Resource usage | Low (container) | Medium (full VM) | Medium (full VM) |
107 | | Setup time | \~30s | 2–5 min | 1–2 min |
108 | | GUI desktop | Linux | macOS | Windows |
109 | | Web access | Browser (no client) | Typically VNC client | Typically RDP client |
110 | | Consistency | Same everywhere | Hardware-dependent | OS-dependent |
111 |
112 | **Use KasmVNC Docker when…** you want the **same GUI env across devs/CI/platforms**, you’re doing **RL or end-to-end GUI tests**, or you need **many isolated desktops on one machine**.
113 | **Use alternatives when…** you need native **macOS** (→ Lume) or native **Windows** (→ Windows Sandbox).
114 |
115 | ---
116 |
117 | ## Using the Agent Framework (parallel example)
118 |
119 | A compact pattern for running multiple desktops and agents side-by-side:
120 |
121 | ```python
122 | import asyncio
123 | from computer import Computer
124 | from agent import ComputerAgent
125 |
126 | # Create multiple computer instances (each gets its own desktop)
127 | computers = []
128 | for i in range(3):
129 | c = Computer(
130 | os_type="linux",
131 | provider_type="docker",
132 | image="trycua/cua-ubuntu:latest",
133 | name=f"parallel-desktop-{i}"
134 | )
135 | computers.append(c)
136 | await c.run()
137 |
138 | # Pair each desktop with a task
139 | tasks = [
140 | "open github and search for 'trycua/cua'",
141 | "open a text editor and write 'hello world'",
142 | "open the browser and go to google.com",
143 | ]
144 |
145 | agents = [
146 | ComputerAgent(model="openrouter/z-ai/glm-4.5v", tools=[c])
147 | for c in computers
148 | ]
149 |
150 | async def run_agent(agent, task):
151 | async for _ in agent.run(task):
152 | pass
153 |
154 | await asyncio.gather(*[run_agent(a, t) for a, t in zip(agents, tasks)])
155 | ```
156 |
157 | ---
158 |
159 | ## What’s next
160 |
161 | We’re polishing a **CLI to push/scale these containers on Cua Cloud**, exploring **GPU acceleration** for in-container inference, and publishing **prebuilt images** for Playwright, Selenium, and friends.
162 |
163 | ---
164 |
165 | ## Try it
166 |
167 | ```python
168 | from computer import Computer
169 | computer = Computer(os_type="linux", provider_type="docker", image="trycua/cua-ubuntu:latest")
170 | await computer.run()
171 | ```
172 |
173 | ---
174 |
175 | ## Links
176 |
177 | - **Docker Provider Docs:** [https://cua.ai/docs/computer-sdk/computers#linux-on-docker](https://cua.ai/docs/computer-sdk/computers#linux-on-docker)
178 | - **KasmVNC:** [https://github.com/kasmtech/KasmVNC](https://github.com/kasmtech/KasmVNC)
179 | - **Container Source:** [https://github.com/trycua/cua/tree/main/libs/kasm](https://github.com/trycua/cua/tree/main/libs/kasm)
180 | - **Computer SDK:** [https://cua.ai/docs/computer-sdk/computers](https://cua.ai/docs/computer-sdk/computers)
181 | - **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai)
182 |
183 | Questions or weird edge cases? Ping us on Discord—we’re curious to see what you build.
184 |
```
--------------------------------------------------------------------------------
/libs/python/bench-ui/bench_ui/child.py:
--------------------------------------------------------------------------------
```python
1 | import asyncio
2 | import json
3 | import os
4 | import random
5 | import socket
6 | import sys
7 | import threading
8 | from pathlib import Path
9 | from typing import Optional
10 |
11 | import webview
12 | from aiohttp import web
13 |
14 |
15 | def _get_free_port() -> int:
16 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
17 | s.bind(("127.0.0.1", 0))
18 | return s.getsockname()[1]
19 |
20 |
def _start_http_server(
    window: webview.Window,
    port: int,
    ready_event: threading.Event,
    html_content: str | None = None,
    folder_path: str | None = None,
):
    """Start a background aiohttp control server bound to 127.0.0.1:`port`.

    The server exposes:
      - POST /rect: resolve a CSS selector in `window` and return its bounding
        rect, in window coordinates or approximate screen coordinates.
      - POST /eval: evaluate arbitrary JavaScript in `window`.
      - GET  /:    serve `html_content` (or a status JSON), unless
        `folder_path` is given, in which case the folder is served statically.

    `ready_event` must be set once the window's page has loaded; handlers wait
    briefly on it so JS evaluation does not run against an empty window.
    The server runs on its own asyncio loop in a daemon thread and is never
    shut down explicitly; it dies with the process.
    """
    async def rect_handler(request: web.Request):
        # Returns {"rect": {...}} or {"rect": null} when the selector matches
        # nothing; errors use {"error": ...} with an appropriate status code.
        try:
            data = await request.json()
        except Exception:
            return web.json_response({"error": "invalid_json"}, status=400)
        selector = data.get("selector")
        space = data.get("space", "window")  # "window" (default) or "screen"
        if not isinstance(selector, str):
            return web.json_response({"error": "selector_required"}, status=400)

        # Ensure window content is loaded
        if not ready_event.is_set():
            # give it a short chance to finish loading
            ready_event.wait(timeout=2.0)
            if not ready_event.is_set():
                return web.json_response({"error": "window_not_ready"}, status=409)

        # Safely embed selector into JS (json.dumps handles quoting/escaping)
        selector_js = json.dumps(selector)
        if space == "screen":
            # Compute approximate screen coordinates using window metrics.
            # screenX/screenY give the outer window origin; adding the frame
            # height (outer - inner) approximates the content-area origin.
            js = (
                "(function(){"
                f"const s = {selector_js};"
                "const el = document.querySelector(s);"
                "if(!el){return null;}"
                "const r = el.getBoundingClientRect();"
                "const sx = (window.screenX ?? window.screenLeft ?? 0);"
                "const syRaw = (window.screenY ?? window.screenTop ?? 0);"
                "const frameH = (window.outerHeight - window.innerHeight) || 0;"
                "const sy = syRaw + frameH;"
                "return {x:sx + r.left, y:sy + r.top, width:r.width, height:r.height};"
                "})()"
            )
        else:
            js = (
                "(function(){"
                f"const s = {selector_js};"
                "const el = document.querySelector(s);"
                "if(!el){return null;}"
                "const r = el.getBoundingClientRect();"
                "return {x:r.left,y:r.top,width:r.width,height:r.height};"
                "})()"
            )
        try:
            # Evaluate JS on the target window; this call is thread-safe in pywebview
            result = window.evaluate_js(js)
        except Exception as e:
            return web.json_response({"error": str(e)}, status=500)
        return web.json_response({"rect": result})

    async def eval_handler(request: web.Request):
        # Evaluates caller-supplied JavaScript verbatim in the window.
        # This is a local-only debug/control endpoint; it is intentionally
        # unauthenticated and bound to loopback.
        try:
            data = await request.json()
        except Exception:
            return web.json_response({"error": "invalid_json"}, status=400)
        code = data.get("javascript") or data.get("code")  # accept either key
        if not isinstance(code, str):
            return web.json_response({"error": "javascript_required"}, status=400)

        if not ready_event.is_set():
            ready_event.wait(timeout=2.0)
            if not ready_event.is_set():
                return web.json_response({"error": "window_not_ready"}, status=409)

        try:
            result = window.evaluate_js(code)
        except Exception as e:
            return web.json_response({"error": str(e)}, status=500)
        return web.json_response({"result": result})

    async def index_handler(request: web.Request):
        # Serves the inline HTML when provided; otherwise a simple status JSON.
        if html_content is None:
            return web.json_response({"status": "ok", "message": "bench-ui control server"})
        return web.Response(text=html_content, content_type="text/html")

    app = web.Application()

    # If serving a folder, add static file routes
    # NOTE(review): the static root handles GET; POST /rect and /eval appear
    # to still be dispatched to the JSON handlers below — confirm against
    # aiohttp's routing rules for method mismatches on a matching path.
    if folder_path:
        app.router.add_static("/", folder_path, show_index=True)
    else:
        app.router.add_get("/", index_handler)

    app.router.add_post("/rect", rect_handler)
    app.router.add_post("/eval", eval_handler)

    loop = asyncio.new_event_loop()

    def run_loop():
        # Dedicated event loop for the server thread; runs forever (daemon).
        asyncio.set_event_loop(loop)
        runner = web.AppRunner(app)
        loop.run_until_complete(runner.setup())
        site = web.TCPSite(runner, "127.0.0.1", port)
        loop.run_until_complete(site.start())
        loop.run_forever()

    t = threading.Thread(target=run_loop, daemon=True)
    t.start()
127 |
128 |
def main():
    """Entry point for the bench-ui child process.

    Reads a JSON config file whose path is given as argv[1], creates a
    pywebview window showing one of: an external URL, a static folder, or
    inline HTML (served via the local control server), starts that control
    server, prints a single JSON line {"pid", "port"} for the parent process
    to read, then blocks in the GUI event loop.
    """
    if len(sys.argv) < 2:
        print("Usage: python -m bench_ui.child <config.json>", file=sys.stderr)
        sys.exit(2)

    cfg_path = Path(sys.argv[1])
    cfg = json.loads(cfg_path.read_text(encoding="utf-8"))

    # Config fields; `url` > `folder` > inline `html` in priority below.
    html: Optional[str] = cfg.get("html") or ""
    url: Optional[str] = cfg.get("url")
    folder: Optional[str] = cfg.get("folder")
    title: str = cfg.get("title", "Window")
    x: Optional[int] = cfg.get("x")
    y: Optional[int] = cfg.get("y")
    width: int = int(cfg.get("width", 600))
    height: int = int(cfg.get("height", 400))
    # NOTE(review): `icon`, `use_inner_size`, and `title_bar_style` are parsed
    # but never used in this function — confirm whether they should be passed
    # to webview.create_window or can be dropped from the config schema.
    icon: Optional[str] = cfg.get("icon")
    use_inner_size: bool = bool(cfg.get("use_inner_size", False))
    title_bar_style: str = cfg.get("title_bar_style", "default")

    # Choose port early so we can point the window to it when serving inline HTML or folder
    port = _get_free_port()

    # Create window
    if url:
        # External URL: the control server serves only /rect and /eval.
        window = webview.create_window(
            title,
            url=url,
            width=width,
            height=height,
            x=x,
            y=y,
            confirm_close=False,
            text_select=True,
            background_color="#FFFFFF",
        )
        html_for_server = None
        folder_for_server = None
    elif folder:
        # Serve static folder at control server root and point window to index.html
        resolved_url = f"http://127.0.0.1:{port}/index.html"
        window = webview.create_window(
            title,
            url=resolved_url,
            width=width,
            height=height,
            x=x,
            y=y,
            confirm_close=False,
            text_select=True,
            background_color="#FFFFFF",
        )
        html_for_server = None
        folder_for_server = folder
    else:
        # Serve inline HTML at control server root and point window to it
        resolved_url = f"http://127.0.0.1:{port}/"
        window = webview.create_window(
            title,
            url=resolved_url,
            width=width,
            height=height,
            x=x,
            y=y,
            confirm_close=False,
            text_select=True,
            background_color="#FFFFFF",
        )
        html_for_server = html
        folder_for_server = None

    # Track when the page is loaded so JS execution succeeds
    window_ready = threading.Event()

    def _on_loaded():
        window_ready.set()

    window.events.loaded += _on_loaded  # type: ignore[attr-defined]

    # Start HTTP server for control (and optionally serve inline HTML or static folder)
    _start_http_server(
        window, port, window_ready, html_content=html_for_server, folder_path=folder_for_server
    )

    # Print startup info for parent to read (single JSON line on stdout)
    print(json.dumps({"pid": os.getpid(), "port": port}), flush=True)

    # Start GUI (blocking); debug tooling toggled via CUA_BENCH_UI_DEBUG env var
    webview.start(debug=os.environ.get("CUA_BENCH_UI_DEBUG", "false").lower() in ("true", "1"))


if __name__ == "__main__":
    main()
222 |
```
--------------------------------------------------------------------------------
/libs/lume/src/Commands/Config.swift:
--------------------------------------------------------------------------------
```swift
1 | import ArgumentParser
2 | import Foundation
3 |
/// `lume config` — get or set lume configuration (storage locations, cache
/// directory, caching toggle). Running `lume config` with no subcommand is
/// equivalent to `lume config get`.
struct Config: ParsableCommand {
    static let configuration = CommandConfiguration(
        commandName: "config",
        abstract: "Get or set lume configuration",
        // Fix: register Debug so `lume config debug` is actually invokable.
        // It was previously defined below but never added here, making it
        // unreachable; `shouldDisplay: false` keeps it out of help output.
        subcommands: [Get.self, Storage.self, Cache.self, Caching.self, Debug.self],
        defaultSubcommand: Get.self
    )

    // MARK: - Basic Configuration Subcommands

    /// Prints a human-readable summary of the current settings.
    struct Get: ParsableCommand {
        static let configuration = CommandConfiguration(
            commandName: "get",
            abstract: "Get current configuration"
        )

        func run() throws {
            let controller = LumeController()
            let settings = controller.getSettings()

            // Display default location
            print(
                "Default VM storage: \(settings.defaultLocationName) (\(settings.defaultLocation?.path ?? "not set"))"
            )

            // Display cache directory
            print("Cache directory: \(settings.cacheDirectory)")

            // Display caching enabled status
            print("Caching enabled: \(settings.cachingEnabled)")

            // Display all locations
            if !settings.vmLocations.isEmpty {
                print("\nConfigured VM storage locations:")
                for location in settings.sortedLocations {
                    let isDefault = location.name == settings.defaultLocationName
                    let defaultMark = isDefault ? " (default)" : ""
                    print("  - \(location.name): \(location.path)\(defaultMark)")
                }
            }
        }
    }

    // MARK: - Debug Command

    /// Hidden diagnostic command: dumps raw settings state for bug reports.
    struct Debug: ParsableCommand {
        static let configuration = CommandConfiguration(
            commandName: "debug",
            abstract: "Output detailed debug information about current configuration",
            shouldDisplay: false
        )

        func run() throws {
            let debugInfo = SettingsManager.shared.debugSettings()
            print(debugInfo)
        }
    }

    // MARK: - Caching Management Subcommands

    /// `lume config caching get|set` — toggle/inspect image caching.
    struct Caching: ParsableCommand {
        static let configuration = CommandConfiguration(
            commandName: "caching",
            abstract: "Manage image caching settings",
            subcommands: [GetCaching.self, SetCaching.self]
        )

        struct GetCaching: ParsableCommand {
            static let configuration = CommandConfiguration(
                commandName: "get",
                abstract: "Show current caching status"
            )

            func run() throws {
                let controller = LumeController()
                let cachingEnabled = controller.isCachingEnabled()
                print("Caching enabled: \(cachingEnabled)")
            }
        }

        struct SetCaching: ParsableCommand {
            static let configuration = CommandConfiguration(
                commandName: "set",
                abstract: "Enable or disable image caching"
            )

            @Argument(help: "Enable or disable caching (true/false)")
            var enabled: Bool

            func run() throws {
                let controller = LumeController()
                try controller.setCachingEnabled(enabled)
                print("Caching \(enabled ? "enabled" : "disabled")")
            }
        }
    }

    // MARK: - Cache Management Subcommands

    /// `lume config cache get|set` — inspect/change the cache directory.
    struct Cache: ParsableCommand {
        static let configuration = CommandConfiguration(
            commandName: "cache",
            abstract: "Manage cache settings",
            subcommands: [GetCache.self, SetCache.self]
        )

        struct GetCache: ParsableCommand {
            static let configuration = CommandConfiguration(
                commandName: "get",
                abstract: "Get current cache directory"
            )

            func run() throws {
                let controller = LumeController()
                let cacheDir = controller.getCacheDirectory()
                print("Cache directory: \(cacheDir)")
            }
        }

        struct SetCache: ParsableCommand {
            static let configuration = CommandConfiguration(
                commandName: "set",
                abstract: "Set cache directory"
            )

            @Argument(help: "Path to cache directory")
            var path: String

            func run() throws {
                let controller = LumeController()
                try controller.setCacheDirectory(path: path)
                print("Cache directory set to: \(path)")
            }
        }
    }

    // MARK: - Storage Management Subcommands

    /// `lume config storage add|remove|list|default` — manage named VM
    /// storage locations.
    struct Storage: ParsableCommand {
        static let configuration = CommandConfiguration(
            commandName: "storage",
            abstract: "Manage VM storage locations",
            subcommands: [Add.self, Remove.self, List.self, Default.self]
        )

        struct Add: ParsableCommand {
            static let configuration = CommandConfiguration(
                commandName: "add",
                abstract: "Add a new VM storage location"
            )

            @Argument(help: "Storage name (alphanumeric with dashes/underscores)")
            var name: String

            @Argument(help: "Path to VM storage directory")
            var path: String

            func run() throws {
                let controller = LumeController()
                try controller.addLocation(name: name, path: path)
                print("Added VM storage location: \(name) at \(path)")
            }
        }

        struct Remove: ParsableCommand {
            static let configuration = CommandConfiguration(
                commandName: "remove",
                abstract: "Remove a VM storage location"
            )

            @Argument(help: "Storage name to remove")
            var name: String

            func run() throws {
                let controller = LumeController()
                try controller.removeLocation(name: name)
                print("Removed VM storage location: \(name)")
            }
        }

        struct List: ParsableCommand {
            static let configuration = CommandConfiguration(
                commandName: "list",
                abstract: "List all VM storage locations"
            )

            func run() throws {
                let controller = LumeController()
                let settings = controller.getSettings()

                if settings.vmLocations.isEmpty {
                    print("No VM storage locations configured")
                    return
                }

                print("VM Storage Locations:")
                for location in settings.sortedLocations {
                    let isDefault = location.name == settings.defaultLocationName
                    let defaultMark = isDefault ? " (default)" : ""
                    print("  - \(location.name): \(location.path)\(defaultMark)")
                }
            }
        }

        struct Default: ParsableCommand {
            static let configuration = CommandConfiguration(
                commandName: "default",
                abstract: "Set the default VM storage location"
            )

            @Argument(help: "Storage name to set as default")
            var name: String

            func run() throws {
                let controller = LumeController()
                try controller.setDefaultLocation(name: name)
                print("Set default VM storage location to: \(name)")
            }
        }
    }
}
225 |
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/holo.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Holo 1.5 agent loop implementation for click prediction using litellm.acompletion.
3 |
4 | Implements the Holo1.5 grounding behavior:
5 | - Prompt asks for absolute pixel coordinates in JSON: {"action":"click_absolute","x":int,"y":int}
6 | - Optionally resizes the image using Qwen2-VL smart_resize parameters (via transformers AutoProcessor)
7 | - If resized, maps predicted coordinates back to the original screenshot resolution
8 |
9 | Note: We do NOT manually load the model; acompletions (via HuggingFaceLocalAdapter)
10 | will handle loading based on the provided model name.
11 | """
12 |
13 | from __future__ import annotations
14 |
15 | import base64
16 | import json
17 | from io import BytesIO
18 | from typing import Any, Dict, List, Optional, Tuple
19 |
20 | import litellm
21 | from PIL import Image
22 |
23 | from ..decorators import register_agent
24 | from ..types import AgentCapability
25 | from .base import AsyncAgentConfig
26 |
27 |
28 | def _strip_hf_prefix(model: str) -> str:
29 | """Strip provider prefixes like 'huggingface-local/' from model names for HF processor load."""
30 | if "/" in model and model.lower().startswith("huggingface-local/"):
31 | return model.split("/", 1)[1]
32 | return model
33 |
34 |
def _maybe_smart_resize(image: Image.Image, model: str) -> Tuple[Image.Image, Tuple[int, int]]:
    """
    Try to compute Qwen2-VL smart_resize output size using transformers AutoProcessor.

    Returns (processed_image, (orig_w, orig_h)). If transformers or the
    processor are unavailable, or any step fails, returns the original image
    and size without resizing — the whole function is best-effort.
    """
    orig_w, orig_h = image.size
    try:
        # Import lazily to avoid hard dependency if not installed
        from transformers import AutoProcessor  # type: ignore
        from transformers.models.qwen2_vl.image_processing_qwen2_vl import (  # type: ignore
            smart_resize,
        )

        # NOTE(review): the processor is downloaded/loaded on every call —
        # consider caching per model name if this is on a hot path.
        processor_name = _strip_hf_prefix(model)
        processor = AutoProcessor.from_pretrained(processor_name)
        image_processor = getattr(processor, "image_processor", None)
        if image_processor is None:
            return image, (orig_w, orig_h)

        # NOTE(review): fallbacks (patch_size=14, merge_size=1) only apply when
        # the attributes are missing; Qwen2-VL processors typically carry
        # merge_size=2 (factor 28) — confirm the fallbacks match the intended
        # processor family.
        factor = getattr(image_processor, "patch_size", 14) * getattr(
            image_processor, "merge_size", 1
        )
        min_pixels = getattr(image_processor, "min_pixels", 256 * 256)
        max_pixels = getattr(image_processor, "max_pixels", 1536 * 1536)

        # smart_resize takes/returns (height, width) order.
        resized_h, resized_w = smart_resize(
            orig_h,
            orig_w,
            factor=factor,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )

        # No-op resize: hand back the original object untouched.
        if (resized_w, resized_h) == (orig_w, orig_h):
            return image, (orig_w, orig_h)

        processed = image.resize((resized_w, resized_h), resample=Image.Resampling.LANCZOS)
        return processed, (orig_w, orig_h)
    except Exception:
        # If any failure (no transformers, processor load error), fall back to original
        return image, (orig_w, orig_h)
78 |
79 |
80 | def _build_holo_prompt(instruction: str) -> str:
81 | """Construct the Holo1.5 grounding prompt."""
82 | # Keep it close to the cookbook while avoiding heavy schema generation
83 | schema_hint = '{"action": "click_absolute", "x": <int>, "y": <int>}'
84 | return (
85 | "Localize an element on the GUI image according to the provided target and output a click position. "
86 | f"You must output a valid JSON following the format: {schema_hint} "
87 | f"Your target is: {instruction}"
88 | )
89 |
90 |
91 | def _parse_click_json(output_text: str) -> Optional[Tuple[int, int]]:
92 | """
93 | Parse JSON from model output and extract x, y ints.
94 | Tries to find the first JSON object substring if extra text is present.
95 | """
96 | try:
97 | # Fast path: direct JSON
98 | data = json.loads(output_text)
99 | except Exception:
100 | # Try to locate a JSON object within the text
101 | start = output_text.find("{")
102 | end = output_text.rfind("}")
103 | if start == -1 or end == -1 or end <= start:
104 | return None
105 | try:
106 | data = json.loads(output_text[start : end + 1])
107 | except Exception:
108 | return None
109 |
110 | try:
111 | x = int(data.get("x"))
112 | y = int(data.get("y"))
113 | return x, y
114 | except Exception:
115 | return None
116 |
117 |
118 | @register_agent(models=r"(?i).*(Holo1\.5|Hcompany/Holo1\.5).*")
119 | class HoloConfig(AsyncAgentConfig):
120 | """Holo is a family of UI grounding models from H Company"""
121 |
122 | async def predict_step(
123 | self,
124 | messages: List[Dict[str, Any]],
125 | model: str,
126 | tools: Optional[List[Dict[str, Any]]] = None,
127 | max_retries: Optional[int] = None,
128 | stream: bool = False,
129 | computer_handler=None,
130 | _on_api_start=None,
131 | _on_api_end=None,
132 | _on_usage=None,
133 | _on_screenshot=None,
134 | **kwargs,
135 | ) -> Dict[str, Any]:
136 | # Holo models are only trained on UI localization tasks, not all-in-one agent
137 | raise NotImplementedError()
138 |
139 | async def predict_click(
140 | self,
141 | model: str,
142 | image_b64: str,
143 | instruction: str,
144 | **kwargs,
145 | ) -> Optional[Tuple[int, int]]:
146 | """
147 | Predict click coordinates using Holo1.5 via litellm.acompletion.
148 |
149 | - Optionally smart-resizes the image using Qwen2-VL rules if transformers are available
150 | - Prompts for JSON with absolute pixel coordinates
151 | - Parses x,y and maps back to original screenshot size if resized
152 | """
153 | try:
154 | img_bytes = base64.b64decode(image_b64)
155 | original_img = Image.open(BytesIO(img_bytes))
156 | except Exception:
157 | return None
158 |
159 | # Optional preprocessing
160 | processed_img, (orig_w, orig_h) = _maybe_smart_resize(original_img, model)
161 |
162 | # If we resized, send the resized image; otherwise send original
163 | img_to_send = processed_img
164 | buf = BytesIO()
165 | img_to_send.save(buf, format="PNG")
166 | processed_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")
167 |
168 | prompt = _build_holo_prompt(instruction)
169 |
170 | messages = [
171 | {
172 | "role": "user",
173 | "content": [
174 | {
175 | "type": "image_url",
176 | "image_url": {"url": f"data:image/png;base64,{processed_b64}"},
177 | },
178 | {"type": "text", "text": prompt},
179 | ],
180 | }
181 | ]
182 |
183 | api_kwargs = {
184 | "model": model,
185 | "messages": messages,
186 | # Deterministic, small output
187 | "max_tokens": kwargs.get("max_tokens", 256),
188 | "temperature": kwargs.get("temperature", 0.0),
189 | }
190 |
191 | response = await litellm.acompletion(**api_kwargs)
192 | output_text = (response.choices[0].message.content or "").strip() # type: ignore
193 |
194 | coords = _parse_click_json(output_text)
195 | if coords is None:
196 | return None
197 |
198 | x, y = coords
199 |
200 | # Map back to original size if we resized
201 | proc_w, proc_h = img_to_send.size
202 | if (proc_w, proc_h) != (orig_w, orig_h):
203 | try:
204 | sx = orig_w / float(proc_w)
205 | sy = orig_h / float(proc_h)
206 | x = int(round(x * sx))
207 | y = int(round(y * sy))
208 | except Exception:
209 | # Fallback: clamp within original bounds
210 | pass
211 |
212 | # Clamp to original image bounds
213 | x = max(0, min(orig_w - 1, x))
214 | y = max(0, min(orig_h - 1, y))
215 | return x, y
216 |
217 | def get_capabilities(self) -> List[AgentCapability]:
218 | return ["click"]
219 |
```