This is page 5 of 20. Use http://codebase.md/trycua/cua?page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github
│ ├── FUNDING.yml
│ ├── scripts
│ │ ├── get_pyproject_version.py
│ │ └── tests
│ │ ├── __init__.py
│ │ ├── README.md
│ │ └── test_get_pyproject_version.py
│ └── workflows
│ ├── bump-version.yml
│ ├── ci-lume.yml
│ ├── docker-publish-cua-linux.yml
│ ├── docker-publish-cua-windows.yml
│ ├── docker-publish-kasm.yml
│ ├── docker-publish-xfce.yml
│ ├── docker-reusable-publish.yml
│ ├── link-check.yml
│ ├── lint.yml
│ ├── npm-publish-cli.yml
│ ├── npm-publish-computer.yml
│ ├── npm-publish-core.yml
│ ├── publish-lume.yml
│ ├── pypi-publish-agent.yml
│ ├── pypi-publish-computer-server.yml
│ ├── pypi-publish-computer.yml
│ ├── pypi-publish-core.yml
│ ├── pypi-publish-mcp-server.yml
│ ├── pypi-publish-som.yml
│ ├── pypi-reusable-publish.yml
│ ├── python-tests.yml
│ ├── test-cua-models.yml
│ └── test-validation-script.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .prettierrc.yaml
├── .vscode
│ ├── docs.code-workspace
│ ├── extensions.json
│ ├── launch.json
│ ├── libs-ts.code-workspace
│ ├── lume.code-workspace
│ ├── lumier.code-workspace
│ ├── py.code-workspace
│ └── settings.json
├── blog
│ ├── app-use.md
│ ├── assets
│ │ ├── composite-agents.png
│ │ ├── docker-ubuntu-support.png
│ │ ├── hack-booth.png
│ │ ├── hack-closing-ceremony.jpg
│ │ ├── hack-cua-ollama-hud.jpeg
│ │ ├── hack-leaderboard.png
│ │ ├── hack-the-north.png
│ │ ├── hack-winners.jpeg
│ │ ├── hack-workshop.jpeg
│ │ ├── hud-agent-evals.png
│ │ └── trajectory-viewer.jpeg
│ ├── bringing-computer-use-to-the-web.md
│ ├── build-your-own-operator-on-macos-1.md
│ ├── build-your-own-operator-on-macos-2.md
│ ├── cloud-windows-ga-macos-preview.md
│ ├── composite-agents.md
│ ├── computer-use-agents-for-growth-hacking.md
│ ├── cua-hackathon.md
│ ├── cua-playground-preview.md
│ ├── cua-vlm-router.md
│ ├── hack-the-north.md
│ ├── hud-agent-evals.md
│ ├── human-in-the-loop.md
│ ├── introducing-cua-cli.md
│ ├── introducing-cua-cloud-containers.md
│ ├── lume-to-containerization.md
│ ├── neurips-2025-cua-papers.md
│ ├── sandboxed-python-execution.md
│ ├── training-computer-use-models-trajectories-1.md
│ ├── trajectory-viewer.md
│ ├── ubuntu-docker-support.md
│ └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│ ├── .env.example
│ ├── .gitignore
│ ├── content
│ │ └── docs
│ │ ├── agent-sdk
│ │ │ ├── agent-loops.mdx
│ │ │ ├── benchmarks
│ │ │ │ ├── index.mdx
│ │ │ │ ├── interactive.mdx
│ │ │ │ ├── introduction.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── osworld-verified.mdx
│ │ │ │ ├── screenspot-pro.mdx
│ │ │ │ └── screenspot-v2.mdx
│ │ │ ├── callbacks
│ │ │ │ ├── agent-lifecycle.mdx
│ │ │ │ ├── cost-saving.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── logging.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── pii-anonymization.mdx
│ │ │ │ └── trajectories.mdx
│ │ │ ├── chat-history.mdx
│ │ │ ├── custom-tools.mdx
│ │ │ ├── customizing-computeragent.mdx
│ │ │ ├── integrations
│ │ │ │ ├── hud.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── observability.mdx
│ │ │ ├── mcp-server
│ │ │ │ ├── client-integrations.mdx
│ │ │ │ ├── configuration.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── llm-integrations.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── tools.mdx
│ │ │ │ └── usage.mdx
│ │ │ ├── message-format.mdx
│ │ │ ├── meta.json
│ │ │ ├── migration-guide.mdx
│ │ │ ├── prompt-caching.mdx
│ │ │ ├── supported-agents
│ │ │ │ ├── composed-agents.mdx
│ │ │ │ ├── computer-use-agents.mdx
│ │ │ │ ├── grounding-models.mdx
│ │ │ │ ├── human-in-the-loop.mdx
│ │ │ │ └── meta.json
│ │ │ ├── supported-model-providers
│ │ │ │ ├── cua-vlm-router.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ └── local-models.mdx
│ │ │ ├── telemetry.mdx
│ │ │ └── usage-tracking.mdx
│ │ ├── cli-playbook
│ │ │ ├── commands.mdx
│ │ │ ├── index.mdx
│ │ │ └── meta.json
│ │ ├── computer-sdk
│ │ │ ├── cloud-vm-management.mdx
│ │ │ ├── commands.mdx
│ │ │ ├── computer-server
│ │ │ │ ├── Commands.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── REST-API.mdx
│ │ │ │ └── WebSocket-API.mdx
│ │ │ ├── computer-ui.mdx
│ │ │ ├── computers.mdx
│ │ │ ├── custom-computer-handlers.mdx
│ │ │ ├── meta.json
│ │ │ ├── sandboxed-python.mdx
│ │ │ └── tracing-api.mdx
│ │ ├── example-usecases
│ │ │ ├── form-filling.mdx
│ │ │ ├── gemini-complex-ui-navigation.mdx
│ │ │ ├── meta.json
│ │ │ ├── post-event-contact-export.mdx
│ │ │ └── windows-app-behind-vpn.mdx
│ │ ├── get-started
│ │ │ ├── meta.json
│ │ │ └── quickstart.mdx
│ │ ├── index.mdx
│ │ ├── macos-vm-cli-playbook
│ │ │ ├── lume
│ │ │ │ ├── cli-reference.mdx
│ │ │ │ ├── faq.md
│ │ │ │ ├── http-api.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── prebuilt-images.mdx
│ │ │ ├── lumier
│ │ │ │ ├── building-lumier.mdx
│ │ │ │ ├── docker-compose.mdx
│ │ │ │ ├── docker.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ └── meta.json
│ │ │ └── meta.json
│ │ └── meta.json
│ ├── next.config.mjs
│ ├── package-lock.json
│ ├── package.json
│ ├── pnpm-lock.yaml
│ ├── postcss.config.mjs
│ ├── public
│ │ └── img
│ │ ├── agent_gradio_ui.png
│ │ ├── agent.png
│ │ ├── bg-dark.jpg
│ │ ├── bg-light.jpg
│ │ ├── cli.png
│ │ ├── computer.png
│ │ ├── grounding-with-gemini3.gif
│ │ ├── hero.png
│ │ ├── laminar_trace_example.png
│ │ ├── som_box_threshold.png
│ │ └── som_iou_threshold.png
│ ├── README.md
│ ├── source.config.ts
│ ├── src
│ │ ├── app
│ │ │ ├── (home)
│ │ │ │ ├── [[...slug]]
│ │ │ │ │ └── page.tsx
│ │ │ │ └── layout.tsx
│ │ │ ├── api
│ │ │ │ ├── posthog
│ │ │ │ │ └── [...path]
│ │ │ │ │ └── route.ts
│ │ │ │ └── search
│ │ │ │ └── route.ts
│ │ │ ├── favicon.ico
│ │ │ ├── global.css
│ │ │ ├── layout.config.tsx
│ │ │ ├── layout.tsx
│ │ │ ├── llms.mdx
│ │ │ │ └── [[...slug]]
│ │ │ │ └── route.ts
│ │ │ ├── llms.txt
│ │ │ │ └── route.ts
│ │ │ ├── robots.ts
│ │ │ └── sitemap.ts
│ │ ├── assets
│ │ │ ├── discord-black.svg
│ │ │ ├── discord-white.svg
│ │ │ ├── logo-black.svg
│ │ │ └── logo-white.svg
│ │ ├── components
│ │ │ ├── analytics-tracker.tsx
│ │ │ ├── cookie-consent.tsx
│ │ │ ├── doc-actions-menu.tsx
│ │ │ ├── editable-code-block.tsx
│ │ │ ├── footer.tsx
│ │ │ ├── hero.tsx
│ │ │ ├── iou.tsx
│ │ │ ├── mermaid.tsx
│ │ │ └── page-feedback.tsx
│ │ ├── lib
│ │ │ ├── llms.ts
│ │ │ └── source.ts
│ │ ├── mdx-components.tsx
│ │ └── providers
│ │ └── posthog-provider.tsx
│ └── tsconfig.json
├── examples
│ ├── agent_examples.py
│ ├── agent_ui_examples.py
│ ├── browser_tool_example.py
│ ├── cloud_api_examples.py
│ ├── computer_examples_windows.py
│ ├── computer_examples.py
│ ├── computer_ui_examples.py
│ ├── computer-example-ts
│ │ ├── .env.example
│ │ ├── .gitignore
│ │ ├── package-lock.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── README.md
│ │ ├── src
│ │ │ ├── helpers.ts
│ │ │ └── index.ts
│ │ └── tsconfig.json
│ ├── docker_examples.py
│ ├── evals
│ │ ├── hud_eval_examples.py
│ │ └── wikipedia_most_linked.txt
│ ├── pylume_examples.py
│ ├── sandboxed_functions_examples.py
│ ├── som_examples.py
│ ├── tracing_examples.py
│ ├── utils.py
│ └── winsandbox_example.py
├── img
│ ├── agent_gradio_ui.png
│ ├── agent.png
│ ├── cli.png
│ ├── computer.png
│ ├── logo_black.png
│ └── logo_white.png
├── libs
│ ├── kasm
│ │ ├── Dockerfile
│ │ ├── LICENSE
│ │ ├── README.md
│ │ └── src
│ │ └── ubuntu
│ │ └── install
│ │ └── firefox
│ │ ├── custom_startup.sh
│ │ ├── firefox.desktop
│ │ └── install_firefox.sh
│ ├── lume
│ │ ├── .cursorignore
│ │ ├── CONTRIBUTING.md
│ │ ├── Development.md
│ │ ├── img
│ │ │ └── cli.png
│ │ ├── Package.resolved
│ │ ├── Package.swift
│ │ ├── README.md
│ │ ├── resources
│ │ │ └── lume.entitlements
│ │ ├── scripts
│ │ │ ├── build
│ │ │ │ ├── build-debug.sh
│ │ │ │ ├── build-release-notarized.sh
│ │ │ │ └── build-release.sh
│ │ │ └── install.sh
│ │ ├── src
│ │ │ ├── Commands
│ │ │ │ ├── Clone.swift
│ │ │ │ ├── Config.swift
│ │ │ │ ├── Create.swift
│ │ │ │ ├── Delete.swift
│ │ │ │ ├── Get.swift
│ │ │ │ ├── Images.swift
│ │ │ │ ├── IPSW.swift
│ │ │ │ ├── List.swift
│ │ │ │ ├── Logs.swift
│ │ │ │ ├── Options
│ │ │ │ │ └── FormatOption.swift
│ │ │ │ ├── Prune.swift
│ │ │ │ ├── Pull.swift
│ │ │ │ ├── Push.swift
│ │ │ │ ├── Run.swift
│ │ │ │ ├── Serve.swift
│ │ │ │ ├── Set.swift
│ │ │ │ └── Stop.swift
│ │ │ ├── ContainerRegistry
│ │ │ │ ├── ImageContainerRegistry.swift
│ │ │ │ ├── ImageList.swift
│ │ │ │ └── ImagesPrinter.swift
│ │ │ ├── Errors
│ │ │ │ └── Errors.swift
│ │ │ ├── FileSystem
│ │ │ │ ├── Home.swift
│ │ │ │ ├── Settings.swift
│ │ │ │ ├── VMConfig.swift
│ │ │ │ ├── VMDirectory.swift
│ │ │ │ └── VMLocation.swift
│ │ │ ├── LumeController.swift
│ │ │ ├── Main.swift
│ │ │ ├── Server
│ │ │ │ ├── Handlers.swift
│ │ │ │ ├── HTTP.swift
│ │ │ │ ├── Requests.swift
│ │ │ │ ├── Responses.swift
│ │ │ │ └── Server.swift
│ │ │ ├── Utils
│ │ │ │ ├── CommandRegistry.swift
│ │ │ │ ├── CommandUtils.swift
│ │ │ │ ├── Logger.swift
│ │ │ │ ├── NetworkUtils.swift
│ │ │ │ ├── Path.swift
│ │ │ │ ├── ProcessRunner.swift
│ │ │ │ ├── ProgressLogger.swift
│ │ │ │ ├── String.swift
│ │ │ │ └── Utils.swift
│ │ │ ├── Virtualization
│ │ │ │ ├── DarwinImageLoader.swift
│ │ │ │ ├── DHCPLeaseParser.swift
│ │ │ │ ├── ImageLoaderFactory.swift
│ │ │ │ └── VMVirtualizationService.swift
│ │ │ ├── VM
│ │ │ │ ├── DarwinVM.swift
│ │ │ │ ├── LinuxVM.swift
│ │ │ │ ├── VM.swift
│ │ │ │ ├── VMDetails.swift
│ │ │ │ ├── VMDetailsPrinter.swift
│ │ │ │ ├── VMDisplayResolution.swift
│ │ │ │ └── VMFactory.swift
│ │ │ └── VNC
│ │ │ ├── PassphraseGenerator.swift
│ │ │ └── VNCService.swift
│ │ └── tests
│ │ ├── Mocks
│ │ │ ├── MockVM.swift
│ │ │ ├── MockVMVirtualizationService.swift
│ │ │ └── MockVNCService.swift
│ │ ├── VM
│ │ │ └── VMDetailsPrinterTests.swift
│ │ ├── VMTests.swift
│ │ ├── VMVirtualizationServiceTests.swift
│ │ └── VNCServiceTests.swift
│ ├── lumier
│ │ ├── .dockerignore
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── bin
│ │ │ └── entry.sh
│ │ ├── config
│ │ │ └── constants.sh
│ │ ├── hooks
│ │ │ └── on-logon.sh
│ │ └── lib
│ │ ├── utils.sh
│ │ └── vm.sh
│ ├── python
│ │ ├── agent
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── agent
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── adapters
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cua_adapter.py
│ │ │ │ │ ├── huggingfacelocal_adapter.py
│ │ │ │ │ ├── human_adapter.py
│ │ │ │ │ ├── mlxvlm_adapter.py
│ │ │ │ │ └── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ └── qwen2_5_vl.py
│ │ │ │ ├── agent.py
│ │ │ │ ├── callbacks
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── budget_manager.py
│ │ │ │ │ ├── image_retention.py
│ │ │ │ │ ├── logging.py
│ │ │ │ │ ├── operator_validator.py
│ │ │ │ │ ├── pii_anonymization.py
│ │ │ │ │ ├── prompt_instructions.py
│ │ │ │ │ ├── telemetry.py
│ │ │ │ │ └── trajectory_saver.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── computers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cua.py
│ │ │ │ │ └── custom.py
│ │ │ │ ├── decorators.py
│ │ │ │ ├── human_tool
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ ├── server.py
│ │ │ │ │ └── ui.py
│ │ │ │ ├── integrations
│ │ │ │ │ └── hud
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── agent.py
│ │ │ │ │ └── proxy.py
│ │ │ │ ├── loops
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── anthropic.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── composed_grounded.py
│ │ │ │ │ ├── gelato.py
│ │ │ │ │ ├── gemini.py
│ │ │ │ │ ├── generic_vlm.py
│ │ │ │ │ ├── glm45v.py
│ │ │ │ │ ├── gta1.py
│ │ │ │ │ ├── holo.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── model_types.csv
│ │ │ │ │ ├── moondream3.py
│ │ │ │ │ ├── omniparser.py
│ │ │ │ │ ├── openai.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ ├── uiins.py
│ │ │ │ │ ├── uitars.py
│ │ │ │ │ └── uitars2.py
│ │ │ │ ├── proxy
│ │ │ │ │ ├── examples.py
│ │ │ │ │ └── handlers.py
│ │ │ │ ├── responses.py
│ │ │ │ ├── tools
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── browser_tool.py
│ │ │ │ ├── types.py
│ │ │ │ └── ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ └── gradio
│ │ │ │ ├── __init__.py
│ │ │ │ ├── app.py
│ │ │ │ └── ui_components.py
│ │ │ ├── benchmarks
│ │ │ │ ├── .gitignore
│ │ │ │ ├── contrib.md
│ │ │ │ ├── interactive.py
│ │ │ │ ├── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ └── gta1.py
│ │ │ │ ├── README.md
│ │ │ │ ├── ss-pro.py
│ │ │ │ ├── ss-v2.py
│ │ │ │ └── utils.py
│ │ │ ├── example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_computer_agent.py
│ │ ├── bench-ui
│ │ │ ├── bench_ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── api.py
│ │ │ │ └── child.py
│ │ │ ├── examples
│ │ │ │ ├── folder_example.py
│ │ │ │ ├── gui
│ │ │ │ │ ├── index.html
│ │ │ │ │ ├── logo.svg
│ │ │ │ │ └── styles.css
│ │ │ │ ├── output_overlay.png
│ │ │ │ └── simple_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ └── test_port_detection.py
│ │ ├── computer
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer
│ │ │ │ ├── __init__.py
│ │ │ │ ├── computer.py
│ │ │ │ ├── diorama_computer.py
│ │ │ │ ├── helpers.py
│ │ │ │ ├── interface
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ ├── models.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── logger.py
│ │ │ │ ├── models.py
│ │ │ │ ├── providers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cloud
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── docker
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── lume
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── lume_api.py
│ │ │ │ │ ├── lumier
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── types.py
│ │ │ │ │ └── winsandbox
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── provider.py
│ │ │ │ │ └── setup_script.ps1
│ │ │ │ ├── tracing_wrapper.py
│ │ │ │ ├── tracing.py
│ │ │ │ ├── ui
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ └── gradio
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── app.py
│ │ │ │ └── utils.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_computer.py
│ │ ├── computer-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── browser.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── diorama
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── diorama_computer.py
│ │ │ │ │ ├── diorama.py
│ │ │ │ │ ├── draw.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── safezone.py
│ │ │ │ ├── handlers
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── main.py
│ │ │ │ ├── server.py
│ │ │ │ ├── utils
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── wallpaper.py
│ │ │ │ └── watchdog.py
│ │ │ ├── examples
│ │ │ │ ├── __init__.py
│ │ │ │ └── usage_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ ├── run_server.py
│ │ │ ├── test_connection.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_server.py
│ │ ├── core
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── core
│ │ │ │ ├── __init__.py
│ │ │ │ └── telemetry
│ │ │ │ ├── __init__.py
│ │ │ │ └── posthog.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_telemetry.py
│ │ ├── mcp-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── build-extension.py
│ │ │ ├── CONCURRENT_SESSIONS.md
│ │ │ ├── desktop-extension
│ │ │ │ ├── cua-extension.mcpb
│ │ │ │ ├── desktop_extension.png
│ │ │ │ ├── manifest.json
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── run_server.sh
│ │ │ │ └── setup.py
│ │ │ ├── mcp_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── server.py
│ │ │ │ └── session_manager.py
│ │ │ ├── pdm.lock
│ │ │ ├── pyproject.toml
│ │ │ ├── QUICK_TEST_COMMANDS.sh
│ │ │ ├── quick_test_local_option.py
│ │ │ ├── README.md
│ │ │ ├── scripts
│ │ │ │ ├── install_mcp_server.sh
│ │ │ │ └── start_mcp_server.sh
│ │ │ ├── test_mcp_server_local_option.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_mcp_server.py
│ │ ├── pylume
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_pylume.py
│ │ └── som
│ │ ├── .bumpversion.cfg
│ │ ├── LICENSE
│ │ ├── poetry.toml
│ │ ├── pyproject.toml
│ │ ├── README.md
│ │ ├── som
│ │ │ ├── __init__.py
│ │ │ ├── detect.py
│ │ │ ├── detection.py
│ │ │ ├── models.py
│ │ │ ├── ocr.py
│ │ │ ├── util
│ │ │ │ └── utils.py
│ │ │ └── visualization.py
│ │ └── tests
│ │ ├── conftest.py
│ │ └── test_omniparser.py
│ ├── qemu-docker
│ │ ├── linux
│ │ │ ├── Dockerfile
│ │ │ ├── README.md
│ │ │ └── src
│ │ │ ├── entry.sh
│ │ │ └── vm
│ │ │ ├── image
│ │ │ │ └── README.md
│ │ │ └── setup
│ │ │ ├── install.sh
│ │ │ ├── setup-cua-server.sh
│ │ │ └── setup.sh
│ │ ├── README.md
│ │ └── windows
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── entry.sh
│ │ └── vm
│ │ ├── image
│ │ │ └── README.md
│ │ └── setup
│ │ ├── install.bat
│ │ ├── on-logon.ps1
│ │ ├── setup-cua-server.ps1
│ │ ├── setup-utils.psm1
│ │ └── setup.ps1
│ ├── typescript
│ │ ├── .gitignore
│ │ ├── .nvmrc
│ │ ├── agent
│ │ │ ├── examples
│ │ │ │ ├── playground-example.html
│ │ │ │ └── README.md
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── client.ts
│ │ │ │ ├── index.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ └── client.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── computer
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── computer
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── providers
│ │ │ │ │ │ ├── base.ts
│ │ │ │ │ │ ├── cloud.ts
│ │ │ │ │ │ └── index.ts
│ │ │ │ │ └── types.ts
│ │ │ │ ├── index.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── base.ts
│ │ │ │ │ ├── factory.ts
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── linux.ts
│ │ │ │ │ ├── macos.ts
│ │ │ │ │ └── windows.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ ├── computer
│ │ │ │ │ └── cloud.test.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── factory.test.ts
│ │ │ │ │ ├── index.test.ts
│ │ │ │ │ ├── linux.test.ts
│ │ │ │ │ ├── macos.test.ts
│ │ │ │ │ └── windows.test.ts
│ │ │ │ └── setup.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── core
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── index.ts
│ │ │ │ └── telemetry
│ │ │ │ ├── clients
│ │ │ │ │ ├── index.ts
│ │ │ │ │ └── posthog.ts
│ │ │ │ └── index.ts
│ │ │ ├── tests
│ │ │ │ └── telemetry.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── cua-cli
│ │ │ ├── .gitignore
│ │ │ ├── .prettierrc
│ │ │ ├── bun.lock
│ │ │ ├── CLAUDE.md
│ │ │ ├── index.ts
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── auth.ts
│ │ │ │ ├── cli.ts
│ │ │ │ ├── commands
│ │ │ │ │ ├── auth.ts
│ │ │ │ │ └── sandbox.ts
│ │ │ │ ├── config.ts
│ │ │ │ ├── http.ts
│ │ │ │ ├── storage.ts
│ │ │ │ └── util.ts
│ │ │ └── tsconfig.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── pnpm-workspace.yaml
│ │ └── README.md
│ └── xfce
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Development.md
│ ├── Dockerfile
│ ├── Dockerfile.dev
│ ├── README.md
│ └── src
│ ├── scripts
│ │ ├── resize-display.sh
│ │ ├── start-computer-server.sh
│ │ ├── start-novnc.sh
│ │ ├── start-vnc.sh
│ │ └── xstartup.sh
│ ├── supervisor
│ │ └── supervisord.conf
│ └── xfce-config
│ ├── helpers.rc
│ ├── xfce4-power-manager.xml
│ └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│ ├── agent_nb.ipynb
│ ├── blog
│ │ ├── build-your-own-operator-on-macos-1.ipynb
│ │ └── build-your-own-operator-on-macos-2.ipynb
│ ├── composite_agents_docker_nb.ipynb
│ ├── computer_nb.ipynb
│ ├── computer_server_nb.ipynb
│ ├── customizing_computeragent.ipynb
│ ├── eval_osworld.ipynb
│ ├── ollama_nb.ipynb
│ ├── README.md
│ ├── sota_hackathon_cloud.ipynb
│ └── sota_hackathon.ipynb
├── package-lock.json
├── package.json
├── pnpm-lock.yaml
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── scripts
│ ├── install-cli.ps1
│ ├── install-cli.sh
│ ├── playground-docker.sh
│ ├── playground.sh
│ ├── run-docker-dev.sh
│ └── typescript-typecheck.js
├── TESTING.md
├── tests
│ ├── agent_loop_testing
│ │ ├── agent_test.py
│ │ └── README.md
│ ├── pytest.ini
│ ├── shell_cmd.py
│ ├── test_files.py
│ ├── test_mcp_server_session_management.py
│ ├── test_mcp_server_streaming.py
│ ├── test_shell_bash.py
│ ├── test_telemetry.py
│ ├── test_tracing.py
│ ├── test_venv.py
│ └── test_watchdog.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/docs/content/docs/macos-vm-cli-playbook/lumier/docker.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Docker
---
You can use Lumier through Docker:
### Run a macOS VM (ephemeral)
```bash
# Run the container with temporary storage (using pre-built image from Docker Hub)
docker run -it --rm \
--name macos-vm \
-p 8006:8006 \
-e VM_NAME=macos-vm \
-e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \
-e CPU_CORES=4 \
-e RAM_SIZE=8192 \
trycua/lumier:latest
```
After running the command above, you can access your macOS VM in a web browser at **http://localhost:8006**.
<Callout title="Note">
With the basic setup above, your VM will be reset when you stop the container (ephemeral mode).
This means any changes you make inside the macOS VM will be lost. See the section below for how to
save your VM state.
</Callout>
## Saving Your VM State
To save your VM state between sessions (so your changes persist when you stop and restart the container), you'll need to set up a storage location:
```bash
# First, create a storage directory if it doesn't exist
mkdir -p storage
# Then run the container with persistent storage
docker run -it --rm \
--name lumier-vm \
-p 8006:8006 \
-v $(pwd)/storage:/storage \
-e VM_NAME=lumier-vm \
-e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \
-e CPU_CORES=4 \
-e RAM_SIZE=8192 \
-e HOST_STORAGE_PATH=$(pwd)/storage \
trycua/lumier:latest
```
This command creates a connection between a folder on your Mac (`$(pwd)/storage`) and a folder inside the Docker container (`/storage`). The `-v` flag (volume mount) and the `HOST_STORAGE_PATH` variable work together to ensure your VM data is saved on your host Mac.
## Sharing Files with Your VM
To share files between your Mac and the virtual machine, you can set up a shared folder:
```bash
# Create both storage and shared folders
mkdir -p storage shared
# Run with both persistent storage and a shared folder
docker run -it --rm \
--name lumier-vm \
-p 8006:8006 \
-v $(pwd)/storage:/storage \
-v $(pwd)/shared:/shared \
-e VM_NAME=lumier-vm \
-e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \
-e CPU_CORES=4 \
-e RAM_SIZE=8192 \
-e HOST_STORAGE_PATH=$(pwd)/storage \
-e HOST_SHARED_PATH=$(pwd)/shared \
trycua/lumier:latest
```
With this setup, any files you place in the `shared` folder on your Mac will be accessible from within the macOS VM, and vice versa.
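For example, a file created on the host side shows up inside the VM under the default shared mount (`/Volumes/My Shared Files`, described in the next section):
```bash
# On your Mac
echo "hello from the host" > shared/notes.txt
# Inside the macOS VM
cat "/Volumes/My Shared Files/notes.txt"
```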
## Automating VM Startup with on-logon.sh
You can automatically run scripts when the VM starts up by placing an `on-logon.sh` script in the shared folder's lifecycle directory. This is useful for setting up your VM environment each time it starts.
```bash
# Create the lifecycle directory in your shared folder
mkdir -p shared/lifecycle
# Create a sample on-logon.sh script
cat > shared/lifecycle/on-logon.sh << 'EOF'
#!/usr/bin/env bash
# Create a file on the desktop
echo "Hello from Lumier!" > /Users/lume/Desktop/hello_lume.txt
# You can add more commands to execute at VM startup
# For example:
# - Configure environment variables
# - Start applications
# - Mount network drives
# - Set up development environments
EOF
# Make the script executable
chmod +x shared/lifecycle/on-logon.sh
```
The script will be automatically executed when the VM starts up. It runs in the VM context and has access to:
- The `/Users/lume` user directory (home directory in the VM)
- The shared folder at `/Volumes/My Shared Files` inside the VM
- Any resources available to the VM
This feature enables automation of VM setup without modifying the base VM image.
## Configuration Options
When running Lumier, you'll need to configure a few things:
- **Port forwarding** (`-p 8006:8006`): Makes the VM's VNC interface accessible in your browser. If port 8006 is already in use, you can use a different port like `-p 8007:8006` (see the example after this list).
- **Environment variables** (`-e`): Configure your VM settings:
- `VM_NAME`: A name for your virtual machine
- `VERSION`: The macOS image to use
- `CPU_CORES`: Number of CPU cores to allocate
- `RAM_SIZE`: Memory in MB to allocate
- `HOST_STORAGE_PATH`: Path to save VM state (when using persistent storage)
- `HOST_SHARED_PATH`: Path to the shared folder (optional)
- **Background service**: The `lume serve` service should be running on your host (it starts automatically when you install Lume with the `install.sh` installer script).
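For instance, if port 8006 is already taken on your host, map a different host port; only the host side of the mapping changes (a variation of the ephemeral command above):
```bash
docker run -it --rm \
--name macos-vm \
-p 8007:8006 \
-e VM_NAME=macos-vm \
-e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \
-e CPU_CORES=4 \
-e RAM_SIZE=8192 \
trycua/lumier:latest
```
The VM is then available at **http://localhost:8007**.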
```
--------------------------------------------------------------------------------
/libs/typescript/agent/src/types.ts:
--------------------------------------------------------------------------------
```typescript
// #region Request
export type ConnectionType = 'http' | 'https' | 'peer';
export interface AgentClientOptions {
timeout?: number;
retries?: number;
/** Optional CUA API key to send as X-API-Key header for HTTP requests */
apiKey?: string;
}
// Request types matching the Python proxy API
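// Example (illustrative only): a minimal object satisfying AgentRequest.
//   const req: AgentRequest = {
//     model: 'anthropic/claude-sonnet-4-5-20250929',
//     input: 'Take a screenshot and describe what you see',
//   };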
export interface AgentRequest {
model: string;
input: string | AgentMessage[];
agent_kwargs?: {
save_trajectory?: boolean;
verbosity?: number;
[key: string]: any;
};
computer_kwargs?: {
os_type?: string;
provider_type?: string;
[key: string]: any;
};
/**
* Optional per-request environment variable overrides.
* Keys and values are strings and will be forwarded to the backend proxy.
*/
env?: Record<string, string>;
}
// #endregion
// #region Response
// Response types
export interface AgentResponse {
output: AgentMessage[];
usage: Usage;
status: 'completed' | 'failed';
error?: string;
}
// Usage information
export interface Usage {
prompt_tokens: number;
completion_tokens: number;
total_tokens: number;
response_cost: number;
}
// #endregion
// #region Messages
// Agent message types - can be one of several different message types
export type AgentMessage =
| UserMessage
| AssistantMessage
| ReasoningMessage
| ComputerCallMessage
| ComputerCallOutputMessage
| FunctionCallMessage
| FunctionCallOutputMessage;
// Input message
export interface UserMessage {
type?: 'message';
role: 'user' | 'system' | 'developer';
content: string | InputContent[];
}
// Output message
export interface AssistantMessage {
type: 'message';
role: 'assistant';
content: OutputContent[];
}
// Output reasoning/thinking message
export interface ReasoningMessage {
type: 'reasoning';
summary: SummaryContent[];
}
// Output computer action call
export interface ComputerCallMessage {
type: 'computer_call';
call_id: string;
status: 'completed' | 'failed' | 'pending';
action: ComputerAction;
}
// Output computer action result (always a screenshot)
export interface ComputerCallOutputMessage {
type: 'computer_call_output';
call_id: string;
output: ComputerResultContent;
}
// Output function call
export interface FunctionCallMessage {
type: 'function_call';
call_id: string;
status: 'completed' | 'failed' | 'pending';
name: string;
arguments: string; // JSON dict of kwargs
}
// Output function call result (always text)
export interface FunctionCallOutputMessage {
type: 'function_call_output';
call_id: string;
output: string;
}
// #endregion
// #region Message Content
export interface InputContent {
type: 'input_image' | 'input_text';
text?: string;
image_url?: string;
}
export interface OutputContent {
type: 'output_text';
text: string;
}
export interface SummaryContent {
type: 'summary_text';
text: string;
}
export interface ComputerResultContent {
type: 'computer_screenshot' | 'input_image';
image_url: string;
}
// #endregion
// #region Actions
export type ComputerAction = ComputerActionOpenAI | ComputerActionAnthropic;
// OpenAI Computer Actions
export type ComputerActionOpenAI =
| ClickAction
| DoubleClickAction
| DragAction
| KeyPressAction
| MoveAction
| ScreenshotAction
| ScrollAction
| TypeAction
| WaitAction;
export interface ClickAction {
type: 'click';
button: 'left' | 'right' | 'wheel' | 'back' | 'forward';
x: number;
y: number;
}
export interface DoubleClickAction {
type: 'double_click';
button?: 'left' | 'right' | 'wheel' | 'back' | 'forward';
x: number;
y: number;
}
export interface DragAction {
type: 'drag';
button?: 'left' | 'right' | 'wheel' | 'back' | 'forward';
path: Array<[number, number]>;
}
export interface KeyPressAction {
type: 'keypress';
keys: string[];
}
export interface MoveAction {
type: 'move';
x: number;
y: number;
}
export interface ScreenshotAction {
type: 'screenshot';
}
export interface ScrollAction {
type: 'scroll';
scroll_x: number;
scroll_y: number;
x: number;
y: number;
}
export interface TypeAction {
type: 'type';
text: string;
}
export interface WaitAction {
type: 'wait';
}
// Anthropic Computer Actions
export type ComputerActionAnthropic = LeftMouseDownAction | LeftMouseUpAction;
export interface LeftMouseDownAction {
type: 'left_mouse_down';
x: number;
y: number;
}
export interface LeftMouseUpAction {
type: 'left_mouse_up';
x: number;
y: number;
}
// #endregion
```
--------------------------------------------------------------------------------
/libs/python/agent/example.py:
--------------------------------------------------------------------------------
```python
"""
Example usage of the agent library with docstring-based tool definitions.
"""
import asyncio
import logging
from agent import ComputerAgent
from computer import Computer
from computer.helpers import sandboxed
@sandboxed()
def read_file(location: str) -> str:
"""Read contents of a file
Parameters
----------
location : str
Path to the file to read
Returns
-------
str
Contents of the file or error message
"""
try:
with open(location, "r") as f:
return f.read()
except Exception as e:
return f"Error reading file: {str(e)}"
def save_note(content: str, filename: str = "note.txt") -> str:
"""Save content to a note file
Parameters
----------
content : str
Content to save to the file
filename : str, optional
Name of the file to save to (default is "note.txt")
Returns
-------
str
Success or error message
"""
try:
with open(filename, "w") as f:
f.write(content)
return f"Saved note to {filename}"
except Exception as e:
return f"Error saving note: {str(e)}"
def calculate(a: int, b: int) -> int:
"""Calculate the sum of two integers
Parameters
----------
a : int
First integer
b : int
Second integer
Returns
-------
int
Sum of the two integers
"""
return a + b
async def main():
"""Example usage of ComputerAgent with different models"""
# Example 1: Using Claude with computer and custom tools
print("=== Example 1: Claude with Computer ===")
import json
import os
import dotenv
dotenv.load_dotenv()
assert os.getenv("CUA_CONTAINER_NAME") is not None, "CUA_CONTAINER_NAME is not set"
assert os.getenv("CUA_API_KEY") is not None, "CUA_API_KEY is not set"
async with Computer(
os_type="linux",
provider_type="cloud",
name=os.getenv("CUA_CONTAINER_NAME") or "",
api_key=os.getenv("CUA_API_KEY") or "",
) as computer:
agent = ComputerAgent(
# Supported models:
# == OpenAI CUA (computer-use-preview) ==
model="openai/computer-use-preview",
# == Anthropic CUA (Claude > 3.5) ==
# model="anthropic/claude-opus-4-20250514",
# model="anthropic/claude-sonnet-4-20250514",
# model="anthropic/claude-3-7-sonnet-20250219",
# model="anthropic/claude-sonnet-4-5-20250929",
# == UI-TARS ==
# model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
# TODO: add local mlx provider
# model="mlx-community/UI-TARS-1.5-7B-6bit",
# model="ollama_chat/0000/ui-tars-1.5-7b",
# == Omniparser + Any LLM ==
# model="omniparser+..."
# model="omniparser+anthropic/claude-opus-4-20250514",
tools=[computer],
only_n_most_recent_images=3,
verbosity=logging.INFO,
trajectory_dir="trajectories",
use_prompt_caching=True,
max_trajectory_budget={
"max_budget": 1.0,
"raise_error": True,
"reset_after_each_run": False,
},
)
history = []
while True:
user_input = input("> ")
history.append({"role": "user", "content": user_input})
# Non-streaming usage
async for result in agent.run(history, stream=False):
history += result["output"]
# # Print output
# for item in result["output"]:
# if item["type"] == "message":
# print(item["content"][0]["text"])
# elif item["type"] == "computer_call":
# action = item["action"]
# action_type = action["type"]
# action_args = {k: v for k, v in action.items() if k != "type"}
# print(f"{action_type}({action_args})")
# elif item["type"] == "function_call":
# action = item["name"]
# action_args = item["arguments"]
# print(f"{action}({action_args})")
# elif item["type"] == "function_call_output":
# print("===>", item["output"])
if __name__ == "__main__":
asyncio.run(main())
```
--------------------------------------------------------------------------------
/libs/python/agent/benchmarks/contrib.md:
--------------------------------------------------------------------------------
```markdown
# Contributing Reference Agent Implementations
This guide explains how to add your own reference agent implementations to the benchmark system.
## Adding Reference Agent Implementations
### 1. Implement the ModelProtocol
Create a new file in `models/` directory implementing the `ModelProtocol`:
```python
from models.base import ModelProtocol
from typing import Optional, Tuple
from PIL import Image
class YourModelName(ModelProtocol):
def __init__(self, model_path: str):
self.model_path = model_path
self._model = None
@property
def model_name(self) -> str:
return self.model_path
async def load_model(self) -> None:
"""Load the model into memory."""
# Your model loading logic here
pass
async def unload_model(self) -> None:
"""Unload the model from memory."""
# Your model cleanup logic here
pass
async def predict_click(self, image: Image.Image, instruction: str) -> Optional[Tuple[int, int]]:
"""
Predict click coordinates for the given image and instruction.
Args:
image: PIL Image to analyze
instruction: Text instruction describing what to click
Returns:
Tuple of (x, y) coordinates or None if prediction fails
"""
# Your prediction logic here
return (x, y) # Return predicted coordinates
```
### 2. Register Your Model
Add your model to the `get_available_models()` function in `utils.py`:
```python
def get_available_models() -> List[Union[str, ModelProtocol]]:
models = [
# Computer Agent SDK providers
"huggingface-local/HelloKKMe/GTA1-7B",
# Reference implementations
GTA1Model("HelloKKMe/GTA1-7B"),
YourModelName("path/to/your/model"), # Add your model here
]
return models
```
### 3. Test Your Implementation
Before submitting, test your model with the interactive tool:
```bash
python interactive.py
```
This will help you verify that your model loads correctly and produces reasonable predictions.
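You can also exercise the protocol directly without the interactive tool. A minimal sketch, assuming a local `screenshot.png` and the `YourModelName` class from step 1 (the module path is hypothetical):
```python
import asyncio
from PIL import Image
from models.your_model import YourModelName  # hypothetical module path

async def smoke_test():
    model = YourModelName("path/to/your/model")
    await model.load_model()
    coords = await model.predict_click(
        Image.open("screenshot.png"), "click the OK button"
    )
    print(f"Predicted coordinates: {coords}")
    await model.unload_model()

asyncio.run(smoke_test())
```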
## Example: Adding a New Model
Here's a complete example of adding a hypothetical "MyVisionModel":
1. **Create `models/my_vision_model.py`:**
```python
import torch
from transformers import AutoModel, AutoProcessor
from models.base import ModelProtocol
from typing import Optional, Tuple
from PIL import Image
class MyVisionModel(ModelProtocol):
def __init__(self, model_path: str):
self.model_path = model_path
self.model = None
self.processor = None
@property
def model_name(self) -> str:
return f"MyVisionModel({self.model_path})"
async def load_model(self) -> None:
"""Load the model and processor."""
self.processor = AutoProcessor.from_pretrained(self.model_path)
self.model = AutoModel.from_pretrained(
self.model_path,
torch_dtype=torch.float16,
device_map="auto"
)
async def unload_model(self) -> None:
"""Clean up model resources."""
del self.model
del self.processor
self.model = None
self.processor = None
torch.cuda.empty_cache()
async def predict_click(self, image: Image.Image, instruction: str) -> Optional[Tuple[int, int]]:
"""Predict click coordinates."""
try:
# Preprocess inputs
inputs = self.processor(
text=instruction,
images=image,
return_tensors="pt"
)
# Run inference
with torch.no_grad():
outputs = self.model(**inputs)
# Extract coordinates (model-specific logic)
x, y = self._extract_coordinates(outputs)
return (int(x), int(y))
except Exception as e:
print(f"Prediction failed: {e}")
return None
def _extract_coordinates(self, outputs):
"""Extract x, y coordinates from model outputs."""
# Your model-specific coordinate extraction logic
pass
```
2. **Update `models/__init__.py`:**
```python
from .gta1 import GTA1Model
from .my_vision_model import MyVisionModel
__all__ = ["GTA1Model", "MyVisionModel"]
```
3. **Update `utils.py`:**
```python
from models import GTA1Model, MyVisionModel
def get_available_models() -> List[Union[str, ModelProtocol]]:
models = [
"huggingface-local/HelloKKMe/GTA1-7B",
GTA1Model("HelloKKMe/GTA1-7B"),
MyVisionModel("my-org/my-vision-model"), # Add here
]
return models
```
```
--------------------------------------------------------------------------------
/docs/src/components/doc-actions-menu.tsx:
--------------------------------------------------------------------------------
```typescript
'use client';
import { useState } from 'react';
import { SiOpenai, SiAnthropic, SiMarkdown, SiGithub } from 'react-icons/si';
import posthog from 'posthog-js';
interface DocActionsMenuProps {
pageUrl: string;
pageTitle: string;
filePath?: string;
}
export function DocActionsMenu({ pageUrl, pageTitle, filePath }: DocActionsMenuProps) {
const [copied, setCopied] = useState(false);
const handleCopyMarkdown = async () => {
try {
if (!filePath) {
throw new Error('No file path available');
}
const githubRawUrl = `https://raw.githubusercontent.com/trycua/cua/refs/heads/main/docs/content/docs/${filePath}`;
const response = await fetch(githubRawUrl);
if (!response.ok) {
throw new Error('Failed to fetch markdown');
}
const markdown = await response.text();
await navigator.clipboard.writeText(markdown);
setCopied(true);
setTimeout(() => setCopied(false), 2000);
posthog.capture('docs_copy_markdown_clicked', {
page: pageUrl,
page_title: pageTitle,
success: true,
});
} catch (error) {
console.error('Error copying markdown:', error);
try {
const urlWithUtm = `https://cua.ai${pageUrl}?utm_source=cua.ai/docs`;
await navigator.clipboard.writeText(urlWithUtm);
setCopied(true);
setTimeout(() => setCopied(false), 2000);
} catch (fallbackError) {
console.error('Error copying URL:', fallbackError);
}
posthog.capture('docs_copy_markdown_clicked', {
page: pageUrl,
page_title: pageTitle,
success: false,
error: error instanceof Error ? error.message : 'Unknown error',
});
}
};
const handleEditGithub = () => {
if (!filePath) {
return;
}
posthog.capture('docs_edit_github_clicked', {
page: pageUrl,
page_title: pageTitle,
});
const githubEditUrl = `https://github.com/trycua/cua/edit/main/docs/content/docs/${filePath}`;
window.open(githubEditUrl, '_blank', 'noopener,noreferrer');
};
const handleOpenChatGPT = () => {
posthog.capture('docs_open_chatgpt_clicked', {
page: pageUrl,
page_title: pageTitle,
});
const docUrl = `https://cua.ai${pageUrl}?utm_source=cua.ai/docs`;
const prompt = `I need help understanding this cua.ai documentation page: "${pageTitle}". Please read and help me with: ${docUrl}`;
const chatgptUrl = `https://chatgpt.com/?q=${encodeURIComponent(prompt)}`;
window.open(chatgptUrl, '_blank', 'noopener,noreferrer');
};
const handleOpenClaude = () => {
posthog.capture('docs_open_claude_clicked', {
page: pageUrl,
page_title: pageTitle,
});
const docUrl = `https://cua.ai${pageUrl}?utm_source=cua.ai/docs`;
const prompt = `I need help understanding this cua.ai documentation page: "${pageTitle}". Please read and help me with: ${docUrl}`;
const claudeUrl = `https://claude.ai/new?q=${encodeURIComponent(prompt)}`;
window.open(claudeUrl, '_blank', 'noopener,noreferrer');
};
return (
<div className="flex flex-col gap-2">
<button
onClick={handleCopyMarkdown}
className="inline-flex gap-3 w-full items-center rounded-md p-1 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground text-left transition-colors px-2 hover:cursor-pointer"
>
<SiMarkdown className="w-4 h-4 flex-shrink-0" />
<span>{copied ? 'Copied!' : 'Copy as markdown'}</span>
</button>
<button
onClick={handleEditGithub}
className="inline-flex gap-3 w-full items-center rounded-md p-1 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground text-left transition-colors px-2 hover:cursor-pointer"
>
<SiGithub className="w-4 h-4 flex-shrink-0" />
<span>Edit on GitHub</span>
</button>
<button
onClick={handleOpenChatGPT}
className="inline-flex gap-3 w-full items-center rounded-md p-1 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground text-left transition-colors px-2 hover:cursor-pointer"
>
<SiOpenai className="w-4 h-4 flex-shrink-0" />
<span>Open in ChatGPT</span>
</button>
<button
onClick={handleOpenClaude}
className="inline-flex gap-3 w-full items-center rounded-md p-1 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground text-left transition-colors px-2 hover:cursor-pointer"
>
<SiAnthropic className="w-4 h-4 flex-shrink-0" />
<span>Open in Claude</span>
</button>
</div>
);
}
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Composed Agents
description: Combine grounding models with any LLM for computer-use capabilities
---
Composed agents combine the best of both worlds: specialized grounding models for precise click prediction and powerful LLMs for task planning and reasoning.
Use the format `"grounding_model+planning_model"` to create a composed agent with any vision-enabled LiteLLM-compatible model.
## How Composed Agents Work
1. **Planning Phase**: The planning model (LLM) analyzes the task and decides what actions to take (e.g., `click("find the login button")`, `type("username")`)
2. **Grounding Phase**: The grounding model converts element descriptions to precise coordinates
3. **Execution**: Actions are performed using the predicted coordinates
## Supported Grounding Models
Any model that supports `predict_click()` can be used as the grounding component. See the full list on [Grounding Models](./grounding-models).
- OpenCUA: `huggingface-local/xlangai/OpenCUA-{7B,32B}`
- GTA1 family: `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}`
- Holo 1.5 family: `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}`
- InternVL 3.5 family: `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}`
- UI‑TARS 1.5: `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` (also supports full CU)
- OmniParser (OCR): `omniparser` (requires combination with a LiteLLM vision model)
- Moondream3: `moondream3` (requires combination with a LiteLLM vision/text model)
## Supported Planning Models
Any vision-enabled LiteLLM-compatible model can be used as the planning component:
- Any All‑in‑one CUA (planning-capable). See [All‑in‑one CUAs](./computer-use-agents).
- Any VLM via LiteLLM providers: `anthropic/*`, `openai/*`, `openrouter/*`, `gemini/*`, `vertex_ai/*`, `huggingface-local/*`, `mlx/*`, etc.
- Examples:
- **Anthropic**: `anthropic/claude-sonnet-4-5-20250929`, `anthropic/claude-opus-4-1-20250805`
- **OpenAI**: `openai/gpt-5`, `openai/o3`, `openai/gpt-4o`
- **Google**: `gemini/gemini-1.5-pro`, `vertex_ai/gemini-pro-vision`
- **Local models**: Any Hugging Face vision-language model
## Usage Examples
### GTA1 + GPT-5
Use OpenAI's GPT-5 for planning with specialized grounding:
```python
agent = ComputerAgent(
"huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5",
tools=[computer]
)
async for _ in agent.run("Take a screenshot, analyze the UI, and click on the most prominent button"):
pass
```
### GTA1 + Claude Sonnet 4.5
Combine state-of-the-art grounding with powerful reasoning:
```python
agent = ComputerAgent(
"huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929",
tools=[computer]
)
async for _ in agent.run("Open Firefox, navigate to github.com, and search for 'computer-use'"):
pass
# Success! 🎉
# - Claude Sonnet 4.5 plans the sequence of actions
# - GTA1-7B provides precise click coordinates for each UI element
```
### UI-TARS + GPT-4o
Combine two different vision models for enhanced capabilities:
```python
agent = ComputerAgent(
"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B+openai/gpt-4o",
tools=[computer]
)
async for _ in agent.run("Help me fill out this form with my personal information"):
pass
```
### Moondream3 + GPT-4o
Use the built-in Moondream3 grounding with any planning model. Moondream3 will detect UI elements on the latest screenshot, label them, and provide a user message listing detected element names.
```python
from agent import ComputerAgent
from computer import Computer

computer = Computer()  # configure os_type/provider_type for your environment
agent = ComputerAgent(
"moondream3+openai/gpt-4o",
tools=[computer]
)
async for _ in agent.run("Close the settings window, then open the Downloads folder"):
pass
```
## Benefits of Composed Agents
- **Specialized Grounding**: Use models optimized for click prediction accuracy
- **Flexible Planning**: Choose any LLM for task reasoning and planning
- **Cost Optimization**: Use smaller grounding models with larger planning models only when needed
- **Performance**: Leverage the strengths of different model architectures
## Capabilities
Composed agents support both capabilities:
```python
agent = ComputerAgent("huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929")
# Full computer-use agent capabilities
async for _ in agent.run("Complete this online form"):
pass
# Direct click prediction (uses grounding model only)
coords = await agent.predict_click("find the submit button")
```
---
For more information on individual model capabilities, see [Computer-Use Agents](./computer-use-agents) and [Grounding Models](./grounding-models).
```
--------------------------------------------------------------------------------
/blog/composite-agents.md:
--------------------------------------------------------------------------------
```markdown
# Announcing Cua Agent framework 0.4 and Composite Agents
_Published on August 26, 2025 by Dillon DuPont_
<img src="./assets/composite-agents.png" alt="Composite Agents">
So you want to build an agent that can use a computer. Great! You've probably discovered that there are now dozens of different AI models that claim they can click GUI buttons and fill out forms. Less great: actually getting them to work together is like trying to coordinate a group project where everyone speaks a different language and has invented seventeen different ways to say "click here".
Here's the thing about new GUI models: they're all special snowflakes. One model wants you to feed it images and expects coordinates back as percentages from 0 to 1. Another wants absolute pixel coordinates. A third model has invented its own numeral system with `<|loc095|><|loc821|>` tokens inside tool calls. Some models output Python code that calls `pyautogui.click(x, y)`. Others will start hallucinating coordinates if you forget to format all previous messages within a very specific GUI system prompt.
This is the kind of problem that makes you wonder if we're building the future of computing or just recreating the Tower of Babel with more GPUs.
## What we fixed
Agent framework 0.4 solves this by doing something radical: making all these different models speak the same language.
Instead of writing separate code for each model's peculiarities, you now just pick a model with a string like `"anthropic/claude-sonnet-4-5-20250929"` or `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`, and everything else Just Works™. Behind the scenes, we handle all the coordinate normalization, token parsing, and image preprocessing so you don't have to.
```python
# This works the same whether you're using Anthropic, OpenAI, or that new model you found on Hugging Face
agent = ComputerAgent(
model="anthropic/claude-sonnet-4-5-20250929", # or any other supported model
tools=[computer]
)
```
The output format is consistent across all providers (OpenAI, Anthropic, Vertex, Hugging Face, OpenRouter, etc.). No more writing different parsers for each model's creative interpretation of how to represent a mouse click.
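Concretely, the run loop yields the same item shapes no matter which provider sits behind the model string. A minimal sketch (the item fields follow the agent output schema used across the SDK):
```python
async for result in agent.run("Star the trycua/cua repository"):
    for item in result["output"]:
        if item["type"] == "computer_call":
            # Normalized action dict, e.g. {"type": "click", "x": 512, "y": 384}
            print(item["action"])
        elif item["type"] == "message":
            print(item["content"][0]["text"])
```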
## Composite Agents: Two Brains Are Better Than One
Here's where it gets interesting. We realized that you don't actually need one model to be good at everything. Some models are excellent at understanding what's on the screen—they can reliably identify buttons and text fields and figure out where to click. Other models are great at planning and reasoning but might be a bit fuzzy on the exact pixel coordinates.
So we let you combine them with a `+` sign:
```python
agent = ComputerAgent(
# specify the grounding model first, then the planning model
model="huggingface-local/HelloKKMe/GTA1-7B+huggingface-local/OpenGVLab/InternVL3_5-8B",
tools=[computer]
)
```
This creates a composite agent where one model (the "grounding" model) handles the visual understanding and precise UI interactions, while the other (the "planning" model) handles the high-level reasoning and task orchestration. It's like having a pilot and a navigator, except they're both AI models and they're trying to help you star a GitHub repository.
You can even take a model that was never designed for computer use—like GPT-4o—and give it GUI capabilities by pairing it with a specialized vision model:
```python
agent = ComputerAgent(
model="huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-4o",
tools=[computer]
)
```
## Example notebook
For a full, ready-to-run demo (install deps, local computer using Docker, and a composed agent example), see the notebook:
- https://github.com/trycua/cua/blob/models/opencua/notebooks/composite_agents_docker_nb.ipynb
## What's next
We're building integration with HUD evals, allowing us to curate and benchmark model combinations. This will help us identify which composite agent pairs work best for different types of tasks, and provide you with tested recommendations rather than just throwing model names at the wall to see what sticks.
If you try out version 0.4.x, we'd love to hear how it goes. Join us on Discord to share your results and let us know what model combinations work best for your projects.
---
## Links
- **Composite Agent Docs:** [https://cua.ai/docs/agent-sdk/supported-agents/composed-agents](https://cua.ai/docs/agent-sdk/supported-agents/composed-agents)
- **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai)
Questions or weird edge cases? Ping us on Discord—we’re curious to see what you build.
```
--------------------------------------------------------------------------------
/blog/cloud-windows-ga-macos-preview.md:
--------------------------------------------------------------------------------
```markdown
# Cloud Windows Sandboxes GA + macOS Preview
If you've been building with our `cua` libraries, you might've hit a limitation with local computer-use sandboxes: to run agents on Windows or macOS, you need to be on that OS - Windows Sandbox for Windows, Apple Virtualization for macOS. The only cross-platform option is Linux on Docker, which limits you to virtualizing Linux environments ([see all local options here](https://cua.ai/docs/computer-sdk/computers)).
Today the story changes - we're announcing general availability of **Cloud Windows Sandboxes** and opening early preview access for **Cloud macOS Sandboxes**.
## Cloud Windows Sandboxes: Now GA

Cloud Windows Sandboxes are now generally available. You get a full Windows 11 desktop in your browser with Edge and Python pre-installed, working seamlessly with all our [Computer-Use libraries](https://github.com/trycua/cua) for RPA, UI automation, code execution, and agent development.
**What's new with this release:**
- Hot-start under 1 second
- Direct noVNC over HTTPS under our sandbox.cua.ai domain
- 3 sandbox sizes available:
| Size | CPU | RAM | Storage |
| ------ | ------- | ----- | ---------- |
| Small | 2 cores | 8 GB | 128 GB SSD |
| Medium | 4 cores | 16 GB | 128 GB SSD |
| Large | 8 cores | 32 GB | 256 GB SSD |
<div align="center">
<video src="https://github.com/user-attachments/assets/8ab07646-6018-4128-87ce-53180cfea696" width="600" controls></video>
</div>
**Pricing:** Windows Sandboxes start at 8 credits/hour (Small), 15 credits/hour (Medium), or 31 credits/hour (Large).
## Cloud macOS Sandboxes: Now in Preview
Running macOS locally comes with challenges: 30GB golden images, a maximum of 2 sandboxes per host, and unpredictable compatibility issues. With Cloud macOS Sandboxes, we provision bare-metal macOS hosts (M1, M2, M4) on-demand—giving you full desktop access without the overhead of managing local sandboxes.

**Preview access:** Invite-only. [Join the waitlist](https://cua.ai/macos-waitlist) if you're building agents for macOS workflows.
## Getting Started Today
Sign up at [cua.ai/signin](https://cua.ai/signin) and grab your API key from the dashboard. Then connect to a sandbox:
```python
from computer import Computer
computer = Computer(
os_type="windows", # or "macos"
provider_type="cloud",
name="my-sandbox",
api_key="your-api-key"
)
await computer.run()
```
Manage existing sandboxes:
```python
from computer.providers.cloud.provider import CloudProvider
provider = CloudProvider(api_key="your-api-key")
async with provider:
sandboxes = await provider.list_vms()
await provider.run_vm("my-sandbox")
await provider.stop_vm("my-sandbox")
```
Run an agent on Windows to automate a workflow:
```python
from agent import ComputerAgent
agent = ComputerAgent(
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer],
max_trajectory_budget=5.0
)
async for result in agent.run(
    "Open Excel, create a sales report with this month's data, and save it to the desktop"
):
    pass
```
## FAQs
<details>
<summary><strong>Why not just use local Windows Sandbox?</strong></summary>
Local Windows Sandbox resets on every restart. No persistence, no hot-start, and you need Windows Pro. Our sandboxes persist state, hot-start in under a second, and work from any OS.
</details>
<details>
<summary><strong>What happens to my work when I stop a sandbox?</strong></summary>
Everything persists. Files, installed software, browser profiles—it's all there when you restart. Only pay for runtime, not storage.
</details>
<details>
<summary><strong>How's the latency for UI automation?</strong></summary>
We run in 4 regions so you can pick what's closest. The noVNC connection is optimized for automation, not video streaming. Your agent sees crisp screenshots, not compressed video.
</details>
<details>
<summary><strong>Are there software restrictions?</strong></summary>
No. Full admin access on both platforms. Install whatever you need—Visual Studio, Photoshop, custom enterprise software. It's your sandbox.
</details>
## Need help?
If you hit issues getting either platform working, reach out in [Discord](https://discord.gg/cua-ai). We respond fast and fix based on what people actually use.
---
Get started at [cua.ai](https://cua.ai) or [join the macOS waitlist](https://cua.ai/macos-waitlist).
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/callbacks/base.py:
--------------------------------------------------------------------------------
```python
"""
Base callback handler interface for ComputerAgent preprocessing and postprocessing hooks.
"""
from abc import ABC
from typing import Any, Dict, List, Union
class AsyncCallbackHandler(ABC):
"""
Base class for async callback handlers that can preprocess messages before
the agent loop and postprocess output after the agent loop.
"""
async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
"""Called at the start of an agent run loop."""
pass
async def on_run_end(
self,
kwargs: Dict[str, Any],
old_items: List[Dict[str, Any]],
new_items: List[Dict[str, Any]],
) -> None:
"""Called at the end of an agent run loop."""
pass
async def on_run_continue(
self,
kwargs: Dict[str, Any],
old_items: List[Dict[str, Any]],
new_items: List[Dict[str, Any]],
) -> bool:
"""Called during agent run loop to determine if execution should continue.
Args:
kwargs: Run arguments
old_items: Original messages
new_items: New messages generated during run
Returns:
True to continue execution, False to stop
"""
return True
async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Called before messages are sent to the agent loop.
Args:
messages: List of message dictionaries to preprocess
Returns:
List of preprocessed message dictionaries
"""
return messages
async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""
Called after the agent loop returns output.
Args:
output: List of output message dictionaries to postprocess
Returns:
List of postprocessed output dictionaries
"""
return output
async def on_computer_call_start(self, item: Dict[str, Any]) -> None:
"""
Called when a computer call is about to start.
Args:
item: The computer call item dictionary
"""
pass
async def on_computer_call_end(
self, item: Dict[str, Any], result: List[Dict[str, Any]]
) -> None:
"""
Called when a computer call has completed.
Args:
item: The computer call item dictionary
result: The result of the computer call
"""
pass
async def on_function_call_start(self, item: Dict[str, Any]) -> None:
"""
Called when a function call is about to start.
Args:
item: The function call item dictionary
"""
pass
async def on_function_call_end(
self, item: Dict[str, Any], result: List[Dict[str, Any]]
) -> None:
"""
Called when a function call has completed.
Args:
item: The function call item dictionary
result: The result of the function call
"""
pass
async def on_text(self, item: Dict[str, Any]) -> None:
"""
Called when a text message is encountered.
Args:
item: The message item dictionary
"""
pass
async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
"""
Called when an API call is about to start.
Args:
kwargs: The kwargs being passed to the API call
"""
pass
async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
"""
Called when an API call has completed.
Args:
kwargs: The kwargs that were passed to the API call
result: The result of the API call
"""
pass
async def on_usage(self, usage: Dict[str, Any]) -> None:
"""
Called when usage information is received.
Args:
usage: The usage information
"""
pass
async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
"""
Called when a screenshot is taken.
Args:
screenshot: The screenshot image
name: The name of the screenshot
"""
pass
async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
"""
Called when responses are received.
Args:
kwargs: The kwargs being passed to the agent loop
responses: The responses received
"""
pass
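

# --- Illustrative example (not part of the original module) ---
# A minimal sketch of a concrete handler: stop the run once a cost budget is
# exceeded, using the on_usage and on_run_continue hooks above. The
# "response_cost" key is an assumed shape for the usage dict.
class ExampleBudgetCallback(AsyncCallbackHandler):
    """Stops the agent loop once accumulated cost exceeds a budget (demo only)."""

    def __init__(self, budget: float) -> None:
        self.budget = budget
        self.total_cost = 0.0

    async def on_usage(self, usage: Dict[str, Any]) -> None:
        # Accumulate cost as usage events arrive (key name is an assumption).
        self.total_cost += usage.get("response_cost", 0.0)

    async def on_run_continue(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> bool:
        # Returning False tells the agent loop to stop.
        return self.total_cost <= self.budget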
```
--------------------------------------------------------------------------------
/docs/content/docs/computer-sdk/computers.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Computer Types
description: Understanding Cua computer types and connection methods
---
{/* prettier-ignore */}
<Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/computer_nb.ipynb" target="_blank">Jupyter Notebook</a> and <a href="https://github.com/trycua/cua/tree/main/examples/computer-example-ts" target="_blank">NodeJS project</a> are available for this documentation.</Callout>
Before we can automate apps with AI, we first need to connect to a Computer Server that gives the AI a safe environment to execute workflows in.
Cua Computers are preconfigured sandboxes running the Computer Server. They can run macOS, Linux, or Windows, either in a cloud-native sandbox or directly on your host desktop.
## Cloud Sandbox
**Easiest & safest way to get started - works on any host OS**
This is a Cloud Sandbox running the Computer Server. Get a sandbox at [cua.ai](https://cua.ai/).
<Tabs items={['Python', 'TypeScript']}>
<Tab value="Python">
```python
from computer import Computer
computer = Computer(
os_type="linux",
provider_type="cloud",
name="your-sandbox-name",
api_key="your-api-key"
)
await computer.run() # Connect to the sandbox
```
</Tab>
<Tab value="TypeScript">
```typescript
import { Computer, OSType } from '@trycua/computer';
const computer = new Computer({
osType: OSType.LINUX,
name: "your-sandbox-name",
apiKey: "your-api-key"
});
await computer.run(); // Connect to the sandbox
```
</Tab>
</Tabs>
## Linux on Docker
**Run Linux desktop locally on macOS, Windows, or Linux hosts**
Cua provides two Docker images for running Linux desktops:
<Tabs items={['XFCE (Lightweight)', 'KASM (Full-Featured)']}>
<Tab value="XFCE (Lightweight)">
**Recommended for most use cases** - lightweight XFCE desktop with Firefox
1. Install Docker Desktop or Docker Engine
2. Pull the CUA XFCE image
```bash
docker pull --platform=linux/amd64 trycua/cua-xfce:latest
```
3. Connect with Computer
```python
from computer import Computer
computer = Computer(
os_type="linux",
provider_type="docker",
image="trycua/cua-xfce:latest",
name="my-xfce-sandbox"
)
await computer.run() # Launch & connect to Docker sandbox
```
</Tab>
<Tab value="KASM (Full-Featured)">
**Full-featured Ubuntu desktop** with additional applications
1. Install Docker Desktop or Docker Engine
2. Build or pull the CUA KASM image
```bash
# Option 1: Pull from Docker Hub
docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest
# Option 2: Build locally
cd libs/kasm
docker build -t cua-ubuntu:latest .
```
3. Connect with Computer
```python
from computer import Computer
computer = Computer(
os_type="linux",
provider_type="docker",
image="trycua/cua-ubuntu:latest",
name="my-kasm-sandbox"
)
await computer.run() # Launch & connect to Docker sandbox
```
</Tab>
</Tabs>
## Windows Sandbox
**Windows hosts only - requires Windows 10 or 11 Pro/Enterprise with Windows Sandbox enabled**
1. Enable Windows Sandbox
2. Install pywinsandbox dependency
```bash
pip install -U git+https://github.com/karkason/pywinsandbox.git
```
3. Connect with Computer
```python
from computer import Computer
computer = Computer(
os_type="windows",
provider_type="winsandbox",
ephemeral=True # Windows Sandbox is always ephemeral
)
await computer.run() # Launch & connect to Windows Sandbox
```
## macOS Sandbox
**macOS hosts only - requires Lume CLI**
1. Install the Lume CLI
```bash
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
```
2. Start a local Cua macOS sandbox
```bash
lume run macos-sequoia-cua:latest
```
3. Connect with Computer
```python
from computer import Computer
computer = Computer(
os_type="macos",
provider_type="lume",
name="macos-sequoia-cua:latest"
)
await computer.run() # Launch & connect to the sandbox
```
## Your host desktop
You can also have agents control your desktop directly by running Computer Server without any containerization layer. Beware that AI models may perform risky actions.
```bash
pip install cua-computer-server
python -m computer_server
```
Connect with:
<Tabs items={['Python']}>
<Tab value="Python">
```python
computer = Computer(use_host_computer_server=True)
await computer.run() # Connect to the host desktop
```
</Tab>
</Tabs>
```
--------------------------------------------------------------------------------
/libs/lumier/src/bin/entry.sh:
--------------------------------------------------------------------------------
```bash
#!/usr/bin/env bash
# Configure SSH to prevent known hosts warnings
export SSHPASS_PROMPT=
export SSH_ASKPASS=/bin/echo
# Set SSH quiet mode via the SSHPASS environment variable
export SSHPASS_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR -q"
# We'll enable strict error checking AFTER initialization
# to prevent premature exits
# Source configuration files
CONFIG_DIR="/run/config"
LIB_DIR="/run/lib"
# Source constants if available
if [ -f "${CONFIG_DIR}/constants.sh" ]; then
source "${CONFIG_DIR}/constants.sh"
fi
# Import utilities
for lib in "${LIB_DIR}"/*.sh; do
if [ -f "$lib" ]; then
source "$lib"
fi
done
# Set VM_NAME to env or fallback to container name (from --name)
if [ -z "${VM_NAME:-}" ]; then
VM_NAME="$(cat /etc/hostname)"
export VM_NAME
fi
# Set HOST_STORAGE_PATH to a lume ephemeral storage if not set
if [ -z "${HOST_STORAGE_PATH:-}" ]; then
HOST_STORAGE_PATH="ephemeral"
# Tell user that ephemeral storage is being used
echo "Using ephemeral storage. VM state will be lost when macOS cleans up temporary files."
export HOST_STORAGE_PATH
fi
# Only check and report mountpoints in debug mode
if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
if mountpoint -q /storage; then
echo "/storage is mounted"
fi
if mountpoint -q /shared; then
echo "/shared is mounted"
fi
# if mountpoint -q /data; then
# echo "/data is mounted"
# fi
fi
# Check if we're running as PID 1 (important for Docker signal handling)
if [ $$ -ne 1 ]; then
echo "Warning: This script is not running as PID 1 (current PID: $$)."
echo "Docker signal handling may not work properly when stopped from Docker Desktop."
fi
# Log startup info
echo "Lumier VM is starting..."
# Cleanup function to ensure VM and noVNC proxy shutdown on container stop
# Counter for signal handling
SIGNAL_COUNT=0
cleanup() {
local signal_name=$1
set +e # Don't exit on error in cleanup
# Increment signal counter
SIGNAL_COUNT=$((SIGNAL_COUNT + 1))
# If this is the first signal, try graceful shutdown
if [ $SIGNAL_COUNT -eq 1 ]; then
echo "[cleanup] Caught $signal_name signal, shutting down..."
# Check if we're in the middle of an image pull
if [[ "$PULL_IN_PROGRESS" == "1" ]]; then
echo "[cleanup] Interrupted during image pull, skipping VM stop."
else
echo "[cleanup] Stopping VM..."
stop_vm true
fi
# Attempt to clean up ephemeral storage if it's in the /private/tmp directory
if [[ "$HOST_STORAGE_PATH" == "ephemeral" ]]; then
# First check if VM actually exists
VM_INFO=$(lume_get "$VM_NAME" "$HOST_STORAGE_PATH" "json" "false")
# Only try VM deletion if VM exists and not in the middle of a pull
if [[ "$PULL_IN_PROGRESS" != "1" && $VM_INFO != *"Virtual machine not found"* ]]; then
echo "[cleanup] Cleaning up VM..."
lume_delete "$VM_NAME" "$HOST_STORAGE_PATH" > /dev/null 2>&1
fi
fi
else
# For multiple signals, force an immediate exit
echo "got $SIGNAL_COUNT SIGTERM/SIGINTs, forcefully exiting"
fi
# If we've received multiple signals, just exit immediately
if [ $SIGNAL_COUNT -ge 3 ]; then
exit 1
fi
# Exit with success for the first signal
if [ $SIGNAL_COUNT -eq 1 ]; then
exit 0
fi
}
# Ensure we catch all typical container termination signals
trap 'cleanup SIGTERM' SIGTERM
trap 'cleanup SIGINT' SIGINT
trap 'cleanup SIGHUP' SIGHUP
# Now enable strict error handling after initialization
set -euo pipefail
# Start the VM with error handling
if ! start_vm; then
echo "ERROR: Failed to start VM!" >&2
exit 1
fi
# Start noVNC for VNC access
NOVNC_PID=""
if [ -n "${VNC_PORT:-}" ] && [ -n "${VNC_PASSWORD:-}" ]; then
# Only show this in debug mode
if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
echo "Starting noVNC proxy with optimized color settings..."
fi
${NOVNC_PATH}/utils/novnc_proxy --vnc host.docker.internal:${VNC_PORT} --listen 8006 --web ${NOVNC_PATH} > /dev/null 2>&1 &
NOVNC_PID=$!
disown $NOVNC_PID
echo "noVNC interface available at: http://localhost:8006/vnc.html?password=${VNC_PASSWORD}&autoconnect=true (replace PORT with the port you forwarded to 8006)"
fi
echo "Lumier is running. Press Ctrl+C to stop."
# Instead of tail -f /dev/null, use a wait loop that can be interrupted by signals
while true; do
# Sleep in small increments to make signal handling more responsive
sleep 1 &
wait $!
# Break the loop if we've received a signal
if [ $SIGNAL_COUNT -gt 0 ]; then
break
fi
done
```
--------------------------------------------------------------------------------
/libs/lume/src/Server/Requests.swift:
--------------------------------------------------------------------------------
```swift
import ArgumentParser
import Foundation
import Virtualization
struct RunVMRequest: Codable {
let noDisplay: Bool?
let sharedDirectories: [SharedDirectoryRequest]?
let recoveryMode: Bool?
let storage: String?
struct SharedDirectoryRequest: Codable {
let hostPath: String
let readOnly: Bool?
}
func parse() throws -> [SharedDirectory] {
guard let sharedDirectories = sharedDirectories else { return [] }
return try sharedDirectories.map { dir -> SharedDirectory in
// Validate that the host path exists and is a directory
var isDirectory: ObjCBool = false
guard FileManager.default.fileExists(atPath: dir.hostPath, isDirectory: &isDirectory),
isDirectory.boolValue
else {
throw ValidationError(
"Host path does not exist or is not a directory: \(dir.hostPath)")
}
return SharedDirectory(
hostPath: dir.hostPath,
tag: VZVirtioFileSystemDeviceConfiguration.macOSGuestAutomountTag,
readOnly: dir.readOnly ?? false
)
}
}
}
struct PullRequest: Codable {
let image: String
let name: String?
var registry: String
var organization: String
let storage: String?
enum CodingKeys: String, CodingKey {
case image, name, registry, organization, storage
}
init(from decoder: Decoder) throws {
let container = try decoder.container(keyedBy: CodingKeys.self)
image = try container.decode(String.self, forKey: .image)
name = try container.decodeIfPresent(String.self, forKey: .name)
registry = try container.decodeIfPresent(String.self, forKey: .registry) ?? "ghcr.io"
organization = try container.decodeIfPresent(String.self, forKey: .organization) ?? "trycua"
storage = try container.decodeIfPresent(String.self, forKey: .storage)
}
}
struct CreateVMRequest: Codable {
let name: String
let os: String
let cpu: Int
let memory: String
let diskSize: String
let display: String
let ipsw: String?
let storage: String?
func parse() throws -> (memory: UInt64, diskSize: UInt64) {
return (
memory: try parseSize(memory),
diskSize: try parseSize(diskSize)
)
}
}
struct SetVMRequest: Codable {
let cpu: Int?
let memory: String?
let diskSize: String?
let display: String?
let storage: String?
func parse() throws -> (memory: UInt64?, diskSize: UInt64?, display: VMDisplayResolution?) {
return (
memory: try memory.map { try parseSize($0) },
diskSize: try diskSize.map { try parseSize($0) },
display: try display.map {
guard let resolution = VMDisplayResolution(string: $0) else {
throw ValidationError(
"Invalid display resolution format: \($0). Expected format: WIDTHxHEIGHT")
}
return resolution
}
)
}
}
struct CloneRequest: Codable {
let name: String
let newName: String
let sourceLocation: String?
let destLocation: String?
}
struct PushRequest: Codable {
let name: String // Name of the local VM
let imageName: String // Base name for the image in the registry
let tags: [String] // List of tags to push
var registry: String // Registry URL
var organization: String // Organization/user in the registry
let storage: String? // Optional VM storage location or direct path
var chunkSizeMb: Int // Chunk size
    // dryRun and reassemble are uncommon for the API and are omitted here.
    // verbose output is handled by server-side logging.
enum CodingKeys: String, CodingKey {
case name, imageName, tags, registry, organization, storage, chunkSizeMb
}
// Provide default values for optional fields during decoding
init(from decoder: Decoder) throws {
let container = try decoder.container(keyedBy: CodingKeys.self)
name = try container.decode(String.self, forKey: .name)
imageName = try container.decode(String.self, forKey: .imageName)
tags = try container.decode([String].self, forKey: .tags)
registry = try container.decodeIfPresent(String.self, forKey: .registry) ?? "ghcr.io"
organization = try container.decodeIfPresent(String.self, forKey: .organization) ?? "trycua"
storage = try container.decodeIfPresent(String.self, forKey: .storage)
chunkSizeMb = try container.decodeIfPresent(Int.self, forKey: .chunkSizeMb) ?? 512
}
}
```
--------------------------------------------------------------------------------
/libs/lume/src/FileSystem/VMConfig.swift:
--------------------------------------------------------------------------------
```swift
import ArgumentParser
import Foundation
import Virtualization
/// Represents a shared directory configuration
struct SharedDirectory: Codable {
let hostPath: String
let tag: String
let readOnly: Bool
var string: String {
return "\(hostPath):\(tag):\(readOnly ? "ro" : "rw")"
}
}
// MARK: - VMConfig
struct VMConfig: Codable {
// MARK: - Properties
let os: String
private var _cpuCount: Int?
private var _memorySize: UInt64?
private var _diskSize: UInt64?
private var _macAddress: String?
private var _display: VMDisplayResolution
private var _hardwareModel: Data?
private var _machineIdentifier: Data?
// MARK: - Initialization
init(
os: String,
cpuCount: Int? = nil,
memorySize: UInt64? = nil,
diskSize: UInt64? = nil,
macAddress: String? = nil,
display: String,
hardwareModel: Data? = nil,
machineIdentifier: Data? = nil
) throws {
self.os = os
self._cpuCount = cpuCount
self._memorySize = memorySize
self._diskSize = diskSize
self._macAddress = macAddress
self._display = VMDisplayResolution(string: display) ?? VMDisplayResolution(string: "1024x768")!
self._hardwareModel = hardwareModel
self._machineIdentifier = machineIdentifier
}
var display: VMDisplayResolution {
get { _display }
set { _display = newValue }
}
var cpuCount: Int? {
get { _cpuCount }
set { _cpuCount = newValue }
}
var memorySize: UInt64? {
get { _memorySize }
set { _memorySize = newValue }
}
var diskSize: UInt64? {
get { _diskSize }
set { _diskSize = newValue }
}
var hardwareModel: Data? {
get { _hardwareModel }
set { _hardwareModel = newValue }
}
var machineIdentifier: Data? {
get { _machineIdentifier }
set { _machineIdentifier = newValue }
}
var macAddress: String? {
get { _macAddress }
set { _macAddress = newValue }
}
mutating func setCpuCount(_ count: Int) {
_cpuCount = count
}
mutating func setMemorySize(_ size: UInt64) {
_memorySize = size
}
mutating func setDiskSize(_ size: UInt64) {
_diskSize = size
}
mutating func setHardwareModel(_ hardwareModel: Data) {
_hardwareModel = hardwareModel
}
mutating func setMachineIdentifier(_ machineIdentifier: Data) {
_machineIdentifier = machineIdentifier
}
mutating func setMacAddress(_ newMacAddress: String) {
self._macAddress = newMacAddress
}
mutating func setDisplay(_ newDisplay: VMDisplayResolution) {
self._display = newDisplay
}
// MARK: - Codable
enum CodingKeys: String, CodingKey {
case _cpuCount = "cpuCount"
case _memorySize = "memorySize"
case _diskSize = "diskSize"
case macAddress
case display
case _hardwareModel = "hardwareModel"
case _machineIdentifier = "machineIdentifier"
case os
}
init(from decoder: Decoder) throws {
let container = try decoder.container(keyedBy: CodingKeys.self)
os = try container.decode(String.self, forKey: .os)
_cpuCount = try container.decodeIfPresent(Int.self, forKey: ._cpuCount)
_memorySize = try container.decodeIfPresent(UInt64.self, forKey: ._memorySize)
_diskSize = try container.decodeIfPresent(UInt64.self, forKey: ._diskSize)
_macAddress = try container.decodeIfPresent(String.self, forKey: .macAddress)
_display = VMDisplayResolution(string: try container.decode(String.self, forKey: .display))!
_hardwareModel = try container.decodeIfPresent(Data.self, forKey: ._hardwareModel)
_machineIdentifier = try container.decodeIfPresent(Data.self, forKey: ._machineIdentifier)
}
func encode(to encoder: Encoder) throws {
var container = encoder.container(keyedBy: CodingKeys.self)
try container.encodeIfPresent(os, forKey: .os)
try container.encodeIfPresent(_cpuCount, forKey: ._cpuCount)
try container.encodeIfPresent(_memorySize, forKey: ._memorySize)
try container.encodeIfPresent(_diskSize, forKey: ._diskSize)
try container.encodeIfPresent(_macAddress, forKey: .macAddress)
try container.encode(display.string, forKey: .display)
try container.encodeIfPresent(_hardwareModel, forKey: ._hardwareModel)
try container.encodeIfPresent(_machineIdentifier, forKey: ._machineIdentifier)
}
}
```
--------------------------------------------------------------------------------
/libs/python/computer-server/computer_server/cli.py:
--------------------------------------------------------------------------------
```python
"""
Command-line interface for the Computer API server.
"""
import argparse
import asyncio
import logging
import os
import sys
import threading
from typing import List, Optional
from .server import Server
logger = logging.getLogger(__name__)
def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
"""Parse command-line arguments."""
parser = argparse.ArgumentParser(description="Start the Computer API server")
parser.add_argument(
"--host", default="0.0.0.0", help="Host to bind the server to (default: 0.0.0.0)"
)
parser.add_argument(
"--port", type=int, default=8000, help="Port to bind the server to (default: 8000)"
)
parser.add_argument(
"--log-level",
choices=["debug", "info", "warning", "error", "critical"],
default="info",
help="Logging level (default: info)",
)
parser.add_argument(
"--ssl-keyfile",
type=str,
help="Path to SSL private key file (enables HTTPS)",
)
parser.add_argument(
"--ssl-certfile",
type=str,
help="Path to SSL certificate file (enables HTTPS)",
)
parser.add_argument(
"--watchdog",
action="store_true",
help="Enable watchdog monitoring (automatically enabled if CONTAINER_NAME env var is set)",
)
parser.add_argument(
"--watchdog-interval",
type=int,
default=30,
help="Watchdog ping interval in seconds (default: 30)",
)
parser.add_argument(
"--no-restart",
action="store_true",
help="Disable automatic server restart in watchdog",
)
return parser.parse_args(args)
def main() -> None:
"""Main entry point for the CLI."""
args = parse_args()
# Configure logging
logging.basicConfig(
level=getattr(logging, args.log_level.upper()),
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
# Check if watchdog should be enabled
container_name = os.environ.get("CONTAINER_NAME")
enable_watchdog = (args.watchdog or bool(container_name)) and not sys.platform.startswith("win")
if container_name:
logger.info(
f"Container environment detected (CONTAINER_NAME={container_name}), enabling watchdog"
)
elif args.watchdog:
logger.info("Watchdog explicitly enabled via --watchdog flag")
# Start watchdog if enabled
if enable_watchdog:
logger.info(f"Starting watchdog monitoring with {args.watchdog_interval}s interval")
def run_watchdog_thread():
"""Run watchdog in a separate thread."""
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
# Create CLI args dict for watchdog
cli_args = {
"host": args.host,
"port": args.port,
"log_level": args.log_level,
"ssl_keyfile": args.ssl_keyfile,
"ssl_certfile": args.ssl_certfile,
}
# Create watchdog with restart settings
from .watchdog import Watchdog
watchdog = Watchdog(cli_args=cli_args, ping_interval=args.watchdog_interval)
watchdog.restart_enabled = not args.no_restart
loop.run_until_complete(watchdog.start_monitoring())
except Exception as e:
logger.error(f"Watchdog error: {e}")
finally:
loop.close()
# Start watchdog in background thread
watchdog_thread = threading.Thread(target=run_watchdog_thread, daemon=True, name="watchdog")
watchdog_thread.start()
# Create and start the server
logger.info(f"Starting CUA Computer API server on {args.host}:{args.port}...")
# Handle SSL configuration
ssl_args = {}
if args.ssl_keyfile and args.ssl_certfile:
ssl_args = {
"ssl_keyfile": args.ssl_keyfile,
"ssl_certfile": args.ssl_certfile,
}
logger.info("HTTPS mode enabled with SSL certificates")
elif args.ssl_keyfile or args.ssl_certfile:
logger.warning(
"Both --ssl-keyfile and --ssl-certfile are required for HTTPS. Running in HTTP mode."
)
else:
logger.info("HTTP mode (no SSL certificates provided)")
server = Server(host=args.host, port=args.port, log_level=args.log_level, **ssl_args)
try:
server.start()
except KeyboardInterrupt:
logger.info("Server stopped by user")
sys.exit(0)
except Exception as e:
logger.error(f"Error starting server: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
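
# Example invocations (illustrative; flags correspond to the argparse options above):
#   python -m computer_server                                                # HTTP on 0.0.0.0:8000
#   python -m computer_server --port 8443 --ssl-keyfile key.pem --ssl-certfile cert.pem   # HTTPS
#   python -m computer_server --watchdog --watchdog-interval 60 --no-restart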
```
--------------------------------------------------------------------------------
/libs/lume/src/Virtualization/DarwinImageLoader.swift:
--------------------------------------------------------------------------------
```swift
import Foundation
import Virtualization
/// Handles loading and validation of macOS restore images (IPSW files).
/// Provides functionality to:
/// - Fetch the latest supported macOS restore image URL
/// - Load and validate image requirements for VM creation
/// - Extract hardware model and auxiliary storage configuration
protocol ImageLoader: Sendable {
typealias ImageRequirements = DarwinImageLoader.ImageRequirements
func fetchLatestSupportedURL() async throws -> URL
func loadImageRequirements(from url: URL) async throws -> ImageRequirements
func downloadLatestImage() async throws -> Path
}
final class DarwinImageLoader: NSObject, ImageLoader, @unchecked Sendable, URLSessionDownloadDelegate {
struct ImageRequirements: Sendable {
let hardwareModel: Data
let minimumSupportedCPUCount: Int
let minimumSupportedMemorySize: UInt64
}
enum ImageError: Error {
case invalidImage
case unsupportedConfiguration
case downloadFailed
}
private var lastLoggedProgress: Double = 0.0
private var progressLogger = ProgressLogger()
private var completionHandler: ((URL?, Error?) -> Void)?
func fetchLatestSupportedURL() async throws -> URL {
try await withCheckedThrowingContinuation { continuation in
VZMacOSRestoreImage.fetchLatestSupported { result in
switch result {
case .success(let image):
continuation.resume(returning: image.url)
case .failure(let error):
continuation.resume(throwing: error)
}
}
}
}
func loadImageRequirements(from url: URL) async throws -> ImageRequirements {
let image = try await VZMacOSRestoreImage.image(from: url)
guard let requirements = image.mostFeaturefulSupportedConfiguration else {
throw ImageError.unsupportedConfiguration
}
return ImageRequirements(
hardwareModel: requirements.hardwareModel.dataRepresentation,
minimumSupportedCPUCount: requirements.minimumSupportedCPUCount,
minimumSupportedMemorySize: requirements.minimumSupportedMemorySize
)
}
func downloadLatestImage() async throws -> Path {
let url = try await fetchLatestSupportedURL()
let tempDir = FileManager.default.temporaryDirectory
let downloadPath = tempDir.appendingPathComponent("latest.ipsw")
// Reset progress logger state
progressLogger = ProgressLogger(threshold: 0.01)
// Create a continuation to wait for download completion
return try await withCheckedThrowingContinuation { continuation in
let session = URLSession(configuration: .default, delegate: self, delegateQueue: nil)
let task = session.downloadTask(with: url)
// Use the delegate method to handle completion
self.completionHandler = { location, error in
if let error = error {
continuation.resume(throwing: error)
return
}
do {
// Remove existing file if it exists
if FileManager.default.fileExists(atPath: downloadPath.path) {
try FileManager.default.removeItem(at: downloadPath)
}
try FileManager.default.moveItem(at: location!, to: downloadPath)
Logger.info("Download completed and moved to: \(downloadPath.path)")
continuation.resume(returning: Path(downloadPath.path))
} catch {
continuation.resume(throwing: error)
}
}
task.resume()
}
}
func urlSession(_ session: URLSession, downloadTask: URLSessionDownloadTask, didWriteData bytesWritten: Int64, totalBytesWritten: Int64, totalBytesExpectedToWrite: Int64) {
let progress = Double(totalBytesWritten) / Double(totalBytesExpectedToWrite)
progressLogger.logProgress(current: progress, context: "Downloading IPSW")
}
func urlSession(_ session: URLSession, downloadTask: URLSessionDownloadTask, didFinishDownloadingTo location: URL) {
// Call the stored completion handler
completionHandler?(location, nil)
}
func urlSession(_ session: URLSession, task: URLSessionTask, didCompleteWithError error: Error?) {
// Call the stored completion handler with an error if it occurred
if let error = error {
completionHandler?(nil, error)
}
}
}
```
--------------------------------------------------------------------------------
/examples/agent_examples.py:
--------------------------------------------------------------------------------
```python
"""Example demonstrating the ComputerAgent capabilities with the Omni provider."""
import asyncio
import logging
import signal
import traceback
# Import the unified agent class and types
from agent import ComputerAgent
from computer import Computer, VMProviderType
# Import utility functions
from utils import handle_sigint, load_dotenv_files
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
async def run_agent_example():
"""Run example of using the ComputerAgent with different models."""
print("\n=== Example: ComputerAgent with different models ===")
try:
# Create a local macOS computer
computer = Computer(
os_type="macos",
verbosity=logging.DEBUG,
)
# Create a remote Linux computer with Cua
# computer = Computer(
# os_type="linux",
# api_key=os.getenv("CUA_API_KEY"),
# name=os.getenv("CUA_CONTAINER_NAME"),
# provider_type=VMProviderType.CLOUD,
# )
# Create ComputerAgent with new API
agent = ComputerAgent(
# Supported models:
# == OpenAI CUA (computer-use-preview) ==
model="openai/computer-use-preview",
# == Anthropic CUA (Claude > 3.5) ==
# model="anthropic/claude-opus-4-20250514",
# model="anthropic/claude-sonnet-4-20250514",
# model="anthropic/claude-3-7-sonnet-20250219",
# model="anthropic/claude-sonnet-4-5-20250929",
# == UI-TARS ==
# model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
# model="mlx/mlx-community/UI-TARS-1.5-7B-6bit",
# model="ollama_chat/0000/ui-tars-1.5-7b",
# == Omniparser + Any LLM ==
# model="omniparser+anthropic/claude-opus-4-20250514",
# model="omniparser+ollama_chat/gemma3:12b-it-q4_K_M",
# == Omniparser + Vertex AI Gemini 3 (with thinking_level) ==
# model="omni+vertex_ai/gemini-3-flash",
# thinking_level="high", # or "low"
# media_resolution="medium", # or "low" or "high"
tools=[computer],
only_n_most_recent_images=3,
verbosity=logging.DEBUG,
trajectory_dir="trajectories",
use_prompt_caching=True,
max_trajectory_budget=1.0,
)
# Example tasks to demonstrate the agent
tasks = [
"Look for a repository named trycua/cua on GitHub.",
"Check the open issues, open the most recent one and read it.",
"Clone the repository in users/lume/projects if it doesn't exist yet.",
"Open the repository with an app named Cursor (on the dock, black background and white cube icon).",
"From Cursor, open Composer if not already open.",
"Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.",
]
# Use message-based conversation history
history = []
for i, task in enumerate(tasks):
print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")
# Add user message to history
history.append({"role": "user", "content": task})
# Run agent with conversation history
async for result in agent.run(history, stream=False):
# Add agent outputs to history
history += result.get("output", [])
# Print output for debugging
for item in result.get("output", []):
if item.get("type") == "message":
content = item.get("content", [])
for content_part in content:
if content_part.get("text"):
print(f"Agent: {content_part.get('text')}")
elif item.get("type") == "computer_call":
action = item.get("action", {})
action_type = action.get("type", "")
print(f"Computer Action: {action_type}({action})")
elif item.get("type") == "computer_call_output":
print("Computer Output: [Screenshot/Result]")
print(f"✅ Task {i+1}/{len(tasks)} completed: {task}")
except Exception as e:
logger.error(f"Error in run_agent_example: {e}")
traceback.print_exc()
raise
def main():
"""Run the Anthropic agent example."""
try:
load_dotenv_files()
# Register signal handler for graceful exit
signal.signal(signal.SIGINT, handle_sigint)
asyncio.run(run_agent_example())
except Exception as e:
print(f"Error running example: {e}")
traceback.print_exc()
if __name__ == "__main__":
main()
```
--------------------------------------------------------------------------------
/examples/computer_examples_windows.py:
--------------------------------------------------------------------------------
```python
import asyncio
import os
import sys
import traceback
from pathlib import Path
# Load environment variables from .env file
project_root = Path(__file__).parent.parent
env_file = project_root / ".env"
print(f"Loading environment from: {env_file}")
from computer.helpers import sandboxed
from dotenv import load_dotenv
load_dotenv(env_file)
# Add paths to sys.path if needed
pythonpath = os.environ.get("PYTHONPATH", "")
for path in pythonpath.split(os.pathsep):  # use the OS path separator (":" on POSIX, ";" on Windows)
if path and path not in sys.path:
sys.path.insert(0, path) # Insert at beginning to prioritize
print(f"Added to sys.path: {path}")
from computer.computer import Computer
from computer.logger import LogLevel
from computer.providers.base import VMProviderType
# ANSI color codes
RED = "\033[91m"
RESET = "\033[0m"
async def main():
try:
print("\n=== Using direct initialization ===")
# Create a remote Windows computer with Cua
computer = Computer(
os_type="windows",
api_key=os.getenv("CUA_API_KEY"),
name=os.getenv("CONTAINER_NAME") or "",
provider_type=VMProviderType.CLOUD,
)
try:
# Run the computer with default parameters
await computer.run()
# Create output directory if it doesn't exist
output_dir = Path("./output")
output_dir.mkdir(exist_ok=True)
# Keyboard Actions Examples
print("\n=== Keyboard Actions ===")
await computer.interface.type_text("Hello, World!")
await computer.interface.press_key("enter")
# Mouse Actions Examples
print("\n=== Mouse Actions ===")
await computer.interface.move_cursor(100, 100)
await computer.interface.left_click()
await computer.interface.double_click(400, 400)
await computer.interface.right_click(300, 300)
print("\n=== RPC ===")
await computer.venv_install("demo_venv", ["mss"])
@sandboxed("demo_venv")
def greet_and_print(name):
import os
from mss import mss
# get username
username = os.getlogin()
print(f"Hello from inside the container, {name}!")
print("Username:", username)
print("Screens:", mss().monitors)
# take a screenshot
with mss() as sct:
filename = sct.shot(mon=-1, output="C:/Users/azureuser/Desktop/fullscreen.png")
print(filename)
return {"greeted": name, "username": username}
            # Call the sandboxed function remotely (positional args and kwargs are supported)
result = await greet_and_print("John Doe")
print("Result from sandboxed function:", result)
# Command Actions Examples
print("\n=== Command Actions ===")
result = await computer.interface.run_command("notepad")
print("Result from command:", result)
screenshot = await computer.interface.screenshot()
screenshot_path = output_dir / "screenshot.png"
with open(screenshot_path, "wb") as f:
f.write(screenshot)
print(f"Screenshot saved to: {screenshot_path.absolute()}")
# Clipboard Actions Examples
print("\n=== Clipboard Actions ===")
await computer.interface.set_clipboard("Test clipboard")
content = await computer.interface.copy_to_clipboard()
print(f"Clipboard content: {content}")
# Simple REPL Loop
print("\n=== Command REPL ===")
print("Enter commands to run on the remote computer.")
print("Type 'exit' or 'quit' to leave the REPL.\n")
while True:
try:
# Get command from user
command = input("command> ").strip()
# Check for exit commands
                    # Skip empty input; exit the REPL on "exit" or "quit"
                    if not command:
                        continue
                    if command.lower() in ["exit", "quit"]:
                        print("Exiting REPL...")
                        break
# Run the command
result = await computer.interface.run_command(command)
print(result.stdout)
if result.stderr:
print(f"{RED}{result.stderr}{RESET}")
except KeyboardInterrupt:
print("\nExiting REPL...")
break
except Exception as e:
print(f"{RED}Error running command: {e}{RESET}")
finally:
# Important to clean up resources
# await computer.stop()
pass
except Exception as e:
print(f"Error in main: {e}")
traceback.print_exc()
if __name__ == "__main__":
asyncio.run(main())
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/computers/cua.py:
--------------------------------------------------------------------------------
```python
"""
Computer handler implementation for OpenAI computer-use-preview protocol.
"""
import base64
from typing import Any, Dict, List, Literal, Optional, Union
from computer import Computer
from .base import AsyncComputerHandler
class cuaComputerHandler(AsyncComputerHandler):
"""Computer handler that implements the Computer protocol using the computer interface."""
def __init__(self, cua_computer: Computer):
"""Initialize with a computer interface (from tool schema)."""
self.cua_computer = cua_computer
self.interface = None
async def _initialize(self):
if hasattr(self.cua_computer, "_initialized") and not self.cua_computer._initialized:
await self.cua_computer.run()
self.interface = self.cua_computer.interface
# ==== Computer-Use-Preview Action Space ====
async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
"""Get the current environment type."""
# TODO: detect actual environment
return "linux"
async def get_dimensions(self) -> tuple[int, int]:
"""Get screen dimensions as (width, height)."""
assert self.interface is not None
screen_size = await self.interface.get_screen_size()
return screen_size["width"], screen_size["height"]
async def screenshot(self, text: Optional[str] = None) -> str:
"""Take a screenshot and return as base64 string.
Args:
text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
"""
assert self.interface is not None
screenshot_bytes = await self.interface.screenshot()
return base64.b64encode(screenshot_bytes).decode("utf-8")
async def click(self, x: int, y: int, button: str = "left") -> None:
"""Click at coordinates with specified button."""
assert self.interface is not None
if button == "left":
await self.interface.left_click(x, y)
elif button == "right":
await self.interface.right_click(x, y)
else:
# Default to left click for unknown buttons
await self.interface.left_click(x, y)
async def double_click(self, x: int, y: int) -> None:
"""Double click at coordinates."""
assert self.interface is not None
await self.interface.double_click(x, y)
async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
"""Scroll at coordinates with specified scroll amounts."""
assert self.interface is not None
await self.interface.move_cursor(x, y)
await self.interface.scroll(scroll_x, scroll_y)
async def type(self, text: str) -> None:
"""Type text."""
assert self.interface is not None
await self.interface.type_text(text)
async def wait(self, ms: int = 1000) -> None:
"""Wait for specified milliseconds."""
assert self.interface is not None
import asyncio
await asyncio.sleep(ms / 1000.0)
async def move(self, x: int, y: int) -> None:
"""Move cursor to coordinates."""
assert self.interface is not None
await self.interface.move_cursor(x, y)
async def keypress(self, keys: Union[List[str], str]) -> None:
"""Press key combination."""
assert self.interface is not None
if isinstance(keys, str):
keys = keys.replace("-", "+").split("+")
if len(keys) == 1:
await self.interface.press_key(keys[0])
else:
# Handle key combinations
await self.interface.hotkey(*keys)
async def drag(self, path: List[Dict[str, int]]) -> None:
"""Drag along specified path."""
assert self.interface is not None
if not path:
return
# Start drag from first point
start = path[0]
await self.interface.mouse_down(start["x"], start["y"])
# Move through path
for point in path[1:]:
await self.interface.move_cursor(point["x"], point["y"])
# End drag at last point
end = path[-1]
await self.interface.mouse_up(end["x"], end["y"])
async def get_current_url(self) -> str:
"""Get current URL (for browser environments)."""
# This would need to be implemented based on the specific browser interface
# For now, return empty string
return ""
# ==== Anthropic Computer Action Space ====
async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse down at coordinates."""
assert self.interface is not None
await self.interface.mouse_down(x, y, button="left")
async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse up at coordinates."""
assert self.interface is not None
await self.interface.mouse_up(x, y, button="left")
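

# --- Illustrative usage sketch (not part of the original module) ---
# Wires the handler to a Computer and drives a few actions. The constructor
# arguments are placeholder values; _initialize() is normally invoked by the
# agent loop before the handler is used.
async def _example_usage() -> None:
    computer = Computer(
        os_type="linux",
        provider_type="cloud",
        name="your-sandbox-name",  # placeholder
        api_key="your-api-key",    # placeholder
    )
    handler = cuaComputerHandler(computer)
    await handler._initialize()
    width, height = await handler.get_dimensions()
    await handler.click(width // 2, height // 2)
    await handler.type("hello")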
```
--------------------------------------------------------------------------------
/libs/python/agent/tests/test_computer_agent.py:
--------------------------------------------------------------------------------
```python
"""Unit tests for ComputerAgent class.
This file tests ONLY the ComputerAgent initialization and basic functionality.
Following SRP: This file tests ONE class (ComputerAgent).
All external dependencies (liteLLM, Computer) are mocked.
"""
from unittest.mock import AsyncMock, MagicMock, Mock, patch
import pytest
class TestComputerAgentInitialization:
"""Test ComputerAgent initialization (SRP: Only tests initialization)."""
@patch("agent.agent.litellm")
def test_agent_initialization_with_model(self, mock_litellm, disable_telemetry):
"""Test that agent can be initialized with a model string."""
from agent import ComputerAgent
agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
assert agent is not None
assert hasattr(agent, "model")
assert agent.model == "anthropic/claude-sonnet-4-5-20250929"
@patch("agent.agent.litellm")
def test_agent_initialization_with_tools(self, mock_litellm, disable_telemetry, mock_computer):
"""Test that agent can be initialized with tools."""
from agent import ComputerAgent
agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929", tools=[mock_computer])
assert agent is not None
assert hasattr(agent, "tools")
@patch("agent.agent.litellm")
def test_agent_initialization_with_max_budget(self, mock_litellm, disable_telemetry):
"""Test that agent can be initialized with max trajectory budget."""
from agent import ComputerAgent
budget = 5.0
agent = ComputerAgent(
model="anthropic/claude-sonnet-4-5-20250929", max_trajectory_budget=budget
)
assert agent is not None
@patch("agent.agent.litellm")
def test_agent_requires_model(self, mock_litellm, disable_telemetry):
"""Test that agent requires a model parameter."""
from agent import ComputerAgent
with pytest.raises(TypeError):
# Should fail without model parameter - intentionally missing required argument
ComputerAgent() # type: ignore[call-arg]
class TestComputerAgentRun:
"""Test ComputerAgent.run() method (SRP: Only tests run logic)."""
@pytest.mark.asyncio
@patch("agent.agent.litellm")
async def test_agent_run_with_messages(self, mock_litellm, disable_telemetry, sample_messages):
"""Test that agent.run() works with valid messages."""
from agent import ComputerAgent
# Mock liteLLM response
mock_response = {
"id": "chatcmpl-test",
"choices": [
{
"message": {"role": "assistant", "content": "Test response"},
"finish_reason": "stop",
}
],
"usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
}
mock_litellm.acompletion = AsyncMock(return_value=mock_response)
agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
# Run should return an async generator
result_generator = agent.run(sample_messages)
assert result_generator is not None
# Check it's an async generator
assert hasattr(result_generator, "__anext__")
def test_agent_has_run_method(self, disable_telemetry):
"""Test that agent has run method available."""
from agent import ComputerAgent
agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
# Verify run method exists
assert hasattr(agent, "run")
assert callable(agent.run)
def test_agent_has_agent_loop(self, disable_telemetry):
"""Test that agent has agent_loop initialized."""
from agent import ComputerAgent
agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
# Verify agent_loop is initialized
assert hasattr(agent, "agent_loop")
assert agent.agent_loop is not None
class TestComputerAgentTypes:
"""Test AgentResponse and Messages types (SRP: Only tests type definitions)."""
def test_messages_type_exists(self):
"""Test that Messages type is exported."""
from agent import Messages
assert Messages is not None
def test_agent_response_type_exists(self):
"""Test that AgentResponse type is exported."""
from agent import AgentResponse
assert AgentResponse is not None
class TestComputerAgentIntegration:
"""Test ComputerAgent integration with Computer tool (SRP: Integration within package)."""
def test_agent_accepts_computer_tool(self, disable_telemetry, mock_computer):
"""Test that agent can be initialized with Computer tool."""
from agent import ComputerAgent
agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929", tools=[mock_computer])
# Verify agent accepted the tool
assert agent is not None
assert hasattr(agent, "tools")
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/mcp-server/usage.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Usage
---
## Basic Usage
Once configured, you can simply ask Claude to perform computer tasks:
- "Open Chrome and go to github.com"
- "Create a folder called 'Projects' on my desktop"
- "Find all PDFs in my Downloads folder"
- "Take a screenshot and highlight the error message"
Claude will automatically use your CUA agent to perform these tasks.
## Advanced Features
### Progress Reporting
The MCP server provides real-time progress updates during task execution:
- Task progress is reported as percentages (0-100%)
- Multi-task operations show progress for each individual task
- Progress updates are streamed to the MCP client for real-time feedback
### Error Handling
Robust error handling ensures reliable operation:
- Failed tasks return error messages with screenshots when possible
- Session state is preserved even when individual tasks fail
- Automatic cleanup prevents resource leaks
- Detailed error logging for troubleshooting
### Concurrent Task Execution
For improved performance, multiple tasks can run concurrently:
- Set `concurrent=true` in `run_multi_cua_tasks` for parallel execution
- Each task runs in its own context with isolated state
- Progress tracking works for both sequential and concurrent modes
- Resource pooling ensures efficient computer instance usage
### Session Management
Multi-client support with automatic resource management:
- Each client gets isolated sessions with separate computer instances
- Sessions automatically clean up after 10 minutes of inactivity
- Resource pooling prevents resource exhaustion
- Session statistics available for monitoring
## Target Computer Options
By default, the MCP server runs CUA in a virtual machine for safety. However, you can also configure it to run on your local system.
### Default: Using a VM (Recommended)
The MCP server will automatically start and connect to a VM based on your platform. This is the safest option as AI actions are isolated from your host system.
No additional configuration is needed - this is the default behavior.
### Option: Targeting Your Local Desktop
<Callout type="warn">
**Warning:** When targeting your local system, AI models have direct access to your desktop and
may perform risky actions. Use with caution.
</Callout>
To have the MCP server control your local desktop instead of a VM:
1. **Start the Computer Server on your host:**
```bash
pip install cua-computer-server
python -m computer_server
```
2. **Configure the MCP server to use your host system:**
Add the `CUA_USE_HOST_COMPUTER_SERVER` environment variable to your MCP client configuration:
<Tabs items={['Claude Desktop', 'Other MCP Clients']}>
<Tab value="Claude Desktop">
Update your Claude Desktop config (see [Installation](/docs/agent-sdk/mcp-server/installation)) to include the environment variable:
```json
{
"mcpServers": {
"cua-agent": {
"command": "/bin/bash",
"args": ["~/.cua/start_mcp_server.sh"],
"env": {
"CUA_MODEL_NAME": "anthropic/claude-sonnet-4-5-20250929",
"CUA_USE_HOST_COMPUTER_SERVER": "true"
}
}
}
}
```
</Tab>
<Tab value="Other MCP Clients">
Set the environment variable in your MCP client configuration:
```bash
export CUA_USE_HOST_COMPUTER_SERVER=true
```
Then start your MCP client as usual.
</Tab>
</Tabs>
3. **Restart your MCP client** (e.g., Claude Desktop) to apply the changes.
Now Claude will control your local desktop directly when you ask it to perform computer tasks.
## Usage Examples
### Single Task Execution
```
"Open Safari and navigate to apple.com"
"Create a new folder on the desktop called 'My Projects'"
"Take a screenshot of the current screen"
```
### Multi-Task Execution (Sequential)
```
"Run these tasks in order: 1) Open Finder, 2) Navigate to Documents folder, 3) Create a new folder called 'Work'"
```
### Multi-Task Execution (Concurrent)
```
"Run these tasks simultaneously: 1) Open Chrome, 2) Open Safari, 3) Open Finder"
```
### Session Management
```
"Show me the current session statistics"
"Take a screenshot using session abc123"
"Cleanup session xyz789"
```
### Error Recovery
```
"Try to open a non-existent application and show me the error"
"Find all files with .tmp extension and delete them safely"
```
## First-time Usage Notes
**API Keys**: Ensure you have valid API keys:
- Add your Anthropic API key in the Claude Desktop config (as shown above)
- Or set it as an environment variable in your shell profile
- **Required**: The MCP server needs an API key to authenticate with the model provider
**Model Selection**: Choose the appropriate model for your needs:
- **Claude Sonnet 4.5**: Latest model with best performance (`anthropic/claude-sonnet-4-5-20250929`)
- **Computer-Use Preview**: Specialized for computer tasks (`openai/computer-use-preview`)
- **Local Models**: For privacy-sensitive environments
- **Ollama**: For offline usage
```
--------------------------------------------------------------------------------
/libs/lume/src/VNC/VNCService.swift:
--------------------------------------------------------------------------------
```swift
import Foundation
import Dynamic
import Virtualization
/// Protocol defining the interface for VNC server operations
@MainActor
protocol VNCService {
var url: String? { get }
func start(port: Int, virtualMachine: Any?) async throws
func stop()
func openClient(url: String) async throws
}
/// Default implementation of VNCService
@MainActor
final class DefaultVNCService: VNCService {
private var vncServer: Any?
private let vmDirectory: VMDirectory
init(vmDirectory: VMDirectory) {
self.vmDirectory = vmDirectory
}
var url: String? {
get {
return try? vmDirectory.loadSession().url
}
}
func start(port: Int, virtualMachine: Any?) async throws {
let password = Array(PassphraseGenerator().prefix(4)).joined(separator: "-")
let securityConfiguration = Dynamic._VZVNCAuthenticationSecurityConfiguration(password: password)
// Create VNC server with specified port
let server = Dynamic._VZVNCServer(port: port, queue: DispatchQueue.main,
securityConfiguration: securityConfiguration)
if let vm = virtualMachine as? VZVirtualMachine {
server.virtualMachine = vm
}
server.start()
vncServer = server
// Wait for port to be assigned (both for auto-assign and specific port)
var attempts = 0
let maxAttempts = 20 // 1 second total wait time
while true {
if let assignedPort: UInt16 = server.port.asUInt16 {
// If we got a non-zero port, check if it matches our request
if assignedPort != 0 {
// For specific port requests, verify we got the requested port
if port != 0 && Int(assignedPort) != port {
throw VMError.vncPortBindingFailed(requested: port, actual: Int(assignedPort))
}
// Get the local IP address for the URL - prefer IPv4
let hostIP = try getLocalIPAddress() ?? "127.0.0.1"
let url = "vnc://:\(password)@127.0.0.1:\(assignedPort)" // Use localhost for local connections
let externalUrl = "vnc://:\(password)@\(hostIP):\(assignedPort)" // External URL for remote connections
Logger.info("VNC server started", metadata: [
"local": url,
"external": externalUrl
])
// Save session information with local URL for the client
let session = VNCSession(url: url)
try vmDirectory.saveSession(session)
break
}
}
attempts += 1
if attempts >= maxAttempts {
// If we've timed out and we requested a specific port, it likely means binding failed
vncServer = nil
if port != 0 {
throw VMError.vncPortBindingFailed(requested: port, actual: -1)
}
throw VMError.internalError("Timeout waiting for VNC server to start")
}
try await Task.sleep(nanoseconds: 50_000_000) // 50ms delay between checks
}
}
// Modified to prefer IPv4 addresses
private func getLocalIPAddress() throws -> String? {
var address: String?
var ifaddr: UnsafeMutablePointer<ifaddrs>?
guard getifaddrs(&ifaddr) == 0 else {
return nil
}
defer { freeifaddrs(ifaddr) }
var ptr = ifaddr
while ptr != nil {
defer { ptr = ptr?.pointee.ifa_next }
let interface = ptr?.pointee
let family = interface?.ifa_addr.pointee.sa_family
// Only look for IPv4 addresses
if family == UInt8(AF_INET) {
let name = String(cString: (interface?.ifa_name)!)
if name == "en0" { // Primary interface
var hostname = [CChar](repeating: 0, count: Int(NI_MAXHOST))
getnameinfo(interface?.ifa_addr,
socklen_t((interface?.ifa_addr.pointee.sa_len)!),
&hostname,
socklen_t(hostname.count),
nil,
0,
NI_NUMERICHOST)
address = String(cString: hostname, encoding: .utf8)
break
}
}
}
return address
}
func stop() {
if let server = vncServer as? Dynamic {
server.stop()
}
vncServer = nil
vmDirectory.clearSession()
}
func openClient(url: String) async throws {
let processRunner = DefaultProcessRunner()
try processRunner.run(executable: "/usr/bin/open", arguments: [url])
}
}
```
--------------------------------------------------------------------------------
/libs/typescript/agent/examples/playground-example.html:
--------------------------------------------------------------------------------
```html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>CUA Agent Playground Example</title>
</head>
<body>
<h1>CUA Agent Playground Example</h1>
<div>
<h2>Configuration</h2>
<label for="url">Agent URL:</label><br>
<input type="text" id="url" placeholder="https://localhost:8000 or peer://peer-id" value="https://localhost:8000" style="width: 400px;"><br><br>
<label for="model">Model:</label><br>
<input type="text" id="model" placeholder="anthropic/claude-opus-4-1-20250805" value="anthropic/claude-opus-4-1-20250805" style="width: 400px;"><br><br>
</div>
<div>
<h2>Chat</h2>
<label for="message">Message:</label><br>
<input type="text" id="message" placeholder="Enter your message here..." style="width: 400px;"><br><br>
<button onclick="sendMessage()">Send Message</button>
<!-- <button onclick="checkHealth()">Check Health</button> -->
<button onclick="clearOutput()">Clear Output</button><br><br>
<label for="output">Output:</label><br>
<textarea id="output" rows="20" cols="80" readonly></textarea>
</div>
<script src="https://unpkg.com/[email protected]/dist/peerjs.min.js"></script>
<script type="module">
// Import the AgentClient from the built library
import AgentClient from '/dist/index.js';
let client = null;
// Make functions available globally
window.sendMessage = sendMessage;
window.checkHealth = checkHealth;
window.clearOutput = clearOutput;
function log(message) {
const output = document.getElementById('output');
const timestamp = new Date().toLocaleTimeString();
output.value += `[${timestamp}] ${message}\n`;
output.scrollTop = output.scrollHeight;
}
function getClient() {
const url = document.getElementById('url').value.trim();
if (!url) {
log('ERROR: Please enter a URL');
return null;
}
// Create new client if URL changed or client doesn't exist
if (!client || client.url !== url) {
try {
client = new AgentClient(url);
client.url = url; // Store URL for comparison
log(`Created new client for: ${url}`);
} catch (error) {
log(`ERROR creating client: ${error.message}`);
return null;
}
}
return client;
}
async function sendMessage() {
const messageInput = document.getElementById('message');
const modelInput = document.getElementById('model');
const message = messageInput.value.trim();
const model = modelInput.value.trim();
if (!message) {
log('ERROR: Please enter a message');
return;
}
if (!model) {
log('ERROR: Please enter a model');
return;
}
const agentClient = getClient();
if (!agentClient) return;
try {
log(`Sending message: "${message}"`);
log(`Using model: ${model}`);
const request = {
model: model,
input: message
};
log('Sending request...');
const response = await agentClient.responses.create(request);
log('Response received:');
log(JSON.stringify(response, null, 2));
// Clear the message input
messageInput.value = '';
} catch (error) {
log(`ERROR: ${error.message}`);
}
}
async function checkHealth() {
const agentClient = getClient();
if (!agentClient) return;
try {
log('Checking health...');
const health = await agentClient.health();
log(`Health status: ${health.status}`);
} catch (error) {
log(`ERROR checking health: ${error.message}`);
}
}
function clearOutput() {
document.getElementById('output').value = '';
}
// Allow sending message with Enter key
document.getElementById('message').addEventListener('keypress', function(e) {
if (e.key === 'Enter') {
sendMessage();
}
});
// Log initial message
log('CUA Agent Client Browser Example loaded');
log('Enter a URL (HTTP/HTTPS or peer://) and model, then send a message');
</script>
</body>
</html>
```
--------------------------------------------------------------------------------
/docs/src/assets/logo-black.svg:
--------------------------------------------------------------------------------
```
<?xml version="1.0" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 20010904//EN"
"http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">
<svg version="1.0" xmlns="http://www.w3.org/2000/svg"
width="1000.000000pt" height="1000.000000pt" viewBox="0 0 1000.000000 1000.000000"
preserveAspectRatio="xMidYMid meet">
<g transform="translate(0.000000,1000.000000) scale(0.100000,-0.100000)"
fill="#000000" stroke="none">
<path d="M4934 9086 c-40 -14 -62 -33 -80 -69 -22 -42 -21 -994 1 -1037 38
-73 174 -101 243 -50 19 14 43 42 53 62 18 35 19 65 19 510 0 471 0 473 -23
513 -38 69 -133 101 -213 71z"/>
<path d="M3702 8472 c-52 -28 -82 -81 -82 -147 0 -67 8 -80 125 -210 44 -49
107 -121 139 -160 165 -196 233 -268 278 -291 58 -29 66 -30 124 -2 67 31 104
86 104 154 0 60 -14 82 -149 235 -42 47 -95 108 -117 135 -23 27 -52 61 -65
75 -13 14 -57 65 -98 112 -41 47 -89 93 -107 102 -42 20 -111 19 -152 -3z"/>
<path d="M6145 8472 c-29 -18 -136 -133 -235 -252 -53 -64 -190 -222 -230
-265 -37 -41 -70 -108 -70 -142 0 -16 10 -49 23 -73 17 -36 33 -51 79 -73 57
-29 57 -29 107 -12 44 14 63 31 149 128 54 62 122 141 151 177 30 36 57 67 60
70 12 10 157 175 179 204 33 43 31 150 -2 188 -56 64 -151 86 -211 50z"/>
<path d="M2245 7400 c-188 -14 -374 -75 -585 -191 -222 -123 -464 -366 -577
-579 -13 -25 -28 -52 -33 -60 -74 -123 -137 -348 -161 -580 -10 -106 1 -310
22 -384 5 -17 9 -44 9 -60 0 -72 116 -366 181 -458 11 -14 19 -29 19 -33 0
-33 296 -355 326 -355 7 0 14 -4 16 -10 5 -17 139 -99 243 -150 106 -52 216
-91 303 -109 98 -20 92 -7 92 -215 0 -176 26 -472 50 -571 5 -22 12 -56 15
-75 8 -44 31 -129 56 -201 10 -31 19 -62 19 -69 0 -8 8 -32 19 -54 10 -23 30
-70 45 -106 76 -182 189 -363 319 -515 296 -344 701 -603 1162 -743 216 -66
521 -126 730 -143 335 -27 467 -31 653 -19 103 6 237 15 297 19 120 8 282 32
415 62 47 10 98 19 113 19 16 0 37 5 48 11 11 5 48 16 82 24 34 7 85 21 112
31 104 36 161 58 201 76 22 10 43 18 47 18 12 0 185 85 263 131 44 25 116 71
159 100 43 30 87 61 99 68 107 74 344 310 444 444 40 53 72 98 72 101 0 2 17
31 38 63 68 104 202 390 202 431 0 10 4 22 9 28 12 12 53 168 80 304 30 149
43 293 48 538 l5 214 33 14 c18 7 53 16 77 20 23 4 48 10 53 14 6 4 28 13 50
19 91 27 214 86 318 152 224 141 416 353 524 580 98 206 129 320 153 562 19
189 -20 467 -92 657 -144 382 -420 674 -811 859 -48 22 -93 41 -101 41 -7 0
-35 8 -62 19 -27 10 -92 29 -144 41 -84 20 -119 23 -325 22 -212 0 -238 -2
-330 -25 -55 -14 -131 -37 -170 -52 -38 -15 -84 -32 -101 -39 -18 -6 -38 -16
-45 -22 -8 -6 -27 -18 -44 -26 -79 -40 -121 -67 -205 -134 -69 -54 -225 -212
-255 -257 -21 -32 -26 -33 -84 -6 -25 12 -64 29 -86 40 -183 84 -514 183 -705
209 -41 6 -91 15 -110 20 -50 13 -318 30 -470 30 -159 0 -363 -16 -450 -35
-36 -8 -87 -17 -115 -20 -48 -7 -178 -36 -240 -55 -84 -26 -222 -71 -240 -79
-11 -4 -47 -19 -80 -31 -77 -30 -162 -66 -198 -85 -32 -17 -67 -20 -67 -6 0
16 -211 230 -274 279 -96 74 -124 92 -237 149 -204 102 -346 139 -569 146 -85
2 -200 1 -255 -3z m396 -331 c163 -33 302 -93 433 -184 97 -68 232 -206 299
-307 32 -48 70 -94 85 -104 38 -25 155 -24 185 3 28 24 183 99 302 146 180 70
201 77 214 77 8 0 39 8 70 19 77 26 221 57 376 82 111 17 173 20 418 20 159 0
305 -5 325 -10 21 -5 71 -14 112 -21 178 -28 372 -81 590 -161 65 -24 225
-102 279 -137 48 -30 63 -34 118 -34 78 1 105 20 179 131 65 97 213 245 301
303 74 48 228 128 248 128 6 0 25 6 41 14 61 30 229 56 359 56 202 0 365 -39
550 -131 285 -142 521 -410 616 -699 108 -331 69 -692 -109 -995 -79 -134
-217 -274 -366 -369 -63 -40 -221 -116 -242 -116 -8 0 -28 -7 -44 -15 -16 -8
-55 -19 -87 -24 -230 -37 -274 -55 -306 -124 -15 -30 -16 -58 -7 -238 18 -382
-25 -716 -128 -994 -63 -171 -182 -380 -298 -523 -59 -74 -186 -204 -244 -251
-25 -20 -54 -44 -65 -54 -26 -24 -178 -128 -235 -161 -25 -14 -88 -46 -140
-72 -52 -25 -106 -51 -120 -58 -34 -18 -216 -80 -315 -107 -114 -31 -197 -48
-410 -85 -126 -21 -452 -46 -625 -48 -376 -3 -837 62 -1105 155 -16 6 -50 17
-75 24 -72 21 -256 98 -320 135 -8 5 -40 21 -70 36 -63 31 -172 103 -277 181
-199 148 -392 374 -504 588 -118 228 -190 479 -220 775 -11 113 -7 483 7 597
5 42 2 62 -15 96 -37 77 -60 86 -318 127 -29 4 -67 15 -84 24 -18 9 -41 16
-52 16 -10 0 -36 8 -56 18 -20 10 -58 30 -86 43 -139 67 -301 202 -395 329
-150 203 -229 445 -230 705 0 331 117 613 355 850 175 176 364 280 615 339 96
22 103 23 243 25 95 1 154 -4 228 -20z"/>
<path d="M3464 5185 c-17 -8 -43 -28 -58 -45 l-26 -32 0 -265 c0 -249 1 -268
20 -298 38 -62 51 -65 244 -65 l175 0 36 34 37 35 -4 283 c-4 378 13 353 -253
362 -108 4 -147 2 -171 -9z"/>
<path d="M6174 5171 c-12 -5 -31 -22 -43 -37 -22 -28 -22 -32 -19 -309 l3
-281 25 -31 25 -32 189 0 188 -1 41 40 40 40 -5 253 c-6 260 -10 288 -53 342
-15 18 -29 20 -193 22 -97 1 -187 -2 -198 -6z"/>
<path d="M4935 5079 c-199 -25 -341 -112 -454 -278 -49 -71 -134 -238 -151
-296 -7 -22 -21 -59 -31 -83 -11 -23 -19 -50 -19 -60 0 -9 -7 -37 -15 -60 -9
-24 -20 -69 -25 -100 -5 -32 -16 -93 -25 -137 -12 -59 -16 -144 -17 -325 -1
-238 0 -247 25 -321 63 -188 164 -313 318 -394 86 -45 137 -61 274 -85 236
-42 492 -10 651 81 238 137 348 357 348 699 0 89 -21 335 -34 390 -6 25 -15
70 -20 100 -5 30 -15 71 -21 90 -6 19 -15 51 -19 70 -24 100 -107 282 -186
406 -59 94 -167 193 -265 242 -46 23 -93 42 -104 42 -12 0 -25 4 -30 9 -15 13
-132 19 -200 10z"/>
</g>
</svg>
```
--------------------------------------------------------------------------------
/docs/src/assets/logo-white.svg:
--------------------------------------------------------------------------------
```
<?xml version="1.0" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 20010904//EN"
"http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">
<svg version="1.0" xmlns="http://www.w3.org/2000/svg"
width="1000.000000pt" height="1000.000000pt" viewBox="0 0 1000.000000 1000.000000"
preserveAspectRatio="xMidYMid meet">
<g transform="translate(0.000000,1000.000000) scale(0.100000,-0.100000)"
fill="#ffffff" stroke="none">
<path d="M4934 9086 c-40 -14 -62 -33 -80 -69 -22 -42 -21 -994 1 -1037 38
-73 174 -101 243 -50 19 14 43 42 53 62 18 35 19 65 19 510 0 471 0 473 -23
513 -38 69 -133 101 -213 71z"/>
<path d="M3702 8472 c-52 -28 -82 -81 -82 -147 0 -67 8 -80 125 -210 44 -49
107 -121 139 -160 165 -196 233 -268 278 -291 58 -29 66 -30 124 -2 67 31 104
86 104 154 0 60 -14 82 -149 235 -42 47 -95 108 -117 135 -23 27 -52 61 -65
75 -13 14 -57 65 -98 112 -41 47 -89 93 -107 102 -42 20 -111 19 -152 -3z"/>
<path d="M6145 8472 c-29 -18 -136 -133 -235 -252 -53 -64 -190 -222 -230
-265 -37 -41 -70 -108 -70 -142 0 -16 10 -49 23 -73 17 -36 33 -51 79 -73 57
-29 57 -29 107 -12 44 14 63 31 149 128 54 62 122 141 151 177 30 36 57 67 60
70 12 10 157 175 179 204 33 43 31 150 -2 188 -56 64 -151 86 -211 50z"/>
<path d="M2245 7400 c-188 -14 -374 -75 -585 -191 -222 -123 -464 -366 -577
-579 -13 -25 -28 -52 -33 -60 -74 -123 -137 -348 -161 -580 -10 -106 1 -310
22 -384 5 -17 9 -44 9 -60 0 -72 116 -366 181 -458 11 -14 19 -29 19 -33 0
-33 296 -355 326 -355 7 0 14 -4 16 -10 5 -17 139 -99 243 -150 106 -52 216
-91 303 -109 98 -20 92 -7 92 -215 0 -176 26 -472 50 -571 5 -22 12 -56 15
-75 8 -44 31 -129 56 -201 10 -31 19 -62 19 -69 0 -8 8 -32 19 -54 10 -23 30
-70 45 -106 76 -182 189 -363 319 -515 296 -344 701 -603 1162 -743 216 -66
521 -126 730 -143 335 -27 467 -31 653 -19 103 6 237 15 297 19 120 8 282 32
415 62 47 10 98 19 113 19 16 0 37 5 48 11 11 5 48 16 82 24 34 7 85 21 112
31 104 36 161 58 201 76 22 10 43 18 47 18 12 0 185 85 263 131 44 25 116 71
159 100 43 30 87 61 99 68 107 74 344 310 444 444 40 53 72 98 72 101 0 2 17
31 38 63 68 104 202 390 202 431 0 10 4 22 9 28 12 12 53 168 80 304 30 149
43 293 48 538 l5 214 33 14 c18 7 53 16 77 20 23 4 48 10 53 14 6 4 28 13 50
19 91 27 214 86 318 152 224 141 416 353 524 580 98 206 129 320 153 562 19
189 -20 467 -92 657 -144 382 -420 674 -811 859 -48 22 -93 41 -101 41 -7 0
-35 8 -62 19 -27 10 -92 29 -144 41 -84 20 -119 23 -325 22 -212 0 -238 -2
-330 -25 -55 -14 -131 -37 -170 -52 -38 -15 -84 -32 -101 -39 -18 -6 -38 -16
-45 -22 -8 -6 -27 -18 -44 -26 -79 -40 -121 -67 -205 -134 -69 -54 -225 -212
-255 -257 -21 -32 -26 -33 -84 -6 -25 12 -64 29 -86 40 -183 84 -514 183 -705
209 -41 6 -91 15 -110 20 -50 13 -318 30 -470 30 -159 0 -363 -16 -450 -35
-36 -8 -87 -17 -115 -20 -48 -7 -178 -36 -240 -55 -84 -26 -222 -71 -240 -79
-11 -4 -47 -19 -80 -31 -77 -30 -162 -66 -198 -85 -32 -17 -67 -20 -67 -6 0
16 -211 230 -274 279 -96 74 -124 92 -237 149 -204 102 -346 139 -569 146 -85
2 -200 1 -255 -3z m396 -331 c163 -33 302 -93 433 -184 97 -68 232 -206 299
-307 32 -48 70 -94 85 -104 38 -25 155 -24 185 3 28 24 183 99 302 146 180 70
201 77 214 77 8 0 39 8 70 19 77 26 221 57 376 82 111 17 173 20 418 20 159 0
305 -5 325 -10 21 -5 71 -14 112 -21 178 -28 372 -81 590 -161 65 -24 225
-102 279 -137 48 -30 63 -34 118 -34 78 1 105 20 179 131 65 97 213 245 301
303 74 48 228 128 248 128 6 0 25 6 41 14 61 30 229 56 359 56 202 0 365 -39
550 -131 285 -142 521 -410 616 -699 108 -331 69 -692 -109 -995 -79 -134
-217 -274 -366 -369 -63 -40 -221 -116 -242 -116 -8 0 -28 -7 -44 -15 -16 -8
-55 -19 -87 -24 -230 -37 -274 -55 -306 -124 -15 -30 -16 -58 -7 -238 18 -382
-25 -716 -128 -994 -63 -171 -182 -380 -298 -523 -59 -74 -186 -204 -244 -251
-25 -20 -54 -44 -65 -54 -26 -24 -178 -128 -235 -161 -25 -14 -88 -46 -140
-72 -52 -25 -106 -51 -120 -58 -34 -18 -216 -80 -315 -107 -114 -31 -197 -48
-410 -85 -126 -21 -452 -46 -625 -48 -376 -3 -837 62 -1105 155 -16 6 -50 17
-75 24 -72 21 -256 98 -320 135 -8 5 -40 21 -70 36 -63 31 -172 103 -277 181
-199 148 -392 374 -504 588 -118 228 -190 479 -220 775 -11 113 -7 483 7 597
5 42 2 62 -15 96 -37 77 -60 86 -318 127 -29 4 -67 15 -84 24 -18 9 -41 16
-52 16 -10 0 -36 8 -56 18 -20 10 -58 30 -86 43 -139 67 -301 202 -395 329
-150 203 -229 445 -230 705 0 331 117 613 355 850 175 176 364 280 615 339 96
22 103 23 243 25 95 1 154 -4 228 -20z"/>
<path d="M3464 5185 c-17 -8 -43 -28 -58 -45 l-26 -32 0 -265 c0 -249 1 -268
20 -298 38 -62 51 -65 244 -65 l175 0 36 34 37 35 -4 283 c-4 378 13 353 -253
362 -108 4 -147 2 -171 -9z"/>
<path d="M6174 5171 c-12 -5 -31 -22 -43 -37 -22 -28 -22 -32 -19 -309 l3
-281 25 -31 25 -32 189 0 188 -1 41 40 40 40 -5 253 c-6 260 -10 288 -53 342
-15 18 -29 20 -193 22 -97 1 -187 -2 -198 -6z"/>
<path d="M4935 5079 c-199 -25 -341 -112 -454 -278 -49 -71 -134 -238 -151
-296 -7 -22 -21 -59 -31 -83 -11 -23 -19 -50 -19 -60 0 -9 -7 -37 -15 -60 -9
-24 -20 -69 -25 -100 -5 -32 -16 -93 -25 -137 -12 -59 -16 -144 -17 -325 -1
-238 0 -247 25 -321 63 -188 164 -313 318 -394 86 -45 137 -61 274 -85 236
-42 492 -10 651 81 238 137 348 357 348 699 0 89 -21 335 -34 390 -6 25 -15
70 -20 100 -5 30 -15 71 -21 90 -6 19 -15 51 -19 70 -24 100 -107 282 -186
406 -59 94 -167 193 -265 242 -46 23 -93 42 -104 42 -12 0 -25 4 -30 9 -15 13
-132 19 -200 10z"/>
</g>
</svg>
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/adapters/cua_adapter.py:
--------------------------------------------------------------------------------
```python
import os
from typing import Any, AsyncIterator, Iterator
from litellm import acompletion, completion
from litellm.llms.custom_llm import CustomLLM
from litellm.types.utils import GenericStreamingChunk, ModelResponse
class CUAAdapter(CustomLLM):
def __init__(self, base_url: str | None = None, api_key: str | None = None, **_: Any):
super().__init__()
self.base_url = base_url or os.environ.get("CUA_BASE_URL") or "https://inference.cua.ai/v1"
self.api_key = (
api_key or os.environ.get("CUA_INFERENCE_API_KEY") or os.environ.get("CUA_API_KEY")
)
def _normalize_model(self, model: str) -> str:
# Accept either "cua/<model>" or raw "<model>"
return model.split("/", 1)[1] if model and model.startswith("cua/") else model
def completion(self, *args, **kwargs) -> ModelResponse:
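        # Route Anthropic models through litellm's Anthropic provider (which expects
        # the base URL without the trailing /v1); all other models go through the
        # OpenAI-compatible endpoint.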
model = kwargs.get("model", "")
api_base = kwargs.get("api_base") or self.base_url
if "anthropic/" in model:
model = f"anthropic/{self._normalize_model(model)}"
api_base = api_base.removesuffix("/v1")
else:
model = f"openai/{self._normalize_model(model)}"
params = {
"model": model,
"messages": kwargs.get("messages", []),
"api_base": api_base,
"api_key": kwargs.get("api_key") or self.api_key,
"stream": False,
}
if "optional_params" in kwargs:
params.update(kwargs["optional_params"])
del kwargs["optional_params"]
if "headers" in kwargs:
params["headers"] = kwargs["headers"]
del kwargs["headers"]
        # Collect parameters that will be dropped (debug print below is disabled)
original_keys = set(kwargs.keys())
used_keys = set(params.keys()) # Only these are extracted from kwargs
ignored_keys = {
"litellm_params",
"client",
"print_verbose",
"acompletion",
"timeout",
"logging_obj",
"encoding",
"custom_prompt_dict",
"model_response",
"logger_fn",
}
dropped_keys = original_keys - used_keys - ignored_keys
if dropped_keys:
dropped_keyvals = {k: kwargs[k] for k in dropped_keys}
# print(f"CUAAdapter.completion: Dropped parameters: {dropped_keyvals}")
return completion(**params) # type: ignore
async def acompletion(self, *args, **kwargs) -> ModelResponse:
model = kwargs.get("model", "")
api_base = kwargs.get("api_base") or self.base_url
if "anthropic/" in model:
model = f"anthropic/{self._normalize_model(model)}"
api_base = api_base.removesuffix("/v1")
else:
model = f"openai/{self._normalize_model(model)}"
params = {
"model": model,
"messages": kwargs.get("messages", []),
"api_base": api_base,
"api_key": kwargs.get("api_key") or self.api_key,
"stream": False,
}
if "optional_params" in kwargs:
params.update(kwargs["optional_params"])
del kwargs["optional_params"]
if "headers" in kwargs:
params["headers"] = kwargs["headers"]
del kwargs["headers"]
        # Collect parameters that will be dropped (debug print below is disabled)
original_keys = set(kwargs.keys())
used_keys = set(params.keys()) # Only these are extracted from kwargs
ignored_keys = {
"litellm_params",
"client",
"print_verbose",
"acompletion",
"timeout",
"logging_obj",
"encoding",
"custom_prompt_dict",
"model_response",
"logger_fn",
}
dropped_keys = original_keys - used_keys - ignored_keys
if dropped_keys:
dropped_keyvals = {k: kwargs[k] for k in dropped_keys}
# print(f"CUAAdapter.acompletion: Dropped parameters: {dropped_keyvals}")
response = await acompletion(**params) # type: ignore
return response
def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
params = dict(kwargs)
inner_model = self._normalize_model(params.get("model", ""))
params.update(
{
"model": f"openai/{inner_model}",
"api_base": self.base_url,
"api_key": self.api_key,
"stream": True,
}
)
# Yield chunks directly from LiteLLM's streaming generator
for chunk in completion(**params): # type: ignore
yield chunk # type: ignore
async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
params = dict(kwargs)
inner_model = self._normalize_model(params.get("model", ""))
params.update(
{
"model": f"openai/{inner_model}",
"api_base": self.base_url,
"api_key": self.api_key,
"stream": True,
}
)
stream = await acompletion(**params) # type: ignore
async for chunk in stream: # type: ignore
yield chunk # type: ignore
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/integrations/hud.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: HUD Evals
description: Use ComputerAgent with HUD for benchmarking and evaluation
---
<Callout>
A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.
</Callout>
The HUD integration allows an agent to be benchmarked using the [HUD framework](https://www.hud.so/). Through the HUD integration, the agent controls a computer inside HUD, where tests are run to evaluate the success of each task.
## Installation
First, install the required package:
```bash
pip install "cua-agent[hud]"
# Or install hud-python directly:
# pip install hud-python==0.4.12
```
## Environment Variables
Before running any evaluations, you’ll need to set up your environment variables for HUD and your model providers:
```bash
# HUD access
export HUD_API_KEY="your_hud_api_key"
# Model provider keys (at least one required)
export OPENAI_API_KEY="your_openai_key"
export ANTHROPIC_API_KEY="your_anthropic_key"
```
## Running a Single Task
You can run a single task from a HUD dataset for quick verification.
### Example
```python
from agent.integrations.hud import run_single_task
await run_single_task(
dataset="hud-evals/OSWorld-Verified", # or another HUD dataset
model="openai/computer-use-preview+openai/gpt-5-nano", # any supported model string
task_id=155, # e.g., reopen last closed tab
)
```
### Parameters
- `task_id` (`int`): Default: `0`
Index of the task to run from the dataset.
## Running a Full Dataset
To benchmark your agent at scale, you can run an entire dataset (or a subset) in parallel.
### Example
```python
from agent.integrations.hud import run_full_dataset
results = await run_full_dataset(
dataset="hud-evals/OSWorld-Verified", # can also pass a Dataset or list[dict]
model="openai/computer-use-preview",
split="train[:3]", # try a few tasks to start
max_concurrent=20, # tune to your infra
max_steps=50 # safety cap per task
)
```
### Parameters
- `job_name` (`str` | `None`):
Optional human-readable name for the evaluation job (shows up in HUD UI).
- `max_concurrent` (`int`): Default: `30`
Number of tasks to run in parallel. Scale this based on your infra.
- `max_steps` (`int`): Default: `50`
Safety cap on steps per task to prevent infinite loops.
- `split` (`str`): Default: `"train"`
Dataset split or subset to run. Uses the [Hugging Face split format](https://huggingface.co/docs/datasets/v1.11.0/splits.html), e.g., `"train[:10]"` for the first 10 tasks.
## Additional Parameters
Both single-task and full-dataset runs share a common set of configuration options; these let you fine-tune how the evaluation runs. A combined sketch follows the list.
- `dataset` (`str` | `Dataset` | `list[dict]`): **Required**
HUD dataset name (e.g. `"hud-evals/OSWorld-Verified"`), a loaded `Dataset`, or a list of tasks.
- `model` (`str`): Default: `"computer-use-preview"`
Model string, e.g. `"openai/computer-use-preview+openai/gpt-5-nano"`. Supports composition with `+` (planning + grounding).
- `allowed_tools` (`list[str]`): Default: `["openai_computer"]`
Restrict which tools the agent may use.
- `tools` (`list[Any]`):
Extra tool configs to inject.
- `custom_loop` (`Callable`):
Optional custom agent loop function. If provided, overrides automatic loop selection.
- `only_n_most_recent_images` (`int`): Default: `5` for full dataset, `None` for single task.
Retain only the last N screenshots in memory.
- `callbacks` (`list[Any]`):
Hook functions for logging, telemetry, or side effects.
- `verbosity` (`int`):
Logging level. Set `2` for debugging every call/action.
- `trajectory_dir` (`str` | `dict`):
Save local copies of trajectories for replay/analysis.
- `max_retries` (`int`): Default: `3`
Number of retries for failed model/tool calls.
- `screenshot_delay` (`float` | `int`): Default: `0.5`
Delay (seconds) between screenshots to avoid race conditions.
- `use_prompt_caching` (`bool`): Default: `False`
Cache repeated prompts to reduce API calls.
- `max_trajectory_budget` (`float` | `dict`):
Limit on trajectory size/budget (e.g., tokens, steps).
- `telemetry_enabled` (`bool`): Default: `True`
Whether to send telemetry/traces to HUD.
- `**kwargs` (`any`):
Any additional keyword arguments are passed through to the agent loop or model provider.
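As a combined sketch, here is an illustrative `run_full_dataset` call exercising several of these options (the dataset, model string, job name, and paths are placeholders):
```python
from agent.integrations.hud import run_full_dataset
results = await run_full_dataset(
    dataset="hud-evals/OSWorld-Verified",
    model="openai/computer-use-preview+openai/gpt-5-nano",
    job_name="osworld-smoke-test",   # appears in the HUD UI
    split="train[:10]",              # first 10 tasks
    max_concurrent=10,
    max_steps=50,
    only_n_most_recent_images=5,
    verbosity=2,                     # log every model call and action
    trajectory_dir="./trajectories", # keep local copies for replay
)
```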
## Available Benchmarks
HUD provides multiple benchmark datasets for realistic evaluation.
1. **[OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified)** – Benchmark on 369+ real-world desktop tasks across Chrome, LibreOffice, GIMP, VS Code, etc.
_Best for_: evaluating full computer-use agents in realistic environments.
_Verified variant_: fixes 300+ issues from earlier versions for reliability.
**Coming soon:** SheetBench (spreadsheet automation) and other specialized HUD datasets.
See the [HUD docs](https://docs.hud.so/environment-creation) for more eval environments.
## Tips
- **Debugging:** set `verbosity=2` to see every model call and tool action.
- **Performance:** lower `screenshot_delay` for faster runs; raise it if you see race conditions.
- **Safety:** always set `max_steps` (defaults to 50) to prevent runaway loops.
- **Custom tools:** pass extra `tools=[...]` into the agent config if you need tools beyond `openai_computer`.
```
--------------------------------------------------------------------------------
/docs/content/docs/computer-sdk/cloud-vm-management.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Cloud Sandbox Management
description: Manage your Cua Cloud sandboxes via Python SDK or HTTP API
---
import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
Using the Cua Cloud API, you can manage your Cua Cloud sandboxes with Python or HTTP (curl).
All examples require a CUA API key. You can obtain one from the [Dashboard](https://www.cua.ai/dashboard/keys).
---
## List Sandboxes
<Tabs items={['Python', 'curl']}>
<Tab value="Python">
```python
import asyncio
from computer.providers.cloud.provider import CloudProvider
async def main():
# CloudProvider automatically reads CUA_API_KEY from environment
# You can also pass api_key explicitly: CloudProvider(api_key="your-api-key")
# Optional: point to a different API base
# os.environ["CUA_API_BASE"] = "https://api.cua.ai"
provider = CloudProvider(verbose=False)
async with provider:
vms = await provider.list_vms()
for vm in vms:
print({
"name": vm["name"],
"status": vm["status"],
"api_url": vm.get("api_url"),
"vnc_url": vm.get("vnc_url"),
})
if __name__ == "__main__":
asyncio.run(main())
```
</Tab>
<Tab value="curl">
```bash
curl -H "Authorization: Bearer $CUA_API_KEY" \
"https://api.cua.ai/v1/vms"
```
Responses:
- 200: Array of minimal sandbox objects with fields `{ name, password, status }`
- 401: Unauthorized (missing/invalid API key)
```json
[
{
"name": "s-windows-x4snp46ebf",
"password": "49b8daa3",
"status": "running"
}
]
```
Status values:
- `pending`: Sandbox deployment in progress
- `running`: Sandbox is active and accessible
- `stopped`: Sandbox is stopped but not terminated
- `terminated`: Sandbox has been permanently destroyed
- `failed`: Sandbox deployment or operation failed
</Tab>
</Tabs>
---
## Start a Sandbox
Provide the sandbox name you want to start.
<Tabs items={["Python", "curl"]}>
<Tab value="Python">
```python
import asyncio
from computer.providers.cloud.provider import CloudProvider
async def main():
# CloudProvider automatically reads CUA_API_KEY from environment
name = "my-vm-name" # e.g., "m-linux-96lcxd2c2k"
provider = CloudProvider()
async with provider:
resp = await provider.run_vm(name)
print(resp) # { "name": name, "status": "starting" }
if __name__ == "__main__":
asyncio.run(main())
```
</Tab>
<Tab value="curl">
```bash
curl -X POST \
-H "Authorization: Bearer $CUA_API_KEY" \
"https://api.cua.ai/v1/vms/my-vm-name/start" -i
```
Responses:
- 204: No Content (start accepted)
- 401: Unauthorized (missing/invalid API key)
- 404: Sandbox not found or not owned by the user
```text
HTTP/1.1 204 No Content
```
</Tab>
</Tabs>
---
## Stop a Sandbox
Stops the sandbox asynchronously.
<Tabs items={["Python", "curl"]}>
<Tab value="Python">
```python
import asyncio
from computer.providers.cloud.provider import CloudProvider
async def main():
# CloudProvider automatically reads CUA_API_KEY from environment
name = "my-vm-name"
provider = CloudProvider()
async with provider:
resp = await provider.stop_vm(name)
print(resp) # { "name": name, "status": "stopping" }
if __name__ == "__main__":
asyncio.run(main())
```
</Tab>
<Tab value="curl">
```bash
curl -X POST \
-H "Authorization: Bearer $CUA_API_KEY" \
"https://api.cua.ai/v1/vms/my-vm-name/stop"
```
Responses:
- 202: Accepted with `{ "status": "stopping" }`
- 401: Unauthorized (missing/invalid API key)
- 404: Sandbox not found or not owned by the user
```json
{ "status": "stopping" }
```
</Tab>
</Tabs>
---
## Restart a Sandbox
Restarts the sandbox asynchronously.
<Tabs items={["Python", "curl"]}>
<Tab value="Python">
```python
import asyncio
from computer.providers.cloud.provider import CloudProvider
async def main():
# CloudProvider automatically reads CUA_API_KEY from environment
name = "my-vm-name"
provider = CloudProvider()
async with provider:
resp = await provider.restart_vm(name)
print(resp) # { "name": name, "status": "restarting" }
if __name__ == "__main__":
asyncio.run(main())
```
</Tab>
<Tab value="curl">
```bash
curl -X POST \
-H "Authorization: Bearer $CUA_API_KEY" \
"https://api.cua.ai/v1/vms/my-vm-name/restart"
```
Responses:
- 202: Accepted with `{ "status": "restarting" }`
- 401: Unauthorized (missing/invalid API key)
- 404: Sandbox not found or not owned by the user
```json
{ "status": "restarting" }
```
</Tab>
</Tabs>
---
## Query a Sandbox by Name
Query the computer-server running on the sandbox. Useful for checking details like status or OS type.
<Tabs items={["Python", "curl"]}>
<Tab value="Python">
```python
import asyncio
from computer.providers.cloud.provider import CloudProvider
async def main():
# CloudProvider automatically reads CUA_API_KEY from environment
name = "my-vm-name"
provider = CloudProvider()
async with provider:
info = await provider.get_vm(name)
print(info)
if __name__ == "__main__":
asyncio.run(main())
```
</Tab>
<Tab value="curl">
```bash
curl "https://my-vm-name.containers.cloud.cua.ai:8443/status"
```
Responses:
- 200: Server available
```json
{ "status": "ok", "os_type": "linux", "features": ["agent"] }
```
</Tab>
</Tabs>
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/message-format.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Message Format
---
This page documents the Python message and response schema used by the Agent SDK.
It mirrors the structure shown in Chat History and provides precise type definitions you can target in your own code.
All examples below use Python type hints with `TypedDict` and `Literal` from the standard `typing` module.
## Response
The agent yields response chunks as an async generator of objects with `output` and `usage`.
```python
from typing import List, TypedDict
class Usage(TypedDict, total=False):
prompt_tokens: int
completion_tokens: int
total_tokens: int
response_cost: float # USD cost if available
class AgentResponse(TypedDict):
output: List["AgentMessage"]
usage: Usage
```
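As a consumption sketch, each chunk yielded by the agent conforms to `AgentResponse`. Here `agent` is assumed to be a configured Agent SDK `ComputerAgent`; only the chunk shape is guaranteed by the types above:
```python
# Minimal sketch: iterate the async generator and inspect each chunk.
async def consume(agent, prompt: str) -> None:
    async for chunk in agent.run(prompt):
        for message in chunk["output"]:
            print(message["type"])  # e.g. "reasoning", "computer_call", "message"
        usage = chunk["usage"]
        print(usage.get("total_tokens"), usage.get("response_cost"))
```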
## Messages
Agent messages represent the state of the conversation and the agent's actions.
```python
from typing import List, Literal, Optional, TypedDict, Union
# Union of all message variants
AgentMessage = Union[
"UserMessage",
"AssistantMessage",
"ReasoningMessage",
"ComputerCallMessage",
"ComputerCallOutputMessage",
"FunctionCallMessage",
"FunctionCallOutputMessage",
]
# Input message (role: user/system/developer)
class UserMessage(TypedDict, total=False):
type: Literal["message"] # optional for user input
role: Literal["user", "system", "developer"]
content: Union[str, List["InputContent"]]
# Output message (assistant text)
class AssistantMessage(TypedDict):
type: Literal["message"]
role: Literal["assistant"]
content: List["OutputContent"]
# Output reasoning/thinking message
class ReasoningMessage(TypedDict):
type: Literal["reasoning"]
summary: List["SummaryContent"]
# Output computer action call (agent intends to act)
class ComputerCallMessage(TypedDict):
type: Literal["computer_call"]
call_id: str
status: Literal["completed", "failed", "pending"]
action: "ComputerAction"
# Output computer action result (always a screenshot)
class ComputerCallOutputMessage(TypedDict):
type: Literal["computer_call_output"]
call_id: str
output: "ComputerResultContent"
# Output function call (agent calls a Python tool)
class FunctionCallMessage(TypedDict):
type: Literal["function_call"]
call_id: str
status: Literal["completed", "failed", "pending"]
name: str
arguments: str # JSON-serialized kwargs
# Output function call result (text)
class FunctionCallOutputMessage(TypedDict):
type: Literal["function_call_output"]
call_id: str
output: str
```
## Message Content
These content items appear inside `content` arrays for the message types above.
```python
# Input content kinds (keys besides `type` may be absent)
class InputContent(TypedDict, total=False):
    type: Literal["input_image", "input_text"]
    text: Optional[str]
    image_url: Optional[str]  # e.g., data URL
# Assistant output content
class OutputContent(TypedDict):
type: Literal["output_text"]
text: str
# Reasoning/summary output content
class SummaryContent(TypedDict):
type: Literal["summary_text"]
text: str
# Computer call outputs (screenshots)
class ComputerResultContent(TypedDict):
type: Literal["computer_screenshot", "input_image"]
image_url: str # data URL (e.g., "data:image/png;base64,....")
```
## Actions
Computer actions represent concrete operations the agent will perform on the computer.
Two broad families exist depending on the provider: OpenAI-style and Anthropic-style.
```python
# Union of all supported computer actions
ComputerAction = Union[
"ClickAction",
"DoubleClickAction",
"DragAction",
"KeyPressAction",
"MoveAction",
"ScreenshotAction",
"ScrollAction",
"TypeAction",
"WaitAction",
# Anthropic variants
"LeftMouseDownAction",
"LeftMouseUpAction",
]
# OpenAI Computer Actions
class ClickAction(TypedDict):
type: Literal["click"]
button: Literal["left", "right", "wheel", "back", "forward"]
x: int
y: int
class DoubleClickAction(TypedDict, total=False):
type: Literal["double_click"]
button: Literal["left", "right", "wheel", "back", "forward"]
x: int
y: int
class DragAction(TypedDict, total=False):
type: Literal["drag"]
button: Literal["left", "right", "wheel", "back", "forward"]
path: List[tuple[int, int]] # [(x1, y1), (x2, y2), ...]
class KeyPressAction(TypedDict):
type: Literal["keypress"]
keys: List[str] # e.g., ["ctrl", "a"]
class MoveAction(TypedDict):
type: Literal["move"]
x: int
y: int
class ScreenshotAction(TypedDict):
type: Literal["screenshot"]
class ScrollAction(TypedDict):
type: Literal["scroll"]
scroll_x: int
scroll_y: int
x: int
y: int
class TypeAction(TypedDict):
type: Literal["type"]
text: str
class WaitAction(TypedDict):
type: Literal["wait"]
# Anthropic Computer Actions
class LeftMouseDownAction(TypedDict):
type: Literal["left_mouse_down"]
x: int
y: int
class LeftMouseUpAction(TypedDict):
type: Literal["left_mouse_up"]
x: int
y: int
```
## Notes
- The agent runtime may add provider-specific fields when available (e.g., usage cost). Unknown fields should be ignored for forward compatibility.
- Computer action outputs are screenshots as data URLs. For security and storage, some serializers may redact or omit large fields in persisted metadata.
- The message flow typically alternates between reasoning, actions, screenshots, and concluding assistant text; see the illustrative transcript below and [Chat History](./chat-history) for a step-by-step example.
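A minimal hand-written transcript matching the types above (all values are illustrative):
```python
transcript = [
    {"type": "reasoning", "summary": [{"type": "summary_text", "text": "I need to click the OK button."}]},
    {"type": "computer_call", "call_id": "call_1", "status": "completed",
     "action": {"type": "click", "button": "left", "x": 100, "y": 200}},
    {"type": "computer_call_output", "call_id": "call_1",
     "output": {"type": "computer_screenshot", "image_url": "data:image/png;base64,..."}},
    {"type": "message", "role": "assistant",
     "content": [{"type": "output_text", "text": "Clicked OK."}]},
]
```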
```
--------------------------------------------------------------------------------
/docs/src/components/footer.tsx:
--------------------------------------------------------------------------------
```typescript
export function Footer() {
return (
<footer className="mt-auto border-t border-fd-border py-8">
<div className="container mx-auto px-4">
<div className="grid grid-cols-1 md:grid-cols-4 gap-8 mb-6">
{/* Product Links */}
<div>
<h3 className="font-semibold text-sm mb-3 text-fd-foreground">Product</h3>
<ul className="space-y-2">
<li>
<a
href="https://cua.ai"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Home
</a>
</li>
<li>
<a
href="https://cua.ai/pricing"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Pricing
</a>
</li>
<li>
<a
href="https://cua.ai/#features"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Features
</a>
</li>
</ul>
</div>
{/* Documentation Links */}
<div>
<h3 className="font-semibold text-sm mb-3 text-fd-foreground">Documentation</h3>
<ul className="space-y-2">
<li>
<a
href="/docs"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Getting Started
</a>
</li>
<li>
<a
href="/docs/agent-sdk/agent-loops"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Agent Loops
</a>
</li>
<li>
<a
href="/docs/get-started/quickstart"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Quick Start
</a>
</li>
</ul>
</div>
{/* Resources Links */}
<div>
<h3 className="font-semibold text-sm mb-3 text-fd-foreground">Resources</h3>
<ul className="space-y-2">
<li>
<a
href="https://cua.ai/blog"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Blog
</a>
</li>
<li>
<a
href="https://github.com/trycua/cua"
target="_blank"
rel="noopener noreferrer"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
GitHub
</a>
</li>
<li>
<a
href="https://discord.com/invite/mVnXXpdE85"
target="_blank"
rel="noopener noreferrer"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Discord Community
</a>
</li>
</ul>
</div>
{/* Company Links */}
<div>
<h3 className="font-semibold text-sm mb-3 text-fd-foreground">Company</h3>
<ul className="space-y-2">
<li>
<a
href="https://cua.ai/about"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
About
</a>
</li>
<li>
<a
href="mailto:[email protected]"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Contact
</a>
</li>
<li>
<a
href="https://cua.ai/cookie-policy"
target="_blank"
rel="noopener noreferrer"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Cookie Policy
</a>
</li>
</ul>
</div>
</div>
{/* Bottom Bar */}
<div className="pt-6 border-t border-fd-border flex flex-col md:flex-row justify-between items-center gap-4">
<p className="text-sm text-fd-muted-foreground">
© {new Date().getFullYear()} Cua. All rights reserved.
</p>
<div className="flex gap-4">
<a
href="https://cua.ai/privacy-policy"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Privacy Policy
</a>
<a
href="https://cua.ai/cookie-policy"
className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
>
Cookie Policy
</a>
</div>
</div>
</div>
</footer>
);
}
```
--------------------------------------------------------------------------------
/libs/typescript/agent/src/client.ts:
--------------------------------------------------------------------------------
```typescript
import { Peer } from 'peerjs';
import type { AgentRequest, AgentResponse, ConnectionType, AgentClientOptions } from './types';
export class AgentClient {
private url: string;
private connectionType: ConnectionType;
private options: AgentClientOptions;
private peer?: Peer;
private connection?: any;
constructor(url: string, options: AgentClientOptions = {}) {
this.url = url;
this.options = {
timeout: 30000,
retries: 3,
...options,
};
// Determine connection type from URL
if (url.startsWith('http://') || url.startsWith('https://')) {
this.connectionType = url.startsWith('https://') ? 'https' : 'http';
} else if (url.startsWith('peer://')) {
this.connectionType = 'peer';
} else {
throw new Error('Invalid URL format. Must start with http://, https://, or peer://');
}
}
// Main responses API matching the desired usage pattern
public responses = {
create: async (request: AgentRequest): Promise<AgentResponse> => {
return this.sendRequest(request);
},
};
private async sendRequest(request: AgentRequest): Promise<AgentResponse> {
switch (this.connectionType) {
case 'http':
case 'https':
return this.sendHttpRequest(request);
case 'peer':
return this.sendPeerRequest(request);
default:
throw new Error(`Unsupported connection type: ${this.connectionType}`);
}
}
private async sendHttpRequest(request: AgentRequest): Promise<AgentResponse> {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), this.options.timeout);
try {
const headers: Record<string, string> = {
'Content-Type': 'application/json',
};
if (this.options.apiKey) {
headers['X-API-Key'] = this.options.apiKey;
}
const response = await fetch(`${this.url}/responses`, {
method: 'POST',
headers,
body: JSON.stringify(request),
signal: controller.signal,
});
clearTimeout(timeoutId);
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
const data = await response.json();
return data as AgentResponse;
} catch (error) {
clearTimeout(timeoutId);
if (error instanceof Error) {
throw new Error(`Failed to send HTTP request: ${error.message}`);
}
throw error;
}
}
private async sendPeerRequest(request: AgentRequest): Promise<AgentResponse> {
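    // First call: open a Peer, dial the target peer ID, send the JSON-encoded
    // request, and resolve on the first 'data' event. Subsequent calls reuse the
    // open DataConnection (see the else branch below).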
// Extract peer ID from peer:// URL
const peerId = this.url.replace('peer://', '');
if (!this.peer) {
// Initialize peer connection with default options as requested
this.peer = new Peer();
return new Promise<AgentResponse>((resolve, reject) => {
const timeout = setTimeout(() => {
reject(new Error('Peer connection timeout'));
}, this.options.timeout);
this.peer!.on('open', () => {
// Connect to the target peer
this.connection = this.peer!.connect(peerId);
this.connection.on('open', () => {
// Send the request
this.connection!.send(JSON.stringify(request));
});
this.connection.on('data', (data: any) => {
clearTimeout(timeout);
try {
const response = typeof data === 'string' ? JSON.parse(data) : data;
resolve(response as AgentResponse);
} catch (error) {
reject(new Error('Failed to parse peer response'));
}
});
this.connection.on('error', (error: any) => {
clearTimeout(timeout);
reject(new Error(`Peer connection error: ${error}`));
});
});
this.peer!.on('error', (error: any) => {
clearTimeout(timeout);
reject(new Error(`Peer error: ${error}`));
});
});
} else {
// Reuse existing connection
return new Promise<AgentResponse>((resolve, reject) => {
const timeout = setTimeout(() => {
reject(new Error('Peer request timeout'));
}, this.options.timeout);
if (this.connection && this.connection.open) {
this.connection.send(JSON.stringify(request));
const handleData = (data: any) => {
clearTimeout(timeout);
this.connection!.off('data', handleData);
try {
const response = typeof data === 'string' ? JSON.parse(data) : data;
resolve(response as AgentResponse);
} catch (error) {
reject(new Error('Failed to parse peer response'));
}
};
this.connection.on('data', handleData);
} else {
clearTimeout(timeout);
reject(new Error('Peer connection not available'));
}
});
}
}
// Health check method
async health(): Promise<{ status: string }> {
if (this.connectionType === 'peer') {
return { status: this.peer?.open ? 'connected' : 'disconnected' };
}
try {
const response = await fetch(`${this.url}/health`);
if (response.ok) {
return { status: 'healthy' };
}
return { status: 'unhealthy' };
} catch {
return { status: 'unreachable' };
}
}
// Clean up resources
async disconnect(): Promise<void> {
if (this.connection) {
this.connection.close();
this.connection = undefined;
}
if (this.peer) {
this.peer.destroy();
this.peer = undefined;
}
}
}
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/gta1.py:
--------------------------------------------------------------------------------
```python
"""
GTA1 agent loop implementation for click prediction using litellm.acompletion
Paper: https://arxiv.org/pdf/2507.05791
Code: https://github.com/Yan98/GTA1
"""
import base64
import math
import re
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple
import litellm
from PIL import Image
from ..decorators import register_agent
from ..loops.base import AsyncAgentConfig
from ..types import AgentCapability
SYSTEM_PROMPT = """
You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. The image resolution is height {height} and width {width}. For elements with area, return the center point.
Output the coordinate pair exactly:
(x,y)
""".strip()
def extract_coordinates(raw_string: str) -> Tuple[float, float]:
"""Extract coordinates from model output."""
try:
matches = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", raw_string)
return tuple(map(float, matches[0])) # type: ignore
    except (IndexError, ValueError):  # no coordinate pair found
        return (0.0, 0.0)
def smart_resize(
height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360
) -> Tuple[int, int]:
"""Smart resize function similar to qwen_vl_utils."""
# Calculate the total pixels
total_pixels = height * width
# If already within bounds, return original dimensions
if min_pixels <= total_pixels <= max_pixels:
# Round to nearest factor
new_height = (height // factor) * factor
new_width = (width // factor) * factor
return new_height, new_width
# Calculate scaling factor
if total_pixels > max_pixels:
scale = (max_pixels / total_pixels) ** 0.5
else:
scale = (min_pixels / total_pixels) ** 0.5
# Apply scaling
new_height = int(height * scale)
new_width = int(width * scale)
# Round to nearest factor
new_height = (new_height // factor) * factor
new_width = (new_width // factor) * factor
# Ensure minimum size
new_height = max(new_height, factor)
new_width = max(new_width, factor)
return new_height, new_width
@register_agent(models=r".*GTA1.*")
class GTA1Config(AsyncAgentConfig):
"""GTA1 agent configuration implementing AsyncAgentConfig protocol for click prediction."""
def __init__(self):
self.current_model = None
self.last_screenshot_b64 = None
async def predict_step(
self,
messages: List[Dict[str, Any]],
model: str,
tools: Optional[List[Dict[str, Any]]] = None,
max_retries: Optional[int] = None,
stream: bool = False,
computer_handler=None,
_on_api_start=None,
_on_api_end=None,
_on_usage=None,
_on_screenshot=None,
**kwargs,
) -> Dict[str, Any]:
raise NotImplementedError()
async def predict_click(
self, model: str, image_b64: str, instruction: str, **kwargs
) -> Optional[Tuple[float, float]]:
"""
Predict click coordinates using GTA1 model via litellm.acompletion.
Args:
model: The GTA1 model name
image_b64: Base64 encoded image
instruction: Instruction for where to click
Returns:
Tuple of (x, y) coordinates or None if prediction fails
"""
# Decode base64 image
image_data = base64.b64decode(image_b64)
image = Image.open(BytesIO(image_data))
width, height = image.width, image.height
# Smart resize the image (similar to qwen_vl_utils)
resized_height, resized_width = smart_resize(
height,
width,
factor=28, # Default factor for Qwen models
min_pixels=3136,
max_pixels=4096 * 2160,
)
resized_image = image.resize((resized_width, resized_height))
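        # The model answers in resized-image coordinates; these scale factors map
        # its predictions back to the original screenshot resolution.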
scale_x, scale_y = width / resized_width, height / resized_height
# Convert resized image back to base64
buffered = BytesIO()
resized_image.save(buffered, format="PNG")
resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()
# Prepare system and user messages
system_message = {
"role": "system",
"content": SYSTEM_PROMPT.format(height=resized_height, width=resized_width),
}
user_message = {
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
},
{"type": "text", "text": instruction},
],
}
# Prepare API call kwargs
api_kwargs = {
"model": model,
"messages": [system_message, user_message],
"max_tokens": 2056,
"temperature": 0.0,
**kwargs,
}
# Use liteLLM acompletion
response = await litellm.acompletion(**api_kwargs)
# Extract response text
output_text = response.choices[0].message.content # type: ignore
# Extract and rescale coordinates
pred_x, pred_y = extract_coordinates(output_text) # type: ignore
pred_x *= scale_x
pred_y *= scale_y
return (math.floor(pred_x), math.floor(pred_y))
def get_capabilities(self) -> List[AgentCapability]:
"""Return the capabilities supported by this agent."""
return ["click"]
```
--------------------------------------------------------------------------------
/libs/python/agent/benchmarks/models/gta1.py:
--------------------------------------------------------------------------------
```python
"""
GTA1 model implementation for benchmarking.
"""
import gc
import re
from typing import Optional, Tuple
import torch
from PIL import Image
from qwen_vl_utils import process_vision_info, smart_resize
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from .base import ModelProtocol
class GTA1Model:
"""Ground truth GTA1 model implementation."""
def __init__(self, model_path: str = "HelloKKMe/GTA1-7B"):
self.model_path = model_path
self.model = None
self.processor = None
self.max_new_tokens = 32
self.system_prompt = """
You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. The image resolution is height {height} and width {width}. For elements with area, return the center point.
Output the coordinate pair exactly:
(x,y)
""".strip()
@property
def model_name(self) -> str:
"""Return the name of the model."""
return f"GTA1-{self.model_path.split('/')[-1]}"
async def load_model(self) -> None:
"""Load the model into memory."""
if self.model is None:
print(f"Loading GTA1 model: {self.model_path}")
self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
self.model_path, torch_dtype=torch.bfloat16, device_map="auto"
)
self.processor = AutoProcessor.from_pretrained(
self.model_path, min_pixels=3136, max_pixels=4096 * 2160
)
print("GTA1 model loaded successfully")
async def unload_model(self) -> None:
"""Unload the model from memory."""
if self.model is not None:
print("Unloading GTA1 model from GPU...")
del self.model
del self.processor
self.model = None
self.processor = None
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
print("GTA1 model unloaded")
def _extract_coordinates(self, raw_string: str) -> Tuple[int, int]:
"""Extract coordinates from model output."""
try:
matches = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", raw_string)
return tuple(map(int, map(float, matches[0]))) # type: ignore
        except (IndexError, ValueError):  # no coordinate pair found
            return (0, 0)
async def predict_click(
self, image: Image.Image, instruction: str
) -> Optional[Tuple[int, int]]:
"""
Predict click coordinates for the given image and instruction.
Args:
image: PIL Image to analyze
instruction: Text instruction describing what to click
Returns:
Tuple of (x, y) coordinates or None if prediction fails
"""
if self.model is None or self.processor is None:
await self.load_model()
assert self.processor is not None
assert self.model is not None
try:
width, height = image.width, image.height
# Resize image according to processor requirements
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=self.processor.image_processor.patch_size
* self.processor.image_processor.merge_size,
min_pixels=self.processor.image_processor.min_pixels,
max_pixels=self.processor.image_processor.max_pixels,
)
resized_image = image.resize((resized_width, resized_height))
scale_x, scale_y = width / resized_width, height / resized_height
# Prepare messages
system_message = {
"role": "system",
"content": self.system_prompt.format(height=resized_height, width=resized_width),
}
user_message = {
"role": "user",
"content": [
{"type": "image", "image": resized_image},
{"type": "text", "text": instruction},
],
}
# Process inputs
image_inputs, video_inputs = process_vision_info([system_message, user_message]) # type: ignore
text = self.processor.apply_chat_template(
[system_message, user_message], tokenize=False, add_generation_prompt=True
)
inputs = self.processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to(self.model.device)
# Generate prediction
output_ids = self.model.generate(
**inputs,
max_new_tokens=self.max_new_tokens,
do_sample=False,
temperature=1.0,
use_cache=True,
)
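            # Trim the prompt tokens so only the newly generated text is decoded.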
generated_ids = [
output_ids[len(input_ids) :]
for input_ids, output_ids in zip(inputs.input_ids, output_ids)
]
output_text = self.processor.batch_decode(
generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)[0]
# Extract and rescale coordinates
pred_x, pred_y = self._extract_coordinates(output_text)
pred_x = int(pred_x * scale_x)
pred_y = int(pred_y * scale_y)
return (pred_x, pred_y)
except Exception as e:
print(f"Error in GTA1 prediction: {e}")
return None
```