This is page 6 of 28. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github
│ ├── FUNDING.yml
│ ├── scripts
│ │ ├── get_pyproject_version.py
│ │ └── tests
│ │ ├── __init__.py
│ │ ├── README.md
│ │ └── test_get_pyproject_version.py
│ └── workflows
│ ├── bump-version.yml
│ ├── ci-lume.yml
│ ├── docker-publish-cua-linux.yml
│ ├── docker-publish-cua-windows.yml
│ ├── docker-publish-kasm.yml
│ ├── docker-publish-xfce.yml
│ ├── docker-reusable-publish.yml
│ ├── link-check.yml
│ ├── lint.yml
│ ├── npm-publish-cli.yml
│ ├── npm-publish-computer.yml
│ ├── npm-publish-core.yml
│ ├── publish-lume.yml
│ ├── pypi-publish-agent.yml
│ ├── pypi-publish-computer-server.yml
│ ├── pypi-publish-computer.yml
│ ├── pypi-publish-core.yml
│ ├── pypi-publish-mcp-server.yml
│ ├── pypi-publish-som.yml
│ ├── pypi-reusable-publish.yml
│ ├── python-tests.yml
│ ├── test-cua-models.yml
│ └── test-validation-script.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .prettierrc.yaml
├── .vscode
│ ├── docs.code-workspace
│ ├── extensions.json
│ ├── launch.json
│ ├── libs-ts.code-workspace
│ ├── lume.code-workspace
│ ├── lumier.code-workspace
│ ├── py.code-workspace
│ └── settings.json
├── blog
│ ├── app-use.md
│ ├── assets
│ │ ├── composite-agents.png
│ │ ├── docker-ubuntu-support.png
│ │ ├── hack-booth.png
│ │ ├── hack-closing-ceremony.jpg
│ │ ├── hack-cua-ollama-hud.jpeg
│ │ ├── hack-leaderboard.png
│ │ ├── hack-the-north.png
│ │ ├── hack-winners.jpeg
│ │ ├── hack-workshop.jpeg
│ │ ├── hud-agent-evals.png
│ │ └── trajectory-viewer.jpeg
│ ├── bringing-computer-use-to-the-web.md
│ ├── build-your-own-operator-on-macos-1.md
│ ├── build-your-own-operator-on-macos-2.md
│ ├── cloud-windows-ga-macos-preview.md
│ ├── composite-agents.md
│ ├── computer-use-agents-for-growth-hacking.md
│ ├── cua-hackathon.md
│ ├── cua-playground-preview.md
│ ├── cua-vlm-router.md
│ ├── hack-the-north.md
│ ├── hud-agent-evals.md
│ ├── human-in-the-loop.md
│ ├── introducing-cua-cli.md
│ ├── introducing-cua-cloud-containers.md
│ ├── lume-to-containerization.md
│ ├── neurips-2025-cua-papers.md
│ ├── sandboxed-python-execution.md
│ ├── training-computer-use-models-trajectories-1.md
│ ├── trajectory-viewer.md
│ ├── ubuntu-docker-support.md
│ └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│ ├── .env.example
│ ├── .gitignore
│ ├── content
│ │ └── docs
│ │ ├── agent-sdk
│ │ │ ├── agent-loops.mdx
│ │ │ ├── benchmarks
│ │ │ │ ├── index.mdx
│ │ │ │ ├── interactive.mdx
│ │ │ │ ├── introduction.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── osworld-verified.mdx
│ │ │ │ ├── screenspot-pro.mdx
│ │ │ │ └── screenspot-v2.mdx
│ │ │ ├── callbacks
│ │ │ │ ├── agent-lifecycle.mdx
│ │ │ │ ├── cost-saving.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── logging.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── pii-anonymization.mdx
│ │ │ │ └── trajectories.mdx
│ │ │ ├── chat-history.mdx
│ │ │ ├── custom-tools.mdx
│ │ │ ├── customizing-computeragent.mdx
│ │ │ ├── integrations
│ │ │ │ ├── hud.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── observability.mdx
│ │ │ ├── mcp-server
│ │ │ │ ├── client-integrations.mdx
│ │ │ │ ├── configuration.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── llm-integrations.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── tools.mdx
│ │ │ │ └── usage.mdx
│ │ │ ├── message-format.mdx
│ │ │ ├── meta.json
│ │ │ ├── migration-guide.mdx
│ │ │ ├── prompt-caching.mdx
│ │ │ ├── supported-agents
│ │ │ │ ├── composed-agents.mdx
│ │ │ │ ├── computer-use-agents.mdx
│ │ │ │ ├── grounding-models.mdx
│ │ │ │ ├── human-in-the-loop.mdx
│ │ │ │ └── meta.json
│ │ │ ├── supported-model-providers
│ │ │ │ ├── cua-vlm-router.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ └── local-models.mdx
│ │ │ ├── telemetry.mdx
│ │ │ └── usage-tracking.mdx
│ │ ├── cli-playbook
│ │ │ ├── commands.mdx
│ │ │ ├── index.mdx
│ │ │ └── meta.json
│ │ ├── computer-sdk
│ │ │ ├── cloud-vm-management.mdx
│ │ │ ├── commands.mdx
│ │ │ ├── computer-server
│ │ │ │ ├── Commands.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── REST-API.mdx
│ │ │ │ └── WebSocket-API.mdx
│ │ │ ├── computer-ui.mdx
│ │ │ ├── computers.mdx
│ │ │ ├── custom-computer-handlers.mdx
│ │ │ ├── meta.json
│ │ │ ├── sandboxed-python.mdx
│ │ │ └── tracing-api.mdx
│ │ ├── example-usecases
│ │ │ ├── form-filling.mdx
│ │ │ ├── gemini-complex-ui-navigation.mdx
│ │ │ ├── meta.json
│ │ │ ├── post-event-contact-export.mdx
│ │ │ └── windows-app-behind-vpn.mdx
│ │ ├── get-started
│ │ │ ├── meta.json
│ │ │ └── quickstart.mdx
│ │ ├── index.mdx
│ │ ├── macos-vm-cli-playbook
│ │ │ ├── lume
│ │ │ │ ├── cli-reference.mdx
│ │ │ │ ├── faq.md
│ │ │ │ ├── http-api.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── prebuilt-images.mdx
│ │ │ ├── lumier
│ │ │ │ ├── building-lumier.mdx
│ │ │ │ ├── docker-compose.mdx
│ │ │ │ ├── docker.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ └── meta.json
│ │ │ └── meta.json
│ │ └── meta.json
│ ├── next.config.mjs
│ ├── package-lock.json
│ ├── package.json
│ ├── pnpm-lock.yaml
│ ├── postcss.config.mjs
│ ├── public
│ │ └── img
│ │ ├── agent_gradio_ui.png
│ │ ├── agent.png
│ │ ├── bg-dark.jpg
│ │ ├── bg-light.jpg
│ │ ├── cli.png
│ │ ├── computer.png
│ │ ├── grounding-with-gemini3.gif
│ │ ├── hero.png
│ │ ├── laminar_trace_example.png
│ │ ├── som_box_threshold.png
│ │ └── som_iou_threshold.png
│ ├── README.md
│ ├── source.config.ts
│ ├── src
│ │ ├── app
│ │ │ ├── (home)
│ │ │ │ ├── [[...slug]]
│ │ │ │ │ └── page.tsx
│ │ │ │ └── layout.tsx
│ │ │ ├── api
│ │ │ │ ├── posthog
│ │ │ │ │ └── [...path]
│ │ │ │ │ └── route.ts
│ │ │ │ └── search
│ │ │ │ └── route.ts
│ │ │ ├── favicon.ico
│ │ │ ├── global.css
│ │ │ ├── layout.config.tsx
│ │ │ ├── layout.tsx
│ │ │ ├── llms.mdx
│ │ │ │ └── [[...slug]]
│ │ │ │ └── route.ts
│ │ │ ├── llms.txt
│ │ │ │ └── route.ts
│ │ │ ├── robots.ts
│ │ │ └── sitemap.ts
│ │ ├── assets
│ │ │ ├── discord-black.svg
│ │ │ ├── discord-white.svg
│ │ │ ├── logo-black.svg
│ │ │ └── logo-white.svg
│ │ ├── components
│ │ │ ├── analytics-tracker.tsx
│ │ │ ├── cookie-consent.tsx
│ │ │ ├── doc-actions-menu.tsx
│ │ │ ├── editable-code-block.tsx
│ │ │ ├── footer.tsx
│ │ │ ├── hero.tsx
│ │ │ ├── iou.tsx
│ │ │ ├── mermaid.tsx
│ │ │ └── page-feedback.tsx
│ │ ├── lib
│ │ │ ├── llms.ts
│ │ │ └── source.ts
│ │ ├── mdx-components.tsx
│ │ └── providers
│ │ └── posthog-provider.tsx
│ └── tsconfig.json
├── examples
│ ├── agent_examples.py
│ ├── agent_ui_examples.py
│ ├── browser_tool_example.py
│ ├── cloud_api_examples.py
│ ├── computer_examples_windows.py
│ ├── computer_examples.py
│ ├── computer_ui_examples.py
│ ├── computer-example-ts
│ │ ├── .env.example
│ │ ├── .gitignore
│ │ ├── package-lock.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── README.md
│ │ ├── src
│ │ │ ├── helpers.ts
│ │ │ └── index.ts
│ │ └── tsconfig.json
│ ├── docker_examples.py
│ ├── evals
│ │ ├── hud_eval_examples.py
│ │ └── wikipedia_most_linked.txt
│ ├── pylume_examples.py
│ ├── sandboxed_functions_examples.py
│ ├── som_examples.py
│ ├── tracing_examples.py
│ ├── utils.py
│ └── winsandbox_example.py
├── img
│ ├── agent_gradio_ui.png
│ ├── agent.png
│ ├── cli.png
│ ├── computer.png
│ ├── logo_black.png
│ └── logo_white.png
├── libs
│ ├── kasm
│ │ ├── Dockerfile
│ │ ├── LICENSE
│ │ ├── README.md
│ │ └── src
│ │ └── ubuntu
│ │ └── install
│ │ └── firefox
│ │ ├── custom_startup.sh
│ │ ├── firefox.desktop
│ │ └── install_firefox.sh
│ ├── lume
│ │ ├── .cursorignore
│ │ ├── CONTRIBUTING.md
│ │ ├── Development.md
│ │ ├── img
│ │ │ └── cli.png
│ │ ├── Package.resolved
│ │ ├── Package.swift
│ │ ├── README.md
│ │ ├── resources
│ │ │ └── lume.entitlements
│ │ ├── scripts
│ │ │ ├── build
│ │ │ │ ├── build-debug.sh
│ │ │ │ ├── build-release-notarized.sh
│ │ │ │ └── build-release.sh
│ │ │ └── install.sh
│ │ ├── src
│ │ │ ├── Commands
│ │ │ │ ├── Clone.swift
│ │ │ │ ├── Config.swift
│ │ │ │ ├── Create.swift
│ │ │ │ ├── Delete.swift
│ │ │ │ ├── Get.swift
│ │ │ │ ├── Images.swift
│ │ │ │ ├── IPSW.swift
│ │ │ │ ├── List.swift
│ │ │ │ ├── Logs.swift
│ │ │ │ ├── Options
│ │ │ │ │ └── FormatOption.swift
│ │ │ │ ├── Prune.swift
│ │ │ │ ├── Pull.swift
│ │ │ │ ├── Push.swift
│ │ │ │ ├── Run.swift
│ │ │ │ ├── Serve.swift
│ │ │ │ ├── Set.swift
│ │ │ │ └── Stop.swift
│ │ │ ├── ContainerRegistry
│ │ │ │ ├── ImageContainerRegistry.swift
│ │ │ │ ├── ImageList.swift
│ │ │ │ └── ImagesPrinter.swift
│ │ │ ├── Errors
│ │ │ │ └── Errors.swift
│ │ │ ├── FileSystem
│ │ │ │ ├── Home.swift
│ │ │ │ ├── Settings.swift
│ │ │ │ ├── VMConfig.swift
│ │ │ │ ├── VMDirectory.swift
│ │ │ │ └── VMLocation.swift
│ │ │ ├── LumeController.swift
│ │ │ ├── Main.swift
│ │ │ ├── Server
│ │ │ │ ├── Handlers.swift
│ │ │ │ ├── HTTP.swift
│ │ │ │ ├── Requests.swift
│ │ │ │ ├── Responses.swift
│ │ │ │ └── Server.swift
│ │ │ ├── Utils
│ │ │ │ ├── CommandRegistry.swift
│ │ │ │ ├── CommandUtils.swift
│ │ │ │ ├── Logger.swift
│ │ │ │ ├── NetworkUtils.swift
│ │ │ │ ├── Path.swift
│ │ │ │ ├── ProcessRunner.swift
│ │ │ │ ├── ProgressLogger.swift
│ │ │ │ ├── String.swift
│ │ │ │ └── Utils.swift
│ │ │ ├── Virtualization
│ │ │ │ ├── DarwinImageLoader.swift
│ │ │ │ ├── DHCPLeaseParser.swift
│ │ │ │ ├── ImageLoaderFactory.swift
│ │ │ │ └── VMVirtualizationService.swift
│ │ │ ├── VM
│ │ │ │ ├── DarwinVM.swift
│ │ │ │ ├── LinuxVM.swift
│ │ │ │ ├── VM.swift
│ │ │ │ ├── VMDetails.swift
│ │ │ │ ├── VMDetailsPrinter.swift
│ │ │ │ ├── VMDisplayResolution.swift
│ │ │ │ └── VMFactory.swift
│ │ │ └── VNC
│ │ │ ├── PassphraseGenerator.swift
│ │ │ └── VNCService.swift
│ │ └── tests
│ │ ├── Mocks
│ │ │ ├── MockVM.swift
│ │ │ ├── MockVMVirtualizationService.swift
│ │ │ └── MockVNCService.swift
│ │ ├── VM
│ │ │ └── VMDetailsPrinterTests.swift
│ │ ├── VMTests.swift
│ │ ├── VMVirtualizationServiceTests.swift
│ │ └── VNCServiceTests.swift
│ ├── lumier
│ │ ├── .dockerignore
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── bin
│ │ │ └── entry.sh
│ │ ├── config
│ │ │ └── constants.sh
│ │ ├── hooks
│ │ │ └── on-logon.sh
│ │ └── lib
│ │ ├── utils.sh
│ │ └── vm.sh
│ ├── python
│ │ ├── agent
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── agent
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── adapters
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cua_adapter.py
│ │ │ │ │ ├── huggingfacelocal_adapter.py
│ │ │ │ │ ├── human_adapter.py
│ │ │ │ │ ├── mlxvlm_adapter.py
│ │ │ │ │ └── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ └── qwen2_5_vl.py
│ │ │ │ ├── agent.py
│ │ │ │ ├── callbacks
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── budget_manager.py
│ │ │ │ │ ├── image_retention.py
│ │ │ │ │ ├── logging.py
│ │ │ │ │ ├── operator_validator.py
│ │ │ │ │ ├── pii_anonymization.py
│ │ │ │ │ ├── prompt_instructions.py
│ │ │ │ │ ├── telemetry.py
│ │ │ │ │ └── trajectory_saver.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── computers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cua.py
│ │ │ │ │ └── custom.py
│ │ │ │ ├── decorators.py
│ │ │ │ ├── human_tool
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ ├── server.py
│ │ │ │ │ └── ui.py
│ │ │ │ ├── integrations
│ │ │ │ │ └── hud
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── agent.py
│ │ │ │ │ └── proxy.py
│ │ │ │ ├── loops
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── anthropic.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── composed_grounded.py
│ │ │ │ │ ├── gelato.py
│ │ │ │ │ ├── gemini.py
│ │ │ │ │ ├── generic_vlm.py
│ │ │ │ │ ├── glm45v.py
│ │ │ │ │ ├── gta1.py
│ │ │ │ │ ├── holo.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── model_types.csv
│ │ │ │ │ ├── moondream3.py
│ │ │ │ │ ├── omniparser.py
│ │ │ │ │ ├── openai.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ ├── uiins.py
│ │ │ │ │ ├── uitars.py
│ │ │ │ │ └── uitars2.py
│ │ │ │ ├── proxy
│ │ │ │ │ ├── examples.py
│ │ │ │ │ └── handlers.py
│ │ │ │ ├── responses.py
│ │ │ │ ├── tools
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── browser_tool.py
│ │ │ │ ├── types.py
│ │ │ │ └── ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ └── gradio
│ │ │ │ ├── __init__.py
│ │ │ │ ├── app.py
│ │ │ │ └── ui_components.py
│ │ │ ├── benchmarks
│ │ │ │ ├── .gitignore
│ │ │ │ ├── contrib.md
│ │ │ │ ├── interactive.py
│ │ │ │ ├── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ └── gta1.py
│ │ │ │ ├── README.md
│ │ │ │ ├── ss-pro.py
│ │ │ │ ├── ss-v2.py
│ │ │ │ └── utils.py
│ │ │ ├── example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_computer_agent.py
│ │ ├── bench-ui
│ │ │ ├── bench_ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── api.py
│ │ │ │ └── child.py
│ │ │ ├── examples
│ │ │ │ ├── folder_example.py
│ │ │ │ ├── gui
│ │ │ │ │ ├── index.html
│ │ │ │ │ ├── logo.svg
│ │ │ │ │ └── styles.css
│ │ │ │ ├── output_overlay.png
│ │ │ │ └── simple_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ └── test_port_detection.py
│ │ ├── computer
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer
│ │ │ │ ├── __init__.py
│ │ │ │ ├── computer.py
│ │ │ │ ├── diorama_computer.py
│ │ │ │ ├── helpers.py
│ │ │ │ ├── interface
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ ├── models.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── logger.py
│ │ │ │ ├── models.py
│ │ │ │ ├── providers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cloud
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── docker
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── lume
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── lume_api.py
│ │ │ │ │ ├── lumier
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── types.py
│ │ │ │ │ └── winsandbox
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── provider.py
│ │ │ │ │ └── setup_script.ps1
│ │ │ │ ├── tracing_wrapper.py
│ │ │ │ ├── tracing.py
│ │ │ │ ├── ui
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ └── gradio
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── app.py
│ │ │ │ └── utils.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_computer.py
│ │ ├── computer-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── browser.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── diorama
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── diorama_computer.py
│ │ │ │ │ ├── diorama.py
│ │ │ │ │ ├── draw.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── safezone.py
│ │ │ │ ├── handlers
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── main.py
│ │ │ │ ├── server.py
│ │ │ │ ├── utils
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── wallpaper.py
│ │ │ │ └── watchdog.py
│ │ │ ├── examples
│ │ │ │ ├── __init__.py
│ │ │ │ └── usage_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ ├── run_server.py
│ │ │ ├── test_connection.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_server.py
│ │ ├── core
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── core
│ │ │ │ ├── __init__.py
│ │ │ │ └── telemetry
│ │ │ │ ├── __init__.py
│ │ │ │ └── posthog.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_telemetry.py
│ │ ├── mcp-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── build-extension.py
│ │ │ ├── CONCURRENT_SESSIONS.md
│ │ │ ├── desktop-extension
│ │ │ │ ├── cua-extension.mcpb
│ │ │ │ ├── desktop_extension.png
│ │ │ │ ├── manifest.json
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── run_server.sh
│ │ │ │ └── setup.py
│ │ │ ├── mcp_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── server.py
│ │ │ │ └── session_manager.py
│ │ │ ├── pdm.lock
│ │ │ ├── pyproject.toml
│ │ │ ├── QUICK_TEST_COMMANDS.sh
│ │ │ ├── quick_test_local_option.py
│ │ │ ├── README.md
│ │ │ ├── scripts
│ │ │ │ ├── install_mcp_server.sh
│ │ │ │ └── start_mcp_server.sh
│ │ │ ├── test_mcp_server_local_option.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_mcp_server.py
│ │ ├── pylume
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_pylume.py
│ │ └── som
│ │ ├── .bumpversion.cfg
│ │ ├── LICENSE
│ │ ├── poetry.toml
│ │ ├── pyproject.toml
│ │ ├── README.md
│ │ ├── som
│ │ │ ├── __init__.py
│ │ │ ├── detect.py
│ │ │ ├── detection.py
│ │ │ ├── models.py
│ │ │ ├── ocr.py
│ │ │ ├── util
│ │ │ │ └── utils.py
│ │ │ └── visualization.py
│ │ └── tests
│ │ ├── conftest.py
│ │ └── test_omniparser.py
│ ├── qemu-docker
│ │ ├── linux
│ │ │ ├── Dockerfile
│ │ │ ├── README.md
│ │ │ └── src
│ │ │ ├── entry.sh
│ │ │ └── vm
│ │ │ ├── image
│ │ │ │ └── README.md
│ │ │ └── setup
│ │ │ ├── install.sh
│ │ │ ├── setup-cua-server.sh
│ │ │ └── setup.sh
│ │ ├── README.md
│ │ └── windows
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── entry.sh
│ │ └── vm
│ │ ├── image
│ │ │ └── README.md
│ │ └── setup
│ │ ├── install.bat
│ │ ├── on-logon.ps1
│ │ ├── setup-cua-server.ps1
│ │ ├── setup-utils.psm1
│ │ └── setup.ps1
│ ├── typescript
│ │ ├── .gitignore
│ │ ├── .nvmrc
│ │ ├── agent
│ │ │ ├── examples
│ │ │ │ ├── playground-example.html
│ │ │ │ └── README.md
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── client.ts
│ │ │ │ ├── index.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ └── client.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── computer
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── computer
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── providers
│ │ │ │ │ │ ├── base.ts
│ │ │ │ │ │ ├── cloud.ts
│ │ │ │ │ │ └── index.ts
│ │ │ │ │ └── types.ts
│ │ │ │ ├── index.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── base.ts
│ │ │ │ │ ├── factory.ts
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── linux.ts
│ │ │ │ │ ├── macos.ts
│ │ │ │ │ └── windows.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ ├── computer
│ │ │ │ │ └── cloud.test.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── factory.test.ts
│ │ │ │ │ ├── index.test.ts
│ │ │ │ │ ├── linux.test.ts
│ │ │ │ │ ├── macos.test.ts
│ │ │ │ │ └── windows.test.ts
│ │ │ │ └── setup.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── core
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── index.ts
│ │ │ │ └── telemetry
│ │ │ │ ├── clients
│ │ │ │ │ ├── index.ts
│ │ │ │ │ └── posthog.ts
│ │ │ │ └── index.ts
│ │ │ ├── tests
│ │ │ │ └── telemetry.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── cua-cli
│ │ │ ├── .gitignore
│ │ │ ├── .prettierrc
│ │ │ ├── bun.lock
│ │ │ ├── CLAUDE.md
│ │ │ ├── index.ts
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── auth.ts
│ │ │ │ ├── cli.ts
│ │ │ │ ├── commands
│ │ │ │ │ ├── auth.ts
│ │ │ │ │ └── sandbox.ts
│ │ │ │ ├── config.ts
│ │ │ │ ├── http.ts
│ │ │ │ ├── storage.ts
│ │ │ │ └── util.ts
│ │ │ └── tsconfig.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── pnpm-workspace.yaml
│ │ └── README.md
│ └── xfce
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Development.md
│ ├── Dockerfile
│ ├── Dockerfile.dev
│ ├── README.md
│ └── src
│ ├── scripts
│ │ ├── resize-display.sh
│ │ ├── start-computer-server.sh
│ │ ├── start-novnc.sh
│ │ ├── start-vnc.sh
│ │ └── xstartup.sh
│ ├── supervisor
│ │ └── supervisord.conf
│ └── xfce-config
│ ├── helpers.rc
│ ├── xfce4-power-manager.xml
│ └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│ ├── agent_nb.ipynb
│ ├── blog
│ │ ├── build-your-own-operator-on-macos-1.ipynb
│ │ └── build-your-own-operator-on-macos-2.ipynb
│ ├── composite_agents_docker_nb.ipynb
│ ├── computer_nb.ipynb
│ ├── computer_server_nb.ipynb
│ ├── customizing_computeragent.ipynb
│ ├── eval_osworld.ipynb
│ ├── ollama_nb.ipynb
│ ├── README.md
│ ├── sota_hackathon_cloud.ipynb
│ └── sota_hackathon.ipynb
├── package-lock.json
├── package.json
├── pnpm-lock.yaml
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── scripts
│ ├── install-cli.ps1
│ ├── install-cli.sh
│ ├── playground-docker.sh
│ ├── playground.sh
│ ├── run-docker-dev.sh
│ └── typescript-typecheck.js
├── TESTING.md
├── tests
│ ├── agent_loop_testing
│ │ ├── agent_test.py
│ │ └── README.md
│ ├── pytest.ini
│ ├── shell_cmd.py
│ ├── test_files.py
│ ├── test_mcp_server_session_management.py
│ ├── test_mcp_server_streaming.py
│ ├── test_shell_bash.py
│ ├── test_telemetry.py
│ ├── test_tracing.py
│ ├── test_venv.py
│ └── test_watchdog.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/opencua.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | OpenCUA agent loop implementation for click prediction using litellm.acompletion
3 | Based on OpenCUA model for GUI grounding tasks.
4 | """
5 |
6 | import asyncio
7 | import base64
8 | import json
9 | import math
10 | import re
11 | import uuid
12 | from io import BytesIO
13 | from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
14 |
15 | import litellm
16 | from PIL import Image
17 |
18 | from ..decorators import register_agent
19 | from ..loops.base import AsyncAgentConfig
20 | from ..types import AgentCapability, AgentResponse, Messages, Tools
21 | from .composed_grounded import ComposedGroundedConfig
22 |
23 |
def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
    """Extract (x, y) from the first ``pyautogui.click(...)`` call in model output.

    Accepts the keyword form with flexible whitespace and optional float
    coordinates (pyautogui itself accepts floats), e.g.::

        pyautogui.click(x=1443, y=343)
        pyautogui.click( x = 14.5 , y = 3.0 )

    Args:
        text: Raw model output that may contain a pyautogui click command.

    Returns:
        ``(x, y)`` as ints (floats truncated), or ``None`` when *text* is not a
        string or contains no recognizable click command.
    """
    # Explicit guard instead of a blanket try/except: re.search only raises
    # here when given a non-str (e.g. None from an empty model response).
    if not isinstance(text, str):
        return None
    pattern = (
        r"pyautogui\.click\(\s*x\s*=\s*(\d+(?:\.\d+)?)\s*,\s*y\s*=\s*(\d+(?:\.\d+)?)\s*\)"
    )
    match = re.search(pattern, text)
    if not match:
        return None
    # int(float(...)) so "14.5" -> 14 while plain ints pass through unchanged.
    return (int(float(match.group(1))), int(float(match.group(2))))
36 |
37 |
@register_agent(models=r"(?i).*OpenCUA.*")
class OpenCUAConfig(ComposedGroundedConfig):
    """OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction.

    Full-step prediction is delegated to :class:`ComposedGroundedConfig` with
    the model composed with itself (``"model+model"``); click-only prediction
    calls the model directly through ``litellm.acompletion`` and parses the
    ``pyautogui.click(...)`` command it emits.
    """

    def __init__(self):
        super().__init__()
        # Kept for parity with other loop configs; not read within this class.
        self.current_model: Optional[str] = None
        self.last_screenshot_b64: Optional[str] = None

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Fallback to a self-composed model: the same model both plans and grounds."""
        return await super().predict_step(
            messages=messages,
            model=f"{model}+{model}",
            tools=tools,
            max_retries=max_retries,
            stream=stream,
            computer_handler=computer_handler,
            _on_api_start=_on_api_start,
            _on_api_end=_on_api_end,
            _on_usage=_on_usage,
            _on_screenshot=_on_screenshot,
            **kwargs,
        )

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates using OpenCUA model via litellm.acompletion.

        Args:
            model: The OpenCUA model name
            image_b64: Base64 encoded PNG screenshot
            instruction: Instruction for where to click

        Returns:
            Tuple of (x, y) coordinates or None if prediction fails
        """
        system_message = {
            "role": "system",
            "content": (
                "You are a GUI agent. You are given a task and a screenshot of the screen. "
                "You need to perform a series of pyautogui actions to complete the task."
            ),
        }

        # User turn: screenshot first, then the click instruction.
        user_message = {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
                {"type": "text", "text": f"Click on {instruction}"},
            ],
        }

        # NOTE(review): "max_new_tokens" is the HF-style name; litellm's standard
        # parameter is "max_tokens" (and 2056 looks like a typo for 2048) —
        # confirm against the serving adapter before changing.
        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_new_tokens": 2056,
            "temperature": 0,
            **kwargs,
        }

        response = await litellm.acompletion(**api_kwargs)

        # Robustness: providers may return empty/None content — treat as
        # "no prediction" instead of handing a non-str to the parser.
        output_text = response.choices[0].message.content
        if not output_text:
            return None

        # Parse the first pyautogui.click(x=..., y=...) command in the output.
        return extract_coordinates_from_pyautogui(output_text)

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent (click-only)."""
        return ["click"]
131 |
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/customizing-computeragent.mdx:
--------------------------------------------------------------------------------
```markdown
1 | ---
2 | title: Customize ComputerAgent
3 | ---
4 |
5 | <Callout>
6 | A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/customizing_computeragent.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.
7 | </Callout>
8 |
9 | The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems.
10 |
11 | This guide shows four proven ways to increase capabilities and success rate:
12 |
13 | - 1 — Simple: Prompt engineering
14 | - 2 — Easy: Tools
15 | - 3 — Intermediate: Callbacks
16 | - 4 — Expert: Custom `@register_agent`
17 |
18 | ## 1) Simple: Prompt engineering
19 |
20 | Provide guiding instructions to shape behavior. `ComputerAgent` accepts an optional `instructions: str | None` which acts like a system-style preface. Internally, this uses a callback that pre-pends a user message before each LLM call.
21 |
22 | ```python
23 | from agent.agent import ComputerAgent
24 |
25 | agent = ComputerAgent(
26 | model="openai/computer-use-preview",
27 | tools=[computer],
28 | instructions=(
29 | "You are a meticulous software operator. Prefer safe, deterministic actions. "
30 | "Always confirm via on-screen text before proceeding."
31 | ),
32 | )
33 | ```
34 |
35 | ## 2) Easy: Tools
36 |
37 | Expose deterministic capabilities as tools (Python functions or custom computer handlers). The agent will call them when appropriate.
38 |
39 | ```python
40 | def calculate_percentage(numerator: float, denominator: float) -> str:
41 | """Calculate percentage as a string.
42 |
43 | Args:
44 | numerator: Numerator value
45 | denominator: Denominator value
46 | Returns:
47 | A formatted percentage string (e.g., '75.00%').
48 | """
49 | if denominator == 0:
50 | return "0.00%"
51 | return f"{(numerator/denominator)*100:.2f}%"
52 |
53 | agent = ComputerAgent(
54 | model="openai/computer-use-preview",
55 | tools=[computer, calculate_percentage],
56 | )
57 | ```
58 |
59 | - See `docs/agent-sdk/custom-tools` for authoring function tools.
 60 | - See `docs/computer-sdk/custom-computer-handlers` for building full computer interfaces.
61 |
62 | ## 3) Intermediate: Callbacks
63 |
64 | Callbacks provide lifecycle hooks to preprocess messages, postprocess outputs, record trajectories, manage costs, and more.
65 |
66 | ```python
67 | from agent.callbacks import ImageRetentionCallback, TrajectorySaverCallback, BudgetManagerCallback
68 |
69 | agent = ComputerAgent(
70 | model="cua/anthropic/claude-sonnet-4.5",
71 | tools=[computer],
72 | callbacks=[
73 | ImageRetentionCallback(only_n_most_recent_images=3),
74 | TrajectorySaverCallback("./trajectories"),
75 | BudgetManagerCallback(max_budget=10.0, raise_error=True),
76 | ],
77 | )
78 | ```
79 |
 80 | - Browse implementations in `libs/python/agent/agent/callbacks/`.
81 |
82 | ## 4) Expert: Custom `@register_agent`
83 |
84 | Build your own agent configuration class to control prompting, message shaping, and tool handling. This is the most flexible option for specialized domains.
85 |
86 | - Register your own `model=...` loop using `@register_agent`
87 | - Browse implementations in `libs/python/agent/agent/loops/`.
88 | - Implement `predict_step()` (and optionally `predict_click()`) and return the standardized output schema.
89 |
90 | ```python
91 | from agent.decorators import register_agent
92 |
93 | @register_agent(models=r".*my-special-model.*", priority=10)
94 | class MyCustomAgentConfig:
95 | async def predict_step(self, messages, model, tools, **kwargs):
96 | # 1) Format messages for your provider
97 | # 2) Call provider
98 | # 3) Convert responses to the agent output schema
99 | return {"output": [], "usage": {}}
100 |
101 | async def predict_click(self, model, image_b64, instruction):
102 | # Optional: click-only capability
103 | return None
104 |
105 | def get_capabilities(self):
106 | return ["step"]
107 | ```
108 |
109 | ## HUD integration (optional)
110 |
111 | When using the HUD evaluation integration (`agent/integrations/hud/`), you can pass `instructions`, `tools`, and `callbacks` directly:
112 |
113 | ```python
114 | from agent.integrations.hud import run_single_task
115 |
116 | await run_single_task(
117 | dataset="username/dataset-name",
118 | model="openai/computer-use-preview",
119 | instructions="Operate carefully. Always verify on-screen text before actions.",
120 | # tools=[your_custom_function],
121 | # callbacks=[YourCustomCallback()],
122 | )
123 | ```
124 |
```
--------------------------------------------------------------------------------
/docs/src/components/iou.tsx:
--------------------------------------------------------------------------------
```typescript
1 | 'use client';
2 | import React, { useRef, useEffect, useState, useCallback } from 'react';
3 |
/**
 * Represents a rectangle with position, dimensions, styling, and identification.
 * Positions and sizes are in canvas pixel units (used directly with fillRect/strokeRect).
 */
interface Rectangle {
  /** The x-coordinate of the rectangle's left edge, in canvas pixels */
  left: number;
  /** The y-coordinate of the rectangle's top edge, in canvas pixels */
  top: number;
  /** The width of the rectangle, in canvas pixels */
  width: number;
  /** The height of the rectangle, in canvas pixels */
  height: number;
  /** The fill color of the rectangle (any CSS color string) */
  fill: string;
  /** The display name drawn as a label inside the rectangle */
  name: string;
}
21 |
/**
 * Props for the IOU component
 */
interface IOUProps {
  /** The title to display above the visualization */
  title: string;
  /** The description text to display below the computed IOU value */
  description: string;
  /** The first rectangle for IOU calculation */
  rect1: Rectangle;
  /** The second rectangle for IOU calculation */
  rect2: Rectangle;
}
35 |
36 | /**
37 | * A React component that visualizes and calculates the Intersection over Union (IOU)
38 | * of two rectangles on a canvas
39 | * @param props - The component props
40 | * @returns The rendered IOU visualization component
41 | */
42 | export default function IOU({ title, description, rect1, rect2 }: IOUProps) {
43 | const canvasRef = useRef<HTMLCanvasElement>(null);
44 | const [actualIOU, setActualIOU] = useState<number>(0);
45 |
46 | /**
47 | * Converts a rectangle to a bounding box with left, right, top, and bottom coordinates
48 | * @param rect - The rectangle to convert
49 | * @returns An object containing the bounding box coordinates
50 | */
51 | const getBbox = (rect: Rectangle) => ({
52 | left: rect.left,
53 | right: rect.left + rect.width,
54 | top: rect.top,
55 | bottom: rect.top + rect.height,
56 | });
57 |
58 | /**
59 | * Calculates the intersection area between two bounding boxes
60 | * @param bbox1 - The first bounding box
61 | * @param bbox2 - The second bounding box
62 | * @returns The area of intersection between the two bounding boxes
63 | */
64 | const calcIntersection = (bbox1: any, bbox2: any): number => {
65 | const x1 = Math.max(bbox1.left, bbox2.left);
66 | const x2 = Math.min(bbox1.right, bbox2.right);
67 | const y1 = Math.max(bbox1.top, bbox2.top);
68 | const y2 = Math.min(bbox1.bottom, bbox2.bottom);
69 |
70 | // Check if there's actually an overlap
71 | if (x2 <= x1 || y2 <= y1) {
72 | return 0;
73 | }
74 |
75 | const intersection = (x2 - x1) * (y2 - y1);
76 | return intersection;
77 | };
78 |
79 | /**
80 | * Calculates the area of a rectangle
81 | * @param rect - The rectangle to calculate area for
82 | * @returns The area of the rectangle
83 | */
84 | const calcArea = (rect: Rectangle): number => {
85 | return rect.width * rect.height;
86 | };
87 |
88 | /**
89 | * Draws the rectangles on the canvas and calculates the IOU value
90 | */
91 | const drawCanvas = useCallback(() => {
92 | const canvas = canvasRef.current;
93 | if (!canvas) return;
94 |
95 | const ctx = canvas.getContext('2d');
96 | if (!ctx) return;
97 |
98 | // Clear canvas
99 | ctx.clearRect(0, 0, canvas.width, canvas.height);
100 |
101 | // Calculate IOU
102 | const bbox1 = getBbox(rect1);
103 | const bbox2 = getBbox(rect2);
104 | const intersection = calcIntersection(bbox1, bbox2);
105 | const union = calcArea(rect1) + calcArea(rect2) - intersection;
106 | const iou = intersection / union;
107 | setActualIOU(iou);
108 |
109 | // Draw rectangles
110 | [rect1, rect2].forEach((rect) => {
111 | ctx.fillStyle = rect.fill;
112 | ctx.fillRect(rect.left, rect.top, rect.width, rect.height);
113 |
114 | ctx.strokeStyle = '#000';
115 | ctx.lineWidth = 2;
116 | ctx.strokeRect(rect.left, rect.top, rect.width, rect.height);
117 |
118 | ctx.fillStyle = '#000';
119 | ctx.font = '12px';
120 | ctx.fillText(rect.name, rect.left + 5, rect.top + 15);
121 | });
122 | }, [rect1, rect2]);
123 |
124 | useEffect(() => {
125 | drawCanvas();
126 | }, [drawCanvas]);
127 |
128 | return (
129 | <div className="">
130 | <h3 className="text-sm font-semibold ">{title}</h3>
131 | <div className="flex items-start gap-6">
132 | <div>
133 | <canvas ref={canvasRef} width={200} height={150} className="border bg-white rounded-md" />
134 | <div className="mt-2 text-sm">
135 | <div className="font-mono mb-2">IOU = {actualIOU.toFixed(3)}</div>
136 | <span className="">{description}</span>
137 | </div>
138 | </div>
139 | </div>
140 | </div>
141 | );
142 | }
143 |
```
--------------------------------------------------------------------------------
/libs/python/computer/computer/providers/base.py:
--------------------------------------------------------------------------------
```python
1 | """Base provider interface for VM backends."""
2 |
3 | import abc
4 | from enum import StrEnum
5 | from typing import Any, AsyncContextManager, Dict, Optional
6 |
7 | from .types import ListVMsResponse
8 |
9 |
10 | class VMProviderType(StrEnum):
11 | """Enum of supported VM provider types."""
12 |
13 | LUME = "lume"
14 | LUMIER = "lumier"
15 | CLOUD = "cloud"
16 | WINSANDBOX = "winsandbox"
17 | DOCKER = "docker"
18 | UNKNOWN = "unknown"
19 |
20 |
21 | class BaseVMProvider(AsyncContextManager):
22 | """Base interface for VM providers.
23 |
24 | All VM provider implementations must implement this interface.
25 | """
26 |
27 | @property
28 | @abc.abstractmethod
29 | def provider_type(self) -> VMProviderType:
30 | """Get the provider type."""
31 | pass
32 |
33 | @abc.abstractmethod
34 | async def get_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]:
35 | """Get VM information by name.
36 |
37 | Args:
38 | name: Name of the VM to get information for
39 | storage: Optional storage path override. If provided, this will be used
40 | instead of the provider's default storage path.
41 |
42 | Returns:
43 | Dictionary with VM information including status, IP address, etc.
44 | """
45 | pass
46 |
47 | @abc.abstractmethod
48 | async def list_vms(self) -> ListVMsResponse:
49 | """List all available VMs.
50 |
51 | Returns:
52 | ListVMsResponse: A list of minimal VM objects as defined in
53 | `computer.providers.types.MinimalVM`.
54 | """
55 | pass
56 |
57 | @abc.abstractmethod
58 | async def run_vm(
59 | self, image: str, name: str, run_opts: Dict[str, Any], storage: Optional[str] = None
60 | ) -> Dict[str, Any]:
61 | """Run a VM by name with the given options.
62 |
63 | Args:
64 | image: Name/tag of the image to use
65 | name: Name of the VM to run
66 | run_opts: Dictionary of run options (memory, cpu, etc.)
67 | storage: Optional storage path override. If provided, this will be used
68 | instead of the provider's default storage path.
69 |
70 | Returns:
71 | Dictionary with VM run status and information
72 | """
73 | pass
74 |
75 | @abc.abstractmethod
76 | async def stop_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]:
77 | """Stop a VM by name.
78 |
79 | Args:
80 | name: Name of the VM to stop
81 | storage: Optional storage path override. If provided, this will be used
82 | instead of the provider's default storage path.
83 |
84 | Returns:
85 | Dictionary with VM stop status and information
86 | """
87 | pass
88 |
89 | @abc.abstractmethod
90 | async def restart_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]:
91 | """Restart a VM by name.
92 |
93 | Args:
94 | name: Name of the VM to restart
95 | storage: Optional storage path override. If provided, this will be used
96 | instead of the provider's default storage path.
97 |
98 | Returns:
99 | Dictionary with VM restart status and information
100 | """
101 | pass
102 |
103 | @abc.abstractmethod
104 | async def update_vm(
105 | self, name: str, update_opts: Dict[str, Any], storage: Optional[str] = None
106 | ) -> Dict[str, Any]:
107 | """Update VM configuration.
108 |
109 | Args:
110 | name: Name of the VM to update
111 | update_opts: Dictionary of update options (memory, cpu, etc.)
112 | storage: Optional storage path override. If provided, this will be used
113 | instead of the provider's default storage path.
114 |
115 | Returns:
116 | Dictionary with VM update status and information
117 | """
118 | pass
119 |
120 | @abc.abstractmethod
121 | async def get_ip(self, name: str, storage: Optional[str] = None, retry_delay: int = 2) -> str:
122 | """Get the IP address of a VM, waiting indefinitely until it's available.
123 |
124 | Args:
125 | name: Name of the VM to get the IP for
126 | storage: Optional storage path override. If provided, this will be used
127 | instead of the provider's default storage path.
128 | retry_delay: Delay between retries in seconds (default: 2)
129 |
130 | Returns:
131 | IP address of the VM when it becomes available
132 | """
133 | pass
134 |
```
--------------------------------------------------------------------------------
/blog/cua-hackathon.md:
--------------------------------------------------------------------------------
```markdown
1 | # Computer-Use Agents SOTA Challenge: Hack the North + Global Online
2 |
3 | _Published on August 25, 2025 by Francesco Bonacci_
4 |
5 | We’re bringing something new to [Hack the North](https://hackthenorth.com), Canada’s largest hackathon, this year: a head-to-head competition for **Computer-Use Agents** - on-site at Waterloo and a **Global online challenge**. From September 12–14, 2025, teams build on the **Cua Agent Framework** and are scored in **HUD’s OSWorld-Verified** environment to push past today’s SOTA on [OS-World](https://os-world.github.io).
6 |
7 | <img src="./assets/hack-the-north.png">
8 |
9 | ## Track A: On-site @ Hack the North
10 |
11 | There’s one global leaderboard: **Cua - Best State-of-the-Art Computer-Use Agent**. Use any model setup you like (cloud or local). After projects are submitted, [HUD](https://www.hud.so) runs the official benchmark; the top team earns a **guaranteed YC partner interview (W26 batch)**. We’ll also feature winners on our blog and socials and kit the team out with swag.
12 |
13 | ## Track B: Cua Global Online Hackathon
14 |
15 | **Cua** and [**Ollama**](https://ollama.com) are organizing a global hackathon to find the **most creative uses of local and hybrid computer-use agents**. There are no geographic restrictions on who can join — this is a worldwide competition focused on **originality, impact, and inventive applications** that showcase what's possible with local and hybrid inference.
16 |
17 | **Prizes:**
18 |
19 | - 1st **MacBook Air M4 (or equivalent value)** + features in Cua & Ollama channels
20 | - 2nd **$500 CAD + swag**
21 | - 3rd **swag + public feature**
22 |
23 | ---
24 |
25 | ## How it works
26 |
27 | Two different tracks, two different processes:
28 |
29 | ### On-site (Track A)
30 |
31 | Build during the weekend and submit a repo with a one-line start command. **HUD** executes your command in a clean environment and runs **OSWorld-Verified**. Scores come from official benchmark results; ties break by median, then wall-clock time, then earliest submission. Any model setup is allowed (cloud or local).
32 |
33 | **HUD** runs official evaluations immediately after submission. Winners are announced at the **closing ceremony**.
34 |
35 | ### Rules
36 |
37 | - Fork and star the [Cua repo](https://github.com/trycua/cua).
38 | - Add your agent and instructions in `samples/community/hack-the-north/<YOUR_TEAM_NAME>`.
39 | - Include a README with details on the approach and any required notes.
40 | - Submit a PR.
41 |
42 | **Deadline: Sept 15, 8:00 AM EDT**
43 |
44 | ### Global Online (Track B)
45 |
46 | Open to anyone, anywhere. Build on your own timeline and submit through the **Cua Discord form** by the deadline.
47 |
48 | **Project Requirements:**
49 |
50 | - Your agent must integrate **Cua and Ollama** in some way
51 | - Your agent must be **easily runnable by judges**
52 |
53 | Judged by **Cua** and **Ollama** teams on:
54 |
55 | - **Creativity (30%)** – originality, usefulness, surprise factor
56 | - **Technical Depth (30%)** – quality of engineering and agent design
57 | - **Use of Ollama (30%)** – effective integration of local/hybrid inference
58 | - **Polish (10%)** – presentation, clarity, demo readiness
59 |
60 | ### Submission Process
61 |
62 | Submissions will be collected via a **form link provided in the Cua Discord**. Your submission must contain:
63 |
64 | - **GitHub repo** containing the agent source code and a clear README with instructions on how to use the agent
65 | - **Explanation** of the models and tools used, and what's local or hybrid about your design
66 | - **Short demo video** (up to two minutes)
67 |
68 | A **commit freeze** will be used to ensure that no changes are made after the deadline. Winners will be announced after judging is complete.
69 |
70 | **Deadline: Sept 28, 11:59 PM UTC (extended due to popular demand!)**
71 |
72 | ---
73 |
74 | ## Join us
75 |
76 | Bring a team, pick a model stack, and push what agents can do on real computers. We can’t wait to see what you build at **Hack the North 2025**.
77 |
78 | **Discord channels**
79 |
80 | - Join the Discord first: https://discord.gg/cua-ai
81 | - **#hack-the-north (on-site):** https://discord.com/channels/1328377437301641247/1409508526774157342
82 | - **#global-online (Ollama × Cua):** https://discord.com/channels/1328377437301641247/1409518100491145226
83 |
84 | **Contact**
85 | Questions on Hack the North? Email **[email protected]**.
86 |
87 | _P.S. If you’re planning ahead, start with the Cua Agent Framework and OSWorld-Verified docs at cua.ai/docs; we’ll share office-hour times in both Discord channels._
88 |
```
--------------------------------------------------------------------------------
/.github/workflows/bump-version.yml:
--------------------------------------------------------------------------------
```yaml
1 | name: Bump Version & Publish
2 |
3 | on:
4 | workflow_dispatch:
5 | inputs:
6 | service:
7 | description: "Service/Package to bump"
8 | required: true
9 | type: choice
10 | options:
11 | - cua-agent
12 | - cua-computer
13 | - cua-computer-server
14 | - cua-core
15 | - cua-mcp-server
16 | - cua-som
17 | - pylume
18 | bump_type:
19 | description: "Version bump type"
20 | required: true
21 | type: choice
22 | options:
23 | - patch
24 | - minor
25 | - major
26 |
27 | permissions:
28 | contents: write
29 |
30 | jobs:
31 | bump-version:
32 | runs-on: ubuntu-latest
33 | outputs:
34 | agent_version: ${{ steps.agent_version.outputs.version }}
35 | computer_version: ${{ steps.computer_version.outputs.version }}
36 | steps:
37 | - name: Set package directory
38 | id: package
39 | run: |
40 | case "${{ inputs.service }}" in
41 | "cua-agent")
42 | echo "directory=libs/python/agent" >> $GITHUB_OUTPUT
43 | ;;
44 | "cua-computer")
45 | echo "directory=libs/python/computer" >> $GITHUB_OUTPUT
46 | ;;
47 | "cua-computer-server")
48 | echo "directory=libs/python/computer-server" >> $GITHUB_OUTPUT
49 | ;;
50 | "cua-core")
51 | echo "directory=libs/python/core" >> $GITHUB_OUTPUT
52 | ;;
53 | "cua-mcp-server")
54 | echo "directory=libs/python/mcp-server" >> $GITHUB_OUTPUT
55 | ;;
56 | "cua-som")
57 | echo "directory=libs/python/som" >> $GITHUB_OUTPUT
58 | ;;
59 | "pylume")
60 | echo "directory=libs/python/pylume" >> $GITHUB_OUTPUT
61 | ;;
62 | *)
63 | echo "Unknown service: ${{ inputs.service }}"
64 | exit 1
65 | ;;
66 | esac
67 |
68 | - name: Checkout repository
69 | uses: actions/checkout@v4
70 | with:
71 | fetch-depth: 0
72 | token: ${{ secrets.GITHUB_TOKEN }}
73 |
74 | - name: Set up Python
75 | uses: actions/setup-python@v5
76 | with:
77 | python-version: "3.11"
78 |
79 | - name: Install bump2version
80 | run: pip install bump2version
81 |
82 | - name: Configure Git
83 | run: |
84 | git config user.name "github-actions[bot]"
85 | git config user.email "github-actions[bot]@users.noreply.github.com"
86 |
87 | - name: Run bump2version
88 | run: |
89 | cd ${{ steps.package.outputs.directory }}
90 | bump2version ${{ inputs.bump_type }}
91 |
92 | - name: Also bump cua-agent
93 | if: ${{ inputs.service == 'cua-computer' }}
94 | run: |
95 | cd libs/python/agent
96 | bump2version ${{ inputs.bump_type }}
97 |
98 | - name: Capture bumped agent version
99 | if: ${{ inputs.service == 'cua-agent' || inputs.service == 'cua-computer' }}
100 | id: agent_version
101 | run: |
102 | cd libs/python/agent
103 | VERSION=$(python -c "import tomllib; from pathlib import Path; data = tomllib.loads(Path('pyproject.toml').read_text()); print(data['project']['version'])")
104 | echo "Agent version: $VERSION"
105 | echo "version=$VERSION" >> "$GITHUB_OUTPUT"
106 |
107 | - name: Capture bumped computer version
108 | if: ${{ inputs.service == 'cua-computer' }}
109 | id: computer_version
110 | run: |
111 | cd libs/python/computer
112 | VERSION=$(python -c "import tomllib; from pathlib import Path; data = tomllib.loads(Path('pyproject.toml').read_text()); print(data['project']['version'])")
113 | echo "Computer version: $VERSION"
114 | echo "version=$VERSION" >> "$GITHUB_OUTPUT"
115 |
116 | - name: Push changes
117 | run: |
118 | git push origin main --follow-tags
119 |
120 | publish-computer:
121 | needs: bump-version
122 | if: ${{ inputs.service == 'cua-computer' }}
123 | uses: ./.github/workflows/pypi-publish-computer.yml
124 | with:
125 | version: ${{ needs.bump-version.outputs.computer_version }}
126 | secrets: inherit
127 |
128 | publish-agent:
129 | needs: [bump-version, publish-computer]
130 | if: ${{ always() && (inputs.service == 'cua-agent' || inputs.service == 'cua-computer') && needs.bump-version.result == 'success' && (inputs.service == 'cua-agent' || needs.publish-computer.result == 'success') }}
131 | uses: ./.github/workflows/pypi-publish-agent.yml
132 | with:
133 | version: ${{ needs.bump-version.outputs.agent_version }}
134 | secrets: inherit
135 |
```
--------------------------------------------------------------------------------
/examples/computer_examples.py:
--------------------------------------------------------------------------------
```python
1 | import asyncio
2 | import os
3 | import sys
4 | import traceback
5 | from pathlib import Path
6 |
7 | # Load environment variables from .env file
8 | project_root = Path(__file__).parent.parent
9 | env_file = project_root / ".env"
10 | print(f"Loading environment from: {env_file}")
11 | from dotenv import load_dotenv
12 |
13 | load_dotenv(env_file)
14 |
15 | # Add paths to sys.path if needed
16 | pythonpath = os.environ.get("PYTHONPATH", "")
17 | for path in pythonpath.split(":"):
18 | if path and path not in sys.path:
19 | sys.path.insert(0, path) # Insert at beginning to prioritize
20 | print(f"Added to sys.path: {path}")
21 |
22 | from computer.computer import Computer
23 | from computer.logger import LogLevel
24 | from computer.providers.base import VMProviderType
25 |
26 |
27 | async def main():
28 | try:
29 | print("\n=== Using direct initialization ===")
30 |
31 | # Create a local macOS computer
32 | computer = Computer(
33 | display="1024x768",
34 | memory="8GB",
35 | cpu="4",
36 | os_type="macos",
37 | name="macos",
38 | verbosity=LogLevel.VERBOSE,
39 | provider_type=VMProviderType.LUME,
40 | storage="/Users/<USER>/repos/trycua/computer/examples/storage",
41 | shared_directories=["/Users/<USER>/repos/trycua/computer/examples/shared"],
42 | ephemeral=False,
43 | )
44 |
45 | # Create a remote Linux computer with Cua
46 | # computer = Computer(
47 | # os_type="linux",
48 | # api_key=os.getenv("CUA_API_KEY"),
49 | # name=os.getenv("CONTAINER_NAME"),
50 | # provider_type=VMProviderType.CLOUD,
51 | # )
52 |
53 | try:
54 | # Run the computer with default parameters
55 | await computer.run()
56 |
57 | screenshot = await computer.interface.screenshot()
58 |
59 | # Create output directory if it doesn't exist
60 | output_dir = Path("./output")
61 | output_dir.mkdir(exist_ok=True)
62 |
63 | screenshot_path = output_dir / "screenshot.png"
64 | with open(screenshot_path, "wb") as f:
65 | f.write(screenshot)
66 | print(f"Screenshot saved to: {screenshot_path.absolute()}")
67 |
68 | # await computer.interface.hotkey("command", "space")
69 |
70 | # res = await computer.interface.run_command("touch ./Downloads/empty_file")
71 | # print(f"Run command result: {res}")
72 |
73 | accessibility_tree = await computer.interface.get_accessibility_tree()
74 | print(f"Accessibility tree: {accessibility_tree}")
75 |
76 | # Screen Actions Examples
77 | # print("\n=== Screen Actions ===")
78 | # screenshot = await computer.interface.screenshot()
79 | # with open("screenshot_direct.png", "wb") as f:
80 | # f.write(screenshot)
81 |
82 | screen_size = await computer.interface.get_screen_size()
83 | print(f"Screen size: {screen_size}")
84 |
85 | # Demonstrate coordinate conversion
86 | center_x, center_y = 733, 736
87 | print(f"Center in screen coordinates: ({center_x}, {center_y})")
88 |
89 | screenshot_center = await computer.to_screenshot_coordinates(center_x, center_y)
90 | print(f"Center in screenshot coordinates: {screenshot_center}")
91 |
92 | screen_center = await computer.to_screen_coordinates(*screenshot_center)
93 | print(f"Back to screen coordinates: {screen_center}")
94 |
95 | # Mouse Actions Examples
96 | print("\n=== Mouse Actions ===")
97 | await computer.interface.move_cursor(100, 100)
98 | await computer.interface.left_click()
99 | await computer.interface.right_click(300, 300)
100 | await computer.interface.double_click(400, 400)
101 |
102 | # Keyboard Actions Examples
103 | print("\n=== Keyboard Actions ===")
104 | await computer.interface.type_text("Hello, World!")
105 | await computer.interface.press_key("enter")
106 |
107 | # Clipboard Actions Examples
108 | print("\n=== Clipboard Actions ===")
109 | await computer.interface.set_clipboard("Test clipboard")
110 | content = await computer.interface.copy_to_clipboard()
111 | print(f"Clipboard content: {content}")
112 |
113 | finally:
114 | # Important to clean up resources
115 | await computer.stop()
116 | except Exception as e:
117 | print(f"Error in main: {e}")
118 | traceback.print_exc()
119 |
120 |
121 | if __name__ == "__main__":
122 | asyncio.run(main())
123 |
```
--------------------------------------------------------------------------------
/libs/lume/src/Virtualization/DHCPLeaseParser.swift:
--------------------------------------------------------------------------------
```swift
1 | import Foundation
2 |
3 | /// Represents a DHCP lease entry from the system's DHCP lease file
4 | private struct DHCPLease {
5 | let macAddress: String
6 | let ipAddress: String
7 | let expirationDate: Date
8 |
9 | /// Creates a lease entry from raw DHCP lease file key-value pairs
10 | /// - Parameter dict: Dictionary containing the raw lease data
11 | /// - Returns: A DHCPLease instance if the data is valid, nil otherwise
12 | static func from(_ dict: [String: String]) -> DHCPLease? {
13 | guard let hwAddress = dict["hw_address"],
14 | let ipAddress = dict["ip_address"],
15 | let lease = dict["lease"] else {
16 | return nil
17 | }
18 |
19 | // Parse MAC address from hw_address field (format can be "1,xx:xx:xx:xx:xx:xx" or "ff,...")
20 | let hwParts = hwAddress.split(separator: ",")
21 | guard hwParts.count >= 2 else { return nil }
22 |
23 | // Get the MAC part after the prefix and normalize it
24 | let rawMacAddress = String(hwParts[1]).trimmingCharacters(in: .whitespaces)
25 |
26 | // Normalize the MAC address by ensuring each component is two digits
27 | let normalizedMacAddress = rawMacAddress.split(separator: ":")
28 | .map { component in
29 | let hex = String(component)
30 | return hex.count == 1 ? "0\(hex)" : hex
31 | }
32 | .joined(separator: ":")
33 |
34 | // Convert hex timestamp to Date
35 | let timestampHex = lease.trimmingCharacters(in: CharacterSet(charactersIn: "0x"))
36 | guard let timestamp = UInt64(timestampHex, radix: 16) else { return nil }
37 | let expirationDate = Date(timeIntervalSince1970: TimeInterval(timestamp))
38 |
39 | return DHCPLease(
40 | macAddress: normalizedMacAddress,
41 | ipAddress: ipAddress,
42 | expirationDate: expirationDate
43 | )
44 | }
45 |
46 | /// Checks if the lease is currently valid
47 | var isValid: Bool {
48 | expirationDate > Date()
49 | }
50 | }
51 |
52 | /// Parses DHCP lease files to retrieve IP addresses for VMs based on their MAC addresses
53 | enum DHCPLeaseParser {
54 | private static let leasePath = "/var/db/dhcpd_leases"
55 |
56 | /// Retrieves the IP address for a given MAC address from the DHCP lease file
57 | /// - Parameter macAddress: The MAC address to look up
58 | /// - Returns: The IP address if found, nil otherwise
59 | static func getIPAddress(forMAC macAddress: String) -> String? {
60 | guard let leaseContents = try? String(contentsOfFile: leasePath, encoding: .utf8) else {
61 | return nil
62 | }
63 |
64 | // Normalize the input MAC address to ensure consistent format
65 | let normalizedMacAddress = macAddress.split(separator: ":").map { component in
66 | let hex = String(component)
67 | return hex.count == 1 ? "0\(hex)" : hex
68 | }.joined(separator: ":")
69 |
70 | let leases = try? parseDHCPLeases(leaseContents)
71 | return leases?.first { lease in
72 | lease.macAddress == normalizedMacAddress
73 | }?.ipAddress
74 | }
75 |
76 | /// Parses the contents of a DHCP lease file into lease entries
77 | /// - Parameter contents: The raw contents of the lease file
78 | /// - Returns: Array of parsed lease entries
79 | private static func parseDHCPLeases(_ contents: String) throws -> [DHCPLease] {
80 | var leases: [DHCPLease] = []
81 | var currentLease: [String: String] = [:]
82 | var inLeaseBlock = false
83 |
84 | let lines = contents.components(separatedBy: .newlines)
85 |
86 | for line in lines {
87 | let trimmedLine = line.trimmingCharacters(in: .whitespaces)
88 |
89 | if trimmedLine == "{" {
90 | inLeaseBlock = true
91 | currentLease = [:]
92 | } else if trimmedLine == "}" {
93 | if let lease = DHCPLease.from(currentLease) {
94 | leases.append(lease)
95 | }
96 | inLeaseBlock = false
97 | } else if inLeaseBlock {
98 | let parts = trimmedLine.split(separator: "=", maxSplits: 1)
99 | if parts.count == 2 {
100 | let key = String(parts[0]).trimmingCharacters(in: .whitespaces)
101 | let value = String(parts[1]).trimmingCharacters(in: .whitespaces)
102 | currentLease[key] = value
103 | }
104 | }
105 | }
106 |
107 | return leases
108 | }
109 | }
```
--------------------------------------------------------------------------------
/blog/trajectory-viewer.md:
--------------------------------------------------------------------------------
```markdown
1 | # Trajectory Viewer for Cua
2 |
3 | _Published on May 13, 2025 by Dillon DuPont_
4 |
5 | Don’t forget to check out [Part 1: Building your own Computer-Use Operator](build-your-own-operator-on-macos-1) and [Part 2: Using the Agent framework](build-your-own-operator-on-macos-2) for setting up your Cua environment and basic tips and tricks!
6 |
7 | ## Introduction
8 |
9 | Okay, so you’ve gotten your environment up and also tested a few agent runs. You’ll likely have encountered cases where your agent was successful at doing some tasks but also places where it got stuck or outright failed.
10 | Now what?
11 | If you’ve ever wondered exactly what your computer agent is doing and why it sometimes doesn’t do what you expected, then the Trajectory Viewer for Cua is here to help! Whether you’re a seasoned developer or someone who just wants to dive in and see results, this tool makes it easy to explore every step your agent takes on your screen.
12 | Plus, if you want to start thinking about generating data to train your own agentic model (we’ll cover training in an upcoming blog, so look forward to it), then our Trajectory Viewer might be for you.
13 |
14 | ## So, what’s a “trajectory”?
15 |
16 | Think of a trajectory as a detailed video recording of your agent’s journey:
17 |
18 | - **Observations**: What did the agent see (the exact screen content) at each point in time?
19 | - **Actions**: What clicks, keystrokes, or commands did it perform in response?
20 | - **Decisions**: Which options did it choose, and why?
21 | Especially for longer and more complex tasks, your agent will make multiple steps, take multiple actions, and make multiple observations. By examining this record, you can pinpoint where things go right, and more importantly, where they go wrong.
22 |
23 | ## So, what’s Cua’s Trajectory Viewer and why use it?
24 |
25 | The Trajectory Viewer for Cua is a GUI tool that helps you explore saved trajectories generated from your Cua computer agent runs. This tool provides a powerful way to:
26 |
27 | - **Debug your agents**: See exactly what your agent saw to reproduce bugs
28 | - **Analyze failure cases**: Identify the moment when your agent went off-script
29 | - **Collect training data**: Export your trajectories for your own processing, training, and more!
30 |
31 | The viewer allows you to see exactly what your agent observed and how it interacted with the computer all through your browser.
32 |
33 | ## Opening Trajectory Viewer in 3 Simple Steps
34 |
35 | 1. **Visit**: Open your browser and go to [https://cua.ai/trajectory-viewer](https://cua.ai/trajectory-viewer).
36 | 2. **Upload**: Drag and drop a trajectories folder or click Select Folder.
37 | 3. **Explore**: View your agent’s trajectories! All data stays in your browser unless you give permission otherwise.
38 |
39 | 
40 |
41 | ## Recording a Trajectory
42 |
43 | ### Using the ComputerAgent API
44 |
45 | Trajectories are saved by default when using the ComputerAgent API:
46 |
47 | ```python
48 | agent.run("book a flight for me")
49 | ```
50 |
51 | You can explicitly control trajectory saving with the `save_trajectory` parameter:
52 |
53 | ```python
54 | from cua import ComputerAgent
55 |
56 | agent = ComputerAgent(save_trajectory=True)
57 | agent.run("search for hotels in Boston")
58 | ```
59 |
60 | Each trajectory folder is saved in a `trajectories` directory with a timestamp format, for example: `trajectories/20250501_222749`
61 |
62 | ## Exploring and Analyzing Trajectories
63 |
64 | Our Trajectory Viewer is designed to allow for thorough analysis and debugging in a friendly way. Once loaded, the viewer presents:
65 |
66 | - **Timeline Slider**: Jump to any step in the session
67 | - **Screen Preview**: See exactly what the agent saw
68 | - **Action Details**: Review clicks, keypresses, and API calls
69 | - **Logs & Metadata**: Inspect debug logs or performance stats
70 |
71 | Use these features to:
72 |
73 | - Step through each action and observation; understand your agent’s decision-making
74 | - Understand why and where your agent failed
75 | - Collect insights for improving your instructions, prompts, tasks, agent, etc.
76 |
77 | The trajectory viewer provides a visual interface for stepping through each action your agent took, making it easy to see what your agent “sees”.
78 |
79 | ## Getting Started
80 |
81 | Ready to see your agent in action? Head over to the Trajectory Viewer and load up your first session. Debug smarter, train faster, and stay in control (all within your browser).
82 |
83 | Happy tinkering and Cua on!
84 |
85 | Have questions or want to share feedback? Join our community on Discord or open an issue on GitHub.
86 |
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/mcp-server/installation.mdx:
--------------------------------------------------------------------------------
```markdown
1 | ---
2 | title: Installation
3 | ---
4 |
5 | Install the package from PyPI:
6 |
7 | ```bash
8 | pip install cua-mcp-server
9 | ```
10 |
11 | This will install:
12 |
13 | - The MCP server
14 | - CUA agent and computer dependencies
15 | - An executable `cua-mcp-server` script in your PATH
16 |
17 | ## Easy Setup Script
18 |
19 | If you want to simplify installation, you can use this one-liner to download and run the installation script:
20 |
21 | ```bash
22 | curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/python/mcp-server/scripts/install_mcp_server.sh | bash
23 | ```
24 |
25 | This script will:
26 |
27 | - Create the ~/.cua directory if it doesn't exist
28 | - Generate a startup script at ~/.cua/start_mcp_server.sh
29 | - Make the script executable
30 | - The startup script automatically manages Python virtual environments and installs/updates the cua-mcp-server package
31 |
32 | You can then use the script in your MCP configuration like this:
33 |
34 | ```json
35 | {
36 | "mcpServers": {
37 | "cua-agent": {
38 | "command": "/bin/bash",
39 | "args": ["~/.cua/start_mcp_server.sh"],
40 | "env": {
41 | "CUA_MODEL_NAME": "anthropic/claude-sonnet-4-20250514",
42 | "ANTHROPIC_API_KEY": "your-anthropic-api-key-here"
43 | }
44 | }
45 | }
46 | }
47 | ```
48 |
49 | **Important**: You must include your Anthropic API key for the MCP server to work properly.
50 |
51 | ## Development Setup
52 |
53 | If you're working with the CUA source code directly (like in the CUA repository), you can use the development script instead:
54 |
55 | ```json
56 | {
57 | "mcpServers": {
58 | "cua-agent": {
59 | "command": "/usr/bin/env",
60 | "args": [
61 | "bash",
62 | "-lc",
63 | "export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
64 | ]
65 | }
66 | }
67 | }
68 | ```
69 |
70 | **For host computer control** (development setup):
71 |
72 | 1. **Install Computer Server Dependencies**:
73 |
74 | ```bash
75 | python3 -m pip install uvicorn fastapi
76 | python3 -m pip install -e libs/python/computer-server --break-system-packages
77 | ```
78 |
79 | 2. **Start the Computer Server**:
80 |
81 | ```bash
82 | cd /path/to/cua
83 | python -m computer_server --log-level debug
84 | ```
85 |
86 | This will start the computer server on `http://localhost:8000` that controls your actual desktop.
87 |
88 | 3. **Configure Claude Desktop**:
89 | ```json
90 | {
91 | "mcpServers": {
92 | "cua-agent": {
93 | "command": "/usr/bin/env",
94 | "args": [
95 | "bash",
96 | "-lc",
97 | "export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; export CUA_USE_HOST_COMPUTER_SERVER='true'; export CUA_MAX_IMAGES='1'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
98 | ]
99 | }
100 | }
101 | }
102 | ```
103 |
104 | **Note**: Replace `/path/to/cua` with the absolute path to your CUA repository directory.
105 |
106 | **⚠️ Important**: When using host computer control (`CUA_USE_HOST_COMPUTER_SERVER='true'`), the AI will have direct access to your desktop and can perform actions like opening applications, clicking, typing, and taking screenshots. Make sure you're comfortable with this level of access.
107 |
108 | ### Troubleshooting
109 |
110 | **Common Issues:**
111 |
112 | 1. **"Claude's response was interrupted"** - This usually means:
113 | - Missing API key: Add `ANTHROPIC_API_KEY` to your environment variables
114 | - Invalid model name: Use a valid model like `anthropic/claude-sonnet-4-20250514`
115 | - Check logs for specific error messages
116 |
117 | 2. **"Missing Anthropic API Key"** - Add your API key to the configuration:
118 |
119 | ```json
120 | "env": {
121 | "ANTHROPIC_API_KEY": "your-api-key-here"
122 | }
123 | ```
124 |
125 | 3. **"model not found"** - Use a valid model name:
126 | - ✅ `anthropic/claude-sonnet-4-20250514`
127 |
128 | 4. **Script not found** - If you get a `/bin/bash: ~/cua/libs/python/mcp-server/scripts/start_mcp_server.sh: No such file or directory` error, try changing the path to the script to be absolute instead of relative.
129 |
130 | 5. **Host Computer Control Issues** - If using `CUA_USE_HOST_COMPUTER_SERVER='true'`:
131 | - **Computer Server not running**: Make sure you've started the computer server with `python -m computer_server --log-level debug`
132 | - **Port 8000 in use**: Check if another process is using port 8000 with `lsof -i :8000`
133 | - **Missing dependencies**: Install `uvicorn` and `fastapi` with `python3 -m pip install uvicorn fastapi`
134 | - **Image size errors**: Use `CUA_MAX_IMAGES='1'` to reduce image context size
135 |
136 | **Viewing Logs:**
137 |
138 | ```bash
139 | tail -n 20 -f ~/Library/Logs/Claude/mcp*.log
140 | ```
141 |
```
--------------------------------------------------------------------------------
/docs/content/docs/macos-vm-cli-playbook/lumier/docker.mdx:
--------------------------------------------------------------------------------
```markdown
1 | ---
2 | title: Docker
3 | ---
4 |
5 | You can use Lumier through Docker:
6 |
7 | ### Run a macOS VM (ephemeral)
8 |
9 | ```bash
10 | # Run the container with temporary storage (using pre-built image from Docker Hub)
11 | docker run -it --rm \
12 | --name macos-vm \
13 | -p 8006:8006 \
14 | -e VM_NAME=macos-vm \
15 | -e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \
16 | -e CPU_CORES=4 \
17 | -e RAM_SIZE=8192 \
18 | trycua/lumier:latest
19 | ```
20 |
21 | Access the VM in your browser at **http://localhost:8006**.
22 |
23 | After running the command above, you can access your macOS VM through a web browser (e.g., http://localhost:8006).
24 |
25 | <Callout title="Note">
26 | With the basic setup above, your VM will be reset when you stop the container (ephemeral mode).
27 | This means any changes you make inside the macOS VM will be lost. See the section below for how to
28 | save your VM state.
29 | </Callout>
30 |
31 | ## Saving Your VM State
32 |
33 | To save your VM state between sessions (so your changes persist when you stop and restart the container), you'll need to set up a storage location:
34 |
35 | ```bash
36 | # First, create a storage directory if it doesn't exist
37 | mkdir -p storage
38 |
39 | # Then run the container with persistent storage
40 | docker run -it --rm \
41 | --name lumier-vm \
42 | -p 8006:8006 \
43 | -v $(pwd)/storage:/storage \
44 | -e VM_NAME=lumier-vm \
45 | -e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \
46 | -e CPU_CORES=4 \
47 | -e RAM_SIZE=8192 \
48 | -e HOST_STORAGE_PATH=$(pwd)/storage \
49 | trycua/lumier:latest
50 | ```
51 |
52 | This command creates a connection between a folder on your Mac (`$(pwd)/storage`) and a folder inside the Docker container (`/storage`). The `-v` flag (volume mount) and the `HOST_STORAGE_PATH` variable work together to ensure your VM data is saved on your host Mac.
53 |
54 | ## Sharing Files with Your VM
55 |
56 | To share files between your Mac and the virtual machine, you can set up a shared folder:
57 |
58 | ```bash
59 | # Create both storage and shared folders
60 | mkdir -p storage shared
61 |
62 | # Run with both persistent storage and a shared folder
63 | docker run -it --rm \
64 | --name lumier-vm \
65 | -p 8006:8006 \
66 | -v $(pwd)/storage:/storage \
67 | -v $(pwd)/shared:/shared \
68 | -e VM_NAME=lumier-vm \
69 | -e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \
70 | -e CPU_CORES=4 \
71 | -e RAM_SIZE=8192 \
72 | -e HOST_STORAGE_PATH=$(pwd)/storage \
73 | -e HOST_SHARED_PATH=$(pwd)/shared \
74 | trycua/lumier:latest
75 | ```
76 |
77 | With this setup, any files you place in the `shared` folder on your Mac will be accessible from within the macOS VM, and vice versa.
78 |
79 | ## Automating VM Startup with on-logon.sh
80 |
81 | You can automatically run scripts when the VM starts up by placing an `on-logon.sh` script in the shared folder's lifecycle directory. This is useful for setting up your VM environment each time it starts.
82 |
83 | ```bash
84 | # Create the lifecycle directory in your shared folder
85 | mkdir -p shared/lifecycle
86 |
87 | # Create a sample on-logon.sh script
88 | cat > shared/lifecycle/on-logon.sh << 'EOF'
89 | #!/usr/bin/env bash
90 |
91 | # Create a file on the desktop
92 | echo "Hello from Lumier!" > /Users/lume/Desktop/hello_lume.txt
93 |
94 | # You can add more commands to execute at VM startup
95 | # For example:
96 | # - Configure environment variables
97 | # - Start applications
98 | # - Mount network drives
99 | # - Set up development environments
100 | EOF
101 |
102 | # Make the script executable
103 | chmod +x shared/lifecycle/on-logon.sh
104 | ```
105 |
106 | The script will be automatically executed when the VM starts up. It runs in the VM context and has access to:
107 |
108 | - The `/Users/lume` user directory (home directory in the VM)
109 | - The shared folder at `/Volumes/My Shared Files` inside the VM
110 | - Any resources available to the VM
111 |
112 | This feature enables automation of VM setup without modifying the base VM image.
113 |
114 | ## Configuration Options
115 |
116 | When running Lumier, you'll need to configure a few things:
117 |
118 | - **Port forwarding** (`-p 8006:8006`): Makes the VM's VNC interface accessible in your browser. If port 8006 is already in use, you can use a different port like `-p 8007:8006`.
119 |
120 | - **Environment variables** (`-e`): Configure your VM settings:
121 | - `VM_NAME`: A name for your virtual machine
122 | - `VERSION`: The macOS image to use
123 | - `CPU_CORES`: Number of CPU cores to allocate
124 | - `RAM_SIZE`: Memory in MB to allocate
125 | - `HOST_STORAGE_PATH`: Path to save VM state (when using persistent storage)
126 | - `HOST_SHARED_PATH`: Path to the shared folder (optional)
127 |
128 | - **Background service**: The `lume serve` service should be running on your host (it starts automatically when you install Lume with the `install.sh` script from the Lume installation guide).
129 |
```
--------------------------------------------------------------------------------
/libs/typescript/agent/src/types.ts:
--------------------------------------------------------------------------------
```typescript
// #region Request
/** Transport used to reach the agent backend. */
export type ConnectionType = 'http' | 'https' | 'peer';
export interface AgentClientOptions {
  /** Per-request timeout; units defined by the client implementation (presumably ms — confirm). */
  timeout?: number;
  /** Number of times a failed request is retried. */
  retries?: number;
  /** Optional CUA API key to send as X-API-Key header for HTTP requests */
  apiKey?: string;
}
// Request types matching the Python proxy API
export interface AgentRequest {
  /** Model identifier string forwarded to the backend. */
  model: string;
  /** A single prompt string, or a full message history. */
  input: string | AgentMessage[];
  /** Keyword arguments forwarded to the backend agent. */
  agent_kwargs?: {
    save_trajectory?: boolean;
    verbosity?: number;
    [key: string]: any;
  };
  /** Keyword arguments forwarded to the backend computer/VM. */
  computer_kwargs?: {
    os_type?: string;
    provider_type?: string;
    [key: string]: any;
  };
  /**
   * Optional per-request environment variable overrides.
   * Keys and values are strings and will be forwarded to the backend proxy.
   */
  env?: Record<string, string>;
}
// #endregion

// #region Response
// Response types
export interface AgentResponse {
  /** Ordered output messages produced during the run. */
  output: AgentMessage[];
  usage: Usage;
  status: 'completed' | 'failed';
  /** Present only when status is 'failed'. */
  error?: string;
}
// Usage information
export interface Usage {
  prompt_tokens: number;
  completion_tokens: number;
  total_tokens: number;
  /** Cost of the response (presumably USD — confirm with backend). */
  response_cost: number;
}
// #endregion

// #region Messages
// Agent message types - can be one of several different message types
export type AgentMessage =
  | UserMessage
  | AssistantMessage
  | ReasoningMessage
  | ComputerCallMessage
  | ComputerCallOutputMessage
  | FunctionCallMessage
  | FunctionCallOutputMessage;
// Input message
export interface UserMessage {
  type?: 'message';
  role: 'user' | 'system' | 'developer';
  content: string | InputContent[];
}
// Output message
export interface AssistantMessage {
  type: 'message';
  role: 'assistant';
  content: OutputContent[];
}
// Output reasoning/thinking message
export interface ReasoningMessage {
  type: 'reasoning';
  summary: SummaryContent[];
}
// Output computer action call
export interface ComputerCallMessage {
  type: 'computer_call';
  /** Correlates this call with its ComputerCallOutputMessage. */
  call_id: string;
  status: 'completed' | 'failed' | 'pending';
  action: ComputerAction;
}
// Output computer action result (always a screenshot)
export interface ComputerCallOutputMessage {
  type: 'computer_call_output';
  call_id: string;
  output: ComputerResultContent;
}
// Output function call
export interface FunctionCallMessage {
  type: 'function_call';
  /** Correlates this call with its FunctionCallOutputMessage. */
  call_id: string;
  status: 'completed' | 'failed' | 'pending';
  name: string;
  arguments: string; // JSON dict of kwargs
}
// Output function call result (always text)
export interface FunctionCallOutputMessage {
  type: 'function_call_output';
  call_id: string;
  output: string;
}
// #endregion

// #region Message Content
export interface InputContent {
  type: 'input_image' | 'input_text';
  /** Set when type is 'input_text'. */
  text?: string;
  /** Set when type is 'input_image'. */
  image_url?: string;
}
export interface OutputContent {
  type: 'output_text';
  text: string;
}
export interface SummaryContent {
  type: 'summary_text';
  text: string;
}
export interface ComputerResultContent {
  type: 'computer_screenshot' | 'input_image';
  image_url: string;
}
// #endregion

// #region Actions
export type ComputerAction = ComputerActionOpenAI | ComputerActionAnthropic;
// OpenAI Computer Actions
export type ComputerActionOpenAI =
  | ClickAction
  | DoubleClickAction
  | DragAction
  | KeyPressAction
  | MoveAction
  | ScreenshotAction
  | ScrollAction
  | TypeAction
  | WaitAction;
export interface ClickAction {
  type: 'click';
  button: 'left' | 'right' | 'wheel' | 'back' | 'forward';
  x: number;
  y: number;
}
export interface DoubleClickAction {
  type: 'double_click';
  button?: 'left' | 'right' | 'wheel' | 'back' | 'forward';
  x: number;
  y: number;
}
export interface DragAction {
  type: 'drag';
  button?: 'left' | 'right' | 'wheel' | 'back' | 'forward';
  /** Sequence of [x, y] waypoints the pointer moves through. */
  path: Array<[number, number]>;
}
export interface KeyPressAction {
  type: 'keypress';
  keys: string[];
}
export interface MoveAction {
  type: 'move';
  x: number;
  y: number;
}
export interface ScreenshotAction {
  type: 'screenshot';
}
export interface ScrollAction {
  type: 'scroll';
  scroll_x: number;
  scroll_y: number;
  x: number;
  y: number;
}
export interface TypeAction {
  type: 'type';
  text: string;
}
export interface WaitAction {
  type: 'wait';
}
// Anthropic Computer Actions
export type ComputerActionAnthropic = LeftMouseDownAction | LeftMouseUpAction;
export interface LeftMouseDownAction {
  type: 'left_mouse_down';
  x: number;
  y: number;
}
export interface LeftMouseUpAction {
  type: 'left_mouse_up';
  x: number;
  y: number;
}
// #endregion
193 |
```
--------------------------------------------------------------------------------
/libs/python/agent/example.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Example usage of the agent library with docstring-based tool definitions.
3 | """
4 |
5 | import asyncio
6 | import logging
7 |
8 | from agent import ComputerAgent
9 | from computer import Computer
10 | from computer.helpers import sandboxed
11 |
12 |
@sandboxed()
def read_file(location: str) -> str:
    """Read and return the contents of a file.

    Parameters
    ----------
    location : str
        Path to the file to read

    Returns
    -------
    str
        Contents of the file, or an error message if reading fails
    """
    try:
        # Read the whole file in one go; the context manager closes the handle.
        with open(location, "r") as handle:
            contents = handle.read()
    except Exception as exc:
        return f"Error reading file: {str(exc)}"
    return contents
32 |
33 |
def save_note(content: str, filename: str = "note.txt") -> str:
    """Save content to a note file.

    Parameters
    ----------
    content : str
        Content to save to the file
    filename : str, optional
        Name of the file to save to (default is "note.txt")

    Returns
    -------
    str
        Success or error message
    """
    try:
        # Overwrites any existing file at `filename`.
        with open(filename, "w") as f:
            f.write(content)
        # Fix: the success message previously rendered a literal placeholder
        # instead of interpolating the target filename.
        return f"Saved note to {filename}"
    except Exception as e:
        return f"Error saving note: {str(e)}"
55 |
56 |
def calculate(a: int, b: int) -> int:
    """Return the sum of two integers.

    Parameters
    ----------
    a : int
        First integer
    b : int
        Second integer

    Returns
    -------
    int
        Sum of the two integers
    """
    total = a + b
    return total
73 |
74 |
async def main():
    """Interactive example: drive a ComputerAgent against a cloud Linux VM.

    Reads a user prompt in a loop, runs the agent on the accumulated
    conversation history, and appends the agent's output back into the
    history so context carries across turns.
    """

    # Example 1: Using Claude with computer and custom tools
    print("=== Example 1: Claude with Computer ===")

    import json  # NOTE(review): appears unused in this function — verify before removing
    import os

    import dotenv

    # Load CUA credentials from a local .env file into the process environment.
    dotenv.load_dotenv()

    assert os.getenv("CUA_CONTAINER_NAME") is not None, "CUA_CONTAINER_NAME is not set"
    assert os.getenv("CUA_API_KEY") is not None, "CUA_API_KEY is not set"

    # Connect to a cloud-hosted Linux container; the async context manager
    # tears the connection down when the block exits.
    async with Computer(
        os_type="linux",
        provider_type="cloud",
        name=os.getenv("CUA_CONTAINER_NAME") or "",
        api_key=os.getenv("CUA_API_KEY") or "",
    ) as computer:
        agent = ComputerAgent(
            # Supported models:
            # == OpenAI CUA (computer-use-preview) ==
            model="openai/computer-use-preview",
            # == Anthropic CUA (Claude > 3.5) ==
            # model="anthropic/claude-opus-4-20250514",
            # model="anthropic/claude-sonnet-4-20250514",
            # model="anthropic/claude-3-7-sonnet-20250219",
            # model="anthropic/claude-sonnet-4-5-20250929",
            # == UI-TARS ==
            # model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
            # TODO: add local mlx provider
            # model="mlx-community/UI-TARS-1.5-7B-6bit",
            # model="ollama_chat/0000/ui-tars-1.5-7b",
            # == Omniparser + Any LLM ==
            # model="omniparser+..."
            # model="omniparser+anthropic/claude-opus-4-20250514",
            tools=[computer],
            # Keep only the 3 most recent screenshots in context to bound cost.
            only_n_most_recent_images=3,
            verbosity=logging.INFO,
            trajectory_dir="trajectories",
            use_prompt_caching=True,
            # Hard budget cap: raise once $1.00 of spend is reached.
            max_trajectory_budget={
                "max_budget": 1.0,
                "raise_error": True,
                "reset_after_each_run": False,
            },
        )

        history = []
        while True:
            # Blocks waiting for the next user instruction.
            user_input = input("> ")
            history.append({"role": "user", "content": user_input})

            # Non-streaming usage
            async for result in agent.run(history, stream=False):
                # Feed the agent's output back into the conversation history.
                history += result["output"]

            # # Print output
            # for item in result["output"]:
            #     if item["type"] == "message":
            #         print(item["content"][0]["text"])
            #     elif item["type"] == "computer_call":
            #         action = item["action"]
            #         action_type = action["type"]
            #         action_args = {k: v for k, v in action.items() if k != "type"}
            #         print(f"{action_type}({action_args})")
            #     elif item["type"] == "function_call":
            #         action = item["name"]
            #         action_args = item["arguments"]
            #         print(f"{action}({action_args})")
            #     elif item["type"] == "function_call_output":
            #         print("===>", item["output"])


if __name__ == "__main__":
    asyncio.run(main())
154 |
```
--------------------------------------------------------------------------------
/libs/python/agent/benchmarks/contrib.md:
--------------------------------------------------------------------------------
```markdown
1 | # Contributing Reference Agent Implementations
2 |
3 | This guide explains how to add your own reference agent implementations to the benchmark system.
4 |
5 | ## Adding Reference Agent Implementations
6 |
7 | ### 1. Implement the ModelProtocol
8 |
9 | Create a new file in `models/` directory implementing the `ModelProtocol`:
10 |
11 | ```python
12 | from models.base import ModelProtocol
13 | from typing import Optional, Tuple
14 | from PIL import Image
15 |
16 | class YourModelName(ModelProtocol):
17 | def __init__(self, model_path: str):
18 | self.model_path = model_path
19 | self._model = None
20 |
21 | @property
22 | def model_name(self) -> str:
23 | return self.model_path
24 |
25 | async def load_model(self) -> None:
26 | """Load the model into memory."""
27 | # Your model loading logic here
28 | pass
29 |
30 | async def unload_model(self) -> None:
31 | """Unload the model from memory."""
32 | # Your model cleanup logic here
33 | pass
34 |
35 | async def predict_click(self, image: Image.Image, instruction: str) -> Optional[Tuple[int, int]]:
36 | """
37 | Predict click coordinates for the given image and instruction.
38 |
39 | Args:
40 | image: PIL Image to analyze
41 | instruction: Text instruction describing what to click
42 |
43 | Returns:
44 | Tuple of (x, y) coordinates or None if prediction fails
45 | """
46 | # Your prediction logic here
47 | return (x, y) # Return predicted coordinates
48 | ```
49 |
50 | ### 2. Register Your Model
51 |
52 | Add your model to the `get_available_models()` function in `utils.py`:
53 |
54 | ```python
55 | def get_available_models() -> List[Union[str, ModelProtocol]]:
56 | models = [
57 | # Computer Agent SDK providers
58 | "huggingface-local/HelloKKMe/GTA1-7B",
59 |
60 | # Reference implementations
61 | GTA1Model("HelloKKMe/GTA1-7B"),
62 | YourModelName("path/to/your/model"), # Add your model here
63 | ]
64 | return models
65 | ```
66 |
67 | ### 3. Test Your Implementation
68 |
69 | Before submitting, test your model with the interactive tool:
70 |
71 | ```bash
72 | python interactive.py
73 | ```
74 |
75 | This will help you verify that your model loads correctly and produces reasonable predictions.
76 |
77 | ## Example: Adding a New Model
78 |
79 | Here's a complete example of adding a hypothetical "MyVisionModel":
80 |
81 | 1. **Create `models/my_vision_model.py`:**
82 |
83 | ```python
84 | import torch
85 | from transformers import AutoModel, AutoProcessor
86 | from models.base import ModelProtocol
87 | from typing import Optional, Tuple
88 | from PIL import Image
89 |
90 | class MyVisionModel(ModelProtocol):
91 | def __init__(self, model_path: str):
92 | self.model_path = model_path
93 | self.model = None
94 | self.processor = None
95 |
96 | @property
97 | def model_name(self) -> str:
98 | return f"MyVisionModel({self.model_path})"
99 |
100 | async def load_model(self) -> None:
101 | """Load the model and processor."""
102 | self.processor = AutoProcessor.from_pretrained(self.model_path)
103 | self.model = AutoModel.from_pretrained(
104 | self.model_path,
105 | torch_dtype=torch.float16,
106 | device_map="auto"
107 | )
108 |
109 | async def unload_model(self) -> None:
110 | """Clean up model resources."""
111 | del self.model
112 | del self.processor
113 | self.model = None
114 | self.processor = None
115 | torch.cuda.empty_cache()
116 |
117 | async def predict_click(self, image: Image.Image, instruction: str) -> Optional[Tuple[int, int]]:
118 | """Predict click coordinates."""
119 | try:
120 | # Preprocess inputs
121 | inputs = self.processor(
122 | text=instruction,
123 | images=image,
124 | return_tensors="pt"
125 | )
126 |
127 | # Run inference
128 | with torch.no_grad():
129 | outputs = self.model(**inputs)
130 |
131 | # Extract coordinates (model-specific logic)
132 | x, y = self._extract_coordinates(outputs)
133 | return (int(x), int(y))
134 |
135 | except Exception as e:
136 | print(f"Prediction failed: {e}")
137 | return None
138 |
139 | def _extract_coordinates(self, outputs):
140 | """Extract x, y coordinates from model outputs."""
141 | # Your model-specific coordinate extraction logic
142 | pass
143 | ```
144 |
145 | 2. **Update `models/__init__.py`:**
146 |
147 | ```python
148 | from .gta1 import GTA1Model
149 | from .my_vision_model import MyVisionModel
150 |
151 | __all__ = ["GTA1Model", "MyVisionModel"]
152 | ```
153 |
154 | 3. **Update `utils.py`:**
155 |
156 | ```python
157 | from models import GTA1Model, MyVisionModel
158 |
159 | def get_available_models() -> List[Union[str, ModelProtocol]]:
160 | models = [
161 | "huggingface-local/HelloKKMe/GTA1-7B",
162 | GTA1Model("HelloKKMe/GTA1-7B"),
163 | MyVisionModel("my-org/my-vision-model"), # Add here
164 | ]
165 | return models
166 | ```
167 |
```
--------------------------------------------------------------------------------
/docs/src/components/doc-actions-menu.tsx:
--------------------------------------------------------------------------------
```typescript
'use client';

import { useState } from 'react';
import { SiOpenai, SiAnthropic, SiMarkdown, SiGithub } from 'react-icons/si';
import posthog from 'posthog-js';

interface DocActionsMenuProps {
  pageUrl: string;
  pageTitle: string;
  filePath?: string;
}

/**
 * Per-page action menu for the docs site: copy the page as markdown,
 * edit on GitHub, or open the page in ChatGPT/Claude.
 * Every action is tracked via PostHog.
 */
export function DocActionsMenu({ pageUrl, pageTitle, filePath }: DocActionsMenuProps) {
  const [copied, setCopied] = useState(false);

  // Fetch the page's raw markdown from GitHub and copy it to the clipboard.
  // On failure, fall back to copying the page URL instead.
  const handleCopyMarkdown = async () => {
    try {
      if (!filePath) {
        throw new Error('No file path available');
      }
      const githubRawUrl = `https://raw.githubusercontent.com/trycua/cua/refs/heads/main/docs/content/docs/${filePath}`;

      const response = await fetch(githubRawUrl);
      if (!response.ok) {
        throw new Error('Failed to fetch markdown');
      }
      const markdown = await response.text();

      await navigator.clipboard.writeText(markdown);

      setCopied(true);
      setTimeout(() => setCopied(false), 2000);

      posthog.capture('docs_copy_markdown_clicked', {
        page: pageUrl,
        page_title: pageTitle,
        success: true,
      });
    } catch (error) {
      console.error('Error copying markdown:', error);

      // Fallback: copy the page URL (still shows "Copied!" feedback).
      try {
        const urlWithUtm = `https://cua.ai${pageUrl}?utm_source=cua.ai/docs`;
        await navigator.clipboard.writeText(urlWithUtm);
        setCopied(true);
        setTimeout(() => setCopied(false), 2000);
      } catch (fallbackError) {
        console.error('Error copying URL:', fallbackError);
      }

      posthog.capture('docs_copy_markdown_clicked', {
        page: pageUrl,
        page_title: pageTitle,
        success: false,
        error: error instanceof Error ? error.message : 'Unknown error',
      });
    }
  };

  // Open the GitHub web editor for this page's source file.
  const handleEditGithub = () => {
    if (!filePath) {
      return;
    }
    posthog.capture('docs_edit_github_clicked', {
      page: pageUrl,
      page_title: pageTitle,
    });

    const githubEditUrl = `https://github.com/trycua/cua/edit/main/docs/content/docs/${filePath}`;
    window.open(githubEditUrl, '_blank', 'noopener,noreferrer');
  };

  // Open ChatGPT pre-filled with a prompt pointing at this page.
  const handleOpenChatGPT = () => {
    posthog.capture('docs_open_chatgpt_clicked', {
      page: pageUrl,
      page_title: pageTitle,
    });

    const docUrl = `https://cua.ai${pageUrl}?utm_source=cua.ai/docs`;
    const prompt = `I need help understanding this cua.ai documentation page: "${pageTitle}". Please read and help me with: ${docUrl}`;
    const chatgptUrl = `https://chatgpt.com/?q=${encodeURIComponent(prompt)}`;
    window.open(chatgptUrl, '_blank', 'noopener,noreferrer');
  };

  // Open Claude pre-filled with a prompt pointing at this page.
  const handleOpenClaude = () => {
    posthog.capture('docs_open_claude_clicked', {
      page: pageUrl,
      page_title: pageTitle,
    });

    const docUrl = `https://cua.ai${pageUrl}?utm_source=cua.ai/docs`;
    const prompt = `I need help understanding this cua.ai documentation page: "${pageTitle}". Please read and help me with: ${docUrl}`;
    const claudeUrl = `https://claude.ai/new?q=${encodeURIComponent(prompt)}`;
    window.open(claudeUrl, '_blank', 'noopener,noreferrer');
  };

  return (
    <div className="flex flex-col gap-2">
      <button
        onClick={handleCopyMarkdown}
        className="inline-flex gap-3 w-full items-center rounded-md p-1 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground text-left transition-colors px-2 hover:cursor-pointer"
      >
        {/* Fix: was w-2 h-4, inconsistent with the other w-4 h-4 icons below. */}
        <SiMarkdown className="w-4 h-4 flex-shrink-0" />
        <span>{copied ? 'Copied!' : 'Copy as markdown'}</span>
      </button>

      <button
        onClick={handleEditGithub}
        className="inline-flex gap-3 w-full items-center rounded-md p-1 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground text-left transition-colors px-2 hover:cursor-pointer"
      >
        <SiGithub className="w-4 h-4 flex-shrink-0" />
        <span>Edit on GitHub</span>
      </button>

      <button
        onClick={handleOpenChatGPT}
        className="inline-flex gap-3 w-full items-center rounded-md p-1 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground text-left transition-colors px-2 hover:cursor-pointer"
      >
        <SiOpenai className="w-4 h-4 flex-shrink-0" />
        <span>Open in ChatGPT</span>
      </button>

      <button
        onClick={handleOpenClaude}
        className="inline-flex gap-3 w-full items-center rounded-md p-1 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground text-left transition-colors px-2 hover:cursor-pointer"
      >
        <SiAnthropic className="w-4 h-4 flex-shrink-0" />
        <span>Open in Claude</span>
      </button>
    </div>
  );
}
133 |
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx:
--------------------------------------------------------------------------------
```markdown
1 | ---
2 | title: Composed Agents
3 | description: Combine grounding models with any LLM for computer-use capabilities
4 | ---
5 |
6 | Composed agents combine the best of both worlds: specialized grounding models for precise click prediction and powerful LLMs for task planning and reasoning.
7 |
8 | Use the format `"grounding_model+planning_model"` to create a composed agent with any vision-enabled LiteLLM-compatible model.
9 |
10 | ## How Composed Agents Work
11 |
12 | 1. **Planning Phase**: The planning model (LLM) analyzes the task and decides what actions to take (e.g., `click("find the login button")`, `type("username")`)
13 | 2. **Grounding Phase**: The grounding model converts element descriptions to precise coordinates
14 | 3. **Execution**: Actions are performed using the predicted coordinates
15 |
16 | ## Supported Grounding Models
17 |
18 | Any model that supports `predict_click()` can be used as the grounding component. See the full list on [Grounding Models](./grounding-models).
19 |
20 | - OpenCUA: `huggingface-local/xlangai/OpenCUA-{7B,32B}`
21 | - GTA1 family: `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}`
22 | - Holo 1.5 family: `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}`
23 | - InternVL 3.5 family: `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}`
24 | - UI‑TARS 1.5: `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` (also supports full CU)
25 | - OmniParser (OCR): `omniparser` (requires combination with a LiteLLM vision model)
26 | - Moondream3: `moondream3` (requires combination with a LiteLLM vision/text model)
27 |
28 | ## Supported Planning Models
29 |
30 | Any vision-enabled LiteLLM-compatible model can be used as the planning component:
31 |
32 | - Any All‑in‑one CUA (planning-capable). See [All‑in‑one CUAs](./computer-use-agents).
33 | - Any VLM via LiteLLM providers: `anthropic/*`, `openai/*`, `openrouter/*`, `gemini/*`, `vertex_ai/*`, `huggingface-local/*`, `mlx/*`, etc.
34 | - Examples:
35 | - **Anthropic**: `anthropic/claude-sonnet-4-5-20250929`, `anthropic/claude-opus-4-1-20250805`
36 | - **OpenAI**: `openai/gpt-5`, `openai/o3`, `openai/gpt-4o`
37 | - **Google**: `gemini/gemini-1.5-pro`, `vertex_ai/gemini-pro-vision`
38 | - **Local models**: Any Hugging Face vision-language model
39 |
40 | ## Usage Examples
41 |
42 | ### GTA1 + GPT-5
43 |
44 | Use OpenAI's GPT-5 for planning with specialized grounding:
45 |
46 | ```python
47 | agent = ComputerAgent(
48 | "huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5",
49 | tools=[computer]
50 | )
51 |
52 | async for _ in agent.run("Take a screenshot, analyze the UI, and click on the most prominent button"):
53 | pass
54 | ```
55 |
56 | ### GTA1 + Claude 3.5 Sonnet
57 |
58 | Combine state-of-the-art grounding with powerful reasoning:
59 |
60 | ```python
61 | agent = ComputerAgent(
62 | "huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929",
63 | tools=[computer]
64 | )
65 |
66 | async for _ in agent.run("Open Firefox, navigate to github.com, and search for 'computer-use'"):
67 | pass
68 | # Success! 🎉
69 | # - Claude 3.5 Sonnet plans the sequence of actions
70 | # - GTA1-7B provides precise click coordinates for each UI element
71 | ```
72 |
73 | ### UI-TARS + GPT-4o
74 |
75 | Combine two different vision models for enhanced capabilities:
76 |
77 | ```python
78 | agent = ComputerAgent(
79 | "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B+openai/gpt-4o",
80 | tools=[computer]
81 | )
82 |
83 | async for _ in agent.run("Help me fill out this form with my personal information"):
84 | pass
85 | ```
86 |
87 | ### Moondream3 + GPT-4o
88 |
89 | Use the built-in Moondream3 grounding with any planning model. Moondream3 will detect UI elements on the latest screenshot, label them, and provide a user message listing detected element names.
90 |
91 | ```python
92 | from agent import ComputerAgent
93 | from computer import computer
94 |
95 | agent = ComputerAgent(
96 | "moondream3+openai/gpt-4o",
97 | tools=[computer]
98 | )
99 |
100 | async for _ in agent.run("Close the settings window, then open the Downloads folder"):
101 | pass
102 | ```
103 |
104 | ## Benefits of Composed Agents
105 |
106 | - **Specialized Grounding**: Use models optimized for click prediction accuracy
107 | - **Flexible Planning**: Choose any LLM for task reasoning and planning
108 | - **Cost Optimization**: Use smaller grounding models with larger planning models only when needed
109 | - **Performance**: Leverage the strengths of different model architectures
110 |
111 | ## Capabilities
112 |
113 | Composed agents support both capabilities:
114 |
115 | ```python
116 | agent = ComputerAgent("huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929")
117 |
118 | # Full computer-use agent capabilities
119 | async for _ in agent.run("Complete this online form"):
120 | pass
121 |
122 | # Direct click prediction (uses grounding model only)
123 | coords = agent.predict_click("find the submit button")
124 | ```
125 |
126 | ---
127 |
128 | For more information on individual model capabilities, see [Computer-Use Agents](./computer-use-agents) and [Grounding Models](./grounding-models).
129 |
```
--------------------------------------------------------------------------------
/blog/composite-agents.md:
--------------------------------------------------------------------------------
```markdown
1 | # Announcing Cua Agent framework 0.4 and Composite Agents
2 |
3 | _Published on August 26, 2025 by Dillon DuPont_
4 |
5 | <img src="./assets/composite-agents.png" alt="Composite Agents">
6 |
7 | So you want to build an agent that can use a computer. Great! You've probably discovered that there are now dozens of different AI models that claim they can click GUI buttons and fill out forms. Less great: actually getting them to work together is like trying to coordinate a group project where everyone speaks a different language and has invented seventeen different ways to say "click here".
8 |
9 | Here's the thing about new GUI models: they're all special snowflakes. One model wants you to feed it images and expects coordinates back as percentages from 0 to 1. Another wants absolute pixel coordinates. A third model has invented its own numeral system with `<|loc095|><|loc821|>` tokens inside tool calls. Some models output Python code that calls `pyautogui.click(x, y)`. Others will start hallucinating coordinates if you forget to format all previous messages within a very specific GUI system prompt.
10 |
11 | This is the kind of problem that makes you wonder if we're building the future of computing or just recreating the Tower of Babel with more GPUs.
12 |
13 | ## What we fixed
14 |
15 | Agent framework 0.4 solves this by doing something radical: making all these different models speak the same language.
16 |
17 | Instead of writing separate code for each model's peculiarities, you now just pick a model with a string like `"anthropic/claude-sonnet-4-5-20250929"` or `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`, and everything else Just Works™. Behind the scenes, we handle all the coordinate normalization, token parsing, and image preprocessing so you don't have to.
18 |
19 | ```python
20 | # This works the same whether you're using Anthropic, OpenAI, or that new model you found on Hugging Face
21 | agent = ComputerAgent(
22 | model="anthropic/claude-sonnet-4-5-20250929", # or any other supported model
23 | tools=[computer]
24 | )
25 | ```
26 |
27 | The output format is consistent across all providers (OpenAI, Anthropic, Vertex, Hugging Face, OpenRouter, etc.). No more writing different parsers for each model's creative interpretation of how to represent a mouse click.
28 |
29 | ## Composite Agents: Two Brains Are Better Than One
30 |
31 | Here's where it gets interesting. We realized that you don't actually need one model to be good at everything. Some models are excellent at understanding what's on the screen—they can reliably identify buttons and text fields and figure out where to click. Other models are great at planning and reasoning but might be a bit fuzzy on the exact pixel coordinates.
32 |
33 | So we let you combine them with a `+` sign:
34 |
35 | ```python
36 | agent = ComputerAgent(
37 | # specify the grounding model first, then the planning model
38 | model="huggingface-local/HelloKKMe/GTA1-7B+huggingface-local/OpenGVLab/InternVL3_5-8B",
39 | tools=[computer]
40 | )
41 | ```
42 |
43 | This creates a composite agent where one model (the "grounding" model) handles the visual understanding and precise UI interactions, while the other (the "planning" model) handles the high-level reasoning and task orchestration. It's like having a pilot and a navigator, except they're both AI models and they're trying to help you star a GitHub repository.
44 |
45 | You can even take a model that was never designed for computer use—like GPT-4o—and give it GUI capabilities by pairing it with a specialized vision model:
46 |
47 | ```python
48 | agent = ComputerAgent(
49 | model="huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-4o",
50 | tools=[computer]
51 | )
52 | ```
53 |
54 | ## Example notebook
55 |
56 | For a full, ready-to-run demo (install deps, local computer using Docker, and a composed agent example), see the notebook:
57 |
58 | - https://github.com/trycua/cua/blob/models/opencua/notebooks/composite_agents_docker_nb.ipynb
59 |
60 | ## What's next
61 |
62 | We're building integration with HUD evals, allowing us to curate and benchmark model combinations. This will help us identify which composite agent pairs work best for different types of tasks, and provide you with tested recommendations rather than just throwing model names at the wall to see what sticks.
63 |
64 | If you try out version 0.4.x, we'd love to hear how it goes. Join us on Discord to share your results and let us know what model combinations work best for your projects.
65 |
66 | ---
67 |
68 | ## Links
69 |
70 | - **Composite Agent Docs:** [https://cua.ai/docs/agent-sdk/supported-agents/composed-agents](https://cua.ai/docs/agent-sdk/supported-agents/composed-agents)
71 | - **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai)
72 |
73 | Questions or weird edge cases? Ping us on Discord—we’re curious to see what you build.
74 |
```
--------------------------------------------------------------------------------
/blog/cloud-windows-ga-macos-preview.md:
--------------------------------------------------------------------------------
```markdown
1 | # Cloud Windows Sandboxes GA + macOS Preview
2 |
3 | If you've been building with our `cua` libraries, you might've hit a limitation with local computer-use sandboxes: to run agents on Windows or macOS, you need to be on that OS - Windows Sandbox for Windows, Apple Virtualization for macOS. The only cross-platform option is Linux on Docker, which limits you to virtualizing Linux environments ([see all local options here](https://cua.ai/docs/computer-sdk/computers)).
4 |
5 | Today the story changes - we're announcing general availability of **Cloud Windows Sandboxes** and opening early preview access for **Cloud macOS Sandboxes**.
6 |
7 | ## Cloud Windows Sandboxes: Now GA
8 |
9 | 
10 |
11 | Cloud Windows Sandboxes are now generally available. You get a full Windows 11 desktop in your browser with Edge and Python pre-installed, working seamlessly with all our [Computer-Use libraries](https://github.com/trycua/cua) for RPA, UI automation, code execution, and agent development.
12 |
13 | **What's new with this release:**
14 |
15 | - Hot-start under 1 second
16 | - Direct noVNC over HTTPS under our sandbox.cua.ai domain
17 | - 3 sandbox sizes available:
18 |
19 | | Size | CPU | RAM | Storage |
20 | | ------ | ------- | ----- | ---------- |
21 | | Small | 2 cores | 8 GB | 128 GB SSD |
22 | | Medium | 4 cores | 16 GB | 128 GB SSD |
23 | | Large | 8 cores | 32 GB | 256 GB SSD |
24 |
25 | <div align="center">
26 | <video src="https://github.com/user-attachments/assets/8ab07646-6018-4128-87ce-53180cfea696" width="600" controls></video>
27 | </div>
28 |
29 | **Pricing:** Windows Sandboxes start at 8 credits/hour (Small), 15 credits/hour (Medium), or 31 credits/hour (Large).
30 |
31 | ## Cloud macOS Sandboxes: Now in Preview
32 |
33 | Running macOS locally comes with challenges: 30GB golden images, a maximum of 2 sandboxes per host, and unpredictable compatibility issues. With Cloud macOS Sandboxes, we provision bare-metal macOS hosts (M1, M2, M4) on-demand—giving you full desktop access without the overhead of managing local sandboxes.
34 |
35 | 
36 |
37 | **Preview access:** Invite-only. [Join the waitlist](https://cua.ai/macos-waitlist) if you're building agents for macOS workflows.
38 |
39 | ## Getting Started Today
40 |
41 | Sign up at [cua.ai/signin](https://cua.ai/signin) and grab your API key from the dashboard. Then connect to a sandbox:
42 |
43 | ```python
44 | from computer import Computer
45 |
46 | computer = Computer(
47 | os_type="windows", # or "macos"
48 | provider_type="cloud",
49 | name="my-sandbox",
50 | api_key="your-api-key"
51 | )
52 |
53 | await computer.run()
54 | ```
55 |
56 | Manage existing sandboxes:
57 |
58 | ```python
59 | from computer.providers.cloud.provider import CloudProvider
60 |
61 | provider = CloudProvider(api_key="your-api-key")
62 | async with provider:
63 | sandboxes = await provider.list_vms()
64 | await provider.run_vm("my-sandbox")
65 | await provider.stop_vm("my-sandbox")
66 | ```
67 |
68 | Run an agent on Windows to automate a workflow:
69 |
70 | ```python
71 | from agent import ComputerAgent
72 |
73 | agent = ComputerAgent(
74 | model="anthropic/claude-sonnet-4-5-20250929",
75 | tools=[computer],
76 | max_trajectory_budget=5.0
77 | )
78 |
79 | response = await agent.run(
80 | "Open Excel, create a sales report with this month's data, and save it to the desktop"
81 | )
82 | ```
83 |
84 | ## FAQs
85 |
86 | <details>
87 | <summary><strong>Why not just use local Windows Sandbox?</strong></summary>
88 |
89 | Local Windows Sandbox resets on every restart. No persistence, no hot-start, and you need Windows Pro. Our sandboxes persist state, hot-start in under a second, and work from any OS.
90 |
91 | </details>
92 |
93 | <details>
94 | <summary><strong>What happens to my work when I stop a sandbox?</strong></summary>
95 |
96 | Everything persists. Files, installed software, browser profiles—it's all there when you restart. Only pay for runtime, not storage.
97 |
98 | </details>
99 |
100 | <details>
101 | <summary><strong>How's the latency for UI automation?</strong></summary>
102 |
103 | We run in 4 regions so you can pick what's closest. The noVNC connection is optimized for automation, not video streaming. Your agent sees crisp screenshots, not compressed video.
104 |
105 | </details>
106 |
107 | <details>
108 | <summary><strong>Are there software restrictions?</strong></summary>
109 |
110 | No. Full admin access on both platforms. Install whatever you need—Visual Studio, Photoshop, custom enterprise software. It's your sandbox.
111 |
112 | </details>
113 |
114 | ## Need help?
115 |
116 | If you hit issues getting either platform working, reach out in [Discord](https://discord.gg/cua-ai). We respond fast and fix based on what people actually use.
117 |
118 | ---
119 |
120 | Get started at [cua.ai](https://cua.ai) or [join the macOS waitlist](https://cua.ai/macos-waitlist).
121 |
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/callbacks/base.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Base callback handler interface for ComputerAgent preprocessing and postprocessing hooks.
3 | """
4 |
5 | from abc import ABC, abstractmethod
6 | from typing import Any, Dict, List, Optional, Union
7 |
8 |
class AsyncCallbackHandler(ABC):
    """
    Abstract base for asynchronous callback hooks used by ComputerAgent.

    Subclasses override any subset of these coroutine hooks to observe or
    transform the agent lifecycle: run start/end, LLM input/output,
    computer/function tool calls, API calls, screenshots, and usage events.
    Every hook has a no-op (or pass-through) default, so implementations
    only override what they need.
    """

    async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
        """Hook invoked once when an agent run loop begins."""

    async def on_run_end(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> None:
        """Hook invoked once after an agent run loop finishes."""

    async def on_run_continue(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> bool:
        """Decide whether the run loop should keep executing.

        Args:
            kwargs: Arguments the run was started with.
            old_items: Messages present before the run began.
            new_items: Messages produced so far during the run.

        Returns:
            True to continue execution (the default), False to stop.
        """
        return True

    async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Preprocess messages before they are sent to the agent loop.

        Args:
            messages: Message dictionaries about to be sent.

        Returns:
            The (possibly transformed) message list; returned unchanged
            by this default implementation.
        """
        return messages

    async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Postprocess output after the agent loop returns it.

        Args:
            output: Output message dictionaries from the loop.

        Returns:
            The (possibly transformed) output list; returned unchanged
            by this default implementation.
        """
        return output

    async def on_computer_call_start(self, item: Dict[str, Any]) -> None:
        """Hook invoked just before a computer call executes.

        Args:
            item: The computer call item dictionary.
        """

    async def on_computer_call_end(
        self, item: Dict[str, Any], result: List[Dict[str, Any]]
    ) -> None:
        """Hook invoked after a computer call has completed.

        Args:
            item: The computer call item dictionary.
            result: Result items produced by the call.
        """

    async def on_function_call_start(self, item: Dict[str, Any]) -> None:
        """Hook invoked just before a function call executes.

        Args:
            item: The function call item dictionary.
        """

    async def on_function_call_end(
        self, item: Dict[str, Any], result: List[Dict[str, Any]]
    ) -> None:
        """Hook invoked after a function call has completed.

        Args:
            item: The function call item dictionary.
            result: Result items produced by the call.
        """

    async def on_text(self, item: Dict[str, Any]) -> None:
        """Hook invoked when a text message item is encountered.

        Args:
            item: The message item dictionary.
        """

    async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
        """Hook invoked just before an API call is made.

        Args:
            kwargs: Keyword arguments being passed to the API call.
        """

    async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
        """Hook invoked after an API call has completed.

        Args:
            kwargs: Keyword arguments that were passed to the API call.
            result: Value returned by the API call.
        """

    async def on_usage(self, usage: Dict[str, Any]) -> None:
        """Hook invoked when usage information is received.

        Args:
            usage: The usage information dictionary.
        """

    async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
        """Hook invoked whenever a screenshot is captured.

        Args:
            screenshot: The screenshot image data (str or bytes; exact
                encoding depends on the caller).
            name: Label for the screenshot.
        """

    async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
        """Hook invoked when responses are received.

        Args:
            kwargs: Keyword arguments being passed to the agent loop.
            responses: The responses received.
        """
168 |
```
--------------------------------------------------------------------------------
/docs/content/docs/computer-sdk/computers.mdx:
--------------------------------------------------------------------------------
```markdown
1 | ---
2 | title: Computer Types
3 | description: Understanding Cua computer types and connection methods
4 | ---
5 |
6 | {/* prettier-ignore */}
7 | <Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/computer_nb.ipynb" target="_blank">Jupyter Notebook</a> and <a href="https://github.com/trycua/cua/tree/main/examples/computer-example-ts" target="_blank">NodeJS project</a> are available for this documentation.</Callout>
8 |
9 | Before we can automate apps using AI, we need to first connect to a Computer Server to give the AI a safe environment to execute workflows in.
10 |
11 | Cua Computers are preconfigured sandboxes running the Computer Server. They can be either macOS, Linux, or Windows. They're found in either a cloud-native sandbox, or on your host desktop.
12 |
13 | ## Cloud Sandbox
14 |
15 | **Easiest & safest way to get started - works on any host OS**
16 |
17 | This is a Cloud Sandbox running the Computer Server. Get a sandbox at [cua.ai](https://cua.ai/).
18 |
19 | <Tabs items={['Python', 'TypeScript']}>
20 | <Tab value="Python">
21 | ```python
22 | from computer import Computer
23 |
24 | computer = Computer(
25 | os_type="linux",
26 | provider_type="cloud",
27 | name="your-sandbox-name",
28 | api_key="your-api-key"
29 | )
30 |
31 | await computer.run() # Connect to the sandbox
32 | ```
33 |
34 | </Tab>
35 | <Tab value="TypeScript">
36 | ```typescript
37 | import { Computer, OSType } from '@trycua/computer';
38 |
39 | const computer = new Computer({
40 | osType: OSType.LINUX,
41 | name: "your-sandbox-name",
42 | apiKey: "your-api-key"
43 | });
44 |
45 | await computer.run(); // Connect to the sandbox
46 | ```
47 |
48 | </Tab>
49 | </Tabs>
50 |
51 | ## Linux on Docker
52 |
53 | **Run Linux desktop locally on macOS, Windows, or Linux hosts**
54 |
55 | Cua provides two Docker images for running Linux desktops:
56 |
57 | <Tabs items={['XFCE (Lightweight)', 'KASM (Full-Featured)']}>
58 | <Tab value="XFCE (Lightweight)">
59 |
60 | **Recommended for most use cases** - lightweight XFCE desktop with Firefox
61 |
62 | 1. Install Docker Desktop or Docker Engine
63 |
64 | 2. Pull the CUA XFCE image
65 |
66 | ```bash
67 | docker pull --platform=linux/amd64 trycua/cua-xfce:latest
68 | ```
69 |
70 | 3. Connect with Computer
71 |
72 | ```python
73 | from computer import Computer
74 |
75 | computer = Computer(
76 | os_type="linux",
77 | provider_type="docker",
78 | image="trycua/cua-xfce:latest",
79 | name="my-xfce-sandbox"
80 | )
81 |
82 | await computer.run() # Launch & connect to Docker sandbox
83 | ```
84 |
85 | </Tab>
86 | <Tab value="KASM (Full-Featured)">
87 |
88 | **Full-featured Ubuntu desktop** with additional applications
89 |
90 | 1. Install Docker Desktop or Docker Engine
91 |
92 | 2. Build or pull the CUA KASM image
93 |
94 | ```bash
95 | # Option 1: Pull from Docker Hub
96 | docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest
97 |
98 | # Option 2: Build locally
99 | cd libs/kasm
100 | docker build -t cua-ubuntu:latest .
101 | ```
102 |
103 | 3. Connect with Computer
104 |
105 | ```python
106 | from computer import Computer
107 |
108 | computer = Computer(
109 | os_type="linux",
110 | provider_type="docker",
111 | image="trycua/cua-ubuntu:latest",
112 | name="my-kasm-sandbox"
113 | )
114 |
115 | await computer.run() # Launch & connect to Docker sandbox
116 | ```
117 |
118 | </Tab>
119 | </Tabs>
120 |
121 | ## Windows Sandbox
122 |
123 | **Windows hosts only - requires Windows 10 Pro/Enterprise or Windows 11**
124 |
125 | 1. Enable Windows Sandbox
126 | 2. Install pywinsandbox dependency
127 |
128 | ```bash
129 | pip install -U git+https://github.com/karkason/pywinsandbox.git
130 | ```
131 |
132 | 3. Connect with Computer
133 |
134 | ```python
135 | from computer import Computer
136 |
137 | computer = Computer(
138 | os_type="windows",
139 | provider_type="winsandbox",
140 | ephemeral=True # Windows Sandbox is always ephemeral
141 | )
142 |
143 | await computer.run() # Launch & connect to Windows Sandbox
144 | ```
145 |
146 | ## macOS Sandbox
147 |
148 | **macOS hosts only - requires Lume CLI**
149 |
150 | 1. Install lume cli
151 |
152 | ```bash
153 | /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
154 | ```
155 |
156 | 2. Start a local Cua macOS sandbox
157 |
158 | ```bash
159 | lume run macos-sequoia-cua:latest
160 | ```
161 |
162 | 3. Connect with Computer
163 |
164 | ```python
165 | from computer import Computer
166 |
167 | computer = Computer(
168 | os_type="macos",
169 | provider_type="lume",
170 | name="macos-sequoia-cua:latest"
171 | )
172 |
173 | await computer.run() # Launch & connect to the sandbox
174 | ```
175 |
176 | ## Your host desktop
177 |
178 | You can also have agents control your desktop directly by running Computer Server without any containerization layer. Beware that AI models may perform risky actions.
179 |
180 | ```bash
181 | pip install cua-computer-server
182 | python -m computer_server
183 | ```
184 |
185 | Connect with:
186 |
187 | <Tabs items={['Python']}>
188 | <Tab value="Python">
189 | ```python
190 |
191 | computer = Computer(use_host_computer_server=True)
192 | await computer.run() # Connect to the host desktop
193 |
194 | ```
195 |
196 | </Tab>
197 | </Tabs>
198 |
```
--------------------------------------------------------------------------------
/libs/lumier/src/bin/entry.sh:
--------------------------------------------------------------------------------
```bash
#!/usr/bin/env bash

# Configure SSH to prevent known hosts warnings
export SSHPASS_PROMPT=
export SSH_ASKPASS=/bin/echo
# Set SSH quiet mode via the SSHPASS environment variable
export SSHPASS_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR -q"

# We'll enable strict error checking AFTER initialization
# to prevent premature exits

# Source configuration files
CONFIG_DIR="/run/config"
LIB_DIR="/run/lib"

# Source constants if available
if [ -f "${CONFIG_DIR}/constants.sh" ]; then
    source "${CONFIG_DIR}/constants.sh"
fi

# Import utilities (every *.sh under LIB_DIR; these are expected to provide
# helpers used below, e.g. start_vm, stop_vm, lume_get, lume_delete)
for lib in "${LIB_DIR}"/*.sh; do
    if [ -f "$lib" ]; then
        source "$lib"
    fi
done

# Set VM_NAME to env or fallback to container name (from --name)
if [ -z "${VM_NAME:-}" ]; then
    VM_NAME="$(cat /etc/hostname)"
    export VM_NAME
fi

# Set HOST_STORAGE_PATH to a lume ephemeral storage if not set
if [ -z "${HOST_STORAGE_PATH:-}" ]; then
    HOST_STORAGE_PATH="ephemeral"

    # Tell user that ephemeral storage is being used
    echo "Using ephemeral storage. VM state will be lost when macOS cleans up temporary files."

    export HOST_STORAGE_PATH
fi

# Only check and report mountpoints in debug mode
if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
    if mountpoint -q /storage; then
        echo "/storage is mounted"
    fi
    if mountpoint -q /shared; then
        echo "/shared is mounted"
    fi
    # if mountpoint -q /data; then
    #     echo "/data is mounted"
    # fi
fi

# Check if we're running as PID 1 (important for Docker signal handling:
# only PID 1 receives the signals Docker sends on `docker stop`)
if [ $$ -ne 1 ]; then
    echo "Warning: This script is not running as PID 1 (current PID: $$)."
    echo "Docker signal handling may not work properly when stopped from Docker Desktop."
fi

# Log startup info
echo "Lumier VM is starting..."

# Cleanup function to ensure VM and noVNC proxy shutdown on container stop
# Counter for signal handling (incremented by cleanup() on each signal)
SIGNAL_COUNT=0
69 |
# Signal handler: attempt a graceful VM shutdown on the first signal and
# force an exit if signals keep arriving. Registered by the traps below.
cleanup() {
    local signal_name=$1
    set +e  # Don't exit on error in cleanup (note: `set -u` remains active)

    # Increment signal counter
    SIGNAL_COUNT=$((SIGNAL_COUNT + 1))

    # If this is the first signal, try graceful shutdown
    if [ $SIGNAL_COUNT -eq 1 ]; then
        echo "[cleanup] Caught $signal_name signal, shutting down..."

        # Check if we're in the middle of an image pull.
        # PULL_IN_PROGRESS may never have been set (it is only exported while
        # a pull runs), so use a default expansion — a bare "$PULL_IN_PROGRESS"
        # would abort the handler with "unbound variable" under `set -u`.
        if [[ "${PULL_IN_PROGRESS:-}" == "1" ]]; then
            echo "[cleanup] Interrupted during image pull, skipping VM stop."
        else
            echo "[cleanup] Stopping VM..."
            stop_vm true
        fi

        # Attempt to clean up ephemeral storage if it's in the /private/tmp directory
        if [[ "$HOST_STORAGE_PATH" == "ephemeral" ]]; then
            # First check if VM actually exists
            VM_INFO=$(lume_get "$VM_NAME" "$HOST_STORAGE_PATH" "json" "false")

            # Only try VM deletion if VM exists and not in the middle of a pull
            if [[ "${PULL_IN_PROGRESS:-}" != "1" && $VM_INFO != *"Virtual machine not found"* ]]; then
                echo "[cleanup] Cleaning up VM..."
                lume_delete "$VM_NAME" "$HOST_STORAGE_PATH" > /dev/null 2>&1
            fi
        fi
    else
        # For multiple signals, force an immediate exit
        echo "got $SIGNAL_COUNT SIGTERM/SIGINTs, forcefully exiting"
    fi

    # If we've received multiple signals, just exit immediately
    if [ $SIGNAL_COUNT -ge 3 ]; then
        exit 1
    fi

    # Exit with success for the first signal
    if [ $SIGNAL_COUNT -eq 1 ]; then
        exit 0
    fi
}
# Ensure we catch all typical container termination signals
trap 'cleanup SIGTERM' SIGTERM
trap 'cleanup SIGINT' SIGINT
trap 'cleanup SIGHUP' SIGHUP

# Now enable strict error handling after initialization
set -euo pipefail

# Start the VM with error handling
if ! start_vm; then
    echo "ERROR: Failed to start VM!" >&2
    exit 1
fi

# Start noVNC for VNC access
NOVNC_PID=""
if [ -n "${VNC_PORT:-}" ] && [ -n "${VNC_PASSWORD:-}" ]; then
    # Only show this in debug mode
    if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
        echo "Starting noVNC proxy with optimized color settings..."
    fi
    ${NOVNC_PATH}/utils/novnc_proxy --vnc host.docker.internal:${VNC_PORT} --listen 8006 --web ${NOVNC_PATH} > /dev/null 2>&1 &
    NOVNC_PID=$!
    disown $NOVNC_PID
    # Message fix: the URL contains no "PORT" placeholder, so telling the user
    # to "replace PORT" was confusing — name the container port (8006) instead.
    echo "noVNC interface available at: http://localhost:8006/vnc.html?password=${VNC_PASSWORD}&autoconnect=true (replace 8006 with the host port you forwarded to container port 8006)"
fi

echo "Lumier is running. Press Ctrl+C to stop."

# Instead of tail -f /dev/null, use a wait loop that can be interrupted by signals
while true; do
    # Sleep in small increments to make signal handling more responsive
    sleep 1 &
    wait $!
    # Break the loop if we've received a signal
    if [ $SIGNAL_COUNT -gt 0 ]; then
        break
    fi
done
```
--------------------------------------------------------------------------------
/libs/lume/src/Server/Requests.swift:
--------------------------------------------------------------------------------
```swift
1 | import ArgumentParser
2 | import Foundation
3 | import Virtualization
4 |
/// Request body for starting a VM over the HTTP API.
struct RunVMRequest: Codable {
    let noDisplay: Bool?
    let sharedDirectories: [SharedDirectoryRequest]?
    let recoveryMode: Bool?
    let storage: String?

    /// A host directory to share into the guest, with an optional
    /// read-only flag (defaults to read-write when omitted).
    struct SharedDirectoryRequest: Codable {
        let hostPath: String
        let readOnly: Bool?
    }

    /// Validates each requested shared directory and converts it into a
    /// `SharedDirectory` tagged with the macOS guest automount tag.
    ///
    /// - Returns: The parsed shared directories (empty if none were supplied).
    /// - Throws: `ValidationError` if any host path does not exist or is not a directory.
    func parse() throws -> [SharedDirectory] {
        guard let sharedDirectories = sharedDirectories else { return [] }

        return try sharedDirectories.map { dir -> SharedDirectory in
            // Validate that the host path exists and is a directory
            var isDirectory: ObjCBool = false
            guard FileManager.default.fileExists(atPath: dir.hostPath, isDirectory: &isDirectory),
                isDirectory.boolValue
            else {
                throw ValidationError(
                    "Host path does not exist or is not a directory: \(dir.hostPath)")
            }

            return SharedDirectory(
                hostPath: dir.hostPath,
                tag: VZVirtioFileSystemDeviceConfiguration.macOSGuestAutomountTag,
                readOnly: dir.readOnly ?? false  // read-write unless explicitly read-only
            )
        }
    }
}
37 |
/// Request body for pulling a VM image from a registry.
struct PullRequest: Codable {
    let image: String
    let name: String?
    var registry: String
    var organization: String
    let storage: String?

    enum CodingKeys: String, CodingKey {
        case image, name, registry, organization, storage
    }

    /// Custom decoding so omitted fields fall back to defaults instead of
    /// failing: registry -> "ghcr.io", organization -> "trycua".
    init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        image = try container.decode(String.self, forKey: .image)
        name = try container.decodeIfPresent(String.self, forKey: .name)
        registry = try container.decodeIfPresent(String.self, forKey: .registry) ?? "ghcr.io"
        organization = try container.decodeIfPresent(String.self, forKey: .organization) ?? "trycua"
        storage = try container.decodeIfPresent(String.self, forKey: .storage)
    }
}
58 |
/// Request body for creating a new VM.
struct CreateVMRequest: Codable {
    let name: String
    let os: String
    let cpu: Int
    let memory: String  // human-readable size string; parsed by `parseSize`
    let diskSize: String  // human-readable size string; parsed by `parseSize`
    let display: String
    let ipsw: String?
    let storage: String?

    /// Converts the `memory` and `diskSize` strings into byte counts.
    /// - Throws: whatever `parseSize` throws for malformed size strings.
    func parse() throws -> (memory: UInt64, diskSize: UInt64) {
        return (
            memory: try parseSize(memory),
            diskSize: try parseSize(diskSize)
        )
    }
}
76 |
/// Request body for updating an existing VM's settings.
/// All fields are optional; only supplied values are changed.
struct SetVMRequest: Codable {
    let cpu: Int?
    let memory: String?
    let diskSize: String?
    let display: String?
    let storage: String?

    /// Parses the optional size and display strings into typed values.
    ///
    /// - Returns: `memory`/`diskSize` in bytes (nil when not provided) and
    ///   the validated display resolution (nil when not provided).
    /// - Throws: `ValidationError` if `display` is not in WIDTHxHEIGHT form,
    ///   or whatever `parseSize` throws for malformed size strings.
    func parse() throws -> (memory: UInt64?, diskSize: UInt64?, display: VMDisplayResolution?) {
        return (
            memory: try memory.map { try parseSize($0) },
            diskSize: try diskSize.map { try parseSize($0) },
            display: try display.map {
                guard let resolution = VMDisplayResolution(string: $0) else {
                    throw ValidationError(
                        "Invalid display resolution format: \($0). Expected format: WIDTHxHEIGHT")
                }
                return resolution
            }
        )
    }
}
98 |
/// Request body for cloning a VM (`name`) to a new VM (`newName`),
/// with optional source/destination storage locations.
struct CloneRequest: Codable {
    let name: String
    let newName: String
    let sourceLocation: String?
    let destLocation: String?
}
105 |
/// Request body for pushing a local VM image to a registry.
struct PushRequest: Codable {
    let name: String // Name of the local VM
    let imageName: String // Base name for the image in the registry
    let tags: [String] // List of tags to push
    var registry: String // Registry URL
    var organization: String // Organization/user in the registry
    let storage: String? // Optional VM storage location or direct path
    var chunkSizeMb: Int // Chunk size in megabytes (defaults to 512 when omitted)
    // dryRun and reassemble are less common for API, default to false?
    // verbose is usually handled by server logging

    enum CodingKeys: String, CodingKey {
        case name, imageName, tags, registry, organization, storage, chunkSizeMb
    }

    // Provide default values for optional fields during decoding:
    // registry -> "ghcr.io", organization -> "trycua", chunkSizeMb -> 512.
    init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        name = try container.decode(String.self, forKey: .name)
        imageName = try container.decode(String.self, forKey: .imageName)
        tags = try container.decode([String].self, forKey: .tags)
        registry = try container.decodeIfPresent(String.self, forKey: .registry) ?? "ghcr.io"
        organization = try container.decodeIfPresent(String.self, forKey: .organization) ?? "trycua"
        storage = try container.decodeIfPresent(String.self, forKey: .storage)
        chunkSizeMb = try container.decodeIfPresent(Int.self, forKey: .chunkSizeMb) ?? 512
    }
}
133 |
```
--------------------------------------------------------------------------------
/libs/lume/src/FileSystem/VMConfig.swift:
--------------------------------------------------------------------------------
```swift
1 | import ArgumentParser
2 | import Foundation
3 | import Virtualization
4 |
5 | /// Represents a shared directory configuration
/// Represents a shared directory configuration
struct SharedDirectory: Codable {
    let hostPath: String  // path on the host to share into the guest
    let tag: String  // mount tag identifying the share
    let readOnly: Bool

    /// Colon-separated representation: "hostPath:tag:ro|rw".
    var string: String {
        return "\(hostPath):\(tag):\(readOnly ? "ro" : "rw")"
    }
}
15 |
// MARK: - VMConfig

/// Serializable configuration for a single VM: guest OS, sizing
/// (CPU/memory/disk), display resolution, MAC address, and the Apple
/// Virtualization hardware-model / machine-identifier blobs.
struct VMConfig: Codable {

    // MARK: - Properties
    let os: String
    private var _cpuCount: Int?
    private var _memorySize: UInt64?
    private var _diskSize: UInt64?
    private var _macAddress: String?
    private var _display: VMDisplayResolution
    private var _hardwareModel: Data?
    private var _machineIdentifier: Data?

    // MARK: - Initialization

    /// Creates a new configuration.
    /// - Note: an unparsable `display` string silently falls back to 1024x768.
    init(
        os: String,
        cpuCount: Int? = nil,
        memorySize: UInt64? = nil,
        diskSize: UInt64? = nil,
        macAddress: String? = nil,
        display: String,
        hardwareModel: Data? = nil,
        machineIdentifier: Data? = nil
    ) throws {
        self.os = os
        self._cpuCount = cpuCount
        self._memorySize = memorySize
        self._diskSize = diskSize
        self._macAddress = macAddress
        self._display = VMDisplayResolution(string: display) ?? VMDisplayResolution(string: "1024x768")!
        self._hardwareModel = hardwareModel
        self._machineIdentifier = machineIdentifier
    }

    var display: VMDisplayResolution {
        get { _display }
        set { _display = newValue }
    }

    var cpuCount: Int? {
        get { _cpuCount }
        set { _cpuCount = newValue }
    }

    var memorySize: UInt64? {
        get { _memorySize }
        set { _memorySize = newValue }
    }

    var diskSize: UInt64? {
        get { _diskSize }
        set { _diskSize = newValue }
    }

    var hardwareModel: Data? {
        get { _hardwareModel }
        set { _hardwareModel = newValue }
    }

    var machineIdentifier: Data? {
        get { _machineIdentifier }
        set { _machineIdentifier = newValue }
    }

    var macAddress: String? {
        get { _macAddress }
        set { _macAddress = newValue }
    }

    mutating func setCpuCount(_ count: Int) {
        _cpuCount = count
    }

    mutating func setMemorySize(_ size: UInt64) {
        _memorySize = size
    }

    mutating func setDiskSize(_ size: UInt64) {
        _diskSize = size
    }

    mutating func setHardwareModel(_ hardwareModel: Data) {
        _hardwareModel = hardwareModel
    }

    mutating func setMachineIdentifier(_ machineIdentifier: Data) {
        _machineIdentifier = machineIdentifier
    }

    mutating func setMacAddress(_ newMacAddress: String) {
        self._macAddress = newMacAddress
    }

    mutating func setDisplay(_ newDisplay: VMDisplayResolution) {
        self._display = newDisplay
    }

    // MARK: - Codable
    enum CodingKeys: String, CodingKey {
        case _cpuCount = "cpuCount"
        case _memorySize = "memorySize"
        case _diskSize = "diskSize"
        case macAddress
        case display
        case _hardwareModel = "hardwareModel"
        case _machineIdentifier = "machineIdentifier"
        case os
    }

    init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)

        os = try container.decode(String.self, forKey: .os)
        _cpuCount = try container.decodeIfPresent(Int.self, forKey: ._cpuCount)
        _memorySize = try container.decodeIfPresent(UInt64.self, forKey: ._memorySize)
        _diskSize = try container.decodeIfPresent(UInt64.self, forKey: ._diskSize)
        _macAddress = try container.decodeIfPresent(String.self, forKey: .macAddress)

        // Throw a descriptive error instead of force-unwrapping: a corrupt
        // config file must not crash the process.
        let displayString = try container.decode(String.self, forKey: .display)
        guard let resolution = VMDisplayResolution(string: displayString) else {
            throw DecodingError.dataCorruptedError(
                forKey: .display,
                in: container,
                debugDescription: "Invalid display resolution string: \(displayString)")
        }
        _display = resolution

        _hardwareModel = try container.decodeIfPresent(Data.self, forKey: ._hardwareModel)
        _machineIdentifier = try container.decodeIfPresent(Data.self, forKey: ._machineIdentifier)
    }

    func encode(to encoder: Encoder) throws {
        var container = encoder.container(keyedBy: CodingKeys.self)

        // `os` is non-optional, so encode it unconditionally.
        try container.encode(os, forKey: .os)
        try container.encodeIfPresent(_cpuCount, forKey: ._cpuCount)
        try container.encodeIfPresent(_memorySize, forKey: ._memorySize)
        try container.encodeIfPresent(_diskSize, forKey: ._diskSize)
        try container.encodeIfPresent(_macAddress, forKey: .macAddress)
        try container.encode(display.string, forKey: .display)
        try container.encodeIfPresent(_hardwareModel, forKey: ._hardwareModel)
        try container.encodeIfPresent(_machineIdentifier, forKey: ._machineIdentifier)
    }
}
151 |
```
--------------------------------------------------------------------------------
/libs/python/computer-server/computer_server/cli.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Command-line interface for the Computer API server.
3 | """
4 |
5 | import argparse
6 | import asyncio
7 | import logging
8 | import os
9 | import sys
10 | import threading
11 | from typing import List, Optional
12 |
13 | from .server import Server
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 |
def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
    """Build the server CLI parser and parse *args* (defaults to sys.argv)."""
    p = argparse.ArgumentParser(description="Start the Computer API server")

    # Network binding
    p.add_argument(
        "--host", default="0.0.0.0", help="Host to bind the server to (default: 0.0.0.0)"
    )
    p.add_argument(
        "--port", type=int, default=8000, help="Port to bind the server to (default: 8000)"
    )

    # Logging
    p.add_argument(
        "--log-level",
        default="info",
        choices=["debug", "info", "warning", "error", "critical"],
        help="Logging level (default: info)",
    )

    # TLS (both key and cert are required to enable HTTPS)
    p.add_argument(
        "--ssl-keyfile", type=str, help="Path to SSL private key file (enables HTTPS)"
    )
    p.add_argument(
        "--ssl-certfile", type=str, help="Path to SSL certificate file (enables HTTPS)"
    )

    # Watchdog monitoring
    p.add_argument(
        "--watchdog",
        action="store_true",
        help="Enable watchdog monitoring (automatically enabled if CONTAINER_NAME env var is set)",
    )
    p.add_argument(
        "--watchdog-interval",
        type=int,
        default=30,
        help="Watchdog ping interval in seconds (default: 30)",
    )
    p.add_argument(
        "--no-restart",
        action="store_true",
        help="Disable automatic server restart in watchdog",
    )

    return p.parse_args(args)
61 |
62 |
def main() -> None:
    """Main entry point for the CLI.

    Parses arguments, configures logging, optionally starts the watchdog
    monitor in a daemon thread, then runs the API server until interrupted.
    """
    args = parse_args()

    # Configure logging
    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # Watchdog is requested explicitly (--watchdog) or implicitly inside a
    # container (CONTAINER_NAME set), but it is not supported on Windows.
    # Fix: previously a "enabling watchdog" message was logged even when the
    # watchdog was then suppressed on Windows.
    container_name = os.environ.get("CONTAINER_NAME")
    wants_watchdog = args.watchdog or bool(container_name)
    enable_watchdog = wants_watchdog and not sys.platform.startswith("win")

    if wants_watchdog and not enable_watchdog:
        logger.warning("Watchdog requested but is not supported on Windows; skipping")
    elif container_name:
        logger.info(
            f"Container environment detected (CONTAINER_NAME={container_name}), enabling watchdog"
        )
    elif args.watchdog:
        logger.info("Watchdog explicitly enabled via --watchdog flag")

    # Start watchdog if enabled
    if enable_watchdog:
        logger.info(f"Starting watchdog monitoring with {args.watchdog_interval}s interval")

        def run_watchdog_thread():
            """Run the watchdog on its own event loop in this thread."""
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                # CLI settings the watchdog needs in order to restart the server.
                cli_args = {
                    "host": args.host,
                    "port": args.port,
                    "log_level": args.log_level,
                    "ssl_keyfile": args.ssl_keyfile,
                    "ssl_certfile": args.ssl_certfile,
                }

                # Imported lazily so the server can start even if the watchdog
                # module is unavailable or slow to import.
                from .watchdog import Watchdog

                watchdog = Watchdog(cli_args=cli_args, ping_interval=args.watchdog_interval)
                watchdog.restart_enabled = not args.no_restart

                loop.run_until_complete(watchdog.start_monitoring())
            except Exception as e:
                logger.error(f"Watchdog error: {e}")
            finally:
                loop.close()

        # Daemon thread: must not keep the process alive after the server exits.
        watchdog_thread = threading.Thread(target=run_watchdog_thread, daemon=True, name="watchdog")
        watchdog_thread.start()

    # Create and start the server
    logger.info(f"Starting CUA Computer API server on {args.host}:{args.port}...")

    # HTTPS requires both the key and the certificate; anything else is HTTP.
    ssl_args = {}
    if args.ssl_keyfile and args.ssl_certfile:
        ssl_args = {
            "ssl_keyfile": args.ssl_keyfile,
            "ssl_certfile": args.ssl_certfile,
        }
        logger.info("HTTPS mode enabled with SSL certificates")
    elif args.ssl_keyfile or args.ssl_certfile:
        logger.warning(
            "Both --ssl-keyfile and --ssl-certfile are required for HTTPS. Running in HTTP mode."
        )
    else:
        logger.info("HTTP mode (no SSL certificates provided)")

    server = Server(host=args.host, port=args.port, log_level=args.log_level, **ssl_args)

    try:
        server.start()
    except KeyboardInterrupt:
        logger.info("Server stopped by user")
        sys.exit(0)
    except Exception as e:
        logger.error(f"Error starting server: {e}")
        sys.exit(1)
150 |
```
--------------------------------------------------------------------------------
/libs/lume/src/Virtualization/DarwinImageLoader.swift:
--------------------------------------------------------------------------------
```swift
1 | import Foundation
2 | import Virtualization
3 |
/// Handles loading and validation of macOS restore images (IPSW files).
/// Provides functionality to:
/// - Fetch the latest supported macOS restore image URL
/// - Load and validate image requirements for VM creation
/// - Extract hardware model and auxiliary storage configuration
protocol ImageLoader: Sendable {
    typealias ImageRequirements = DarwinImageLoader.ImageRequirements
    /// Returns the URL of the latest macOS restore image supported by this host.
    func fetchLatestSupportedURL() async throws -> URL
    /// Loads the IPSW at `url` and extracts the hardware requirements needed to create a VM.
    func loadImageRequirements(from url: URL) async throws -> ImageRequirements
    /// Downloads the latest supported IPSW to a temporary location and returns its path.
    func downloadLatestImage() async throws -> Path
}
15 |
/// Default `ImageLoader` backed by `VZMacOSRestoreImage` and `URLSession`.
final class DarwinImageLoader: NSObject, ImageLoader, @unchecked Sendable, URLSessionDownloadDelegate {
    /// Hardware constraints extracted from a restore image.
    struct ImageRequirements: Sendable {
        let hardwareModel: Data
        let minimumSupportedCPUCount: Int
        let minimumSupportedMemorySize: UInt64
    }

    enum ImageError: Error {
        case invalidImage
        case unsupportedConfiguration
        case downloadFailed
    }

    private var lastLoggedProgress: Double = 0.0
    private var progressLogger = ProgressLogger()
    // Bridges URLSession delegate callbacks back to the async continuation.
    // Cleared after first use so a stale handler can never resume a
    // continuation twice (which would crash).
    private var completionHandler: ((URL?, Error?) -> Void)?

    func fetchLatestSupportedURL() async throws -> URL {
        try await withCheckedThrowingContinuation { continuation in
            VZMacOSRestoreImage.fetchLatestSupported { result in
                switch result {
                case .success(let image):
                    continuation.resume(returning: image.url)
                case .failure(let error):
                    continuation.resume(throwing: error)
                }
            }
        }
    }

    /// Loads the IPSW at `url` and extracts the most featureful supported configuration.
    /// - Throws: `ImageError.unsupportedConfiguration` if this host supports none.
    func loadImageRequirements(from url: URL) async throws -> ImageRequirements {
        let image = try await VZMacOSRestoreImage.image(from: url)
        guard let requirements = image.mostFeaturefulSupportedConfiguration else {
            throw ImageError.unsupportedConfiguration
        }

        return ImageRequirements(
            hardwareModel: requirements.hardwareModel.dataRepresentation,
            minimumSupportedCPUCount: requirements.minimumSupportedCPUCount,
            minimumSupportedMemorySize: requirements.minimumSupportedMemorySize
        )
    }

    /// Downloads the latest supported IPSW to the temporary directory,
    /// replacing any previous download, and returns its path.
    func downloadLatestImage() async throws -> Path {
        let url = try await fetchLatestSupportedURL()
        let tempDir = FileManager.default.temporaryDirectory
        let downloadPath = tempDir.appendingPathComponent("latest.ipsw")

        // Reset progress logger state for this download
        progressLogger = ProgressLogger(threshold: 0.01)

        // Wait for the delegate-driven download to complete
        return try await withCheckedThrowingContinuation { continuation in
            let session = URLSession(configuration: .default, delegate: self, delegateQueue: nil)
            let task = session.downloadTask(with: url)

            self.completionHandler = { [weak self] location, error in
                // One-shot: drop the handler first so later delegate callbacks
                // cannot resume the continuation again.
                self?.completionHandler = nil
                // Fix: invalidate the session, releasing its strong reference
                // to this delegate (previously leaked on every download).
                session.finishTasksAndInvalidate()

                if let error = error {
                    continuation.resume(throwing: error)
                    return
                }
                guard let location = location else {
                    // Neither a file nor an error: treat as a failed download
                    // instead of force-unwrapping.
                    continuation.resume(throwing: ImageError.downloadFailed)
                    return
                }

                do {
                    // Remove existing file if it exists
                    if FileManager.default.fileExists(atPath: downloadPath.path) {
                        try FileManager.default.removeItem(at: downloadPath)
                    }

                    // `location` is only valid during the delegate callback,
                    // and this handler runs synchronously inside it.
                    try FileManager.default.moveItem(at: location, to: downloadPath)
                    Logger.info("Download completed and moved to: \(downloadPath.path)")
                    continuation.resume(returning: Path(downloadPath.path))
                } catch {
                    continuation.resume(throwing: error)
                }
            }

            task.resume()
        }
    }

    func urlSession(_ session: URLSession, downloadTask: URLSessionDownloadTask, didWriteData bytesWritten: Int64, totalBytesWritten: Int64, totalBytesExpectedToWrite: Int64) {
        // Expected size can be unknown (-1); skip instead of logging negative progress.
        guard totalBytesExpectedToWrite > 0 else { return }
        let progress = Double(totalBytesWritten) / Double(totalBytesExpectedToWrite)
        progressLogger.logProgress(current: progress, context: "Downloading IPSW")
    }

    func urlSession(_ session: URLSession, downloadTask: URLSessionDownloadTask, didFinishDownloadingTo location: URL) {
        // Success path: hand the temporary file location to the waiting handler.
        completionHandler?(location, nil)
    }

    func urlSession(_ session: URLSession, task: URLSessionTask, didCompleteWithError error: Error?) {
        // Failure path only; on success this fires with a nil error after
        // didFinishDownloadingTo has already consumed the handler.
        if let error = error {
            completionHandler?(nil, error)
        }
    }
}
```
--------------------------------------------------------------------------------
/examples/agent_examples.py:
--------------------------------------------------------------------------------
```python
1 | """Example demonstrating the ComputerAgent capabilities with the Omni provider."""
2 |
3 | import asyncio
4 | import logging
5 | import signal
6 | import traceback
7 |
8 | # Import the unified agent class and types
9 | from agent import ComputerAgent
10 | from computer import Computer, VMProviderType
11 |
12 | # Import utility functions
13 | from utils import handle_sigint, load_dotenv_files
14 |
15 | # Set up logging
16 | logging.basicConfig(level=logging.INFO)
17 | logger = logging.getLogger(__name__)
18 |
19 |
async def run_agent_example():
    """Run example of using the ComputerAgent with different models."""
    print("\n=== Example: ComputerAgent with different models ===")

    try:
        # Create a local macOS computer
        computer = Computer(
            os_type="macos",
            verbosity=logging.DEBUG,
        )

        # Create a remote Linux computer with Cua
        # computer = Computer(
        #     os_type="linux",
        #     api_key=os.getenv("CUA_API_KEY"),
        #     name=os.getenv("CUA_CONTAINER_NAME"),
        #     provider_type=VMProviderType.CLOUD,
        # )

        # Create ComputerAgent with new API
        agent = ComputerAgent(
            # Supported models:
            # == OpenAI CUA (computer-use-preview) ==
            model="openai/computer-use-preview",
            # == Anthropic CUA (Claude > 3.5) ==
            # model="anthropic/claude-opus-4-20250514",
            # model="anthropic/claude-sonnet-4-20250514",
            # model="anthropic/claude-3-7-sonnet-20250219",
            # model="anthropic/claude-sonnet-4-5-20250929",
            # == UI-TARS ==
            # model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
            # model="mlx/mlx-community/UI-TARS-1.5-7B-6bit",
            # model="ollama_chat/0000/ui-tars-1.5-7b",
            # == Omniparser + Any LLM ==
            # model="omniparser+anthropic/claude-opus-4-20250514",
            # model="omniparser+ollama_chat/gemma3:12b-it-q4_K_M",
            # == Omniparser + Vertex AI Gemini 3 (with thinking_level) ==
            # model="omni+vertex_ai/gemini-3-flash",
            # thinking_level="high", # or "low"
            # media_resolution="medium", # or "low" or "high"
            tools=[computer],
            only_n_most_recent_images=3,
            verbosity=logging.DEBUG,
            trajectory_dir="trajectories",
            use_prompt_caching=True,
            max_trajectory_budget=1.0,
        )

        # Example tasks to demonstrate the agent
        tasks = [
            "Look for a repository named trycua/cua on GitHub.",
            "Check the open issues, open the most recent one and read it.",
            "Clone the repository in users/lume/projects if it doesn't exist yet.",
            "Open the repository with an app named Cursor (on the dock, black background and white cube icon).",
            "From Cursor, open Composer if not already open.",
            "Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.",
        ]

        # Use message-based conversation history
        history = []

        def show(item):
            """Pretty-print one agent output item for debugging."""
            kind = item.get("type")
            if kind == "message":
                for part in item.get("content", []):
                    if part.get("text"):
                        print(f"Agent: {part.get('text')}")
            elif kind == "computer_call":
                action = item.get("action", {})
                print(f"Computer Action: {action.get('type', '')}({action})")
            elif kind == "computer_call_output":
                print("Computer Output: [Screenshot/Result]")

        total = len(tasks)
        for idx, task in enumerate(tasks, start=1):
            print(f"\nExecuting task {idx}/{total}: {task}")

            # Add user message to history
            history.append({"role": "user", "content": task})

            # Run agent with conversation history, accumulating its outputs
            async for result in agent.run(history, stream=False):
                output = result.get("output", [])
                history += output
                for item in output:
                    show(item)

            print(f"✅ Task {idx}/{total} completed: {task}")

    except Exception as e:
        logger.error(f"Error in run_agent_example: {e}")
        traceback.print_exc()
        raise
112 |
113 |
def main():
    """Run the Anthropic agent example."""
    try:
        load_dotenv_files()

        # Graceful Ctrl+C handling
        signal.signal(signal.SIGINT, handle_sigint)

        asyncio.run(run_agent_example())
    except Exception as e:
        print(f"Error running example: {e}")
        traceback.print_exc()
126 |
127 |
128 | if __name__ == "__main__":
129 | main()
130 |
```
--------------------------------------------------------------------------------
/examples/computer_examples_windows.py:
--------------------------------------------------------------------------------
```python
"""Windows cloud-container walkthrough for the Computer API.

Connects to a remote Windows VM via the Cua cloud provider and exercises
keyboard/mouse input, sandboxed code execution, shell commands, screenshots,
the clipboard, and an interactive command REPL.
"""

import asyncio
import os
import sys
import traceback
from pathlib import Path

# Load environment variables from .env file
# NOTE: imports are deliberately interleaved with setup so the .env file is
# loaded before modules that read environment variables at import time.
project_root = Path(__file__).parent.parent
env_file = project_root / ".env"
print(f"Loading environment from: {env_file}")
from computer.helpers import sandboxed
from dotenv import load_dotenv

load_dotenv(env_file)

# Add paths to sys.path if needed
# NOTE(review): PYTHONPATH entries are normally applied at interpreter start;
# presumably this covers launch modes where they were not — confirm.
pythonpath = os.environ.get("PYTHONPATH", "")
for path in pythonpath.split(":"):
    if path and path not in sys.path:
        sys.path.insert(0, path)  # Insert at beginning to prioritize
        print(f"Added to sys.path: {path}")

from computer.computer import Computer
from computer.logger import LogLevel
from computer.providers.base import VMProviderType

# ANSI color codes
RED = "\033[91m"
RESET = "\033[0m"
31 |
async def main():
    """Demonstrate Computer API operations against a remote Windows container.

    Requires CUA_API_KEY and CONTAINER_NAME in the environment. Walks through
    input simulation, sandboxed remote execution, shell commands, screenshots,
    clipboard access, and finishes in an interactive command REPL.
    """
    try:
        print("\n=== Using direct initialization ===")

        # Create a remote Windows computer with Cua
        computer = Computer(
            os_type="windows",
            api_key=os.getenv("CUA_API_KEY"),
            name=os.getenv("CONTAINER_NAME") or "",
            provider_type=VMProviderType.CLOUD,
        )

        try:
            # Run the computer with default parameters
            await computer.run()

            # Create output directory if it doesn't exist
            output_dir = Path("./output")
            output_dir.mkdir(exist_ok=True)

            # Keyboard Actions Examples
            print("\n=== Keyboard Actions ===")
            await computer.interface.type_text("Hello, World!")
            await computer.interface.press_key("enter")

            # Mouse Actions Examples
            print("\n=== Mouse Actions ===")
            await computer.interface.move_cursor(100, 100)
            await computer.interface.left_click()
            await computer.interface.double_click(400, 400)
            await computer.interface.right_click(300, 300)

            print("\n=== RPC ===")
            # Provision a virtualenv inside the container for sandboxed calls
            await computer.venv_install("demo_venv", ["mss"])

            # Body executes inside the container's "demo_venv", not locally;
            # imports must therefore live inside the function.
            @sandboxed("demo_venv")
            def greet_and_print(name):
                import os

                from mss import mss

                # get username
                username = os.getlogin()
                print(f"Hello from inside the container, {name}!")
                print("Username:", username)
                print("Screens:", mss().monitors)

                # take a screenshot
                with mss() as sct:
                    filename = sct.shot(mon=-1, output="C:/Users/azureuser/Desktop/fullscreen.png")
                    print(filename)

                return {"greeted": name, "username": username}

            # Call with args and kwargs
            result = await greet_and_print("John Doe")
            print("Result from sandboxed function:", result)

            # Command Actions Examples
            print("\n=== Command Actions ===")
            result = await computer.interface.run_command("notepad")
            print("Result from command:", result)

            screenshot = await computer.interface.screenshot()
            screenshot_path = output_dir / "screenshot.png"
            with open(screenshot_path, "wb") as f:
                f.write(screenshot)
            print(f"Screenshot saved to: {screenshot_path.absolute()}")

            # Clipboard Actions Examples
            print("\n=== Clipboard Actions ===")
            await computer.interface.set_clipboard("Test clipboard")
            content = await computer.interface.copy_to_clipboard()
            print(f"Clipboard content: {content}")

            # Simple REPL Loop
            print("\n=== Command REPL ===")
            print("Enter commands to run on the remote computer.")
            print("Type 'exit' or 'quit' to leave the REPL.\n")

            while True:
                try:
                    # Get command from user
                    command = input("command> ").strip()

                    # Check for exit commands
                    # NOTE(review): an empty command matches the outer check but
                    # not the inner one, so it falls through and is sent to
                    # run_command("") — likely meant to `continue`; confirm.
                    if command.lower() in ["exit", "quit", ""]:
                        if command.lower() in ["exit", "quit"]:
                            print("Exiting REPL...")
                            break

                    # Run the command
                    result = await computer.interface.run_command(command)

                    print(result.stdout)
                    if result.stderr:
                        print(f"{RED}{result.stderr}{RESET}")
                except KeyboardInterrupt:
                    print("\nExiting REPL...")
                    break
                except Exception as e:
                    print(f"{RED}Error running command: {e}{RESET}")

        finally:
            # Important to clean up resources
            # await computer.stop()
            pass
    except Exception as e:
        print(f"Error in main: {e}")
        traceback.print_exc()
142 |
143 |
144 | if __name__ == "__main__":
145 | asyncio.run(main())
146 |
```