This is page 4 of 20. Use http://codebase.md/trycua/cua?lines=false&page={x} to view the full context.
# Directory Structure
```
├── .cursorignore
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github
│ ├── FUNDING.yml
│ ├── scripts
│ │ ├── get_pyproject_version.py
│ │ └── tests
│ │ ├── __init__.py
│ │ ├── README.md
│ │ └── test_get_pyproject_version.py
│ └── workflows
│ ├── bump-version.yml
│ ├── ci-lume.yml
│ ├── docker-publish-cua-linux.yml
│ ├── docker-publish-cua-windows.yml
│ ├── docker-publish-kasm.yml
│ ├── docker-publish-xfce.yml
│ ├── docker-reusable-publish.yml
│ ├── link-check.yml
│ ├── lint.yml
│ ├── npm-publish-cli.yml
│ ├── npm-publish-computer.yml
│ ├── npm-publish-core.yml
│ ├── publish-lume.yml
│ ├── pypi-publish-agent.yml
│ ├── pypi-publish-computer-server.yml
│ ├── pypi-publish-computer.yml
│ ├── pypi-publish-core.yml
│ ├── pypi-publish-mcp-server.yml
│ ├── pypi-publish-som.yml
│ ├── pypi-reusable-publish.yml
│ ├── python-tests.yml
│ ├── test-cua-models.yml
│ └── test-validation-script.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .prettierrc.yaml
├── .vscode
│ ├── docs.code-workspace
│ ├── extensions.json
│ ├── launch.json
│ ├── libs-ts.code-workspace
│ ├── lume.code-workspace
│ ├── lumier.code-workspace
│ ├── py.code-workspace
│ └── settings.json
├── blog
│ ├── app-use.md
│ ├── assets
│ │ ├── composite-agents.png
│ │ ├── docker-ubuntu-support.png
│ │ ├── hack-booth.png
│ │ ├── hack-closing-ceremony.jpg
│ │ ├── hack-cua-ollama-hud.jpeg
│ │ ├── hack-leaderboard.png
│ │ ├── hack-the-north.png
│ │ ├── hack-winners.jpeg
│ │ ├── hack-workshop.jpeg
│ │ ├── hud-agent-evals.png
│ │ └── trajectory-viewer.jpeg
│ ├── bringing-computer-use-to-the-web.md
│ ├── build-your-own-operator-on-macos-1.md
│ ├── build-your-own-operator-on-macos-2.md
│ ├── cloud-windows-ga-macos-preview.md
│ ├── composite-agents.md
│ ├── computer-use-agents-for-growth-hacking.md
│ ├── cua-hackathon.md
│ ├── cua-playground-preview.md
│ ├── cua-vlm-router.md
│ ├── hack-the-north.md
│ ├── hud-agent-evals.md
│ ├── human-in-the-loop.md
│ ├── introducing-cua-cli.md
│ ├── introducing-cua-cloud-containers.md
│ ├── lume-to-containerization.md
│ ├── neurips-2025-cua-papers.md
│ ├── sandboxed-python-execution.md
│ ├── training-computer-use-models-trajectories-1.md
│ ├── trajectory-viewer.md
│ ├── ubuntu-docker-support.md
│ └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│ ├── .env.example
│ ├── .gitignore
│ ├── content
│ │ └── docs
│ │ ├── agent-sdk
│ │ │ ├── agent-loops.mdx
│ │ │ ├── benchmarks
│ │ │ │ ├── index.mdx
│ │ │ │ ├── interactive.mdx
│ │ │ │ ├── introduction.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── osworld-verified.mdx
│ │ │ │ ├── screenspot-pro.mdx
│ │ │ │ └── screenspot-v2.mdx
│ │ │ ├── callbacks
│ │ │ │ ├── agent-lifecycle.mdx
│ │ │ │ ├── cost-saving.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── logging.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── pii-anonymization.mdx
│ │ │ │ └── trajectories.mdx
│ │ │ ├── chat-history.mdx
│ │ │ ├── custom-tools.mdx
│ │ │ ├── customizing-computeragent.mdx
│ │ │ ├── integrations
│ │ │ │ ├── hud.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── observability.mdx
│ │ │ ├── mcp-server
│ │ │ │ ├── client-integrations.mdx
│ │ │ │ ├── configuration.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── llm-integrations.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── tools.mdx
│ │ │ │ └── usage.mdx
│ │ │ ├── message-format.mdx
│ │ │ ├── meta.json
│ │ │ ├── migration-guide.mdx
│ │ │ ├── prompt-caching.mdx
│ │ │ ├── supported-agents
│ │ │ │ ├── composed-agents.mdx
│ │ │ │ ├── computer-use-agents.mdx
│ │ │ │ ├── grounding-models.mdx
│ │ │ │ ├── human-in-the-loop.mdx
│ │ │ │ └── meta.json
│ │ │ ├── supported-model-providers
│ │ │ │ ├── cua-vlm-router.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ └── local-models.mdx
│ │ │ ├── telemetry.mdx
│ │ │ └── usage-tracking.mdx
│ │ ├── cli-playbook
│ │ │ ├── commands.mdx
│ │ │ ├── index.mdx
│ │ │ └── meta.json
│ │ ├── computer-sdk
│ │ │ ├── cloud-vm-management.mdx
│ │ │ ├── commands.mdx
│ │ │ ├── computer-server
│ │ │ │ ├── Commands.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── meta.json
│ │ │ │ ├── REST-API.mdx
│ │ │ │ └── WebSocket-API.mdx
│ │ │ ├── computer-ui.mdx
│ │ │ ├── computers.mdx
│ │ │ ├── custom-computer-handlers.mdx
│ │ │ ├── meta.json
│ │ │ ├── sandboxed-python.mdx
│ │ │ └── tracing-api.mdx
│ │ ├── example-usecases
│ │ │ ├── form-filling.mdx
│ │ │ ├── gemini-complex-ui-navigation.mdx
│ │ │ ├── meta.json
│ │ │ ├── post-event-contact-export.mdx
│ │ │ └── windows-app-behind-vpn.mdx
│ │ ├── get-started
│ │ │ ├── meta.json
│ │ │ └── quickstart.mdx
│ │ ├── index.mdx
│ │ ├── macos-vm-cli-playbook
│ │ │ ├── lume
│ │ │ │ ├── cli-reference.mdx
│ │ │ │ ├── faq.md
│ │ │ │ ├── http-api.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ ├── meta.json
│ │ │ │ └── prebuilt-images.mdx
│ │ │ ├── lumier
│ │ │ │ ├── building-lumier.mdx
│ │ │ │ ├── docker-compose.mdx
│ │ │ │ ├── docker.mdx
│ │ │ │ ├── index.mdx
│ │ │ │ ├── installation.mdx
│ │ │ │ └── meta.json
│ │ │ └── meta.json
│ │ └── meta.json
│ ├── next.config.mjs
│ ├── package-lock.json
│ ├── package.json
│ ├── pnpm-lock.yaml
│ ├── postcss.config.mjs
│ ├── public
│ │ └── img
│ │ ├── agent_gradio_ui.png
│ │ ├── agent.png
│ │ ├── bg-dark.jpg
│ │ ├── bg-light.jpg
│ │ ├── cli.png
│ │ ├── computer.png
│ │ ├── grounding-with-gemini3.gif
│ │ ├── hero.png
│ │ ├── laminar_trace_example.png
│ │ ├── som_box_threshold.png
│ │ └── som_iou_threshold.png
│ ├── README.md
│ ├── source.config.ts
│ ├── src
│ │ ├── app
│ │ │ ├── (home)
│ │ │ │ ├── [[...slug]]
│ │ │ │ │ └── page.tsx
│ │ │ │ └── layout.tsx
│ │ │ ├── api
│ │ │ │ ├── posthog
│ │ │ │ │ └── [...path]
│ │ │ │ │ └── route.ts
│ │ │ │ └── search
│ │ │ │ └── route.ts
│ │ │ ├── favicon.ico
│ │ │ ├── global.css
│ │ │ ├── layout.config.tsx
│ │ │ ├── layout.tsx
│ │ │ ├── llms.mdx
│ │ │ │ └── [[...slug]]
│ │ │ │ └── route.ts
│ │ │ ├── llms.txt
│ │ │ │ └── route.ts
│ │ │ ├── robots.ts
│ │ │ └── sitemap.ts
│ │ ├── assets
│ │ │ ├── discord-black.svg
│ │ │ ├── discord-white.svg
│ │ │ ├── logo-black.svg
│ │ │ └── logo-white.svg
│ │ ├── components
│ │ │ ├── analytics-tracker.tsx
│ │ │ ├── cookie-consent.tsx
│ │ │ ├── doc-actions-menu.tsx
│ │ │ ├── editable-code-block.tsx
│ │ │ ├── footer.tsx
│ │ │ ├── hero.tsx
│ │ │ ├── iou.tsx
│ │ │ ├── mermaid.tsx
│ │ │ └── page-feedback.tsx
│ │ ├── lib
│ │ │ ├── llms.ts
│ │ │ └── source.ts
│ │ ├── mdx-components.tsx
│ │ └── providers
│ │ └── posthog-provider.tsx
│ └── tsconfig.json
├── examples
│ ├── agent_examples.py
│ ├── agent_ui_examples.py
│ ├── browser_tool_example.py
│ ├── cloud_api_examples.py
│ ├── computer_examples_windows.py
│ ├── computer_examples.py
│ ├── computer_ui_examples.py
│ ├── computer-example-ts
│ │ ├── .env.example
│ │ ├── .gitignore
│ │ ├── package-lock.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── README.md
│ │ ├── src
│ │ │ ├── helpers.ts
│ │ │ └── index.ts
│ │ └── tsconfig.json
│ ├── docker_examples.py
│ ├── evals
│ │ ├── hud_eval_examples.py
│ │ └── wikipedia_most_linked.txt
│ ├── pylume_examples.py
│ ├── sandboxed_functions_examples.py
│ ├── som_examples.py
│ ├── tracing_examples.py
│ ├── utils.py
│ └── winsandbox_example.py
├── img
│ ├── agent_gradio_ui.png
│ ├── agent.png
│ ├── cli.png
│ ├── computer.png
│ ├── logo_black.png
│ └── logo_white.png
├── libs
│ ├── kasm
│ │ ├── Dockerfile
│ │ ├── LICENSE
│ │ ├── README.md
│ │ └── src
│ │ └── ubuntu
│ │ └── install
│ │ └── firefox
│ │ ├── custom_startup.sh
│ │ ├── firefox.desktop
│ │ └── install_firefox.sh
│ ├── lume
│ │ ├── .cursorignore
│ │ ├── CONTRIBUTING.md
│ │ ├── Development.md
│ │ ├── img
│ │ │ └── cli.png
│ │ ├── Package.resolved
│ │ ├── Package.swift
│ │ ├── README.md
│ │ ├── resources
│ │ │ └── lume.entitlements
│ │ ├── scripts
│ │ │ ├── build
│ │ │ │ ├── build-debug.sh
│ │ │ │ ├── build-release-notarized.sh
│ │ │ │ └── build-release.sh
│ │ │ └── install.sh
│ │ ├── src
│ │ │ ├── Commands
│ │ │ │ ├── Clone.swift
│ │ │ │ ├── Config.swift
│ │ │ │ ├── Create.swift
│ │ │ │ ├── Delete.swift
│ │ │ │ ├── Get.swift
│ │ │ │ ├── Images.swift
│ │ │ │ ├── IPSW.swift
│ │ │ │ ├── List.swift
│ │ │ │ ├── Logs.swift
│ │ │ │ ├── Options
│ │ │ │ │ └── FormatOption.swift
│ │ │ │ ├── Prune.swift
│ │ │ │ ├── Pull.swift
│ │ │ │ ├── Push.swift
│ │ │ │ ├── Run.swift
│ │ │ │ ├── Serve.swift
│ │ │ │ ├── Set.swift
│ │ │ │ └── Stop.swift
│ │ │ ├── ContainerRegistry
│ │ │ │ ├── ImageContainerRegistry.swift
│ │ │ │ ├── ImageList.swift
│ │ │ │ └── ImagesPrinter.swift
│ │ │ ├── Errors
│ │ │ │ └── Errors.swift
│ │ │ ├── FileSystem
│ │ │ │ ├── Home.swift
│ │ │ │ ├── Settings.swift
│ │ │ │ ├── VMConfig.swift
│ │ │ │ ├── VMDirectory.swift
│ │ │ │ └── VMLocation.swift
│ │ │ ├── LumeController.swift
│ │ │ ├── Main.swift
│ │ │ ├── Server
│ │ │ │ ├── Handlers.swift
│ │ │ │ ├── HTTP.swift
│ │ │ │ ├── Requests.swift
│ │ │ │ ├── Responses.swift
│ │ │ │ └── Server.swift
│ │ │ ├── Utils
│ │ │ │ ├── CommandRegistry.swift
│ │ │ │ ├── CommandUtils.swift
│ │ │ │ ├── Logger.swift
│ │ │ │ ├── NetworkUtils.swift
│ │ │ │ ├── Path.swift
│ │ │ │ ├── ProcessRunner.swift
│ │ │ │ ├── ProgressLogger.swift
│ │ │ │ ├── String.swift
│ │ │ │ └── Utils.swift
│ │ │ ├── Virtualization
│ │ │ │ ├── DarwinImageLoader.swift
│ │ │ │ ├── DHCPLeaseParser.swift
│ │ │ │ ├── ImageLoaderFactory.swift
│ │ │ │ └── VMVirtualizationService.swift
│ │ │ ├── VM
│ │ │ │ ├── DarwinVM.swift
│ │ │ │ ├── LinuxVM.swift
│ │ │ │ ├── VM.swift
│ │ │ │ ├── VMDetails.swift
│ │ │ │ ├── VMDetailsPrinter.swift
│ │ │ │ ├── VMDisplayResolution.swift
│ │ │ │ └── VMFactory.swift
│ │ │ └── VNC
│ │ │ ├── PassphraseGenerator.swift
│ │ │ └── VNCService.swift
│ │ └── tests
│ │ ├── Mocks
│ │ │ ├── MockVM.swift
│ │ │ ├── MockVMVirtualizationService.swift
│ │ │ └── MockVNCService.swift
│ │ ├── VM
│ │ │ └── VMDetailsPrinterTests.swift
│ │ ├── VMTests.swift
│ │ ├── VMVirtualizationServiceTests.swift
│ │ └── VNCServiceTests.swift
│ ├── lumier
│ │ ├── .dockerignore
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── bin
│ │ │ └── entry.sh
│ │ ├── config
│ │ │ └── constants.sh
│ │ ├── hooks
│ │ │ └── on-logon.sh
│ │ └── lib
│ │ ├── utils.sh
│ │ └── vm.sh
│ ├── python
│ │ ├── agent
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── agent
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── adapters
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── cua_adapter.py
│ │ │ │ │ ├── huggingfacelocal_adapter.py
│ │ │ │ │ ├── human_adapter.py
│ │ │ │ │ ├── mlxvlm_adapter.py
│ │ │ │ │ └── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ └── qwen2_5_vl.py
│ │ │ │ ├── agent.py
│ │ │ │ ├── callbacks
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── budget_manager.py
│ │ │ │ │ ├── image_retention.py
│ │ │ │ │ ├── logging.py
│ │ │ │ │ ├── operator_validator.py
│ │ │ │ │ ├── pii_anonymization.py
│ │ │ │ │ ├── prompt_instructions.py
│ │ │ │ │ ├── telemetry.py
│ │ │ │ │ └── trajectory_saver.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── computers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cua.py
│ │ │ │ │ └── custom.py
│ │ │ │ ├── decorators.py
│ │ │ │ ├── human_tool
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ ├── server.py
│ │ │ │ │ └── ui.py
│ │ │ │ ├── integrations
│ │ │ │ │ └── hud
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── agent.py
│ │ │ │ │ └── proxy.py
│ │ │ │ ├── loops
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── anthropic.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── composed_grounded.py
│ │ │ │ │ ├── gelato.py
│ │ │ │ │ ├── gemini.py
│ │ │ │ │ ├── generic_vlm.py
│ │ │ │ │ ├── glm45v.py
│ │ │ │ │ ├── gta1.py
│ │ │ │ │ ├── holo.py
│ │ │ │ │ ├── internvl.py
│ │ │ │ │ ├── model_types.csv
│ │ │ │ │ ├── moondream3.py
│ │ │ │ │ ├── omniparser.py
│ │ │ │ │ ├── openai.py
│ │ │ │ │ ├── opencua.py
│ │ │ │ │ ├── uiins.py
│ │ │ │ │ ├── uitars.py
│ │ │ │ │ └── uitars2.py
│ │ │ │ ├── proxy
│ │ │ │ │ ├── examples.py
│ │ │ │ │ └── handlers.py
│ │ │ │ ├── responses.py
│ │ │ │ ├── tools
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── browser_tool.py
│ │ │ │ ├── types.py
│ │ │ │ └── ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ └── gradio
│ │ │ │ ├── __init__.py
│ │ │ │ ├── app.py
│ │ │ │ └── ui_components.py
│ │ │ ├── benchmarks
│ │ │ │ ├── .gitignore
│ │ │ │ ├── contrib.md
│ │ │ │ ├── interactive.py
│ │ │ │ ├── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ └── gta1.py
│ │ │ │ ├── README.md
│ │ │ │ ├── ss-pro.py
│ │ │ │ ├── ss-v2.py
│ │ │ │ └── utils.py
│ │ │ ├── example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_computer_agent.py
│ │ ├── bench-ui
│ │ │ ├── bench_ui
│ │ │ │ ├── __init__.py
│ │ │ │ ├── api.py
│ │ │ │ └── child.py
│ │ │ ├── examples
│ │ │ │ ├── folder_example.py
│ │ │ │ ├── gui
│ │ │ │ │ ├── index.html
│ │ │ │ │ ├── logo.svg
│ │ │ │ │ └── styles.css
│ │ │ │ ├── output_overlay.png
│ │ │ │ └── simple_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ └── test_port_detection.py
│ │ ├── computer
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer
│ │ │ │ ├── __init__.py
│ │ │ │ ├── computer.py
│ │ │ │ ├── diorama_computer.py
│ │ │ │ ├── helpers.py
│ │ │ │ ├── interface
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ ├── models.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── logger.py
│ │ │ │ ├── models.py
│ │ │ │ ├── providers
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── cloud
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── docker
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── lume
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── lume_api.py
│ │ │ │ │ ├── lumier
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── provider.py
│ │ │ │ │ ├── types.py
│ │ │ │ │ └── winsandbox
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── provider.py
│ │ │ │ │ └── setup_script.ps1
│ │ │ │ ├── tracing_wrapper.py
│ │ │ │ ├── tracing.py
│ │ │ │ ├── ui
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── __main__.py
│ │ │ │ │ └── gradio
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── app.py
│ │ │ │ └── utils.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_computer.py
│ │ ├── computer-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── computer_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── browser.py
│ │ │ │ ├── cli.py
│ │ │ │ ├── diorama
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── diorama_computer.py
│ │ │ │ │ ├── diorama.py
│ │ │ │ │ ├── draw.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── safezone.py
│ │ │ │ ├── handlers
│ │ │ │ │ ├── base.py
│ │ │ │ │ ├── factory.py
│ │ │ │ │ ├── generic.py
│ │ │ │ │ ├── linux.py
│ │ │ │ │ ├── macos.py
│ │ │ │ │ └── windows.py
│ │ │ │ ├── main.py
│ │ │ │ ├── server.py
│ │ │ │ ├── utils
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── wallpaper.py
│ │ │ │ └── watchdog.py
│ │ │ ├── examples
│ │ │ │ ├── __init__.py
│ │ │ │ └── usage_example.py
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ ├── run_server.py
│ │ │ ├── test_connection.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_server.py
│ │ ├── core
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── core
│ │ │ │ ├── __init__.py
│ │ │ │ └── telemetry
│ │ │ │ ├── __init__.py
│ │ │ │ └── posthog.py
│ │ │ ├── poetry.toml
│ │ │ ├── pyproject.toml
│ │ │ ├── README.md
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_telemetry.py
│ │ ├── mcp-server
│ │ │ ├── .bumpversion.cfg
│ │ │ ├── build-extension.py
│ │ │ ├── CONCURRENT_SESSIONS.md
│ │ │ ├── desktop-extension
│ │ │ │ ├── cua-extension.mcpb
│ │ │ │ ├── desktop_extension.png
│ │ │ │ ├── manifest.json
│ │ │ │ ├── README.md
│ │ │ │ ├── requirements.txt
│ │ │ │ ├── run_server.sh
│ │ │ │ └── setup.py
│ │ │ ├── mcp_server
│ │ │ │ ├── __init__.py
│ │ │ │ ├── __main__.py
│ │ │ │ ├── server.py
│ │ │ │ └── session_manager.py
│ │ │ ├── pdm.lock
│ │ │ ├── pyproject.toml
│ │ │ ├── QUICK_TEST_COMMANDS.sh
│ │ │ ├── quick_test_local_option.py
│ │ │ ├── README.md
│ │ │ ├── scripts
│ │ │ │ ├── install_mcp_server.sh
│ │ │ │ └── start_mcp_server.sh
│ │ │ ├── test_mcp_server_local_option.py
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_mcp_server.py
│ │ ├── pylume
│ │ │ └── tests
│ │ │ ├── conftest.py
│ │ │ └── test_pylume.py
│ │ └── som
│ │ ├── .bumpversion.cfg
│ │ ├── LICENSE
│ │ ├── poetry.toml
│ │ ├── pyproject.toml
│ │ ├── README.md
│ │ ├── som
│ │ │ ├── __init__.py
│ │ │ ├── detect.py
│ │ │ ├── detection.py
│ │ │ ├── models.py
│ │ │ ├── ocr.py
│ │ │ ├── util
│ │ │ │ └── utils.py
│ │ │ └── visualization.py
│ │ └── tests
│ │ ├── conftest.py
│ │ └── test_omniparser.py
│ ├── qemu-docker
│ │ ├── linux
│ │ │ ├── Dockerfile
│ │ │ ├── README.md
│ │ │ └── src
│ │ │ ├── entry.sh
│ │ │ └── vm
│ │ │ ├── image
│ │ │ │ └── README.md
│ │ │ └── setup
│ │ │ ├── install.sh
│ │ │ ├── setup-cua-server.sh
│ │ │ └── setup.sh
│ │ ├── README.md
│ │ └── windows
│ │ ├── Dockerfile
│ │ ├── README.md
│ │ └── src
│ │ ├── entry.sh
│ │ └── vm
│ │ ├── image
│ │ │ └── README.md
│ │ └── setup
│ │ ├── install.bat
│ │ ├── on-logon.ps1
│ │ ├── setup-cua-server.ps1
│ │ ├── setup-utils.psm1
│ │ └── setup.ps1
│ ├── typescript
│ │ ├── .gitignore
│ │ ├── .nvmrc
│ │ ├── agent
│ │ │ ├── examples
│ │ │ │ ├── playground-example.html
│ │ │ │ └── README.md
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── client.ts
│ │ │ │ ├── index.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ └── client.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── computer
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── computer
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── providers
│ │ │ │ │ │ ├── base.ts
│ │ │ │ │ │ ├── cloud.ts
│ │ │ │ │ │ └── index.ts
│ │ │ │ │ └── types.ts
│ │ │ │ ├── index.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── base.ts
│ │ │ │ │ ├── factory.ts
│ │ │ │ │ ├── index.ts
│ │ │ │ │ ├── linux.ts
│ │ │ │ │ ├── macos.ts
│ │ │ │ │ └── windows.ts
│ │ │ │ └── types.ts
│ │ │ ├── tests
│ │ │ │ ├── computer
│ │ │ │ │ └── cloud.test.ts
│ │ │ │ ├── interface
│ │ │ │ │ ├── factory.test.ts
│ │ │ │ │ ├── index.test.ts
│ │ │ │ │ ├── linux.test.ts
│ │ │ │ │ ├── macos.test.ts
│ │ │ │ │ └── windows.test.ts
│ │ │ │ └── setup.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── core
│ │ │ ├── .editorconfig
│ │ │ ├── .gitattributes
│ │ │ ├── .gitignore
│ │ │ ├── LICENSE
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── index.ts
│ │ │ │ └── telemetry
│ │ │ │ ├── clients
│ │ │ │ │ ├── index.ts
│ │ │ │ │ └── posthog.ts
│ │ │ │ └── index.ts
│ │ │ ├── tests
│ │ │ │ └── telemetry.test.ts
│ │ │ ├── tsconfig.json
│ │ │ ├── tsdown.config.ts
│ │ │ └── vitest.config.ts
│ │ ├── cua-cli
│ │ │ ├── .gitignore
│ │ │ ├── .prettierrc
│ │ │ ├── bun.lock
│ │ │ ├── CLAUDE.md
│ │ │ ├── index.ts
│ │ │ ├── package.json
│ │ │ ├── README.md
│ │ │ ├── src
│ │ │ │ ├── auth.ts
│ │ │ │ ├── cli.ts
│ │ │ │ ├── commands
│ │ │ │ │ ├── auth.ts
│ │ │ │ │ └── sandbox.ts
│ │ │ │ ├── config.ts
│ │ │ │ ├── http.ts
│ │ │ │ ├── storage.ts
│ │ │ │ └── util.ts
│ │ │ └── tsconfig.json
│ │ ├── package.json
│ │ ├── pnpm-lock.yaml
│ │ ├── pnpm-workspace.yaml
│ │ └── README.md
│ └── xfce
│ ├── .dockerignore
│ ├── .gitignore
│ ├── Development.md
│ ├── Dockerfile
│ ├── Dockerfile.dev
│ ├── README.md
│ └── src
│ ├── scripts
│ │ ├── resize-display.sh
│ │ ├── start-computer-server.sh
│ │ ├── start-novnc.sh
│ │ ├── start-vnc.sh
│ │ └── xstartup.sh
│ ├── supervisor
│ │ └── supervisord.conf
│ └── xfce-config
│ ├── helpers.rc
│ ├── xfce4-power-manager.xml
│ └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│ ├── agent_nb.ipynb
│ ├── blog
│ │ ├── build-your-own-operator-on-macos-1.ipynb
│ │ └── build-your-own-operator-on-macos-2.ipynb
│ ├── composite_agents_docker_nb.ipynb
│ ├── computer_nb.ipynb
│ ├── computer_server_nb.ipynb
│ ├── customizing_computeragent.ipynb
│ ├── eval_osworld.ipynb
│ ├── ollama_nb.ipynb
│ ├── README.md
│ ├── sota_hackathon_cloud.ipynb
│ └── sota_hackathon.ipynb
├── package-lock.json
├── package.json
├── pnpm-lock.yaml
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── scripts
│ ├── install-cli.ps1
│ ├── install-cli.sh
│ ├── playground-docker.sh
│ ├── playground.sh
│ ├── run-docker-dev.sh
│ └── typescript-typecheck.js
├── TESTING.md
├── tests
│ ├── agent_loop_testing
│ │ ├── agent_test.py
│ │ └── README.md
│ ├── pytest.ini
│ ├── shell_cmd.py
│ ├── test_files.py
│ ├── test_mcp_server_session_management.py
│ ├── test_mcp_server_streaming.py
│ ├── test_shell_bash.py
│ ├── test_telemetry.py
│ ├── test_tracing.py
│ ├── test_venv.py
│ └── test_watchdog.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/libs/typescript/computer/src/computer/providers/base.ts:
--------------------------------------------------------------------------------
```typescript
import os from 'node:os';
import { Telemetry } from '@trycua/core';
import pino from 'pino';
import type { OSType } from '../../types';
import type { BaseComputerConfig, Display, VMProviderType } from '../types';
const logger = pino({ name: 'computer.provider_base' });
/**
 * Base Computer class with shared functionality
 */
export abstract class BaseComputer {
  protected name: string;
  protected osType: OSType;
  protected vmProvider?: VMProviderType;
  protected telemetry: Telemetry;

  /**
   * @param config Identity of this computer: its name and OS type.
   */
  constructor(config: BaseComputerConfig) {
    const { name, osType } = config;
    this.name = name;
    this.osType = osType;

    // Report module startup and host environment details via telemetry.
    this.telemetry = new Telemetry();
    const nodeVersion = process.version;
    this.telemetry.recordEvent('module_init', {
      module: 'computer',
      version: process.env.npm_package_version,
      node_version: nodeVersion,
    });
    this.telemetry.recordEvent('computer_initialized', {
      os: os.platform(),
      os_version: os.version(),
      node_version: nodeVersion,
    });
  }

  /**
   * Get the name of the computer
   */
  getName(): string {
    return this.name;
  }

  /**
   * Get the OS type of the computer
   */
  getOSType(): OSType {
    return this.osType;
  }

  /**
   * Get the VM provider type
   */
  getVMProviderType(): VMProviderType | undefined {
    return this.vmProvider;
  }

  /**
   * Shared method available to all computer types
   */
  async disconnect(): Promise<void> {
    logger.info(`Disconnecting from ${this.name}`);
    // Implementation would go here
  }

  /**
   * Parse display string into Display object
   * @param display Display string in format "WIDTHxHEIGHT"
   * @returns Display object
   * @throws Error when the string does not match the expected format
   */
  public static parseDisplayString(display: string): Display {
    const parsed = /^(\d+)x(\d+)$/.exec(display);
    if (parsed === null) {
      throw new Error(`Invalid display format: ${display}. Expected format: WIDTHxHEIGHT`);
    }
    const [, widthDigits, heightDigits] = parsed;
    return {
      width: Number.parseInt(widthDigits, 10),
      height: Number.parseInt(heightDigits, 10),
    };
  }

  /**
   * Parse memory string to MB integer.
   *
   * Examples:
   *   "8GB"    -> 8192
   *   "1024MB" -> 1024
   *   "512"    -> 512
   *
   * @param memoryStr - Memory string to parse
   * @returns Memory value in MB (0 for an empty string)
   * @throws Error when the string is not a number optionally followed by GB/MB
   */
  public static parseMemoryString(memoryStr: string): number {
    if (!memoryStr) {
      return 0;
    }
    // Matching is case-insensitive: normalize before applying the pattern.
    const normalized = memoryStr.toUpperCase().trim();
    const parsed = normalized.match(/^(\d+(?:\.\d+)?)\s*(GB|MB)?$/);
    if (!parsed) {
      throw new Error(`Invalid memory format: ${memoryStr}`);
    }
    const amount = Number.parseFloat(parsed[1]);
    // A missing unit defaults to MB; GB values are scaled up to MB.
    const unit = parsed[2] || 'MB';
    const megabytes = unit === 'GB' ? amount * 1024 : amount;
    return Math.round(megabytes);
  }
}
```
--------------------------------------------------------------------------------
/docs/content/docs/computer-sdk/computer-server/Commands.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Supported Commands
description: List of all commands supported by the Computer Server API (WebSocket and REST).
---
# Commands Reference
This page lists all supported commands for the Computer Server, available via both WebSocket and REST API endpoints.
| Command | Description |
| ---------------------- | ------------------------------------- |
| version | Get protocol and package version info |
| run_command | Run a shell command |
| screenshot | Capture a screenshot |
| get_screen_size | Get the screen size |
| get_cursor_position | Get the current mouse cursor position |
| mouse_down | Mouse button down |
| mouse_up | Mouse button up |
| left_click | Left mouse click |
| right_click | Right mouse click |
| double_click | Double mouse click |
| move_cursor | Move mouse cursor to coordinates |
| drag_to | Drag mouse to coordinates |
| drag | Drag mouse by offset |
| key_down | Keyboard key down |
| key_up | Keyboard key up |
| type_text | Type text |
| press_key | Press a single key |
| hotkey | Press a hotkey combination |
| scroll | Scroll the screen |
| scroll_down | Scroll down |
| scroll_up | Scroll up |
| copy_to_clipboard | Copy text to clipboard |
| set_clipboard | Set clipboard content |
| file_exists | Check if a file exists |
| directory_exists | Check if a directory exists |
| list_dir | List files/directories in a directory |
| read_text | Read text from a file |
| write_text | Write text to a file |
| read_bytes | Read bytes from a file |
| write_bytes | Write bytes to a file |
| get_file_size | Get file size |
| delete_file | Delete a file |
| create_dir | Create a directory |
| delete_dir | Delete a directory |
| get_accessibility_tree | Get accessibility tree (if supported) |
| find_element | Find element in accessibility tree |
| diorama_cmd | Run a diorama command (if supported) |
```
--------------------------------------------------------------------------------
/libs/python/computer/computer/logger.py:
--------------------------------------------------------------------------------
```python
"""Logging utilities for the Computer module."""
import logging
from enum import IntEnum
# Keep LogLevel for backward compatibility, but it will be deprecated
class LogLevel(IntEnum):
    """Log levels for logging. Deprecated - use standard logging levels instead."""

    QUIET = 0  # Only warnings and errors
    NORMAL = 1  # Info level, standard output
    VERBOSE = 2  # More detailed information
    DEBUG = 3  # Full debug information


# Map LogLevel to standard logging levels for backward compatibility
LOGLEVEL_MAP = dict(
    (
        (LogLevel.QUIET, logging.WARNING),
        (LogLevel.NORMAL, logging.INFO),
        (LogLevel.VERBOSE, logging.DEBUG),
        (LogLevel.DEBUG, logging.DEBUG),
    )
)
class Logger:
    """Logger class for Computer."""

    def __init__(self, name: str, verbosity: int):
        """Initialize the logger.

        Args:
            name: The name of the logger.
            verbosity: The log level (use standard logging levels like logging.INFO).
                For backward compatibility, LogLevel enum values are also accepted.
        """
        self.logger = logging.getLogger(name)
        # Legacy LogLevel values are translated to standard logging levels;
        # plain ints are used as-is.
        self.verbosity = (
            LOGLEVEL_MAP.get(verbosity, logging.INFO)
            if isinstance(verbosity, LogLevel)
            else verbosity
        )
        self._configure()

    def _configure(self):
        """Configure the logger based on log level."""
        self.logger.setLevel(self.verbosity)
        # Announce the effective level. Thresholds are checked in ascending
        # order and only the first match is emitted, mirroring an if/elif
        # chain; levels above CRITICAL announce nothing.
        announcements = (
            (logging.DEBUG, self.logger.info, "Logger set to DEBUG level"),
            (logging.INFO, self.logger.info, "Logger set to INFO level"),
            (logging.WARNING, self.logger.warning, "Logger set to WARNING level"),
            (logging.ERROR, self.logger.warning, "Logger set to ERROR level"),
            (logging.CRITICAL, self.logger.warning, "Logger set to CRITICAL level"),
        )
        for threshold, emit, message in announcements:
            if self.verbosity <= threshold:
                emit(message)
                break

    def debug(self, message: str):
        """Log a debug message if log level is DEBUG or lower."""
        self.logger.debug(message)

    def info(self, message: str):
        """Log an info message if log level is INFO or lower."""
        self.logger.info(message)

    def verbose(self, message: str):
        """Log a verbose message between INFO and DEBUG levels."""
        # No standard verbose level exists, so emit at debug level with a
        # [VERBOSE] prefix for backward compatibility.
        self.logger.debug(f"[VERBOSE] {message}")

    def warning(self, message: str):
        """Log a warning message."""
        self.logger.warning(message)

    def error(self, message: str):
        """Log an error message."""
        self.logger.error(message)
```
--------------------------------------------------------------------------------
/docs/content/docs/computer-sdk/sandboxed-python.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Sandboxed Python
slug: sandboxed-python
---
<Callout>
A corresponding <a href="https://github.com/trycua/cua/blob/main/examples/sandboxed_functions_examples.py" target="_blank">Python example</a> is available for this documentation.
</Callout>
You can run Python functions securely inside a sandboxed virtual environment on a remote Cua Computer. This is useful for executing untrusted user code, isolating dependencies, or providing a safe environment for automation tasks.
## How It Works
The `sandboxed` decorator from the Computer SDK wraps a Python function so that it is executed remotely in a specified virtual environment on the target Computer. The function and its arguments are serialized, sent to the remote, and executed in isolation. Results or errors are returned to the caller.
## Example Usage
```python
from computer import Computer
from computer.helpers import sandboxed
@sandboxed()
def read_file(location: str) -> str:
"""Read contents of a file"""
with open(location, 'r') as f:
return f.read()
async def main():
async with Computer(os_type="linux", provider_type="cloud", name="my-sandbox", api_key="...") as computer:
# Call the sandboxed function (runs remotely)
result = await read_file("/etc/hostname")
print(result)
```
## Installing Python Packages
You can specify the virtual environment name and target computer:
```python
@sandboxed(venv_name="myenv", computer=my_computer, max_retries=5)
def my_function(...):
...
```
You can also install packages in the virtual environment using the `venv_install` method:
```python
await my_computer.venv_install("myenv", ["requests"])
```
## Example: Interacting with macOS Applications
You can use sandboxed functions to interact with macOS applications on a local Cua Computer (requires `os_type="darwin"`). This is particularly useful for automation tasks that involve GUI applications.
```python
# Example: Use sandboxed functions to execute code in a Cua Sandbox
from computer.helpers import sandboxed
await computer.venv_install("demo_venv", ["macos-pyxa"]) # Install packages in a virtual environment
@sandboxed("demo_venv")
def greet_and_print(name):
    """Greet the caller and return the HTML of the current Safari tab"""
import PyXA
safari = PyXA.Application("Safari")
html = safari.current_document.source()
print(f"Hello from inside the sandbox, {name}!")
return {"greeted": name, "safari_html": html}
# When a @sandboxed function is called, it will execute in the sandbox
result = await greet_and_print("Cua")
# Result: {"greeted": "Cua", "safari_html": "<html>...</html>"}
# stdout and stderr are also captured and printed / raised
print("Result from sandboxed function:", result)
```
## Error Handling
If the remote execution fails, the decorator will retry up to `max_retries` times. If all attempts fail, the last exception is raised locally.
```
--------------------------------------------------------------------------------
/libs/python/mcp-server/desktop-extension/manifest.json:
--------------------------------------------------------------------------------
```json
{
"manifest_version": "0.2",
"name": "cua-mcp-server",
"display_name": "CUA Computer-Use Agent",
"version": "1.0.0",
"description": "Computer-Use Agent (CUA) MCP server for desktop automation and interaction",
"long_description": "The CUA Computer-Use Agent extension provides powerful desktop automation capabilities through Claude Desktop. It can take screenshots, interact with applications, and execute complex computer tasks using AI agents. Perfect for automating repetitive desktop workflows, testing applications, and performing computer-based tasks through natural language instructions.",
"author": {
"name": "Cua",
"email": "[email protected]",
"url": "https://trycua.com"
},
"repository": {
"type": "git",
"url": "https://github.com/trycua/cua"
},
"homepage": "https://trycua.com",
"documentation": "https://docs.trycua.com",
"support": "https://github.com/trycua/cua/issues",
"icon": "desktop_extension.png",
"server": {
"type": "python",
"entry_point": "server.py",
"mcp_config": {
"command": "${__dirname}/run_server.sh",
"args": ["${__dirname}/server.py"],
"env": {
"PYTHONPATH": "${__dirname}",
"API_KEY": "${user_config.api_key}",
"CUA_MODEL_NAME": "${user_config.model_name}",
"CUA_MAX_IMAGES": "${user_config.max_images}"
}
}
},
"tools": [
{
"name": "screenshot_cua",
"description": "Take a screenshot of the current desktop screen and return the image"
},
{
"name": "run_cua_task",
"description": "Run a Computer-Use Agent task on the desktop and return the result with screenshot"
},
{
"name": "run_multi_cua_tasks",
"description": "Run multiple Computer-Use Agent tasks sequentially or concurrently"
},
{
"name": "get_session_stats",
"description": "Get statistics about active sessions and resource usage"
},
{
"name": "cleanup_session",
"description": "Cleanup a specific session and release its resources"
}
],
"keywords": ["automation", "computer-use", "desktop", "ai-agent", "productivity"],
"license": "MIT",
"user_config": {
"api_key": {
"type": "string",
"title": "API Key",
"description": "Your API key for the AI model (Anthropic, OpenAI, etc.)",
"sensitive": true,
"required": true
},
"model_name": {
"type": "string",
"title": "Model Name",
"description": "The AI model to use for computer tasks (e.g., anthropic/claude-sonnet-4-20250514, openai/gpt-4o)",
"default": "anthropic/claude-sonnet-4-20250514",
"required": false
},
"max_images": {
"type": "number",
"title": "Maximum Images",
"description": "Maximum number of recent images to keep in context (default: 3)",
"default": 3,
"min": 1,
"max": 10,
"required": false
}
}
}
```
--------------------------------------------------------------------------------
/libs/lume/tests/VNCServiceTests.swift:
--------------------------------------------------------------------------------
```swift
import Foundation
import Testing
@testable import lume
@Test("VNCService starts correctly")
func testVNCServiceStart() async throws {
let tempDir = try createTempDirectory()
let vmDir = VMDirectory(Path(tempDir.path))
let service = await MockVNCService(vmDirectory: vmDir)
// Initial state
let isRunning = await service.isRunning
let url = await service.url
#expect(!isRunning)
#expect(url == nil)
// Start service
try await service.start(port: 5900, virtualMachine: nil)
#expect(await service.isRunning)
#expect(await service.url?.contains("5900") ?? false)
}
@Test("VNCService stops correctly")
func testVNCServiceStop() async throws {
let tempDir = try createTempDirectory()
let vmDir = VMDirectory(Path(tempDir.path))
let service = await MockVNCService(vmDirectory: vmDir)
try await service.start(port: 5900, virtualMachine: nil)
await service.stop()
let isRunning = await service.isRunning
let url = await service.url
#expect(!isRunning)
#expect(url == nil)
}
@Test("VNCService handles client operations")
func testVNCServiceClient() async throws {
let tempDir = try createTempDirectory()
let vmDir = VMDirectory(Path(tempDir.path))
let service = await MockVNCService(vmDirectory: vmDir)
// Should fail when not started
do {
try await service.openClient(url: "vnc://localhost:5900")
#expect(Bool(false), "Expected openClient to throw when not started")
} catch VMError.vncNotConfigured {
// Expected error
} catch {
#expect(Bool(false), "Expected vncNotConfigured error but got \(error)")
}
// Start and try client operations
try await service.start(port: 5900, virtualMachine: nil)
try await service.openClient(url: "vnc://localhost:5900")
#expect(await service.clientOpenCount == 1)
// Stop and verify client operations fail
await service.stop()
do {
try await service.openClient(url: "vnc://localhost:5900")
#expect(Bool(false), "Expected openClient to throw after stopping")
} catch VMError.vncNotConfigured {
// Expected error
} catch {
#expect(Bool(false), "Expected vncNotConfigured error but got \(error)")
}
}
@Test("VNCService handles virtual machine attachment")
func testVNCServiceVMAttachment() async throws {
let tempDir = try createTempDirectory()
let vmDir = VMDirectory(Path(tempDir.path))
let service = await MockVNCService(vmDirectory: vmDir)
let mockVM = "mock_vm"
try await service.start(port: 5900, virtualMachine: mockVM)
let attachedVM = await service.attachedVM
#expect(attachedVM == mockVM)
}
/// Creates a unique scratch directory under the system temporary directory
/// and returns its URL. Intermediate directories are created as needed.
private func createTempDirectory() throws -> URL {
    let manager = FileManager.default
    let directory = manager.temporaryDirectory.appendingPathComponent(UUID().uuidString)
    try manager.createDirectory(at: directory, withIntermediateDirectories: true)
    return directory
}
```
--------------------------------------------------------------------------------
/docs/content/docs/index.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Introduction
---
import { Monitor, Code, BookOpen, Zap, Bot, Boxes, Rocket } from 'lucide-react';
<div className="not-prose -mt-2 mb-6">
<p className="text-fd-primary font-semibold text-sm mb-1">Welcome</p>
<h1 className="text-3xl font-bold tracking-tight md:text-4xl">Welcome to Cua</h1>
</div>
**Cua** is an open-source framework for building, deploying and evaluating Computer-Use Agents - AI systems that autonomously interact with computer interfaces by understanding visual elements and executing actions. Cua provides SDKs for easy integration with 100+ vision-language models (VLMs), supporting everything from simple task automation to complex multi-step workflows across Windows, Linux, and macOS environments.
<div className="not-prose relative rounded-xl overflow-hidden my-8 w-full">
<img src="/docs/img/hero.png" alt="Cua" className="w-full h-auto rounded-xl" />
</div>
## What is a Computer-Use Agent?
Computer-Use Agents (CUAs) are AI systems that can autonomously interact with computer interfaces through visual understanding and action execution. They work by capturing screenshots, feeding them to a vision-language model (VLM), and letting the model determine the next action to take - such as clicking, typing, or scrolling - in a continuous loop until the task is complete.
## What is a Computer-Use Sandbox?
Computer-Use Sandboxes are isolated, controlled environments where AI agents can safely interact with computer interfaces. They provide a secure execution space where agents can perform actions such as clicking, typing, and running code, test automation workflows, and learn from interactions — all without affecting production systems.
## Key Features
With the **Computer SDK**, you can:
- Automate **Windows, Linux, and macOS** sandboxes with a consistent, pyautogui-like API
- Create & manage sandboxes locally or using **Cua Cloud**
With the **Agent SDK**, you can:
- Run computer-use models with a consistent schema
- Benchmark on **OSWorld-Verified**, **SheetBench-V2**, and **ScreenSpot**
- Combine UI grounding models with any LLM using **composed agents**
- Use **100+ models** via API or local inference (Claude, GPT-4, Gemini, Ollama, MLX)
## Get Started
Follow the [Quickstart guide](/get-started/quickstart) for step-by-step setup with Python or TypeScript.
Check out our [tutorials](https://cua.ai/blog), [examples](https://github.com/trycua/cua/tree/main/examples), and [notebooks](https://github.com/trycua/cua/tree/main/notebooks) to start building with Cua today.
<div className="grid grid-cols-2 md:grid-cols-4 gap-2 mt-4 text-sm">
<Card icon={<Rocket className="w-4 h-4" />} href="/get-started/quickstart" title="Quickstart" />
<Card icon={<Zap className="w-4 h-4" />} href="/agent-sdk/agent-loops" title="Agent Loops" />
<Card icon={<BookOpen className="w-4 h-4" />} href="/computer-sdk/computers" title="Computer SDK" />
<Card icon={<Monitor className="w-4 h-4" />} href="/example-usecases/form-filling" title="Examples" />
</div>
```
--------------------------------------------------------------------------------
/examples/computer-example-ts/src/index.ts:
--------------------------------------------------------------------------------
```typescript
// End-to-end example: drive a Cua cloud Linux container with OpenAI's
// computer-use model, looping screenshot -> model action -> execute until
// the model returns no further computer calls.
import { Computer, OSType } from '@trycua/computer';
import OpenAI from 'openai';
import { executeAction } from './helpers';
import 'dotenv/config';

// OPENAI_API_KEY is read from the environment (loaded via dotenv/config).
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

// The natural-language task the model is asked to carry out.
const COMPUTER_USE_PROMPT = 'Open firefox and go to cua.ai';

// Initialize the Computer Connection
const computer = new Computer({
  apiKey: process.env.CUA_API_KEY!,
  name: process.env.CUA_CONTAINER_NAME!,
  osType: OSType.LINUX,
});
await computer.run();

// Take the initial screenshot
const screenshot = await computer.interface.screenshot();
const screenshotBase64 = screenshot.toString('base64');

// Setup openai config for computer use
const computerUseConfig: OpenAI.Responses.ResponseCreateParamsNonStreaming = {
  model: 'computer-use-preview',
  tools: [
    {
      type: 'computer_use_preview',
      display_width: 1024,
      display_height: 768,
      environment: 'linux', // we're using a linux vm
    },
  ],
  truncation: 'auto',
};

// Send initial screenshot to the openai computer use model
let res = await openai.responses.create({
  ...computerUseConfig,
  input: [
    {
      role: 'user',
      content: [
        // what we want the ai to do
        { type: 'input_text', text: COMPUTER_USE_PROMPT },
        // current screenshot of the vm
        {
          type: 'input_image',
          image_url: `data:image/png;base64,${screenshotBase64}`,
          detail: 'auto',
        },
      ],
    },
  ],
});

// Loop until there are no more computer use actions.
while (true) {
  const computerCalls = res.output.filter((o) => o.type === 'computer_call');
  if (computerCalls.length < 1) {
    console.log('No more computer calls. Loop complete.');
    break;
  }
  // Get the first call (only one call is processed per iteration)
  const call = computerCalls[0];
  const action = call.action;
  console.log('Received action from OpenAI Responses API:', action);
  let ackChecks: OpenAI.Responses.ResponseComputerToolCall.PendingSafetyCheck[] = [];
  if (call.pending_safety_checks.length > 0) {
    console.log('Safety checks pending:', call.pending_safety_checks);
    // In a real implementation, you would want to get user confirmation here
    ackChecks = call.pending_safety_checks;
  }
  // Execute the action in the container
  await executeAction(computer, action);
  // Wait for changes to process within the container (1sec)
  await new Promise((resolve) => setTimeout(resolve, 1000));
  // Capture new screenshot
  const newScreenshot = await computer.interface.screenshot();
  const newScreenshotBase64 = newScreenshot.toString('base64');
  // Screenshot back as computer_call_output, chained to the previous
  // response so the model keeps its conversational context.
  res = await openai.responses.create({
    ...computerUseConfig,
    previous_response_id: res.id,
    input: [
      {
        type: 'computer_call_output',
        call_id: call.call_id,
        acknowledged_safety_checks: ackChecks,
        output: {
          type: 'computer_screenshot',
          image_url: `data:image/png;base64,${newScreenshotBase64}`,
        },
      },
    ],
  });
}
// NOTE(review): process.exit() ends the script without explicitly closing
// the computer connection — presumably fine for an example; confirm whether
// the SDK exposes a cleanup/stop API worth calling here.
process.exit();
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/supported-agents/computer-use-agents.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: All‑in‑one CUA Models
description: Models that support full computer-use agent capabilities with ComputerAgent.run()
---
These models support complete computer-use agent functionality through `ComputerAgent.run()`. They can understand natural language instructions and autonomously perform sequences of actions to complete tasks.
All agent loops are compatible with any LLM provider supported by LiteLLM.
See [Running Models Locally](/agent-sdk/supported-model-providers/local-models) for how to use Hugging Face and MLX models on your own machine.
## Gemini CUA
Gemini models with computer-use capabilities:
- Gemini 2.5 CUA: `gemini-2.5-computer-use-preview-10-2025`
```python
agent = ComputerAgent("gemini-2.5-computer-use-preview-10-2025", tools=[computer])
async for _ in agent.run("Open Firefox and navigate to github.com"):
pass
```
## Anthropic CUAs
Claude models with computer-use capabilities:
- Claude 4.5: `claude-sonnet-4-5-20250929`, `claude-haiku-4-5-20251001`
- Claude 4.1: `claude-opus-4-1-20250805`
- Claude 4: `claude-opus-4-20250514`, `claude-sonnet-4-20250514`
- Claude 3.7: `claude-3-7-sonnet-20250219`
```python
agent = ComputerAgent("claude-sonnet-4-5-20250929", tools=[computer])
async for _ in agent.run("Open Firefox and navigate to github.com"):
pass
```
## OpenAI CUA Preview
OpenAI's computer-use preview model:
- Computer-use-preview: `computer-use-preview`
```python
agent = ComputerAgent("openai/computer-use-preview", tools=[computer])
async for _ in agent.run("Take a screenshot and describe what you see"):
pass
```
## GLM-4.5V
Zhipu AI's GLM-4.5V vision-language model with computer-use capabilities:
- `openrouter/z-ai/glm-4.5v`
- `huggingface-local/zai-org/GLM-4.5V`
```python
agent = ComputerAgent("openrouter/z-ai/glm-4.5v", tools=[computer])
async for _ in agent.run("Click on the search bar and type 'hello world'"):
pass
```
## InternVL 3.5
InternVL 3.5 family:
- `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}`
```python
agent = ComputerAgent("huggingface-local/OpenGVLab/InternVL3_5-1B", tools=[computer])
async for _ in agent.run("Open Firefox and navigate to github.com"):
pass
```
## Qwen3 VL
Qwen3 VL family:
- `cua/qwen/qwen3-vl-235b` (via CUA VLM Router - recommended)
```python
agent = ComputerAgent("cua/qwen/qwen3-vl-235b", tools=[computer])
async for _ in agent.run("Open Firefox and navigate to github.com"):
pass
```
## UI-TARS 1.5
Unified vision-language model for computer-use:
- `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B`
- `huggingface/ByteDance-Seed/UI-TARS-1.5-7B` (requires TGI endpoint)
```python
agent = ComputerAgent("huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", tools=[computer])
async for _ in agent.run("Open the settings menu and change the theme to dark mode"):
pass
```
---
CUAs also support direct click prediction. See [Grounding Models](./grounding-models) for details on `predict_click()`.
For details on agent loop behavior and usage, see [Agent Loops](../agent-loops).
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/callbacks/pii_anonymization.py:
--------------------------------------------------------------------------------
```python
"""
PII anonymization callback handler using Microsoft Presidio for text and image redaction.
"""
import base64
import io
import logging
from typing import Any, Dict, List, Optional, Tuple
from .base import AsyncCallbackHandler
try:
# TODO: Add Presidio dependencies
from PIL import Image
PRESIDIO_AVAILABLE = True
except ImportError:
PRESIDIO_AVAILABLE = False
logger = logging.getLogger(__name__)
class PIIAnonymizationCallback(AsyncCallbackHandler):
    """
    Callback handler that anonymizes PII in text and images using Microsoft Presidio.

    This handler:
    1. Anonymizes PII in messages before sending to the agent loop
    2. Deanonymizes PII in tool calls and message outputs after the agent loop
    3. Redacts PII from images in computer_call_output messages

    NOTE(review): the Presidio integration is still a skeleton — the
    anonymize/deanonymize helpers below are TODO stubs that currently pass
    data through unchanged.
    """

    def __init__(
        self,
        # TODO: Any extra kwargs if needed
    ):
        """
        Initialize the PII anonymization callback.

        Raises:
            ImportError: If the optional Presidio/Pillow dependencies are not
                installed (``pip install cua-agent[pii-anonymization]``).

        Planned configuration options (not yet implemented — see TODO below):
            anonymize_text: Whether to anonymize text content
            anonymize_images: Whether to redact images
            entities_to_anonymize: List of entity types to anonymize (None for all)
            anonymization_operator: Presidio operator to use ("replace", "mask", "redact", etc.)
            image_redaction_color: RGB color for image redaction
        """
        if not PRESIDIO_AVAILABLE:
            raise ImportError(
                "Presidio is not available. Install with: "
                "pip install cua-agent[pii-anonymization]"
            )
        # TODO: Implement __init__

    async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Anonymize PII in messages before sending to agent loop.

        Args:
            messages: List of message dictionaries

        Returns:
            List of messages with PII anonymized
        """
        anonymized_messages = []
        for msg in messages:
            anonymized_msg = await self._anonymize_message(msg)
            anonymized_messages.append(anonymized_msg)
        return anonymized_messages

    async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Deanonymize PII in tool calls and message outputs after agent loop.

        Args:
            output: List of output dictionaries

        Returns:
            List of output with PII deanonymized for tool calls
        """
        deanonymized_output = []
        for item in output:
            # Only deanonymize tool calls and computer_call messages
            if item.get("type") in ["computer_call", "computer_call_output"]:
                deanonymized_item = await self._deanonymize_item(item)
                deanonymized_output.append(deanonymized_item)
            else:
                deanonymized_output.append(item)
        return deanonymized_output

    async def _anonymize_message(self, message: Dict[str, Any]) -> Dict[str, Any]:
        # TODO: Implement _anonymize_message — currently a pass-through stub.
        return message

    async def _deanonymize_item(self, item: Dict[str, Any]) -> Dict[str, Any]:
        # TODO: Implement _deanonymize_item — currently a pass-through stub.
        return item
```
--------------------------------------------------------------------------------
/libs/lume/tests/VM/VMDetailsPrinterTests.swift:
--------------------------------------------------------------------------------
```swift
import Foundation
import Testing
@testable import lume
/// Tests for VMDetailsPrinter covering both the JSON and the plain-text
/// table output formats.
struct VMDetailsPrinterTests {
    @Test func printStatus_whenJSON() throws {
        // Given: a single fully-populated VMDetails value.
        let vms: [VMDetails] = [
            VMDetails(
                name: "name",
                os: "os",
                cpuCount: 2,
                memorySize: 1024,
                diskSize: .init(allocated: 24, total: 30),
                display: "1024x768",
                status: "status",
                vncUrl: "vncUrl",
                ipAddress: "0.0.0.0",
                locationName: "mockLocation")
        ]
        let jsonEncoder = JSONEncoder()
        jsonEncoder.outputFormatting = .prettyPrinted
        let expectedOutput = try String(data: jsonEncoder.encode(vms), encoding: .utf8)!
        // When: printing with the .json format, capturing output via closure.
        var printedStatus: String?
        try VMDetailsPrinter.printStatus(vms, format: .json, print: { printedStatus = $0 })
        // Then
        // Decode both JSONs and compare the actual data structures
        // (comparing decoded values avoids brittleness from key ordering).
        let jsonDecoder = JSONDecoder()
        let printedVMs = try jsonDecoder.decode(
            [VMDetails].self, from: printedStatus!.data(using: .utf8)!)
        let expectedVMs = try jsonDecoder.decode(
            [VMDetails].self, from: expectedOutput.data(using: .utf8)!)
        #expect(printedVMs.count == expectedVMs.count)
        for (printed, expected) in zip(printedVMs, expectedVMs) {
            #expect(printed.name == expected.name)
            #expect(printed.os == expected.os)
            #expect(printed.cpuCount == expected.cpuCount)
            #expect(printed.memorySize == expected.memorySize)
            #expect(printed.diskSize.allocated == expected.diskSize.allocated)
            #expect(printed.diskSize.total == expected.diskSize.total)
            #expect(printed.status == expected.status)
            #expect(printed.vncUrl == expected.vncUrl)
            #expect(printed.ipAddress == expected.ipAddress)
        }
    }

    @Test func printStatus_whenNotJSON() throws {
        // Given: the same single VM, printed as a text table.
        let vms: [VMDetails] = [
            VMDetails(
                name: "name",
                os: "os",
                cpuCount: 2,
                memorySize: 1024,
                diskSize: .init(allocated: 24, total: 30),
                display: "1024x768",
                status: "status",
                vncUrl: "vncUrl",
                ipAddress: "0.0.0.0",
                locationName: "mockLocation")
        ]
        // When: each printed line is captured separately.
        var printedLines: [String] = []
        try VMDetailsPrinter.printStatus(vms, format: .text, print: { printedLines.append($0) })
        // Then: exactly one header row plus one row per VM.
        #expect(printedLines.count == 2)
        let headerParts = printedLines[0].split(whereSeparator: \.isWhitespace)
        #expect(
            headerParts == [
                "name", "os", "cpu", "memory", "disk", "display", "status", "storage", "shared_dirs", "ip", "vnc",
            ])
        // The data row is compared field-by-field after whitespace splitting;
        // "-" is the placeholder for shared_dirs when the VM is not running.
        #expect(
            printedLines[1].split(whereSeparator: \.isWhitespace).map(String.init) == [
                "name", "os", "2", "0.00G", "24.0B/30.0B", "1024x768", "status", "mockLocation",
                "-",
                "0.0.0.0",
                "vncUrl",
            ])
    }
}
```
--------------------------------------------------------------------------------
/libs/lume/src/Server/HTTP.swift:
--------------------------------------------------------------------------------
```swift
import Foundation
import Network
/// Errors surfaced by this minimal HTTP layer.
enum HTTPError: Error {
    // An unexpected server-side failure while handling a request.
    case internalError
}
/// A minimally parsed HTTP/1.1 request: request line, headers, optional body.
struct HTTPRequest {
    let method: String
    let path: String
    let headers: [String: String]
    let body: Data?

    /// Parses a raw request buffer.
    ///
    /// Returns nil when the buffer is not valid UTF-8 or the request line is
    /// malformed (fewer than two space-separated tokens).
    ///
    /// NOTE(review): decoding the whole buffer as UTF-8 means a binary body
    /// cannot be represented; presumably acceptable for this text-only API.
    init?(data: Data) {
        guard let requestString = String(data: data, encoding: .utf8) else { return nil }
        let components = requestString.components(separatedBy: "\r\n\r\n")
        let headerLines = components[0].components(separatedBy: "\r\n")
        guard !headerLines.isEmpty else { return nil }

        // Request line: "<METHOD> <PATH> [HTTP-version]"
        let requestLine = headerLines[0].components(separatedBy: " ")
        guard requestLine.count >= 2 else { return nil }
        self.method = requestLine[0]
        self.path = requestLine[1]

        // Header lines: "Name: value" (the value may itself contain colons,
        // hence maxSplits: 1). Malformed lines are silently skipped.
        var headers: [String: String] = [:]
        for line in headerLines.dropFirst() {
            let headerComponents = line.split(separator: ":", maxSplits: 1).map(String.init)
            if headerComponents.count == 2 {
                headers[headerComponents[0].trimmingCharacters(in: .whitespaces)] =
                    headerComponents[1].trimmingCharacters(in: .whitespaces)
            }
        }
        self.headers = headers

        // Body: everything after the first blank line. Re-join the remaining
        // components so a body that itself contains "\r\n\r\n" is preserved
        // intact (previously only the first segment was kept, truncating
        // such bodies).
        if components.count > 1 {
            let bodyString = components.dropFirst().joined(separator: "\r\n\r\n")
            self.body = bodyString.data(using: .utf8)
        } else {
            self.body = nil
        }
    }
}
/// A minimal HTTP/1.1 response: status line, headers, and an optional body.
struct HTTPResponse {
    /// The subset of HTTP status codes this server emits.
    enum StatusCode: Int {
        case ok = 200
        case accepted = 202
        case badRequest = 400
        case notFound = 404
        case internalServerError = 500

        /// Canonical reason phrase used in the status line.
        var description: String {
            switch self {
            case .ok: return "OK"
            case .accepted: return "Accepted"
            case .badRequest: return "Bad Request"
            case .notFound: return "Not Found"
            case .internalServerError: return "Internal Server Error"
            }
        }
    }

    let statusCode: StatusCode
    let headers: [String: String]
    let body: Data?

    init(statusCode: StatusCode, headers: [String: String] = [:], body: Data? = nil) {
        self.statusCode = statusCode
        self.headers = headers
        self.body = body
    }

    /// Convenience initializer for plain-text responses.
    init(statusCode: StatusCode, body: String) {
        self.init(
            statusCode: statusCode,
            headers: ["Content-Type": "text/plain"],
            body: body.data(using: .utf8))
    }

    /// Renders the response as wire-format bytes: status line, header lines,
    /// a blank line, then the raw body. Content-Length is filled in
    /// automatically whenever a body is present.
    func serialize() -> Data {
        var outgoingHeaders = headers
        if let body = body {
            outgoingHeaders["Content-Length"] = String(body.count)
        }

        var lines = ["HTTP/1.1 \(statusCode.rawValue) \(statusCode.description)"]
        lines.append(contentsOf: outgoingHeaders.map { "\($0.key): \($0.value)" })

        var responseData = (lines.joined(separator: "\r\n") + "\r\n\r\n").data(using: .utf8) ?? Data()
        if let body = body {
            responseData.append(body)
        }
        return responseData
    }
}
/// Skeleton HTTP server that records the port it should listen on.
/// NOTE(review): no listener is created here — connection handling is
/// presumably implemented elsewhere (not visible in this chunk).
final class HTTPServer {
    // TCP port the server is intended to bind to.
    let port: UInt16

    init(port: UInt16) {
        self.port = port
    }
}
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/supported-agents/grounding-models.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Grounding Models
description: Models that support click prediction with ComputerAgent.predict_click()
---
These models specialize in UI element grounding and click prediction. They can identify precise coordinates for UI elements based on natural language descriptions, but cannot perform autonomous task planning.
Use `ComputerAgent.predict_click()` to get coordinates for specific UI elements.
All models that support `ComputerAgent.run()` also support `ComputerAgent.predict_click()`. See [All‑in‑one CUAs](./computer-use-agents).
### Anthropic CUAs
- Claude 4.5: `claude-sonnet-4-5-20250929`
- Claude 4.1: `claude-opus-4-1-20250805`
- Claude 4: `claude-opus-4-20250514`, `claude-sonnet-4-20250514`
- Claude 3.7: `claude-3-7-sonnet-20250219`
### OpenAI CUA Preview
- Computer-use-preview: `computer-use-preview`
### UI-TARS 1.5 (Unified VLM with grounding support)
- `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B`
- `huggingface/ByteDance-Seed/UI-TARS-1.5-7B` (requires TGI endpoint)
## Specialized Grounding Models
These models are optimized specifically for click prediction and UI element grounding:
### OpenCUA
- `huggingface-local/xlangai/OpenCUA-{7B,32B}`
### GTA1 Family
- `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}`
### Holo 1.5 Family
- `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}`
### InternVL 3.5 Family
- `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}`
### OmniParser (OCR)
OCR-focused set-of-marks model that requires an LLM for click prediction:
- `omniparser` (requires combination with any LiteLLM vision model)
### Moondream3 (Local Grounding)
Moondream3 is a powerful small model that can perform UI grounding and click prediction.
- `moondream3`
## Usage Examples
```python
# Using any grounding model for click prediction
agent = ComputerAgent("claude-sonnet-4-5-20250929", tools=[computer])
# Predict coordinates for specific elements
login_coords = agent.predict_click("find the login button")
search_coords = agent.predict_click("locate the search text field")
menu_coords = agent.predict_click("find the hamburger menu icon")
print(f"Login button: {login_coords}")
print(f"Search field: {search_coords}")
print(f"Menu icon: {menu_coords}")
```
```python
# OmniParser is just for OCR, so it requires an LLM for predict_click
agent = ComputerAgent("omniparser+anthropic/claude-sonnet-4-5-20250929", tools=[computer])
# Predict click coordinates using composed agent
coords = agent.predict_click("find the submit button")
print(f"Click coordinates: {coords}") # (450, 320)
# Note: Cannot use omniparser alone for click prediction
# This will raise an error:
# agent = ComputerAgent("omniparser", tools=[computer])
# coords = agent.predict_click("find button") # Error!
```
```python
agent = ComputerAgent("huggingface-local/HelloKKMe/GTA1-7B", tools=[computer])
# Predict click coordinates for UI elements
coords = agent.predict_click("find the submit button")
print(f"Click coordinates: {coords}") # (450, 320)
# Note: GTA1 cannot perform autonomous task planning
# This will raise an error:
# agent.run("Fill out the form and submit it")
```
---
For information on combining grounding models with planning capabilities, see [Composed Agents](./composed-agents) and [All‑in‑one CUAs](./computer-use-agents).
```
--------------------------------------------------------------------------------
/libs/python/computer-server/computer_server/server.py:
--------------------------------------------------------------------------------
```python
"""
Server interface for Computer API.
Provides a clean API for starting and stopping the server.
"""
import asyncio
import logging
from typing import Optional
import uvicorn
from fastapi import FastAPI
from .main import app as fastapi_app
logger = logging.getLogger(__name__)
class Server:
    """
    Server interface for Computer API.

    Usage:
        from computer_api import Server

        # Synchronous usage
        server = Server()
        server.start()  # Blocks until server is stopped

        # Asynchronous usage
        server = Server()
        await server.start_async()  # Starts server in background
        # Do other things
        await server.stop()  # Stop the server
    """

    def __init__(
        self,
        host: str = "0.0.0.0",
        port: int = 8000,
        log_level: str = "info",
        ssl_keyfile: Optional[str] = None,
        ssl_certfile: Optional[str] = None,
    ):
        """
        Initialize the server.

        Args:
            host: Host to bind the server to
            port: Port to bind the server to
            log_level: Logging level (debug, info, warning, error, critical)
            ssl_keyfile: Path to SSL private key file (for HTTPS)
            ssl_certfile: Path to SSL certificate file (for HTTPS)
        """
        self.host = host
        self.port = port
        self.log_level = log_level
        self.ssl_keyfile = ssl_keyfile
        self.ssl_certfile = ssl_certfile
        self.app = fastapi_app
        # Background-mode state: the uvicorn Server instance and its task.
        self._server: Optional[uvicorn.Server] = None
        self._server_task: Optional[asyncio.Task] = None
        # Kept for backward compatibility; signalled when stop() is requested.
        self._should_exit = asyncio.Event()

    def start(self) -> None:
        """
        Start the server synchronously. This will block until the server is stopped.
        """
        uvicorn.run(
            self.app,
            host=self.host,
            port=self.port,
            log_level=self.log_level,
            ssl_keyfile=self.ssl_keyfile,
            ssl_certfile=self.ssl_certfile,
        )

    async def start_async(self) -> None:
        """
        Start the server asynchronously. This will return immediately and the server
        will run in the background.
        """
        server_config = uvicorn.Config(
            self.app,
            host=self.host,
            port=self.port,
            log_level=self.log_level,
            ssl_keyfile=self.ssl_keyfile,
            ssl_certfile=self.ssl_certfile,
        )
        self._should_exit.clear()
        self._server = uvicorn.Server(server_config)
        # Run the server in a background task.
        self._server_task = asyncio.create_task(self._server.serve())
        # Poll until uvicorn reports startup (or the task fails early),
        # instead of sleeping a fixed amount; cap the wait at ~5 seconds.
        for _ in range(100):
            if self._server.started or self._server_task.done():
                break
            await asyncio.sleep(0.05)
        protocol = "https" if self.ssl_certfile else "http"
        logger.info(f"Server started at {protocol}://{self.host}:{self.port}")

    async def stop(self) -> None:
        """
        Stop the server if it's running asynchronously.

        Requests a graceful uvicorn shutdown first (so open connections can
        drain); falls back to cancelling the task if the server does not
        exit within a short grace period.
        """
        if self._server_task and not self._server_task.done():
            # Signal the server to exit
            self._should_exit.set()
            if self._server is not None:
                # Graceful path: uvicorn polls should_exit and shuts down.
                self._server.should_exit = True
                try:
                    await asyncio.wait_for(asyncio.shield(self._server_task), timeout=5.0)
                except (asyncio.TimeoutError, asyncio.CancelledError):
                    pass
            if not self._server_task.done():
                # Forceful path: cancel the serve() task outright.
                self._server_task.cancel()
                try:
                    await self._server_task
                except asyncio.CancelledError:
                    logger.info("Server stopped")
        self._server = None
        self._server_task = None
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/mcp-server/configuration.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Configuration
---
The server is configured using environment variables (can be set in the Claude Desktop config):
| Variable | Description | Default |
| ------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------- |
| `CUA_MODEL_NAME` | Model string (e.g., "anthropic/claude-sonnet-4-20250514", "openai/computer-use-preview", "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B", "omniparser+litellm/gpt-4o", "omniparser+ollama_chat/gemma3") | anthropic/claude-sonnet-4-20250514 |
| `ANTHROPIC_API_KEY` | Your Anthropic API key (required for Anthropic models) | None |
| `CUA_MAX_IMAGES` | Maximum number of images to keep in context | 3 |
| `CUA_USE_HOST_COMPUTER_SERVER` | Target your local desktop instead of a VM. Set to "true" to use your host system. **Warning:** AI models may perform risky actions. | false |
## Model Configuration
The `CUA_MODEL_NAME` environment variable supports various model providers through LiteLLM integration:
### Supported Providers
- **Anthropic**: `anthropic/claude-sonnet-4-20250514`,
- **OpenAI**: `openai/computer-use-preview`, `openai/gpt-4o`
- **Local Models**: `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B`
- **Omni + LiteLLM**: `omniparser+litellm/gpt-4o`, `omniparser+litellm/claude-3-haiku`
- **Ollama**: `omniparser+ollama_chat/gemma3`
### Example Configurations
**Claude Desktop Configuration:**
```json
{
"mcpServers": {
"cua-agent": {
"command": "/bin/bash",
"args": ["~/.cua/start_mcp_server.sh"],
"env": {
"CUA_MODEL_NAME": "anthropic/claude-sonnet-4-20250514",
"ANTHROPIC_API_KEY": "your-anthropic-api-key-here",
"CUA_MAX_IMAGES": "5",
"CUA_USE_HOST_COMPUTER_SERVER": "false"
}
}
}
}
```
**Local Model Configuration:**
```json
{
"mcpServers": {
"cua-agent": {
"command": "/bin/bash",
"args": ["~/.cua/start_mcp_server.sh"],
"env": {
"CUA_MODEL_NAME": "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
"CUA_MAX_IMAGES": "3"
}
}
}
}
```
## Session Management Configuration
The MCP server automatically manages sessions with the following defaults:
- **Max Concurrent Sessions**: 10
- **Session Timeout**: 10 minutes of inactivity
- **Computer Pool Size**: 5 instances
- **Automatic Cleanup**: Enabled
These settings are optimized for typical usage and don't require configuration for most users.
```
--------------------------------------------------------------------------------
/libs/lume/src/VM/VMDetailsPrinter.swift:
--------------------------------------------------------------------------------
```swift
import Foundation
/// Prints VM status information in a formatted table (or as JSON).
enum VMDetailsPrinter {
    /// Represents a column in the VM status table.
    private struct Column: Sendable {
        /// Column title printed in the header row.
        let header: String
        /// Fixed character width each cell is padded to.
        let width: Int
        /// Extracts this column's display string from a VM record.
        let getValue: @Sendable (VMDetails) -> String
    }
    /// Configuration for all columns in the status table, in display order.
    private static let columns: [Column] = [
        Column(header: "name", width: 34, getValue: { $0.name }),
        Column(header: "os", width: 8, getValue: { $0.os }),
        Column(header: "cpu", width: 8, getValue: { String($0.cpuCount) }),
        Column(
            header: "memory", width: 8,
            getValue: {
                // Convert bytes to gigabytes for display.
                String(format: "%.2fG", Float($0.memorySize) / (1024 * 1024 * 1024))
            }),
        Column(
            header: "disk", width: 16,
            getValue: {
                // Shown as "allocated/total".
                "\($0.diskSize.formattedAllocated)/\($0.diskSize.formattedTotal)"
            }),
        Column(header: "display", width: 12, getValue: { $0.display }),
        Column(
            header: "status", width: 16,
            getValue: {
                $0.status
            }),
        Column(header: "storage", width: 16, getValue: { $0.locationName }),
        Column(
            header: "shared_dirs", width: 54,
            getValue: { vm in
                // Only show shared directories if the VM is running
                if vm.status == "running", let dirs = vm.sharedDirectories, !dirs.isEmpty {
                    return dirs.map { "\($0.hostPath) (\($0.readOnly ? "ro" : "rw"))" }.joined(separator: ", ")
                } else {
                    return "-"
                }
            }),
        Column(
            header: "ip", width: 16,
            getValue: {
                // "-" when no IP address has been assigned yet.
                $0.ipAddress ?? "-"
            }),
        Column(
            header: "vnc", width: 50,
            getValue: {
                $0.vncUrl ?? "-"
            }),
    ]
    /// Prints the status of all VMs, either as pretty-printed JSON or a padded text table.
    /// - Parameters:
    ///   - vms: Array of VM status objects to display
    ///   - format: Output format (`.json` or tabular)
    ///   - print: Output sink, injectable for testing (defaults to stdout)
    /// - Throws: An encoding error if JSON serialization fails.
    static func printStatus(
        _ vms: [VMDetails], format: FormatOption, print: (String) -> Void = { print($0) }
    ) throws {
        if format == .json {
            let jsonEncoder = JSONEncoder()
            jsonEncoder.outputFormatting = .prettyPrinted
            let jsonData = try jsonEncoder.encode(vms)
            // Force-unwrap is safe: JSONEncoder always produces valid UTF-8.
            let jsonString = String(data: jsonData, encoding: .utf8)!
            print(jsonString)
        } else {
            printHeader(print: print)
            vms.forEach({ vm in
                printVM(vm, print: print)
            })
        }
    }
    /// Prints the single header row with each column title padded to its width.
    private static func printHeader(print: (String) -> Void = { print($0) }) {
        let paddedHeaders = columns.map { $0.header.paddedToWidth($0.width) }
        print(paddedHeaders.joined())
    }
    /// Prints one table row for the given VM.
    private static func printVM(_ vm: VMDetails, print: (String) -> Void = { print($0) }) {
        let paddedColumns = columns.map { column in
            column.getValue(vm).paddedToWidth(column.width)
        }
        print(paddedColumns.joined())
    }
}
extension String {
    /// Pads the string to the specified width with trailing spaces.
    /// - Parameter width: Target width for padding
    /// - Returns: Padded string
    /// - Note: `padding(toLength:)` also truncates strings longer than `width`,
    ///   so over-long values are clipped to keep table columns aligned.
    fileprivate func paddedToWidth(_ width: Int) -> String {
        padding(toLength: width, withPad: " ", startingAt: 0)
    }
}
```
--------------------------------------------------------------------------------
/libs/python/computer/computer/utils.py:
--------------------------------------------------------------------------------
```python
import base64
import io
import os
import shlex
from typing import Any, Dict, Optional, Tuple
import mslex
from PIL import Image, ImageDraw
def decode_base64_image(base64_str: str) -> bytes:
    """Decode a base64-encoded string back into raw image bytes.

    Args:
        base64_str: Base64 text, e.g. as produced by ``encode_base64_image``.

    Returns:
        The decoded raw bytes.
    """
    decoded = base64.b64decode(base64_str)
    return decoded
def encode_base64_image(image_bytes: bytes) -> str:
    """Encode raw image bytes as an ASCII-safe base64 string.

    Args:
        image_bytes: Raw bytes to encode.

    Returns:
        The base64 representation, decoded to a ``str``.
    """
    encoded = base64.b64encode(image_bytes)
    return encoded.decode("utf-8")
def bytes_to_image(image_bytes: bytes) -> Image.Image:
    """Load a PIL image from raw encoded bytes.

    Args:
        image_bytes: Encoded image data in any format PIL can detect.

    Returns:
        PIL.Image: The decoded image.
    """
    buffer = io.BytesIO(image_bytes)
    return Image.open(buffer)
def image_to_bytes(image: Image.Image, format: str = "PNG") -> bytes:
    """Serialize a PIL image to encoded bytes.

    Args:
        image: The PIL image to serialize.
        format: Target encoding format (defaults to PNG).

    Returns:
        The encoded image bytes.
    """
    with io.BytesIO() as buffer:
        image.save(buffer, format=format)
        return buffer.getvalue()
def resize_image(image_bytes: bytes, scale_factor: float) -> bytes:
    """Resize an image by a scale factor using Lanczos resampling.

    Args:
        image_bytes: The original image as bytes
        scale_factor: Factor to scale the image by (e.g., 0.5 for half size, 2.0 for double)

    Returns:
        bytes: The resized image as bytes.
        NOTE(review): the result is always re-encoded with image_to_bytes'
        default format (PNG), regardless of the input format — confirm callers
        expect PNG output.
    """
    image = bytes_to_image(image_bytes)
    if scale_factor == 1.0:
        # No resampling needed; still round-trips through re-encoding.
        return image_to_bytes(image)
    scaled_size = (int(image.width * scale_factor), int(image.height * scale_factor))
    resized = image.resize(scaled_size, Image.Resampling.LANCZOS)
    return image_to_bytes(resized)
def draw_box(
    image_bytes: bytes,
    x: int,
    y: int,
    width: int,
    height: int,
    color: str = "#FF0000",
    thickness: int = 2,
) -> bytes:
    """Draw a rectangular outline onto an image.

    Args:
        image_bytes: The original image as bytes
        x: X coordinate of top-left corner
        y: Y coordinate of top-left corner
        width: Width of the box
        height: Height of the box
        color: Outline color in hex format (default red)
        thickness: Outline thickness in pixels

    Returns:
        bytes: The modified image as bytes (re-encoded with the default format)
    """
    # Decode, draw in place, then re-encode.
    image = bytes_to_image(image_bytes)
    canvas = ImageDraw.Draw(image)
    corners = [(x, y), (x + width, y + height)]
    canvas.rectangle(corners, outline=color, width=thickness)
    return image_to_bytes(image)
def get_image_size(image_bytes: bytes) -> Tuple[int, int]:
    """Return the pixel dimensions of an encoded image.

    Args:
        image_bytes: The image as bytes

    Returns:
        Tuple[int, int]: (width, height) of the decoded image
    """
    width, height = bytes_to_image(image_bytes).size
    return width, height
def parse_vm_info(vm_info: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Parse VM info from pylume response.

    Args:
        vm_info: Raw VM info mapping from the pylume API.

    Returns:
        The VM info mapping, or None when the response is empty/falsy.
    """
    if not vm_info:
        return None
    # Bug fix: the function previously fell through and implicitly returned
    # None for every input, contradicting its Optional[Dict] return type.
    # Pass the mapping through so callers actually receive the VM info.
    return vm_info
def safe_join(argv: list[str]) -> str:
    """Quote ``argv`` into a single command-line string for the current platform.

    Uses ``mslex.join`` on Windows and ``shlex.join`` on POSIX, so the result
    is safe against shell-injection when handed to a shell.

    Args:
        argv: List of argument strings.

    Returns:
        A safely quoted command-line string appropriate for the current
        platform.
    """
    # os.name is "nt" on Windows; everything else gets POSIX quoting rules.
    joiner = mslex.join if os.name == "nt" else shlex.join
    return joiner(argv)
```
--------------------------------------------------------------------------------
/libs/lume/src/VM/DarwinVM.swift:
--------------------------------------------------------------------------------
```swift
import Foundation
/// macOS-specific virtual machine implementation.
/// Extends the base `VM` with IPSW-based macOS installation.
@MainActor
final class DarwinVM: VM {
    /// Loads and inspects macOS restore images (IPSW files).
    private let imageLoader: ImageLoader
    /// Creates a macOS VM.
    /// - Parameters:
    ///   - vmDirContext: On-disk directory context for this VM.
    ///   - virtualizationServiceFactory: Builds the virtualization backend (defaults to Darwin).
    ///   - vncServiceFactory: Builds the VNC service for remote display.
    ///   - imageLoader: Source of IPSW images and their hardware requirements.
    init(
        vmDirContext: VMDirContext,
        virtualizationServiceFactory: @escaping (VMVirtualizationServiceContext) throws -> VMVirtualizationService = { try DarwinVirtualizationService(configuration: $0) },
        vncServiceFactory: @escaping (VMDirectory) -> VNCService = { DefaultVNCService(vmDirectory: $0) },
        imageLoader: ImageLoader
    ) {
        self.imageLoader = imageLoader
        super.init(
            vmDirContext: vmDirContext,
            virtualizationServiceFactory: virtualizationServiceFactory,
            vncServiceFactory: vncServiceFactory
        )
    }
    override func getOSType() -> String {
        return "macOS"
    }
    // MARK: - Installation and Configuration
    /// Installs macOS into this VM from an IPSW image and persists the VM configuration.
    /// - Parameters:
    ///   - ipswPath: Path to an IPSW file, or "latest" to download the newest supported image.
    ///   - cpuCount: Requested CPU count (raised to the image's minimum if lower).
    ///   - memorySize: Requested memory in bytes (raised to the image's minimum if lower).
    ///   - diskSize: Disk size in bytes.
    ///   - display: Display resolution string.
    override func setup(ipswPath: String, cpuCount: Int, memorySize: UInt64, diskSize: UInt64, display: String) async throws {
        let imagePath: Path
        if ipswPath == "latest" {
            Logger.info("Downloading latest supported Image...")
            let downloadedPath = try await self.imageLoader.downloadLatestImage()
            imagePath = Path(downloadedPath.path)
        } else {
            imagePath = Path(ipswPath)
        }
        // The image dictates minimum hardware; requested values below those minimums are raised.
        let requirements = try await imageLoader.loadImageRequirements(from: imagePath.url)
        try setDiskSize(diskSize)
        let finalCpuCount = max(cpuCount, requirements.minimumSupportedCPUCount)
        try setCpuCount(finalCpuCount)
        if finalCpuCount != cpuCount {
            Logger.info("CPU count overridden due to minimum image requirements", metadata: ["original": "\(cpuCount)", "final": "\(finalCpuCount)"])
        }
        let finalMemorySize = max(memorySize, requirements.minimumSupportedMemorySize)
        try setMemorySize(finalMemorySize)
        if finalMemorySize != memorySize {
            Logger.info("Memory size overridden due to minimum image requirements", metadata: ["original": "\(memorySize)", "final": "\(finalMemorySize)"])
        }
        // Persist the final configuration, including freshly generated MAC address and machine identifier.
        try updateVMConfig(
            vmConfig: try VMConfig(
                os: getOSType(),
                cpuCount: finalCpuCount,
                memorySize: finalMemorySize,
                diskSize: diskSize,
                macAddress: DarwinVirtualizationService.generateMacAddress(),
                display: display,
                hardwareModel: requirements.hardwareModel,
                machineIdentifier: DarwinVirtualizationService.generateMachineIdentifier()
            )
        )
        let service: any VMVirtualizationService = try virtualizationServiceFactory(
            try createVMVirtualizationServiceContext(
                cpuCount: finalCpuCount,
                memorySize: finalMemorySize,
                display: display
            )
        )
        // macOS installation is Darwin-specific; other virtualization backends cannot install from IPSW.
        guard let darwinService = service as? DarwinVirtualizationService else {
            throw VMError.internalError("Installation requires DarwinVirtualizationService")
        }
        // Create auxiliary storage with hardware model
        try darwinService.createAuxiliaryStorage(at: vmDirContext.nvramPath, hardwareModel: requirements.hardwareModel)
        try await darwinService.installMacOS(imagePath: imagePath) { progress in
            Logger.info("Installing macOS", metadata: ["progress": "\(Int(progress * 100))%"])
        }
    }
}
```
--------------------------------------------------------------------------------
/libs/python/computer/computer/interface/models.py:
--------------------------------------------------------------------------------
```python
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, List, Literal, TypedDict, Union
@dataclass
class CommandResult:
    """Result of executing a shell command, mirroring subprocess conventions.

    The hand-written __init__ was removed: @dataclass generates an identical
    __init__(stdout, stderr, returncode) from the field declarations.
    """

    # Captured standard output of the command.
    stdout: str
    # Captured standard error of the command.
    stderr: str
    # Process exit status (0 conventionally means success).
    returncode: int
# Navigation key literals (values match PyAutoGUI key names and the Key enum below)
NavigationKey = Literal["pagedown", "pageup", "home", "end", "left", "right", "up", "down"]
# Special key literals (enter, escape, editing keys)
SpecialKey = Literal["enter", "esc", "tab", "space", "backspace", "del"]
# Modifier key literals (includes macOS-specific "command"/"option")
ModifierKey = Literal["ctrl", "alt", "shift", "win", "command", "option"]
# Function key literals f1..f12
FunctionKey = Literal["f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "f11", "f12"]
class Key(Enum):
    """Keyboard keys that can be used with press_key.

    These key names map to PyAutoGUI's expected key names.

    Note: members sharing a value (RETURN/ENTER, ESCAPE/ESC) are enum
    aliases — they compare equal, and ``Key("enter")`` resolves to the
    first-defined member (RETURN).
    """

    # Navigation
    PAGE_DOWN = "pagedown"
    PAGE_UP = "pageup"
    HOME = "home"
    END = "end"
    LEFT = "left"
    RIGHT = "right"
    UP = "up"
    DOWN = "down"
    # Special keys
    RETURN = "enter"
    ENTER = "enter"
    ESCAPE = "esc"
    ESC = "esc"
    TAB = "tab"
    SPACE = "space"
    BACKSPACE = "backspace"
    DELETE = "del"
    # Modifier keys
    ALT = "alt"
    CTRL = "ctrl"
    SHIFT = "shift"
    WIN = "win"
    COMMAND = "command"
    OPTION = "option"
    # Function keys
    F1 = "f1"
    F2 = "f2"
    F3 = "f3"
    F4 = "f4"
    F5 = "f5"
    F6 = "f6"
    F7 = "f7"
    F8 = "f8"
    F9 = "f9"
    F10 = "f10"
    F11 = "f11"
    F12 = "f12"

    @classmethod
    def from_string(cls, key: str) -> "Key | str":
        """Convert a string key name to a Key enum value.

        Args:
            key: String key name to convert

        Returns:
            A Key enum value when the (lowercased, stripped) string matches an
            alias in the mapping below; otherwise the original string is
            returned unchanged. NOTE(review): names like "home" or "f1" that
            already equal a member's value are NOT mapped to the enum — they
            pass through as plain strings; confirm downstream accepts both.
        """
        # Map common alternative names to enum values
        key_mapping = {
            "page_down": cls.PAGE_DOWN,
            "page down": cls.PAGE_DOWN,
            "pagedown": cls.PAGE_DOWN,
            "page_up": cls.PAGE_UP,
            "page up": cls.PAGE_UP,
            "pageup": cls.PAGE_UP,
            "return": cls.RETURN,
            "enter": cls.ENTER,
            "escape": cls.ESCAPE,
            "esc": cls.ESC,
            "delete": cls.DELETE,
            "del": cls.DELETE,
            # Modifier key mappings
            "alt": cls.ALT,
            "ctrl": cls.CTRL,
            "control": cls.CTRL,
            "shift": cls.SHIFT,
            "win": cls.WIN,
            "windows": cls.WIN,
            "super": cls.WIN,
            "command": cls.COMMAND,
            "cmd": cls.COMMAND,
            "⌘": cls.COMMAND,
            "option": cls.OPTION,
            "⌥": cls.OPTION,
        }
        # Unmapped names fall through untouched for PyAutoGUI to interpret.
        normalized = key.lower().strip()
        return key_mapping.get(normalized, key)
# Combined key type: a Key enum member, a known literal key name, or any
# plain string (e.g. a single character) passed through to the backend.
KeyType = Union[Key, NavigationKey, SpecialKey, ModifierKey, FunctionKey, str]
# Mouse buttons accepted by mouse actions.
MouseButton = Literal["left", "right", "middle"]
class AccessibilityWindow(TypedDict):
    """Information about a window in the accessibility tree."""
    # Name of the application that owns the window(s).
    app_name: str
    # Process identifier of the owning application.
    pid: int
    # Whether the application is currently frontmost (focused).
    frontmost: bool
    # Whether the application has any windows.
    has_windows: bool
    # Per-window attribute dictionaries (shape is platform-specific — see the
    # OS accessibility handler that produces them).
    windows: List[Dict[str, Any]]
class AccessibilityTree(TypedDict):
    """Complete accessibility tree information."""
    # Whether the accessibility query succeeded.
    success: bool
    # Name of the frontmost (focused) application.
    frontmost_application: str
    # One AccessibilityWindow entry per application with window data.
    windows: List[AccessibilityWindow]
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/callbacks/image_retention.py:
--------------------------------------------------------------------------------
```python
"""
Image retention callback handler that limits the number of recent images in message history.
"""
from typing import Any, Dict, List, Optional
from .base import AsyncCallbackHandler
class ImageRetentionCallback(AsyncCallbackHandler):
    """
    Callback handler that applies image retention policy to limit the number
    of recent images in message history to prevent context window overflow.
    """

    def __init__(self, only_n_most_recent_images: Optional[int] = None):
        """
        Initialize the image retention callback.

        Args:
            only_n_most_recent_images: If set, only keep the N most recent images in message history
        """
        self.only_n_most_recent_images = only_n_most_recent_images

    async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Apply image retention policy to messages before sending to agent loop.

        Args:
            messages: List of message dictionaries

        Returns:
            List of messages with image retention policy applied (unchanged
            when no limit is configured).
        """
        if self.only_n_most_recent_images is not None:
            return self._apply_image_retention(messages)
        return messages

    def _apply_image_retention(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Keep only the N most recent screenshots in the message history.

        For every screenshot older than the newest N, this drops the
        computer_call_output carrying the image, the computer_call that
        produced it (matched by call_id on the preceding message), and a
        single reasoning message immediately before that call, if present.

        Args:
            messages: List of message dictionaries

        Returns:
            Filtered list of messages with image retention applied
        """
        limit = self.only_n_most_recent_images
        if limit is None:
            return messages
        # Positions of every computer_call_output whose output carries an image.
        screenshot_positions = [
            i
            for i, message in enumerate(messages)
            if message.get("type") == "computer_call_output"
            and isinstance(message.get("output"), dict)
            and "image_url" in message["output"]
        ]
        # Under the limit: nothing to drop.
        if len(screenshot_positions) <= limit:
            return messages
        retained = set(screenshot_positions[-limit:])
        indices_to_drop: set = set()
        for pos in screenshot_positions:
            if pos in retained:
                continue
            # Drop the screenshot output itself.
            indices_to_drop.add(pos)
            # Drop the preceding computer_call when its call_id matches.
            call_pos = pos - 1
            if (
                call_pos >= 0
                and messages[call_pos].get("type") == "computer_call"
                and messages[call_pos].get("call_id") == messages[pos].get("call_id")
            ):
                indices_to_drop.add(call_pos)
                # Drop one reasoning message directly before that call.
                reasoning_pos = call_pos - 1
                if reasoning_pos >= 0 and messages[reasoning_pos].get("type") == "reasoning":
                    indices_to_drop.add(reasoning_pos)
        return [message for i, message in enumerate(messages) if i not in indices_to_drop]
```
--------------------------------------------------------------------------------
/libs/python/computer-server/computer_server/handlers/factory.py:
--------------------------------------------------------------------------------
```python
import platform
import subprocess
from typing import Tuple, Type
from computer_server.diorama.base import BaseDioramaHandler
from .base import (
BaseAccessibilityHandler,
BaseAutomationHandler,
BaseDesktopHandler,
BaseFileHandler,
BaseWindowHandler,
)
# Conditionally import platform-specific handlers
system = platform.system().lower()
if system == "darwin":
from computer_server.diorama.macos import MacOSDioramaHandler
from .macos import MacOSAccessibilityHandler, MacOSAutomationHandler
elif system == "linux":
from .linux import LinuxAccessibilityHandler, LinuxAutomationHandler
elif system == "windows":
from .windows import WindowsAccessibilityHandler, WindowsAutomationHandler
from .generic import GenericDesktopHandler, GenericFileHandler, GenericWindowHandler
class HandlerFactory:
    """Factory for creating OS-specific handlers."""

    # OS identifiers this factory can build handler sets for.
    _SUPPORTED_SYSTEMS = ("darwin", "linux", "windows")

    @staticmethod
    def _get_current_os() -> str:
        """Determine the current OS.

        Returns:
            str: The OS type ('darwin' for macOS, 'linux' for Linux, or 'windows' for Windows)

        Raises:
            RuntimeError: If unable to determine the current OS, or it is unsupported.
        """
        try:
            # Primary detection via platform.system().
            system = platform.system().lower()
            if system in HandlerFactory._SUPPORTED_SYSTEMS:
                return system
            # Fallback to uname if platform.system() doesn't return expected values (Unix-like systems only)
            result = subprocess.run(["uname", "-s"], capture_output=True, text=True)
            if result.returncode == 0:
                fallback = result.stdout.strip().lower()
                # Validate the fallback too, so an unknown uname value fails
                # here with a clear message instead of later in create_handlers.
                if fallback in HandlerFactory._SUPPORTED_SYSTEMS:
                    return fallback
            raise RuntimeError(f"Unsupported OS: {system}")
        except Exception as e:
            # Chain the original exception for easier debugging.
            raise RuntimeError(f"Failed to determine current OS: {str(e)}") from e

    @staticmethod
    def create_handlers() -> "Tuple[BaseAccessibilityHandler, BaseAutomationHandler, BaseDioramaHandler, BaseFileHandler, BaseDesktopHandler, BaseWindowHandler]":
        """Create and return appropriate handlers for the current OS.

        Returns:
            A 6-tuple of (accessibility, automation, diorama, file, desktop,
            window) handlers for the current OS. Accessibility and automation
            are OS-specific; diorama is macOS-only (a no-op base handler
            elsewhere); file/desktop/window use generic implementations.

        Raises:
            NotImplementedError: If the current OS is not supported
            RuntimeError: If unable to determine the current OS
        """
        os_type = HandlerFactory._get_current_os()
        if os_type == "darwin":
            return (
                MacOSAccessibilityHandler(),
                MacOSAutomationHandler(),
                MacOSDioramaHandler(),
                GenericFileHandler(),
                GenericDesktopHandler(),
                GenericWindowHandler(),
            )
        elif os_type == "linux":
            return (
                LinuxAccessibilityHandler(),
                LinuxAutomationHandler(),
                BaseDioramaHandler(),
                GenericFileHandler(),
                GenericDesktopHandler(),
                GenericWindowHandler(),
            )
        elif os_type == "windows":
            return (
                WindowsAccessibilityHandler(),
                WindowsAutomationHandler(),
                BaseDioramaHandler(),
                GenericFileHandler(),
                GenericDesktopHandler(),
                GenericWindowHandler(),
            )
        else:
            raise NotImplementedError(f"OS '{os_type}' is not supported")
```
--------------------------------------------------------------------------------
/libs/typescript/computer/src/computer/providers/cloud.ts:
--------------------------------------------------------------------------------
```typescript
import pino from 'pino';
import { type BaseComputerInterface, InterfaceFactory } from '../../interface/index';
import type { CloudComputerConfig, VMProviderType } from '../types';
import { BaseComputer } from './base';
// Base URL for the Cua cloud API; overridable via the CUA_API_BASE env var.
const DEFAULT_API_BASE = process.env.CUA_API_BASE || 'https://api.cua.ai';

// Shape of one VM entry returned by the `/v1/vms` endpoint.
interface VMInfo {
  name: string;
  // Hostname to connect to; may be absent for some VMs.
  host?: string;
  status?: string;
}
/**
 * Cloud-specific computer implementation
 */
export class CloudComputer extends BaseComputer {
  protected static vmProviderType: VMProviderType.CLOUD;
  protected apiKey: string;
  private iface?: BaseComputerInterface;
  private initialized = false;
  private cachedHost?: string;
  private apiBase: string;
  protected logger = pino({ name: 'computer.provider_cloud' });

  constructor(config: CloudComputerConfig) {
    super(config);
    this.apiKey = config.apiKey;
    this.apiBase = DEFAULT_API_BASE;
  }

  /**
   * Host for this VM: the API-provided host when cached, otherwise the
   * conventional `<name>.sandbox.cua.ai` address.
   */
  get ip(): string {
    if (this.cachedHost) {
      return this.cachedHost;
    }
    return `${this.name}.sandbox.cua.ai`;
  }

  /**
   * Query the `/v1/vms` endpoint for this VM's host and cache it.
   * Any API failure falls back to the conventional sandbox hostname.
   */
  private async fetchAndCacheHost(): Promise<string> {
    try {
      const response = await fetch(`${this.apiBase}/v1/vms`, {
        headers: {
          Authorization: `Bearer ${this.apiKey}`,
          Accept: 'application/json',
        },
      });
      if (response.ok) {
        const vmList = (await response.json()) as VMInfo[];
        const match = vmList.find((entry) => entry.name === this.name);
        if (match?.host) {
          this.cachedHost = match.host;
          this.logger.info(`Cached host from API: ${this.cachedHost}`);
          return this.cachedHost;
        }
      }
    } catch (error) {
      this.logger.warn(`Failed to fetch VM list for host lookup: ${error}`);
    }
    // API lookup failed or VM not listed — use the default naming scheme.
    const fallbackHost = `${this.name}.sandbox.cua.ai`;
    this.cachedHost = fallbackHost;
    this.logger.info(`Using fallback host: ${fallbackHost}`);
    return fallbackHost;
  }

  /**
   * Initialize the cloud VM connection and its interface. Idempotent:
   * subsequent calls after a successful run() are no-ops.
   */
  async run(): Promise<void> {
    if (this.initialized) {
      this.logger.info('Computer already initialized, skipping initialization');
      return;
    }
    try {
      // Resolve the host before opening the interface connection.
      const host = await this.fetchAndCacheHost();
      this.logger.info(`Connecting to cloud VM at ${host}`);
      this.iface = InterfaceFactory.createInterfaceForOS(this.osType, host, this.apiKey, this.name);
      this.logger.info('Waiting for interface to be ready...');
      await this.iface.waitForReady();
      this.initialized = true;
      this.logger.info('Cloud computer ready');
    } catch (error) {
      this.logger.error(`Failed to initialize cloud computer: ${error}`);
      throw new Error(`Failed to initialize cloud computer: ${error}`);
    }
  }

  /**
   * Stop the cloud computer by disconnecting and dropping the interface.
   */
  async stop(): Promise<void> {
    this.logger.info('Disconnecting from cloud computer...');
    this.iface?.disconnect();
    this.iface = undefined;
    this.initialized = false;
    this.logger.info('Disconnected from cloud computer');
  }

  /**
   * The active computer interface; throws until run() has succeeded.
   */
  get interface(): BaseComputerInterface {
    const current = this.iface;
    if (!current) {
      throw new Error('Computer not initialized. Call run() first.');
    }
    return current;
  }

  /**
   * Disconnect from the cloud computer (alias for stop()).
   */
  async disconnect(): Promise<void> {
    await this.stop();
  }
}
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/migration-guide.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Migration Guide
---
This guide lists **breaking changes** when migrating from the original `ComputerAgent` (v0.3.x) to the rewritten `ComputerAgent` (v0.4.x) and shows old vs new usage for all four agent loops.
## Breaking Changes
- **Initialization:**
- `ComputerAgent` (v0.4.x) uses `model` as a string (e.g. "anthropic/claude-sonnet-4-5-20250929") instead of `LLM` and `AgentLoop` objects.
- `tools` is a list (can include multiple computers and decorated functions).
- `callbacks` are now first-class for extensibility (image retention, budget, trajectory, logging, etc).
- **No explicit `loop` parameter:**
- Loop is inferred from the `model` string (e.g. `anthropic/`, `openai/`, `omniparser+`, `ui-tars`).
- **No explicit `computer` parameter:**
- Computers are added to `tools` list.
---
## Usage Examples: Old vs New
### 1. Anthropic Loop
**Old:**
```python
async with Computer() as computer:
agent = ComputerAgent(
computer=computer,
loop=AgentLoop.ANTHROPIC,
model=LLM(provider=LLMProvider.ANTHROPIC)
)
async for result in agent.run("Take a screenshot"):
print(result)
```
**New:**
```python
async with Computer() as computer:
agent = ComputerAgent(
model="anthropic/claude-sonnet-4-5-20250929",
tools=[computer]
)
messages = [{"role": "user", "content": "Take a screenshot"}]
async for result in agent.run(messages):
for item in result["output"]:
if item["type"] == "message":
print(item["content"][0]["text"])
```
### 2. OpenAI Loop
**Old:**
```python
async with Computer() as computer:
agent = ComputerAgent(
computer=computer,
loop=AgentLoop.OPENAI,
model=LLM(provider=LLMProvider.OPENAI)
)
async for result in agent.run("Take a screenshot"):
print(result)
```
**New:**
```python
async with Computer() as computer:
agent = ComputerAgent(
model="openai/computer-use-preview",
tools=[computer]
)
messages = [{"role": "user", "content": "Take a screenshot"}]
async for result in agent.run(messages):
for item in result["output"]:
if item["type"] == "message":
print(item["content"][0]["text"])
```
### 3. UI-TARS Loop
**Old:**
```python
async with Computer() as computer:
agent = ComputerAgent(
computer=computer,
loop=AgentLoop.UITARS,
model=LLM(provider=LLMProvider.OAICOMPAT, name="ByteDance-Seed/UI-TARS-1.5-7B", provider_base_url="https://.../v1")
)
async for result in agent.run("Take a screenshot"):
print(result)
```
**New:**
```python
async with Computer() as computer:
agent = ComputerAgent(
model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
tools=[computer]
)
messages = [{"role": "user", "content": "Take a screenshot"}]
async for result in agent.run(messages):
for item in result["output"]:
if item["type"] == "message":
print(item["content"][0]["text"])
```
### 4. Omni Loop
**Old:**
```python
async with Computer() as computer:
agent = ComputerAgent(
computer=computer,
loop=AgentLoop.OMNI,
model=LLM(provider=LLMProvider.OLLAMA, name="gemma3")
)
async for result in agent.run("Take a screenshot"):
print(result)
```
**New:**
```python
async with Computer() as computer:
agent = ComputerAgent(
model="omniparser+ollama_chat/gemma3",
tools=[computer]
)
messages = [{"role": "user", "content": "Take a screenshot"}]
async for result in agent.run(messages):
for item in result["output"]:
if item["type"] == "message":
print(item["content"][0]["text"])
```
```
--------------------------------------------------------------------------------
/docs/content/docs/macos-vm-cli-playbook/lume/faq.md:
--------------------------------------------------------------------------------
```markdown
---
title: FAQ
---
### Where are the VMs stored?
VMs are stored in `~/.lume` by default. You can configure additional storage locations using the `lume config` command.
### How are images cached?
Images are cached in `~/.lume/cache`. When doing `lume pull <image>`, it will check if the image is already cached. If not, it will download the image and cache it, removing any older versions.
### Where is the configuration file stored?
Lume follows the XDG Base Directory specification for the configuration file:
- Configuration is stored in `$XDG_CONFIG_HOME/lume/config.yaml` (defaults to `~/.config/lume/config.yaml`)
By default, other data is stored in:
- VM data: `~/.lume`
- Cache files: `~/.lume/cache`
The config file contains settings for:
- VM storage locations and the default location
- Cache directory location
- Whether caching is enabled
You can view and modify these settings using the `lume config` commands:
```bash
# View current configuration
lume config get
# Manage VM storage locations
lume config storage list # List all VM storage locations
lume config storage add <name> <path> # Add a new VM storage location
lume config storage remove <name> # Remove a VM storage location
lume config storage default <name> # Set the default VM storage location
# Manage cache settings
lume config cache get # Get current cache directory
lume config cache set <path> # Set cache directory
# Manage image caching settings
lume config caching get # Show current caching status
lume config caching set <boolean> # Enable or disable image caching
```
### How do I use multiple VM storage locations?
Lume supports storing VMs in different locations (e.g., internal drive, external SSD). After configuring storage locations, you can specify which location to use with the `--storage` parameter in various commands:
```bash
# Create a VM in a specific storage location
lume create my-vm --os macos --ipsw latest --storage ssd
# Run a VM from a specific storage location
lume run my-vm --storage ssd
# Delete a VM from a specific storage location
lume delete my-vm --storage ssd
# Pull an image to a specific storage location
lume pull macos-sequoia-vanilla:latest --name my-vm --storage ssd
# Clone a VM between storage locations
lume clone source-vm cloned-vm --source-storage default --dest-storage ssd
```
If you don't specify a storage location, Lume will use the default one or search across all configured locations.
### Are VM disks taking up all the disk space?
No, macOS uses sparse files, which only allocate space as needed. For example, VM disks totaling 50 GB may only use 20 GB on disk.
### How do I get the latest macOS restore image URL?
```bash
lume ipsw
```
### How do I delete a VM?
```bash
lume delete <name>
```
### How to Install macOS from an IPSW Image
#### Create a new macOS VM using the latest supported IPSW image:
Run the following command to create a new macOS virtual machine using the latest available IPSW image:
```bash
lume create <name> --os macos --ipsw latest
```
#### Create a new macOS VM using a specific IPSW image:
To create a macOS virtual machine from an older or specific IPSW file, first download the desired IPSW (UniversalMac) from a trusted source.
Then, use the downloaded IPSW path:
```bash
lume create <name> --os macos --ipsw <downloaded_ipsw_path>
```
### How do I install a custom Linux image?
The process for creating a custom Linux image differs from macOS, as IPSW restore files are not used. You need to create a Linux VM first, then mount a setup image file to the VM for the first boot.
```bash
lume create <name> --os linux
lume run <name> --mount <path-to-setup-image>
lume run <name>
```
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/mcp-server/client-integrations.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Client Integrations
---
## Claude Desktop Integration
To use with Claude Desktop, add an entry to your Claude Desktop configuration (`claude_desktop_config.json`, typically found in `~/.config/claude-desktop/`):
### Package Installation Method
```json
{
"mcpServers": {
"cua-agent": {
"command": "/bin/bash",
"args": ["~/.cua/start_mcp_server.sh"],
"env": {
"CUA_MODEL_NAME": "anthropic/claude-sonnet-4-20250514",
"ANTHROPIC_API_KEY": "your-anthropic-api-key-here",
"CUA_MAX_IMAGES": "3",
"CUA_USE_HOST_COMPUTER_SERVER": "false"
}
}
}
}
```
### Development Method
If you're working with the CUA source code:
**Standard VM Mode:**
```json
{
"mcpServers": {
"cua-agent": {
"command": "/usr/bin/env",
"args": [
"bash",
"-lc",
"export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
]
}
}
}
```
**Host Computer Control Mode:**
```json
{
"mcpServers": {
"cua-agent": {
"command": "/usr/bin/env",
"args": [
"bash",
"-lc",
"export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; export CUA_USE_HOST_COMPUTER_SERVER='true'; export CUA_MAX_IMAGES='1'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
]
}
}
}
```
**Note**: Replace `/path/to/cua` with the absolute path to your CUA repository directory.
**⚠️ Host Computer Control Setup**: When using `CUA_USE_HOST_COMPUTER_SERVER='true'`, you must also:
1. Install computer server dependencies: `python3 -m pip install uvicorn fastapi`
2. Install the computer server: `python3 -m pip install -e libs/python/computer-server --break-system-packages`
3. Start the computer server: `python -m computer_server --log-level debug`
4. The AI will have direct access to your desktop - use with caution!
For more information on MCP with Claude Desktop, see the [official MCP User Guide](https://modelcontextprotocol.io/quickstart/user).
## Cursor Integration
To use with Cursor, add an MCP configuration file in one of these locations:
- **Project-specific**: Create `.cursor/mcp.json` in your project directory
- **Global**: Create `~/.cursor/mcp.json` in your home directory
Example configuration for Cursor:
```json
{
"mcpServers": {
"cua-agent": {
"command": "/bin/bash",
"args": ["~/.cua/start_mcp_server.sh"],
"env": {
"CUA_MODEL_NAME": "anthropic/claude-sonnet-4-20250514",
"ANTHROPIC_API_KEY": "your-anthropic-api-key-here"
}
}
}
}
```
After configuration, you can simply tell Cursor's Agent to perform computer tasks by explicitly mentioning the CUA agent, such as "Use the computer control tools to open Safari."
For more information on MCP with Cursor, see the [official Cursor MCP documentation](https://docs.cursor.com/context/model-context-protocol).
## Other MCP Clients
The MCP server is compatible with any MCP-compliant client. The server exposes the following tools:
- `run_cua_task` - Execute single computer tasks
- `run_multi_cua_tasks` - Execute multiple tasks (sequential or concurrent)
- `screenshot_cua` - Capture screenshots
- `get_session_stats` - Monitor session statistics
- `cleanup_session` - Manage session lifecycle
### Configuration Options
All MCP clients can configure the server using environment variables:
- `CUA_MODEL_NAME` - Model to use for task execution
- `CUA_MAX_IMAGES` - Maximum images to keep in context
- `CUA_USE_HOST_COMPUTER_SERVER` - Use host system instead of VM
See the [Configuration](/docs/libraries/mcp-server/configuration) page for detailed configuration options.
```
--------------------------------------------------------------------------------
/libs/qemu-docker/linux/src/vm/setup/setup-cua-server.sh:
--------------------------------------------------------------------------------
```bash
#!/bin/bash
# Setup CUA Computer Server on Linux
# Creates a system-level systemd service to run computer server in background
set -e
# Fixed identities/paths for the in-VM "docker" user and the server install.
USER_NAME="docker"
USER_HOME="/home/$USER_NAME"
SCRIPT_DIR="/opt/oem"
CUA_DIR="/opt/cua-server"
VENV_DIR="$CUA_DIR/venv"
SERVICE_NAME="cua-computer-server"
LOG_FILE="$SCRIPT_DIR/setup.log"
# Timestamped logger: echoes to stdout and appends to the setup log.
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}
log "=== Installing CUA Computer Server ==="
# Install Python 3 and venv.
# NOTE(review): no `apt-get update` is run first — assumes the base image's
# package lists are already current; confirm upstream provisioning does this.
log "Installing Python 3 and dependencies..."
sudo apt-get install -y python3 python3-venv python3-pip python3-tk python3-dev
# Create CUA directory owned by the service user.
log "Creating CUA directory at $CUA_DIR..."
sudo mkdir -p "$CUA_DIR"
sudo chown "$USER_NAME:$USER_NAME" "$CUA_DIR"
# Create virtual environment (idempotent: an existing venv is reused on re-run).
if [ -f "$VENV_DIR/bin/python" ]; then
    log "Existing venv detected; skipping creation"
else
    log "Creating Python virtual environment at $VENV_DIR..."
    python3 -m venv "$VENV_DIR"
    log "Virtual environment created successfully"
fi
# Activate and install packages
log "Upgrading pip, setuptools, and wheel..."
"$VENV_DIR/bin/pip" install --upgrade pip setuptools wheel
log "Installing cua-computer-server..."
"$VENV_DIR/bin/pip" install --upgrade cua-computer-server
log "cua-computer-server installed successfully"
# Open firewall for port 5000 (if ufw is available)
if command -v ufw &> /dev/null; then
    log "Opening firewall for port 5000..."
    sudo ufw allow 5000/tcp || true
    log "Firewall rule added"
fi
# Create start script with auto-restart.
# Quoted 'EOF' delimiter: the generated script is written verbatim, with no
# variable expansion at generation time.
START_SCRIPT="$CUA_DIR/start-server.sh"
log "Creating start script at $START_SCRIPT..."
cat > "$START_SCRIPT" << 'EOF'
#!/bin/bash
# CUA Computer Server Start Script with auto-restart
CUA_DIR="/opt/cua-server"
VENV_DIR="$CUA_DIR/venv"
LOG_FILE="$CUA_DIR/server.log"
start_server() {
    echo "$(date '+%Y-%m-%d %H:%M:%S') Updating cua-computer-server..." >> "$LOG_FILE"
    "$VENV_DIR/bin/pip" install --upgrade cua-computer-server >> "$LOG_FILE" 2>&1
    echo "$(date '+%Y-%m-%d %H:%M:%S') Starting CUA Computer Server on port 5000..." >> "$LOG_FILE"
    "$VENV_DIR/bin/python" -m computer_server --port 5000 >> "$LOG_FILE" 2>&1
    return $?
}
while true; do
    start_server
    EXIT_CODE=$?
    echo "$(date '+%Y-%m-%d %H:%M:%S') Server exited with code: $EXIT_CODE. Restarting in 5s..." >> "$LOG_FILE"
    sleep 5
done
EOF
chmod +x "$START_SCRIPT"
log "Start script created"
# Create xhost script for X11 access (runs at X session startup).
log "Creating xhost script..."
sudo tee /etc/X11/Xsession.d/99xauth > /dev/null << 'EOF'
#!/bin/sh
# Grant local X11 access for CUA Computer Server
export DISPLAY=:0
xhost +local: 2>/dev/null || true
EOF
sudo chmod +x /etc/X11/Xsession.d/99xauth
log "X11 access script created"
# Create system-level systemd service.
# Unquoted EOF delimiter: $START_SCRIPT, $USER_NAME, $USER_HOME and $CUA_DIR
# expand now, baking the resolved values into the unit file.
log "Creating systemd system service..."
sudo tee /etc/systemd/system/$SERVICE_NAME.service > /dev/null << EOF
[Unit]
Description=CUA Computer Server
After=graphical.target
[Service]
Type=simple
ExecStart=$START_SCRIPT
Restart=always
RestartSec=5
Environment=PYTHONUNBUFFERED=1
Environment=DISPLAY=:0
Environment=XAUTHORITY=$USER_HOME/.Xauthority
User=$USER_NAME
WorkingDirectory=$CUA_DIR
[Install]
WantedBy=graphical.target
EOF
log "Systemd service created at /etc/systemd/system/$SERVICE_NAME.service"
# Ensure proper ownership of CUA directory
log "Setting ownership of $CUA_DIR to $USER_NAME..."
sudo chown -R "$USER_NAME:$USER_NAME" "$CUA_DIR"
# Enable and start the service.
# `|| true` on start: don't fail setup if the graphical target isn't up yet;
# systemd will start the unit on the next boot into graphical.target.
log "Enabling systemd service..."
sudo systemctl daemon-reload
sudo systemctl enable "$SERVICE_NAME.service"
log "Starting CUA Computer Server service..."
sudo systemctl start "$SERVICE_NAME.service" || true
log "=== CUA Computer Server setup completed ==="
log "Service status: $(sudo systemctl is-active $SERVICE_NAME.service 2>/dev/null || echo 'unknown')"
```
--------------------------------------------------------------------------------
/blog/cua-playground-preview.md:
--------------------------------------------------------------------------------
```markdown
# Cua Playground: Agents + Sandboxes in Your Browser
Building computer-use agents means constant iteration—writing code, deploying to a sandbox, testing behavior, debugging issues, then repeating the cycle. Every test requires switching between your code editor, terminal, and VNC viewer. Want to try a different prompt? Edit your code, redeploy, and wait for the agent to restart. It works, but it's slow.
Today we're launching the **Cua Playground**: a browser-based environment for testing computer-use agents without writing code. Send messages to your sandboxes, watch them execute in real-time, and iterate on prompts instantly—all from your dashboard at cua.ai.

**What's new with this release:**
- Instant testing—send messages to any running sandbox directly from your browser
- Real-time execution—watch your agent work with live tool call updates and screenshots
- Multi-model support—test with Claude Sonnet 4.5, Haiku 4.5, and more
- Persistent chat history—conversations save automatically to local storage
The Playground connects to your existing Cua sandboxes—the same ones you use with the Agent SDK. Select a running sandbox and a model, then start chatting. The agent uses computer-use tools (mouse, keyboard, bash, editor) to complete your tasks, and you see every action it takes.
## Getting Started Today
<div align="center">
<video src="https://github.com/user-attachments/assets/9fef0f30-1024-4833-8b7a-6a2c02d8eb99" width="600" controls></video>
</div>
Sign up at [cua.ai/signin](https://cua.ai/signin) and grab your API key from the dashboard. Then navigate to the Playground:
1. Navigate to Dashboard > Playground
2. Select a sandbox from the dropdown (must be "running" status)
3. Choose a model (we recommend Claude Sonnet 4.5 to start)
4. Send a message: "Take a screenshot and describe what you see"
5. Watch the agent execute computer actions in real-time
Example use cases:
**Prompt Testing**
```
❌ "Check the website"
✅ "Navigate to example.com in Firefox and take a screenshot of the homepage"
```
**Model Comparison**
Run the same task with different models to compare quality, speed, and cost.
**Debugging Agent Behavior**
1. Send: "Find the login button and click it"
2. View tool calls to see each mouse movement
3. Check screenshots to verify the agent found the right element
4. Adjust your prompt based on what you observe
## FAQs
<details>
<summary><strong>Do I need to know how to code?</strong></summary>
No. The Playground is designed for testing agent behavior without writing code. However, for production deployments, you'll need to use the Agent SDK (Python/TypeScript).
</details>
<details>
<summary><strong>Does this replace the Agent SDK?</strong></summary>
No. The Playground is for rapid testing and experimentation. For production deployments, scheduled tasks, or complex workflows, use the Agent SDK.
</details>
<details>
<summary><strong>How much does it cost?</strong></summary>
Playground requests use the same credit system as Agent SDK requests. You're charged for model inference (varies by model) and sandbox runtime (billed per hour while running).
</details>
<details>
<summary><strong>Why is my sandbox not showing up?</strong></summary>
The sandbox must have `status = "running"` to appear in the dropdown. Check Dashboard > Sandboxes to verify status. If stopped, click "Start" and wait ~30 seconds for it to become available.
</details>
## Need help?
If you hit issues getting the Playground working, reach out in [Discord](https://discord.gg/cua-ai). We respond fast and fix based on what people actually use.
---
Get started at [cua.ai](https://cua.ai) or try the Playground at [cua.ai/dashboard/playground](https://cua.ai/dashboard/playground).
```
--------------------------------------------------------------------------------
/docs/content/docs/computer-sdk/custom-computer-handlers.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Custom Computers
slug: custom-computer-handlers
---
The Agent SDK supports defining custom computer handlers using a simple dictionary interface. This enables integration with custom automation backends, testing frameworks, or specialized computer control systems.
## Example: Defining a Custom Computer Handler
```python
import asyncio
from PIL import Image
# Define your custom computer functions
async def take_screenshot():
"""Your custom screenshot implementation"""
# Return PIL Image, bytes, or base64 string
return Image.new('RGB', (1920, 1080), color='white')
# Create dict-based computer handler - only 'screenshot' is required
custom_computer = {
'screenshot': take_screenshot, # required
# everything below is optional
'environment': 'linux', # linux, mac, windows, browser
'dimensions': (1920, 1080), # (width, height)
'click': lambda x, y, button: print(f"Clicking at ({x}, {y}) with {button} button"),
}
```
You can then use this as a tool for your agent:
```python
from agent import ComputerAgent
agent = ComputerAgent(
model="cua/anthropic/claude-sonnet-4.5",
tools=[custom_computer],
)
# Agent will automatically convert dict to agent.computers.CustomComputerHandler
await agent.run("Take a screenshot and click at coordinates 100, 200")
```
## Class-Based Implementation
For more complex implementations, you can create a custom class by inheriting from `AsyncComputerHandler`:
```python
from agent.computers import AsyncComputerHandler
from PIL import Image
from typing import Literal, List, Dict, Union, Optional
class MyCustomComputer(AsyncComputerHandler):
"""Custom computer handler implementation."""
def __init__(self):
# Initialize your custom computer interface here
pass
# ==== Computer-Use-Preview Action Space ====
async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
"""Get the current environment type."""
...
async def get_dimensions(self) -> tuple[int, int]:
"""Get screen dimensions as (width, height)."""
...
async def screenshot(self) -> str:
"""Take a screenshot and return as base64 string."""
...
async def click(self, x: int, y: int, button: str = "left") -> None:
"""Click at coordinates with specified button."""
...
async def double_click(self, x: int, y: int) -> None:
"""Double click at coordinates."""
...
async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
"""Scroll at coordinates with specified scroll amounts."""
...
async def type(self, text: str) -> None:
"""Type text."""
...
async def wait(self, ms: int = 1000) -> None:
"""Wait for specified milliseconds."""
...
async def move(self, x: int, y: int) -> None:
"""Move cursor to coordinates."""
...
async def keypress(self, keys: Union[List[str], str]) -> None:
"""Press key combination."""
...
async def drag(self, path: List[Dict[str, int]]) -> None:
"""Drag along specified path."""
...
async def get_current_url(self) -> str:
"""Get current URL (for browser environments)."""
...
# ==== Anthropic Action Space ====
async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse down at coordinates."""
...
async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
"""Left mouse up at coordinates."""
...
# Use with agent
custom_computer = MyCustomComputer()
agent = ComputerAgent(
model="cua/anthropic/claude-sonnet-4.5",
tools=[custom_computer],
)
await agent.run("Take a screenshot and click at coordinates 100, 200")
```
```
--------------------------------------------------------------------------------
/scripts/run-docker-dev.sh:
--------------------------------------------------------------------------------
```bash
#!/bin/bash
#
# Development helper for the CUA dev Docker image.
#
#   build              Build the image (dependencies only, no source baked in)
#   run [file]         Run an example file from ./examples inside the container
#   run --interactive  Start an interactive shell with the source mounted
#   stop               Stop a running container

# Colors for output
GREEN='\033[0;32m'
BLUE='\033[0;34m'
RED='\033[0;31m'
NC='\033[0m' # No Color

# Print with color
print_info() {
    echo -e "${BLUE}==> $1${NC}"
}

print_success() {
    echo -e "${GREEN}==> $1${NC}"
}

print_error() {
    echo -e "${RED}==> $1${NC}"
}

# Docker image name
IMAGE_NAME="cua-dev-image"
CONTAINER_NAME="cua-dev-container"

# Detect the Docker platform from the host CPU architecture.
# FIX: 32-bit ARM hosts (armv6l/armv7l) previously fell into the generic
# arm* branch and were mapped to linux/arm64, which they cannot run; they
# now get linux/arm/v7.
arch=$(uname -m)
case "$arch" in
    x86_64*)
        PLATFORM="linux/amd64"
        print_info "X64 Architecture detected, using platform: ${PLATFORM}"
        ;;
    i*86)
        PLATFORM="linux/386"
        print_info "X32 Architecture detected, using platform: ${PLATFORM}"
        ;;
    armv6*|armv7*)
        PLATFORM="linux/arm/v7"
        print_info "32-bit ARM Architecture detected, using platform: ${PLATFORM}"
        ;;
    arm*|aarch64)
        PLATFORM="linux/arm64"
        print_info "ARM Architecture detected, using platform: ${PLATFORM}"
        ;;
    *)
        # Fallback to amd64 for unknown architectures
        PLATFORM="linux/amd64"
        print_info "Unknown architecture ($arch), defaulting to platform: ${PLATFORM}"
        ;;
esac

# PYTHONPATH inside the container: every in-repo python package.
PYTHONPATH="/app/libs/python/core:/app/libs/python/computer:/app/libs/python/agent:/app/libs/python/som:/app/libs/python/computer-server:/app/libs/python/mcp-server"

# Check if Docker is installed
if ! command -v docker &> /dev/null; then
    print_error "Docker is not installed. Please install Docker first."
    exit 1
fi

# Command options
case "$1" in
    build)
        print_info "Building the development Docker image..."
        print_info "This will install all dependencies but won't include source code"
        docker build -f Dockerfile --platform=${PLATFORM} -t ${IMAGE_NAME} .
        print_success "Development Docker image built successfully!"
        ;;
    run)
        # Check for interactive flag
        if [ "$2" == "--interactive" ]; then
            print_info "Running the development Docker container with interactive shell..."
            print_info "Mounting source code from host"
            print_info "Connecting to host.docker.internal:7777"
            docker run -it --rm \
                --platform=${PLATFORM} \
                --name ${CONTAINER_NAME} \
                -v "$(pwd):/app" \
                -e PYTHONPATH=${PYTHONPATH} \
                -e DISPLAY=${DISPLAY:-:0} \
                -e PYLUME_HOST="host.docker.internal" \
                -p 7860:7860 \
                ${IMAGE_NAME} bash
        else
            # Run the specified example
            if [ -z "$2" ]; then
                print_error "Please specify an example file, e.g., ./run-docker-dev.sh run computer_examples.py"
                exit 1
            fi
            print_info "Running example: $2"
            print_info "Connecting to host.docker.internal:7777"
            docker run -it --rm \
                --platform=${PLATFORM} \
                --name ${CONTAINER_NAME} \
                -v "$(pwd):/app" \
                -e PYTHONPATH=${PYTHONPATH} \
                -e DISPLAY=${DISPLAY:-:0} \
                -e PYLUME_HOST="host.docker.internal" \
                -p 7860:7860 \
                ${IMAGE_NAME} python "/app/examples/$2"
        fi
        ;;
    stop)
        print_info "Stopping any running containers..."
        docker stop ${CONTAINER_NAME} 2>/dev/null || true
        print_success "Done!"
        ;;
    *)
        echo "Usage: $0 {build|run [--interactive] [filename]|stop}"
        echo ""
        echo "Commands:"
        echo "  build                    Build the development Docker image with dependencies"
        echo "  run [example_filename]   Run the specified example file in the container"
        echo "  run --interactive        Run the container with mounted code and get an interactive shell"
        echo "  stop                     Stop the container"
        exit 1
esac

exit 0
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/tools/browser_tool.py:
--------------------------------------------------------------------------------
```python
"""
Browser Tool for agent interactions.
Allows agents to control a browser programmatically via Playwright.
"""
import logging
from typing import TYPE_CHECKING, Optional
if TYPE_CHECKING:
from computer.interface import GenericComputerInterface
logger = logging.getLogger(__name__)
class BrowserTool:
    """Browser control tool backed by the computer SDK's Playwright bridge.

    Implements the Fara/Magentic-One style browser actions (navigate, click,
    type, scroll, search, screenshot) by forwarding each one to
    ``interface.playwright_exec`` on the wrapped computer interface.
    """

    def __init__(
        self,
        interface: "GenericComputerInterface",
    ):
        """Store the interface used to execute browser commands.

        Args:
            interface: A GenericComputerInterface instance that provides playwright_exec
        """
        self.interface = interface
        self.logger = logger

    async def _execute_command(self, command: str, params: dict) -> dict:
        """Forward one browser command to the interface, logging failures.

        Args:
            command: Command name
            params: Command parameters

        Returns:
            Response dictionary
        """
        try:
            response = await self.interface.playwright_exec(command, params)
            if not response.get("success"):
                # The command reached the server but failed there; surface the
                # server-reported error in the log and hand the dict back.
                self.logger.error(
                    f"Browser command '{command}' failed: {response.get('error', 'Unknown error')}"
                )
            return response
        except Exception as e:
            # Transport-level failure: synthesize a failure response.
            self.logger.error(f"Error executing browser command '{command}': {e}")
            return {"success": False, "error": str(e)}

    async def visit_url(self, url: str) -> dict:
        """Navigate the browser to ``url``.

        Args:
            url: URL to visit

        Returns:
            Response dictionary with success status and current URL
        """
        return await self._execute_command("visit_url", {"url": url})

    async def click(self, x: int, y: int) -> dict:
        """Click the page at the given coordinates.

        Args:
            x: X coordinate
            y: Y coordinate

        Returns:
            Response dictionary with success status
        """
        return await self._execute_command("click", {"x": x, "y": y})

    async def type(self, text: str) -> dict:
        """Type ``text`` into the currently focused element.

        Args:
            text: Text to type

        Returns:
            Response dictionary with success status
        """
        return await self._execute_command("type", {"text": text})

    async def scroll(self, delta_x: int, delta_y: int) -> dict:
        """Scroll the page by the given deltas.

        Args:
            delta_x: Horizontal scroll delta
            delta_y: Vertical scroll delta

        Returns:
            Response dictionary with success status
        """
        return await self._execute_command("scroll", {"delta_x": delta_x, "delta_y": delta_y})

    async def web_search(self, query: str) -> dict:
        """Open a Google search results page for ``query``.

        Args:
            query: Search query

        Returns:
            Response dictionary with success status and current URL
        """
        return await self._execute_command("web_search", {"query": query})

    async def screenshot(self) -> bytes:
        """Capture the current browser page.

        Returns:
            Screenshot image data as bytes (PNG format)

        Raises:
            RuntimeError: If the screenshot command fails or returns no data.
        """
        import base64

        response = await self._execute_command("screenshot", {})
        if response.get("success") and response.get("screenshot"):
            # The server returns the image base64-encoded; decode to raw bytes.
            return base64.b64decode(response["screenshot"])
        error = response.get("error", "Unknown error")
        raise RuntimeError(f"Failed to take screenshot: {error}")
```
--------------------------------------------------------------------------------
/libs/lume/src/Commands/Run.swift:
--------------------------------------------------------------------------------
```swift
import ArgumentParser
import Foundation
import Virtualization
/// `lume run` — boot a virtual machine (pulling its image first if it is not
/// present locally) and optionally attach the VNC client.
struct Run: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
        abstract: "Run a virtual machine"
    )

    /// VM or image to run; images are pulled from the configured registry.
    @Argument(
        help: "Name of the virtual machine or image to pull and run (format: name or name:tag)",
        completion: .custom(completeVMName))
    var name: String

    /// When set, the VM starts headless (no VNC client window is opened).
    @Flag(name: [.short, .long], help: "Do not start the VNC client")
    var noDisplay: Bool = false

    /// Host directories to share with the guest, as raw "path" or
    /// "path:ro|rw" strings; parsed by `parsedSharedDirectories` below.
    @Option(
        name: [.customLong("shared-dir")],
        help:
            "Directory to share with the VM. Can be just a path for read-write access (e.g. ~/src) or path:tag where tag is 'ro' for read-only or 'rw' for read-write (e.g. ~/src:ro)"
    )
    var sharedDirectories: [String] = []

    /// Optional read-only disk image (Linux guests only).
    @Option(
        help:
            "For Linux VMs only, a read-only disk image to attach to the VM (e.g. --mount=\"ubuntu.iso\")",
        completion: .file())
    var mount: String?

    /// Disk images to expose to the guest as USB mass-storage devices.
    @Option(
        name: [.customLong("usb-storage")],
        help: "Disk image to attach as a USB mass storage device (e.g. --usb-storage=\"disk.img\")",
        completion: .file())
    var usbStorageDevices: [String] = []

    @Option(help: "Github Container Registry to pull the images from. Defaults to ghcr.io")
    var registry: String = "ghcr.io"

    @Option(help: "Organization to pull the images from. Defaults to trycua")
    var organization: String = "trycua"

    /// VNC port; 0 lets the system auto-assign a free port.
    @Option(
        name: [.customLong("vnc-port")],
        help: "Port to use for the VNC server. Defaults to 0 (auto-assign)")
    var vncPort: Int = 0

    // NOTE(review): declared as @Option, so enabling it requires an explicit
    // value (e.g. `--recovery-mode true`); a @Flag may have been intended —
    // confirm before changing, since switching would alter the CLI interface.
    @Option(help: "For MacOS VMs only, boot into the VM in recovery mode")
    var recoveryMode: Bool = false

    @Option(name: .customLong("storage"), help: "VM storage location to use or direct path to VM location")
    var storage: String?

    /// Parses each `--shared-dir` entry into a `SharedDirectory`.
    /// Entries without a ":tag" suffix default to read-write; only the tags
    /// "ro" and "rw" (case-insensitive) are accepted.
    private var parsedSharedDirectories: [SharedDirectory] {
        get throws {
            try sharedDirectories.map { dirString -> SharedDirectory in
                // Split at the first ':' only, so only the trailing tag is
                // separated from the host path.
                let components = dirString.split(separator: ":", maxSplits: 1)
                let hostPath = String(components[0])

                // If no tag is provided, default to read-write
                if components.count == 1 {
                    return SharedDirectory(
                        hostPath: hostPath,
                        tag: VZVirtioFileSystemDeviceConfiguration.macOSGuestAutomountTag,
                        readOnly: false
                    )
                }

                // Parse the tag if provided
                let tag = String(components[1])
                let readOnly: Bool
                switch tag.lowercased() {
                case "ro":
                    readOnly = true
                case "rw":
                    readOnly = false
                default:
                    throw ValidationError(
                        "Invalid tag value. Must be either 'ro' for read-only or 'rw' for read-write"
                    )
                }

                return SharedDirectory(
                    hostPath: hostPath,
                    tag: VZVirtioFileSystemDeviceConfiguration.macOSGuestAutomountTag,
                    readOnly: readOnly
                )
            }
        }
    }

    /// Wraps each `--usb-storage` argument in a `Path`.
    private var parsedUSBStorageDevices: [Path] {
        usbStorageDevices.map { Path($0) }
    }

    init() {
    }

    /// Delegates to `LumeController.runVM` with the parsed options.
    @MainActor
    func run() async throws {
        try await LumeController().runVM(
            name: name,
            noDisplay: noDisplay,
            sharedDirectories: parsedSharedDirectories,
            mount: mount.map { Path($0) },
            registry: registry,
            organization: organization,
            vncPort: vncPort,
            recoveryMode: recoveryMode,
            storage: storage,
            usbMassStoragePaths: parsedUSBStorageDevices.isEmpty ? nil : parsedUSBStorageDevices
        )
    }
}
```
--------------------------------------------------------------------------------
/libs/python/mcp-server/test_mcp_server_local_option.py:
--------------------------------------------------------------------------------
```python
"""
Test script to verify MCP Server local desktop option works correctly.
This test verifies:
1. Default behavior: Computer uses VM
2. New behavior: Computer uses host when CUA_USE_HOST_COMPUTER_SERVER=true
"""
import asyncio
import os
import sys
from pathlib import Path
# Add the mcp-server module to path
mcp_server_path = Path(__file__).parent.parent / "libs" / "python" / "mcp-server"
sys.path.insert(0, str(mcp_server_path.parent.parent.parent / "libs" / "python"))
import pytest
@pytest.mark.asyncio
async def test_default_vm_mode():
    """Without CUA_USE_HOST_COMPUTER_SERVER set, the pool should use a VM."""
    # Make sure the env var is absent so the default (VM) path is taken.
    os.environ.pop("CUA_USE_HOST_COMPUTER_SERVER", None)

    from mcp_server.session_manager import ComputerPool

    computer_pool = ComputerPool(max_size=1)
    try:
        acquired = await computer_pool.acquire()
        # The pool must hand back an initialized computer instance.
        assert acquired is not None
        # use_host_computer_server should have defaulted to False (VM mode).
        print("✓ Default mode: Computer initialized (VM mode expected)")
        await computer_pool.release(acquired)
    finally:
        await computer_pool.shutdown()
@pytest.mark.asyncio
async def test_local_desktop_mode():
    """With CUA_USE_HOST_COMPUTER_SERVER=true, the pool should use the host."""
    os.environ["CUA_USE_HOST_COMPUTER_SERVER"] = "true"

    # The module reads the env var at import time, so force a re-import
    # after changing it.
    import importlib
    import mcp_server.session_manager
    from mcp_server.session_manager import ComputerPool

    importlib.reload(mcp_server.session_manager)

    host_pool = mcp_server.session_manager.ComputerPool(max_size=1)
    try:
        acquired = await host_pool.acquire()
        # The pool must hand back an initialized computer instance.
        assert acquired is not None
        # use_host_computer_server should have been set to True (host mode).
        print("✓ Local mode: Computer initialized (host mode expected)")
        await host_pool.release(acquired)
    finally:
        await host_pool.shutdown()

    # Leave the environment clean for subsequent tests.
    os.environ.pop("CUA_USE_HOST_COMPUTER_SERVER", None)
@pytest.mark.asyncio
async def test_env_var_parsing():
    """CUA_USE_HOST_COMPUTER_SERVER is truthy only for true/1/yes (lowercased)."""
    # Raw value -> expected parse result.
    cases = {
        "true": True,
        "True": True,
        "TRUE": True,
        "1": True,
        "yes": True,
        "false": False,
        "False": False,
        "FALSE": False,
        "0": False,
        "no": False,
        "": False,
        "random": False,
    }
    for raw, expected in cases.items():
        os.environ["CUA_USE_HOST_COMPUTER_SERVER"] = raw
        # Mirror the parsing logic used by the session manager.
        parsed = os.getenv("CUA_USE_HOST_COMPUTER_SERVER", "false").lower() in (
            "true",
            "1",
            "yes",
        )
        assert (
            parsed == expected
        ), f"Failed for value '{raw}': expected {expected}, got {parsed}"
        print(f"✓ Env var '{raw}' correctly parsed as {expected}")

    os.environ.pop("CUA_USE_HOST_COMPUTER_SERVER", None)
if __name__ == "__main__":
    print("Testing MCP Server Local Desktop Option")
    print("=" * 60)

    # Env-var parsing is pure; a failure here should abort the run.
    print("\n1. Testing environment variable parsing...")
    asyncio.run(test_env_var_parsing())

    def _attempt(coro, failure_label: str, hint: str) -> None:
        # Run one async test, reporting (not raising) any failure, since the
        # remaining tests need external infrastructure that may be absent.
        try:
            asyncio.run(coro)
        except Exception as e:
            print(f"✗ {failure_label}: {e}")
            print(hint)

    print("\n2. Testing default VM mode...")
    _attempt(
        test_default_vm_mode(),
        "Default VM mode test failed",
        "Note: This may require lume/VM setup to fully test",
    )

    print("\n3. Testing local desktop mode...")
    _attempt(
        test_local_desktop_mode(),
        "Local desktop mode test failed",
        "Note: This may require computer-server to be running locally",
    )

    print("\n" + "=" * 60)
    print("Tests completed!")
```
--------------------------------------------------------------------------------
/examples/browser_tool_example.py:
--------------------------------------------------------------------------------
```python
"""
Browser Tool Example
Demonstrates how to use the BrowserTool to control a browser programmatically
via the computer server. The browser runs visibly on the XFCE desktop so visual
agents can see it.
Prerequisites:
- Computer server running (Docker container or local)
- For Docker: Container should be running with browser tool support
- For local: Playwright and Firefox must be installed
Usage:
python examples/browser_tool_example.py
"""
import asyncio
import logging
import sys
from pathlib import Path
# Add the libs path to sys.path
libs_path = Path(__file__).parent.parent / "libs" / "python"
sys.path.insert(0, str(libs_path))
from agent.tools.browser_tool import BrowserTool
# Import Computer interface and BrowserTool
from computer import Computer
# Configure logging to see what's happening
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
async def test_browser_tool():
    """Exercise BrowserTool end-to-end against a running computer server.

    Steps through screenshots, navigation, search, scrolling, clicking and
    typing, logging the result of each step. Requires a Docker (or cloud)
    computer to be available.
    """
    # Initialize the computer interface
    # For local testing, use provider_type="docker"
    # For provider_type="cloud", provide name and api_key
    computer = Computer(provider_type="docker", os_type="linux", image="cua-xfce:dev")
    await computer.run()

    # Initialize the browser tool with the computer interface.
    # NOTE(review): BrowserTool expects a GenericComputerInterface; passing the
    # Computer object assumes it exposes playwright_exec — confirm.
    browser = BrowserTool(interface=computer)

    logger.info("Testing Browser Tool...")
    try:
        # Test 0: Take a screenshot (pre-init)
        logger.info("Test 0: Taking a screenshot...")
        screenshot_bytes = await browser.screenshot()
        screenshot_path = Path(__file__).parent / "browser_screenshot_init.png"
        with open(screenshot_path, "wb") as f:
            f.write(screenshot_bytes)
        logger.info(f"Screenshot captured: {len(screenshot_bytes)} bytes")

        # Test 1: Visit a URL
        logger.info("Test 1: Visiting a URL...")
        result = await browser.visit_url("https://www.trycua.com")
        logger.info(f"Visit URL result: {result}")

        # Wait a bit for the page to load
        await asyncio.sleep(2)

        # Test 2: Take a screenshot
        logger.info("Test 2: Taking a screenshot...")
        screenshot_bytes = await browser.screenshot()
        screenshot_path = Path(__file__).parent / "browser_screenshot.png"
        with open(screenshot_path, "wb") as f:
            f.write(screenshot_bytes)
        logger.info(f"Screenshot captured: {len(screenshot_bytes)} bytes")

        # Wait a bit
        await asyncio.sleep(1)

        # Test 3: Visit bot detector
        logger.info("Test 3: Visiting bot detector...")
        result = await browser.visit_url("https://bot-detector.rebrowser.net/")
        logger.info(f"Visit URL result: {result}")

        # Test 4: Web search (labels 4-7 were previously duplicated 2-5)
        logger.info("Test 4: Performing a web search...")
        result = await browser.web_search("Python programming")
        logger.info(f"Web search result: {result}")

        # Wait a bit
        await asyncio.sleep(2)

        # Test 5: Scroll
        logger.info("Test 5: Scrolling the page...")
        result = await browser.scroll(delta_x=0, delta_y=500)
        logger.info(f"Scroll result: {result}")

        # Wait a bit
        await asyncio.sleep(1)

        # Test 6: Click (example coordinates - adjust based on your screen)
        logger.info("Test 6: Clicking at coordinates...")
        result = await browser.click(x=500, y=300)
        logger.info(f"Click result: {result}")

        # Wait a bit
        await asyncio.sleep(1)

        # Test 7: Type text (if there's a focused input field)
        logger.info("Test 7: Typing text...")
        result = await browser.type("Hello from BrowserTool!")
        logger.info(f"Type result: {result}")

        logger.info("All tests completed!")
    except Exception as e:
        logger.error(f"Error during testing: {e}", exc_info=True)


if __name__ == "__main__":
    asyncio.run(test_browser_tool())
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/adapters/models/opencua.py:
--------------------------------------------------------------------------------
```python
import base64
import re
from io import BytesIO
from typing import Any, Dict, List
try:
import blobfile as _ # assert blobfile is installed
import torch # type: ignore
from PIL import Image # type: ignore
from transformers import ( # type: ignore
AutoImageProcessor,
AutoModel,
AutoTokenizer,
)
OPENCUA_AVAILABLE = True
except Exception:
OPENCUA_AVAILABLE = False
class OpenCUAModel:
    """Wrapper around an OpenCUA checkpoint loaded via the HF Auto* classes."""

    def __init__(
        self, model_name: str, device: str = "auto", trust_remote_code: bool = False
    ) -> None:
        """Load the tokenizer, model and image processor for ``model_name``.

        Args:
            model_name: HF hub id or local path of the OpenCUA checkpoint.
            device: Device map forwarded to ``from_pretrained`` (default "auto").
            trust_remote_code: Forwarded to the HF loaders.

        Raises:
            ImportError: When the optional OpenCUA dependencies are missing.
        """
        if not OPENCUA_AVAILABLE:
            raise ImportError(
                'OpenCUA requirements not found. Install with: pip install "cua-agent[opencua-hf]"'
            )
        self.model_name = model_name
        self.device = device
        self.model = None
        self.tokenizer = None
        self.image_processor = None
        self.trust_remote_code = trust_remote_code
        self._load()

    def _load(self) -> None:
        """Instantiate tokenizer, image processor and model from the checkpoint."""
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name, trust_remote_code=self.trust_remote_code
        )
        self.image_processor = AutoImageProcessor.from_pretrained(
            self.model_name, trust_remote_code=self.trust_remote_code
        )
        # The model is placed per ``self.device`` and uses SDPA attention.
        self.model = AutoModel.from_pretrained(
            self.model_name,
            torch_dtype="auto",
            device_map=self.device,
            trust_remote_code=self.trust_remote_code,
            attn_implementation="sdpa",
        )

    @staticmethod
    def _extract_last_image_b64(messages: List[Dict[str, Any]]) -> str:
        """Return the payload of the most recent data-URL image, or "".

        Expects HF-format messages whose content items carry
        ``{"type": "image", "image": "data:image/...;base64,..."}``.
        """
        # Walk messages (and their content items) from newest to oldest.
        for message in reversed(messages):
            for part in reversed(message.get("content", [])):
                if not (isinstance(part, dict) and part.get("type") == "image"):
                    continue
                candidate = part.get("image", "")
                if isinstance(candidate, str) and candidate.startswith("data:image/"):
                    return candidate.split(",", 1)[1]
        return ""

    def generate(self, messages: List[Dict[str, Any]], max_new_tokens: int = 512) -> str:
        """Run greedy generation for ``messages`` and return the decoded text."""
        assert (
            self.model is not None
            and self.tokenizer is not None
            and self.image_processor is not None
        )
        # Text side: chat template -> token ids on the model's device.
        token_ids = self.tokenizer.apply_chat_template(
            messages, tokenize=True, add_generation_prompt=True
        )
        batch_ids = torch.tensor([token_ids]).to(self.model.device)

        # Vision side: preprocess the newest data-URL image, if any.
        pixel_values = None
        grid_thws = None
        last_image_b64 = self._extract_last_image_b64(messages)
        if last_image_b64:
            image = Image.open(BytesIO(base64.b64decode(last_image_b64))).convert("RGB")
            processed = self.image_processor.preprocess(images=[image])
            pixel_values = torch.tensor(processed["pixel_values"]).to(
                dtype=torch.bfloat16, device=self.model.device
            )
            if "image_grid_thw" in processed:
                grid_thws = torch.tensor(processed["image_grid_thw"])

        generation_kwargs: Dict[str, Any] = {
            "max_new_tokens": max_new_tokens,
            "temperature": 0,  # greedy decoding
        }
        if pixel_values is not None:
            generation_kwargs["pixel_values"] = pixel_values
        if grid_thws is not None:
            generation_kwargs["grid_thws"] = grid_thws

        with torch.no_grad():
            output_ids = self.model.generate(
                batch_ids,
                **generation_kwargs,
            )

        # Strip the prompt tokens so only the completion is decoded.
        completion_ids = output_ids[:, batch_ids.shape[1] :]
        return self.tokenizer.batch_decode(
            completion_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
```
--------------------------------------------------------------------------------
/libs/python/som/som/models.py:
--------------------------------------------------------------------------------
```python
from typing import Any, Dict, List, Literal, Optional, Tuple, Union

from pydantic import BaseModel, Field, field_validator, validator
class BoundingBox(BaseModel):
    """Normalized bounding box coordinates.

    Stores the box corners as floats; values are described as normalized
    (presumably relative to image width/height — confirm against the
    detector that produces them).
    """

    x1: float = Field(..., description="Normalized left coordinate")
    y1: float = Field(..., description="Normalized top coordinate")
    x2: float = Field(..., description="Normalized right coordinate")
    y2: float = Field(..., description="Normalized bottom coordinate")

    @property
    def coordinates(self) -> List[float]:
        """Get coordinates as a list [x1, y1, x2, y2]."""
        return [self.x1, self.y1, self.x2, self.y2]
class UIElement(BaseModel):
    """Base class for UI elements.

    Subclasses (IconElement, TextElement) narrow ``type`` and override the
    ``interactivity`` default.
    """

    # Unique identifier for the element (1-indexed); None until assigned.
    id: Optional[int] = Field(None, description="Unique identifier for the element (1-indexed)")
    # Discriminator between icon and text elements.
    type: Literal["icon", "text"]
    # Location of the element on screen.
    bbox: BoundingBox
    interactivity: bool = Field(default=False, description="Whether the element is interactive")
    confidence: float = Field(default=1.0, description="Detection confidence score")
class IconElement(UIElement):
    """An interactive icon element."""

    type: Literal["icon"] = "icon"
    # Icons default to interactive (overrides UIElement's False default).
    interactivity: bool = True
    # Detection scale used when the element was found, if known.
    scale: Optional[int] = Field(None, description="Detection scale used")
class TextElement(UIElement):
    """A text element."""

    type: Literal["text"] = "text"
    # The recognized text content (presumably from OCR — see ParserMetadata.ocr_enabled).
    content: str = Field(..., description="The text content")
    # Text elements default to non-interactive.
    interactivity: bool = False
class ImageData(BaseModel):
    """Image data with dimensions.

    ``width`` and ``height`` are validated to be strictly positive.
    """

    base64: str = Field(..., description="Base64 encoded image data")
    width: int = Field(..., description="Image width in pixels")
    height: int = Field(..., description="Image height in pixels")

    # Pydantic v2 style validator; the file already uses the v2 API
    # (ParseResult calls super().model_dump()), and the v1 `validator`
    # decorator is deprecated there.
    @field_validator("width", "height")
    @classmethod
    def dimensions_must_be_positive(cls, v):
        """Reject zero or negative dimensions."""
        if v <= 0:
            raise ValueError("Dimensions must be positive")
        return v
class ParserMetadata(BaseModel):
    """Metadata about the parsing process.

    Collects detection counts, timing and device information for one
    screenshot-parsing run; ``width``/``height`` are convenience views over
    ``image_size``.
    """

    image_size: Tuple[int, int] = Field(
        ..., description="Original image dimensions (width, height)"
    )
    num_icons: int = Field(..., description="Number of icons detected")
    num_text: int = Field(..., description="Number of text elements detected")
    device: str = Field(..., description="Device used for detection (cpu/cuda/mps)")
    ocr_enabled: bool = Field(..., description="Whether OCR was enabled")
    latency: float = Field(..., description="Total processing time in seconds")

    @property
    def width(self) -> int:
        """Get image width from image_size."""
        return self.image_size[0]

    @property
    def height(self) -> int:
        """Get image height from image_size."""
        return self.image_size[1]
class ParseResult(BaseModel):
    """Result of parsing a UI screenshot.

    Bundles the detected elements, the annotated screenshot, and processing
    metadata, with convenience accessors for image data and dimensions.
    """

    elements: List[UIElement] = Field(..., description="Detected UI elements")
    annotated_image_base64: str = Field(..., description="Base64 encoded annotated image")
    metadata: ParserMetadata = Field(..., description="Processing metadata")
    screen_info: Optional[List[str]] = Field(
        None, description="Human-readable descriptions of elements"
    )
    parsed_content_list: Optional[List[Dict[str, Any]]] = Field(
        None, description="Parsed elements as dictionaries"
    )

    @property
    def image(self) -> ImageData:
        """Get image data as a convenience property."""
        return ImageData(
            base64=self.annotated_image_base64,
            width=self.metadata.width,
            height=self.metadata.height,
        )

    @property
    def width(self) -> int:
        """Get image width from metadata."""
        return self.metadata.width

    @property
    def height(self) -> int:
        """Get image height from metadata."""
        return self.metadata.height

    def model_dump(self, **kwargs: Any) -> Dict[str, Any]:
        """Convert model to dict for compatibility with older code.

        Accepts and forwards the standard pydantic ``model_dump`` keyword
        arguments (``exclude``, ``include``, ``mode``, ...); the previous
        override took no arguments and broke any caller that passed one.
        """
        result = super().model_dump(**kwargs)
        # Add image data dict for backward compatibility
        result["image"] = self.image.model_dump()
        return result
```
--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/opencua.py:
--------------------------------------------------------------------------------
```python
"""
OpenCUA agent loop implementation for click prediction using litellm.acompletion
Based on OpenCUA model for GUI grounding tasks.
"""
import asyncio
import base64
import json
import math
import re
import uuid
from io import BytesIO
from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
import litellm
from PIL import Image
from ..decorators import register_agent
from ..loops.base import AsyncAgentConfig
from ..types import AgentCapability, AgentResponse, Messages, Tools
from .composed_grounded import ComposedGroundedConfig
def extract_coordinates_from_pyautogui(text: str) -> Optional[Tuple[int, int]]:
    """Extract click coordinates from a ``pyautogui.click(x=..., y=...)`` call.

    Generalized over the original integer-only pattern: accepts negative and
    fractional coordinates (rounded to the nearest int) and arbitrary
    whitespace around ``=`` and ``,``. Integer inputs behave exactly as before.

    Args:
        text: Model output text possibly containing a pyautogui click call,
            e.g. ``pyautogui.click(x=1443, y=343)``.

    Returns:
        ``(x, y)`` as integers, or None if no click call is found.
    """
    try:
        pattern = (
            r"pyautogui\.click\(\s*x\s*=\s*(-?\d+(?:\.\d+)?)\s*,"
            r"\s*y\s*=\s*(-?\d+(?:\.\d+)?)\s*\)"
        )
        match = re.search(pattern, text)
        if match:
            # round() keeps integer inputs unchanged and maps floats sanely
            return (round(float(match.group(1))), round(float(match.group(2))))
        return None
    except Exception:
        # Defensive: never let a malformed model response propagate
        return None
@register_agent(models=r"(?i).*OpenCUA.*")
class OpenCUAConfig(ComposedGroundedConfig):
    """OpenCUA agent configuration implementing AsyncAgentConfig protocol for click prediction.

    Matched (case-insensitively) for any model name containing "OpenCUA".
    Full steps are delegated to the composed-grounded loop using the same
    model for both planning and grounding; click-only prediction is handled
    directly via litellm.
    """

    def __init__(self):
        super().__init__()
        # Populated by the base class / callers; kept for parity with other loops
        self.current_model = None
        self.last_screenshot_b64 = None

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        """Fallback to a self-composed model.

        Delegates to ComposedGroundedConfig with "<model>+<model>", i.e. the
        same OpenCUA model acts as both the planning and grounding half of
        the composed loop.
        """
        return await super().predict_step(
            messages=messages,
            model=f"{model}+{model}",
            tools=tools,
            max_retries=max_retries,
            stream=stream,
            computer_handler=computer_handler,
            _on_api_start=_on_api_start,
            _on_api_end=_on_api_end,
            _on_usage=_on_usage,
            _on_screenshot=_on_screenshot,
            **kwargs,
        )

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates using OpenCUA model via litellm.acompletion.

        Args:
            model: The OpenCUA model name
            image_b64: Base64 encoded PNG image
            instruction: Instruction for where to click

        Returns:
            Tuple of (x, y) coordinates or None if prediction fails
        """
        # Prepare system message
        system_prompt = (
            "You are a GUI agent. You are given a task and a screenshot of the screen. "
            "You need to perform a series of pyautogui actions to complete the task."
        )
        system_message = {"role": "system", "content": system_prompt}
        # Prepare user message with image and instruction
        user_message = {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
                {"type": "text", "text": f"Click on {instruction}"},
            ],
        }
        # Prepare API call kwargs
        # NOTE(review): "max_new_tokens" is not a standard litellm/OpenAI
        # completion parameter ("max_tokens" is); presumably it is passed
        # through to the OpenCUA/HF backend — confirm the provider accepts it.
        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_new_tokens": 2056,
            "temperature": 0,
            **kwargs,
        }
        # Use liteLLM acompletion
        response = await litellm.acompletion(**api_kwargs)
        # Extract response text
        output_text = response.choices[0].message.content
        # print(output_text)
        # Extract coordinates from pyautogui format, e.g. pyautogui.click(x=.., y=..)
        coordinates = extract_coordinates_from_pyautogui(output_text)
        return coordinates

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/customizing-computeragent.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Customize ComputerAgent
---
<Callout>
A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/customizing_computeragent.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.
</Callout>
The `ComputerAgent` interface provides an easy proxy to any computer-using model configuration, and it is a powerful framework for extending and building your own agentic systems.
This guide shows four proven ways to increase capabilities and success rate:
- 1 — Simple: Prompt engineering
- 2 — Easy: Tools
- 3 — Intermediate: Callbacks
- 4 — Expert: Custom `@register_agent`
## 1) Simple: Prompt engineering
Provide guiding instructions to shape behavior. `ComputerAgent` accepts an optional `instructions: str | None` which acts like a system-style preface. Internally, this uses a callback that pre-pends a user message before each LLM call.
```python
from agent.agent import ComputerAgent
agent = ComputerAgent(
model="openai/computer-use-preview",
tools=[computer],
instructions=(
"You are a meticulous software operator. Prefer safe, deterministic actions. "
"Always confirm via on-screen text before proceeding."
),
)
```
## 2) Easy: Tools
Expose deterministic capabilities as tools (Python functions or custom computer handlers). The agent will call them when appropriate.
```python
def calculate_percentage(numerator: float, denominator: float) -> str:
"""Calculate percentage as a string.
Args:
numerator: Numerator value
denominator: Denominator value
Returns:
A formatted percentage string (e.g., '75.00%').
"""
if denominator == 0:
return "0.00%"
return f"{(numerator/denominator)*100:.2f}%"
agent = ComputerAgent(
model="openai/computer-use-preview",
tools=[computer, calculate_percentage],
)
```
- See `docs/agent-sdk/custom-tools` for authoring function tools.
- See `docs/agent-sdk/custom-computer-handlers` for building full computer interfaces.
## 3) Intermediate: Callbacks
Callbacks provide lifecycle hooks to preprocess messages, postprocess outputs, record trajectories, manage costs, and more.
```python
from agent.callbacks import ImageRetentionCallback, TrajectorySaverCallback, BudgetManagerCallback
agent = ComputerAgent(
model="cua/anthropic/claude-sonnet-4.5",
tools=[computer],
callbacks=[
ImageRetentionCallback(only_n_most_recent_images=3),
TrajectorySaverCallback("./trajectories"),
BudgetManagerCallback(max_budget=10.0, raise_error=True),
],
)
```
- Browse implementations in `libs/python/agent/agent/callbacks/`.
## 4) Expert: Custom `@register_agent`
Build your own agent configuration class to control prompting, message shaping, and tool handling. This is the most flexible option for specialized domains.
- Register your own `model=...` loop using `@register_agent`
- Browse implementations in `libs/python/agent/agent/loops/`.
- Implement `predict_step()` (and optionally `predict_click()`) and return the standardized output schema.
```python
from agent.decorators import register_agent
@register_agent(models=r".*my-special-model.*", priority=10)
class MyCustomAgentConfig:
async def predict_step(self, messages, model, tools, **kwargs):
# 1) Format messages for your provider
# 2) Call provider
# 3) Convert responses to the agent output schema
return {"output": [], "usage": {}}
async def predict_click(self, model, image_b64, instruction):
# Optional: click-only capability
return None
def get_capabilities(self):
return ["step"]
```
## HUD integration (optional)
When using the HUD evaluation integration (`agent/integrations/hud/`), you can pass `instructions`, `tools`, and `callbacks` directly
```python
from agent.integrations.hud import run_single_task
await run_single_task(
dataset="username/dataset-name",
model="openai/computer-use-preview",
instructions="Operate carefully. Always verify on-screen text before actions.",
# tools=[your_custom_function],
# callbacks=[YourCustomCallback()],
)
```
```
--------------------------------------------------------------------------------
/docs/src/components/iou.tsx:
--------------------------------------------------------------------------------
```typescript
'use client';
import React, { useRef, useEffect, useState, useCallback } from 'react';
/**
* Represents a rectangle with position, dimensions, styling, and identification
*/
/**
 * Represents a rectangle with position, dimensions, styling, and identification.
 * All coordinates and sizes are in canvas pixel units.
 */
interface Rectangle {
  /** The x-coordinate of the rectangle's left edge */
  left: number;
  /** The y-coordinate of the rectangle's top edge */
  top: number;
  /** The width of the rectangle */
  width: number;
  /** The height of the rectangle */
  height: number;
  /** The fill color of the rectangle (any CSS color string) */
  fill: string;
  /** The display name drawn inside the rectangle */
  name: string;
}
/**
* Props for the IOU component
*/
/**
 * Props for the IOU component.
 */
interface IOUProps {
  /** The title to display above the visualization */
  title: string;
  /** The description text to display below the computed IOU value */
  description: string;
  /** The first rectangle for IOU calculation */
  rect1: Rectangle;
  /** The second rectangle for IOU calculation */
  rect2: Rectangle;
}
/**
* A React component that visualizes and calculates the Intersection over Union (IOU)
* of two rectangles on a canvas
* @param props - The component props
* @returns The rendered IOU visualization component
*/
/**
 * A React component that visualizes and calculates the Intersection over Union (IOU)
 * of two rectangles on a canvas.
 * @param props - The component props
 * @returns The rendered IOU visualization component
 */
export default function IOU({ title, description, rect1, rect2 }: IOUProps) {
  const canvasRef = useRef<HTMLCanvasElement>(null);
  const [actualIOU, setActualIOU] = useState<number>(0);

  /** Axis-aligned bounding box derived from a Rectangle. */
  type Bbox = { left: number; right: number; top: number; bottom: number };

  /**
   * Converts a rectangle to a bounding box with left, right, top, and bottom coordinates.
   * @param rect - The rectangle to convert
   * @returns An object containing the bounding box coordinates
   */
  const getBbox = (rect: Rectangle): Bbox => ({
    left: rect.left,
    right: rect.left + rect.width,
    top: rect.top,
    bottom: rect.top + rect.height,
  });

  /**
   * Calculates the intersection area between two bounding boxes.
   * @param bbox1 - The first bounding box
   * @param bbox2 - The second bounding box
   * @returns The area of intersection (0 when the boxes do not overlap)
   */
  const calcIntersection = (bbox1: Bbox, bbox2: Bbox): number => {
    const x1 = Math.max(bbox1.left, bbox2.left);
    const x2 = Math.min(bbox1.right, bbox2.right);
    const y1 = Math.max(bbox1.top, bbox2.top);
    const y2 = Math.min(bbox1.bottom, bbox2.bottom);
    // Check if there's actually an overlap
    if (x2 <= x1 || y2 <= y1) {
      return 0;
    }
    return (x2 - x1) * (y2 - y1);
  };

  /**
   * Calculates the area of a rectangle.
   * @param rect - The rectangle to calculate area for
   * @returns The area of the rectangle
   */
  const calcArea = (rect: Rectangle): number => {
    return rect.width * rect.height;
  };

  /**
   * Draws the rectangles on the canvas and calculates the IOU value.
   */
  const drawCanvas = useCallback(() => {
    const canvas = canvasRef.current;
    if (!canvas) return;
    const ctx = canvas.getContext('2d');
    if (!ctx) return;

    // Clear canvas
    ctx.clearRect(0, 0, canvas.width, canvas.height);

    // Calculate IOU; guard against division by zero for degenerate rectangles
    const bbox1 = getBbox(rect1);
    const bbox2 = getBbox(rect2);
    const intersection = calcIntersection(bbox1, bbox2);
    const union = calcArea(rect1) + calcArea(rect2) - intersection;
    setActualIOU(union > 0 ? intersection / union : 0);

    // Draw rectangles with outlines and labels
    [rect1, rect2].forEach((rect) => {
      ctx.fillStyle = rect.fill;
      ctx.fillRect(rect.left, rect.top, rect.width, rect.height);
      ctx.strokeStyle = '#000';
      ctx.lineWidth = 2;
      ctx.strokeRect(rect.left, rect.top, rect.width, rect.height);
      ctx.fillStyle = '#000';
      // A CSS font shorthand requires a family; '12px' alone is invalid and
      // would be silently ignored by the canvas.
      ctx.font = '12px sans-serif';
      ctx.fillText(rect.name, rect.left + 5, rect.top + 15);
    });
  }, [rect1, rect2]);

  useEffect(() => {
    drawCanvas();
  }, [drawCanvas]);

  return (
    <div className="">
      <h3 className="text-sm font-semibold ">{title}</h3>
      <div className="flex items-start gap-6">
        <div>
          <canvas ref={canvasRef} width={200} height={150} className="border bg-white rounded-md" />
          <div className="mt-2 text-sm">
            <div className="font-mono mb-2">IOU = {actualIOU.toFixed(3)}</div>
            <span className="">{description}</span>
          </div>
        </div>
      </div>
    </div>
  );
}
```
--------------------------------------------------------------------------------
/libs/python/computer/computer/providers/base.py:
--------------------------------------------------------------------------------
```python
"""Base provider interface for VM backends."""
import abc
from enum import StrEnum
from typing import Any, AsyncContextManager, Dict, Optional
from .types import ListVMsResponse
class VMProviderType(StrEnum):
    """Enum of supported VM provider types.

    Members compare equal to their lowercase string values (StrEnum), so
    they can be used directly where a plain provider-name string is expected.
    NOTE: ``enum.StrEnum`` requires Python 3.11+.
    """

    LUME = "lume"
    LUMIER = "lumier"
    CLOUD = "cloud"
    WINSANDBOX = "winsandbox"
    DOCKER = "docker"
    # Fallback for unrecognized provider names
    UNKNOWN = "unknown"
class BaseVMProvider(AsyncContextManager):
    """Base interface for VM providers.

    All VM provider implementations must implement this interface.

    Inherits from ``AsyncContextManager`` so concrete providers are used as
    ``async with provider: ...`` for resource setup/teardown.
    """

    @property
    @abc.abstractmethod
    def provider_type(self) -> VMProviderType:
        """Get the provider type."""
        pass

    @abc.abstractmethod
    async def get_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]:
        """Get VM information by name.

        Args:
            name: Name of the VM to get information for
            storage: Optional storage path override. If provided, this will be used
                instead of the provider's default storage path.

        Returns:
            Dictionary with VM information including status, IP address, etc.
        """
        pass

    @abc.abstractmethod
    async def list_vms(self) -> ListVMsResponse:
        """List all available VMs.

        Returns:
            ListVMsResponse: A list of minimal VM objects as defined in
            `computer.providers.types.MinimalVM`.
        """
        pass

    @abc.abstractmethod
    async def run_vm(
        self, image: str, name: str, run_opts: Dict[str, Any], storage: Optional[str] = None
    ) -> Dict[str, Any]:
        """Run a VM by name with the given options.

        Args:
            image: Name/tag of the image to use
            name: Name of the VM to run
            run_opts: Dictionary of run options (memory, cpu, etc.)
            storage: Optional storage path override. If provided, this will be used
                instead of the provider's default storage path.

        Returns:
            Dictionary with VM run status and information
        """
        pass

    @abc.abstractmethod
    async def stop_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]:
        """Stop a VM by name.

        Args:
            name: Name of the VM to stop
            storage: Optional storage path override. If provided, this will be used
                instead of the provider's default storage path.

        Returns:
            Dictionary with VM stop status and information
        """
        pass

    @abc.abstractmethod
    async def restart_vm(self, name: str, storage: Optional[str] = None) -> Dict[str, Any]:
        """Restart a VM by name.

        Args:
            name: Name of the VM to restart
            storage: Optional storage path override. If provided, this will be used
                instead of the provider's default storage path.

        Returns:
            Dictionary with VM restart status and information
        """
        pass

    @abc.abstractmethod
    async def update_vm(
        self, name: str, update_opts: Dict[str, Any], storage: Optional[str] = None
    ) -> Dict[str, Any]:
        """Update VM configuration.

        Args:
            name: Name of the VM to update
            update_opts: Dictionary of update options (memory, cpu, etc.)
            storage: Optional storage path override. If provided, this will be used
                instead of the provider's default storage path.

        Returns:
            Dictionary with VM update status and information
        """
        pass

    @abc.abstractmethod
    async def get_ip(self, name: str, storage: Optional[str] = None, retry_delay: int = 2) -> str:
        """Get the IP address of a VM, waiting indefinitely until it's available.

        Note: this method blocks (awaiting between retries) until an IP is
        obtained; callers wanting a timeout must wrap it themselves.

        Args:
            name: Name of the VM to get the IP for
            storage: Optional storage path override. If provided, this will be used
                instead of the provider's default storage path.
            retry_delay: Delay between retries in seconds (default: 2)

        Returns:
            IP address of the VM when it becomes available
        """
        pass
```
--------------------------------------------------------------------------------
/blog/cua-hackathon.md:
--------------------------------------------------------------------------------
```markdown
# Computer-Use Agents SOTA Challenge: Hack the North + Global Online
_Published on August 25, 2025 by Francesco Bonacci_
We’re bringing something new to [Hack the North](https://hackthenorth.com), Canada’s largest hackathon, this year: a head-to-head competition for **Computer-Use Agents** - on-site at Waterloo and a **Global online challenge**. From September 12–14, 2025, teams build on the **Cua Agent Framework** and are scored in **HUD’s OSWorld-Verified** environment to push past today’s SOTA on [OS-World](https://os-world.github.io).
<img src="./assets/hack-the-north.png">
## Track A: On-site @ Hack the North
There’s one global leaderboard: **Cua - Best State-of-the-Art Computer-Use Agent**. Use any model setup you like (cloud or local). After projects are submitted, [HUD](https://www.hud.so) runs the official benchmark; the top team earns a **guaranteed YC partner interview (W26 batch)**. We’ll also feature winners on our blog and socials and kit the team out with swag.
## Track B: Cua Global Online Hackathon
**Cua** and [**Ollama**](https://ollama.com) organize a global hackathon to find the **most creative uses of local and hybrid computer-use agents**. There are no geographic restrictions on who can join — this is a worldwide competition focused on **originality, impact, and inventive applications** that showcase what's possible with local and hybrid inference.
**Prizes:**
- 1st **MacBook Air M4 (or equivalent value)** + features in Cua & Ollama channels
- 2nd **$500 CAD + swag**
- 3rd **swag + public feature**
---
## How it works
Two different tracks, two different processes:
### On-site (Track A)
Build during the weekend and submit a repo with a one-line start command. **HUD** executes your command in a clean environment and runs **OSWorld-Verified**. Scores come from official benchmark results; ties break by median, then wall-clock time, then earliest submission. Any model setup is allowed (cloud or local).
**HUD** runs official evaluations immediately after submission. Winners are announced at the **closing ceremony**.
### Rules
- Fork and star the [Cua repo](https://github.com/trycua/cua).
- Add your agent and instructions in `samples/community/hack-the-north/<YOUR_TEAM_NAME>`.
- Include a README with details on the approach and any required notes.
- Submit a PR.
**Deadline: Sept 15, 8:00 AM EDT**
### Global Online (Track B)
Open to anyone, anywhere. Build on your own timeline and submit through the **Cua Discord form** by the deadline.
**Project Requirements:**
- Your agent must integrate **Cua and Ollama** in some way
- Your agent must be **easily runnable by judges**
Judged by **Cua** and **Ollama** teams on:
- **Creativity (30%)** – originality, usefulness, surprise factor
- **Technical Depth (30%)** – quality of engineering and agent design
- **Use of Ollama (30%)** – effective integration of local/hybrid inference
- **Polish (10%)** – presentation, clarity, demo readiness
### Submission Process
Submissions will be collected via a **form link provided in the Cua Discord**. Your submission must contain:
- **GitHub repo** containing the agent source code and a clear README with instructions on how to use the agent
- **Explanation** of the models and tools used, and what's local or hybrid about your design
- **Short demo video** (up to two minutes)
A **commit freeze** will be used to ensure that no changes are made after the deadline. Winners will be announced after judging is complete.
**Deadline: Sept 28, 11:59 PM UTC (extended due to popular demand!)**
---
## Join us
Bring a team, pick a model stack, and push what agents can do on real computers. We can’t wait to see what you build at **Hack the North 2025**.
**Discord channels**
- Join the Discord first: https://discord.gg/cua-ai
- **#hack-the-north (on-site):** https://discord.com/channels/1328377437301641247/1409508526774157342
- **#global-online (Ollama × Cua):** https://discord.com/channels/1328377437301641247/1409518100491145226
**Contact**
Questions on Hack the North? Email **[email protected]**.
_P.S. If you’re planning ahead, start with the Cua Agent Framework and OSWorld-Verified docs at cua.ai/docs; we’ll share office-hour times in both Discord channels._
```
--------------------------------------------------------------------------------
/.github/workflows/bump-version.yml:
--------------------------------------------------------------------------------
```yaml
name: Bump Version & Publish
on:
workflow_dispatch:
inputs:
service:
description: "Service/Package to bump"
required: true
type: choice
options:
- cua-agent
- cua-computer
- cua-computer-server
- cua-core
- cua-mcp-server
- cua-som
- pylume
bump_type:
description: "Version bump type"
required: true
type: choice
options:
- patch
- minor
- major
permissions:
contents: write
jobs:
bump-version:
runs-on: ubuntu-latest
outputs:
agent_version: ${{ steps.agent_version.outputs.version }}
computer_version: ${{ steps.computer_version.outputs.version }}
steps:
- name: Set package directory
id: package
run: |
case "${{ inputs.service }}" in
"cua-agent")
echo "directory=libs/python/agent" >> $GITHUB_OUTPUT
;;
"cua-computer")
echo "directory=libs/python/computer" >> $GITHUB_OUTPUT
;;
"cua-computer-server")
echo "directory=libs/python/computer-server" >> $GITHUB_OUTPUT
;;
"cua-core")
echo "directory=libs/python/core" >> $GITHUB_OUTPUT
;;
"cua-mcp-server")
echo "directory=libs/python/mcp-server" >> $GITHUB_OUTPUT
;;
"cua-som")
echo "directory=libs/python/som" >> $GITHUB_OUTPUT
;;
"pylume")
echo "directory=libs/python/pylume" >> $GITHUB_OUTPUT
;;
*)
echo "Unknown service: ${{ inputs.service }}"
exit 1
;;
esac
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 0
token: ${{ secrets.GITHUB_TOKEN }}
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install bump2version
run: pip install bump2version
- name: Configure Git
run: |
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
- name: Run bump2version
run: |
cd ${{ steps.package.outputs.directory }}
bump2version ${{ inputs.bump_type }}
- name: Also bump cua-agent
if: ${{ inputs.service == 'cua-computer' }}
run: |
cd libs/python/agent
bump2version ${{ inputs.bump_type }}
- name: Capture bumped agent version
if: ${{ inputs.service == 'cua-agent' || inputs.service == 'cua-computer' }}
id: agent_version
run: |
cd libs/python/agent
VERSION=$(python -c "import tomllib; from pathlib import Path; data = tomllib.loads(Path('pyproject.toml').read_text()); print(data['project']['version'])")
echo "Agent version: $VERSION"
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
- name: Capture bumped computer version
if: ${{ inputs.service == 'cua-computer' }}
id: computer_version
run: |
cd libs/python/computer
VERSION=$(python -c "import tomllib; from pathlib import Path; data = tomllib.loads(Path('pyproject.toml').read_text()); print(data['project']['version'])")
echo "Computer version: $VERSION"
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
- name: Push changes
run: |
git push origin main --follow-tags
publish-computer:
needs: bump-version
if: ${{ inputs.service == 'cua-computer' }}
uses: ./.github/workflows/pypi-publish-computer.yml
with:
version: ${{ needs.bump-version.outputs.computer_version }}
secrets: inherit
publish-agent:
needs: [bump-version, publish-computer]
if: ${{ always() && (inputs.service == 'cua-agent' || inputs.service == 'cua-computer') && needs.bump-version.result == 'success' && (inputs.service == 'cua-agent' || needs.publish-computer.result == 'success') }}
uses: ./.github/workflows/pypi-publish-agent.yml
with:
version: ${{ needs.bump-version.outputs.agent_version }}
secrets: inherit
```
--------------------------------------------------------------------------------
/examples/computer_examples.py:
--------------------------------------------------------------------------------
```python
import asyncio
import os
import sys
import traceback
from pathlib import Path
# Load environment variables from .env file
project_root = Path(__file__).parent.parent
env_file = project_root / ".env"
print(f"Loading environment from: {env_file}")
from dotenv import load_dotenv
load_dotenv(env_file)
# Add paths to sys.path if needed
pythonpath = os.environ.get("PYTHONPATH", "")
for path in pythonpath.split(":"):
if path and path not in sys.path:
sys.path.insert(0, path) # Insert at beginning to prioritize
print(f"Added to sys.path: {path}")
from computer.computer import Computer
from computer.logger import LogLevel
from computer.providers.base import VMProviderType
async def main():
    """Demonstrate direct Computer initialization and basic interface calls.

    Boots a local macOS VM via the Lume provider, then exercises screenshot,
    accessibility, mouse, keyboard, and clipboard APIs. All work is wrapped
    so the VM is always stopped and errors are printed with a traceback.
    """
    try:
        print("\n=== Using direct initialization ===")

        # Create a local macOS computer
        # NOTE(review): storage/shared_directories contain a <USER> placeholder;
        # adjust these paths before running the example.
        computer = Computer(
            display="1024x768",
            memory="8GB",
            cpu="4",
            os_type="macos",
            name="macos",
            verbosity=LogLevel.VERBOSE,
            provider_type=VMProviderType.LUME,
            storage="/Users/<USER>/repos/trycua/computer/examples/storage",
            shared_directories=["/Users/<USER>/repos/trycua/computer/examples/shared"],
            ephemeral=False,
        )

        # Create a remote Linux computer with Cua
        # computer = Computer(
        #     os_type="linux",
        #     api_key=os.getenv("CUA_API_KEY"),
        #     name=os.getenv("CONTAINER_NAME"),
        #     provider_type=VMProviderType.CLOUD,
        # )

        try:
            # Run the computer with default parameters
            await computer.run()

            # Capture and persist an initial screenshot
            screenshot = await computer.interface.screenshot()

            # Create output directory if it doesn't exist
            output_dir = Path("./output")
            output_dir.mkdir(exist_ok=True)
            screenshot_path = output_dir / "screenshot.png"
            with open(screenshot_path, "wb") as f:
                f.write(screenshot)
            print(f"Screenshot saved to: {screenshot_path.absolute()}")

            # await computer.interface.hotkey("command", "space")
            # res = await computer.interface.run_command("touch ./Downloads/empty_file")
            # print(f"Run command result: {res}")

            accessibility_tree = await computer.interface.get_accessibility_tree()
            print(f"Accessibility tree: {accessibility_tree}")

            # Screen Actions Examples
            # print("\n=== Screen Actions ===")
            # screenshot = await computer.interface.screenshot()
            # with open("screenshot_direct.png", "wb") as f:
            #     f.write(screenshot)

            screen_size = await computer.interface.get_screen_size()
            print(f"Screen size: {screen_size}")

            # Demonstrate coordinate conversion between screen and screenshot space
            center_x, center_y = 733, 736
            print(f"Center in screen coordinates: ({center_x}, {center_y})")
            screenshot_center = await computer.to_screenshot_coordinates(center_x, center_y)
            print(f"Center in screenshot coordinates: {screenshot_center}")
            screen_center = await computer.to_screen_coordinates(*screenshot_center)
            print(f"Back to screen coordinates: {screen_center}")

            # Mouse Actions Examples
            print("\n=== Mouse Actions ===")
            await computer.interface.move_cursor(100, 100)
            await computer.interface.left_click()
            await computer.interface.right_click(300, 300)
            await computer.interface.double_click(400, 400)

            # Keyboard Actions Examples
            print("\n=== Keyboard Actions ===")
            await computer.interface.type_text("Hello, World!")
            await computer.interface.press_key("enter")

            # Clipboard Actions Examples
            print("\n=== Clipboard Actions ===")
            await computer.interface.set_clipboard("Test clipboard")
            content = await computer.interface.copy_to_clipboard()
            print(f"Clipboard content: {content}")
        finally:
            # Important to clean up resources
            await computer.stop()
    except Exception as e:
        print(f"Error in main: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    asyncio.run(main())
```
--------------------------------------------------------------------------------
/libs/lume/src/Virtualization/DHCPLeaseParser.swift:
--------------------------------------------------------------------------------
```swift
import Foundation
/// Represents a DHCP lease entry from the system's DHCP lease file
/// Represents a DHCP lease entry from the system's DHCP lease file
private struct DHCPLease {
    let macAddress: String
    let ipAddress: String
    let expirationDate: Date

    /// Creates a lease entry from raw DHCP lease file key-value pairs
    /// - Parameter dict: Dictionary containing the raw lease data
    /// - Returns: A DHCPLease instance if the data is valid, nil otherwise
    static func from(_ dict: [String: String]) -> DHCPLease? {
        guard let hwAddress = dict["hw_address"],
              let ipAddress = dict["ip_address"],
              let lease = dict["lease"] else {
            return nil
        }

        // Parse MAC address from hw_address field (format can be "1,xx:xx:xx:xx:xx:xx" or "ff,...")
        let hwParts = hwAddress.split(separator: ",")
        guard hwParts.count >= 2 else { return nil }

        // Get the MAC part after the prefix and normalize it
        let rawMacAddress = String(hwParts[1]).trimmingCharacters(in: .whitespaces)

        // Normalize the MAC address by ensuring each component is two digits
        let normalizedMacAddress = rawMacAddress.split(separator: ":")
            .map { component in
                let hex = String(component)
                return hex.count == 1 ? "0\(hex)" : hex
            }
            .joined(separator: ":")

        // Convert hex timestamp (e.g. "0x61a80000") to Date.
        // BUG FIX: the previous trimmingCharacters(in: CharacterSet(charactersIn: "0x"))
        // stripped '0'/'x' characters from BOTH ends, silently dropping significant
        // trailing zeros of the hex value (e.g. "0x61a80000" -> "61a8") and
        // producing a wildly wrong expiration date. Strip only a literal "0x"/"0X"
        // prefix instead.
        let timestampHex = lease.lowercased().hasPrefix("0x")
            ? String(lease.dropFirst(2))
            : lease
        guard let timestamp = UInt64(timestampHex, radix: 16) else { return nil }
        let expirationDate = Date(timeIntervalSince1970: TimeInterval(timestamp))

        return DHCPLease(
            macAddress: normalizedMacAddress,
            ipAddress: ipAddress,
            expirationDate: expirationDate
        )
    }

    /// Checks if the lease is currently valid (i.e. has not yet expired)
    var isValid: Bool {
        expirationDate > Date()
    }
}
/// Parses DHCP lease files to retrieve IP addresses for VMs based on their MAC addresses
/// Parses DHCP lease files to retrieve IP addresses for VMs based on their MAC addresses
enum DHCPLeaseParser {
    /// macOS system DHCP daemon lease database (requires read access)
    private static let leasePath = "/var/db/dhcpd_leases"

    /// Retrieves the IP address for a given MAC address from the DHCP lease file
    /// - Parameter macAddress: The MAC address to look up
    /// - Returns: The IP address if found, nil otherwise
    ///
    /// NOTE(review): matches the first lease with this MAC regardless of
    /// `isValid` (expiration is not checked) — confirm whether expired
    /// leases should be skipped.
    static func getIPAddress(forMAC macAddress: String) -> String? {
        guard let leaseContents = try? String(contentsOfFile: leasePath, encoding: .utf8) else {
            return nil
        }

        // Normalize the input MAC address to ensure consistent format
        // (each component padded to two hex digits, matching DHCPLease.from)
        let normalizedMacAddress = macAddress.split(separator: ":").map { component in
            let hex = String(component)
            return hex.count == 1 ? "0\(hex)" : hex
        }.joined(separator: ":")

        let leases = try? parseDHCPLeases(leaseContents)
        return leases?.first { lease in
            lease.macAddress == normalizedMacAddress
        }?.ipAddress
    }

    /// Parses the contents of a DHCP lease file into lease entries.
    /// Entries are brace-delimited blocks of `key=value` lines; malformed
    /// blocks are skipped (DHCPLease.from returns nil for them).
    /// - Parameter contents: The raw contents of the lease file
    /// - Returns: Array of parsed lease entries
    private static func parseDHCPLeases(_ contents: String) throws -> [DHCPLease] {
        var leases: [DHCPLease] = []
        var currentLease: [String: String] = [:]
        var inLeaseBlock = false

        let lines = contents.components(separatedBy: .newlines)
        for line in lines {
            let trimmedLine = line.trimmingCharacters(in: .whitespaces)
            if trimmedLine == "{" {
                // Start of a lease block: reset accumulator
                inLeaseBlock = true
                currentLease = [:]
            } else if trimmedLine == "}" {
                // End of block: materialize the lease if its fields are valid
                if let lease = DHCPLease.from(currentLease) {
                    leases.append(lease)
                }
                inLeaseBlock = false
            } else if inLeaseBlock {
                // maxSplits: 1 keeps '=' characters inside values intact
                let parts = trimmedLine.split(separator: "=", maxSplits: 1)
                if parts.count == 2 {
                    let key = String(parts[0]).trimmingCharacters(in: .whitespaces)
                    let value = String(parts[1]).trimmingCharacters(in: .whitespaces)
                    currentLease[key] = value
                }
            }
        }

        return leases
    }
}
```
--------------------------------------------------------------------------------
/blog/trajectory-viewer.md:
--------------------------------------------------------------------------------
```markdown
# Trajectory Viewer for Cua
_Published on May 13, 2025 by Dillon DuPont_
Don’t forget to check out [Part 1: Building your own Computer-Use Operator](build-your-own-operator-on-macos-1) and [Part 2: Using the Agent framework](build-your-own-operator-on-macos-2) for setting up your Cua environment and basic tips and tricks!
## Introduction
Okay, so you’ve gotten your environment up and also tested a few agent runs. You’ll likely have encountered cases where your agent succeeded at some tasks, as well as cases where it got stuck or outright failed.
Now what?
If you’ve ever wondered exactly what your computer agent is doing and why it sometimes doesn’t do what you expected, then the Trajectory Viewer for Cua is here to help! Whether you’re a seasoned developer or someone who just wants to dive in and see results, this tool makes it easy to explore every step your agent takes on your screen.
Plus, if you want to start thinking about generating data to train your own agentic model (we’ll cover training in an upcoming blog, so look forward to it), then our Trajectory Viewer might be for you.
## So, what’s a “trajectory”?
Think of a trajectory as a detailed video recording of your agent’s journey:
- **Observations**: What did the agent see (the exact screen content) at each point in time?
- **Actions**: What clicks, keystrokes, or commands did it perform in response?
- **Decisions**: Which options did it choose, and why?
Especially for longer and more complex tasks, your agent will make multiple steps, take multiple actions, and make multiple observations. By examining this record, you can pinpoint where things go right, and more importantly, where they go wrong.
## So, what’s Cua’s Trajectory Viewer and why use it?
The Trajectory Viewer for Cua is a GUI tool that helps you explore saved trajectories generated from your Cua computer agent runs. This tool provides a powerful way to:
- **Debug your agents**: See exactly what your agent saw to reproduce bugs
- **Analyze failure cases**: Identify the moment when your agent went off-script
- **Collect training data**: Export your trajectories for your own processing, training, and more!
The viewer allows you to see exactly what your agent observed and how it interacted with the computer all through your browser.
## Opening Trajectory Viewer in 3 Simple Steps
1. **Visit**: Open your browser and go to [https://cua.ai/trajectory-viewer](https://cua.ai/trajectory-viewer).
2. **Upload**: Drag and drop a trajectories folder or click Select Folder.
3. **Explore**: View your agent’s trajectories! All data stays in your browser unless you give permission otherwise.

## Recording a Trajectory
### Using the ComputerAgent API
Trajectories are saved by default when using the ComputerAgent API:
```python
agent.run("book a flight for me")
```
You can explicitly control trajectory saving with the `save_trajectory` parameter:
```python
from cua import ComputerAgent
agent = ComputerAgent(save_trajectory=True)
agent.run("search for hotels in Boston")
```
Each trajectory folder is saved in a `trajectories` directory with a timestamp format, for example: `trajectories/20250501_222749`
## Exploring and Analyzing Trajectories
Our Trajectory Viewer is designed to allow for thorough analysis and debugging in a friendly way. Once loaded, the viewer presents:
- **Timeline Slider**: Jump to any step in the session
- **Screen Preview**: See exactly what the agent saw
- **Action Details**: Review clicks, keypresses, and API calls
- **Logs & Metadata**: Inspect debug logs or performance stats
Use these features to:
- Step through each action and observation; understand your agent’s decision-making
- Understand why and where your agent failed
- Collect insights for improving your instructions, prompts, tasks, agent, etc.
The trajectory viewer provides a visual interface for stepping through each action your agent took, making it easy to see what your agent “sees”.
## Getting Started
Ready to see your agent in action? Head over to the Trajectory Viewer and load up your first session. Debug smarter, train faster, and stay in control (all within your browser).
Happy tinkering and Cua on!
Have questions or want to share feedback? Join our community on Discord or open an issue on GitHub.
```
--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/mcp-server/installation.mdx:
--------------------------------------------------------------------------------
```markdown
---
title: Installation
---
Install the package from PyPI:
```bash
pip install cua-mcp-server
```
This will install:
- The MCP server
- CUA agent and computer dependencies
- An executable `cua-mcp-server` script in your PATH
## Easy Setup Script
If you want to simplify installation, you can use this one-liner to download and run the installation script:
```bash
curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/python/mcp-server/scripts/install_mcp_server.sh | bash
```
This script will:
- Create the ~/.cua directory if it doesn't exist
- Generate a startup script at ~/.cua/start_mcp_server.sh
- Make the script executable
- The startup script automatically manages Python virtual environments and installs/updates the cua-mcp-server package
You can then use the script in your MCP configuration like this:
```json
{
"mcpServers": {
"cua-agent": {
"command": "/bin/bash",
"args": ["~/.cua/start_mcp_server.sh"],
"env": {
"CUA_MODEL_NAME": "anthropic/claude-sonnet-4-20250514",
"ANTHROPIC_API_KEY": "your-anthropic-api-key-here"
}
}
}
}
```
**Important**: You must include your Anthropic API key for the MCP server to work properly.
## Development Setup
If you're working with the CUA source code directly (like in the CUA repository), you can use the development script instead:
```json
{
"mcpServers": {
"cua-agent": {
"command": "/usr/bin/env",
"args": [
"bash",
"-lc",
"export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
]
}
}
}
```
**For host computer control** (development setup):
1. **Install Computer Server Dependencies**:
```bash
python3 -m pip install uvicorn fastapi
python3 -m pip install -e libs/python/computer-server --break-system-packages
```
2. **Start the Computer Server**:
```bash
cd /path/to/cua
python -m computer_server --log-level debug
```
This will start the computer server on `http://localhost:8000` that controls your actual desktop.
3. **Configure Claude Desktop**:
```json
{
"mcpServers": {
"cua-agent": {
"command": "/usr/bin/env",
"args": [
"bash",
"-lc",
"export CUA_MODEL_NAME='anthropic/claude-sonnet-4-20250514'; export ANTHROPIC_API_KEY='your-anthropic-api-key-here'; export CUA_USE_HOST_COMPUTER_SERVER='true'; export CUA_MAX_IMAGES='1'; /path/to/cua/libs/python/mcp-server/scripts/start_mcp_server.sh"
]
}
}
}
```
**Note**: Replace `/path/to/cua` with the absolute path to your CUA repository directory.
**⚠️ Important**: When using host computer control (`CUA_USE_HOST_COMPUTER_SERVER='true'`), the AI will have direct access to your desktop and can perform actions like opening applications, clicking, typing, and taking screenshots. Make sure you're comfortable with this level of access.
### Troubleshooting
**Common Issues:**
1. **"Claude's response was interrupted"** - This usually means:
- Missing API key: Add `ANTHROPIC_API_KEY` to your environment variables
- Invalid model name: Use a valid model like `anthropic/claude-sonnet-4-20250514`
- Check logs for specific error messages
2. **"Missing Anthropic API Key"** - Add your API key to the configuration:
```json
"env": {
"ANTHROPIC_API_KEY": "your-api-key-here"
}
```
3. **"model not found"** - Use a valid model name:
- ✅ `anthropic/claude-sonnet-4-20250514`
4. **Script not found** - If you get a `/bin/bash: ~/cua/libs/python/mcp-server/scripts/start_mcp_server.sh: No such file or directory` error, try changing the path to the script to be absolute instead of relative.
5. **Host Computer Control Issues** - If using `CUA_USE_HOST_COMPUTER_SERVER='true'`:
- **Computer Server not running**: Make sure you've started the computer server with `python -m computer_server --log-level debug`
- **Port 8000 in use**: Check if another process is using port 8000 with `lsof -i :8000`
- **Missing dependencies**: Install `uvicorn` and `fastapi` with `python3 -m pip install uvicorn fastapi`
- **Image size errors**: Use `CUA_MAX_IMAGES='1'` to reduce image context size
**Viewing Logs:**
```bash
tail -n 20 -f ~/Library/Logs/Claude/mcp*.log
```
```