This is page 5 of 20. Use http://codebase.md/trycua/cua?lines=false&page={x} to view the full context.

# Directory Structure

```
├── .cursorignore
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── scripts
│   │   ├── get_pyproject_version.py
│   │   └── tests
│   │       ├── __init__.py
│   │       ├── README.md
│   │       └── test_get_pyproject_version.py
│   └── workflows
│       ├── bump-version.yml
│       ├── ci-lume.yml
│       ├── docker-publish-cua-linux.yml
│       ├── docker-publish-cua-windows.yml
│       ├── docker-publish-kasm.yml
│       ├── docker-publish-xfce.yml
│       ├── docker-reusable-publish.yml
│       ├── link-check.yml
│       ├── lint.yml
│       ├── npm-publish-cli.yml
│       ├── npm-publish-computer.yml
│       ├── npm-publish-core.yml
│       ├── publish-lume.yml
│       ├── pypi-publish-agent.yml
│       ├── pypi-publish-computer-server.yml
│       ├── pypi-publish-computer.yml
│       ├── pypi-publish-core.yml
│       ├── pypi-publish-mcp-server.yml
│       ├── pypi-publish-som.yml
│       ├── pypi-reusable-publish.yml
│       ├── python-tests.yml
│       ├── test-cua-models.yml
│       └── test-validation-script.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .prettierrc.yaml
├── .vscode
│   ├── docs.code-workspace
│   ├── extensions.json
│   ├── launch.json
│   ├── libs-ts.code-workspace
│   ├── lume.code-workspace
│   ├── lumier.code-workspace
│   ├── py.code-workspace
│   └── settings.json
├── blog
│   ├── app-use.md
│   ├── assets
│   │   ├── composite-agents.png
│   │   ├── docker-ubuntu-support.png
│   │   ├── hack-booth.png
│   │   ├── hack-closing-ceremony.jpg
│   │   ├── hack-cua-ollama-hud.jpeg
│   │   ├── hack-leaderboard.png
│   │   ├── hack-the-north.png
│   │   ├── hack-winners.jpeg
│   │   ├── hack-workshop.jpeg
│   │   ├── hud-agent-evals.png
│   │   └── trajectory-viewer.jpeg
│   ├── bringing-computer-use-to-the-web.md
│   ├── build-your-own-operator-on-macos-1.md
│   ├── build-your-own-operator-on-macos-2.md
│   ├── cloud-windows-ga-macos-preview.md
│   ├── composite-agents.md
│   ├── computer-use-agents-for-growth-hacking.md
│   ├── cua-hackathon.md
│   ├── cua-playground-preview.md
│   ├── cua-vlm-router.md
│   ├── hack-the-north.md
│   ├── hud-agent-evals.md
│   ├── human-in-the-loop.md
│   ├── introducing-cua-cli.md
│   ├── introducing-cua-cloud-containers.md
│   ├── lume-to-containerization.md
│   ├── neurips-2025-cua-papers.md
│   ├── sandboxed-python-execution.md
│   ├── training-computer-use-models-trajectories-1.md
│   ├── trajectory-viewer.md
│   ├── ubuntu-docker-support.md
│   └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│   ├── .env.example
│   ├── .gitignore
│   ├── content
│   │   └── docs
│   │       ├── agent-sdk
│   │       │   ├── agent-loops.mdx
│   │       │   ├── benchmarks
│   │       │   │   ├── index.mdx
│   │       │   │   ├── interactive.mdx
│   │       │   │   ├── introduction.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── osworld-verified.mdx
│   │       │   │   ├── screenspot-pro.mdx
│   │       │   │   └── screenspot-v2.mdx
│   │       │   ├── callbacks
│   │       │   │   ├── agent-lifecycle.mdx
│   │       │   │   ├── cost-saving.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── logging.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── pii-anonymization.mdx
│   │       │   │   └── trajectories.mdx
│   │       │   ├── chat-history.mdx
│   │       │   ├── custom-tools.mdx
│   │       │   ├── customizing-computeragent.mdx
│   │       │   ├── integrations
│   │       │   │   ├── hud.mdx
│   │       │   │   ├── meta.json
│   │       │   │   └── observability.mdx
│   │       │   ├── mcp-server
│   │       │   │   ├── client-integrations.mdx
│   │       │   │   ├── configuration.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── llm-integrations.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── tools.mdx
│   │       │   │   └── usage.mdx
│   │       │   ├── message-format.mdx
│   │       │   ├── meta.json
│   │       │   ├── migration-guide.mdx
│   │       │   ├── prompt-caching.mdx
│   │       │   ├── supported-agents
│   │       │   │   ├── composed-agents.mdx
│   │       │   │   ├── computer-use-agents.mdx
│   │       │   │   ├── grounding-models.mdx
│   │       │   │   ├── human-in-the-loop.mdx
│   │       │   │   └── meta.json
│   │       │   ├── supported-model-providers
│   │       │   │   ├── cua-vlm-router.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   └── local-models.mdx
│   │       │   ├── telemetry.mdx
│   │       │   └── usage-tracking.mdx
│   │       ├── cli-playbook
│   │       │   ├── commands.mdx
│   │       │   ├── index.mdx
│   │       │   └── meta.json
│   │       ├── computer-sdk
│   │       │   ├── cloud-vm-management.mdx
│   │       │   ├── commands.mdx
│   │       │   ├── computer-server
│   │       │   │   ├── Commands.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── REST-API.mdx
│   │       │   │   └── WebSocket-API.mdx
│   │       │   ├── computer-ui.mdx
│   │       │   ├── computers.mdx
│   │       │   ├── custom-computer-handlers.mdx
│   │       │   ├── meta.json
│   │       │   ├── sandboxed-python.mdx
│   │       │   └── tracing-api.mdx
│   │       ├── example-usecases
│   │       │   ├── form-filling.mdx
│   │       │   ├── gemini-complex-ui-navigation.mdx
│   │       │   ├── meta.json
│   │       │   ├── post-event-contact-export.mdx
│   │       │   └── windows-app-behind-vpn.mdx
│   │       ├── get-started
│   │       │   ├── meta.json
│   │       │   └── quickstart.mdx
│   │       ├── index.mdx
│   │       ├── macos-vm-cli-playbook
│   │       │   ├── lume
│   │       │   │   ├── cli-reference.mdx
│   │       │   │   ├── faq.md
│   │       │   │   ├── http-api.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── meta.json
│   │       │   │   └── prebuilt-images.mdx
│   │       │   ├── lumier
│   │       │   │   ├── building-lumier.mdx
│   │       │   │   ├── docker-compose.mdx
│   │       │   │   ├── docker.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   └── meta.json
│   │       │   └── meta.json
│   │       └── meta.json
│   ├── next.config.mjs
│   ├── package-lock.json
│   ├── package.json
│   ├── pnpm-lock.yaml
│   ├── postcss.config.mjs
│   ├── public
│   │   └── img
│   │       ├── agent_gradio_ui.png
│   │       ├── agent.png
│   │       ├── bg-dark.jpg
│   │       ├── bg-light.jpg
│   │       ├── cli.png
│   │       ├── computer.png
│   │       ├── grounding-with-gemini3.gif
│   │       ├── hero.png
│   │       ├── laminar_trace_example.png
│   │       ├── som_box_threshold.png
│   │       └── som_iou_threshold.png
│   ├── README.md
│   ├── source.config.ts
│   ├── src
│   │   ├── app
│   │   │   ├── (home)
│   │   │   │   ├── [[...slug]]
│   │   │   │   │   └── page.tsx
│   │   │   │   └── layout.tsx
│   │   │   ├── api
│   │   │   │   ├── posthog
│   │   │   │   │   └── [...path]
│   │   │   │   │       └── route.ts
│   │   │   │   └── search
│   │   │   │       └── route.ts
│   │   │   ├── favicon.ico
│   │   │   ├── global.css
│   │   │   ├── layout.config.tsx
│   │   │   ├── layout.tsx
│   │   │   ├── llms.mdx
│   │   │   │   └── [[...slug]]
│   │   │   │       └── route.ts
│   │   │   ├── llms.txt
│   │   │   │   └── route.ts
│   │   │   ├── robots.ts
│   │   │   └── sitemap.ts
│   │   ├── assets
│   │   │   ├── discord-black.svg
│   │   │   ├── discord-white.svg
│   │   │   ├── logo-black.svg
│   │   │   └── logo-white.svg
│   │   ├── components
│   │   │   ├── analytics-tracker.tsx
│   │   │   ├── cookie-consent.tsx
│   │   │   ├── doc-actions-menu.tsx
│   │   │   ├── editable-code-block.tsx
│   │   │   ├── footer.tsx
│   │   │   ├── hero.tsx
│   │   │   ├── iou.tsx
│   │   │   ├── mermaid.tsx
│   │   │   └── page-feedback.tsx
│   │   ├── lib
│   │   │   ├── llms.ts
│   │   │   └── source.ts
│   │   ├── mdx-components.tsx
│   │   └── providers
│   │       └── posthog-provider.tsx
│   └── tsconfig.json
├── examples
│   ├── agent_examples.py
│   ├── agent_ui_examples.py
│   ├── browser_tool_example.py
│   ├── cloud_api_examples.py
│   ├── computer_examples_windows.py
│   ├── computer_examples.py
│   ├── computer_ui_examples.py
│   ├── computer-example-ts
│   │   ├── .env.example
│   │   ├── .gitignore
│   │   ├── package-lock.json
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── README.md
│   │   ├── src
│   │   │   ├── helpers.ts
│   │   │   └── index.ts
│   │   └── tsconfig.json
│   ├── docker_examples.py
│   ├── evals
│   │   ├── hud_eval_examples.py
│   │   └── wikipedia_most_linked.txt
│   ├── pylume_examples.py
│   ├── sandboxed_functions_examples.py
│   ├── som_examples.py
│   ├── tracing_examples.py
│   ├── utils.py
│   └── winsandbox_example.py
├── img
│   ├── agent_gradio_ui.png
│   ├── agent.png
│   ├── cli.png
│   ├── computer.png
│   ├── logo_black.png
│   └── logo_white.png
├── libs
│   ├── kasm
│   │   ├── Dockerfile
│   │   ├── LICENSE
│   │   ├── README.md
│   │   └── src
│   │       └── ubuntu
│   │           └── install
│   │               └── firefox
│   │                   ├── custom_startup.sh
│   │                   ├── firefox.desktop
│   │                   └── install_firefox.sh
│   ├── lume
│   │   ├── .cursorignore
│   │   ├── CONTRIBUTING.md
│   │   ├── Development.md
│   │   ├── img
│   │   │   └── cli.png
│   │   ├── Package.resolved
│   │   ├── Package.swift
│   │   ├── README.md
│   │   ├── resources
│   │   │   └── lume.entitlements
│   │   ├── scripts
│   │   │   ├── build
│   │   │   │   ├── build-debug.sh
│   │   │   │   ├── build-release-notarized.sh
│   │   │   │   └── build-release.sh
│   │   │   └── install.sh
│   │   ├── src
│   │   │   ├── Commands
│   │   │   │   ├── Clone.swift
│   │   │   │   ├── Config.swift
│   │   │   │   ├── Create.swift
│   │   │   │   ├── Delete.swift
│   │   │   │   ├── Get.swift
│   │   │   │   ├── Images.swift
│   │   │   │   ├── IPSW.swift
│   │   │   │   ├── List.swift
│   │   │   │   ├── Logs.swift
│   │   │   │   ├── Options
│   │   │   │   │   └── FormatOption.swift
│   │   │   │   ├── Prune.swift
│   │   │   │   ├── Pull.swift
│   │   │   │   ├── Push.swift
│   │   │   │   ├── Run.swift
│   │   │   │   ├── Serve.swift
│   │   │   │   ├── Set.swift
│   │   │   │   └── Stop.swift
│   │   │   ├── ContainerRegistry
│   │   │   │   ├── ImageContainerRegistry.swift
│   │   │   │   ├── ImageList.swift
│   │   │   │   └── ImagesPrinter.swift
│   │   │   ├── Errors
│   │   │   │   └── Errors.swift
│   │   │   ├── FileSystem
│   │   │   │   ├── Home.swift
│   │   │   │   ├── Settings.swift
│   │   │   │   ├── VMConfig.swift
│   │   │   │   ├── VMDirectory.swift
│   │   │   │   └── VMLocation.swift
│   │   │   ├── LumeController.swift
│   │   │   ├── Main.swift
│   │   │   ├── Server
│   │   │   │   ├── Handlers.swift
│   │   │   │   ├── HTTP.swift
│   │   │   │   ├── Requests.swift
│   │   │   │   ├── Responses.swift
│   │   │   │   └── Server.swift
│   │   │   ├── Utils
│   │   │   │   ├── CommandRegistry.swift
│   │   │   │   ├── CommandUtils.swift
│   │   │   │   ├── Logger.swift
│   │   │   │   ├── NetworkUtils.swift
│   │   │   │   ├── Path.swift
│   │   │   │   ├── ProcessRunner.swift
│   │   │   │   ├── ProgressLogger.swift
│   │   │   │   ├── String.swift
│   │   │   │   └── Utils.swift
│   │   │   ├── Virtualization
│   │   │   │   ├── DarwinImageLoader.swift
│   │   │   │   ├── DHCPLeaseParser.swift
│   │   │   │   ├── ImageLoaderFactory.swift
│   │   │   │   └── VMVirtualizationService.swift
│   │   │   ├── VM
│   │   │   │   ├── DarwinVM.swift
│   │   │   │   ├── LinuxVM.swift
│   │   │   │   ├── VM.swift
│   │   │   │   ├── VMDetails.swift
│   │   │   │   ├── VMDetailsPrinter.swift
│   │   │   │   ├── VMDisplayResolution.swift
│   │   │   │   └── VMFactory.swift
│   │   │   └── VNC
│   │   │       ├── PassphraseGenerator.swift
│   │   │       └── VNCService.swift
│   │   └── tests
│   │       ├── Mocks
│   │       │   ├── MockVM.swift
│   │       │   ├── MockVMVirtualizationService.swift
│   │       │   └── MockVNCService.swift
│   │       ├── VM
│   │       │   └── VMDetailsPrinterTests.swift
│   │       ├── VMTests.swift
│   │       ├── VMVirtualizationServiceTests.swift
│   │       └── VNCServiceTests.swift
│   ├── lumier
│   │   ├── .dockerignore
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   └── src
│   │       ├── bin
│   │       │   └── entry.sh
│   │       ├── config
│   │       │   └── constants.sh
│   │       ├── hooks
│   │       │   └── on-logon.sh
│   │       └── lib
│   │           ├── utils.sh
│   │           └── vm.sh
│   ├── python
│   │   ├── agent
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── agent
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── adapters
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── cua_adapter.py
│   │   │   │   │   ├── huggingfacelocal_adapter.py
│   │   │   │   │   ├── human_adapter.py
│   │   │   │   │   ├── mlxvlm_adapter.py
│   │   │   │   │   └── models
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── generic.py
│   │   │   │   │       ├── internvl.py
│   │   │   │   │       ├── opencua.py
│   │   │   │   │       └── qwen2_5_vl.py
│   │   │   │   ├── agent.py
│   │   │   │   ├── callbacks
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── budget_manager.py
│   │   │   │   │   ├── image_retention.py
│   │   │   │   │   ├── logging.py
│   │   │   │   │   ├── operator_validator.py
│   │   │   │   │   ├── pii_anonymization.py
│   │   │   │   │   ├── prompt_instructions.py
│   │   │   │   │   ├── telemetry.py
│   │   │   │   │   └── trajectory_saver.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── computers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cua.py
│   │   │   │   │   └── custom.py
│   │   │   │   ├── decorators.py
│   │   │   │   ├── human_tool
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   ├── server.py
│   │   │   │   │   └── ui.py
│   │   │   │   ├── integrations
│   │   │   │   │   └── hud
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── agent.py
│   │   │   │   │       └── proxy.py
│   │   │   │   ├── loops
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── anthropic.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── composed_grounded.py
│   │   │   │   │   ├── gelato.py
│   │   │   │   │   ├── gemini.py
│   │   │   │   │   ├── generic_vlm.py
│   │   │   │   │   ├── glm45v.py
│   │   │   │   │   ├── gta1.py
│   │   │   │   │   ├── holo.py
│   │   │   │   │   ├── internvl.py
│   │   │   │   │   ├── model_types.csv
│   │   │   │   │   ├── moondream3.py
│   │   │   │   │   ├── omniparser.py
│   │   │   │   │   ├── openai.py
│   │   │   │   │   ├── opencua.py
│   │   │   │   │   ├── uiins.py
│   │   │   │   │   ├── uitars.py
│   │   │   │   │   └── uitars2.py
│   │   │   │   ├── proxy
│   │   │   │   │   ├── examples.py
│   │   │   │   │   └── handlers.py
│   │   │   │   ├── responses.py
│   │   │   │   ├── tools
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── browser_tool.py
│   │   │   │   ├── types.py
│   │   │   │   └── ui
│   │   │   │       ├── __init__.py
│   │   │   │       ├── __main__.py
│   │   │   │       └── gradio
│   │   │   │           ├── __init__.py
│   │   │   │           ├── app.py
│   │   │   │           └── ui_components.py
│   │   │   ├── benchmarks
│   │   │   │   ├── .gitignore
│   │   │   │   ├── contrib.md
│   │   │   │   ├── interactive.py
│   │   │   │   ├── models
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   └── gta1.py
│   │   │   │   ├── README.md
│   │   │   │   ├── ss-pro.py
│   │   │   │   ├── ss-v2.py
│   │   │   │   └── utils.py
│   │   │   ├── example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_computer_agent.py
│   │   ├── bench-ui
│   │   │   ├── bench_ui
│   │   │   │   ├── __init__.py
│   │   │   │   ├── api.py
│   │   │   │   └── child.py
│   │   │   ├── examples
│   │   │   │   ├── folder_example.py
│   │   │   │   ├── gui
│   │   │   │   │   ├── index.html
│   │   │   │   │   ├── logo.svg
│   │   │   │   │   └── styles.css
│   │   │   │   ├── output_overlay.png
│   │   │   │   └── simple_example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       └── test_port_detection.py
│   │   ├── computer
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── computer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── computer.py
│   │   │   │   ├── diorama_computer.py
│   │   │   │   ├── helpers.py
│   │   │   │   ├── interface
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   ├── models.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── logger.py
│   │   │   │   ├── models.py
│   │   │   │   ├── providers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cloud
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── docker
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── lume
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── lume_api.py
│   │   │   │   │   ├── lumier
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── types.py
│   │   │   │   │   └── winsandbox
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── provider.py
│   │   │   │   │       └── setup_script.ps1
│   │   │   │   ├── tracing_wrapper.py
│   │   │   │   ├── tracing.py
│   │   │   │   ├── ui
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   └── gradio
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       └── app.py
│   │   │   │   └── utils.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_computer.py
│   │   ├── computer-server
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── computer_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── browser.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── diorama
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── diorama_computer.py
│   │   │   │   │   ├── diorama.py
│   │   │   │   │   ├── draw.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── safezone.py
│   │   │   │   ├── handlers
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── main.py
│   │   │   │   ├── server.py
│   │   │   │   ├── utils
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── wallpaper.py
│   │   │   │   └── watchdog.py
│   │   │   ├── examples
│   │   │   │   ├── __init__.py
│   │   │   │   └── usage_example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   ├── run_server.py
│   │   │   ├── test_connection.py
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_server.py
│   │   ├── core
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── core
│   │   │   │   ├── __init__.py
│   │   │   │   └── telemetry
│   │   │   │       ├── __init__.py
│   │   │   │       └── posthog.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_telemetry.py
│   │   ├── mcp-server
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── build-extension.py
│   │   │   ├── CONCURRENT_SESSIONS.md
│   │   │   ├── desktop-extension
│   │   │   │   ├── cua-extension.mcpb
│   │   │   │   ├── desktop_extension.png
│   │   │   │   ├── manifest.json
│   │   │   │   ├── README.md
│   │   │   │   ├── requirements.txt
│   │   │   │   ├── run_server.sh
│   │   │   │   └── setup.py
│   │   │   ├── mcp_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── server.py
│   │   │   │   └── session_manager.py
│   │   │   ├── pdm.lock
│   │   │   ├── pyproject.toml
│   │   │   ├── QUICK_TEST_COMMANDS.sh
│   │   │   ├── quick_test_local_option.py
│   │   │   ├── README.md
│   │   │   ├── scripts
│   │   │   │   ├── install_mcp_server.sh
│   │   │   │   └── start_mcp_server.sh
│   │   │   ├── test_mcp_server_local_option.py
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_mcp_server.py
│   │   ├── pylume
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_pylume.py
│   │   └── som
│   │       ├── .bumpversion.cfg
│   │       ├── LICENSE
│   │       ├── poetry.toml
│   │       ├── pyproject.toml
│   │       ├── README.md
│   │       ├── som
│   │       │   ├── __init__.py
│   │       │   ├── detect.py
│   │       │   ├── detection.py
│   │       │   ├── models.py
│   │       │   ├── ocr.py
│   │       │   ├── util
│   │       │   │   └── utils.py
│   │       │   └── visualization.py
│   │       └── tests
│   │           ├── conftest.py
│   │           └── test_omniparser.py
│   ├── qemu-docker
│   │   ├── linux
│   │   │   ├── Dockerfile
│   │   │   ├── README.md
│   │   │   └── src
│   │   │       ├── entry.sh
│   │   │       └── vm
│   │   │           ├── image
│   │   │           │   └── README.md
│   │   │           └── setup
│   │   │               ├── install.sh
│   │   │               ├── setup-cua-server.sh
│   │   │               └── setup.sh
│   │   ├── README.md
│   │   └── windows
│   │       ├── Dockerfile
│   │       ├── README.md
│   │       └── src
│   │           ├── entry.sh
│   │           └── vm
│   │               ├── image
│   │               │   └── README.md
│   │               └── setup
│   │                   ├── install.bat
│   │                   ├── on-logon.ps1
│   │                   ├── setup-cua-server.ps1
│   │                   ├── setup-utils.psm1
│   │                   └── setup.ps1
│   ├── typescript
│   │   ├── .gitignore
│   │   ├── .nvmrc
│   │   ├── agent
│   │   │   ├── examples
│   │   │   │   ├── playground-example.html
│   │   │   │   └── README.md
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── client.ts
│   │   │   │   ├── index.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   └── client.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── computer
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── computer
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── providers
│   │   │   │   │   │   ├── base.ts
│   │   │   │   │   │   ├── cloud.ts
│   │   │   │   │   │   └── index.ts
│   │   │   │   │   └── types.ts
│   │   │   │   ├── index.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── base.ts
│   │   │   │   │   ├── factory.ts
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── linux.ts
│   │   │   │   │   ├── macos.ts
│   │   │   │   │   └── windows.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   ├── computer
│   │   │   │   │   └── cloud.test.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── factory.test.ts
│   │   │   │   │   ├── index.test.ts
│   │   │   │   │   ├── linux.test.ts
│   │   │   │   │   ├── macos.test.ts
│   │   │   │   │   └── windows.test.ts
│   │   │   │   └── setup.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── core
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── index.ts
│   │   │   │   └── telemetry
│   │   │   │       ├── clients
│   │   │   │       │   ├── index.ts
│   │   │   │       │   └── posthog.ts
│   │   │   │       └── index.ts
│   │   │   ├── tests
│   │   │   │   └── telemetry.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── cua-cli
│   │   │   ├── .gitignore
│   │   │   ├── .prettierrc
│   │   │   ├── bun.lock
│   │   │   ├── CLAUDE.md
│   │   │   ├── index.ts
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── auth.ts
│   │   │   │   ├── cli.ts
│   │   │   │   ├── commands
│   │   │   │   │   ├── auth.ts
│   │   │   │   │   └── sandbox.ts
│   │   │   │   ├── config.ts
│   │   │   │   ├── http.ts
│   │   │   │   ├── storage.ts
│   │   │   │   └── util.ts
│   │   │   └── tsconfig.json
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── pnpm-workspace.yaml
│   │   └── README.md
│   └── xfce
│       ├── .dockerignore
│       ├── .gitignore
│       ├── Development.md
│       ├── Dockerfile
│       ├── Dockerfile.dev
│       ├── README.md
│       └── src
│           ├── scripts
│           │   ├── resize-display.sh
│           │   ├── start-computer-server.sh
│           │   ├── start-novnc.sh
│           │   ├── start-vnc.sh
│           │   └── xstartup.sh
│           ├── supervisor
│           │   └── supervisord.conf
│           └── xfce-config
│               ├── helpers.rc
│               ├── xfce4-power-manager.xml
│               └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│   ├── agent_nb.ipynb
│   ├── blog
│   │   ├── build-your-own-operator-on-macos-1.ipynb
│   │   └── build-your-own-operator-on-macos-2.ipynb
│   ├── composite_agents_docker_nb.ipynb
│   ├── computer_nb.ipynb
│   ├── computer_server_nb.ipynb
│   ├── customizing_computeragent.ipynb
│   ├── eval_osworld.ipynb
│   ├── ollama_nb.ipynb
│   ├── README.md
│   ├── sota_hackathon_cloud.ipynb
│   └── sota_hackathon.ipynb
├── package-lock.json
├── package.json
├── pnpm-lock.yaml
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── scripts
│   ├── install-cli.ps1
│   ├── install-cli.sh
│   ├── playground-docker.sh
│   ├── playground.sh
│   ├── run-docker-dev.sh
│   └── typescript-typecheck.js
├── TESTING.md
├── tests
│   ├── agent_loop_testing
│   │   ├── agent_test.py
│   │   └── README.md
│   ├── pytest.ini
│   ├── shell_cmd.py
│   ├── test_files.py
│   ├── test_mcp_server_session_management.py
│   ├── test_mcp_server_streaming.py
│   ├── test_shell_bash.py
│   ├── test_telemetry.py
│   ├── test_tracing.py
│   ├── test_venv.py
│   └── test_watchdog.py
└── uv.lock
```

# Files

--------------------------------------------------------------------------------
/docs/content/docs/macos-vm-cli-playbook/lumier/docker.mdx:
--------------------------------------------------------------------------------

```markdown
---
title: Docker
---

You can use Lumier through Docker:

## Run a macOS VM (ephemeral)

```bash
# Run the container with temporary storage (using pre-built image from Docker Hub)
docker run -it --rm \
    --name macos-vm \
    -p 8006:8006 \
    -e VM_NAME=macos-vm \
    -e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \
    -e CPU_CORES=4 \
    -e RAM_SIZE=8192 \
    trycua/lumier:latest
```

After running the command above, you can access the macOS VM in your browser at **http://localhost:8006**.

<Callout title="Note">
  With the basic setup above, your VM will be reset when you stop the container (ephemeral mode).
  This means any changes you make inside the macOS VM will be lost. See the section below for how to
  save your VM state.
</Callout>

## Saving Your VM State

To save your VM state between sessions (so your changes persist when you stop and restart the container), you'll need to set up a storage location:

```bash
# First, create a storage directory if it doesn't exist
mkdir -p storage

# Then run the container with persistent storage
docker run -it --rm \
    --name lumier-vm \
    -p 8006:8006 \
    -v $(pwd)/storage:/storage \
    -e VM_NAME=lumier-vm \
    -e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \
    -e CPU_CORES=4 \
    -e RAM_SIZE=8192 \
    -e HOST_STORAGE_PATH=$(pwd)/storage \
    trycua/lumier:latest
```

This command creates a connection between a folder on your Mac (`$(pwd)/storage`) and a folder inside the Docker container (`/storage`). The `-v` flag (volume mount) and the `HOST_STORAGE_PATH` variable work together to ensure your VM data is saved on your host Mac.

## Sharing Files with Your VM

To share files between your Mac and the virtual machine, you can set up a shared folder:

```bash
# Create both storage and shared folders
mkdir -p storage shared

# Run with both persistent storage and a shared folder
docker run -it --rm \
    --name lumier-vm \
    -p 8006:8006 \
    -v $(pwd)/storage:/storage \
    -v $(pwd)/shared:/shared \
    -e VM_NAME=lumier-vm \
    -e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \
    -e CPU_CORES=4 \
    -e RAM_SIZE=8192 \
    -e HOST_STORAGE_PATH=$(pwd)/storage \
    -e HOST_SHARED_PATH=$(pwd)/shared \
    trycua/lumier:latest
```

With this setup, any files you place in the `shared` folder on your Mac will be accessible from within the macOS VM, and vice versa.

## Automating VM Startup with on-logon.sh

You can automatically run scripts when the VM starts up by placing an `on-logon.sh` script in the shared folder's lifecycle directory. This is useful for setting up your VM environment each time it starts.

```bash
# Create the lifecycle directory in your shared folder
mkdir -p shared/lifecycle

# Create a sample on-logon.sh script
cat > shared/lifecycle/on-logon.sh << 'EOF'
#!/usr/bin/env bash

# Create a file on the desktop
echo "Hello from Lumier!" > /Users/lume/Desktop/hello_lume.txt

# You can add more commands to execute at VM startup
# For example:
# - Configure environment variables
# - Start applications
# - Mount network drives
# - Set up development environments
EOF

# Make the script executable
chmod +x shared/lifecycle/on-logon.sh
```

The script will be automatically executed when the VM starts up. It runs in the VM context and has access to:

- The `/Users/lume` user directory (home directory in the VM)
- The shared folder at `/Volumes/My Shared Files` inside the VM
- Any resources available to the VM

This feature enables automation of VM setup without modifying the base VM image.
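
For example, a logon script can pull assets from the shared folder into the VM each time it boots. A minimal sketch (the `wallpaper.png` file name is just an illustration; any file placed in `shared/` on the host appears under `/Volumes/My Shared Files` in the VM):

```bash
cat > shared/lifecycle/on-logon.sh << 'EOF'
#!/usr/bin/env bash

# Copy an asset from the shared folder to the VM user's desktop
cp "/Volumes/My Shared Files/wallpaper.png" /Users/lume/Desktop/wallpaper.png
EOF

chmod +x shared/lifecycle/on-logon.sh
```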

## Configuration Options

When running Lumier, you'll need to configure a few things:

- **Port forwarding** (`-p 8006:8006`): Makes the VM's VNC interface accessible in your browser. If port 8006 is already in use, you can use a different port like `-p 8007:8006`.

- **Environment variables** (`-e`): Configure your VM settings:
  - `VM_NAME`: A name for your virtual machine
  - `VERSION`: The macOS image to use
  - `CPU_CORES`: Number of CPU cores to allocate
  - `RAM_SIZE`: Memory in MB to allocate
  - `HOST_STORAGE_PATH`: Path to save VM state (when using persistent storage)
  - `HOST_SHARED_PATH`: Path to the shared folder (optional)

- **Background service**: The `lume serve` service should be running on your host (it starts automatically when you install Lume using the `install.sh` script).
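
Putting these options together: if another service already occupies port 8006 on your host, remap only the host side of the port mapping; the container side stays 8006 (everything else below mirrors the ephemeral example above):

```bash
docker run -it --rm \
    --name macos-vm \
    -p 8007:8006 \
    -e VM_NAME=macos-vm \
    -e VERSION=ghcr.io/trycua/macos-sequoia-cua:latest \
    -e CPU_CORES=4 \
    -e RAM_SIZE=8192 \
    trycua/lumier:latest
```

The VM is then reachable at http://localhost:8007 instead.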

```

--------------------------------------------------------------------------------
/libs/typescript/agent/src/types.ts:
--------------------------------------------------------------------------------

```typescript
// #region Request
export type ConnectionType = 'http' | 'https' | 'peer';
export interface AgentClientOptions {
  timeout?: number;
  retries?: number;
  /** Optional CUA API key to send as X-API-Key header for HTTP requests */
  apiKey?: string;
}
// Request types matching the Python proxy API
export interface AgentRequest {
  model: string;
  input: string | AgentMessage[];
  agent_kwargs?: {
    save_trajectory?: boolean;
    verbosity?: number;
    [key: string]: any;
  };
  computer_kwargs?: {
    os_type?: string;
    provider_type?: string;
    [key: string]: any;
  };
  /**
   * Optional per-request environment variable overrides.
   * Keys and values are strings and will be forwarded to the backend proxy.
   */
  env?: Record<string, string>;
}
// #endregion

// #region Response
// Response types
export interface AgentResponse {
  output: AgentMessage[];
  usage: Usage;
  status: 'completed' | 'failed';
  error?: string;
}
// Usage information
export interface Usage {
  prompt_tokens: number;
  completion_tokens: number;
  total_tokens: number;
  response_cost: number;
}
// #endregion

// #region Messages
// Agent message types - can be one of several different message types
export type AgentMessage =
  | UserMessage
  | AssistantMessage
  | ReasoningMessage
  | ComputerCallMessage
  | ComputerCallOutputMessage
  | FunctionCallMessage
  | FunctionCallOutputMessage;
// Input message
export interface UserMessage {
  type?: 'message';
  role: 'user' | 'system' | 'developer';
  content: string | InputContent[];
}
// Output message
export interface AssistantMessage {
  type: 'message';
  role: 'assistant';
  content: OutputContent[];
}
// Output reasoning/thinking message
export interface ReasoningMessage {
  type: 'reasoning';
  summary: SummaryContent[];
}
// Output computer action call
export interface ComputerCallMessage {
  type: 'computer_call';
  call_id: string;
  status: 'completed' | 'failed' | 'pending';
  action: ComputerAction;
}
// Output computer action result (always a screenshot)
export interface ComputerCallOutputMessage {
  type: 'computer_call_output';
  call_id: string;
  output: ComputerResultContent;
}
// Output function call
export interface FunctionCallMessage {
  type: 'function_call';
  call_id: string;
  status: 'completed' | 'failed' | 'pending';
  name: string;
  arguments: string; // JSON dict of kwargs
}
// Output function call result (always text)
export interface FunctionCallOutputMessage {
  type: 'function_call_output';
  call_id: string;
  output: string;
}
// #endregion

// #region Message Content
export interface InputContent {
  type: 'input_image' | 'input_text';
  text?: string;
  image_url?: string;
}
export interface OutputContent {
  type: 'output_text';
  text: string;
}
export interface SummaryContent {
  type: 'summary_text';
  text: string;
}
export interface ComputerResultContent {
  type: 'computer_screenshot' | 'input_image';
  image_url: string;
}
// #endregion

// #region Actions
export type ComputerAction = ComputerActionOpenAI | ComputerActionAnthropic;
// OpenAI Computer Actions
export type ComputerActionOpenAI =
  | ClickAction
  | DoubleClickAction
  | DragAction
  | KeyPressAction
  | MoveAction
  | ScreenshotAction
  | ScrollAction
  | TypeAction
  | WaitAction;
export interface ClickAction {
  type: 'click';
  button: 'left' | 'right' | 'wheel' | 'back' | 'forward';
  x: number;
  y: number;
}
export interface DoubleClickAction {
  type: 'double_click';
  button?: 'left' | 'right' | 'wheel' | 'back' | 'forward';
  x: number;
  y: number;
}
export interface DragAction {
  type: 'drag';
  button?: 'left' | 'right' | 'wheel' | 'back' | 'forward';
  path: Array<[number, number]>;
}
export interface KeyPressAction {
  type: 'keypress';
  keys: string[];
}
export interface MoveAction {
  type: 'move';
  x: number;
  y: number;
}
export interface ScreenshotAction {
  type: 'screenshot';
}
export interface ScrollAction {
  type: 'scroll';
  scroll_x: number;
  scroll_y: number;
  x: number;
  y: number;
}
export interface TypeAction {
  type: 'type';
  text: string;
}
export interface WaitAction {
  type: 'wait';
}
// Anthropic Computer Actions
export type ComputerActionAnthropic = LeftMouseDownAction | LeftMouseUpAction;
export interface LeftMouseDownAction {
  type: 'left_mouse_down';
  x: number;
  y: number;
}
export interface LeftMouseUpAction {
  type: 'left_mouse_up';
  x: number;
  y: number;
}
// #endregion
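
// #region Example (illustrative)
// A minimal request using the shapes above; the model string and kwargs are
// placeholders, not values defined by this module:
//
//   const request: AgentRequest = {
//     model: 'anthropic/claude-sonnet-4-5-20250929',
//     input: 'Open the browser and search for "trycua"',
//     computer_kwargs: { os_type: 'linux', provider_type: 'cloud' },
//   };
//
// A successful AgentResponse then carries the full message trace plus usage:
//
//   { output: AgentMessage[], usage: Usage, status: 'completed' }
// #endregion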

```

--------------------------------------------------------------------------------
/libs/python/agent/example.py:
--------------------------------------------------------------------------------

```python
"""
Example usage of the agent library with docstring-based tool definitions.
"""

import asyncio
import logging

from agent import ComputerAgent
from computer import Computer
from computer.helpers import sandboxed


@sandboxed()
def read_file(location: str) -> str:
    """Read contents of a file

    Parameters
    ----------
    location : str
        Path to the file to read

    Returns
    -------
    str
        Contents of the file or error message
    """
    try:
        with open(location, "r") as f:
            return f.read()
    except Exception as e:
        return f"Error reading file: {str(e)}"


def save_note(content: str, filename: str = "note.txt") -> str:
    """Save content to a note file

    Parameters
    ----------
    content : str
        Content to save to the file
    filename : str, optional
        Name of the file to save to (default is "note.txt")

    Returns
    -------
    str
        Success or error message
    """
    try:
        with open(filename, "w") as f:
            f.write(content)
        return f"Saved note to {filename}"
    except Exception as e:
        return f"Error saving note: {str(e)}"


def calculate(a: int, b: int) -> int:
    """Calculate the sum of two integers

    Parameters
    ----------
    a : int
        First integer
    b : int
        Second integer

    Returns
    -------
    int
        Sum of the two integers
    """
    return a + b


async def main():
    """Example usage of ComputerAgent with different models"""

    # Example 1: Using Claude with computer and custom tools
    print("=== Example 1: Claude with Computer ===")

    import os

    import dotenv

    dotenv.load_dotenv()

    assert os.getenv("CUA_CONTAINER_NAME") is not None, "CUA_CONTAINER_NAME is not set"
    assert os.getenv("CUA_API_KEY") is not None, "CUA_API_KEY is not set"

    async with Computer(
        os_type="linux",
        provider_type="cloud",
        name=os.getenv("CUA_CONTAINER_NAME") or "",
        api_key=os.getenv("CUA_API_KEY") or "",
    ) as computer:
        agent = ComputerAgent(
            # Supported models:
            # == OpenAI CUA (computer-use-preview) ==
            model="openai/computer-use-preview",
            # == Anthropic CUA (Claude > 3.5) ==
            # model="anthropic/claude-opus-4-20250514",
            # model="anthropic/claude-sonnet-4-20250514",
            # model="anthropic/claude-3-7-sonnet-20250219",
            # model="anthropic/claude-sonnet-4-5-20250929",
            # == UI-TARS ==
            # model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
            # TODO: add local mlx provider
            # model="mlx-community/UI-TARS-1.5-7B-6bit",
            # model="ollama_chat/0000/ui-tars-1.5-7b",
            # == Omniparser + Any LLM ==
            # model="omniparser+..."
            # model="omniparser+anthropic/claude-opus-4-20250514",
            tools=[computer],
            only_n_most_recent_images=3,
            verbosity=logging.INFO,
            trajectory_dir="trajectories",
            use_prompt_caching=True,
            max_trajectory_budget={
                "max_budget": 1.0,
                "raise_error": True,
                "reset_after_each_run": False,
            },
        )

        history = []
        while True:
            user_input = input("> ")
            history.append({"role": "user", "content": user_input})

            # Non-streaming usage
            async for result in agent.run(history, stream=False):
                history += result["output"]

                # # Print output
                # for item in result["output"]:
                #     if item["type"] == "message":
                #         print(item["content"][0]["text"])
                #     elif item["type"] == "computer_call":
                #         action = item["action"]
                #         action_type = action["type"]
                #         action_args = {k: v for k, v in action.items() if k != "type"}
                #         print(f"{action_type}({action_args})")
                #     elif item["type"] == "function_call":
                #         action = item["name"]
                #         action_args = item["arguments"]
                #         print(f"{action}({action_args})")
                #     elif item["type"] == "function_call_output":
                #         print("===>", item["output"])


if __name__ == "__main__":
    asyncio.run(main())

```

--------------------------------------------------------------------------------
/libs/python/agent/benchmarks/contrib.md:
--------------------------------------------------------------------------------

```markdown
# Contributing Reference Agent Implementations

This guide explains how to add your own reference agent implementations to the benchmark system.

## Adding Reference Agent Implementations

### 1. Implement the ModelProtocol

Create a new file in the `models/` directory implementing the `ModelProtocol`:

```python
from models.base import ModelProtocol
from typing import Optional, Tuple
from PIL import Image

class YourModelName(ModelProtocol):
    def __init__(self, model_path: str):
        self.model_path = model_path
        self._model = None

    @property
    def model_name(self) -> str:
        return self.model_path

    async def load_model(self) -> None:
        """Load the model into memory."""
        # Your model loading logic here
        pass

    async def unload_model(self) -> None:
        """Unload the model from memory."""
        # Your model cleanup logic here
        pass

    async def predict_click(self, image: Image.Image, instruction: str) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates for the given image and instruction.

        Args:
            image: PIL Image to analyze
            instruction: Text instruction describing what to click

        Returns:
            Tuple of (x, y) coordinates or None if prediction fails
        """
        # Your prediction logic here
        return (x, y)  # Return predicted coordinates
```

### 2. Register Your Model

Add your model to the `get_available_models()` function in `utils.py`:

```python
def get_available_models() -> List[Union[str, ModelProtocol]]:
    models = [
        # Computer Agent SDK providers
        "huggingface-local/HelloKKMe/GTA1-7B",

        # Reference implementations
        GTA1Model("HelloKKMe/GTA1-7B"),
        YourModelName("path/to/your/model"),  # Add your model here
    ]
    return models
```

### 3. Test Your Implementation

Before submitting, test your model with the interactive tool:

```bash
python interactive.py
```

This will help you verify that your model loads correctly and produces reasonable predictions.
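
Once the interactive check looks good, you can run the ScreenSpot benchmark scripts that live alongside it against your registered model (invocation sketched here; check this directory's README for the exact options):

```bash
python ss-v2.py   # ScreenSpot-v2
python ss-pro.py  # ScreenSpot-Pro
```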

## Example: Adding a New Model

Here's a complete example of adding a hypothetical "MyVisionModel":

1. **Create `models/my_vision_model.py`:**

```python
import torch
from transformers import AutoModel, AutoProcessor
from models.base import ModelProtocol
from typing import Optional, Tuple
from PIL import Image

class MyVisionModel(ModelProtocol):
    def __init__(self, model_path: str):
        self.model_path = model_path
        self.model = None
        self.processor = None

    @property
    def model_name(self) -> str:
        return f"MyVisionModel({self.model_path})"

    async def load_model(self) -> None:
        """Load the model and processor."""
        self.processor = AutoProcessor.from_pretrained(self.model_path)
        self.model = AutoModel.from_pretrained(
            self.model_path,
            torch_dtype=torch.float16,
            device_map="auto"
        )

    async def unload_model(self) -> None:
        """Clean up model resources."""
        del self.model
        del self.processor
        self.model = None
        self.processor = None
        torch.cuda.empty_cache()

    async def predict_click(self, image: Image.Image, instruction: str) -> Optional[Tuple[int, int]]:
        """Predict click coordinates."""
        try:
            # Preprocess inputs
            inputs = self.processor(
                text=instruction,
                images=image,
                return_tensors="pt"
            )

            # Run inference
            with torch.no_grad():
                outputs = self.model(**inputs)

            # Extract coordinates (model-specific logic)
            x, y = self._extract_coordinates(outputs)
            return (int(x), int(y))

        except Exception as e:
            print(f"Prediction failed: {e}")
            return None

    def _extract_coordinates(self, outputs):
        """Extract x, y coordinates from model outputs."""
        # Your model-specific coordinate extraction logic
        pass
```

2. **Update `models/__init__.py`:**

```python
from .gta1 import GTA1Model
from .my_vision_model import MyVisionModel

__all__ = ["GTA1Model", "MyVisionModel"]
```

3. **Update `utils.py`:**

```python
from models import GTA1Model, MyVisionModel

def get_available_models() -> List[Union[str, ModelProtocol]]:
    models = [
        "huggingface-local/HelloKKMe/GTA1-7B",
        GTA1Model("HelloKKMe/GTA1-7B"),
        MyVisionModel("my-org/my-vision-model"),  # Add here
    ]
    return models
```

```

--------------------------------------------------------------------------------
/docs/src/components/doc-actions-menu.tsx:
--------------------------------------------------------------------------------

```typescript
'use client';

import { useState } from 'react';
import { SiOpenai, SiAnthropic, SiMarkdown, SiGithub } from 'react-icons/si';
import posthog from 'posthog-js';

interface DocActionsMenuProps {
  pageUrl: string;
  pageTitle: string;
  filePath?: string;
}

export function DocActionsMenu({ pageUrl, pageTitle, filePath }: DocActionsMenuProps) {
  const [copied, setCopied] = useState(false);

  const handleCopyMarkdown = async () => {
    try {
      if (!filePath) {
        throw new Error('No file path available');
      }
      const githubRawUrl = `https://raw.githubusercontent.com/trycua/cua/refs/heads/main/docs/content/docs/${filePath}`;

      const response = await fetch(githubRawUrl);
      if (!response.ok) {
        throw new Error('Failed to fetch markdown');
      }
      const markdown = await response.text();

      await navigator.clipboard.writeText(markdown);

      setCopied(true);
      setTimeout(() => setCopied(false), 2000);

      posthog.capture('docs_copy_markdown_clicked', {
        page: pageUrl,
        page_title: pageTitle,
        success: true,
      });
    } catch (error) {
      console.error('Error copying markdown:', error);

      try {
        const urlWithUtm = `https://cua.ai${pageUrl}?utm_source=cua.ai/docs`;
        await navigator.clipboard.writeText(urlWithUtm);
        setCopied(true);
        setTimeout(() => setCopied(false), 2000);
      } catch (fallbackError) {
        console.error('Error copying URL:', fallbackError);
      }

      posthog.capture('docs_copy_markdown_clicked', {
        page: pageUrl,
        page_title: pageTitle,
        success: false,
        error: error instanceof Error ? error.message : 'Unknown error',
      });
    }
  };

  const handleEditGithub = () => {
    if (!filePath) {
      return;
    }
    posthog.capture('docs_edit_github_clicked', {
      page: pageUrl,
      page_title: pageTitle,
    });

    const githubEditUrl = `https://github.com/trycua/cua/edit/main/docs/content/docs/${filePath}`;
    window.open(githubEditUrl, '_blank', 'noopener,noreferrer');
  };

  const handleOpenChatGPT = () => {
    posthog.capture('docs_open_chatgpt_clicked', {
      page: pageUrl,
      page_title: pageTitle,
    });

    const docUrl = `https://cua.ai${pageUrl}?utm_source=cua.ai/docs`;
    const prompt = `I need help understanding this cua.ai documentation page: "${pageTitle}". Please read and help me with: ${docUrl}`;
    const chatgptUrl = `https://chatgpt.com/?q=${encodeURIComponent(prompt)}`;
    window.open(chatgptUrl, '_blank', 'noopener,noreferrer');
  };

  const handleOpenClaude = () => {
    posthog.capture('docs_open_claude_clicked', {
      page: pageUrl,
      page_title: pageTitle,
    });

    const docUrl = `https://cua.ai${pageUrl}?utm_source=cua.ai/docs`;
    const prompt = `I need help understanding this cua.ai documentation page: "${pageTitle}". Please read and help me with: ${docUrl}`;
    const claudeUrl = `https://claude.ai/new?q=${encodeURIComponent(prompt)}`;
    window.open(claudeUrl, '_blank', 'noopener,noreferrer');
  };

  return (
    <div className="flex flex-col gap-2">
      <button
        onClick={handleCopyMarkdown}
        className="inline-flex gap-3 w-full items-center rounded-md p-1 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground text-left transition-colors px-2 hover:cursor-pointer"
      >
        <SiMarkdown className="w-4 h-4 flex-shrink-0" />
        <span>{copied ? 'Copied!' : 'Copy as markdown'}</span>
      </button>

      <button
        onClick={handleEditGithub}
        className="inline-flex gap-3 w-full items-center rounded-md p-1 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground text-left transition-colors px-2 hover:cursor-pointer"
      >
        <SiGithub className="w-4 h-4 flex-shrink-0" />
        <span>Edit on GitHub</span>
      </button>

      <button
        onClick={handleOpenChatGPT}
        className="inline-flex gap-3 w-full items-center rounded-md p-1 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground text-left transition-colors px-2 hover:cursor-pointer"
      >
        <SiOpenai className="w-4 h-4 flex-shrink-0" />
        <span>Open in ChatGPT</span>
      </button>

      <button
        onClick={handleOpenClaude}
        className="inline-flex gap-3 w-full items-center rounded-md p-1 text-sm hover:bg-fd-accent hover:text-fd-accent-foreground text-left transition-colors px-2 hover:cursor-pointer"
      >
        <SiAnthropic className="w-4 h-4 flex-shrink-0" />
        <span>Open in Claude</span>
      </button>
    </div>
  );
}

```

--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/supported-agents/composed-agents.mdx:
--------------------------------------------------------------------------------

```markdown
---
title: Composed Agents
description: Combine grounding models with any LLM for computer-use capabilities
---

Composed agents combine the best of both worlds: specialized grounding models for precise click prediction and powerful LLMs for task planning and reasoning.

Use the format `"grounding_model+planning_model"` to create a composed agent with any vision-enabled LiteLLM-compatible model.

## How Composed Agents Work

1. **Planning Phase**: The planning model (LLM) analyzes the task and decides what actions to take (e.g., `click("find the login button")`, `type("username")`)
2. **Grounding Phase**: The grounding model converts element descriptions to precise coordinates
3. **Execution**: Actions are performed using the predicted coordinates

## Supported Grounding Models

Any model that supports `predict_click()` can be used as the grounding component. See the full list on [Grounding Models](./grounding-models).

- OpenCUA: `huggingface-local/xlangai/OpenCUA-{7B,32B}`
- GTA1 family: `huggingface-local/HelloKKMe/GTA1-{7B,32B,72B}`
- Holo 1.5 family: `huggingface-local/Hcompany/Holo1.5-{3B,7B,72B}`
- InternVL 3.5 family: `huggingface-local/OpenGVLab/InternVL3_5-{1B,2B,4B,8B,...}`
- UI‑TARS 1.5: `huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B` (also supports full CU)
- OmniParser (OCR): `omniparser` (requires combination with a LiteLLM vision model)
- Moondream3: `moondream3` (requires combination with a LiteLLM vision/text model)

## Supported Planning Models

Any vision-enabled LiteLLM-compatible model can be used as the planning component:

- Any All‑in‑one CUA (planning-capable). See [All‑in‑one CUAs](./computer-use-agents).
- Any VLM via LiteLLM providers: `anthropic/*`, `openai/*`, `openrouter/*`, `gemini/*`, `vertex_ai/*`, `huggingface-local/*`, `mlx/*`, etc.
- Examples:
  - **Anthropic**: `anthropic/claude-sonnet-4-5-20250929`, `anthropic/claude-opus-4-1-20250805`
  - **OpenAI**: `openai/gpt-5`, `openai/o3`, `openai/gpt-4o`
  - **Google**: `gemini/gemini-1.5-pro`, `vertex_ai/gemini-pro-vision`
  - **Local models**: Any Hugging Face vision-language model

## Usage Examples

### GTA1 + GPT-5

Use OpenAI's GPT-5 for planning with specialized grounding:

```python
agent = ComputerAgent(
    "huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-5",
    tools=[computer]
)

async for _ in agent.run("Take a screenshot, analyze the UI, and click on the most prominent button"):
    pass
```

### GTA1 + Claude Sonnet 4.5

Combine state-of-the-art grounding with powerful reasoning:

```python
agent = ComputerAgent(
    "huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929",
    tools=[computer]
)

async for _ in agent.run("Open Firefox, navigate to github.com, and search for 'computer-use'"):
    pass
# Success! 🎉
# - Claude Sonnet 4.5 plans the sequence of actions
# - GTA1-7B provides precise click coordinates for each UI element
```

### UI-TARS + GPT-4o

Combine two different vision models for enhanced capabilities:

```python
agent = ComputerAgent(
    "huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B+openai/gpt-4o",
    tools=[computer]
)

async for _ in agent.run("Help me fill out this form with my personal information"):
    pass
```

### Moondream3 + GPT-4o

Use the built-in Moondream3 grounding with any planning model. Moondream3 will detect UI elements on the latest screenshot, label them, and provide a user message listing detected element names.

```python
from agent import ComputerAgent
from computer import Computer

computer = Computer()  # configure os_type/provider_type as in the other examples

agent = ComputerAgent(
    "moondream3+openai/gpt-4o",
    tools=[computer]
)

async for _ in agent.run("Close the settings window, then open the Downloads folder"):
    pass
```

## Benefits of Composed Agents

- **Specialized Grounding**: Use models optimized for click prediction accuracy
- **Flexible Planning**: Choose any LLM for task reasoning and planning
- **Cost Optimization**: Use smaller grounding models with larger planning models only when needed
- **Performance**: Leverage the strengths of different model architectures

## Capabilities

Composed agents support both capabilities:

```python
agent = ComputerAgent("huggingface-local/HelloKKMe/GTA1-7B+anthropic/claude-sonnet-4-5-20250929")

# Full computer-use agent capabilities
async for _ in agent.run("Complete this online form"):
    pass

# Direct click prediction (uses grounding model only)
coords = agent.predict_click("find the submit button")
```

---

For more information on individual model capabilities, see [Computer-Use Agents](./computer-use-agents) and [Grounding Models](./grounding-models).

```

--------------------------------------------------------------------------------
/blog/composite-agents.md:
--------------------------------------------------------------------------------

```markdown
# Announcing Cua Agent framework 0.4 and Composite Agents

_Published on August 26, 2025 by Dillon DuPont_

<img src="./assets/composite-agents.png" alt="Composite Agents">

So you want to build an agent that can use a computer. Great! You've probably discovered that there are now dozens of different AI models that claim they can click GUI buttons and fill out forms. Less great: actually getting them to work together is like trying to coordinate a group project where everyone speaks a different language and has invented seventeen different ways to say "click here".

Here's the thing about new GUI models: they're all special snowflakes. One model wants you to feed it images and expects coordinates back as percentages from 0 to 1. Another wants absolute pixel coordinates. A third model has invented its own numeral system with `<|loc095|><|loc821|>` tokens inside tool calls. Some models output Python code that calls `pyautogui.click(x, y)`. Others will start hallucinating coordinates if you forget to format all previous messages within a very specific GUI system prompt.

This is the kind of problem that makes you wonder if we're building the future of computing or just recreating the Tower of Babel with more GPUs.

## What we fixed

Agent framework 0.4 solves this by doing something radical: making all these different models speak the same language.

Instead of writing separate code for each model's peculiarities, you now just pick a model with a string like `"anthropic/claude-sonnet-4-5-20250929"` or `"huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B"`, and everything else Just Works™. Behind the scenes, we handle all the coordinate normalization, token parsing, and image preprocessing so you don't have to.

```python
# This works the same whether you're using Anthropic, OpenAI, or that new model you found on Hugging Face
agent = ComputerAgent(
    model="anthropic/claude-sonnet-4-5-20250929",  # or any other supported model
    tools=[computer]
)
```

The output format is consistent across all providers (OpenAI, Anthropic, Vertex, Hugging Face, OpenRouter, etc.). No more writing different parsers for each model's creative interpretation of how to represent a mouse click.
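
To make that concrete, here's a sketch of consuming the unified output stream. The item shapes (`message`, `computer_call`) follow the examples elsewhere in this repo; the task string is just an illustration:

```python
async for result in agent.run("Star the trycua/cua repository on GitHub"):
    for item in result.get("output", []):
        if item.get("type") == "message":
            # Model text, same shape regardless of provider
            for part in item.get("content", []):
                if part.get("text"):
                    print("Agent:", part["text"])
        elif item.get("type") == "computer_call":
            # Normalized action: absolute pixel coordinates for every model
            print("Action:", item.get("action", {}))
```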

## Composite Agents: Two Brains Are Better Than One

Here's where it gets interesting. We realized that you don't actually need one model to be good at everything. Some models are excellent at understanding what's on the screen—they can reliably identify buttons and text fields and figure out where to click. Other models are great at planning and reasoning but might be a bit fuzzy on the exact pixel coordinates.

So we let you combine them with a `+` sign:

```python
agent = ComputerAgent(
    # specify the grounding model first, then the planning model
    model="huggingface-local/HelloKKMe/GTA1-7B+huggingface-local/OpenGVLab/InternVL3_5-8B",
    tools=[computer]
)
```

This creates a composite agent where one model (the "grounding" model) handles the visual understanding and precise UI interactions, while the other (the "planning" model) handles the high-level reasoning and task orchestration. It's like having a pilot and a navigator, except they're both AI models and they're trying to help you star a GitHub repository.

You can even take a model that was never designed for computer use—like GPT-4o—and give it GUI capabilities by pairing it with a specialized vision model:

```python
agent = ComputerAgent(
    model="huggingface-local/HelloKKMe/GTA1-7B+openai/gpt-4o",
    tools=[computer]
)
```

## Example notebook

For a full, ready-to-run demo (install deps, local computer using Docker, and a composed agent example), see the notebook:

- https://github.com/trycua/cua/blob/models/opencua/notebooks/composite_agents_docker_nb.ipynb

## What's next

We're building integration with HUD evals, allowing us to curate and benchmark model combinations. This will help us identify which composite agent pairs work best for different types of tasks, and provide you with tested recommendations rather than just throwing model names at the wall to see what sticks.

If you try out version 0.4.x, we'd love to hear how it goes. Join us on Discord to share your results and let us know what model combinations work best for your projects.

---

## Links

- **Composite Agent Docs:** [https://cua.ai/docs/agent-sdk/supported-agents/composed-agents](https://cua.ai/docs/agent-sdk/supported-agents/composed-agents)
- **Discord:** [https://discord.gg/cua-ai](https://discord.gg/cua-ai)

Questions or weird edge cases? Ping us on Discord—we’re curious to see what you build.

```

--------------------------------------------------------------------------------
/blog/cloud-windows-ga-macos-preview.md:
--------------------------------------------------------------------------------

```markdown
# Cloud Windows Sandboxes GA + macOS Preview

If you've been building with our `cua` libraries, you might've hit a limitation with local computer-use sandboxes: to run agents on Windows or macOS, you need to be on that OS - Windows Sandbox for Windows, Apple Virtualization for macOS. The only cross-platform option is Linux on Docker, which limits you to virtualizing Linux environments ([see all local options here](https://cua.ai/docs/computer-sdk/computers)).

Today the story changes - we're announcing general availability of **Cloud Windows Sandboxes** and opening early preview access for **Cloud macOS Sandboxes**.

## Cloud Windows Sandboxes: Now GA

![Cloud Windows Sandboxes](https://github.com/user-attachments/assets/db15f4c4-70a4-425a-a264-82e629074de7)

Cloud Windows Sandboxes are now generally available. You get a full Windows 11 desktop in your browser with Edge and Python pre-installed, working seamlessly with all our [Computer-Use libraries](https://github.com/trycua/cua) for RPA, UI automation, code execution, and agent development.

**What's new with this release:**

- Hot-start under 1 second
- Direct noVNC access over HTTPS on our sandbox.cua.ai domain
- 3 sandbox sizes available:

| Size   | CPU     | RAM   | Storage    |
| ------ | ------- | ----- | ---------- |
| Small  | 2 cores | 8 GB  | 128 GB SSD |
| Medium | 4 cores | 16 GB | 128 GB SSD |
| Large  | 8 cores | 32 GB | 256 GB SSD |

<div align="center">
  <video src="https://github.com/user-attachments/assets/8ab07646-6018-4128-87ce-53180cfea696" width="600" controls></video>
</div>

**Pricing:** Windows Sandboxes start at 8 credits/hour (Small), 15 credits/hour (Medium), or 31 credits/hour (Large).

## Cloud macOS Sandboxes: Now in Preview

Running macOS locally comes with challenges: 30GB golden images, a maximum of 2 sandboxes per host, and unpredictable compatibility issues. With Cloud macOS Sandboxes, we provision bare-metal macOS hosts (M1, M2, M4) on-demand—giving you full desktop access without the overhead of managing local sandboxes.

![macOS Preview Waitlist](https://github.com/user-attachments/assets/343c9a3f-59d8-4b1a-bba8-6af91e8a9cf0)

**Preview access:** Invite-only. [Join the waitlist](https://cua.ai/macos-waitlist) if you're building agents for macOS workflows.

## Getting Started Today

Sign up at [cua.ai/signin](https://cua.ai/signin) and grab your API key from the dashboard. Then connect to a sandbox:

```python
from computer import Computer

computer = Computer(
    os_type="windows",      # or "macos"
    provider_type="cloud",
    name="my-sandbox",
    api_key="your-api-key"
)

await computer.run()
```

Manage existing sandboxes:

```python
from computer.providers.cloud.provider import CloudProvider

provider = CloudProvider(api_key="your-api-key")
async with provider:
    sandboxes = await provider.list_vms()
    await provider.run_vm("my-sandbox")
    await provider.stop_vm("my-sandbox")
```

Run an agent on Windows to automate a workflow:

```python
from agent import ComputerAgent

agent = ComputerAgent(
    model="anthropic/claude-sonnet-4-5-20250929",
    tools=[computer],
    max_trajectory_budget=5.0
)

async for _ in agent.run(
    "Open Excel, create a sales report with this month's data, and save it to the desktop"
):
    pass
```

## FAQs

<details>
<summary><strong>Why not just use local Windows Sandbox?</strong></summary>

Local Windows Sandbox resets on every restart. No persistence, no hot-start, and you need Windows Pro. Our sandboxes persist state, hot-start in under a second, and work from any OS.

</details>

<details>
<summary><strong>What happens to my work when I stop a sandbox?</strong></summary>

Everything persists. Files, installed software, browser profiles—it's all there when you restart. Only pay for runtime, not storage.

</details>

<details>
<summary><strong>How's the latency for UI automation?</strong></summary>

We run in 4 regions so you can pick what's closest. The noVNC connection is optimized for automation, not video streaming. Your agent sees crisp screenshots, not compressed video.

</details>

<details>
<summary><strong>Are there software restrictions?</strong></summary>

No. Full admin access on both platforms. Install whatever you need—Visual Studio, Photoshop, custom enterprise software. It's your sandbox.

</details>

## Need help?

If you hit issues getting either platform working, reach out in [Discord](https://discord.gg/cua-ai). We respond fast and fix based on what people actually use.

---

Get started at [cua.ai](https://cua.ai) or [join the macOS waitlist](https://cua.ai/macos-waitlist).

```

--------------------------------------------------------------------------------
/libs/python/agent/agent/callbacks/base.py:
--------------------------------------------------------------------------------

```python
"""
Base callback handler interface for ComputerAgent preprocessing and postprocessing hooks.
"""

from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Union


class AsyncCallbackHandler(ABC):
    """
    Base class for async callback handlers that can preprocess messages before
    the agent loop and postprocess output after the agent loop.
    """

    async def on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
        """Called at the start of an agent run loop."""
        pass

    async def on_run_end(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> None:
        """Called at the end of an agent run loop."""
        pass

    async def on_run_continue(
        self,
        kwargs: Dict[str, Any],
        old_items: List[Dict[str, Any]],
        new_items: List[Dict[str, Any]],
    ) -> bool:
        """Called during agent run loop to determine if execution should continue.

        Args:
            kwargs: Run arguments
            old_items: Original messages
            new_items: New messages generated during run

        Returns:
            True to continue execution, False to stop
        """
        return True

    async def on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Called before messages are sent to the agent loop.

        Args:
            messages: List of message dictionaries to preprocess

        Returns:
            List of preprocessed message dictionaries
        """
        return messages

    async def on_llm_end(self, output: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Called after the agent loop returns output.

        Args:
            output: List of output message dictionaries to postprocess

        Returns:
            List of postprocessed output dictionaries
        """
        return output

    async def on_computer_call_start(self, item: Dict[str, Any]) -> None:
        """
        Called when a computer call is about to start.

        Args:
            item: The computer call item dictionary
        """
        pass

    async def on_computer_call_end(
        self, item: Dict[str, Any], result: List[Dict[str, Any]]
    ) -> None:
        """
        Called when a computer call has completed.

        Args:
            item: The computer call item dictionary
            result: The result of the computer call
        """
        pass

    async def on_function_call_start(self, item: Dict[str, Any]) -> None:
        """
        Called when a function call is about to start.

        Args:
            item: The function call item dictionary
        """
        pass

    async def on_function_call_end(
        self, item: Dict[str, Any], result: List[Dict[str, Any]]
    ) -> None:
        """
        Called when a function call has completed.

        Args:
            item: The function call item dictionary
            result: The result of the function call
        """
        pass

    async def on_text(self, item: Dict[str, Any]) -> None:
        """
        Called when a text message is encountered.

        Args:
            item: The message item dictionary
        """
        pass

    async def on_api_start(self, kwargs: Dict[str, Any]) -> None:
        """
        Called when an API call is about to start.

        Args:
            kwargs: The kwargs being passed to the API call
        """
        pass

    async def on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
        """
        Called when an API call has completed.

        Args:
            kwargs: The kwargs that were passed to the API call
            result: The result of the API call
        """
        pass

    async def on_usage(self, usage: Dict[str, Any]) -> None:
        """
        Called when usage information is received.

        Args:
            usage: The usage information
        """
        pass

    async def on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
        """
        Called when a screenshot is taken.

        Args:
            screenshot: The screenshot image
            name: The name of the screenshot
        """
        pass

    async def on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
        """
        Called when responses are received.

        Args:
            kwargs: The kwargs being passed to the agent loop
            responses: The responses received
        """
        pass
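
# Illustrative usage sketch (not part of the library): a minimal subclass
# overriding two of the hooks above. Only the hook signatures are real;
# the print bodies are placeholders.
#
# class PrintUsageCallback(AsyncCallbackHandler):
#     async def on_usage(self, usage):
#         print("usage:", usage)
#
#     async def on_screenshot(self, screenshot, name="screenshot"):
#         print("screenshot:", name)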

```

--------------------------------------------------------------------------------
/docs/content/docs/computer-sdk/computers.mdx:
--------------------------------------------------------------------------------

```markdown
---
title: Computer Types
description: Understanding Cua computer types and connection methods
---

{/* prettier-ignore */}
<Callout>A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/computer_nb.ipynb" target="_blank">Jupyter Notebook</a> and <a href="https://github.com/trycua/cua/tree/main/examples/computer-example-ts" target="_blank">NodeJS project</a> are available for this documentation.</Callout>

Before we can automate apps using AI, we need to first connect to a Computer Server to give the AI a safe environment to execute workflows in.

Cua Computers are preconfigured sandboxes running the Computer Server. They can run macOS, Linux, or Windows, and live either in a cloud-native sandbox or on your host desktop.

## Cloud Sandbox

**Easiest & safest way to get started - works on any host OS**

This is a Cloud Sandbox running the Computer Server. Get a sandbox at [cua.ai](https://cua.ai/).

<Tabs items={['Python', 'TypeScript']}>
  <Tab value="Python">
    ```python
    from computer import Computer

    computer = Computer(
        os_type="linux",
        provider_type="cloud",
        name="your-sandbox-name",
        api_key="your-api-key"
    )

    await computer.run() # Connect to the sandbox
    ```

  </Tab>
  <Tab value="TypeScript">
    ```typescript
    import { Computer, OSType } from '@trycua/computer';

    const computer = new Computer({
      osType: OSType.LINUX,
      name: "your-sandbox-name",
      apiKey: "your-api-key"
    });

    await computer.run(); // Connect to the sandbox
    ```

  </Tab>
</Tabs>

## Linux on Docker

**Run Linux desktop locally on macOS, Windows, or Linux hosts**

Cua provides two Docker images for running Linux desktops:

<Tabs items={['XFCE (Lightweight)', 'KASM (Full-Featured)']}>
  <Tab value="XFCE (Lightweight)">

    **Recommended for most use cases** - lightweight XFCE desktop with Firefox

    1. Install Docker Desktop or Docker Engine

    2. Pull the CUA XFCE image

    ```bash
    docker pull --platform=linux/amd64 trycua/cua-xfce:latest
    ```

    3. Connect with Computer

    ```python
    from computer import Computer

    computer = Computer(
        os_type="linux",
        provider_type="docker",
        image="trycua/cua-xfce:latest",
        name="my-xfce-sandbox"
    )

    await computer.run() # Launch & connect to Docker sandbox
    ```

  </Tab>
  <Tab value="KASM (Full-Featured)">

    **Full-featured Ubuntu desktop** with additional applications

    1. Install Docker Desktop or Docker Engine

    2. Build or pull the CUA KASM image

    ```bash
    # Option 1: Pull from Docker Hub
    docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest

    # Option 2: Build locally
    cd libs/kasm
    docker build -t cua-ubuntu:latest .
    ```

    3. Connect with Computer

    ```python
    from computer import Computer

    computer = Computer(
        os_type="linux",
        provider_type="docker",
        image="trycua/cua-ubuntu:latest",
        name="my-kasm-sandbox"
    )

    await computer.run() # Launch & connect to Docker sandbox
    ```

  </Tab>
</Tabs>

## Windows Sandbox

**Windows hosts only - requires Windows 10 Pro/Enterprise or Windows 11**

1. Enable Windows Sandbox
2. Install pywinsandbox dependency

```bash
pip install -U git+https://github.com/karkason/pywinsandbox.git
```

3. Connect with Computer

```python
from computer import Computer

computer = Computer(
    os_type="windows",
    provider_type="winsandbox",
    ephemeral=True # Windows Sandbox is always ephemeral
)

await computer.run() # Launch & connect to Windows Sandbox
```

## macOS Sandbox

**macOS hosts only - requires Lume CLI**

1. Install the Lume CLI

```bash
/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
```

2. Start a local Cua macOS sandbox

```bash
lume run macos-sequoia-cua:latest
```

3. Connect with Computer

```python
from computer import Computer

computer = Computer(
    os_type="macos",
    provider_type="lume",
    name="macos-sequoia-cua:latest"
)

await computer.run() # Launch & connect to the sandbox
```

## Your host desktop

You can also have agents control your desktop directly by running Computer Server without any containerization layer. Beware that AI models may perform risky actions.

```bash
pip install cua-computer-server
python -m computer_server
```

Connect with:

<Tabs items={['Python']}>
  <Tab value="Python">
    ```python

    computer = Computer(use_host_computer_server=True)
    await computer.run() # Connect to the host desktop

    ```

  </Tab>
</Tabs>

```

--------------------------------------------------------------------------------
/libs/lumier/src/bin/entry.sh:
--------------------------------------------------------------------------------

```bash
#!/usr/bin/env bash

# Configure SSH to prevent known hosts warnings
export SSHPASS_PROMPT=
export SSH_ASKPASS=/bin/echo
# Set SSH quiet mode via the SSHPASS environment variable
export SSHPASS_OPTS="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR -q"

# We'll enable strict error checking AFTER initialization
# to prevent premature exits

# Source configuration files
CONFIG_DIR="/run/config"
LIB_DIR="/run/lib"

# Source constants if available
if [ -f "${CONFIG_DIR}/constants.sh" ]; then
  source "${CONFIG_DIR}/constants.sh"
fi

# Import utilities
for lib in "${LIB_DIR}"/*.sh; do
  if [ -f "$lib" ]; then
    source "$lib"
  fi
done

# Set VM_NAME to env or fallback to container name (from --name)
if [ -z "${VM_NAME:-}" ]; then
    VM_NAME="$(cat /etc/hostname)"
    export VM_NAME
fi

# Set HOST_STORAGE_PATH to a lume ephemeral storage if not set
if [ -z "${HOST_STORAGE_PATH:-}" ]; then
    HOST_STORAGE_PATH="ephemeral"
    
    # Tell user that ephemeral storage is being used
    echo "Using ephemeral storage. VM state will be lost when macOS cleans up temporary files."
    
    export HOST_STORAGE_PATH
fi

# Only check and report mountpoints in debug mode
if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
    if mountpoint -q /storage; then
        echo "/storage is mounted"
    fi
    if mountpoint -q /shared; then
        echo "/shared is mounted"
    fi
    # if mountpoint -q /data; then
    #     echo "/data is mounted"
    # fi
fi

# Check if we're running as PID 1 (important for Docker signal handling)
if [ $$ -ne 1 ]; then
    echo "Warning: This script is not running as PID 1 (current PID: $$)."
    echo "Docker signal handling may not work properly when stopped from Docker Desktop."
fi

# Log startup info
echo "Lumier VM is starting..."

# Cleanup function to ensure VM and noVNC proxy shutdown on container stop
# Counter for signal handling
SIGNAL_COUNT=0

cleanup() {
  local signal_name=$1
  set +e  # Don't exit on error in cleanup
  
  # Increment signal counter
  SIGNAL_COUNT=$((SIGNAL_COUNT + 1))
  
  # If this is the first signal, try graceful shutdown
  if [ $SIGNAL_COUNT -eq 1 ]; then
    echo "[cleanup] Caught $signal_name signal, shutting down..."
    
    # Check if we're in the middle of an image pull
    if [[ "$PULL_IN_PROGRESS" == "1" ]]; then
      echo "[cleanup] Interrupted during image pull, skipping VM stop."
    else
      echo "[cleanup] Stopping VM..."
      stop_vm true
    fi
    
    # Attempt to clean up the VM when using ephemeral storage
    if [[ "$HOST_STORAGE_PATH" == "ephemeral" ]]; then
      # First check if VM actually exists
      VM_INFO=$(lume_get "$VM_NAME" "$HOST_STORAGE_PATH" "json" "false")
      
      # Only try VM deletion if VM exists and not in the middle of a pull
      if [[ "$PULL_IN_PROGRESS" != "1" && $VM_INFO != *"Virtual machine not found"* ]]; then
        echo "[cleanup] Cleaning up VM..."
        lume_delete "$VM_NAME" "$HOST_STORAGE_PATH" > /dev/null 2>&1
      fi
    fi
  else
    # Subsequent signals: warn now; we force an immediate exit after the third
    echo "got $SIGNAL_COUNT SIGTERM/SIGINTs, forcefully exiting"
  fi
  
  # If we've received multiple signals, just exit immediately
  if [ $SIGNAL_COUNT -ge 3 ]; then
    exit 1
  fi
  
  # Exit with success for the first signal
  if [ $SIGNAL_COUNT -eq 1 ]; then
    exit 0
  fi
}
# Ensure we catch all typical container termination signals
trap 'cleanup SIGTERM' SIGTERM
trap 'cleanup SIGINT' SIGINT
trap 'cleanup SIGHUP' SIGHUP

# Now enable strict error handling after initialization
set -euo pipefail

# Start the VM with error handling
if ! start_vm; then
    echo "ERROR: Failed to start VM!" >&2
    exit 1
fi

# Start noVNC for VNC access
NOVNC_PID=""
if [ -n "${VNC_PORT:-}" ] && [ -n "${VNC_PASSWORD:-}" ]; then
  # Only show this in debug mode
  if [ "${LUMIER_DEBUG:-0}" == "1" ]; then
    echo "Starting noVNC proxy with optimized color settings..."
  fi
  ${NOVNC_PATH}/utils/novnc_proxy --vnc host.docker.internal:${VNC_PORT} --listen 8006 --web ${NOVNC_PATH} > /dev/null 2>&1 &
  NOVNC_PID=$!
  disown $NOVNC_PID
  echo "noVNC interface available at: http://localhost:8006/vnc.html?password=${VNC_PASSWORD}&autoconnect=true (replace PORT with the port you forwarded to 8006)"
fi

echo "Lumier is running. Press Ctrl+C to stop."

# Instead of tail -f /dev/null, use a wait loop that can be interrupted by signals
while true; do
  # Sleep in small increments to make signal handling more responsive
  sleep 1 &
  wait $!
  # Break the loop if we've received a signal
  if [ $SIGNAL_COUNT -gt 0 ]; then
    break
  fi
done
```

--------------------------------------------------------------------------------
/libs/lume/src/Server/Requests.swift:
--------------------------------------------------------------------------------

```swift
import ArgumentParser
import Foundation
import Virtualization

struct RunVMRequest: Codable {
    let noDisplay: Bool?
    let sharedDirectories: [SharedDirectoryRequest]?
    let recoveryMode: Bool?
    let storage: String?

    struct SharedDirectoryRequest: Codable {
        let hostPath: String
        let readOnly: Bool?
    }

    func parse() throws -> [SharedDirectory] {
        guard let sharedDirectories = sharedDirectories else { return [] }

        return try sharedDirectories.map { dir -> SharedDirectory in
            // Validate that the host path exists and is a directory
            var isDirectory: ObjCBool = false
            guard FileManager.default.fileExists(atPath: dir.hostPath, isDirectory: &isDirectory),
                isDirectory.boolValue
            else {
                throw ValidationError(
                    "Host path does not exist or is not a directory: \(dir.hostPath)")
            }

            return SharedDirectory(
                hostPath: dir.hostPath,
                tag: VZVirtioFileSystemDeviceConfiguration.macOSGuestAutomountTag,
                readOnly: dir.readOnly ?? false
            )
        }
    }
}

struct PullRequest: Codable {
    let image: String
    let name: String?
    var registry: String
    var organization: String
    let storage: String?

    enum CodingKeys: String, CodingKey {
        case image, name, registry, organization, storage
    }

    init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        image = try container.decode(String.self, forKey: .image)
        name = try container.decodeIfPresent(String.self, forKey: .name)
        registry = try container.decodeIfPresent(String.self, forKey: .registry) ?? "ghcr.io"
        organization = try container.decodeIfPresent(String.self, forKey: .organization) ?? "trycua"
        storage = try container.decodeIfPresent(String.self, forKey: .storage)
    }
}

struct CreateVMRequest: Codable {
    let name: String
    let os: String
    let cpu: Int
    let memory: String
    let diskSize: String
    let display: String
    let ipsw: String?
    let storage: String?

    func parse() throws -> (memory: UInt64, diskSize: UInt64) {
        return (
            memory: try parseSize(memory),
            diskSize: try parseSize(diskSize)
        )
    }
}

struct SetVMRequest: Codable {
    let cpu: Int?
    let memory: String?
    let diskSize: String?
    let display: String?
    let storage: String?

    func parse() throws -> (memory: UInt64?, diskSize: UInt64?, display: VMDisplayResolution?) {
        return (
            memory: try memory.map { try parseSize($0) },
            diskSize: try diskSize.map { try parseSize($0) },
            display: try display.map {
                guard let resolution = VMDisplayResolution(string: $0) else {
                    throw ValidationError(
                        "Invalid display resolution format: \($0). Expected format: WIDTHxHEIGHT")
                }
                return resolution
            }
        )
    }
}

struct CloneRequest: Codable {
    let name: String
    let newName: String
    let sourceLocation: String?
    let destLocation: String?
}

struct PushRequest: Codable {
    let name: String // Name of the local VM
    let imageName: String // Base name for the image in the registry
    let tags: [String] // List of tags to push
    var registry: String // Registry URL
    var organization: String // Organization/user in the registry
    let storage: String? // Optional VM storage location or direct path
    var chunkSizeMb: Int // Chunk size
    // dryRun and reassemble are less common for API, default to false?
    // verbose is usually handled by server logging

    enum CodingKeys: String, CodingKey {
        case name, imageName, tags, registry, organization, storage, chunkSizeMb
    }

    // Provide default values for optional fields during decoding
    init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        name = try container.decode(String.self, forKey: .name)
        imageName = try container.decode(String.self, forKey: .imageName)
        tags = try container.decode([String].self, forKey: .tags)
        registry = try container.decodeIfPresent(String.self, forKey: .registry) ?? "ghcr.io"
        organization = try container.decodeIfPresent(String.self, forKey: .organization) ?? "trycua"
        storage = try container.decodeIfPresent(String.self, forKey: .storage)
        chunkSizeMb = try container.decodeIfPresent(Int.self, forKey: .chunkSizeMb) ?? 512
    }
}

```

--------------------------------------------------------------------------------
/libs/lume/src/FileSystem/VMConfig.swift:
--------------------------------------------------------------------------------

```swift
import ArgumentParser
import Foundation
import Virtualization

/// Represents a shared directory configuration
struct SharedDirectory: Codable {
    let hostPath: String
    let tag: String
    let readOnly: Bool

    var string: String {
        return "\(hostPath):\(tag):\(readOnly ? "ro" : "rw")"
    }
}

// MARK: - VMConfig
struct VMConfig: Codable {
    
    // MARK: - Properties
    let os: String
    private var _cpuCount: Int?
    private var _memorySize: UInt64?
    private var _diskSize: UInt64?
    private var _macAddress: String?
    private var _display: VMDisplayResolution
    private var _hardwareModel: Data?
    private var _machineIdentifier: Data?
    
    // MARK: - Initialization
    init(
        os: String,
        cpuCount: Int? = nil,
        memorySize: UInt64? = nil,
        diskSize: UInt64? = nil,
        macAddress: String? = nil,
        display: String,
        hardwareModel: Data? = nil,
        machineIdentifier: Data? = nil
    ) throws {
        self.os = os
        self._cpuCount = cpuCount
        self._memorySize = memorySize
        self._diskSize = diskSize
        self._macAddress = macAddress
        self._display = VMDisplayResolution(string: display) ?? VMDisplayResolution(string: "1024x768")!
        self._hardwareModel = hardwareModel
        self._machineIdentifier = machineIdentifier
    }
    
    var display: VMDisplayResolution {
        get { _display }
        set { _display = newValue }
    }
    
    var cpuCount: Int? {
        get { _cpuCount }
        set { _cpuCount = newValue }
    }
    
    var memorySize: UInt64? {
        get { _memorySize }
        set { _memorySize = newValue }
    }
    
    var diskSize: UInt64? {
        get { _diskSize }
        set { _diskSize = newValue }
    }

    var hardwareModel: Data? {
        get { _hardwareModel }
        set { _hardwareModel = newValue }
    }

    var machineIdentifier: Data? {
        get { _machineIdentifier }
        set { _machineIdentifier = newValue }
    }

    var macAddress: String? {
        get { _macAddress }
        set { _macAddress = newValue }
    }
    
    mutating func setCpuCount(_ count: Int) {
        _cpuCount = count
    }
    
    mutating func setMemorySize(_ size: UInt64) {
        _memorySize = size
    }
    
    mutating func setDiskSize(_ size: UInt64) {
        _diskSize = size
    }

    mutating func setHardwareModel(_ hardwareModel: Data) {
        _hardwareModel = hardwareModel
    }

    mutating func setMachineIdentifier(_ machineIdentifier: Data) {
        _machineIdentifier = machineIdentifier
    }

    mutating func setMacAddress(_ newMacAddress: String) {
        self._macAddress = newMacAddress
    }

    mutating func setDisplay(_ newDisplay: VMDisplayResolution) {
        self._display = newDisplay
    }

    // MARK: - Codable
    enum CodingKeys: String, CodingKey {
        case _cpuCount = "cpuCount"
        case _memorySize = "memorySize"
        case _diskSize = "diskSize"
        case macAddress
        case display
        case _hardwareModel = "hardwareModel"
        case _machineIdentifier = "machineIdentifier"
        case os
    }
    
    init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        
        os = try container.decode(String.self, forKey: .os)
        _cpuCount = try container.decodeIfPresent(Int.self, forKey: ._cpuCount)
        _memorySize = try container.decodeIfPresent(UInt64.self, forKey: ._memorySize)
        _diskSize = try container.decodeIfPresent(UInt64.self, forKey: ._diskSize)
        _macAddress = try container.decodeIfPresent(String.self, forKey: .macAddress)
        _display = VMDisplayResolution(string: try container.decode(String.self, forKey: .display))!
        _hardwareModel = try container.decodeIfPresent(Data.self, forKey: ._hardwareModel)
        _machineIdentifier = try container.decodeIfPresent(Data.self, forKey: ._machineIdentifier)
    }
    
    func encode(to encoder: Encoder) throws {
        var container = encoder.container(keyedBy: CodingKeys.self)
        
        try container.encodeIfPresent(os, forKey: .os)
        try container.encodeIfPresent(_cpuCount, forKey: ._cpuCount)
        try container.encodeIfPresent(_memorySize, forKey: ._memorySize)
        try container.encodeIfPresent(_diskSize, forKey: ._diskSize)
        try container.encodeIfPresent(_macAddress, forKey: .macAddress)
        try container.encode(display.string, forKey: .display)
        try container.encodeIfPresent(_hardwareModel, forKey: ._hardwareModel)
        try container.encodeIfPresent(_machineIdentifier, forKey: ._machineIdentifier)
    }
}

```

--------------------------------------------------------------------------------
/libs/python/computer-server/computer_server/cli.py:
--------------------------------------------------------------------------------

```python
"""
Command-line interface for the Computer API server.
"""

import argparse
import asyncio
import logging
import os
import sys
import threading
from typing import List, Optional

from .server import Server

logger = logging.getLogger(__name__)


def parse_args(args: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(description="Start the Computer API server")
    parser.add_argument(
        "--host", default="0.0.0.0", help="Host to bind the server to (default: 0.0.0.0)"
    )
    parser.add_argument(
        "--port", type=int, default=8000, help="Port to bind the server to (default: 8000)"
    )
    parser.add_argument(
        "--log-level",
        choices=["debug", "info", "warning", "error", "critical"],
        default="info",
        help="Logging level (default: info)",
    )
    parser.add_argument(
        "--ssl-keyfile",
        type=str,
        help="Path to SSL private key file (enables HTTPS)",
    )
    parser.add_argument(
        "--ssl-certfile",
        type=str,
        help="Path to SSL certificate file (enables HTTPS)",
    )
    parser.add_argument(
        "--watchdog",
        action="store_true",
        help="Enable watchdog monitoring (automatically enabled if CONTAINER_NAME env var is set)",
    )
    parser.add_argument(
        "--watchdog-interval",
        type=int,
        default=30,
        help="Watchdog ping interval in seconds (default: 30)",
    )
    parser.add_argument(
        "--no-restart",
        action="store_true",
        help="Disable automatic server restart in watchdog",
    )

    return parser.parse_args(args)


def main() -> None:
    """Main entry point for the CLI."""
    args = parse_args()

    # Configure logging
    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # Check if watchdog should be enabled
    container_name = os.environ.get("CONTAINER_NAME")
    enable_watchdog = (args.watchdog or bool(container_name)) and not sys.platform.startswith("win")

    if container_name:
        logger.info(
            f"Container environment detected (CONTAINER_NAME={container_name}), enabling watchdog"
        )
    elif args.watchdog:
        logger.info("Watchdog explicitly enabled via --watchdog flag")

    # Start watchdog if enabled
    if enable_watchdog:
        logger.info(f"Starting watchdog monitoring with {args.watchdog_interval}s interval")

        def run_watchdog_thread():
            """Run watchdog in a separate thread."""
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                # Create CLI args dict for watchdog
                cli_args = {
                    "host": args.host,
                    "port": args.port,
                    "log_level": args.log_level,
                    "ssl_keyfile": args.ssl_keyfile,
                    "ssl_certfile": args.ssl_certfile,
                }

                # Create watchdog with restart settings
                from .watchdog import Watchdog

                watchdog = Watchdog(cli_args=cli_args, ping_interval=args.watchdog_interval)
                watchdog.restart_enabled = not args.no_restart

                loop.run_until_complete(watchdog.start_monitoring())
            except Exception as e:
                logger.error(f"Watchdog error: {e}")
            finally:
                loop.close()

        # Start watchdog in background thread
        watchdog_thread = threading.Thread(target=run_watchdog_thread, daemon=True, name="watchdog")
        watchdog_thread.start()

    # Create and start the server
    logger.info(f"Starting CUA Computer API server on {args.host}:{args.port}...")

    # Handle SSL configuration
    ssl_args = {}
    if args.ssl_keyfile and args.ssl_certfile:
        ssl_args = {
            "ssl_keyfile": args.ssl_keyfile,
            "ssl_certfile": args.ssl_certfile,
        }
        logger.info("HTTPS mode enabled with SSL certificates")
    elif args.ssl_keyfile or args.ssl_certfile:
        logger.warning(
            "Both --ssl-keyfile and --ssl-certfile are required for HTTPS. Running in HTTP mode."
        )
    else:
        logger.info("HTTP mode (no SSL certificates provided)")

    server = Server(host=args.host, port=args.port, log_level=args.log_level, **ssl_args)

    try:
        server.start()
    except KeyboardInterrupt:
        logger.info("Server stopped by user")
        sys.exit(0)
    except Exception as e:
        logger.error(f"Error starting server: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()

```

--------------------------------------------------------------------------------
/libs/lume/src/Virtualization/DarwinImageLoader.swift:
--------------------------------------------------------------------------------

```swift
import Foundation
import Virtualization

/// Handles loading and validation of macOS restore images (IPSW files).
/// Provides functionality to:
/// - Fetch the latest supported macOS restore image URL
/// - Load and validate image requirements for VM creation
/// - Extract hardware model and auxiliary storage configuration
protocol ImageLoader: Sendable {
    typealias ImageRequirements = DarwinImageLoader.ImageRequirements
    func fetchLatestSupportedURL() async throws -> URL
    func loadImageRequirements(from url: URL) async throws -> ImageRequirements
    func downloadLatestImage() async throws -> Path
}

final class DarwinImageLoader: NSObject, ImageLoader, @unchecked Sendable, URLSessionDownloadDelegate {
    struct ImageRequirements: Sendable {
        let hardwareModel: Data
        let minimumSupportedCPUCount: Int
        let minimumSupportedMemorySize: UInt64
    }
    
    enum ImageError: Error {
        case invalidImage
        case unsupportedConfiguration
        case downloadFailed
    }
    
    private var lastLoggedProgress: Double = 0.0
    private var progressLogger = ProgressLogger()
    private var completionHandler: ((URL?, Error?) -> Void)?
    
    func fetchLatestSupportedURL() async throws -> URL {
        try await withCheckedThrowingContinuation { continuation in
            VZMacOSRestoreImage.fetchLatestSupported { result in
                switch result {
                case .success(let image):
                    continuation.resume(returning: image.url)
                case .failure(let error):
                    continuation.resume(throwing: error)
                }
            }
        }
    }
    
    func loadImageRequirements(from url: URL) async throws -> ImageRequirements {
        let image = try await VZMacOSRestoreImage.image(from: url)
        guard let requirements = image.mostFeaturefulSupportedConfiguration else {
            throw ImageError.unsupportedConfiguration
        }
        
        return ImageRequirements(
            hardwareModel: requirements.hardwareModel.dataRepresentation,
            minimumSupportedCPUCount: requirements.minimumSupportedCPUCount,
            minimumSupportedMemorySize: requirements.minimumSupportedMemorySize
        )
    }
    
    func downloadLatestImage() async throws -> Path {
        let url = try await fetchLatestSupportedURL()
        let tempDir = FileManager.default.temporaryDirectory
        let downloadPath = tempDir.appendingPathComponent("latest.ipsw")
        
        // Reset progress logger state
        progressLogger = ProgressLogger(threshold: 0.01)
        
        // Create a continuation to wait for download completion
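        // Note: storing a single completionHandler assumes one download at a
        // time; concurrent calls to downloadLatestImage would overwrite it.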
        return try await withCheckedThrowingContinuation { continuation in
            let session = URLSession(configuration: .default, delegate: self, delegateQueue: nil)
            let task = session.downloadTask(with: url)
            
            // Use the delegate method to handle completion
            self.completionHandler = { location, error in
                if let error = error {
                    continuation.resume(throwing: error)
                    return
                }
                
                do {
                    // Remove existing file if it exists
                    if FileManager.default.fileExists(atPath: downloadPath.path) {
                        try FileManager.default.removeItem(at: downloadPath)
                    }
                    
                    try FileManager.default.moveItem(at: location!, to: downloadPath)
                    Logger.info("Download completed and moved to: \(downloadPath.path)")
                    continuation.resume(returning: Path(downloadPath.path))
                } catch {
                    continuation.resume(throwing: error)
                }
            }
            
            task.resume()
        }
    }
    
    func urlSession(_ session: URLSession, downloadTask: URLSessionDownloadTask, didWriteData bytesWritten: Int64, totalBytesWritten: Int64, totalBytesExpectedToWrite: Int64) {
        let progress = Double(totalBytesWritten) / Double(totalBytesExpectedToWrite)
        progressLogger.logProgress(current: progress, context: "Downloading IPSW")
    }
    
    func urlSession(_ session: URLSession, downloadTask: URLSessionDownloadTask, didFinishDownloadingTo location: URL) {
        // Call the stored completion handler
        completionHandler?(location, nil)
    }
    
    func urlSession(_ session: URLSession, task: URLSessionTask, didCompleteWithError error: Error?) {
        // Call the stored completion handler with an error if it occurred
        if let error = error {
            completionHandler?(nil, error)
        }
    }
}
```

--------------------------------------------------------------------------------
/examples/agent_examples.py:
--------------------------------------------------------------------------------

```python
"""Example demonstrating the ComputerAgent capabilities with the Omni provider."""

import asyncio
import logging
import signal
import traceback

# Import the unified agent class and types
from agent import ComputerAgent
from computer import Computer, VMProviderType

# Import utility functions
from utils import handle_sigint, load_dotenv_files

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


async def run_agent_example():
    """Run example of using the ComputerAgent with different models."""
    print("\n=== Example: ComputerAgent with different models ===")

    try:
        # Create a local macOS computer
        computer = Computer(
            os_type="macos",
            verbosity=logging.DEBUG,
        )

        # Create a remote Linux computer with Cua
        # computer = Computer(
        #     os_type="linux",
        #     api_key=os.getenv("CUA_API_KEY"),
        #     name=os.getenv("CUA_CONTAINER_NAME"),
        #     provider_type=VMProviderType.CLOUD,
        # )

        # Create ComputerAgent with new API
        agent = ComputerAgent(
            # Supported models:
            # == OpenAI CUA (computer-use-preview) ==
            model="openai/computer-use-preview",
            # == Anthropic CUA (Claude > 3.5) ==
            # model="anthropic/claude-opus-4-20250514",
            # model="anthropic/claude-sonnet-4-20250514",
            # model="anthropic/claude-3-7-sonnet-20250219",
            # model="anthropic/claude-sonnet-4-5-20250929",
            # == UI-TARS ==
            # model="huggingface-local/ByteDance-Seed/UI-TARS-1.5-7B",
            # model="mlx/mlx-community/UI-TARS-1.5-7B-6bit",
            # model="ollama_chat/0000/ui-tars-1.5-7b",
            # == Omniparser + Any LLM ==
            # model="omniparser+anthropic/claude-opus-4-20250514",
            # model="omniparser+ollama_chat/gemma3:12b-it-q4_K_M",
            # == Omniparser + Vertex AI Gemini 3 (with thinking_level) ==
            # model="omni+vertex_ai/gemini-3-flash",
            # thinking_level="high",  # or "low"
            # media_resolution="medium",  # or "low" or "high"
            tools=[computer],
            only_n_most_recent_images=3,
            verbosity=logging.DEBUG,
            trajectory_dir="trajectories",
            use_prompt_caching=True,
            max_trajectory_budget=1.0,
        )

        # Example tasks to demonstrate the agent
        tasks = [
            "Look for a repository named trycua/cua on GitHub.",
            "Check the open issues, open the most recent one and read it.",
            "Clone the repository in users/lume/projects if it doesn't exist yet.",
            "Open the repository with an app named Cursor (on the dock, black background and white cube icon).",
            "From Cursor, open Composer if not already open.",
            "Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.",
        ]

        # Use message-based conversation history
        history = []

        for i, task in enumerate(tasks):
            print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")

            # Add user message to history
            history.append({"role": "user", "content": task})

            # Run agent with conversation history
            async for result in agent.run(history, stream=False):
                # Add agent outputs to history
                history += result.get("output", [])

                # Print output for debugging
                for item in result.get("output", []):
                    if item.get("type") == "message":
                        content = item.get("content", [])
                        for content_part in content:
                            if content_part.get("text"):
                                print(f"Agent: {content_part.get('text')}")
                    elif item.get("type") == "computer_call":
                        action = item.get("action", {})
                        action_type = action.get("type", "")
                        print(f"Computer Action: {action_type}({action})")
                    elif item.get("type") == "computer_call_output":
                        print("Computer Output: [Screenshot/Result]")

            print(f"✅ Task {i+1}/{len(tasks)} completed: {task}")

    except Exception as e:
        logger.error(f"Error in run_agent_example: {e}")
        traceback.print_exc()
        raise


def main():
    """Run the Anthropic agent example."""
    try:
        load_dotenv_files()

        # Register signal handler for graceful exit
        signal.signal(signal.SIGINT, handle_sigint)

        asyncio.run(run_agent_example())
    except Exception as e:
        print(f"Error running example: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()

```

--------------------------------------------------------------------------------
/examples/computer_examples_windows.py:
--------------------------------------------------------------------------------

```python
import asyncio
import os
import sys
import traceback
from pathlib import Path

# Load environment variables from .env file
project_root = Path(__file__).parent.parent
env_file = project_root / ".env"
print(f"Loading environment from: {env_file}")
from computer.helpers import sandboxed
from dotenv import load_dotenv

load_dotenv(env_file)

# Add paths to sys.path if needed
pythonpath = os.environ.get("PYTHONPATH", "")
for path in pythonpath.split(":"):
    if path and path not in sys.path:
        sys.path.insert(0, path)  # Insert at beginning to prioritize
        print(f"Added to sys.path: {path}")

from computer.computer import Computer
from computer.logger import LogLevel
from computer.providers.base import VMProviderType

# ANSI color codes
RED = "\033[91m"
RESET = "\033[0m"


async def main():
    try:
        print("\n=== Using direct initialization ===")

        # Create a remote Windows computer with Cua
        computer = Computer(
            os_type="windows",
            api_key=os.getenv("CUA_API_KEY"),
            name=os.getenv("CONTAINER_NAME") or "",
            provider_type=VMProviderType.CLOUD,
        )

        try:
            # Run the computer with default parameters
            await computer.run()

            # Create output directory if it doesn't exist
            output_dir = Path("./output")
            output_dir.mkdir(exist_ok=True)

            # Keyboard Actions Examples
            print("\n=== Keyboard Actions ===")
            await computer.interface.type_text("Hello, World!")
            await computer.interface.press_key("enter")

            # Mouse Actions Examples
            print("\n=== Mouse Actions ===")
            await computer.interface.move_cursor(100, 100)
            await computer.interface.left_click()
            await computer.interface.double_click(400, 400)
            await computer.interface.right_click(300, 300)

            print("\n=== RPC ===")
            await computer.venv_install("demo_venv", ["mss"])

            @sandboxed("demo_venv")
            def greet_and_print(name):
                import os

                from mss import mss

                # get username
                username = os.getlogin()
                print(f"Hello from inside the container, {name}!")
                print("Username:", username)
                print("Screens:", mss().monitors)

                # take a screenshot
                with mss() as sct:
                    filename = sct.shot(mon=-1, output="C:/Users/azureuser/Desktop/fullscreen.png")
                    print(filename)

                return {"greeted": name, "username": username}

            # Call with args and kwargs
            result = await greet_and_print("John Doe")
            print("Result from sandboxed function:", result)

            # Command Actions Examples
            print("\n=== Command Actions ===")
            result = await computer.interface.run_command("notepad")
            print("Result from command:", result)

            screenshot = await computer.interface.screenshot()
            screenshot_path = output_dir / "screenshot.png"
            with open(screenshot_path, "wb") as f:
                f.write(screenshot)
            print(f"Screenshot saved to: {screenshot_path.absolute()}")

            # Clipboard Actions Examples
            print("\n=== Clipboard Actions ===")
            await computer.interface.set_clipboard("Test clipboard")
            content = await computer.interface.copy_to_clipboard()
            print(f"Clipboard content: {content}")

            # Simple REPL Loop
            print("\n=== Command REPL ===")
            print("Enter commands to run on the remote computer.")
            print("Type 'exit' or 'quit' to leave the REPL.\n")

            while True:
                try:
                    # Get command from user
                    command = input("command> ").strip()

                    # Check for exit commands
                    if command.lower() in ["exit", "quit", ""]:
                        if command.lower() in ["exit", "quit"]:
                            print("Exiting REPL...")
                        break

                    # Run the command
                    result = await computer.interface.run_command(command)

                    print(result.stdout)
                    if result.stderr:
                        print(f"{RED}{result.stderr}{RESET}")
                except KeyboardInterrupt:
                    print("\nExiting REPL...")
                    break
                except Exception as e:
                    print(f"{RED}Error running command: {e}{RESET}")

        finally:
            # Important to clean up resources
            # await computer.stop()
            pass
    except Exception as e:
        print(f"Error in main: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    asyncio.run(main())

```

--------------------------------------------------------------------------------
/libs/python/agent/agent/computers/cua.py:
--------------------------------------------------------------------------------

```python
"""
Computer handler implementation for OpenAI computer-use-preview protocol.
"""

import base64
from typing import Any, Dict, List, Literal, Optional, Union

from computer import Computer

from .base import AsyncComputerHandler


class cuaComputerHandler(AsyncComputerHandler):
    """Computer handler that implements the Computer protocol using the computer interface."""

    def __init__(self, cua_computer: Computer):
        """Initialize with a computer interface (from tool schema)."""
        self.cua_computer = cua_computer
        self.interface = None

    async def _initialize(self):
        if hasattr(self.cua_computer, "_initialized") and not self.cua_computer._initialized:
            await self.cua_computer.run()
        self.interface = self.cua_computer.interface

    # ==== Computer-Use-Preview Action Space ====

    async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
        """Get the current environment type."""
        # TODO: detect actual environment
        return "linux"

    async def get_dimensions(self) -> tuple[int, int]:
        """Get screen dimensions as (width, height)."""
        assert self.interface is not None
        screen_size = await self.interface.get_screen_size()
        return screen_size["width"], screen_size["height"]

    async def screenshot(self, text: Optional[str] = None) -> str:
        """Take a screenshot and return as base64 string.

        Args:
            text: Optional descriptive text (for compatibility with GPT-4o models, ignored)
        """
        assert self.interface is not None
        screenshot_bytes = await self.interface.screenshot()
        return base64.b64encode(screenshot_bytes).decode("utf-8")

    async def click(self, x: int, y: int, button: str = "left") -> None:
        """Click at coordinates with specified button."""
        assert self.interface is not None
        if button == "left":
            await self.interface.left_click(x, y)
        elif button == "right":
            await self.interface.right_click(x, y)
        else:
            # Default to left click for unknown buttons
            await self.interface.left_click(x, y)

    async def double_click(self, x: int, y: int) -> None:
        """Double click at coordinates."""
        assert self.interface is not None
        await self.interface.double_click(x, y)

    async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
        """Scroll at coordinates with specified scroll amounts."""
        assert self.interface is not None
        await self.interface.move_cursor(x, y)
        await self.interface.scroll(scroll_x, scroll_y)

    async def type(self, text: str) -> None:
        """Type text."""
        assert self.interface is not None
        await self.interface.type_text(text)

    async def wait(self, ms: int = 1000) -> None:
        """Wait for specified milliseconds."""
        assert self.interface is not None
        import asyncio

        await asyncio.sleep(ms / 1000.0)

    async def move(self, x: int, y: int) -> None:
        """Move cursor to coordinates."""
        assert self.interface is not None
        await self.interface.move_cursor(x, y)

    async def keypress(self, keys: Union[List[str], str]) -> None:
        """Press key combination."""
        assert self.interface is not None
        if isinstance(keys, str):
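            # Normalize "ctrl-c" or "ctrl+c" style strings into ["ctrl", "c"]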
            keys = keys.replace("-", "+").split("+")
        if len(keys) == 1:
            await self.interface.press_key(keys[0])
        else:
            # Handle key combinations
            await self.interface.hotkey(*keys)

    async def drag(self, path: List[Dict[str, int]]) -> None:
        """Drag along specified path."""
        assert self.interface is not None
        if not path:
            return

        # Start drag from first point
        start = path[0]
        await self.interface.mouse_down(start["x"], start["y"])

        # Move through path
        for point in path[1:]:
            await self.interface.move_cursor(point["x"], point["y"])

        # End drag at last point
        end = path[-1]
        await self.interface.mouse_up(end["x"], end["y"])

    async def get_current_url(self) -> str:
        """Get current URL (for browser environments)."""
        # This would need to be implemented based on the specific browser interface
        # For now, return empty string
        return ""

    # ==== Anthropic Computer Action Space ====
    async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse down at coordinates."""
        assert self.interface is not None
        await self.interface.mouse_down(x, y, button="left")

    async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
        """Left mouse up at coordinates."""
        assert self.interface is not None
        await self.interface.mouse_up(x, y, button="left")

```

--------------------------------------------------------------------------------
/libs/python/agent/tests/test_computer_agent.py:
--------------------------------------------------------------------------------

```python
"""Unit tests for ComputerAgent class.

This file tests ONLY the ComputerAgent initialization and basic functionality.
Following SRP: This file tests ONE class (ComputerAgent).
All external dependencies (liteLLM, Computer) are mocked.
"""

from unittest.mock import AsyncMock, patch

import pytest


class TestComputerAgentInitialization:
    """Test ComputerAgent initialization (SRP: Only tests initialization)."""

    @patch("agent.agent.litellm")
    def test_agent_initialization_with_model(self, mock_litellm, disable_telemetry):
        """Test that agent can be initialized with a model string."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")

        assert agent is not None
        assert hasattr(agent, "model")
        assert agent.model == "anthropic/claude-sonnet-4-5-20250929"

    @patch("agent.agent.litellm")
    def test_agent_initialization_with_tools(self, mock_litellm, disable_telemetry, mock_computer):
        """Test that agent can be initialized with tools."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929", tools=[mock_computer])

        assert agent is not None
        assert hasattr(agent, "tools")

    @patch("agent.agent.litellm")
    def test_agent_initialization_with_max_budget(self, mock_litellm, disable_telemetry):
        """Test that agent can be initialized with max trajectory budget."""
        from agent import ComputerAgent

        budget = 5.0
        agent = ComputerAgent(
            model="anthropic/claude-sonnet-4-5-20250929", max_trajectory_budget=budget
        )

        assert agent is not None

    @patch("agent.agent.litellm")
    def test_agent_requires_model(self, mock_litellm, disable_telemetry):
        """Test that agent requires a model parameter."""
        from agent import ComputerAgent

        with pytest.raises(TypeError):
            # Should fail without model parameter - intentionally missing required argument
            ComputerAgent()  # type: ignore[call-arg]


class TestComputerAgentRun:
    """Test ComputerAgent.run() method (SRP: Only tests run logic)."""

    @pytest.mark.asyncio
    @patch("agent.agent.litellm")
    async def test_agent_run_with_messages(self, mock_litellm, disable_telemetry, sample_messages):
        """Test that agent.run() works with valid messages."""
        from agent import ComputerAgent

        # Mock liteLLM response
        mock_response = {
            "id": "chatcmpl-test",
            "choices": [
                {
                    "message": {"role": "assistant", "content": "Test response"},
                    "finish_reason": "stop",
                }
            ],
            "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
        }

        mock_litellm.acompletion = AsyncMock(return_value=mock_response)

        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")

        # Run should return an async generator
        result_generator = agent.run(sample_messages)

        assert result_generator is not None
        # Check it's an async generator
        assert hasattr(result_generator, "__anext__")

    def test_agent_has_run_method(self, disable_telemetry):
        """Test that agent has run method available."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")

        # Verify run method exists
        assert hasattr(agent, "run")
        assert callable(agent.run)

    def test_agent_has_agent_loop(self, disable_telemetry):
        """Test that agent has agent_loop initialized."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")

        # Verify agent_loop is initialized
        assert hasattr(agent, "agent_loop")
        assert agent.agent_loop is not None


class TestComputerAgentTypes:
    """Test AgentResponse and Messages types (SRP: Only tests type definitions)."""

    def test_messages_type_exists(self):
        """Test that Messages type is exported."""
        from agent import Messages

        assert Messages is not None

    def test_agent_response_type_exists(self):
        """Test that AgentResponse type is exported."""
        from agent import AgentResponse

        assert AgentResponse is not None


class TestComputerAgentIntegration:
    """Test ComputerAgent integration with Computer tool (SRP: Integration within package)."""

    def test_agent_accepts_computer_tool(self, disable_telemetry, mock_computer):
        """Test that agent can be initialized with Computer tool."""
        from agent import ComputerAgent

        agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929", tools=[mock_computer])

        # Verify agent accepted the tool
        assert agent is not None
        assert hasattr(agent, "tools")

```

--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/mcp-server/usage.mdx:
--------------------------------------------------------------------------------

```markdown
---
title: Usage
---

## Basic Usage

Once configured, you can simply ask Claude to perform computer tasks:

- "Open Chrome and go to github.com"
- "Create a folder called 'Projects' on my desktop"
- "Find all PDFs in my Downloads folder"
- "Take a screenshot and highlight the error message"

Claude will automatically use your CUA agent to perform these tasks.

## Advanced Features

### Progress Reporting

The MCP server provides real-time progress updates during task execution:

- Task progress is reported as percentages (0-100%)
- Multi-task operations show progress for each individual task
- Progress updates are streamed to the MCP client for real-time feedback

### Error Handling

Robust error handling ensures reliable operation:

- Failed tasks return error messages with screenshots when possible
- Session state is preserved even when individual tasks fail
- Automatic cleanup prevents resource leaks
- Detailed error logging for troubleshooting

### Concurrent Task Execution

For improved performance, multiple tasks can run concurrently:

- Set `concurrent=true` in `run_multi_cua_tasks` for parallel execution
- Each task runs in its own context with isolated state
- Progress tracking works for both sequential and concurrent modes
- Resource pooling ensures efficient computer instance usage

### Session Management

Multi-client support with automatic resource management:

- Each client gets isolated sessions with separate computer instances
- Sessions automatically clean up after 10 minutes of inactivity
- Resource pooling prevents resource exhaustion
- Session statistics available for monitoring

## Target Computer Options

By default, the MCP server runs CUA in a virtual machine for safety. However, you can also configure it to run on your local system.

### Default: Using a VM (Recommended)

The MCP server will automatically start and connect to a VM based on your platform. This is the safest option as AI actions are isolated from your host system.

No additional configuration is needed - this is the default behavior.

### Option: Targeting Your Local Desktop

<Callout type="warn">
  **Warning:** When targeting your local system, AI models have direct access to your desktop and
  may perform risky actions. Use with caution.
</Callout>

To have the MCP server control your local desktop instead of a VM:

1. **Start the Computer Server on your host:**

```bash
pip install cua-computer-server
python -m computer_server
```

2. **Configure the MCP server to use your host system:**

Add the `CUA_USE_HOST_COMPUTER_SERVER` environment variable to your MCP client configuration:

<Tabs items={['Claude Desktop', 'Other MCP Clients']}>
  <Tab value="Claude Desktop">
    Update your Claude Desktop config (see [Installation](/docs/agent-sdk/mcp-server/installation)) to include the environment variable:

    ```json
    {
      "mcpServers": {
        "cua-agent": {
          "command": "/bin/bash",
          "args": ["~/.cua/start_mcp_server.sh"],
          "env": {
            "CUA_MODEL_NAME": "anthropic/claude-sonnet-4-5-20250929",
            "CUA_USE_HOST_COMPUTER_SERVER": "true"
          }
        }
      }
    }
    ```

  </Tab>
  <Tab value="Other MCP Clients">
    Set the environment variable in your MCP client configuration:

    ```bash
    export CUA_USE_HOST_COMPUTER_SERVER=true
    ```

    Then start your MCP client as usual.

  </Tab>
</Tabs>

3. **Restart your MCP client** (e.g., Claude Desktop) to apply the changes.

Now Claude will control your local desktop directly when you ask it to perform computer tasks.

## Usage Examples

### Single Task Execution

```
"Open Safari and navigate to apple.com"
"Create a new folder on the desktop called 'My Projects'"
"Take a screenshot of the current screen"
```

### Multi-Task Execution (Sequential)

```
"Run these tasks in order: 1) Open Finder, 2) Navigate to Documents folder, 3) Create a new folder called 'Work'"
```

### Multi-Task Execution (Concurrent)

```
"Run these tasks simultaneously: 1) Open Chrome, 2) Open Safari, 3) Open Finder"
```

### Session Management

```
"Show me the current session statistics"
"Take a screenshot using session abc123"
"Cleanup session xyz789"
```

### Error Recovery

```
"Try to open a non-existent application and show me the error"
"Find all files with .tmp extension and delete them safely"
```

## First-time Usage Notes

**API Keys**: Ensure you have valid API keys:

- Add your Anthropic API key in the Claude Desktop config (as shown above)
- Or set it as an environment variable in your shell profile
- **Required**: The MCP server needs an API key to authenticate with the model provider

**Model Selection**: Choose the appropriate model for your needs:

- **Claude Sonnet 4**: Latest model with best performance (`anthropic/claude-sonnet-4-20250514`)
- **Computer-Use Preview**: Specialized for computer tasks (`openai/computer-use-preview`)
- **Local Models**: For privacy-sensitive environments
- **Ollama**: For offline usage

```

--------------------------------------------------------------------------------
/libs/lume/src/VNC/VNCService.swift:
--------------------------------------------------------------------------------

```swift
import Foundation
import Dynamic
import Virtualization

/// Protocol defining the interface for VNC server operations
@MainActor
protocol VNCService {
    var url: String? { get }
    func start(port: Int, virtualMachine: Any?) async throws
    func stop()
    func openClient(url: String) async throws
}

/// Default implementation of VNCService
@MainActor
final class DefaultVNCService: VNCService {
    private var vncServer: Any?
    private let vmDirectory: VMDirectory
    
    init(vmDirectory: VMDirectory) {
        self.vmDirectory = vmDirectory
    }
    
    var url: String? {
        get {
            return try? vmDirectory.loadSession().url
        }
    }
    
    func start(port: Int, virtualMachine: Any?) async throws {
        let password = Array(PassphraseGenerator().prefix(4)).joined(separator: "-")
        let securityConfiguration = Dynamic._VZVNCAuthenticationSecurityConfiguration(password: password)
        
        // Create VNC server with specified port
        let server = Dynamic._VZVNCServer(port: port, queue: DispatchQueue.main,
                                      securityConfiguration: securityConfiguration)
        
        if let vm = virtualMachine as? VZVirtualMachine {
            server.virtualMachine = vm
        }
        server.start()
        
        vncServer = server
        
        // Wait for port to be assigned (both for auto-assign and specific port)
        var attempts = 0
        let maxAttempts = 20  // 1 second total wait time
        while true {
            if let assignedPort: UInt16 = server.port.asUInt16 {
                // If we got a non-zero port, check if it matches our request
                if assignedPort != 0 {
                    // For specific port requests, verify we got the requested port
                    if port != 0 && Int(assignedPort) != port {
                        throw VMError.vncPortBindingFailed(requested: port, actual: Int(assignedPort))
                    }
                    
                    // Get the local IP address for the URL - prefer IPv4
                    let hostIP = try getLocalIPAddress() ?? "127.0.0.1"
                    let url = "vnc://:\(password)@127.0.0.1:\(assignedPort)"  // Use localhost for local connections
                    let externalUrl = "vnc://:\(password)@\(hostIP):\(assignedPort)"  // External URL for remote connections
                    
                    Logger.info("VNC server started", metadata: [
                        "local": url,
                        "external": externalUrl
                    ])
                    
                    // Save session information with local URL for the client
                    let session = VNCSession(url: url)
                    try vmDirectory.saveSession(session)
                    break
                }
            }
            
            attempts += 1
            if attempts >= maxAttempts {
                // If we've timed out and we requested a specific port, it likely means binding failed
                vncServer = nil
                if port != 0 {
                    throw VMError.vncPortBindingFailed(requested: port, actual: -1)
                }
                throw VMError.internalError("Timeout waiting for VNC server to start")
            }
            try await Task.sleep(nanoseconds: 50_000_000)  // 50ms delay between checks
        }
    }
    
    // Modified to prefer IPv4 addresses
    private func getLocalIPAddress() throws -> String? {
        var address: String?
        
        var ifaddr: UnsafeMutablePointer<ifaddrs>?
        guard getifaddrs(&ifaddr) == 0 else {
            return nil
        }
        defer { freeifaddrs(ifaddr) }
        
        var ptr = ifaddr
        while ptr != nil {
            defer { ptr = ptr?.pointee.ifa_next }
            
            let interface = ptr?.pointee
            let family = interface?.ifa_addr.pointee.sa_family
            
            // Only look for IPv4 addresses
            if family == UInt8(AF_INET) {
                let name = String(cString: (interface?.ifa_name)!)
                if name == "en0" { // Primary interface
                    var hostname = [CChar](repeating: 0, count: Int(NI_MAXHOST))
                    getnameinfo(interface?.ifa_addr,
                              socklen_t((interface?.ifa_addr.pointee.sa_len)!),
                              &hostname,
                              socklen_t(hostname.count),
                              nil,
                              0,
                              NI_NUMERICHOST)
                    address = String(cString: hostname, encoding: .utf8)
                    break
                }
            }
        }
        
        return address
    }
    
    func stop() {
        if let server = vncServer as? Dynamic {
            server.stop()
        }
        vncServer = nil
        vmDirectory.clearSession()
    }
    
    func openClient(url: String) async throws {
        let processRunner = DefaultProcessRunner()
        try processRunner.run(executable: "/usr/bin/open", arguments: [url])
    }
} 
```

--------------------------------------------------------------------------------
/libs/typescript/agent/examples/playground-example.html:
--------------------------------------------------------------------------------

```html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>CUA Agent Playground Example</title>
</head>
<body>
    <h1>CUA Agent Playground Example</h1>
    
    <div>
        <h2>Configuration</h2>
        <label for="url">Agent URL:</label><br>
        <input type="text" id="url" placeholder="https://localhost:8000 or peer://peer-id" value="https://localhost:8000" style="width: 400px;"><br><br>
        
        <label for="model">Model:</label><br>
        <input type="text" id="model" placeholder="anthropic/claude-opus-4-1-20250805" value="anthropic/claude-opus-4-1-20250805" style="width: 400px;"><br><br>
    </div>

    <div>
        <h2>Chat</h2>
        <label for="message">Message:</label><br>
        <input type="text" id="message" placeholder="Enter your message here..." style="width: 400px;"><br><br>
        
        <button onclick="sendMessage()">Send Message</button>
        <!-- <button onclick="checkHealth()">Check Health</button> -->
        <button onclick="clearOutput()">Clear Output</button><br><br>
        
        <label for="output">Output:</label><br>
        <textarea id="output" rows="20" cols="80" readonly></textarea>
    </div>

    <script src="https://unpkg.com/[email protected]/dist/peerjs.min.js"></script>
    <script type="module">
        // Import the AgentClient from the built library
        import AgentClient from '/dist/index.js';
        
        let client = null;
        
        // Make functions available globally
        window.sendMessage = sendMessage;
        window.checkHealth = checkHealth;
        window.clearOutput = clearOutput;
        
        function log(message) {
            const output = document.getElementById('output');
            const timestamp = new Date().toLocaleTimeString();
            output.value += `[${timestamp}] ${message}\n`;
            output.scrollTop = output.scrollHeight;
        }
        
        function getClient() {
            const url = document.getElementById('url').value.trim();
            if (!url) {
                log('ERROR: Please enter a URL');
                return null;
            }
            
            // Create new client if URL changed or client doesn't exist
            if (!client || client.url !== url) {
                try {
                    client = new AgentClient(url);
                    client.url = url; // Store URL for comparison
                    log(`Created new client for: ${url}`);
                } catch (error) {
                    log(`ERROR creating client: ${error.message}`);
                    return null;
                }
            }
            
            return client;
        }
        
        async function sendMessage() {
            const messageInput = document.getElementById('message');
            const modelInput = document.getElementById('model');
            
            const message = messageInput.value.trim();
            const model = modelInput.value.trim();
            
            if (!message) {
                log('ERROR: Please enter a message');
                return;
            }
            
            if (!model) {
                log('ERROR: Please enter a model');
                return;
            }
            
            const agentClient = getClient();
            if (!agentClient) return;
            
            try {
                log(`Sending message: "${message}"`);
                log(`Using model: ${model}`);
                
                const request = {
                    model: model,
                    input: message
                };
                
                log('Sending request...');
                const response = await agentClient.responses.create(request);
                
                log('Response received:');
                log(JSON.stringify(response, null, 2));
                
                // Clear the message input
                messageInput.value = '';
                
            } catch (error) {
                log(`ERROR: ${error.message}`);
            }
        }
        
        async function checkHealth() {
            const agentClient = getClient();
            if (!agentClient) return;
            
            try {
                log('Checking health...');
                const health = await agentClient.health();
                log(`Health status: ${health.status}`);
            } catch (error) {
                log(`ERROR checking health: ${error.message}`);
            }
        }
        
        function clearOutput() {
            document.getElementById('output').value = '';
        }
        
        // Allow sending message with Enter key
        document.getElementById('message').addEventListener('keypress', function(e) {
            if (e.key === 'Enter') {
                sendMessage();
            }
        });
        
        // Log initial message
        log('CUA Agent Client Browser Example loaded');
        log('Enter a URL (HTTP/HTTPS or peer://) and model, then send a message');
    </script>
</body>
</html>

```

--------------------------------------------------------------------------------
/docs/src/assets/logo-black.svg:
--------------------------------------------------------------------------------

```
<?xml version="1.0" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 20010904//EN"
 "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">
<svg version="1.0" xmlns="http://www.w3.org/2000/svg"
 width="1000.000000pt" height="1000.000000pt" viewBox="0 0 1000.000000 1000.000000"
 preserveAspectRatio="xMidYMid meet">

<g transform="translate(0.000000,1000.000000) scale(0.100000,-0.100000)"
fill="#000000" stroke="none">
<path d="M4934 9086 c-40 -14 -62 -33 -80 -69 -22 -42 -21 -994 1 -1037 38
-73 174 -101 243 -50 19 14 43 42 53 62 18 35 19 65 19 510 0 471 0 473 -23
513 -38 69 -133 101 -213 71z"/>
<path d="M3702 8472 c-52 -28 -82 -81 -82 -147 0 -67 8 -80 125 -210 44 -49
107 -121 139 -160 165 -196 233 -268 278 -291 58 -29 66 -30 124 -2 67 31 104
86 104 154 0 60 -14 82 -149 235 -42 47 -95 108 -117 135 -23 27 -52 61 -65
75 -13 14 -57 65 -98 112 -41 47 -89 93 -107 102 -42 20 -111 19 -152 -3z"/>
<path d="M6145 8472 c-29 -18 -136 -133 -235 -252 -53 -64 -190 -222 -230
-265 -37 -41 -70 -108 -70 -142 0 -16 10 -49 23 -73 17 -36 33 -51 79 -73 57
-29 57 -29 107 -12 44 14 63 31 149 128 54 62 122 141 151 177 30 36 57 67 60
70 12 10 157 175 179 204 33 43 31 150 -2 188 -56 64 -151 86 -211 50z"/>
<path d="M2245 7400 c-188 -14 -374 -75 -585 -191 -222 -123 -464 -366 -577
-579 -13 -25 -28 -52 -33 -60 -74 -123 -137 -348 -161 -580 -10 -106 1 -310
22 -384 5 -17 9 -44 9 -60 0 -72 116 -366 181 -458 11 -14 19 -29 19 -33 0
-33 296 -355 326 -355 7 0 14 -4 16 -10 5 -17 139 -99 243 -150 106 -52 216
-91 303 -109 98 -20 92 -7 92 -215 0 -176 26 -472 50 -571 5 -22 12 -56 15
-75 8 -44 31 -129 56 -201 10 -31 19 -62 19 -69 0 -8 8 -32 19 -54 10 -23 30
-70 45 -106 76 -182 189 -363 319 -515 296 -344 701 -603 1162 -743 216 -66
521 -126 730 -143 335 -27 467 -31 653 -19 103 6 237 15 297 19 120 8 282 32
415 62 47 10 98 19 113 19 16 0 37 5 48 11 11 5 48 16 82 24 34 7 85 21 112
31 104 36 161 58 201 76 22 10 43 18 47 18 12 0 185 85 263 131 44 25 116 71
159 100 43 30 87 61 99 68 107 74 344 310 444 444 40 53 72 98 72 101 0 2 17
31 38 63 68 104 202 390 202 431 0 10 4 22 9 28 12 12 53 168 80 304 30 149
43 293 48 538 l5 214 33 14 c18 7 53 16 77 20 23 4 48 10 53 14 6 4 28 13 50
19 91 27 214 86 318 152 224 141 416 353 524 580 98 206 129 320 153 562 19
189 -20 467 -92 657 -144 382 -420 674 -811 859 -48 22 -93 41 -101 41 -7 0
-35 8 -62 19 -27 10 -92 29 -144 41 -84 20 -119 23 -325 22 -212 0 -238 -2
-330 -25 -55 -14 -131 -37 -170 -52 -38 -15 -84 -32 -101 -39 -18 -6 -38 -16
-45 -22 -8 -6 -27 -18 -44 -26 -79 -40 -121 -67 -205 -134 -69 -54 -225 -212
-255 -257 -21 -32 -26 -33 -84 -6 -25 12 -64 29 -86 40 -183 84 -514 183 -705
209 -41 6 -91 15 -110 20 -50 13 -318 30 -470 30 -159 0 -363 -16 -450 -35
-36 -8 -87 -17 -115 -20 -48 -7 -178 -36 -240 -55 -84 -26 -222 -71 -240 -79
-11 -4 -47 -19 -80 -31 -77 -30 -162 -66 -198 -85 -32 -17 -67 -20 -67 -6 0
16 -211 230 -274 279 -96 74 -124 92 -237 149 -204 102 -346 139 -569 146 -85
2 -200 1 -255 -3z m396 -331 c163 -33 302 -93 433 -184 97 -68 232 -206 299
-307 32 -48 70 -94 85 -104 38 -25 155 -24 185 3 28 24 183 99 302 146 180 70
201 77 214 77 8 0 39 8 70 19 77 26 221 57 376 82 111 17 173 20 418 20 159 0
305 -5 325 -10 21 -5 71 -14 112 -21 178 -28 372 -81 590 -161 65 -24 225
-102 279 -137 48 -30 63 -34 118 -34 78 1 105 20 179 131 65 97 213 245 301
303 74 48 228 128 248 128 6 0 25 6 41 14 61 30 229 56 359 56 202 0 365 -39
550 -131 285 -142 521 -410 616 -699 108 -331 69 -692 -109 -995 -79 -134
-217 -274 -366 -369 -63 -40 -221 -116 -242 -116 -8 0 -28 -7 -44 -15 -16 -8
-55 -19 -87 -24 -230 -37 -274 -55 -306 -124 -15 -30 -16 -58 -7 -238 18 -382
-25 -716 -128 -994 -63 -171 -182 -380 -298 -523 -59 -74 -186 -204 -244 -251
-25 -20 -54 -44 -65 -54 -26 -24 -178 -128 -235 -161 -25 -14 -88 -46 -140
-72 -52 -25 -106 -51 -120 -58 -34 -18 -216 -80 -315 -107 -114 -31 -197 -48
-410 -85 -126 -21 -452 -46 -625 -48 -376 -3 -837 62 -1105 155 -16 6 -50 17
-75 24 -72 21 -256 98 -320 135 -8 5 -40 21 -70 36 -63 31 -172 103 -277 181
-199 148 -392 374 -504 588 -118 228 -190 479 -220 775 -11 113 -7 483 7 597
5 42 2 62 -15 96 -37 77 -60 86 -318 127 -29 4 -67 15 -84 24 -18 9 -41 16
-52 16 -10 0 -36 8 -56 18 -20 10 -58 30 -86 43 -139 67 -301 202 -395 329
-150 203 -229 445 -230 705 0 331 117 613 355 850 175 176 364 280 615 339 96
22 103 23 243 25 95 1 154 -4 228 -20z"/>
<path d="M3464 5185 c-17 -8 -43 -28 -58 -45 l-26 -32 0 -265 c0 -249 1 -268
20 -298 38 -62 51 -65 244 -65 l175 0 36 34 37 35 -4 283 c-4 378 13 353 -253
362 -108 4 -147 2 -171 -9z"/>
<path d="M6174 5171 c-12 -5 -31 -22 -43 -37 -22 -28 -22 -32 -19 -309 l3
-281 25 -31 25 -32 189 0 188 -1 41 40 40 40 -5 253 c-6 260 -10 288 -53 342
-15 18 -29 20 -193 22 -97 1 -187 -2 -198 -6z"/>
<path d="M4935 5079 c-199 -25 -341 -112 -454 -278 -49 -71 -134 -238 -151
-296 -7 -22 -21 -59 -31 -83 -11 -23 -19 -50 -19 -60 0 -9 -7 -37 -15 -60 -9
-24 -20 -69 -25 -100 -5 -32 -16 -93 -25 -137 -12 -59 -16 -144 -17 -325 -1
-238 0 -247 25 -321 63 -188 164 -313 318 -394 86 -45 137 -61 274 -85 236
-42 492 -10 651 81 238 137 348 357 348 699 0 89 -21 335 -34 390 -6 25 -15
70 -20 100 -5 30 -15 71 -21 90 -6 19 -15 51 -19 70 -24 100 -107 282 -186
406 -59 94 -167 193 -265 242 -46 23 -93 42 -104 42 -12 0 -25 4 -30 9 -15 13
-132 19 -200 10z"/>
</g>
</svg>

```

--------------------------------------------------------------------------------
/docs/src/assets/logo-white.svg:
--------------------------------------------------------------------------------

```
<?xml version="1.0" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 20010904//EN"
 "http://www.w3.org/TR/2001/REC-SVG-20010904/DTD/svg10.dtd">
<svg version="1.0" xmlns="http://www.w3.org/2000/svg"
 width="1000.000000pt" height="1000.000000pt" viewBox="0 0 1000.000000 1000.000000"
 preserveAspectRatio="xMidYMid meet">

<g transform="translate(0.000000,1000.000000) scale(0.100000,-0.100000)"
fill="#ffffff" stroke="none">
<path d="M4934 9086 c-40 -14 -62 -33 -80 -69 -22 -42 -21 -994 1 -1037 38
-73 174 -101 243 -50 19 14 43 42 53 62 18 35 19 65 19 510 0 471 0 473 -23
513 -38 69 -133 101 -213 71z"/>
<path d="M3702 8472 c-52 -28 -82 -81 -82 -147 0 -67 8 -80 125 -210 44 -49
107 -121 139 -160 165 -196 233 -268 278 -291 58 -29 66 -30 124 -2 67 31 104
86 104 154 0 60 -14 82 -149 235 -42 47 -95 108 -117 135 -23 27 -52 61 -65
75 -13 14 -57 65 -98 112 -41 47 -89 93 -107 102 -42 20 -111 19 -152 -3z"/>
<path d="M6145 8472 c-29 -18 -136 -133 -235 -252 -53 -64 -190 -222 -230
-265 -37 -41 -70 -108 -70 -142 0 -16 10 -49 23 -73 17 -36 33 -51 79 -73 57
-29 57 -29 107 -12 44 14 63 31 149 128 54 62 122 141 151 177 30 36 57 67 60
70 12 10 157 175 179 204 33 43 31 150 -2 188 -56 64 -151 86 -211 50z"/>
<path d="M2245 7400 c-188 -14 -374 -75 -585 -191 -222 -123 -464 -366 -577
-579 -13 -25 -28 -52 -33 -60 -74 -123 -137 -348 -161 -580 -10 -106 1 -310
22 -384 5 -17 9 -44 9 -60 0 -72 116 -366 181 -458 11 -14 19 -29 19 -33 0
-33 296 -355 326 -355 7 0 14 -4 16 -10 5 -17 139 -99 243 -150 106 -52 216
-91 303 -109 98 -20 92 -7 92 -215 0 -176 26 -472 50 -571 5 -22 12 -56 15
-75 8 -44 31 -129 56 -201 10 -31 19 -62 19 -69 0 -8 8 -32 19 -54 10 -23 30
-70 45 -106 76 -182 189 -363 319 -515 296 -344 701 -603 1162 -743 216 -66
521 -126 730 -143 335 -27 467 -31 653 -19 103 6 237 15 297 19 120 8 282 32
415 62 47 10 98 19 113 19 16 0 37 5 48 11 11 5 48 16 82 24 34 7 85 21 112
31 104 36 161 58 201 76 22 10 43 18 47 18 12 0 185 85 263 131 44 25 116 71
159 100 43 30 87 61 99 68 107 74 344 310 444 444 40 53 72 98 72 101 0 2 17
31 38 63 68 104 202 390 202 431 0 10 4 22 9 28 12 12 53 168 80 304 30 149
43 293 48 538 l5 214 33 14 c18 7 53 16 77 20 23 4 48 10 53 14 6 4 28 13 50
19 91 27 214 86 318 152 224 141 416 353 524 580 98 206 129 320 153 562 19
189 -20 467 -92 657 -144 382 -420 674 -811 859 -48 22 -93 41 -101 41 -7 0
-35 8 -62 19 -27 10 -92 29 -144 41 -84 20 -119 23 -325 22 -212 0 -238 -2
-330 -25 -55 -14 -131 -37 -170 -52 -38 -15 -84 -32 -101 -39 -18 -6 -38 -16
-45 -22 -8 -6 -27 -18 -44 -26 -79 -40 -121 -67 -205 -134 -69 -54 -225 -212
-255 -257 -21 -32 -26 -33 -84 -6 -25 12 -64 29 -86 40 -183 84 -514 183 -705
209 -41 6 -91 15 -110 20 -50 13 -318 30 -470 30 -159 0 -363 -16 -450 -35
-36 -8 -87 -17 -115 -20 -48 -7 -178 -36 -240 -55 -84 -26 -222 -71 -240 -79
-11 -4 -47 -19 -80 -31 -77 -30 -162 -66 -198 -85 -32 -17 -67 -20 -67 -6 0
16 -211 230 -274 279 -96 74 -124 92 -237 149 -204 102 -346 139 -569 146 -85
2 -200 1 -255 -3z m396 -331 c163 -33 302 -93 433 -184 97 -68 232 -206 299
-307 32 -48 70 -94 85 -104 38 -25 155 -24 185 3 28 24 183 99 302 146 180 70
201 77 214 77 8 0 39 8 70 19 77 26 221 57 376 82 111 17 173 20 418 20 159 0
305 -5 325 -10 21 -5 71 -14 112 -21 178 -28 372 -81 590 -161 65 -24 225
-102 279 -137 48 -30 63 -34 118 -34 78 1 105 20 179 131 65 97 213 245 301
303 74 48 228 128 248 128 6 0 25 6 41 14 61 30 229 56 359 56 202 0 365 -39
550 -131 285 -142 521 -410 616 -699 108 -331 69 -692 -109 -995 -79 -134
-217 -274 -366 -369 -63 -40 -221 -116 -242 -116 -8 0 -28 -7 -44 -15 -16 -8
-55 -19 -87 -24 -230 -37 -274 -55 -306 -124 -15 -30 -16 -58 -7 -238 18 -382
-25 -716 -128 -994 -63 -171 -182 -380 -298 -523 -59 -74 -186 -204 -244 -251
-25 -20 -54 -44 -65 -54 -26 -24 -178 -128 -235 -161 -25 -14 -88 -46 -140
-72 -52 -25 -106 -51 -120 -58 -34 -18 -216 -80 -315 -107 -114 -31 -197 -48
-410 -85 -126 -21 -452 -46 -625 -48 -376 -3 -837 62 -1105 155 -16 6 -50 17
-75 24 -72 21 -256 98 -320 135 -8 5 -40 21 -70 36 -63 31 -172 103 -277 181
-199 148 -392 374 -504 588 -118 228 -190 479 -220 775 -11 113 -7 483 7 597
5 42 2 62 -15 96 -37 77 -60 86 -318 127 -29 4 -67 15 -84 24 -18 9 -41 16
-52 16 -10 0 -36 8 -56 18 -20 10 -58 30 -86 43 -139 67 -301 202 -395 329
-150 203 -229 445 -230 705 0 331 117 613 355 850 175 176 364 280 615 339 96
22 103 23 243 25 95 1 154 -4 228 -20z"/>
<path d="M3464 5185 c-17 -8 -43 -28 -58 -45 l-26 -32 0 -265 c0 -249 1 -268
20 -298 38 -62 51 -65 244 -65 l175 0 36 34 37 35 -4 283 c-4 378 13 353 -253
362 -108 4 -147 2 -171 -9z"/>
<path d="M6174 5171 c-12 -5 -31 -22 -43 -37 -22 -28 -22 -32 -19 -309 l3
-281 25 -31 25 -32 189 0 188 -1 41 40 40 40 -5 253 c-6 260 -10 288 -53 342
-15 18 -29 20 -193 22 -97 1 -187 -2 -198 -6z"/>
<path d="M4935 5079 c-199 -25 -341 -112 -454 -278 -49 -71 -134 -238 -151
-296 -7 -22 -21 -59 -31 -83 -11 -23 -19 -50 -19 -60 0 -9 -7 -37 -15 -60 -9
-24 -20 -69 -25 -100 -5 -32 -16 -93 -25 -137 -12 -59 -16 -144 -17 -325 -1
-238 0 -247 25 -321 63 -188 164 -313 318 -394 86 -45 137 -61 274 -85 236
-42 492 -10 651 81 238 137 348 357 348 699 0 89 -21 335 -34 390 -6 25 -15
70 -20 100 -5 30 -15 71 -21 90 -6 19 -15 51 -19 70 -24 100 -107 282 -186
406 -59 94 -167 193 -265 242 -46 23 -93 42 -104 42 -12 0 -25 4 -30 9 -15 13
-132 19 -200 10z"/>
</g>
</svg>

```

--------------------------------------------------------------------------------
/libs/python/agent/agent/adapters/cua_adapter.py:
--------------------------------------------------------------------------------

```python
import os
from typing import Any, AsyncIterator, Iterator

from litellm import acompletion, completion
from litellm.llms.custom_llm import CustomLLM
from litellm.types.utils import GenericStreamingChunk, ModelResponse


class CUAAdapter(CustomLLM):
    def __init__(self, base_url: str | None = None, api_key: str | None = None, **_: Any):
        super().__init__()
        self.base_url = base_url or os.environ.get("CUA_BASE_URL") or "https://inference.cua.ai/v1"
        self.api_key = (
            api_key or os.environ.get("CUA_INFERENCE_API_KEY") or os.environ.get("CUA_API_KEY")
        )

    def _normalize_model(self, model: str) -> str:
        # Accept either "cua/<model>" or raw "<model>"
        return model.split("/", 1)[1] if model and model.startswith("cua/") else model

    def completion(self, *args, **kwargs) -> ModelResponse:
        model = kwargs.get("model", "")
        api_base = kwargs.get("api_base") or self.base_url
        if "anthropic/" in model:
            model = f"anthropic/{self._normalize_model(model)}"
            api_base = api_base.removesuffix("/v1")
        else:
            model = f"openai/{self._normalize_model(model)}"

        params = {
            "model": model,
            "messages": kwargs.get("messages", []),
            "api_base": api_base,
            "api_key": kwargs.get("api_key") or self.api_key,
            "stream": False,
        }

        if "optional_params" in kwargs:
            params.update(kwargs["optional_params"])
            del kwargs["optional_params"]

        if "headers" in kwargs:
            params["headers"] = kwargs["headers"]
            del kwargs["headers"]

        # Track parameters that get dropped (debug print below is intentionally disabled)
        original_keys = set(kwargs.keys())
        used_keys = set(params.keys())  # Only these are extracted from kwargs
        ignored_keys = {
            "litellm_params",
            "client",
            "print_verbose",
            "acompletion",
            "timeout",
            "logging_obj",
            "encoding",
            "custom_prompt_dict",
            "model_response",
            "logger_fn",
        }
        dropped_keys = original_keys - used_keys - ignored_keys
        if dropped_keys:
            dropped_keyvals = {k: kwargs[k] for k in dropped_keys}
            # print(f"CUAAdapter.completion: Dropped parameters: {dropped_keyvals}")

        return completion(**params)  # type: ignore

    async def acompletion(self, *args, **kwargs) -> ModelResponse:
        model = kwargs.get("model", "")
        api_base = kwargs.get("api_base") or self.base_url
        if "anthropic/" in model:
            model = f"anthropic/{self._normalize_model(model)}"
            api_base = api_base.removesuffix("/v1")
        else:
            model = f"openai/{self._normalize_model(model)}"

        params = {
            "model": model,
            "messages": kwargs.get("messages", []),
            "api_base": api_base,
            "api_key": kwargs.get("api_key") or self.api_key,
            "stream": False,
        }

        if "optional_params" in kwargs:
            params.update(kwargs["optional_params"])
            del kwargs["optional_params"]

        if "headers" in kwargs:
            params["headers"] = kwargs["headers"]
            del kwargs["headers"]

        # Track parameters that get dropped (debug print below is intentionally disabled)
        original_keys = set(kwargs.keys())
        used_keys = set(params.keys())  # Only these are extracted from kwargs
        ignored_keys = {
            "litellm_params",
            "client",
            "print_verbose",
            "acompletion",
            "timeout",
            "logging_obj",
            "encoding",
            "custom_prompt_dict",
            "model_response",
            "logger_fn",
        }
        dropped_keys = original_keys - used_keys - ignored_keys
        if dropped_keys:
            dropped_keyvals = {k: kwargs[k] for k in dropped_keys}
            # print(f"CUAAdapter.acompletion: Dropped parameters: {dropped_keyvals}")

        response = await acompletion(**params)  # type: ignore

        return response

    def streaming(self, *args, **kwargs) -> Iterator[GenericStreamingChunk]:
        params = dict(kwargs)
        inner_model = self._normalize_model(params.get("model", ""))
        params.update(
            {
                "model": f"openai/{inner_model}",
                "api_base": self.base_url,
                "api_key": self.api_key,
                "stream": True,
            }
        )
        # Yield chunks directly from LiteLLM's streaming generator
        for chunk in completion(**params):  # type: ignore
            yield chunk  # type: ignore

    async def astreaming(self, *args, **kwargs) -> AsyncIterator[GenericStreamingChunk]:
        params = dict(kwargs)
        inner_model = self._normalize_model(params.get("model", ""))
        params.update(
            {
                "model": f"openai/{inner_model}",
                "api_base": self.base_url,
                "api_key": self.api_key,
                "stream": True,
            }
        )
        stream = await acompletion(**params)  # type: ignore
        async for chunk in stream:  # type: ignore
            yield chunk  # type: ignore

```
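
Since `CUAAdapter` subclasses LiteLLM's `CustomLLM`, it can be registered as a custom provider. A minimal sketch, assuming a `"cua"` provider name and an illustrative model string (the actual registration may live elsewhere in the package):

```python
# Sketch: wiring CUAAdapter into LiteLLM as a custom provider.
# The "cua" provider name and model name below are illustrative assumptions.
import litellm

from agent.adapters.cua_adapter import CUAAdapter

litellm.custom_provider_map = [
    {"provider": "cua", "custom_handler": CUAAdapter()}
]

response = litellm.completion(
    model="cua/my-model",  # _normalize_model accepts "cua/<model>" or the raw name
    messages=[{"role": "user", "content": "Hello"}],
)
print(response.choices[0].message.content)
```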

--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/integrations/hud.mdx:
--------------------------------------------------------------------------------

```markdown
---
title: HUD Evals
description: Use ComputerAgent with HUD for benchmarking and evaluation
---

<Callout>
  A corresponding <a href="https://github.com/trycua/cua/blob/main/notebooks/eval_osworld.ipynb" target="_blank">Jupyter Notebook</a> is available for this documentation.
</Callout>

The HUD integration allows an agent to be benchmarked with the [HUD framework](https://www.hud.so/): the agent controls a computer inside HUD, where tests are run to evaluate the success of each task.

## Installation

First, install the required package:

```bash
pip install "cua-agent[hud]"
## or install hud-python directly
# pip install hud-python==0.4.12
```

## Environment Variables

Before running any evaluations, you’ll need to set up your environment variables for HUD and your model providers:

```bash
# HUD access
export HUD_API_KEY="your_hud_api_key"

# Model provider keys (at least one required)
export OPENAI_API_KEY="your_openai_key"
export ANTHROPIC_API_KEY="your_anthropic_key"
```

## Running a Single Task

You can run a single task from a HUD dataset for quick verification.

### Example

```python
from agent.integrations.hud import run_single_task

await run_single_task(
    dataset="hud-evals/OSWorld-Verified",   # or another HUD dataset
    model="openai/computer-use-preview+openai/gpt-5-nano",  # any supported model string
    task_id=155,  # e.g., reopen last closed tab
)
```

### Parameters

- `task_id` (`int`): Default: `0`
  Index of the task to run from the dataset.

## Running a Full Dataset

To benchmark your agent at scale, you can run an entire dataset (or a subset) in parallel.

### Example

```python
from agent.integrations.hud import run_full_dataset

results = await run_full_dataset(
    dataset="hud-evals/OSWorld-Verified",   # can also pass a Dataset or list[dict]
    model="openai/computer-use-preview",
    split="train[:3]",           # try a few tasks to start
    max_concurrent=20,            # tune to your infra
    max_steps=50                  # safety cap per task
)
```

### Parameters

- `job_name` (`str` | `None`):
  Optional human-readable name for the evaluation job (shows up in HUD UI).
- `max_concurrent` (`int`): Default: `30`
  Number of tasks to run in parallel. Scale this based on your infra.
- `max_steps` (`int`): Default: `50`
  Safety cap on steps per task to prevent infinite loops.
- `split` (`str`): Default: `"train"`
  Dataset split or subset to run. Uses the [Hugging Face split format](https://huggingface.co/docs/datasets/v1.11.0/splits.html), e.g., `"train[:10]"` for the first 10 tasks.

## Additional Parameters

Both single-task and full-dataset runs share a common set of configuration options. These let you fine-tune how the evaluation runs.

- `dataset` (`str` | `Dataset` | `list[dict]`): **Required**
  HUD dataset name (e.g. `"hud-evals/OSWorld-Verified"`), a loaded `Dataset`, or a list of tasks.
- `model` (`str`): Default: `"computer-use-preview"`
  Model string, e.g. `"openai/computer-use-preview+openai/gpt-5-nano"`. Supports composition with `+` (planning + grounding).
- `allowed_tools` (`list[str]`): Default: `["openai_computer"]`
  Restrict which tools the agent may use.
- `tools` (`list[Any]`):
  Extra tool configs to inject.
- `custom_loop` (`Callable`):
  Optional custom agent loop function. If provided, overrides automatic loop selection.
- `only_n_most_recent_images` (`int`): Default: `5` for full dataset, `None` for single task.
  Retain only the last N screenshots in memory.
- `callbacks` (`list[Any]`):
  Hook functions for logging, telemetry, or side effects.
- `verbosity` (`int`):
  Logging level. Set `2` for debugging every call/action.
- `trajectory_dir` (`str` | `dict`):
  Save local copies of trajectories for replay/analysis.
- `max_retries` (`int`): Default: `3`
  Number of retries for failed model/tool calls.
- `screenshot_delay` (`float` | `int`): Default: `0.5`
  Delay (seconds) between screenshots to avoid race conditions.
- `use_prompt_caching` (`bool`): Default: `False`
  Cache repeated prompts to reduce API calls.
- `max_trajectory_budget` (`float` | `dict`):
  Limit on trajectory size/budget (e.g., tokens, steps).
- `telemetry_enabled` (`bool`): Default: `True`
  Whether to send telemetry/traces to HUD.
- `**kwargs` (`any`):
  Any additional keyword arguments are passed through to the agent loop or model provider.
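
For example, a full-dataset run that combines several of these options might look like the following sketch (all values are illustrative):

```python
from agent.integrations.hud import run_full_dataset

results = await run_full_dataset(
    dataset="hud-evals/OSWorld-Verified",
    model="openai/computer-use-preview+openai/gpt-5-nano",
    job_name="osworld-smoke-test",     # appears in the HUD UI
    split="train[:10]",                # first 10 tasks only
    max_concurrent=10,
    max_steps=50,
    only_n_most_recent_images=5,       # keep memory usage bounded
    verbosity=2,                       # log every call/action
    trajectory_dir="./trajectories",   # save trajectories for replay
)
```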

## Available Benchmarks

HUD provides multiple benchmark datasets for realistic evaluation.

1. **[OSWorld-Verified](/agent-sdk/benchmarks/osworld-verified)** – Benchmark on 369+ real-world desktop tasks across Chrome, LibreOffice, GIMP, VS Code, etc.
   _Best for_: evaluating full computer-use agents in realistic environments.
   _Verified variant_: fixes 300+ issues from earlier versions for reliability.

**Coming soon:** SheetBench (spreadsheet automation) and other specialized HUD datasets.

See the [HUD docs](https://docs.hud.so/environment-creation) for more eval environments.

## Tips

- **Debugging:** set `verbosity=2` to see every model call and tool action.
- **Performance:** lower `screenshot_delay` for faster runs; raise it if you see race conditions.
- **Safety:** always set `max_steps` (defaults to 50) to prevent runaway loops.
- **Custom tools:** pass extra `tools=[...]` into the agent config if you need tools beyond `openai_computer`.

```

--------------------------------------------------------------------------------
/docs/content/docs/computer-sdk/cloud-vm-management.mdx:
--------------------------------------------------------------------------------

```markdown
---
title: Cloud Sandbox Management
description: Manage your Cua Cloud sandboxes via Python SDK or HTTP API
---

import { Tab, Tabs } from 'fumadocs-ui/components/tabs';

Using the Cua Cloud API, you can manage your Cua Cloud sandboxes with Python or HTTP (curl).

All examples require a CUA API key. You can obtain one from the [Dashboard](https://www.cua.ai/dashboard/keys).

---

## List Sandboxes

<Tabs items={['Python', 'curl']}>
  <Tab value="Python">

```python
import asyncio
from computer.providers.cloud.provider import CloudProvider

async def main():
    # CloudProvider automatically reads CUA_API_KEY from environment
    # You can also pass api_key explicitly: CloudProvider(api_key="your-api-key")
    # Optional: point to a different API base (requires `import os`)
    # os.environ["CUA_API_BASE"] = "https://api.cua.ai"

    provider = CloudProvider(verbose=False)
    async with provider:
        vms = await provider.list_vms()
        for vm in vms:
            print({
                "name": vm["name"],
                "status": vm["status"],
                "api_url": vm.get("api_url"),
                "vnc_url": vm.get("vnc_url"),
            })

if __name__ == "__main__":
    asyncio.run(main())
```

  </Tab>
  <Tab value="curl">

```bash
curl -H "Authorization: Bearer $CUA_API_KEY" \
     "https://api.cua.ai/v1/vms"
```

Responses:

- 200: Array of minimal sandbox objects with fields `{ name, password, status }`
- 401: Unauthorized (missing/invalid API key)

```json
[
  {
    "name": "s-windows-x4snp46ebf",
    "password": "49b8daa3",
    "status": "running"
  }
]
```

Status values:

- `pending`: Sandbox deployment in progress
- `running`: Sandbox is active and accessible
- `stopped`: Sandbox is stopped but not terminated
- `terminated`: Sandbox has been permanently destroyed
- `failed`: Sandbox deployment or operation failed

  </Tab>
</Tabs>

---

## Start a Sandbox

Provide the sandbox name you want to start.

<Tabs items={["Python", "curl"]}>
  <Tab value="Python">

```python
import asyncio
from computer.providers.cloud.provider import CloudProvider

async def main():
    # CloudProvider automatically reads CUA_API_KEY from environment
    name = "my-vm-name"  # e.g., "m-linux-96lcxd2c2k"

    provider = CloudProvider()
    async with provider:
        resp = await provider.run_vm(name)
        print(resp)  # { "name": name, "status": "starting" }

if __name__ == "__main__":
    asyncio.run(main())
```

  </Tab>
  <Tab value="curl">

```bash
curl -X POST \
     -H "Authorization: Bearer $CUA_API_KEY" \
     "https://api.cua.ai/v1/vms/my-vm-name/start" -i
```

Responses:

- 204: No Content (start accepted)
- 401: Unauthorized (missing/invalid API key)
- 404: Sandbox not found or not owned by the user

```text
HTTP/1.1 204 No Content
```

  </Tab>
</Tabs>

---

## Stop a Sandbox

Stops the sandbox asynchronously.

<Tabs items={["Python", "curl"]}>
  <Tab value="Python">

```python
import asyncio
from computer.providers.cloud.provider import CloudProvider

async def main():
    # CloudProvider automatically reads CUA_API_KEY from environment
    name = "my-vm-name"

    provider = CloudProvider()
    async with provider:
        resp = await provider.stop_vm(name)
        print(resp)  # { "name": name, "status": "stopping" }

if __name__ == "__main__":
    asyncio.run(main())
```

  </Tab>
  <Tab value="curl">

```bash
curl -X POST \
     -H "Authorization: Bearer $CUA_API_KEY" \
     "https://api.cua.ai/v1/vms/my-vm-name/stop"
```

Responses:

- 202: Accepted with `{ "status": "stopping" }`
- 401: Unauthorized (missing/invalid API key)
- 404: Sandbox not found or not owned by the user

```json
{ "status": "stopping" }
```

  </Tab>
</Tabs>

---

## Restart a Sandbox

Restarts the sandbox asynchronously.

<Tabs items={["Python", "curl"]}>
  <Tab value="Python">

```python
import asyncio
from computer.providers.cloud.provider import CloudProvider

async def main():
    # CloudProvider automatically reads CUA_API_KEY from environment
    name = "my-vm-name"

    provider = CloudProvider()
    async with provider:
        resp = await provider.restart_vm(name)
        print(resp)  # { "name": name, "status": "restarting" }

if __name__ == "__main__":
    asyncio.run(main())
```

  </Tab>
  <Tab value="curl">

```bash
curl -X POST \
     -H "Authorization: Bearer $CUA_API_KEY" \
     "https://api.cua.ai/v1/vms/my-vm-name/restart"
```

Responses:

- 202: Accepted with `{ "status": "restarting" }`
- 401: Unauthorized (missing/invalid API key)
- 404: Sandbox not found or not owned by the user

```json
{ "status": "restarting" }
```

  </Tab>
</Tabs>

---

## Query a Sandbox by name

Query the computer-server running on the sandbox. Useful for checking details like status or OS type.

<Tabs items={["Python", "curl"]}>
  <Tab value="Python">

```python
import asyncio
from computer.providers.cloud.provider import CloudProvider

async def main():
    # CloudProvider automatically reads CUA_API_KEY from environment
    name = "my-vm-name"

    provider = CloudProvider()
    async with provider:
        info = await provider.get_vm(name)
        print(info)

if __name__ == "__main__":
    asyncio.run(main())
```

  </Tab>
  <Tab value="curl">

```bash
curl "https://my-vm-name.containers.cloud.cua.ai:8443/status"
```

Responses:

- 200: Server available

```json
{ "status": "ok", "os_type": "linux", "features": ["agent"] }
```

  </Tab>
</Tabs>

```

--------------------------------------------------------------------------------
/docs/content/docs/agent-sdk/message-format.mdx:
--------------------------------------------------------------------------------

```markdown
---
title: Message Format
---

This page documents the Python message and response schema used by the Agent SDK.
It mirrors the structure shown in Chat History and provides precise type definitions you can target in your own code.

All examples below use Python type hints with `TypedDict` and `Literal` from the standard `typing` module.

## Response

The agent yields response chunks as an async generator of objects with `output` and `usage`.

```python
from typing import List, TypedDict

class Usage(TypedDict, total=False):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
    response_cost: float  # USD cost if available

class AgentResponse(TypedDict):
    output: List["AgentMessage"]
    usage: Usage
```
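
In practice these chunks come from `ComputerAgent.run()`, which yields them as an async generator. A minimal consumption sketch (the model string and task are illustrative):

```python
from agent import ComputerAgent

agent = ComputerAgent(model="anthropic/claude-sonnet-4-5-20250929")
messages = [{"role": "user", "content": "Open the browser"}]

async for chunk in agent.run(messages):
    for message in chunk["output"]:
        print(message["type"])           # e.g. "reasoning", "computer_call", ...
    usage = chunk["usage"]
    if "response_cost" in usage:
        print(f"Cost so far: ${usage['response_cost']:.4f}")
```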

## Messages

Agent messages represent the state of the conversation and the agent's actions.

```python
from typing import List, Literal, Optional, TypedDict, Union

# Union of all message variants
AgentMessage = Union[
    "UserMessage",
    "AssistantMessage",
    "ReasoningMessage",
    "ComputerCallMessage",
    "ComputerCallOutputMessage",
    "FunctionCallMessage",
    "FunctionCallOutputMessage",
]

# Input message (role: user/system/developer)
class UserMessage(TypedDict, total=False):
    type: Literal["message"]  # optional for user input
    role: Literal["user", "system", "developer"]
    content: Union[str, List["InputContent"]]

# Output message (assistant text)
class AssistantMessage(TypedDict):
    type: Literal["message"]
    role: Literal["assistant"]
    content: List["OutputContent"]

# Output reasoning/thinking message
class ReasoningMessage(TypedDict):
    type: Literal["reasoning"]
    summary: List["SummaryContent"]

# Output computer action call (agent intends to act)
class ComputerCallMessage(TypedDict):
    type: Literal["computer_call"]
    call_id: str
    status: Literal["completed", "failed", "pending"]
    action: "ComputerAction"

# Output computer action result (always a screenshot)
class ComputerCallOutputMessage(TypedDict):
    type: Literal["computer_call_output"]
    call_id: str
    output: "ComputerResultContent"

# Output function call (agent calls a Python tool)
class FunctionCallMessage(TypedDict):
    type: Literal["function_call"]
    call_id: str
    status: Literal["completed", "failed", "pending"]
    name: str
    arguments: str  # JSON-serialized kwargs

# Output function call result (text)
class FunctionCallOutputMessage(TypedDict):
    type: Literal["function_call_output"]
    call_id: str
    output: str
```
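
As a concrete illustration, a single turn in which the agent clicks a button might serialize to an `output` list like this (coordinates, IDs, and text are made up):

```python
output = [
    {"type": "reasoning",
     "summary": [{"type": "summary_text", "text": "I should click the Submit button."}]},
    {"type": "computer_call", "call_id": "call_1", "status": "completed",
     "action": {"type": "click", "button": "left", "x": 512, "y": 384}},
    {"type": "computer_call_output", "call_id": "call_1",
     "output": {"type": "computer_screenshot",
                "image_url": "data:image/png;base64,...."}},
    {"type": "message", "role": "assistant",
     "content": [{"type": "output_text", "text": "The form has been submitted."}]},
]
```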

## Message Content

These content items appear inside `content` arrays for the message types above.

```python
# Input content kinds
class InputContent(TypedDict):
    type: Literal["input_image", "input_text"]
    text: Optional[str]
    image_url: Optional[str]  # e.g., data URL

# Assistant output content
class OutputContent(TypedDict):
    type: Literal["output_text"]
    text: str

# Reasoning/summary output content
class SummaryContent(TypedDict):
    type: Literal["summary_text"]
    text: str

# Computer call outputs (screenshots)
class ComputerResultContent(TypedDict):
    type: Literal["computer_screenshot", "input_image"]
    image_url: str  # data URL (e.g., "data:image/png;base64,....")
```

## Actions

Computer actions represent concrete operations the agent will perform on the computer.

Two broad families exist depending on the provider: OpenAI-style and Anthropic-style.

```python
# Union of all supported computer actions
ComputerAction = Union[
    "ClickAction",
    "DoubleClickAction",
    "DragAction",
    "KeyPressAction",
    "MoveAction",
    "ScreenshotAction",
    "ScrollAction",
    "TypeAction",
    "WaitAction",
    # Anthropic variants
    "LeftMouseDownAction",
    "LeftMouseUpAction",
]

# OpenAI Computer Actions
class ClickAction(TypedDict):
    type: Literal["click"]
    button: Literal["left", "right", "wheel", "back", "forward"]
    x: int
    y: int

class DoubleClickAction(TypedDict, total=False):
    type: Literal["double_click"]
    button: Literal["left", "right", "wheel", "back", "forward"]
    x: int
    y: int

class DragAction(TypedDict, total=False):
    type: Literal["drag"]
    button: Literal["left", "right", "wheel", "back", "forward"]
    path: List[tuple[int, int]]  # [(x1, y1), (x2, y2), ...]

class KeyPressAction(TypedDict):
    type: Literal["keypress"]
    keys: List[str]  # e.g., ["ctrl", "a"]

class MoveAction(TypedDict):
    type: Literal["move"]
    x: int
    y: int

class ScreenshotAction(TypedDict):
    type: Literal["screenshot"]

class ScrollAction(TypedDict):
    type: Literal["scroll"]
    scroll_x: int
    scroll_y: int
    x: int
    y: int

class TypeAction(TypedDict):
    type: Literal["type"]
    text: str

class WaitAction(TypedDict):
    type: Literal["wait"]

# Anthropic Computer Actions
class LeftMouseDownAction(TypedDict):
    type: Literal["left_mouse_down"]
    x: int
    y: int

class LeftMouseUpAction(TypedDict):
    type: Literal["left_mouse_up"]
    x: int
    y: int
```
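
A handler that consumes these actions can dispatch on the shared `type` tag. A minimal sketch against a hypothetical async `interface` object (the method names mirror the computer interface used elsewhere in this repo):

```python
async def execute(action: "ComputerAction", interface) -> None:
    """Dispatch a computer action to an interface (sketch, not exhaustive)."""
    kind = action["type"]
    if kind == "click":
        if action["button"] == "right":
            await interface.right_click(action["x"], action["y"])
        else:
            await interface.left_click(action["x"], action["y"])
    elif kind == "type":
        await interface.type_text(action["text"])
    elif kind == "keypress":
        await interface.hotkey(*action["keys"])
    elif kind == "screenshot":
        await interface.screenshot()
    else:
        raise NotImplementedError(f"Unsupported action type: {kind}")
```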

## Notes

- The agent runtime may add provider-specific fields when available (e.g., usage cost). Unknown fields should be ignored for forward compatibility.
- Computer action outputs are screenshots as data URLs. For security and storage, some serializers may redact or omit large fields in persisted metadata.
- The message flow typically alternates between reasoning, actions, screenshots, and concluding assistant text. See [Chat History](./chat-history) for a step-by-step example.

```

--------------------------------------------------------------------------------
/docs/src/components/footer.tsx:
--------------------------------------------------------------------------------

```typescript
export function Footer() {
  return (
    <footer className="mt-auto border-t border-fd-border py-8">
      <div className="container mx-auto px-4">
        <div className="grid grid-cols-1 md:grid-cols-4 gap-8 mb-6">
          {/* Product Links */}
          <div>
            <h3 className="font-semibold text-sm mb-3 text-fd-foreground">Product</h3>
            <ul className="space-y-2">
              <li>
                <a
                  href="https://cua.ai"
                  className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
                >
                  Home
                </a>
              </li>
              <li>
                <a
                  href="https://cua.ai/pricing"
                  className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
                >
                  Pricing
                </a>
              </li>
              <li>
                <a
                  href="https://cua.ai/#features"
                  className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
                >
                  Features
                </a>
              </li>
            </ul>
          </div>

          {/* Documentation Links */}
          <div>
            <h3 className="font-semibold text-sm mb-3 text-fd-foreground">Documentation</h3>
            <ul className="space-y-2">
              <li>
                <a
                  href="/docs"
                  className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
                >
                  Getting Started
                </a>
              </li>
              <li>
                <a
                  href="/docs/agent-sdk/agent-loops"
                  className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
                >
                  Agent Loops
                </a>
              </li>
              <li>
                <a
                  href="/docs/get-started/quickstart"
                  className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
                >
                  Quick Start
                </a>
              </li>
            </ul>
          </div>

          {/* Resources Links */}
          <div>
            <h3 className="font-semibold text-sm mb-3 text-fd-foreground">Resources</h3>
            <ul className="space-y-2">
              <li>
                <a
                  href="https://cua.ai/blog"
                  className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
                >
                  Blog
                </a>
              </li>
              <li>
                <a
                  href="https://github.com/trycua/cua"
                  target="_blank"
                  rel="noopener noreferrer"
                  className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
                >
                  GitHub
                </a>
              </li>
              <li>
                <a
                  href="https://discord.com/invite/mVnXXpdE85"
                  target="_blank"
                  rel="noopener noreferrer"
                  className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
                >
                  Discord Community
                </a>
              </li>
            </ul>
          </div>

          {/* Company Links */}
          <div>
            <h3 className="font-semibold text-sm mb-3 text-fd-foreground">Company</h3>
            <ul className="space-y-2">
              <li>
                <a
                  href="https://cua.ai/about"
                  className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
                >
                  About
                </a>
              </li>
              <li>
                <a
                  href="mailto:[email protected]"
                  className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
                >
                  Contact
                </a>
              </li>
              <li>
                <a
                  href="https://cua.ai/cookie-policy"
                  target="_blank"
                  rel="noopener noreferrer"
                  className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
                >
                  Cookie Policy
                </a>
              </li>
            </ul>
          </div>
        </div>

        {/* Bottom Bar */}
        <div className="pt-6 border-t border-fd-border flex flex-col md:flex-row justify-between items-center gap-4">
          <p className="text-sm text-fd-muted-foreground">
            © {new Date().getFullYear()} Cua. All rights reserved.
          </p>
          <div className="flex gap-4">
            <a
              href="https://cua.ai/privacy-policy"
              className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
            >
              Privacy Policy
            </a>
            <a
              href="https://cua.ai/cookie-policy"
              className="text-sm text-fd-muted-foreground hover:text-fd-foreground transition-colors"
            >
              Cookie Policy
            </a>
          </div>
        </div>
      </div>
    </footer>
  );
}
```

--------------------------------------------------------------------------------
/libs/typescript/agent/src/client.ts:
--------------------------------------------------------------------------------

```typescript
import { Peer } from 'peerjs';
import type { AgentRequest, AgentResponse, ConnectionType, AgentClientOptions } from './types';
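// Usage sketch (illustrative; request fields follow AgentRequest in ./types):
//   const client = new AgentClient('https://localhost:8000', { apiKey: '...' });
//   const response = await client.responses.create({ model: '...', input: 'Open the browser' });
//   await client.disconnect();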

export class AgentClient {
  private url: string;
  private connectionType: ConnectionType;
  private options: AgentClientOptions;
  private peer?: Peer;
  private connection?: any;

  constructor(url: string, options: AgentClientOptions = {}) {
    this.url = url;
    this.options = {
      timeout: 30000,
      retries: 3,
      ...options,
    };

    // Determine connection type from URL
    if (url.startsWith('http://') || url.startsWith('https://')) {
      this.connectionType = url.startsWith('https://') ? 'https' : 'http';
    } else if (url.startsWith('peer://')) {
      this.connectionType = 'peer';
    } else {
      throw new Error('Invalid URL format. Must start with http://, https://, or peer://');
    }
  }

  // Main responses API matching the desired usage pattern
  public responses = {
    create: async (request: AgentRequest): Promise<AgentResponse> => {
      return this.sendRequest(request);
    },
  };

  private async sendRequest(request: AgentRequest): Promise<AgentResponse> {
    switch (this.connectionType) {
      case 'http':
      case 'https':
        return this.sendHttpRequest(request);
      case 'peer':
        return this.sendPeerRequest(request);
      default:
        throw new Error(`Unsupported connection type: ${this.connectionType}`);
    }
  }

  private async sendHttpRequest(request: AgentRequest): Promise<AgentResponse> {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), this.options.timeout);

    try {
      const headers: Record<string, string> = {
        'Content-Type': 'application/json',
      };
      if (this.options.apiKey) {
        headers['X-API-Key'] = this.options.apiKey;
      }

      const response = await fetch(`${this.url}/responses`, {
        method: 'POST',
        headers,
        body: JSON.stringify(request),
        signal: controller.signal,
      });

      clearTimeout(timeoutId);

      if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`);
      }

      const data = await response.json();
      return data as AgentResponse;
    } catch (error) {
      clearTimeout(timeoutId);
      if (error instanceof Error) {
        throw new Error(`Failed to send HTTP request: ${error.message}`);
      }
      throw error;
    }
  }

  private async sendPeerRequest(request: AgentRequest): Promise<AgentResponse> {
    // Extract peer ID from peer:// URL
    const peerId = this.url.replace('peer://', '');

    if (!this.peer) {
      // Initialize peer connection with default options as requested
      this.peer = new Peer();

      return new Promise<AgentResponse>((resolve, reject) => {
        const timeout = setTimeout(() => {
          reject(new Error('Peer connection timeout'));
        }, this.options.timeout);

        this.peer!.on('open', () => {
          // Connect to the target peer
          this.connection = this.peer!.connect(peerId);

          this.connection.on('open', () => {
            // Send the request
            this.connection!.send(JSON.stringify(request));
          });

          // Resolve on the first response, then detach so this listener does
          // not linger and consume data meant for later requests
          const handleData = (data: any) => {
            clearTimeout(timeout);
            this.connection!.off('data', handleData);
            try {
              const response = typeof data === 'string' ? JSON.parse(data) : data;
              resolve(response as AgentResponse);
            } catch (error) {
              reject(new Error('Failed to parse peer response'));
            }
          };
          this.connection.on('data', handleData);

          this.connection.on('error', (error: any) => {
            clearTimeout(timeout);
            reject(new Error(`Peer connection error: ${error}`));
          });
        });

        this.peer!.on('error', (error: any) => {
          clearTimeout(timeout);
          reject(new Error(`Peer error: ${error}`));
        });
      });
    } else {
      // Reuse existing connection
      return new Promise<AgentResponse>((resolve, reject) => {
        const timeout = setTimeout(() => {
          reject(new Error('Peer request timeout'));
        }, this.options.timeout);

        if (this.connection && this.connection.open) {
          this.connection.send(JSON.stringify(request));

          const handleData = (data: any) => {
            clearTimeout(timeout);
            this.connection!.off('data', handleData);
            try {
              const response = typeof data === 'string' ? JSON.parse(data) : data;
              resolve(response as AgentResponse);
            } catch (error) {
              reject(new Error('Failed to parse peer response'));
            }
          };

          this.connection.on('data', handleData);
        } else {
          clearTimeout(timeout);
          reject(new Error('Peer connection not available'));
        }
      });
    }
  }

  // Health check method
  async health(): Promise<{ status: string }> {
    if (this.connectionType === 'peer') {
      return { status: this.peer?.open ? 'connected' : 'disconnected' };
    }

    try {
      const response = await fetch(`${this.url}/health`);
      if (response.ok) {
        return { status: 'healthy' };
      }
      return { status: 'unhealthy' };
    } catch {
      return { status: 'unreachable' };
    }
  }

  // Clean up resources
  async disconnect(): Promise<void> {
    if (this.connection) {
      this.connection.close();
      this.connection = undefined;
    }
    if (this.peer) {
      this.peer.destroy();
      this.peer = undefined;
    }
  }
}

```

--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/gta1.py:
--------------------------------------------------------------------------------

```python
"""
GTA1 agent loop implementation for click prediction using litellm.acompletion
Paper: https://arxiv.org/pdf/2507.05791
Code: https://github.com/Yan98/GTA1
"""

import base64
import math
import re
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple

import litellm
from PIL import Image

from ..decorators import register_agent
from ..loops.base import AsyncAgentConfig
from ..types import AgentCapability

SYSTEM_PROMPT = """
You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. The image resolution is height {height} and width {width}. For elements with area, return the center point.

Output the coordinate pair exactly:
(x,y)
""".strip()


def extract_coordinates(raw_string: str) -> Tuple[float, float]:
    """Extract coordinates from model output."""
    try:
        matches = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", raw_string)
        return tuple(map(float, matches[0]))  # type: ignore
    except (IndexError, ValueError):
        # No parseable "(x,y)" pair in the output; fall back to the origin
        return (0.0, 0.0)


def smart_resize(
    height: int, width: int, factor: int = 28, min_pixels: int = 3136, max_pixels: int = 8847360
) -> Tuple[int, int]:
    """Smart resize function similar to qwen_vl_utils."""
    # Calculate the total pixels
    total_pixels = height * width

    # If already within bounds, return original dimensions
    if min_pixels <= total_pixels <= max_pixels:
        # Round to nearest factor
        new_height = (height // factor) * factor
        new_width = (width // factor) * factor
        return new_height, new_width

    # Calculate scaling factor
    if total_pixels > max_pixels:
        scale = (max_pixels / total_pixels) ** 0.5
    else:
        scale = (min_pixels / total_pixels) ** 0.5

    # Apply scaling
    new_height = int(height * scale)
    new_width = int(width * scale)

    # Round to nearest factor
    new_height = (new_height // factor) * factor
    new_width = (new_width // factor) * factor

    # Ensure minimum size
    new_height = max(new_height, factor)
    new_width = max(new_width, factor)

    return new_height, new_width
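
# Worked example: a 1920x1080 screenshot (2,073,600 px) is within
# [3136, 8847360], so it is only snapped down to multiples of 28:
# height 1080 -> 1064, width 1920 -> 1904.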


@register_agent(models=r".*GTA1.*")
class GTA1Config(AsyncAgentConfig):
    """GTA1 agent configuration implementing AsyncAgentConfig protocol for click prediction."""

    def __init__(self):
        self.current_model = None
        self.last_screenshot_b64 = None

    async def predict_step(
        self,
        messages: List[Dict[str, Any]],
        model: str,
        tools: Optional[List[Dict[str, Any]]] = None,
        max_retries: Optional[int] = None,
        stream: bool = False,
        computer_handler=None,
        _on_api_start=None,
        _on_api_end=None,
        _on_usage=None,
        _on_screenshot=None,
        **kwargs,
    ) -> Dict[str, Any]:
        # GTA1 only supports click prediction (see get_capabilities), so
        # full-step prediction is unsupported.
        raise NotImplementedError()

    async def predict_click(
        self, model: str, image_b64: str, instruction: str, **kwargs
    ) -> Optional[Tuple[float, float]]:
        """
        Predict click coordinates using GTA1 model via litellm.acompletion.

        Args:
            model: The GTA1 model name
            image_b64: Base64 encoded image
            instruction: Instruction for where to click

        Returns:
            Tuple of (x, y) coordinates or None if prediction fails
        """
        # Decode base64 image
        image_data = base64.b64decode(image_b64)
        image = Image.open(BytesIO(image_data))
        width, height = image.width, image.height

        # Smart resize the image (similar to qwen_vl_utils)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=28,  # Default factor for Qwen models
            min_pixels=3136,
            max_pixels=4096 * 2160,
        )
        resized_image = image.resize((resized_width, resized_height))
        scale_x, scale_y = width / resized_width, height / resized_height

        # Convert resized image back to base64
        buffered = BytesIO()
        resized_image.save(buffered, format="PNG")
        resized_image_b64 = base64.b64encode(buffered.getvalue()).decode()

        # Prepare system and user messages
        system_message = {
            "role": "system",
            "content": SYSTEM_PROMPT.format(height=resized_height, width=resized_width),
        }

        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{resized_image_b64}"},
                },
                {"type": "text", "text": instruction},
            ],
        }

        # Prepare API call kwargs
        api_kwargs = {
            "model": model,
            "messages": [system_message, user_message],
            "max_tokens": 2056,
            "temperature": 0.0,
            **kwargs,
        }

        # Use liteLLM acompletion
        response = await litellm.acompletion(**api_kwargs)

        # Extract response text
        output_text = response.choices[0].message.content  # type: ignore

        # Extract and rescale coordinates
        pred_x, pred_y = extract_coordinates(output_text)  # type: ignore
        pred_x *= scale_x
        pred_y *= scale_y

        return (math.floor(pred_x), math.floor(pred_y))

    def get_capabilities(self) -> List[AgentCapability]:
        """Return the capabilities supported by this agent."""
        return ["click"]

```
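
For reference, a minimal sketch of calling this loop's grounding entry point directly. The import path, model identifier, and screenshot file are assumptions for illustration; in practice the agent runtime selects this config via the `@register_agent` pattern above:

```python
import asyncio
import base64

from agent.loops.gta1 import GTA1Config  # import path assumed from this file's location

async def main() -> None:
    with open("screenshot.png", "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode()

    config = GTA1Config()
    coords = await config.predict_click(
        model="huggingface-local/HelloKKMe/GTA1-7B",  # illustrative model identifier
        image_b64=image_b64,
        instruction="Click the Submit button",
    )
    print(coords)  # e.g., (412, 380)

asyncio.run(main())
```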

--------------------------------------------------------------------------------
/libs/python/agent/benchmarks/models/gta1.py:
--------------------------------------------------------------------------------

```python
"""
GTA1 model implementation for benchmarking.
"""

import gc
import re
from typing import Optional, Tuple

import torch
from PIL import Image
from qwen_vl_utils import process_vision_info, smart_resize
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

from .base import ModelProtocol


class GTA1Model:
    """Ground truth GTA1 model implementation."""

    def __init__(self, model_path: str = "HelloKKMe/GTA1-7B"):
        self.model_path = model_path
        self.model = None
        self.processor = None
        self.max_new_tokens = 32

        self.system_prompt = """
You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. The image resolution is height {height} and width {width}. For elements with area, return the center point.

Output the coordinate pair exactly:
(x,y)
""".strip()

    @property
    def model_name(self) -> str:
        """Return the name of the model."""
        return f"GTA1-{self.model_path.split('/')[-1]}"

    async def load_model(self) -> None:
        """Load the model into memory."""
        if self.model is None:
            print(f"Loading GTA1 model: {self.model_path}")
            self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
                self.model_path, torch_dtype=torch.bfloat16, device_map="auto"
            )
            self.processor = AutoProcessor.from_pretrained(
                self.model_path, min_pixels=3136, max_pixels=4096 * 2160
            )
            print("GTA1 model loaded successfully")

    async def unload_model(self) -> None:
        """Unload the model from memory."""
        if self.model is not None:
            print("Unloading GTA1 model from GPU...")
            del self.model
            del self.processor
            self.model = None
            self.processor = None
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            print("GTA1 model unloaded")

    def _extract_coordinates(self, raw_string: str) -> Tuple[int, int]:
        """Extract coordinates from model output."""
        try:
            matches = re.findall(r"\((-?\d*\.?\d+),\s*(-?\d*\.?\d+)\)", raw_string)
            return tuple(map(int, map(float, matches[0])))  # type: ignore
        except (IndexError, ValueError):
            # No parseable "(x,y)" pair in the output; fall back to the origin
            return (0, 0)

    async def predict_click(
        self, image: Image.Image, instruction: str
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates for the given image and instruction.

        Args:
            image: PIL Image to analyze
            instruction: Text instruction describing what to click

        Returns:
            Tuple of (x, y) coordinates or None if prediction fails
        """
        if self.model is None or self.processor is None:
            await self.load_model()

        assert self.processor is not None
        assert self.model is not None

        try:
            width, height = image.width, image.height

            # Resize image according to processor requirements
            resized_height, resized_width = smart_resize(
                image.height,
                image.width,
                factor=self.processor.image_processor.patch_size
                * self.processor.image_processor.merge_size,
                min_pixels=self.processor.image_processor.min_pixels,
                max_pixels=self.processor.image_processor.max_pixels,
            )
            resized_image = image.resize((resized_width, resized_height))
            scale_x, scale_y = width / resized_width, height / resized_height

            # Prepare messages
            system_message = {
                "role": "system",
                "content": self.system_prompt.format(height=resized_height, width=resized_width),
            }

            user_message = {
                "role": "user",
                "content": [
                    {"type": "image", "image": resized_image},
                    {"type": "text", "text": instruction},
                ],
            }

            # Process inputs
            image_inputs, video_inputs = process_vision_info([system_message, user_message])  # type: ignore
            text = self.processor.apply_chat_template(
                [system_message, user_message], tokenize=False, add_generation_prompt=True
            )
            inputs = self.processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt",
            )
            inputs = inputs.to(self.model.device)

            # Generate prediction
            output_ids = self.model.generate(
                **inputs,
                max_new_tokens=self.max_new_tokens,
                do_sample=False,
                temperature=1.0,
                use_cache=True,
            )
            generated_ids = [
                output_ids[len(input_ids) :]
                for input_ids, output_ids in zip(inputs.input_ids, output_ids)
            ]
            output_text = self.processor.batch_decode(
                generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )[0]

            # Extract and rescale coordinates
            pred_x, pred_y = self._extract_coordinates(output_text)
            pred_x = int(pred_x * scale_x)
            pred_y = int(pred_y * scale_y)

            return (pred_x, pred_y)

        except Exception as e:
            print(f"Error in GTA1 prediction: {e}")
            return None

```
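
A minimal benchmark-side usage sketch under the same assumptions (the import path and screenshot file are illustrative):

```python
import asyncio

from PIL import Image

from models.gta1 import GTA1Model  # import path assumed from the benchmarks layout

async def main() -> None:
    model = GTA1Model("HelloKKMe/GTA1-7B")
    await model.load_model()
    try:
        image = Image.open("screenshot.png")
        coords = await model.predict_click(image, "Click the Submit button")
        print(coords)
    finally:
        await model.unload_model()

asyncio.run(main())
```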