trycua/cua # codebase.md

This is page 19 of 28. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .cursorignore
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── scripts
│   │   ├── get_pyproject_version.py
│   │   └── tests
│   │       ├── __init__.py
│   │       ├── README.md
│   │       └── test_get_pyproject_version.py
│   └── workflows
│       ├── bump-version.yml
│       ├── ci-lume.yml
│       ├── docker-publish-cua-linux.yml
│       ├── docker-publish-cua-windows.yml
│       ├── docker-publish-kasm.yml
│       ├── docker-publish-xfce.yml
│       ├── docker-reusable-publish.yml
│       ├── link-check.yml
│       ├── lint.yml
│       ├── npm-publish-cli.yml
│       ├── npm-publish-computer.yml
│       ├── npm-publish-core.yml
│       ├── publish-lume.yml
│       ├── pypi-publish-agent.yml
│       ├── pypi-publish-computer-server.yml
│       ├── pypi-publish-computer.yml
│       ├── pypi-publish-core.yml
│       ├── pypi-publish-mcp-server.yml
│       ├── pypi-publish-som.yml
│       ├── pypi-reusable-publish.yml
│       ├── python-tests.yml
│       ├── test-cua-models.yml
│       └── test-validation-script.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .prettierignore
├── .prettierrc.yaml
├── .vscode
│   ├── docs.code-workspace
│   ├── extensions.json
│   ├── launch.json
│   ├── libs-ts.code-workspace
│   ├── lume.code-workspace
│   ├── lumier.code-workspace
│   ├── py.code-workspace
│   └── settings.json
├── blog
│   ├── app-use.md
│   ├── assets
│   │   ├── composite-agents.png
│   │   ├── docker-ubuntu-support.png
│   │   ├── hack-booth.png
│   │   ├── hack-closing-ceremony.jpg
│   │   ├── hack-cua-ollama-hud.jpeg
│   │   ├── hack-leaderboard.png
│   │   ├── hack-the-north.png
│   │   ├── hack-winners.jpeg
│   │   ├── hack-workshop.jpeg
│   │   ├── hud-agent-evals.png
│   │   └── trajectory-viewer.jpeg
│   ├── bringing-computer-use-to-the-web.md
│   ├── build-your-own-operator-on-macos-1.md
│   ├── build-your-own-operator-on-macos-2.md
│   ├── cloud-windows-ga-macos-preview.md
│   ├── composite-agents.md
│   ├── computer-use-agents-for-growth-hacking.md
│   ├── cua-hackathon.md
│   ├── cua-playground-preview.md
│   ├── cua-vlm-router.md
│   ├── hack-the-north.md
│   ├── hud-agent-evals.md
│   ├── human-in-the-loop.md
│   ├── introducing-cua-cli.md
│   ├── introducing-cua-cloud-containers.md
│   ├── lume-to-containerization.md
│   ├── neurips-2025-cua-papers.md
│   ├── sandboxed-python-execution.md
│   ├── training-computer-use-models-trajectories-1.md
│   ├── trajectory-viewer.md
│   ├── ubuntu-docker-support.md
│   └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│   ├── .env.example
│   ├── .gitignore
│   ├── content
│   │   └── docs
│   │       ├── agent-sdk
│   │       │   ├── agent-loops.mdx
│   │       │   ├── benchmarks
│   │       │   │   ├── index.mdx
│   │       │   │   ├── interactive.mdx
│   │       │   │   ├── introduction.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── osworld-verified.mdx
│   │       │   │   ├── screenspot-pro.mdx
│   │       │   │   └── screenspot-v2.mdx
│   │       │   ├── callbacks
│   │       │   │   ├── agent-lifecycle.mdx
│   │       │   │   ├── cost-saving.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── logging.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── pii-anonymization.mdx
│   │       │   │   └── trajectories.mdx
│   │       │   ├── chat-history.mdx
│   │       │   ├── custom-tools.mdx
│   │       │   ├── customizing-computeragent.mdx
│   │       │   ├── integrations
│   │       │   │   ├── hud.mdx
│   │       │   │   ├── meta.json
│   │       │   │   └── observability.mdx
│   │       │   ├── mcp-server
│   │       │   │   ├── client-integrations.mdx
│   │       │   │   ├── configuration.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── llm-integrations.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── tools.mdx
│   │       │   │   └── usage.mdx
│   │       │   ├── message-format.mdx
│   │       │   ├── meta.json
│   │       │   ├── migration-guide.mdx
│   │       │   ├── prompt-caching.mdx
│   │       │   ├── supported-agents
│   │       │   │   ├── composed-agents.mdx
│   │       │   │   ├── computer-use-agents.mdx
│   │       │   │   ├── grounding-models.mdx
│   │       │   │   ├── human-in-the-loop.mdx
│   │       │   │   └── meta.json
│   │       │   ├── supported-model-providers
│   │       │   │   ├── cua-vlm-router.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   └── local-models.mdx
│   │       │   ├── telemetry.mdx
│   │       │   └── usage-tracking.mdx
│   │       ├── cli-playbook
│   │       │   ├── commands.mdx
│   │       │   ├── index.mdx
│   │       │   └── meta.json
│   │       ├── computer-sdk
│   │       │   ├── cloud-vm-management.mdx
│   │       │   ├── commands.mdx
│   │       │   ├── computer-server
│   │       │   │   ├── Commands.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── REST-API.mdx
│   │       │   │   └── WebSocket-API.mdx
│   │       │   ├── computer-ui.mdx
│   │       │   ├── computers.mdx
│   │       │   ├── custom-computer-handlers.mdx
│   │       │   ├── meta.json
│   │       │   ├── sandboxed-python.mdx
│   │       │   └── tracing-api.mdx
│   │       ├── example-usecases
│   │       │   ├── form-filling.mdx
│   │       │   ├── gemini-complex-ui-navigation.mdx
│   │       │   ├── meta.json
│   │       │   ├── post-event-contact-export.mdx
│   │       │   └── windows-app-behind-vpn.mdx
│   │       ├── get-started
│   │       │   ├── meta.json
│   │       │   └── quickstart.mdx
│   │       ├── index.mdx
│   │       ├── macos-vm-cli-playbook
│   │       │   ├── lume
│   │       │   │   ├── cli-reference.mdx
│   │       │   │   ├── faq.md
│   │       │   │   ├── http-api.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── meta.json
│   │       │   │   └── prebuilt-images.mdx
│   │       │   ├── lumier
│   │       │   │   ├── building-lumier.mdx
│   │       │   │   ├── docker-compose.mdx
│   │       │   │   ├── docker.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   └── meta.json
│   │       │   └── meta.json
│   │       └── meta.json
│   ├── next.config.mjs
│   ├── package-lock.json
│   ├── package.json
│   ├── pnpm-lock.yaml
│   ├── postcss.config.mjs
│   ├── public
│   │   └── img
│   │       ├── agent_gradio_ui.png
│   │       ├── agent.png
│   │       ├── bg-dark.jpg
│   │       ├── bg-light.jpg
│   │       ├── cli.png
│   │       ├── computer.png
│   │       ├── grounding-with-gemini3.gif
│   │       ├── hero.png
│   │       ├── laminar_trace_example.png
│   │       ├── som_box_threshold.png
│   │       └── som_iou_threshold.png
│   ├── README.md
│   ├── source.config.ts
│   ├── src
│   │   ├── app
│   │   │   ├── (home)
│   │   │   │   ├── [[...slug]]
│   │   │   │   │   └── page.tsx
│   │   │   │   └── layout.tsx
│   │   │   ├── api
│   │   │   │   ├── posthog
│   │   │   │   │   └── [...path]
│   │   │   │   │       └── route.ts
│   │   │   │   └── search
│   │   │   │       └── route.ts
│   │   │   ├── favicon.ico
│   │   │   ├── global.css
│   │   │   ├── layout.config.tsx
│   │   │   ├── layout.tsx
│   │   │   ├── llms.mdx
│   │   │   │   └── [[...slug]]
│   │   │   │       └── route.ts
│   │   │   ├── llms.txt
│   │   │   │   └── route.ts
│   │   │   ├── robots.ts
│   │   │   └── sitemap.ts
│   │   ├── assets
│   │   │   ├── discord-black.svg
│   │   │   ├── discord-white.svg
│   │   │   ├── logo-black.svg
│   │   │   └── logo-white.svg
│   │   ├── components
│   │   │   ├── analytics-tracker.tsx
│   │   │   ├── cookie-consent.tsx
│   │   │   ├── doc-actions-menu.tsx
│   │   │   ├── editable-code-block.tsx
│   │   │   ├── footer.tsx
│   │   │   ├── hero.tsx
│   │   │   ├── iou.tsx
│   │   │   ├── mermaid.tsx
│   │   │   └── page-feedback.tsx
│   │   ├── lib
│   │   │   ├── llms.ts
│   │   │   └── source.ts
│   │   ├── mdx-components.tsx
│   │   └── providers
│   │       └── posthog-provider.tsx
│   └── tsconfig.json
├── examples
│   ├── agent_examples.py
│   ├── agent_ui_examples.py
│   ├── browser_tool_example.py
│   ├── cloud_api_examples.py
│   ├── computer_examples_windows.py
│   ├── computer_examples.py
│   ├── computer_ui_examples.py
│   ├── computer-example-ts
│   │   ├── .env.example
│   │   ├── .gitignore
│   │   ├── package-lock.json
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── README.md
│   │   ├── src
│   │   │   ├── helpers.ts
│   │   │   └── index.ts
│   │   └── tsconfig.json
│   ├── docker_examples.py
│   ├── evals
│   │   ├── hud_eval_examples.py
│   │   └── wikipedia_most_linked.txt
│   ├── pylume_examples.py
│   ├── sandboxed_functions_examples.py
│   ├── som_examples.py
│   ├── tracing_examples.py
│   ├── utils.py
│   └── winsandbox_example.py
├── img
│   ├── agent_gradio_ui.png
│   ├── agent.png
│   ├── cli.png
│   ├── computer.png
│   ├── logo_black.png
│   └── logo_white.png
├── libs
│   ├── kasm
│   │   ├── Dockerfile
│   │   ├── LICENSE
│   │   ├── README.md
│   │   └── src
│   │       └── ubuntu
│   │           └── install
│   │               └── firefox
│   │                   ├── custom_startup.sh
│   │                   ├── firefox.desktop
│   │                   └── install_firefox.sh
│   ├── lume
│   │   ├── .cursorignore
│   │   ├── CONTRIBUTING.md
│   │   ├── Development.md
│   │   ├── img
│   │   │   └── cli.png
│   │   ├── Package.resolved
│   │   ├── Package.swift
│   │   ├── README.md
│   │   ├── resources
│   │   │   └── lume.entitlements
│   │   ├── scripts
│   │   │   ├── build
│   │   │   │   ├── build-debug.sh
│   │   │   │   ├── build-release-notarized.sh
│   │   │   │   └── build-release.sh
│   │   │   └── install.sh
│   │   ├── src
│   │   │   ├── Commands
│   │   │   │   ├── Clone.swift
│   │   │   │   ├── Config.swift
│   │   │   │   ├── Create.swift
│   │   │   │   ├── Delete.swift
│   │   │   │   ├── Get.swift
│   │   │   │   ├── Images.swift
│   │   │   │   ├── IPSW.swift
│   │   │   │   ├── List.swift
│   │   │   │   ├── Logs.swift
│   │   │   │   ├── Options
│   │   │   │   │   └── FormatOption.swift
│   │   │   │   ├── Prune.swift
│   │   │   │   ├── Pull.swift
│   │   │   │   ├── Push.swift
│   │   │   │   ├── Run.swift
│   │   │   │   ├── Serve.swift
│   │   │   │   ├── Set.swift
│   │   │   │   └── Stop.swift
│   │   │   ├── ContainerRegistry
│   │   │   │   ├── ImageContainerRegistry.swift
│   │   │   │   ├── ImageList.swift
│   │   │   │   └── ImagesPrinter.swift
│   │   │   ├── Errors
│   │   │   │   └── Errors.swift
│   │   │   ├── FileSystem
│   │   │   │   ├── Home.swift
│   │   │   │   ├── Settings.swift
│   │   │   │   ├── VMConfig.swift
│   │   │   │   ├── VMDirectory.swift
│   │   │   │   └── VMLocation.swift
│   │   │   ├── LumeController.swift
│   │   │   ├── Main.swift
│   │   │   ├── Server
│   │   │   │   ├── Handlers.swift
│   │   │   │   ├── HTTP.swift
│   │   │   │   ├── Requests.swift
│   │   │   │   ├── Responses.swift
│   │   │   │   └── Server.swift
│   │   │   ├── Utils
│   │   │   │   ├── CommandRegistry.swift
│   │   │   │   ├── CommandUtils.swift
│   │   │   │   ├── Logger.swift
│   │   │   │   ├── NetworkUtils.swift
│   │   │   │   ├── Path.swift
│   │   │   │   ├── ProcessRunner.swift
│   │   │   │   ├── ProgressLogger.swift
│   │   │   │   ├── String.swift
│   │   │   │   └── Utils.swift
│   │   │   ├── Virtualization
│   │   │   │   ├── DarwinImageLoader.swift
│   │   │   │   ├── DHCPLeaseParser.swift
│   │   │   │   ├── ImageLoaderFactory.swift
│   │   │   │   └── VMVirtualizationService.swift
│   │   │   ├── VM
│   │   │   │   ├── DarwinVM.swift
│   │   │   │   ├── LinuxVM.swift
│   │   │   │   ├── VM.swift
│   │   │   │   ├── VMDetails.swift
│   │   │   │   ├── VMDetailsPrinter.swift
│   │   │   │   ├── VMDisplayResolution.swift
│   │   │   │   └── VMFactory.swift
│   │   │   └── VNC
│   │   │       ├── PassphraseGenerator.swift
│   │   │       └── VNCService.swift
│   │   └── tests
│   │       ├── Mocks
│   │       │   ├── MockVM.swift
│   │       │   ├── MockVMVirtualizationService.swift
│   │       │   └── MockVNCService.swift
│   │       ├── VM
│   │       │   └── VMDetailsPrinterTests.swift
│   │       ├── VMTests.swift
│   │       ├── VMVirtualizationServiceTests.swift
│   │       └── VNCServiceTests.swift
│   ├── lumier
│   │   ├── .dockerignore
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   └── src
│   │       ├── bin
│   │       │   └── entry.sh
│   │       ├── config
│   │       │   └── constants.sh
│   │       ├── hooks
│   │       │   └── on-logon.sh
│   │       └── lib
│   │           ├── utils.sh
│   │           └── vm.sh
│   ├── python
│   │   ├── agent
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── agent
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── adapters
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── cua_adapter.py
│   │   │   │   │   ├── huggingfacelocal_adapter.py
│   │   │   │   │   ├── human_adapter.py
│   │   │   │   │   ├── mlxvlm_adapter.py
│   │   │   │   │   └── models
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── generic.py
│   │   │   │   │       ├── internvl.py
│   │   │   │   │       ├── opencua.py
│   │   │   │   │       └── qwen2_5_vl.py
│   │   │   │   ├── agent.py
│   │   │   │   ├── callbacks
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── budget_manager.py
│   │   │   │   │   ├── image_retention.py
│   │   │   │   │   ├── logging.py
│   │   │   │   │   ├── operator_validator.py
│   │   │   │   │   ├── pii_anonymization.py
│   │   │   │   │   ├── prompt_instructions.py
│   │   │   │   │   ├── telemetry.py
│   │   │   │   │   └── trajectory_saver.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── computers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cua.py
│   │   │   │   │   └── custom.py
│   │   │   │   ├── decorators.py
│   │   │   │   ├── human_tool
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   ├── server.py
│   │   │   │   │   └── ui.py
│   │   │   │   ├── integrations
│   │   │   │   │   └── hud
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── agent.py
│   │   │   │   │       └── proxy.py
│   │   │   │   ├── loops
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── anthropic.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── composed_grounded.py
│   │   │   │   │   ├── gelato.py
│   │   │   │   │   ├── gemini.py
│   │   │   │   │   ├── generic_vlm.py
│   │   │   │   │   ├── glm45v.py
│   │   │   │   │   ├── gta1.py
│   │   │   │   │   ├── holo.py
│   │   │   │   │   ├── internvl.py
│   │   │   │   │   ├── model_types.csv
│   │   │   │   │   ├── moondream3.py
│   │   │   │   │   ├── omniparser.py
│   │   │   │   │   ├── openai.py
│   │   │   │   │   ├── opencua.py
│   │   │   │   │   ├── uiins.py
│   │   │   │   │   ├── uitars.py
│   │   │   │   │   └── uitars2.py
│   │   │   │   ├── proxy
│   │   │   │   │   ├── examples.py
│   │   │   │   │   └── handlers.py
│   │   │   │   ├── responses.py
│   │   │   │   ├── tools
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── browser_tool.py
│   │   │   │   ├── types.py
│   │   │   │   └── ui
│   │   │   │       ├── __init__.py
│   │   │   │       ├── __main__.py
│   │   │   │       └── gradio
│   │   │   │           ├── __init__.py
│   │   │   │           ├── app.py
│   │   │   │           └── ui_components.py
│   │   │   ├── benchmarks
│   │   │   │   ├── .gitignore
│   │   │   │   ├── contrib.md
│   │   │   │   ├── interactive.py
│   │   │   │   ├── models
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   └── gta1.py
│   │   │   │   ├── README.md
│   │   │   │   ├── ss-pro.py
│   │   │   │   ├── ss-v2.py
│   │   │   │   └── utils.py
│   │   │   ├── example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_computer_agent.py
│   │   ├── bench-ui
│   │   │   ├── bench_ui
│   │   │   │   ├── __init__.py
│   │   │   │   ├── api.py
│   │   │   │   └── child.py
│   │   │   ├── examples
│   │   │   │   ├── folder_example.py
│   │   │   │   ├── gui
│   │   │   │   │   ├── index.html
│   │   │   │   │   ├── logo.svg
│   │   │   │   │   └── styles.css
│   │   │   │   ├── output_overlay.png
│   │   │   │   └── simple_example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       └── test_port_detection.py
│   │   ├── computer
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── computer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── computer.py
│   │   │   │   ├── diorama_computer.py
│   │   │   │   ├── helpers.py
│   │   │   │   ├── interface
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   ├── models.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── logger.py
│   │   │   │   ├── models.py
│   │   │   │   ├── providers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cloud
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── docker
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── lume
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── lume_api.py
│   │   │   │   │   ├── lumier
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── types.py
│   │   │   │   │   └── winsandbox
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── provider.py
│   │   │   │   │       └── setup_script.ps1
│   │   │   │   ├── tracing_wrapper.py
│   │   │   │   ├── tracing.py
│   │   │   │   ├── ui
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   └── gradio
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       └── app.py
│   │   │   │   └── utils.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_computer.py
│   │   ├── computer-server
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── computer_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── browser.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── diorama
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── diorama_computer.py
│   │   │   │   │   ├── diorama.py
│   │   │   │   │   ├── draw.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── safezone.py
│   │   │   │   ├── handlers
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── main.py
│   │   │   │   ├── server.py
│   │   │   │   ├── utils
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── wallpaper.py
│   │   │   │   └── watchdog.py
│   │   │   ├── examples
│   │   │   │   ├── __init__.py
│   │   │   │   └── usage_example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   ├── run_server.py
│   │   │   ├── test_connection.py
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_server.py
│   │   ├── core
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── core
│   │   │   │   ├── __init__.py
│   │   │   │   └── telemetry
│   │   │   │       ├── __init__.py
│   │   │   │       └── posthog.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_telemetry.py
│   │   ├── mcp-server
│   │   │   ├── .bumpversion.cfg
│   │   │   ├── build-extension.py
│   │   │   ├── CONCURRENT_SESSIONS.md
│   │   │   ├── desktop-extension
│   │   │   │   ├── cua-extension.mcpb
│   │   │   │   ├── desktop_extension.png
│   │   │   │   ├── manifest.json
│   │   │   │   ├── README.md
│   │   │   │   ├── requirements.txt
│   │   │   │   ├── run_server.sh
│   │   │   │   └── setup.py
│   │   │   ├── mcp_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── server.py
│   │   │   │   └── session_manager.py
│   │   │   ├── pdm.lock
│   │   │   ├── pyproject.toml
│   │   │   ├── QUICK_TEST_COMMANDS.sh
│   │   │   ├── quick_test_local_option.py
│   │   │   ├── README.md
│   │   │   ├── scripts
│   │   │   │   ├── install_mcp_server.sh
│   │   │   │   └── start_mcp_server.sh
│   │   │   ├── test_mcp_server_local_option.py
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_mcp_server.py
│   │   ├── pylume
│   │   │   └── tests
│   │   │       ├── conftest.py
│   │   │       └── test_pylume.py
│   │   └── som
│   │       ├── .bumpversion.cfg
│   │       ├── LICENSE
│   │       ├── poetry.toml
│   │       ├── pyproject.toml
│   │       ├── README.md
│   │       ├── som
│   │       │   ├── __init__.py
│   │       │   ├── detect.py
│   │       │   ├── detection.py
│   │       │   ├── models.py
│   │       │   ├── ocr.py
│   │       │   ├── util
│   │       │   │   └── utils.py
│   │       │   └── visualization.py
│   │       └── tests
│   │           ├── conftest.py
│   │           └── test_omniparser.py
│   ├── qemu-docker
│   │   ├── linux
│   │   │   ├── Dockerfile
│   │   │   ├── README.md
│   │   │   └── src
│   │   │       ├── entry.sh
│   │   │       └── vm
│   │   │           ├── image
│   │   │           │   └── README.md
│   │   │           └── setup
│   │   │               ├── install.sh
│   │   │               ├── setup-cua-server.sh
│   │   │               └── setup.sh
│   │   ├── README.md
│   │   └── windows
│   │       ├── Dockerfile
│   │       ├── README.md
│   │       └── src
│   │           ├── entry.sh
│   │           └── vm
│   │               ├── image
│   │               │   └── README.md
│   │               └── setup
│   │                   ├── install.bat
│   │                   ├── on-logon.ps1
│   │                   ├── setup-cua-server.ps1
│   │                   ├── setup-utils.psm1
│   │                   └── setup.ps1
│   ├── typescript
│   │   ├── .gitignore
│   │   ├── .nvmrc
│   │   ├── agent
│   │   │   ├── examples
│   │   │   │   ├── playground-example.html
│   │   │   │   └── README.md
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── client.ts
│   │   │   │   ├── index.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   └── client.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── computer
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── computer
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── providers
│   │   │   │   │   │   ├── base.ts
│   │   │   │   │   │   ├── cloud.ts
│   │   │   │   │   │   └── index.ts
│   │   │   │   │   └── types.ts
│   │   │   │   ├── index.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── base.ts
│   │   │   │   │   ├── factory.ts
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── linux.ts
│   │   │   │   │   ├── macos.ts
│   │   │   │   │   └── windows.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   ├── computer
│   │   │   │   │   └── cloud.test.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── factory.test.ts
│   │   │   │   │   ├── index.test.ts
│   │   │   │   │   ├── linux.test.ts
│   │   │   │   │   ├── macos.test.ts
│   │   │   │   │   └── windows.test.ts
│   │   │   │   └── setup.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── core
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── index.ts
│   │   │   │   └── telemetry
│   │   │   │       ├── clients
│   │   │   │       │   ├── index.ts
│   │   │   │       │   └── posthog.ts
│   │   │   │       └── index.ts
│   │   │   ├── tests
│   │   │   │   └── telemetry.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── cua-cli
│   │   │   ├── .gitignore
│   │   │   ├── .prettierrc
│   │   │   ├── bun.lock
│   │   │   ├── CLAUDE.md
│   │   │   ├── index.ts
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── auth.ts
│   │   │   │   ├── cli.ts
│   │   │   │   ├── commands
│   │   │   │   │   ├── auth.ts
│   │   │   │   │   └── sandbox.ts
│   │   │   │   ├── config.ts
│   │   │   │   ├── http.ts
│   │   │   │   ├── storage.ts
│   │   │   │   └── util.ts
│   │   │   └── tsconfig.json
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── pnpm-workspace.yaml
│   │   └── README.md
│   └── xfce
│       ├── .dockerignore
│       ├── .gitignore
│       ├── Development.md
│       ├── Dockerfile
│       ├── Dockerfile.dev
│       ├── README.md
│       └── src
│           ├── scripts
│           │   ├── resize-display.sh
│           │   ├── start-computer-server.sh
│           │   ├── start-novnc.sh
│           │   ├── start-vnc.sh
│           │   └── xstartup.sh
│           ├── supervisor
│           │   └── supervisord.conf
│           └── xfce-config
│               ├── helpers.rc
│               ├── xfce4-power-manager.xml
│               └── xfce4-session.xml
├── LICENSE.md
├── Makefile
├── notebooks
│   ├── agent_nb.ipynb
│   ├── blog
│   │   ├── build-your-own-operator-on-macos-1.ipynb
│   │   └── build-your-own-operator-on-macos-2.ipynb
│   ├── composite_agents_docker_nb.ipynb
│   ├── computer_nb.ipynb
│   ├── computer_server_nb.ipynb
│   ├── customizing_computeragent.ipynb
│   ├── eval_osworld.ipynb
│   ├── ollama_nb.ipynb
│   ├── README.md
│   ├── sota_hackathon_cloud.ipynb
│   └── sota_hackathon.ipynb
├── package-lock.json
├── package.json
├── pnpm-lock.yaml
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── scripts
│   ├── install-cli.ps1
│   ├── install-cli.sh
│   ├── playground-docker.sh
│   ├── playground.sh
│   ├── run-docker-dev.sh
│   └── typescript-typecheck.js
├── TESTING.md
├── tests
│   ├── agent_loop_testing
│   │   ├── agent_test.py
│   │   └── README.md
│   ├── pytest.ini
│   ├── shell_cmd.py
│   ├── test_files.py
│   ├── test_mcp_server_session_management.py
│   ├── test_mcp_server_streaming.py
│   ├── test_shell_bash.py
│   ├── test_telemetry.py
│   ├── test_tracing.py
│   ├── test_venv.py
│   └── test_watchdog.py
└── uv.lock
```

# Files

--------------------------------------------------------------------------------
/docs/content/docs/get-started/quickstart.mdx:
--------------------------------------------------------------------------------

```markdown
  1 | ---
  2 | title: Quickstart
  3 | description: Get started with Cua
  4 | ---
  5 | 
  6 | import { Step, Steps } from 'fumadocs-ui/components/steps';
  7 | import { Tab, Tabs } from 'fumadocs-ui/components/tabs';
  8 | import { Accordion, Accordions } from 'fumadocs-ui/components/accordion';
  9 | import { Code, Terminal } from 'lucide-react';
 10 | 
 11 | {/* Choose your quickstart path:
 12 | 
 13 | <div className="grid grid-cols-1 md:grid-cols-2 gap-6 mt-8 mb-8">
 14 |   <Card icon={<Code />} href="#developer-quickstart" title="Developer Quickstart">
 15 |     Build with Python or TypeScript SDKs - full programmatic control
 16 |   </Card>
 17 |   <Card icon={<Terminal />} href="#cli-quickstart" title="CLI Quickstart">
 18 |     Get started quickly with the command-line interface
 19 |   </Card>
 20 | </div> */}
 21 | 
 22 | ## Set Up Your Computer Environment
 23 | 
 24 | Choose how you want to run your Cua computer. This will be the environment where your automated tasks will execute.
 25 | 
 26 | You can run your Cua computer in the cloud (recommended for easiest setup), locally on macOS with Lume, locally on Windows with a Windows Sandbox, or in a Docker container on any platform. Choose the option that matches your system and needs.
 27 | 
 28 | <Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox', 'Windows Sandbox']}>
 29 |   <Tab value="Cloud Sandbox">
 30 | 
 31 |     Create and manage cloud sandboxes that run Linux (Ubuntu), Windows, or macOS.
 32 | 
 33 |     **First, create your API key:**
 34 | 
 35 |     1. Go to [cua.ai/signin](https://cua.ai/signin)
 36 |     2. Navigate to **Dashboard > API Keys > New API Key** to create your API key
 37 |     3. **Important:** Copy and save your API key immediately - you won't be able to see it again (you'll need to regenerate it if lost)
 38 | 
 39 |     **Then, create your sandbox using either option:**
 40 | 
 41 |     **Option 1: Via Website**
 42 | 
 43 |     1. Navigate to **Dashboard > Sandboxes > Create Sandbox**
 44 |     2. Create a sandbox, choosing **Linux**, **Windows**, or **macOS**
 45 |     3. Note your sandbox name
 46 | 
 47 |     **Option 2: Via CLI**
 48 | 
 49 |     1. Install the CUA CLI:
 50 |     ```bash
 51 |     # macOS/Linux
 52 |     curl -LsSf https://cua.ai/cli/install.sh | sh
 53 | 
 54 |     # Windows
 55 |     powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
 56 |     ```
 57 | 
 58 |     2. Login and create a sandbox:
 59 |     ```bash
 60 |     cua auth login
 61 |     cua sb create --os linux --size small --region north-america
 62 |     ```
 63 | 
 64 |     3. Note your sandbox name and password from the output
 65 | 
 66 |     Your Cloud Sandbox will be automatically configured and ready to use.
 67 | 
 68 |   </Tab>
 69 |   <Tab value="Linux on Docker">
 70 | 
 71 |     Run a Linux desktop locally on macOS, Windows, or Linux hosts.
 72 | 
 73 |     1. Install Docker Desktop or Docker Engine
 74 | 
 75 |     2. Pull a CUA Docker image:
 76 | 
 77 |     ```bash
 78 |     # XFCE (Lightweight) - recommended for most use cases
 79 |     docker pull --platform=linux/amd64 trycua/cua-xfce:latest
 80 | 
 81 |     # OR KASM (Full-Featured) - full Ubuntu desktop
 82 |     docker pull --platform=linux/amd64 trycua/cua-ubuntu:latest
 83 |     ```
 84 | 
 85 |   </Tab>
 86 |   <Tab value="macOS Sandbox">
 87 | 
 88 |     macOS hosts only - requires Lume CLI.
 89 | 
 90 |     1. Install the Lume CLI:
 91 | 
 92 |     ```bash
 93 |     /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
 94 |     ```
 95 | 
 96 |     2. Start a local Cua sandbox:
 97 | 
 98 |     ```bash
 99 |     lume run macos-sequoia-cua:latest
100 |     ```
101 | 
102 |   </Tab>
103 |   <Tab value="Windows Sandbox">
104 | 
105 |     Windows hosts only - requires Windows 10 Pro/Enterprise or Windows 11.
106 | 
107 |     1. Enable [Windows Sandbox](https://learn.microsoft.com/en-us/windows/security/application-security/application-isolation/windows-sandbox/windows-sandbox-install)
108 |     2. Install the `pywinsandbox` dependency:
109 | 
110 |     ```bash
111 |     pip install -U git+https://github.com/karkason/pywinsandbox.git
112 |     ```
113 | 
114 |     3. Windows Sandbox will be automatically configured when you run the CLI
115 | 
116 |   </Tab>
117 | </Tabs>
118 | 
119 | ---
120 | 
121 | ## Developer Quickstart
122 | 
123 | <Callout type="warn" title="Python Version Compatibility">
124 |   Cua packages require **Python 3.12 or 3.13**. Python 3.14 is not yet supported because of dependency incompatibilities (pydantic-core/PyO3). If you encounter build errors on Python 3.14, switch to Python 3.12 or 3.13.
125 | </Callout>
126 | 
127 | <Steps>
128 | 
129 | <Step>
130 | 
131 | ### Using Computer
132 | 
133 | Connect to your Cua computer and perform basic interactions, such as taking screenshots or simulating user input.
134 | 
135 | <Tabs items={['Python', 'TypeScript']}>
136 |   <Tab value="Python">
137 |     Install the Cua computer Python SDK:
138 |     ```bash
139 |     pip install cua-computer
140 |     ```
141 | 
142 |     Then, connect to your desired computer environment:
143 | 
144 |     <Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox', 'Windows Sandbox', 'Your host desktop']}>
145 |       <Tab value="Cloud Sandbox">
146 |         Set your CUA API key (same key used for model inference) and connect to your sandbox:
147 |         ```python
148 |         import os
149 |         from computer import Computer
150 |         import asyncio
151 | 
152 |         os.environ["CUA_API_KEY"] = "sk_cua-api01_..."
153 | 
154 |         computer = Computer(
155 |             os_type="linux",  # or "windows" or "macos"
156 |             provider_type="cloud",
157 |             name="your-sandbox-name"  # from CLI or website
158 |         )
159 | 
160 |         async def main():
161 |             await computer.run()  # Connect to the sandbox
162 |             # Alternative: If your VM is not running, use start() instead:
163 |             # await computer.start()  # Start and connect to the sandbox
164 | 
165 |             try:
166 |                 # Take a screenshot of the computer's current display
167 |                 screenshot = await computer.interface.screenshot()
168 |                 # Simulate a left-click at coordinates (100, 100)
169 |                 await computer.interface.left_click(100, 100)
170 |                 # Type "Hello!" into the active application
171 |                 await computer.interface.type_text("Hello!")
172 |             finally:
173 |                 await computer.disconnect()
174 |                 # Alternative: If you want to fully stop the VM, use stop() instead:
175 |                 # await computer.stop()  # Fully stop VM and disconnect
176 | 
177 |         asyncio.run(main())
178 |         ```
179 |       </Tab>
180 |       <Tab value="Linux on Docker">
181 |         ```python
182 |         from computer import Computer
183 |         import asyncio
184 | 
185 |         computer = Computer(
186 |             os_type="linux",
187 |             provider_type="docker",
188 |             image="trycua/cua-xfce:latest"  # or "trycua/cua-ubuntu:latest"
189 |         )
190 | 
191 |         async def main():
192 |             await computer.run()  # Launch & connect to the sandbox
193 |             # Alternative: If your VM is not running, use start() instead:
194 |             # await computer.start()  # Start and connect to the sandbox
195 | 
196 |             try:
197 |                 # Take a screenshot of the computer's current display
198 |                 screenshot = await computer.interface.screenshot()
199 |                 # Simulate a left-click at coordinates (100, 100)
200 |                 await computer.interface.left_click(100, 100)
201 |                 # Type "Hello!" into the active application
202 |                 await computer.interface.type_text("Hello!")
203 |             finally:
204 |                 await computer.disconnect()
205 |                 # Alternative: If you want to fully stop the VM, use stop() instead:
206 |                 # await computer.stop()  # Fully stop VM and disconnect
207 | 
208 |         asyncio.run(main())
209 |         ```
210 |       </Tab>
211 |       <Tab value="macOS Sandbox">
212 |         ```python
213 |         from computer import Computer
214 |         import asyncio
215 | 
216 |         computer = Computer(
217 |             os_type="macos",
218 |             provider_type="lume",
219 |             name="macos-sequoia-cua:latest"
220 |         )
221 | 
222 |         async def main():
223 |             await computer.run()  # Launch & connect to the sandbox
224 |             # Alternative: If your VM is not running, use start() instead:
225 |             # await computer.start()  # Start and connect to the sandbox
226 | 
227 |             try:
228 |                 # Take a screenshot of the computer's current display
229 |                 screenshot = await computer.interface.screenshot()
230 |                 # Simulate a left-click at coordinates (100, 100)
231 |                 await computer.interface.left_click(100, 100)
232 |                 # Type "Hello!" into the active application
233 |                 await computer.interface.type_text("Hello!")
234 |             finally:
235 |                 await computer.disconnect()
236 |                 # Alternative: If you want to fully stop the VM, use stop() instead:
237 |                 # await computer.stop()  # Fully stop VM and disconnect
238 | 
239 |         asyncio.run(main())
240 |         ```
241 |       </Tab>
242 |       <Tab value="Windows Sandbox">
243 |         ```python
244 |         from computer import Computer
245 |         import asyncio
246 | 
247 |         computer = Computer(
248 |             os_type="windows",
249 |             provider_type="windows_sandbox"
250 |         )
251 | 
252 |         async def main():
253 |             await computer.run()  # Launch & connect to the sandbox
254 |             # Alternative: If your VM is not running, use start() instead:
255 |             # await computer.start()  # Start and connect to the sandbox
256 | 
257 |             try:
258 |                 # Take a screenshot of the computer's current display
259 |                 screenshot = await computer.interface.screenshot()
260 |                 # Simulate a left-click at coordinates (100, 100)
261 |                 await computer.interface.left_click(100, 100)
262 |                 # Type "Hello!" into the active application
263 |                 await computer.interface.type_text("Hello!")
264 |             finally:
265 |                 await computer.disconnect()
266 |                 # Alternative: If you want to fully stop the VM, use stop() instead:
267 |                 # await computer.stop()  # Fully stop VM and disconnect
268 | 
269 |         asyncio.run(main())
270 |         ```
271 |       </Tab>
272 |       <Tab value="Your host desktop">
273 |         Install and run `cua-computer-server`:
274 |         ```bash
275 |         pip install cua-computer-server
276 |         python -m computer_server
277 |         ```
278 | 
279 |         Then, use the `Computer` object to connect:
280 |         ```python
281 |         from computer import Computer
282 |         import asyncio
283 | 
284 |         computer = Computer(use_host_computer_server=True)
285 | 
286 |         async def main():
287 |             await computer.run()  # Connect to the host desktop
288 |             # Alternative: If your computer server is not running, use start() instead:
289 |             # await computer.start()  # Start and connect to the host desktop
290 | 
291 |             try:
292 |                 # Take a screenshot of the computer's current display
293 |                 screenshot = await computer.interface.screenshot()
294 |                 # Simulate a left-click at coordinates (100, 100)
295 |                 await computer.interface.left_click(100, 100)
296 |                 # Type "Hello!" into the active application
297 |                 await computer.interface.type_text("Hello!")
298 |             finally:
299 |                 await computer.disconnect()
300 |                 # Alternative: If you want to fully stop everything, use stop() instead:
301 |                 # await computer.stop()  # Fully stop and disconnect
302 | 
303 |         asyncio.run(main())
304 |         ```
305 |       </Tab>
306 |     </Tabs>
307 | 
308 |   </Tab>
309 |   <Tab value="TypeScript">
310 |     <Callout type="warn" title="TypeScript SDK Deprecated">
311 |       The TypeScript interface is currently deprecated. We're working on version 0.2.0 with improved TypeScript support. In the meantime, please use the Python SDK.
312 |     </Callout>
313 | 
314 |     Install the Cua computer TypeScript SDK:
315 |     ```bash
316 |     npm install @trycua/computer
317 |     ```
318 | 
319 |     Then, connect to your desired computer environment:
320 | 
321 |     <Tabs items={['Cloud Sandbox', 'Linux on Docker', 'macOS Sandbox', 'Windows Sandbox', 'Your host desktop']}>
322 |       <Tab value="Cloud Sandbox">
323 |         Set your CUA API key (same key used for model inference):
324 |         ```bash
325 |         export CUA_API_KEY="sk_cua-api01_..."
326 |         ```
327 | 
328 |         Then connect to your sandbox:
329 |         ```typescript
330 |         import { Computer, OSType } from '@trycua/computer';
331 | 
332 |         const computer = new Computer({
333 |           osType: OSType.LINUX,  // or OSType.WINDOWS or OSType.MACOS
334 |           name: "your-sandbox-name"  // from CLI or website
335 |         });
336 |         await computer.run(); // Connect to the sandbox
337 |         ```
338 |       </Tab>
339 |       <Tab value="Linux on Docker">
340 |         ```typescript
341 |         import { Computer, OSType, ProviderType } from '@trycua/computer';
342 | 
343 |         const computer = new Computer({
344 |           osType: OSType.LINUX,
345 |           providerType: ProviderType.DOCKER,
346 |           image: "trycua/cua-xfce:latest"  // or "trycua/cua-ubuntu:latest"
347 |         });
348 |         await computer.run(); // Launch & connect to the sandbox
349 |         ```
350 |       </Tab>
351 |       <Tab value="macOS Sandbox">
352 |         ```typescript
353 |         import { Computer, OSType, ProviderType } from '@trycua/computer';
354 | 
355 |         const computer = new Computer({
356 |           osType: OSType.MACOS,
357 |           providerType: ProviderType.LUME,
358 |           name: "macos-sequoia-cua:latest"
359 |         });
360 |         await computer.run(); // Launch & connect to the sandbox
361 |         ```
362 |       </Tab>
363 |       <Tab value="Windows Sandbox">
364 |         ```typescript
365 |         import { Computer, OSType, ProviderType } from '@trycua/computer';
366 | 
367 |         const computer = new Computer({
368 |           osType: OSType.WINDOWS,
369 |           providerType: ProviderType.WINDOWS_SANDBOX
370 |         });
371 |         await computer.run(); // Launch & connect to the sandbox
372 |         ```
373 |       </Tab>
374 |       <Tab value="Your host desktop">
375 |         First, install and run `cua-computer-server`:
376 |         ```bash
377 |         pip install cua-computer-server
378 |         python -m computer_server
379 |         ```
380 | 
381 |         Then, use the `Computer` object to connect:
382 |         ```typescript
383 |         import { Computer } from '@trycua/computer';
384 | 
385 |         const computer = new Computer({ useHostComputerServer: true });
386 |         await computer.run(); // Connect to the host desktop
387 |         ```
388 |       </Tab>
389 |     </Tabs>
390 | 
391 |     Once connected, you can perform interactions:
392 |     ```typescript
393 |     try {
394 |       // Take a screenshot of the computer's current display
395 |       const screenshot = await computer.interface.screenshot();
396 |       // Simulate a left-click at coordinates (100, 100)
397 |       await computer.interface.leftClick(100, 100);
398 |       // Type "Hello!" into the active application
399 |       await computer.interface.typeText("Hello!");
400 |     } finally {
401 |       await computer.disconnect();
402 |     }
403 |     ```
404 | 
405 |   </Tab>
406 | </Tabs>
407 | 
408 | Learn more about computers in the [Cua computers documentation](/computer-sdk/computers). You will see how to automate computers with agents in the next step.
409 | 
410 | </Step>
411 | 
412 | <Step>
413 | 
414 | ### Using Agent
415 | 
416 | Utilize an Agent to automate complex tasks by providing it with a goal and allowing it to interact with the computer environment.
417 | 
418 | Install the Cua agent Python SDK:
419 | 
420 | ```bash
421 | pip install "cua-agent[all]"
422 | ```
423 | 
424 | Choose how you want to access vision-language models for your agent:
425 | 
426 | <Tabs items={['CUA VLM Router', 'BYOK (Bring Your Own Key)']}>
427 |   <Tab value="CUA VLM Router">
428 | 
429 |     Use CUA's inference API to access multiple model providers with a single API key (same key used for sandbox access). CUA VLM Router provides intelligent routing and cost optimization.
430 | 
431 |     **Use the agent with CUA models:**
432 |     ```python
433 |     import os
434 |     import asyncio
435 |     from computer import Computer
436 |     from agent import ComputerAgent
437 | 
438 |     os.environ["CUA_API_KEY"] = "sk_cua-api01_..."
439 | 
440 |     computer = Computer(
441 |         os_type="linux",  # or "windows" or "macos"
442 |         provider_type="cloud",
443 |         name="your-sandbox-name"  # from CLI or website
444 |     )
445 | 
446 |     async def main():
447 |         await computer.run()  # Connect to the sandbox
448 |         # Alternative: If your VM is not running, use start() instead:
449 |         # await computer.start()  # Start and connect to the sandbox
450 | 
451 |         try:
452 |             agent = ComputerAgent(
453 |                 model="cua/anthropic/claude-sonnet-4.5",  # CUA-routed model
454 |                 tools=[computer],
455 |                 max_trajectory_budget=5.0
456 |             )
457 | 
458 |             messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
459 | 
460 |             async for result in agent.run(messages):
461 |                 for item in result["output"]:
462 |                     if item["type"] == "message":
463 |                         print(item["content"][0]["text"])
464 |         finally:
465 |             await computer.disconnect()
466 |             # Alternative: If you want to fully stop the VM, use stop() instead:
467 |             # await computer.stop()  # Fully stop VM and disconnect
468 | 
469 |     asyncio.run(main())
470 |     ```
471 | 
472 |     **Available CUA models:**
473 |     - `cua/anthropic/claude-sonnet-4.5` - Claude Sonnet 4.5 (recommended)
474 |     - `cua/anthropic/claude-opus-4.5` - Claude Opus 4.5 (enhanced agentic capabilities)
475 |     - `cua/anthropic/claude-haiku-4.5` - Claude Haiku 4.5 (faster, cost-effective)
476 |     - `cua/qwen/qwen3-vl-235b` - Qwen3 VL 235B (large-scale vision-language tasks)
477 | 
478 |     **Benefits:**
479 |     - Single API key for multiple providers
480 |     - Cost tracking and optimization
481 |     - No need to manage multiple provider keys
482 | 
483 |   </Tab>
484 |   <Tab value="BYOK (Bring Your Own Key)">
485 | 
486 |     Use your own API keys from model providers like Anthropic, OpenAI, or others.
487 | 
488 |     **Use the agent with your provider:**
489 |     ```python
490 |     import os
491 |     import asyncio
492 |     from computer import Computer
493 |     from agent import ComputerAgent
494 | 
495 |     # Set your provider API key
496 |     os.environ["ANTHROPIC_API_KEY"] = "sk-ant-..."  # For Anthropic
497 |     # OR
498 |     os.environ["OPENAI_API_KEY"] = "sk-..."  # For OpenAI
499 | 
500 |     computer = Computer(
501 |         os_type="linux",  # or "windows" or "macos"
502 |         provider_type="cloud",
503 |         name="your-sandbox-name"  # from CLI or website
504 |     )
505 | 
506 |     async def main():
507 |         await computer.run()  # Connect to the sandbox
508 |         # Alternative: If your VM is not running, use start() instead:
509 |         # await computer.start()  # Start and connect to the sandbox
510 | 
511 |         try:
512 |             agent = ComputerAgent(
513 |                 model="anthropic/claude-sonnet-4-5-20250929",  # Direct provider model
514 |                 tools=[computer],
515 |                 max_trajectory_budget=5.0
516 |             )
517 | 
518 |             messages = [{"role": "user", "content": "Take a screenshot and tell me what you see"}]
519 | 
520 |             async for result in agent.run(messages):
521 |                 for item in result["output"]:
522 |                     if item["type"] == "message":
523 |                         print(item["content"][0]["text"])
524 |         finally:
525 |             await computer.disconnect()
526 |             # Alternative: If you want to fully stop the VM, use stop() instead:
527 |             # await computer.stop()  # Fully stop VM and disconnect
528 | 
529 |     asyncio.run(main())
530 |     ```
531 | 
532 |     **Supported providers:**
533 |     - `anthropic/claude-*` - Anthropic Claude models
534 |     - `openai/gpt-*` - OpenAI GPT models
535 |     - `openai/o1-*` - OpenAI o1 models
536 |     - `huggingface-local/*` - Local HuggingFace models
537 |     - And many more via LiteLLM
538 | 
539 |     See [Supported Models](/agent-sdk/supported-model-providers/) for the complete list.
540 | 
541 |   </Tab>
542 | </Tabs>
543 | 
544 | Learn more about agents in [Agent Loops](/agent-sdk/agent-loops) and available models in [Supported Models](/agent-sdk/supported-model-providers/).
545 | 
546 | </Step>
547 | </Steps>
548 | 
549 | ### Next Steps
550 | 
551 | - Learn more about [Cua computers](/computer-sdk/computers) and [computer commands](/computer-sdk/commands)
552 | - Read about [Agent loops](/agent-sdk/agent-loops), [tools](/agent-sdk/custom-tools), and [supported model providers](/agent-sdk/supported-model-providers/)
553 | - Join our [Discord community](https://discord.com/invite/mVnXXpdE85) for help
554 | - Try out the [Form Filling](/example-usecases/form-filling) preset use case
555 | 
556 | {/* ---
557 | 
558 | ## CLI Quickstart
559 | 
560 | Get started quickly with the CUA CLI - the easiest way to manage cloud sandboxes and run AI agents.
561 | 
562 | <Steps>
563 | <Step>
564 | 
565 | ### Install the CUA CLI
566 | 
567 | <Tabs items={['macOS / Linux', 'Windows', 'Bun (Alternative)', 'From Source']}>
568 |   <Tab value="macOS / Linux">
569 |     ```bash
570 |     curl -LsSf https://cua.ai/cli/install.sh | sh
571 |     ```
572 |   </Tab>
573 |   <Tab value="Windows">
574 |     ```powershell
575 |     powershell -ExecutionPolicy ByPass -c "irm https://cua.ai/cli/install.ps1 | iex"
576 |     ```
577 |   </Tab>
578 |   <Tab value="Bun (Alternative)">
579 |     ```bash
580 |     # Install Bun if you don't have it
581 |     curl -fsSL https://bun.sh/install | bash
582 | 
583 |     # Install CUA CLI
584 |     bun add -g @trycua/cli
585 |     ```
586 |   </Tab>
587 |   <Tab value="From Source">
588 |     ```bash
589 |     # Install Bun (macOS/Linux)
590 |     curl -fsSL https://bun.sh/install | bash
591 | 
592 |     # Install Bun (Windows)
593 |     # powershell -c "irm bun.sh/install.ps1|iex"
594 | 
595 |     # Clone the repo
596 |     git clone https://github.com/trycua/cua
597 |     cd cua/libs/typescript/cua-cli
598 | 
599 |     # Install the CLI
600 |     bun install
601 |     bun link
602 |     bun link cua-cli
603 |     ```
604 | 
605 |   </Tab>
606 | </Tabs>
607 | 
608 | </Step>
609 | 
610 | <Step>
611 | 
612 | ### Authenticate with CUA
613 | 
614 | Log in to your CUA account:
615 | 
616 | ```bash
617 | # Interactive browser login (recommended)
618 | cua auth login
619 | 
620 | # Or provide your API key directly
621 | cua auth login --api-key sk-your-api-key-here
622 | ```
623 | 
624 | If you don't have a CUA account yet, sign up at [cua.ai/signin](https://cua.ai/signin).
625 | 
626 | </Step>
627 | 
628 | <Step>
629 | 
630 | ### Create Your First Sandbox
631 | 
632 | Create a cloud sandbox where your AI agents will run:
633 | 
634 | ```bash
635 | # Create a Linux sandbox (recommended for most use cases)
636 | cua sb create --os linux --size small --region north-america
637 | 
638 | # Or create a Windows sandbox
639 | cua sb create --os windows --size small --region north-america
640 | 
641 | ```
642 | 
643 | Your sandbox will be created and you'll see output like:
644 | 
645 | ```
646 | Sandbox created and ready: my-sandbox-abc123
647 | Password: secure-password-here
648 | Host: my-sandbox-abc123.sandbox.cua.ai
649 | ```
650 | 
651 | </Step>
652 | 
653 | <Step>
654 | 
655 | ### Start Using Your Sandbox
656 | 
657 | You can now interact with your sandbox in multiple ways:
658 | 
659 | 
660 | 
661 | #### Option 1: Access VNC Desktop
662 | 
663 | ```bash
664 | cua sb vnc my-sandbox-abc123
665 | ```
666 | 
667 | This opens a remote desktop connection to your sandbox.
668 | 
669 | #### Option 2: List and Manage Sandboxes
670 | 
671 | ```bash
672 | # List all your sandboxes
673 | cua sb list
674 | 
675 | # Start/stop sandboxes as needed
676 | cua sb stop my-sandbox-abc123
677 | cua sb start my-sandbox-abc123
678 | 
679 | # Delete sandboxes when done
680 | cua sb delete my-sandbox-abc123
681 | ```
682 | 
683 | </Step>
684 | 
685 | </Steps>
686 | 
687 | ### What's Next?
688 | 
689 | - **Explore more commands**: Check out the [complete CLI reference](/libraries/cua-cli/commands)
690 | - **Learn about programming**: Try the [Developer Quickstart](#developer-quickstart) to build custom automations
691 | - **Join the community**: Get help in our [Discord community](https://discord.com/invite/mVnXXpdE85)
692 | 
693 | ---
694 | 
695 | For running models locally, see [Running Models Locally](/agent-sdk/supported-model-providers/local-models). */}
696 | 
```

--------------------------------------------------------------------------------
/libs/python/computer-server/computer_server/diorama/diorama.py:
--------------------------------------------------------------------------------

```python
  1 | #!/usr/bin/env python3
  2 | """Diorama: A virtual desktop manager for macOS"""
  3 | 
  4 | import asyncio
  5 | import io
  6 | import logging
  7 | import os
  8 | import sys
  9 | from typing import Union
 10 | 
 11 | from computer_server.diorama.diorama_computer import DioramaComputer
 12 | from computer_server.diorama.draw import (
 13 |     AppActivationContext,
 14 |     capture_all_apps,
 15 |     get_all_windows,
 16 |     get_frontmost_and_active_app,
 17 |     get_running_apps,
 18 | )
 19 | from computer_server.handlers.macos import *
 20 | from PIL import Image, ImageDraw
 21 | 
 22 | # simple, nicely formatted logging
 23 | logger = logging.getLogger(__name__)
 24 | 
 25 | automation_handler = MacOSAutomationHandler()
 26 | 
 27 | 
 28 | class Diorama:
 29 |     """Virtual desktop manager that provides automation capabilities for macOS applications.
 30 | 
 31 |     Manages application windows and provides an interface for taking screenshots,
 32 |     mouse interactions, keyboard input, and coordinate transformations between
 33 |     screenshot space and screen space.
 34 |     """
 35 | 
 36 |     _scheduler_queue = None
 37 |     _scheduler_task = None
 38 |     _loop = None
 39 |     _scheduler_started = False
 40 | 
 41 |     @classmethod
 42 |     def create_from_apps(cls, *args) -> DioramaComputer:
 43 |         """Create a DioramaComputer instance from a list of application names.
 44 | 
 45 |         Args:
 46 |             *args: Variable number of application names to include in the desktop
 47 | 
 48 |         Returns:
 49 |             DioramaComputer: A computer interface for the specified applications
 50 |         """
 51 |         cls._ensure_scheduler()
 52 |         return cls(args).computer
 53 | 
 54 |     # Dictionary to store cursor positions for each unique app_list hash
 55 |     _cursor_positions = {}
 56 | 
 57 |     def __init__(self, app_list):
 58 |         """Initialize a Diorama instance for the specified applications.
 59 | 
 60 |         Args:
 61 |             app_list: List of application names to manage
 62 |         """
 63 |         self.app_list = app_list
 64 |         self.interface = self.Interface(self)
 65 |         self.computer = DioramaComputer(self)
 66 |         self.focus_context = None
 67 | 
 68 |         # Create a hash for this app_list to use as a key
 69 |         self.app_list_hash = hash(tuple(sorted(app_list)))
 70 | 
 71 |         # Initialize cursor position for this app_list if it doesn't exist
 72 |         if self.app_list_hash not in Diorama._cursor_positions:
 73 |             Diorama._cursor_positions[self.app_list_hash] = (0, 0)
 74 | 
 75 |     @classmethod
 76 |     def _ensure_scheduler(cls):
 77 |         """Ensure the async scheduler loop is running.
 78 | 
 79 |         Creates and starts the scheduler task if it hasn't been started yet.
 80 |         """
 81 |         if not cls._scheduler_started:
 82 |             logger.info("Starting Diorama scheduler loop…")
 83 |             cls._scheduler_queue = asyncio.Queue()
 84 |             cls._loop = asyncio.get_event_loop()
 85 |             cls._scheduler_task = cls._loop.create_task(cls._scheduler_loop())
 86 |             cls._scheduler_started = True
 87 | 
 88 |     @classmethod
 89 |     async def _scheduler_loop(cls):
 90 |         """Main scheduler loop that processes automation commands.
 91 | 
 92 |         Continuously processes commands from the scheduler queue, handling
 93 |         screenshots, mouse actions, keyboard input, and scrolling operations.
 94 |         """
 95 |         while True:
 96 |             cmd = await cls._scheduler_queue.get()
 97 |             action = cmd.get("action")
 98 |             args = cmd.get("arguments", {})
 99 |             future = cmd.get("future")
100 |             logger.info(f"Processing command: {action} | args={args}")
101 | 
102 |             app_whitelist = args.get("app_list", [])
103 | 
104 |             all_windows = get_all_windows()
105 |             running_apps = get_running_apps()
106 |             frontmost_app, active_app_to_use, active_app_pid = get_frontmost_and_active_app(
107 |                 all_windows, running_apps, app_whitelist
108 |             )
109 |             focus_context = AppActivationContext(active_app_pid, active_app_to_use, logger)
110 | 
111 |             with focus_context:
112 |                 try:
113 |                     if action == "screenshot":
114 |                         logger.info(f"Taking screenshot for apps: {app_whitelist}")
115 |                         result, img = capture_all_apps(
116 |                             app_whitelist=app_whitelist, save_to_disk=False, take_focus=False
117 |                         )
118 |                         logger.info("Screenshot complete.")
119 |                         if future:
120 |                             future.set_result((result, img))
121 |                     # Mouse actions
122 |                     elif action in [
123 |                         "left_click",
124 |                         "right_click",
125 |                         "double_click",
126 |                         "move_cursor",
127 |                         "drag_to",
128 |                     ]:
129 |                         x = args.get("x")
130 |                         y = args.get("y")
131 | 
132 |                         duration = args.get("duration", 0.5)
133 |                         if action == "left_click":
134 |                             await automation_handler.left_click(x, y)
135 |                         elif action == "right_click":
136 |                             await automation_handler.right_click(x, y)
137 |                         elif action == "double_click":
138 |                             await automation_handler.double_click(x, y)
139 |                         elif action == "move_cursor":
140 |                             await automation_handler.move_cursor(x, y)
141 |                         elif action == "drag_to":
142 |                             await automation_handler.drag_to(x, y, duration=duration)
143 |                         if future:
144 |                             future.set_result(None)
145 |                     elif action in ["scroll_up", "scroll_down"]:
146 |                         x = args.get("x")
147 |                         y = args.get("y")
148 |                         if x is not None and y is not None:
149 |                             await automation_handler.move_cursor(x, y)
150 | 
151 |                         clicks = args.get("clicks", 1)
152 |                         if action == "scroll_up":
153 |                             await automation_handler.scroll_up(clicks)
154 |                         else:
155 |                             await automation_handler.scroll_down(clicks)
156 |                         if future:
157 |                             future.set_result(None)
158 |                     # Keyboard actions
159 |                     elif action == "type_text":
160 |                         text = args.get("text")
161 |                         await automation_handler.type_text(text)
162 |                         if future:
163 |                             future.set_result(None)
164 |                     elif action == "press_key":
165 |                         key = args.get("key")
166 |                         await automation_handler.press_key(key)
167 |                         if future:
168 |                             future.set_result(None)
169 |                     elif action == "hotkey":
170 |                         keys = args.get("keys", [])
171 |                         await automation_handler.hotkey(keys)
172 |                         if future:
173 |                             future.set_result(None)
174 |                     elif action == "get_cursor_position":
175 |                         pos = await automation_handler.get_cursor_position()
176 |                         if future:
177 |                             future.set_result(pos)
178 |                     else:
179 |                         logger.warning(f"Unknown action: {action}")
180 |                         if future:
181 |                             future.set_exception(ValueError(f"Unknown action: {action}"))
182 |                 except Exception as e:
183 |                     logger.error(f"Exception during {action}: {e}", exc_info=True)
184 |                     if future:
185 |                         future.set_exception(e)
186 | 
187 |     class Interface:
188 |         """Interface for interacting with the virtual desktop.
189 | 
190 |         Provides methods for taking screenshots, mouse interactions, keyboard input,
191 |         and coordinate transformations between screenshot and screen coordinates.
192 |         """
193 | 
194 |         def __init__(self, diorama):
195 |             """Initialize the interface with a reference to the parent Diorama instance.
196 | 
197 |             Args:
198 |                 diorama: The parent Diorama instance
199 |             """
200 |             self._diorama = diorama
201 | 
202 |             self._scene_hitboxes = []
203 |             self._scene_size = None
204 | 
205 |         async def _send_cmd(self, action, arguments=None):
206 |             """Send a command to the scheduler queue.
207 | 
208 |             Args:
209 |                 action (str): The action to perform
210 |                 arguments (dict, optional): Arguments for the action
211 | 
212 |             Returns:
213 |                 The result of the command execution
214 |             """
215 |             Diorama._ensure_scheduler()
216 |             loop = asyncio.get_event_loop()
217 |             future = loop.create_future()
218 |             logger.info(f"Enqueuing {action} command for apps: {self._diorama.app_list}")
219 |             await Diorama._scheduler_queue.put(
220 |                 {
221 |                     "action": action,
222 |                     "arguments": {"app_list": self._diorama.app_list, **(arguments or {})},
223 |                     "future": future,
224 |                 }
225 |             )
226 |             try:
227 |                 return await future
228 |             except asyncio.CancelledError:
229 |                 logger.warning(f"Command was cancelled: {action}")
230 |                 return None
231 | 
232 |         async def screenshot(self, as_bytes: bool = True) -> Union[str, Image.Image]:
233 |             """Take a screenshot of the managed applications.
234 | 
235 |             Args:
236 |                 as_bytes (bool): If True, return base64-encoded bytes; if False, return PIL Image
237 | 
238 |             Returns:
239 |                 Union[str, Image.Image]: Base64-encoded PNG bytes or PIL Image object
240 |             """
241 |             import base64
242 | 
243 |             result, img = await self._send_cmd("screenshot")
244 |             self._scene_hitboxes = result.get("hitboxes", [])
245 |             self._scene_size = img.size
246 | 
247 |             if as_bytes:
248 |                 # PIL Image to bytes, then base64 encode for JSON
249 |                 import io
250 | 
251 |                 img_byte_arr = io.BytesIO()
252 |                 img.save(img_byte_arr, format="PNG")
253 |                 img_bytes = img_byte_arr.getvalue()
254 |                 img_b64 = base64.b64encode(img_bytes).decode("ascii")
255 |                 return img_b64
256 |             else:
257 |                 return img
258 | 
259 |         async def left_click(self, x, y):
260 |             """Perform a left mouse click at the specified coordinates.
261 | 
262 |             Args:
263 |                 x (int): X coordinate in screenshot space (or None to use last position)
264 |                 y (int): Y coordinate in screenshot space (or None to use last position)
265 |             """
266 |             # Get last cursor position for this app_list hash
267 |             app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
268 |             last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
269 |             x, y = x or last_pos[0], y or last_pos[1]
270 |             # Update cursor position for this app_list hash
271 |             Diorama._cursor_positions[app_list_hash] = (x, y)
272 | 
273 |             sx, sy = await self.to_screen_coordinates(x, y)
274 |             await self._send_cmd("left_click", {"x": sx, "y": sy})
275 | 
276 |         async def right_click(self, x, y):
277 |             """Perform a right mouse click at the specified coordinates.
278 | 
279 |             Args:
280 |                 x (int): X coordinate in screenshot space (or None to use last position)
281 |                 y (int): Y coordinate in screenshot space (or None to use last position)
282 |             """
283 |             # Get last cursor position for this app_list hash
284 |             app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
285 |             last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
286 |             x, y = x or last_pos[0], y or last_pos[1]
287 |             # Update cursor position for this app_list hash
288 |             Diorama._cursor_positions[app_list_hash] = (x, y)
289 | 
290 |             sx, sy = await self.to_screen_coordinates(x, y)
291 |             await self._send_cmd("right_click", {"x": sx, "y": sy})
292 | 
293 |         async def double_click(self, x, y):
294 |             """Perform a double mouse click at the specified coordinates.
295 | 
296 |             Args:
297 |                 x (int): X coordinate in screenshot space (or None to use last position)
298 |                 y (int): Y coordinate in screenshot space (or None to use last position)
299 |             """
300 |             # Get last cursor position for this app_list hash
301 |             app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
302 |             last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
303 |             x, y = x or last_pos[0], y or last_pos[1]
304 |             # Update cursor position for this app_list hash
305 |             Diorama._cursor_positions[app_list_hash] = (x, y)
306 | 
307 |             sx, sy = await self.to_screen_coordinates(x, y)
308 |             await self._send_cmd("double_click", {"x": sx, "y": sy})
309 | 
310 |         async def move_cursor(self, x, y):
311 |             """Move the mouse cursor to the specified coordinates.
312 | 
313 |             Args:
314 |                 x (int): X coordinate in screenshot space (or None to use last position)
315 |                 y (int): Y coordinate in screenshot space (or None to use last position)
316 |             """
317 |             # Get last cursor position for this app_list hash
318 |             app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
319 |             last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
320 |             x, y = x or last_pos[0], y or last_pos[1]
321 |             # Update cursor position for this app_list hash
322 |             Diorama._cursor_positions[app_list_hash] = (x, y)
323 | 
324 |             sx, sy = await self.to_screen_coordinates(x, y)
325 |             await self._send_cmd("move_cursor", {"x": sx, "y": sy})
326 | 
327 |         async def drag_to(self, x, y, duration=0.5):
328 |             """Drag the mouse from current position to the specified coordinates.
329 | 
330 |             Args:
331 |                 x (int): X coordinate in screenshot space (or None to use last position)
332 |                 y (int): Y coordinate in screenshot space (or None to use last position)
333 |                 duration (float): Duration of the drag operation in seconds
334 |             """
335 |             # Get last cursor position for this app_list hash
336 |             app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
337 |             last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
338 |             x, y = x or last_pos[0], y or last_pos[1]
339 |             # Update cursor position for this app_list hash
340 |             Diorama._cursor_positions[app_list_hash] = (x, y)
341 | 
342 |             sx, sy = await self.to_screen_coordinates(x, y)
343 |             await self._send_cmd("drag_to", {"x": sx, "y": sy, "duration": duration})
344 | 
345 |         async def get_cursor_position(self):
346 |             """Get the current cursor position via a "get_cursor_position" scheduler command.
347 | 
348 |             Returns:
349 |                 The automation handler's result (presumably an (x, y) tuple in screen space — confirm)
350 |             """
351 |             return await self._send_cmd("get_cursor_position")
352 | 
353 |         async def type_text(self, text):
354 |             """Send a "type_text" command so the given text is typed on the keyboard.
355 | 
356 |             Args:
357 |                 text (str): The text to type
358 |             """
359 |             await self._send_cmd("type_text", {"text": text})
360 | 
361 |         async def press_key(self, key):
362 |             """Send a "press_key" command to press a single key on the keyboard.
363 | 
364 |             Args:
365 |                 key (str): The key to press
366 |             """
367 |             await self._send_cmd("press_key", {"key": key})
368 | 
369 |         async def hotkey(self, keys):
370 |             """Press a combination of keys simultaneously.
371 | 
372 |             Args:
373 |                 keys (list): List of keys to press together
374 |             """
375 |             await self._send_cmd("hotkey", {"keys": list(keys)})
376 | 
377 |         async def scroll_up(self, clicks: int = 1):
378 |             """Scroll up at the current cursor position.
379 | 
380 |             Args:
381 |                 clicks (int): Number of scroll clicks to perform
382 |             """
383 |             # Get last cursor position for this app_list hash
384 |             app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
385 |             last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
386 |             x, y = last_pos[0], last_pos[1]
387 | 
388 |             await self._send_cmd("scroll_up", {"clicks": clicks, "x": x, "y": y})
389 | 
390 |         async def scroll_down(self, clicks: int = 1):
391 |             """Scroll down at the current cursor position.
392 | 
393 |             Args:
394 |                 clicks (int): Number of scroll clicks to perform
395 |             """
396 |             # Get last cursor position for this app_list hash
397 |             app_list_hash = hash(tuple(sorted(self._diorama.app_list)))
398 |             last_pos = Diorama._cursor_positions.get(app_list_hash, (0, 0))
399 |             x, y = last_pos[0], last_pos[1]
400 | 
401 |             await self._send_cmd("scroll_down", {"clicks": clicks, "x": x, "y": y})
402 | 
403 |         async def get_screen_size(self) -> dict[str, int]:
404 |             """Get the size of the screenshot area.
405 | 
406 |             Returns:
407 |                 dict[str, int]: Dictionary with 'width' and 'height' keys
408 |             """
409 |             if not self._scene_size:
410 |                 await self.screenshot()
411 |             return {"width": self._scene_size[0], "height": self._scene_size[1]}
412 | 
413 |         async def to_screen_coordinates(self, x: float, y: float) -> tuple[float, float]:
414 |             """Convert screenshot coordinates to screen coordinates.
415 | 
416 |             Args:
417 |                 x: X absolute coordinate in screenshot space
418 |                 y: Y absolute coordinate in screenshot space
419 | 
420 |             Returns:
421 |                 tuple[float, float]: (x, y) absolute coordinates in screen space
422 |             """
423 |             if not self._scene_hitboxes:
424 |                 await self.screenshot()  # get hitboxes
425 |             # Try all hitboxes
426 |             for h in self._scene_hitboxes[::-1]:
427 |                 rect_from = h.get("hitbox")
428 |                 rect_to = h.get("target")
429 |                 if not rect_from or len(rect_from) != 4:
430 |                     continue
431 | 
432 |                 # check if (x, y) is inside rect_from
433 |                 x0, y0, x1, y1 = rect_from
434 |                 if x0 <= x <= x1 and y0 <= y <= y1:
435 |                     logger.info(f"Found hitbox: {h}")
436 |                     # remap (x, y) to rect_to
437 |                     tx0, ty0, tx1, ty1 = rect_to
438 | 
439 |                     # calculate offset from x0, y0
440 |                     offset_x = x - x0
441 |                     offset_y = y - y0
442 | 
443 |                     # remap offset to rect_to
444 |                     tx = tx0 + offset_x
445 |                     ty = ty0 + offset_y
446 | 
447 |                     return tx, ty
448 |             return x, y
449 | 
450 |         async def to_screenshot_coordinates(self, x: float, y: float) -> tuple[float, float]:
451 |             """Convert screen coordinates to screenshot coordinates.
452 | 
453 |             Args:
454 |                 x: X absolute coordinate in screen space
455 |                 y: Y absolute coordinate in screen space
456 | 
457 |             Returns:
458 |                 tuple[float, float]: (x, y) absolute coordinates in screenshot space
459 |             """
460 |             if not self._scene_hitboxes:
461 |                 await self.screenshot()  # get hitboxes
462 |             # Try all hitboxes
463 |             for h in self._scene_hitboxes[::-1]:
464 |                 rect_from = h.get("target")
465 |                 rect_to = h.get("hitbox")
466 |                 if not rect_from or len(rect_from) != 4:
467 |                     continue
468 | 
469 |                 # check if (x, y) is inside rect_from
470 |                 x0, y0, x1, y1 = rect_from
471 |                 if x0 <= x <= x1 and y0 <= y <= y1:
472 |                     # remap (x, y) to rect_to
473 |                     tx0, ty0, tx1, ty1 = rect_to
474 | 
475 |                     # calculate offset from x0, y0
476 |                     offset_x = x - x0
477 |                     offset_y = y - y0
478 | 
479 |                     # remap offset to rect_to
480 |                     tx = tx0 + offset_x
481 |                     ty = ty0 + offset_y
482 | 
483 |                     return tx, ty
484 |             return x, y
485 | 
486 | 
487 | import time
488 | 
489 | import pyautogui
490 | 
491 | 
492 | async def main():
493 |     """Main function demonstrating Diorama usage with multiple desktops and mouse tracking."""
494 |     desktop1 = Diorama.create_from_apps(["Discord", "Notes"])
495 |     desktop2 = Diorama.create_from_apps(["Terminal"])
496 | 
497 |     img1 = await desktop1.interface.screenshot(as_bytes=False)
498 |     img2 = await desktop2.interface.screenshot(as_bytes=False)
499 | 
500 |     img1.save("app_screenshots/desktop1.png")
501 |     img2.save("app_screenshots/desktop2.png")
502 |     # Initialize Diorama desktop
503 |     desktop3 = Diorama.create_from_apps("Safari")
504 |     screen_size = await desktop3.interface.get_screen_size()
505 |     print(screen_size)
506 | 
507 |     # Take initial screenshot
508 |     img = await desktop3.interface.screenshot(as_bytes=False)
509 |     img.save("app_screenshots/desktop3.png")
510 | 
511 |     # Prepare hitboxes and draw on the single screenshot
512 |     hitboxes = desktop3.interface._scene_hitboxes[::-1]
513 |     base_img = img.copy()
514 |     draw = ImageDraw.Draw(base_img)
515 |     for h in hitboxes:
516 |         rect = h.get("hitbox")
517 |         if not rect or len(rect) != 4:
518 |             continue
519 |         draw.rectangle(rect, outline="red", width=2)
520 | 
521 |     # Track and draw mouse position in real time (single screenshot size)
522 |     last_mouse_pos = None
523 |     print("Tracking mouse... Press Ctrl+C to stop.")
524 |     try:
525 |         while True:
526 |             mouse_x, mouse_y = pyautogui.position()
527 |             if last_mouse_pos != (mouse_x, mouse_y):
528 |                 last_mouse_pos = (mouse_x, mouse_y)
529 |                 # Map to screenshot coordinates
530 |                 sx, sy = await desktop3.interface.to_screenshot_coordinates(mouse_x, mouse_y)
531 |                 # Draw on a copy of the screenshot
532 |                 frame = base_img.copy()
533 |                 frame_draw = ImageDraw.Draw(frame)
534 |                 frame_draw.ellipse((sx - 5, sy - 5, sx + 5, sy + 5), fill="blue", outline="blue")
535 |                 # Save the frame
536 |                 frame.save("app_screenshots/desktop3_mouse.png")
537 |                 print(f"Mouse at screen ({mouse_x}, {mouse_y}) -> screenshot ({sx:.1f}, {sy:.1f})")
538 |             time.sleep(0.05)  # Throttle updates to ~20 FPS
539 |     except KeyboardInterrupt:
540 |         print("Stopped tracking.")
541 | 
542 |         draw.text((rect[0], rect[1]), str(idx), fill="red")
543 | 
544 |     canvas.save("app_screenshots/desktop3_hitboxes.png")
545 | 
546 |     # move mouse in a square spiral around the screen
547 |     import math
548 |     import random
549 | 
550 |     step = 20  # pixels per move
551 |     dot_radius = 10
552 |     width = screen_size["width"]
553 |     height = screen_size["height"]
554 |     x, y = 0, 10
555 | 
556 |     while x < width and y < height:
557 |         await desktop3.interface.move_cursor(x, y)
558 |         img = await desktop3.interface.screenshot(as_bytes=False)
559 |         draw = ImageDraw.Draw(img)
560 |         draw.ellipse((x - dot_radius, y - dot_radius, x + dot_radius, y + dot_radius), fill="red")
561 |         img.save("current.png")
562 |         await asyncio.sleep(0.03)
563 |         x += step
564 |         y = math.sin(x / width * math.pi * 2) * 50 + 25
565 | 
566 | 
567 | if __name__ == "__main__":
568 |     asyncio.run(main())
569 | 
```

--------------------------------------------------------------------------------
/blog/build-your-own-operator-on-macos-1.md:
--------------------------------------------------------------------------------

```markdown
  1 | # Build Your Own Operator on macOS - Part 1
  2 | 
  3 | _Published on March 31, 2025 by Francesco Bonacci_
  4 | 
  5 | In this first blogpost, we'll learn how to build our own Computer-Use Operator using OpenAI's `computer-use-preview` model. But first, let's understand what some common terms mean:
  6 | 
  7 | - A **Virtual Machine (VM)** is like a computer within your computer - a safe, isolated environment where the AI can work without affecting your main system.
  8 | - **computer-use-preview** is OpenAI's specialized language model trained to understand and interact with computer interfaces through screenshots.
  9 | - A **Computer-Use Agent** is an AI agent that can control a computer just like a human would - clicking buttons, typing text, and interacting with applications.
 10 | 
 11 | Our Operator will run in an isolated macOS VM, by making use of our [cua-computer](https://github.com/trycua/cua/tree/main/libs/python/computer) package and [lume virtualization CLI](https://github.com/trycua/cua/tree/main/libs/lume).
 12 | 
 13 | Check out what it looks like to use your own Operator from a Gradio app:
 14 | 
 15 | <div align="center">
 16 |   <video src="https://github.com/user-attachments/assets/a2cf69ad-2ab2-4eb9-8e1a-45606dd7eec6" width="600" controls></video>
 17 | </div>
 18 | 
 19 | ## What You'll Learn
 20 | 
 21 | By the end of this tutorial, you'll be able to:
 22 | 
 23 | - Set up a macOS virtual machine for AI automation
 24 | - Connect OpenAI's computer-use model to your VM
 25 | - Create a basic loop for the AI to interact with your VM
 26 | - Handle different types of computer actions (clicking, typing, etc.)
 27 | - Implement safety checks and error handling
 28 | 
 29 | **Prerequisites:**
 30 | 
 31 | - macOS Sonoma (14.0) or later
 32 | - 8GB RAM minimum (16GB recommended)
 33 | - OpenAI API access (Tier 3+)
 34 | - Basic Python knowledge
 35 | - Familiarity with terminal commands
 36 | 
 37 | **Estimated Time:** 45-60 minutes
 38 | 
 39 | ## Introduction to Computer-Use Agents
 40 | 
 41 | Last March OpenAI released a fine-tuned version of GPT-4o, namely [CUA](https://openai.com/index/computer-using-agent/), introducing pixel-level vision capabilities with advanced reasoning through reinforcement learning. This fine-tuning enables the computer-use model to interpret screenshots and interact with graphical user interface elements - such as buttons, menus, and text fields - at the pixel level, mimicking human interactions on a computer screen. It scores a remarkable 38.1% success rate on [OSWorld](https://os-world.github.io) - a benchmark for Computer-Use agents on Linux and Windows. This is the second available model, after Anthropic's [Claude 3.5 Sonnet](https://www.anthropic.com/news/3-5-models-and-computer-use), to support computer-use capabilities natively with no external models (e.g. accessory [SoM (Set-of-Mark)](https://arxiv.org/abs/2310.11441) and OCR runs).
 42 | 
 43 | Professor Ethan Mollick provides an excellent explanation of computer-use agents in this article: [When you give a Claude a mouse](https://www.oneusefulthing.org/p/when-you-give-a-claude-a-mouse).
 44 | 
 45 | ### ChatGPT Operator
 46 | 
 47 | OpenAI's computer-use model powers [ChatGPT Operator](https://openai.com/index/introducing-operator), a Chromium-based interface exclusively available to ChatGPT Pro subscribers. Users leverage this functionality to automate web-based tasks such as online shopping, expense report submission, and booking reservations by interacting with websites in a human-like manner.
 48 | 
 49 | ## Benefits of Custom Operators
 50 | 
 51 | ### Why Build Your Own?
 52 | 
 53 | While OpenAI's Operator uses a controlled Chromium VM instance, there are scenarios where you may want to use your own VM with full desktop capabilities. Here are some examples:
 54 | 
 55 | - Automating native macOS apps like Finder, Xcode
 56 | - Managing files, changing settings, and running terminal commands
 57 | - Testing desktop software and applications
 58 | - Creating workflows that combine web and desktop tasks
 59 | - Automating media editing in apps like Final Cut Pro and Blender
 60 | 
 61 | This gives you more control and flexibility to automate tasks beyond just web browsing, with full access to interact with native applications and system-level operations. Additionally, running your own VM locally provides better privacy for sensitive user files and delivers superior performance by leveraging your own hardware instead of renting expensive Cloud VMs.
 62 | 
 63 | ## Access Requirements
 64 | 
 65 | ### Model Availability
 66 | 
 67 | At the time of writing (March 2025), the **computer-use-preview** model has limited availability:
 68 | 
 69 | - Only accessible to OpenAI tier 3+ users
 70 | - Additional application process may be required even for eligible users
 71 | - Cannot be used in the OpenAI Playground
 72 | - Outside of ChatGPT Operator, usage is restricted to the new **Responses API**
 73 | 
 74 | ## Understanding the OpenAI API
 75 | 
 76 | ### Responses API Overview
 77 | 
 78 | Let's start with the basics. In our case, we'll use OpenAI's Responses API to communicate with their computer-use model.
 79 | 
 80 | Think of it like this:
 81 | 
 82 | 1. We send the model a screenshot of our VM and tell it what we want it to do
 83 | 2. The model looks at the screenshot and decides what actions to take
 84 | 3. It sends back instructions (like "click here" or "type this")
 85 | 4. We execute those instructions in our VM
 86 | 
 87 | The [Responses API](https://platform.openai.com/docs/guides/responses) is OpenAI's newest way to interact with their AI models. It comes with several built-in tools:
 88 | 
 89 | - **Web search**: Let the AI search the internet
 90 | - **File search**: Help the AI find documents
 91 | - **Computer use**: Allow the AI to control a computer (what we'll be using)
 92 | 
 93 | At the time of writing (March 2025), the computer-use model is only available through the Responses API.
 94 | 
 95 | ### Responses API Examples
 96 | 
 97 | Let's look at some simple examples. We'll start with the traditional way of using OpenAI's API with Chat Completions, then show the new Responses API primitive.
 98 | 
 99 | Chat Completions:
100 | 
101 | ```python
102 | # The old way required managing conversation history manually
103 | messages = [{"role": "user", "content": "Hello"}]
104 | response = client.chat.completions.create(
105 |     model="gpt-4",
106 |     messages=messages  # We had to track all messages ourselves
107 | )
108 | messages.append(response.choices[0].message)  # Manual message tracking
109 | ```
110 | 
111 | Responses API:
112 | 
113 | ```python
114 | # Example 1: Simple web search
115 | # The API handles all the complexity for us
116 | response = client.responses.create(
117 |     model="gpt-4",
118 |     input=[{
119 |         "role": "user",
120 |         "content": "What's the latest news about AI?"
121 |     }],
122 |     tools=[{
123 |         "type": "web_search",  # Tell the API to use web search
124 |         "search_query": "latest AI news"
125 |     }]
126 | )
127 | 
128 | # Example 2: File search
129 | # Looking for specific documents becomes easy
130 | response = client.responses.create(
131 |     model="gpt-4",
132 |     input=[{
133 |         "role": "user",
134 |         "content": "Find documents about project X"
135 |     }],
136 |     tools=[{
137 |         "type": "file_search",
138 |         "query": "project X",
139 |         "file_types": ["pdf", "docx"]  # Specify which file types to look for
140 |     }]
141 | )
142 | ```
143 | 
144 | ### Computer-Use Model Setup
145 | 
146 | For our operator, we'll use the computer-use model. Here's how we set it up:
147 | 
148 | ```python
149 | # Set up the computer-use model to control our VM
150 | response = client.responses.create(
151 |     model="computer-use-preview",  # Special model for computer control
152 |     tools=[{
153 |         "type": "computer_use_preview",
154 |         "display_width": 1024,     # Size of our VM screen
155 |         "display_height": 768,
156 |         "environment": "mac"       # Tell it we're using macOS.
157 |     }],
158 |     input=[
159 |         {
160 |             "role": "user",
161 |             "content": [
162 |                 # What we want the AI to do
163 |                 {"type": "input_text", "text": "Open Safari and go to google.com"},
164 |                 # Current screenshot of our VM
165 |                 {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_base64}"}
166 |             ]
167 |         }
168 |     ],
169 |     truncation="auto"  # Let OpenAI handle message length
170 | )
171 | ```
172 | 
173 | ### Understanding the Response
174 | 
175 | When we send a request, the API sends back a response that looks like this:
176 | 
177 | ```json
178 | "output": [
179 |     {
180 |         "type": "reasoning",           # The AI explains what it's thinking
181 |         "id": "rs_67cc...",
182 |         "summary": [
183 |             {
184 |                 "type": "summary_text",
185 |                 "text": "Clicking on the browser address bar."
186 |             }
187 |         ]
188 |     },
189 |     {
190 |         "type": "computer_call",       # The actual action to perform
191 |         "id": "cu_67cc...",
192 |         "call_id": "call_zw3...",
193 |         "action": {
194 |             "type": "click",           # What kind of action (click, type, etc.)
195 |             "button": "left",          # Which mouse button to use
196 |             "x": 156,                  # Where to click (coordinates)
197 |             "y": 50
198 |         },
199 |         "pending_safety_checks": [],   # Any safety warnings to consider
200 |         "status": "completed"          # Whether the action was successful
201 |     }
202 | ]
203 | ```
204 | 
205 | Each response contains:
206 | 
207 | 1. **Reasoning**: The AI's explanation of what it's doing
208 | 2. **Action**: The specific computer action to perform
209 | 3. **Safety Checks**: Any potential risks to review
210 | 4. **Status**: Whether everything worked as planned
211 | 
212 | ## CUA-Computer Interface
213 | 
214 | ### Architecture Overview
215 | 
216 | Let's break down the main components of our system and how they work together:
217 | 
218 | 1. **The Virtual Machine (VM)**
219 |    - Think of this as a safe playground for our AI
220 |    - It's a complete macOS system running inside your computer
221 |    - Anything the AI does stays inside this VM, keeping your main system safe
222 |    - We use `lume` to create and manage this VM
223 | 
224 | 2. **The Computer Interface (CUI)**
225 |    - This is how we control the VM
226 |    - It can move the mouse, type text, and take screenshots
227 |    - Works like a remote control for the VM
228 |    - Built using our `cua-computer` package
229 | 
230 | 3. **The OpenAI Model**
231 |    - This is the brain of our operator
232 |    - It looks at screenshots of the VM
233 |    - Decides what actions to take
234 |    - Sends back instructions like "click here" or "type this"
235 | 
236 | Here's how they all work together:
237 | 
238 | ```mermaid
239 | sequenceDiagram
240 |     participant User as You
241 |     participant CUI as Computer Interface
242 |     participant VM as Virtual Machine
243 |     participant AI as OpenAI API
244 | 
245 |     Note over User,AI: The Main Loop
246 |     User->>CUI: Start the operator
247 |     CUI->>VM: Create macOS sandbox
248 |     activate VM
249 |     VM-->>CUI: VM is ready
250 | 
251 |     loop Action Loop
252 |         Note over CUI,AI: Each iteration
253 |         CUI->>VM: Take a screenshot
254 |         VM-->>CUI: Return current screen
255 |         CUI->>AI: Send screenshot + instructions
256 |         AI-->>CUI: Return next action
257 | 
258 |         Note over CUI,VM: Execute the action
259 |         alt Mouse Click
260 |             CUI->>VM: Move and click mouse
261 |         else Type Text
262 |             CUI->>VM: Type characters
263 |         else Scroll Screen
264 |             CUI->>VM: Scroll window
265 |         else Press Keys
266 |             CUI->>VM: Press keyboard keys
267 |         else Wait
268 |             CUI->>VM: Pause for a moment
269 |         end
270 |     end
271 | 
272 |     VM-->>CUI: Task finished
273 |     deactivate VM
274 |     CUI-->>User: All done!
275 | ```
276 | 
277 | The diagram above shows how information flows through our system:
278 | 
279 | 1. You start the operator
280 | 2. The Computer Interface creates a virtual macOS
281 | 3. Then it enters a loop:
282 |    - Take a picture of the VM screen
283 |    - Send it to OpenAI with instructions
284 |    - Get back an action to perform
285 |    - Execute that action in the VM
286 |    - Repeat until the task is done
287 | 
288 | This design keeps everything organized and safe. The AI can only interact with the VM through our controlled interface, and the VM keeps the AI's actions isolated from your main system.
289 | 
290 | ---
291 | 
292 | ## Implementation Guide
293 | 
294 | ### Prerequisites
295 | 
296 | 1. **Lume CLI Setup**
297 |    For installing the standalone lume binary, run the following command from a terminal, or download the [latest pkg](https://github.com/trycua/cua/releases/download/lume-v0.2.22/lume-darwin.pkg.tar.gz).
298 | 
299 |    ```bash
300 |    sudo /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/lume/scripts/install.sh)"
301 |    ```
302 | 
303 |    **Important Storage Notes:**
304 |    - Initial download requires 80GB of free space
305 |    - After first run, space usage reduces to ~30GB due to macOS's sparse file system
306 |    - VMs are stored in `~/.lume`
307 |    - Cached images are stored in `~/.lume/cache`
308 | 
309 |    You can check your downloaded VM images anytime:
310 | 
311 |    ```bash
312 |    lume ls
313 |    ```
314 | 
315 |    Example output:
316 | 
317 |    | name                     | os    | cpu | memory | disk          | display  | status  | ip            | vnc                                            |
318 |    | ------------------------ | ----- | --- | ------ | ------------- | -------- | ------- | ------------- | ---------------------------------------------- |
319 |    | macos-sequoia-cua:latest | macOS | 12  | 16.00G | 64.5GB/80.0GB | 1024x768 | running | 192.168.64.78 | `vnc://:<password>@127.0.0.1:56085`            |
320 | 
321 |    After checking your available images, you can run the VM to ensure everything is working correctly:
322 | 
323 |    ```bash
324 |    lume run macos-sequoia-cua:latest
325 |    ```
326 | 
327 | 2. **Python Environment Setup**
328 |    **Note**: The `cua-computer` package requires Python 3.10 or later. We recommend creating a dedicated Python environment:
329 | 
330 |    **Using venv:**
331 | 
332 |    ```bash
333 |    python -m venv cua-env
334 |    source cua-env/bin/activate
335 |    ```
336 | 
337 |    **Using conda:**
338 | 
339 |    ```bash
340 |    conda create -n cua-env python=3.10
341 |    conda activate cua-env
342 |    ```
343 | 
344 |    Then install the required packages:
345 | 
346 |    ```bash
347 |    pip install openai
348 |    pip install cua-computer
349 |    ```
350 | 
351 |    Ensure you have an OpenAI API key (set as an environment variable or in your OpenAI configuration).
352 | 
353 | ### Building the Operator
354 | 
355 | #### Importing Required Modules
356 | 
357 | With the prerequisites installed and configured, we're ready to build our first operator.
358 | The following example uses asynchronous Python (async/await). You can run it either in a VS Code Notebook or as a standalone Python script.
359 | 
360 | ```python
361 | import asyncio
362 | import base64
363 | import openai
364 | 
365 | from computer import Computer
366 | ```
367 | 
368 | #### Mapping API Actions to CUA Methods
369 | 
370 | The following helper function converts a `computer_call` action from the OpenAI Responses API into the corresponding commands on the CUA computer interface. For example, if the API instructs a `click` action, we move the cursor and perform a left click in the Lume VM sandbox. We will use the computer interface to execute the actions.
371 | 
372 | ```python
async def execute_action(computer, action):
    """Execute a single `computer_call` action on the sandbox.

    Args:
        computer: Object exposing an async `interface` with `move_cursor`,
            `left_click`, `right_click`, `type_text`, `scroll`, `press_key`
            and `screenshot` methods.
        action: Action object from the Responses API. `action.type` selects the
            branch; the remaining attributes read depend on that type.

    Returns:
        The screenshot returned by `computer.interface.screenshot()` for a
        `screenshot` action, otherwise `None`.
    """
    action_type = action.type

    if action_type == "click":
        x = action.x
        y = action.y
        button = action.button
        print(f"Executing click at ({x}, {y}) with button '{button}'")
        await computer.interface.move_cursor(x, y)
        # Anything that is not an explicit right click is treated as a left click.
        if button == "right":
            await computer.interface.right_click()
        else:
            await computer.interface.left_click()

    elif action_type == "type":
        text = action.text
        print(f"Typing text: {text}")
        await computer.interface.type_text(text)

    elif action_type == "scroll":
        x = action.x
        y = action.y
        scroll_x = action.scroll_x
        scroll_y = action.scroll_y
        print(f"Scrolling at ({x}, {y}) with offsets (scroll_x={scroll_x}, scroll_y={scroll_y})")
        await computer.interface.move_cursor(x, y)
        await computer.interface.scroll(scroll_y)  # Using vertical scroll only

    elif action_type == "keypress":
        keys = action.keys
        for key in keys:
            print(f"Pressing key: {key}")
            # Map common key names to CUA equivalents
            if key.lower() == "enter":
                await computer.interface.press_key("return")
            elif key.lower() == "space":
                await computer.interface.press_key("space")
            else:
                await computer.interface.press_key(key)

    elif action_type == "wait":
        # The `wait` action does not necessarily carry a duration, so reading
        # `action.time` directly can raise AttributeError. Fall back to a short
        # fixed pause when no duration is provided.
        wait_time = getattr(action, "time", None)
        if wait_time is None:
            wait_time = 2
        print(f"Waiting for {wait_time} seconds")
        await asyncio.sleep(wait_time)

    elif action_type == "screenshot":
        print("Taking screenshot")
        # This is handled automatically in the main loop, but we can take an extra one if requested
        screenshot = await computer.interface.screenshot()
        return screenshot

    else:
        print(f"Unrecognized action: {action_type}")
426 | ```
427 | 
428 | #### Implementing the Computer-Use Loop
429 | 
430 | This section defines a loop that:
431 | 
432 | 1. Initializes the cua-computer instance (connecting to a macOS sandbox).
433 | 2. Captures a screenshot of the current state.
434 | 3. Sends the screenshot (with a user prompt) to the OpenAI Responses API using the `computer-use-preview` model.
435 | 4. Processes the returned `computer_call` action and executes it using our helper function.
436 | 5. Captures an updated screenshot after the action and sends it back to the model as a `computer_call_output`.
437 | 
438 | Steps 4 and 5 repeat until the model returns no further `computer_call` actions.
439 | 
440 | ```python
async def cua_openai_loop():
    """Drive a macOS sandbox with the OpenAI `computer-use-preview` model.

    Sends an initial prompt plus screenshot, then repeatedly executes the first
    `computer_call` returned by the model and reports a fresh screenshot back
    as a `computer_call_output`, until a response contains no `computer_call`.
    """
    # The tool configuration is identical for every request, so define it once.
    computer_tool = {
        "type": "computer_use_preview",
        "display_width": 1024,
        "display_height": 768,
        "environment": "mac"
    }

    # Initialize the lume computer instance (macOS sandbox)
    async with Computer(
        display="1024x768",
        memory="4GB",
        cpu="2",
        os_type="macos"
    ) as computer:
        await computer.run() # Start the lume VM

        # Capture the initial screenshot
        screenshot = await computer.interface.screenshot()
        screenshot_base64 = base64.b64encode(screenshot).decode('utf-8')

        # Initial request to start the loop
        response = openai.responses.create(
            model="computer-use-preview",
            tools=[computer_tool],
            input=[
                {
                    "role": "user",
                    "content": [
                        {"type": "input_text", "text": "Open Safari, download and install Cursor."},
                        {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_base64}"}
                    ]
                }
            ],
            truncation="auto"
        )

        # Continue the loop until no more computer_call actions
        while True:
            # Check for computer_call actions
            computer_calls = [item for item in response.output if item and item.type == "computer_call"]
            if not computer_calls:
                print("No more computer calls. Loop complete.")
                break

            # Get the first computer call
            call = computer_calls[0]
            last_call_id = call.call_id
            action = call.action
            print("Received action from OpenAI Responses API:", action)

            # Handle any pending safety checks
            if call.pending_safety_checks:
                print("Safety checks pending:", call.pending_safety_checks)
                # In a real implementation, you would want to get user confirmation here
                acknowledged_checks = call.pending_safety_checks
            else:
                acknowledged_checks = []

            # Execute the action
            await execute_action(computer, action)
            await asyncio.sleep(1)  # Allow time for changes to take effect

            # Capture new screenshot after action
            new_screenshot = await computer.interface.screenshot()
            new_screenshot_base64 = base64.b64encode(new_screenshot).decode('utf-8')

            # Send the screenshot back as computer_call_output.
            # `previous_response_id` links this request to the response that issued
            # the call; without it the API has no record of `last_call_id`.
            response = openai.responses.create(
                model="computer-use-preview",
                previous_response_id=response.id,
                tools=[computer_tool],
                input=[{
                    "type": "computer_call_output",
                    "call_id": last_call_id,
                    "acknowledged_safety_checks": acknowledged_checks,
                    "output": {
                        "type": "input_image",
                        "image_url": f"data:image/png;base64,{new_screenshot_base64}"
                    }
                }],
                truncation="auto"
            )

        # End the session
        await computer.stop()

# Run the loop
if __name__ == "__main__":
    asyncio.run(cua_openai_loop())
533 | ```
534 | 
535 | You can find the full code in our [notebook](https://github.com/trycua/cua/blob/main/notebooks/blog/build-your-own-operator-on-macos-1.ipynb).
536 | 
537 | #### Request Handling Differences
538 | 
539 | The first request to the OpenAI Responses API is special in that it includes the initial screenshot and prompt. Subsequent requests are handled differently, using the `computer_call_output` type to provide feedback on the executed action.
540 | 
541 | ##### Initial Request Format
542 | 
543 | - We use `role: "user"` with `content` that contains both `input_text` (the prompt) and `input_image` (the screenshot)
544 | 
545 | ##### Subsequent Request Format
546 | 
547 | - We use `type: "computer_call_output"` instead of the user role
548 | - We include the `call_id` to link the output to the specific previous action that was executed
549 | - We provide any `acknowledged_safety_checks` that were approved
550 | - We include the new screenshot in the `output` field
551 | 
552 | This structured approach allows the API to maintain context and continuity throughout the interaction session.
553 | 
554 | **Note**: The `call_id` only identifies which action an output belongs to; it does not carry the conversation by itself. When you send a `computer_call_output`, link the request to the response that issued the call by passing that response's ID as `previous_response_id` (or resend the earlier input and output items yourself) so the API can match the `call_id` and keep the context. The same `previous_response_id` mechanism is used when the user provides additional instructions or when a new request should continue from a previous session.
555 | 
556 | ## Conclusion
557 | 
558 | ### Summary
559 | 
560 | This blogpost demonstrates an OpenAI Computer-Use loop where:
561 | 
562 | - A macOS sandbox is controlled using the CUA interface.
563 | - A screenshot and prompt are sent to the OpenAI Responses API.
564 | - The returned action (e.g. a click or type command) is executed via the CUA interface.
565 | 
566 | In a production setting, you would harden this action-response loop: handle responses that contain multiple actions, and ask the user to confirm pending safety checks instead of acknowledging them automatically.
567 | 
568 | ### Next Steps
569 | 
570 | In the next blogpost, we'll introduce our Agent framework which abstracts away all these tedious implementation steps. This framework provides a higher-level API that handles the interaction loop between OpenAI's computer-use model and the macOS sandbox, allowing you to focus on building sophisticated applications rather than managing the low-level details we've explored here. Can't wait? Check out the [cua-agent](https://github.com/trycua/cua/tree/main/libs/python/agent) package!
571 | 
572 | ### Resources
573 | 
574 | - [OpenAI Computer-Use docs](https://platform.openai.com/docs/guides/tools-computer-use)
575 | - [cua-computer](https://github.com/trycua/cua/tree/main/libs/python/computer)
576 | - [lume](https://github.com/trycua/cua/tree/main/libs/lume)
577 | 
```

--------------------------------------------------------------------------------
/libs/python/agent/agent/loops/generic_vlm.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | Qwen3-VL agent loop implementation using litellm with function/tool calling.
  3 | - Passes a ComputerUse tool schema to acompletion
  4 | - Converts between Responses items and completion messages using helpers
  5 | """
  6 | 
  7 | from __future__ import annotations
  8 | 
  9 | import json
 10 | import re
 11 | from typing import Any, Dict, List, Optional, Tuple
 12 | 
 13 | import litellm
 14 | from litellm.responses.litellm_completion_transformation.transformation import (
 15 |     LiteLLMCompletionResponsesConfig,
 16 | )
 17 | 
 18 | from ..decorators import register_agent
 19 | from ..loops.base import AsyncAgentConfig
 20 | from ..responses import (
 21 |     convert_completion_messages_to_responses_items,
 22 |     convert_responses_items_to_completion_messages,
 23 |     make_reasoning_item,
 24 | )
 25 | from ..types import AgentCapability
 26 | 
# ComputerUse tool schema (OpenAI function tool format)
# Only the inner "function" entry is handed to `_build_nous_system`, which embeds
# it as text in the system prompt (see `predict_step`).
QWEN3_COMPUTER_TOOL: Dict[str, Any] = {
    "type": "function",
    "function": {
        "name": "computer",
        # Prompt text shown to the model. It advertises a 1000x1000 screen, which
        # matches the 0..1000 space that `_unnormalize_coordinate` scales from.
        "description": (
            "Use a mouse and keyboard to interact with a computer, and take screenshots.\n"
            "* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n"
            "* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n"
            "* The screen's resolution is 1000x1000.\n"
            "* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n"
            "* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n"
            "* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "action": {
                    "description": "The action to perform.",
                    # "terminate" and "answer" are intentionally disabled below.
                    "enum": [
                        "key",
                        "type",
                        "mouse_move",
                        "left_click",
                        "left_click_drag",
                        "right_click",
                        "middle_click",
                        "double_click",
                        "triple_click",
                        "scroll",
                        "hscroll",
                        "screenshot",
                        "wait",
                        # "terminate",
                        # "answer",
                    ],
                    "type": "string",
                },
                "keys": {
                    "description": "Required only by action=key.",
                    "type": "array",
                    "items": {"type": "string"},
                },
                # NOTE(review): the description still mentions action=answer, which is
                # commented out of the enum above.
                "text": {
                    "description": "Required only by action=type and action=answer.",
                    "type": "string",
                },
                "coordinate": {
                    "description": "(x, y): Pixel coordinates from top-left.",
                    "type": "array",
                    "items": {"type": ["number", "integer"]},
                    "minItems": 2,
                    "maxItems": 2,
                },
                "pixels": {
                    "description": "Scroll amount. Positive=up, negative=down. For scroll/hscroll.",
                    "type": "number",
                },
                "time": {
                    "description": "Seconds to wait (action=wait).",
                    "type": "number",
                },
                # "status": {
                #     "description": "Task status (action=terminate).",
                #     "type": "string",
                #     "enum": ["success", "failure"],
                # },
            },
            # Only the action name is mandatory; the other fields depend on the action.
            "required": ["action"],
        },
    },
}
 99 | 
100 | 
101 | def _build_nous_system(functions: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
102 |     """Use qwen-agent NousFnCallPrompt to generate a system message embedding tool schema."""
103 |     try:
104 |         from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
105 |             ContentItem as NousContentItem,
106 |         )
107 |         from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
108 |             Message as NousMessage,
109 |         )
110 |         from qwen_agent.llm.fncall_prompts.nous_fncall_prompt import (
111 |             NousFnCallPrompt,
112 |         )
113 |     except ImportError:
114 |         raise ImportError(
115 |             "qwen-agent not installed. Please install it with `pip install cua-agent[qwen]`."
116 |         )
117 |     msgs = NousFnCallPrompt().preprocess_fncall_messages(
118 |         messages=[
119 |             NousMessage(
120 |                 role="system", content=[NousContentItem(text="You are a helpful assistant.")]
121 |             )
122 |         ],
123 |         functions=functions,
124 |         lang="en",
125 |     )
126 |     sys = msgs[0].model_dump()
127 |     # Convert qwen-agent structured content to OpenAI-style content list
128 |     content = [{"type": "text", "text": c["text"]} for c in sys.get("content", [])]
129 |     return {"role": "system", "content": content}
130 | 
131 | 
132 | def _parse_tool_call_from_text(text: str) -> Optional[Dict[str, Any]]:
133 |     """Extract JSON object within <tool_call>...</tool_call> from model text."""
134 |     m = re.search(r"<tool_call>\s*(\{[\s\S]*?\})\s*</tool_call>", text)
135 |     if not m:
136 |         return None
137 |     try:
138 |         return json.loads(m.group(1))
139 |     except Exception:
140 |         return None
141 | 
142 | 
143 | async def _unnormalize_coordinate(args: Dict[str, Any], dims: Tuple[int, int]) -> Dict[str, Any]:
144 |     """Coordinates appear in 0..1000 space, scale to actual screen size using dims if provided."""
145 |     coord = args.get("coordinate")
146 |     if not coord or not isinstance(coord, (list, tuple)) or len(coord) < 2:
147 |         return args
148 |     x, y = float(coord[0]), float(coord[1])
149 |     width, height = float(dims[0]), float(dims[1])
150 |     x_abs = max(0.0, min(width, (x / 1000.0) * width))
151 |     y_abs = max(0.0, min(height, (y / 1000.0) * height))
152 |     args = {**args, "coordinate": [round(x_abs), round(y_abs)]}
153 |     return args
154 | 
155 | 
def convert_qwen_tool_args_to_computer_action(args: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Translate Qwen computer-tool arguments into the Computer Calls action schema.

    Input example:
        {"action": "left_click", "coordinate": [114, 68]}

    Output example:
        {"action": "left_click", "x": 114, "y": 68}

    Mapping summary:
    - left_click / right_click / middle_click / double_click keep their name
    - triple_click is approximated as double_click
    - mouse_move -> { action: "move", x, y }
    - key -> { action: "keypress", keys: [...] }
    - type -> { action: "type", text }
    - scroll / hscroll -> { action: "scroll", scroll_x, scroll_y[, x, y] }
    - wait -> { action: "wait" }
    - anything else (including terminate/answer) -> None

    Returns None as well when `args` is malformed or a required field is missing.
    """
    if not isinstance(args, dict):
        return None

    raw_action = args.get("action")
    if not isinstance(raw_action, str):
        return None
    kind = raw_action.lower()

    # Resolve the optional cursor position; both components must parse or neither counts.
    position: Optional[Tuple[int, int]] = None
    raw_coord = args.get("coordinate")
    if isinstance(raw_coord, (list, tuple)) and len(raw_coord) >= 2:
        try:
            position = (
                int(round(float(raw_coord[0]))),
                int(round(float(raw_coord[1]))),
            )
        except Exception:
            position = None

    # Pointer actions all require a position and differ only in the emitted name.
    pointer_names = {
        "left_click": "left_click",
        "right_click": "right_click",
        "middle_click": "middle_click",
        "double_click": "double_click",
        "triple_click": "double_click",  # Approximate as double_click
        "mouse_move": "move",
    }
    if kind in pointer_names:
        if position is None:
            return None
        return {"action": pointer_names[kind], "x": position[0], "y": position[1]}

    if kind == "key":
        keys = args.get("keys")
        if not isinstance(keys, list):
            return None
        for entry in keys:
            if not isinstance(entry, str):
                return None
        return {"action": "keypress", "keys": keys}

    if kind == "type":
        text = args.get("text")
        return {"action": "type", "text": text} if isinstance(text, str) else None

    if kind in {"scroll", "hscroll"}:
        raw_amount = args.get("pixels") or 0
        try:
            amount = int(round(float(raw_amount)))
        except Exception:
            amount = 0
        horizontal = kind == "hscroll"
        result: Dict[str, Any] = {
            "action": "scroll",
            "scroll_x": amount if horizontal else 0,
            "scroll_y": 0 if horizontal else amount,
        }
        # Cursor position is optional for scrolling.
        if position is not None:
            result.update({"x": position[0], "y": position[1]})
        return result

    if kind == "wait":
        return {"action": "wait"}

    # Non-UI or terminal actions: terminate/answer -> not mapped here
    return None
235 | 
236 | 
237 | @register_agent(models=r"(?i).*", priority=-100)
238 | class GenericVlmConfig(AsyncAgentConfig):
239 |     async def predict_step(
240 |         self,
241 |         messages: List[Dict[str, Any]],
242 |         model: str,
243 |         tools: Optional[List[Dict[str, Any]]] = None,
244 |         max_retries: Optional[int] = None,
245 |         stream: bool = False,
246 |         computer_handler=None,
247 |         use_prompt_caching: Optional[bool] = False,
248 |         _on_api_start=None,
249 |         _on_api_end=None,
250 |         _on_usage=None,
251 |         _on_screenshot=None,
252 |         **kwargs,
253 |     ) -> Dict[str, Any]:
254 |         # Build messages using NousFnCallPrompt system with tool schema in text
255 |         # Start with converted conversation (images/text preserved)
256 |         converted_msgs = convert_responses_items_to_completion_messages(
257 |             messages,
258 |             allow_images_in_tool_results=False,
259 |         )
260 | 
261 |         # Prepend Nous-generated system if available
262 |         nous_system = _build_nous_system([QWEN3_COMPUTER_TOOL["function"]])
263 |         completion_messages = ([nous_system] if nous_system else []) + converted_msgs
264 | 
265 |         # If there is no screenshot in the conversation, take one now and inject it.
266 |         # Also record a pre_output_items assistant message to reflect action.
267 |         def _has_any_image(msgs: List[Dict[str, Any]]) -> bool:
268 |             for m in msgs:
269 |                 content = m.get("content")
270 |                 if isinstance(content, list):
271 |                     for p in content:
272 |                         if isinstance(p, dict) and p.get("type") == "image_url":
273 |                             return True
274 |             return False
275 | 
276 |         pre_output_items: List[Dict[str, Any]] = []
277 |         if not _has_any_image(completion_messages):
278 |             if computer_handler is None or not hasattr(computer_handler, "screenshot"):
279 |                 raise RuntimeError(
280 |                     "No screenshots present and computer_handler.screenshot is not available."
281 |                 )
282 |             screenshot_b64 = await computer_handler.screenshot()
283 |             if not screenshot_b64:
284 |                 raise RuntimeError("Failed to capture screenshot from computer_handler.")
285 |             # Inject a user message with the screenshot so the model can see current context
286 |             completion_messages.append(
287 |                 {
288 |                     "role": "user",
289 |                     "content": [
290 |                         {
291 |                             "type": "image_url",
292 |                             "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"},
293 |                         },
294 |                         {"type": "text", "text": "Current screen"},
295 |                     ],
296 |                 }
297 |             )
298 |             # Add assistant message to outputs to reflect the action, similar to composed_grounded.py
299 |             pre_output_items.append(
300 |                 {
301 |                     "type": "message",
302 |                     "role": "assistant",
303 |                     "content": [
304 |                         {
305 |                             "type": "text",
306 |                             "text": "Taking a screenshot to see the current computer screen.",
307 |                         }
308 |                     ],
309 |                 }
310 |             )
311 | 
312 |         # Smart-resize all screenshots and attach min/max pixel hints. Fail fast if deps missing.
313 |         # Also record the last resized width/height to unnormalize coordinates later.
314 |         last_rw: Optional[int] = None
315 |         last_rh: Optional[int] = None
316 |         MIN_PIXELS = 3136
317 |         MAX_PIXELS = 12845056
318 |         try:
319 |             import base64
320 |             import io
321 | 
322 |             from PIL import Image  # type: ignore
323 |             from qwen_vl_utils import smart_resize  # type: ignore
324 |         except Exception:
325 |             raise ImportError(
326 |                 "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
327 |             )
328 | 
329 |         for msg in completion_messages:
330 |             content = msg.get("content")
331 |             if not isinstance(content, list):
332 |                 continue
333 |             for part in content:
334 |                 if isinstance(part, dict) and part.get("type") == "image_url":
335 |                     url = ((part.get("image_url") or {}).get("url")) or ""
336 |                     # Expect data URL like data:image/png;base64,<b64>
337 |                     if url.startswith("data:") and "," in url:
338 |                         b64 = url.split(",", 1)[1]
339 |                         img_bytes = base64.b64decode(b64)
340 |                         im = Image.open(io.BytesIO(img_bytes))
341 |                         h, w = im.height, im.width
342 |                         rh, rw = smart_resize(
343 |                             h, w, factor=32, min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS
344 |                         )
345 |                         # Attach hints on this image block
346 |                         part["min_pixels"] = MIN_PIXELS
347 |                         part["max_pixels"] = MAX_PIXELS
348 |                         last_rw, last_rh = rw, rh
349 | 
350 |         api_kwargs: Dict[str, Any] = {
351 |             "model": model,
352 |             "messages": completion_messages,
353 |             "max_retries": max_retries,
354 |             "stream": stream,
355 |             **{k: v for k, v in kwargs.items()},
356 |         }
357 |         if use_prompt_caching:
358 |             api_kwargs["use_prompt_caching"] = use_prompt_caching
359 | 
360 |         if _on_api_start:
361 |             await _on_api_start(api_kwargs)
362 | 
363 |         response = await litellm.acompletion(**api_kwargs)
364 | 
365 |         if _on_api_end:
366 |             await _on_api_end(api_kwargs, response)
367 | 
368 |         usage = {
369 |             **LiteLLMCompletionResponsesConfig._transform_chat_completion_usage_to_responses_usage(  # type: ignore
370 |                 response.usage
371 |             ).model_dump(),
372 |             "response_cost": response._hidden_params.get("response_cost", 0.0),
373 |         }
374 |         if _on_usage:
375 |             await _on_usage(usage)
376 | 
377 |         # Extract response data
378 |         resp_dict = response.model_dump()  # type: ignore
379 |         choice = (resp_dict.get("choices") or [{}])[0]
380 |         message = choice.get("message") or {}
381 |         content_text = message.get("content") or ""
382 |         tool_calls_array = message.get("tool_calls") or []
383 |         reasoning_text = message.get("reasoning") or ""
384 | 
385 |         output_items: List[Dict[str, Any]] = []
386 | 
387 |         # Add reasoning if present (Ollama Cloud format)
388 |         if reasoning_text:
389 |             output_items.append(make_reasoning_item(reasoning_text))
390 | 
391 |         # Priority 1: Try to parse tool call from content text (OpenRouter format)
392 |         tool_call = _parse_tool_call_from_text(content_text)
393 | 
394 |         if tool_call and isinstance(tool_call, dict):
395 |             fn_name = tool_call.get("name") or "computer"
396 |             raw_args = tool_call.get("arguments") or {}
397 |             # Unnormalize coordinates to actual screen size using last resized dims
398 |             if last_rw is None or last_rh is None:
399 |                 raise RuntimeError(
400 |                     "No screenshots found to derive dimensions for coordinate unnormalization."
401 |                 )
402 |             args = await _unnormalize_coordinate(raw_args, (last_rw, last_rh))
403 | 
404 |             # Build an OpenAI-style tool call so we can reuse the converter
405 |             fake_cm = {
406 |                 "role": "assistant",
407 |                 "tool_calls": [
408 |                     {
409 |                         "type": "function",
410 |                         "id": "call_0",
411 |                         "function": {
412 |                             "name": fn_name,
413 |                             "arguments": json.dumps(args),
414 |                         },
415 |                     }
416 |                 ],
417 |             }
418 |             output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
419 |         elif tool_calls_array:
420 |             # Priority 2: Use tool_calls field if present (Ollama Cloud format)
421 |             # Process and unnormalize coordinates in tool calls
422 |             processed_tool_calls = []
423 |             for tc in tool_calls_array:
424 |                 function = tc.get("function", {})
425 |                 fn_name = function.get("name", "computer")
426 |                 args_str = function.get("arguments", "{}")
427 | 
428 |                 try:
429 |                     args = json.loads(args_str)
430 | 
431 |                     # Unnormalize coordinates if present
432 |                     if "coordinate" in args and last_rw is not None and last_rh is not None:
433 |                         args = await _unnormalize_coordinate(args, (last_rw, last_rh))
434 | 
435 |                     # Convert Qwen format to Computer Calls format if this is a computer tool
436 |                     if fn_name == "computer":
437 |                         converted_action = convert_qwen_tool_args_to_computer_action(args)
438 |                         if converted_action:
439 |                             args = converted_action
440 | 
441 |                     processed_tool_calls.append(
442 |                         {
443 |                             "type": tc.get("type", "function"),
444 |                             "id": tc.get("id", "call_0"),
445 |                             "function": {
446 |                                 "name": fn_name,
447 |                                 "arguments": json.dumps(args),
448 |                             },
449 |                         }
450 |                     )
451 |                 except json.JSONDecodeError:
452 |                     # Keep original if parsing fails
453 |                     processed_tool_calls.append(tc)
454 | 
455 |             fake_cm = {
456 |                 "role": "assistant",
457 |                 "content": content_text if content_text else "",
458 |                 "tool_calls": processed_tool_calls,
459 |             }
460 |             output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
461 |         else:
462 |             # No tool calls found in either format, return text response
463 |             fake_cm = {"role": "assistant", "content": content_text}
464 |             output_items.extend(convert_completion_messages_to_responses_items([fake_cm]))
465 | 
466 |         # Prepend any pre_output_items (e.g., simulated screenshot-taking message)
467 |         return {"output": (pre_output_items + output_items), "usage": usage}
468 | 
469 |     def get_capabilities(self) -> List[AgentCapability]:
470 |         return ["step"]
471 | 
472 |     async def predict_click(
473 |         self, model: str, image_b64: str, instruction: str, **kwargs
474 |     ) -> Optional[Tuple[int, int]]:
475 |         """
476 |         Predict click coordinates using Qwen3-VL via litellm.acompletion.
477 | 
478 |         Only exposes a reduced tool schema with left_click to bias model to output a single click.
479 |         Returns (x, y) absolute pixels when screen dimensions can be obtained; otherwise normalized 0..1000 integers.
480 |         """
481 |         # Reduced tool
482 |         reduced_tool = {
483 |             "type": "function",
484 |             "function": {
485 |                 **QWEN3_COMPUTER_TOOL["function"],
486 |                 "parameters": {
487 |                     "type": "object",
488 |                     "properties": {
489 |                         "action": {"type": "string", "enum": ["left_click"]},
490 |                         "coordinate": {
491 |                             "description": "(x, y) in 0..1000 reference space",
492 |                             "type": "array",
493 |                             "items": {"type": ["number", "integer"]},
494 |                             "minItems": 2,
495 |                             "maxItems": 2,
496 |                         },
497 |                     },
498 |                     "required": ["action", "coordinate"],
499 |                 },
500 |             },
501 |         }
502 | 
503 |         # Build Nous system (lazy import inside helper already raises clear guidance if missing)
504 |         nous_system = _build_nous_system([reduced_tool["function"]])
505 | 
506 |         # Pre-process using smart_resize
507 |         min_pixels = 3136
508 |         max_pixels = 12845056
509 |         try:
510 |             # Lazy import to avoid hard dependency
511 |             import base64
512 |             import io
513 | 
514 |             # If PIL is available, estimate size from image to derive smart bounds
515 |             from PIL import Image
516 |             from qwen_vl_utils import smart_resize  # type: ignore
517 | 
518 |             img_bytes = base64.b64decode(image_b64)
519 |             im = Image.open(io.BytesIO(img_bytes))
520 |             h, w = im.height, im.width
521 |             # Qwen notebook suggests factor=32 and a wide min/max range
522 |             rh, rw = smart_resize(h, w, factor=32, min_pixels=min_pixels, max_pixels=max_pixels)
523 |         except Exception:
524 |             raise ImportError(
525 |                 "qwen-vl-utils not installed. Please install it with `pip install cua-agent[qwen]`."
526 |             )
527 | 
528 |         messages = []
529 |         if nous_system:
530 |             messages.append(nous_system)
531 |         image_block: Dict[str, Any] = {
532 |             "type": "image_url",
533 |             "image_url": {"url": f"data:image/png;base64,{image_b64}"},
534 |             "min_pixels": min_pixels,
535 |             "max_pixels": max_pixels,
536 |         }
537 |         # Single user message with image and instruction, matching OpenAI-style content blocks
538 |         messages.append(
539 |             {
540 |                 "role": "user",
541 |                 "content": [
542 |                     image_block,
543 |                     {"type": "text", "text": instruction},
544 |                 ],
545 |             }
546 |         )
547 | 
548 |         api_kwargs: Dict[str, Any] = {
549 |             "model": model,
550 |             "messages": messages,
551 |             **{k: v for k, v in kwargs.items()},
552 |         }
553 |         response = await litellm.acompletion(**api_kwargs)
554 |         resp = response.model_dump()  # type: ignore
555 |         choice = (resp.get("choices") or [{}])[0]
556 |         content_text = ((choice.get("message") or {}).get("content")) or ""
557 |         tool_call = _parse_tool_call_from_text(content_text) or {}
558 |         args = tool_call.get("arguments") or {}
559 |         args = await _unnormalize_coordinate(args, (rh, rw))
560 |         coord = args.get("coordinate")
561 |         if isinstance(coord, (list, tuple)) and len(coord) >= 2:
562 |             return int(coord[0]), int(coord[1])
563 |         return None
564 | 
```

--------------------------------------------------------------------------------
/libs/typescript/computer/src/interface/macos.ts:
--------------------------------------------------------------------------------

```typescript
  1 | /**
  2 |  * macOS computer interface implementation.
  3 |  */
  4 | 
  5 | import type { ScreenSize } from '../types';
  6 | import type { AccessibilityNode, CursorPosition, MouseButton } from './base';
  7 | import { BaseComputerInterface } from './base';
  8 | 
  9 | export class MacOSComputerInterface extends BaseComputerInterface {
 10 |   // Mouse Actions
 11 |   /**
 12 |    * Press and hold a mouse button at the specified coordinates.
 13 |    * @param {number} [x] - X coordinate for the mouse action
 14 |    * @param {number} [y] - Y coordinate for the mouse action
 15 |    * @param {MouseButton} [button='left'] - Mouse button to press down
 16 |    * @returns {Promise<void>}
 17 |    */
 18 |   async mouseDown(x?: number, y?: number, button: MouseButton = 'left'): Promise<void> {
 19 |     await this.sendCommand('mouse_down', { x, y, button });
 20 |   }
 21 | 
 22 |   /**
 23 |    * Release a mouse button at the specified coordinates.
 24 |    * @param {number} [x] - X coordinate for the mouse action
 25 |    * @param {number} [y] - Y coordinate for the mouse action
 26 |    * @param {MouseButton} [button='left'] - Mouse button to release
 27 |    * @returns {Promise<void>}
 28 |    */
 29 |   async mouseUp(x?: number, y?: number, button: MouseButton = 'left'): Promise<void> {
 30 |     await this.sendCommand('mouse_up', { x, y, button });
 31 |   }
 32 | 
 33 |   /**
 34 |    * Perform a left mouse click at the specified coordinates.
 35 |    * @param {number} [x] - X coordinate for the click
 36 |    * @param {number} [y] - Y coordinate for the click
 37 |    * @returns {Promise<void>}
 38 |    */
 39 |   async leftClick(x?: number, y?: number): Promise<void> {
 40 |     await this.sendCommand('left_click', { x, y });
 41 |   }
 42 | 
 43 |   /**
 44 |    * Perform a right mouse click at the specified coordinates.
 45 |    * @param {number} [x] - X coordinate for the click
 46 |    * @param {number} [y] - Y coordinate for the click
 47 |    * @returns {Promise<void>}
 48 |    */
 49 |   async rightClick(x?: number, y?: number): Promise<void> {
 50 |     await this.sendCommand('right_click', { x, y });
 51 |   }
 52 | 
 53 |   /**
 54 |    * Perform a double click at the specified coordinates.
 55 |    * @param {number} [x] - X coordinate for the double click
 56 |    * @param {number} [y] - Y coordinate for the double click
 57 |    * @returns {Promise<void>}
 58 |    */
 59 |   async doubleClick(x?: number, y?: number): Promise<void> {
 60 |     await this.sendCommand('double_click', { x, y });
 61 |   }
 62 | 
 63 |   /**
 64 |    * Move the cursor to the specified coordinates.
 65 |    * @param {number} x - X coordinate to move to
 66 |    * @param {number} y - Y coordinate to move to
 67 |    * @returns {Promise<void>}
 68 |    */
 69 |   async moveCursor(x: number, y: number): Promise<void> {
 70 |     await this.sendCommand('move_cursor', { x, y });
 71 |   }
 72 | 
 73 |   /**
 74 |    * Drag from current position to the specified coordinates.
 75 |    * @param {number} x - X coordinate to drag to
 76 |    * @param {number} y - Y coordinate to drag to
 77 |    * @param {MouseButton} [button='left'] - Mouse button to use for dragging
 78 |    * @param {number} [duration=0.5] - Duration of the drag operation in seconds
 79 |    * @returns {Promise<void>}
 80 |    */
 81 |   async dragTo(x: number, y: number, button: MouseButton = 'left', duration = 0.5): Promise<void> {
 82 |     await this.sendCommand('drag_to', { x, y, button, duration });
 83 |   }
 84 | 
 85 |   /**
 86 |    * Drag along a path of coordinates.
 87 |    * @param {Array<[number, number]>} path - Array of [x, y] coordinate pairs to drag through
 88 |    * @param {MouseButton} [button='left'] - Mouse button to use for dragging
 89 |    * @param {number} [duration=0.5] - Duration of the drag operation in seconds
 90 |    * @returns {Promise<void>}
 91 |    */
 92 |   async drag(
 93 |     path: Array<[number, number]>,
 94 |     button: MouseButton = 'left',
 95 |     duration = 0.5
 96 |   ): Promise<void> {
 97 |     await this.sendCommand('drag', { path, button, duration });
 98 |   }
 99 | 
100 |   // Keyboard Actions
101 |   /**
102 |    * Press and hold a key.
103 |    * @param {string} key - Key to press down
104 |    * @returns {Promise<void>}
105 |    */
106 |   async keyDown(key: string): Promise<void> {
107 |     await this.sendCommand('key_down', { key });
108 |   }
109 | 
110 |   /**
111 |    * Release a key.
112 |    * @param {string} key - Key to release
113 |    * @returns {Promise<void>}
114 |    */
115 |   async keyUp(key: string): Promise<void> {
116 |     await this.sendCommand('key_up', { key });
117 |   }
118 | 
119 |   /**
120 |    * Type text as if entered from keyboard.
121 |    * @param {string} text - Text to type
122 |    * @returns {Promise<void>}
123 |    */
124 |   async typeText(text: string): Promise<void> {
125 |     await this.sendCommand('type_text', { text });
126 |   }
127 | 
128 |   /**
129 |    * Press and release a key.
130 |    * @param {string} key - Key to press
131 |    * @returns {Promise<void>}
132 |    */
133 |   async pressKey(key: string): Promise<void> {
134 |     await this.sendCommand('press_key', { key });
135 |   }
136 | 
137 |   /**
138 |    * Press multiple keys simultaneously as a hotkey combination.
139 |    * @param {...string} keys - Keys to press together
140 |    * @returns {Promise<void>}
141 |    */
142 |   async hotkey(...keys: string[]): Promise<void> {
143 |     await this.sendCommand('hotkey', { keys });
144 |   }
145 | 
146 |   // Scrolling Actions
147 |   /**
148 |    * Scroll by the specified amount in x and y directions.
149 |    * @param {number} x - Horizontal scroll amount
150 |    * @param {number} y - Vertical scroll amount
151 |    * @returns {Promise<void>}
152 |    */
153 |   async scroll(x: number, y: number): Promise<void> {
154 |     await this.sendCommand('scroll', { x, y });
155 |   }
156 | 
157 |   /**
158 |    * Scroll down by the specified number of clicks.
159 |    * @param {number} [clicks=1] - Number of scroll clicks
160 |    * @returns {Promise<void>}
161 |    */
162 |   async scrollDown(clicks = 1): Promise<void> {
163 |     await this.sendCommand('scroll_down', { clicks });
164 |   }
165 | 
166 |   /**
167 |    * Scroll up by the specified number of clicks.
168 |    * @param {number} [clicks=1] - Number of scroll clicks
169 |    * @returns {Promise<void>}
170 |    */
171 |   async scrollUp(clicks = 1): Promise<void> {
172 |     await this.sendCommand('scroll_up', { clicks });
173 |   }
174 | 
175 |   // Screen Actions
176 |   /**
177 |    * Take a screenshot of the screen.
178 |    * @returns {Promise<Buffer>} Screenshot image data as a Buffer
179 |    * @throws {Error} If screenshot fails
180 |    */
181 |   async screenshot(): Promise<Buffer> {
182 |     const response = await this.sendCommand('screenshot');
183 |     if (!response.image_data) {
184 |       throw new Error('Failed to take screenshot');
185 |     }
186 |     return Buffer.from(response.image_data as string, 'base64');
187 |   }
188 | 
189 |   /**
190 |    * Get the current screen size.
191 |    * @returns {Promise<ScreenSize>} Screen dimensions
192 |    * @throws {Error} If unable to get screen size
193 |    */
194 |   async getScreenSize(): Promise<ScreenSize> {
195 |     const response = await this.sendCommand('get_screen_size');
196 |     if (!response.success || !response.size) {
197 |       throw new Error('Failed to get screen size');
198 |     }
199 |     return response.size as ScreenSize;
200 |   }
201 | 
202 |   /**
203 |    * Get the current cursor position.
204 |    * @returns {Promise<CursorPosition>} Current cursor coordinates
205 |    * @throws {Error} If unable to get cursor position
206 |    */
207 |   async getCursorPosition(): Promise<CursorPosition> {
208 |     const response = await this.sendCommand('get_cursor_position');
209 |     if (!response.success || !response.position) {
210 |       throw new Error('Failed to get cursor position');
211 |     }
212 |     return response.position as CursorPosition;
213 |   }
214 | 
215 |   // Window Management
216 |   /** Open a file path or URL with the default handler. */
217 |   async open(target: string): Promise<void> {
218 |     const response = await this.sendCommand('open', { target });
219 |     if (!response.success) {
220 |       throw new Error((response.error as string) || 'Failed to open target');
221 |     }
222 |   }
223 | 
224 |   /** Launch an application (string may include args). Returns pid if available. */
225 |   async launch(app: string, args?: string[]): Promise<number | undefined> {
226 |     const response = await this.sendCommand('launch', args ? { app, args } : { app });
227 |     if (!response.success) {
228 |       throw new Error((response.error as string) || 'Failed to launch application');
229 |     }
230 |     return (response.pid as number) || undefined;
231 |   }
232 | 
233 |   /** Get the current active window id. */
234 |   async getCurrentWindowId(): Promise<number | string> {
235 |     const response = await this.sendCommand('get_current_window_id');
236 |     if (!response.success || response.window_id === undefined) {
237 |       throw new Error((response.error as string) || 'Failed to get current window id');
238 |     }
239 |     return response.window_id as number | string;
240 |   }
241 | 
242 |   /** Get windows belonging to an application (by name). */
243 |   async getApplicationWindows(app: string): Promise<Array<number | string>> {
244 |     const response = await this.sendCommand('get_application_windows', { app });
245 |     if (!response.success) {
246 |       throw new Error((response.error as string) || 'Failed to get application windows');
247 |     }
248 |     return (response.windows as Array<number | string>) || [];
249 |   }
250 | 
251 |   /** Get window title/name by id. */
252 |   async getWindowName(windowId: number | string): Promise<string> {
253 |     const response = await this.sendCommand('get_window_name', { window_id: windowId });
254 |     if (!response.success) {
255 |       throw new Error((response.error as string) || 'Failed to get window name');
256 |     }
257 |     return (response.name as string) || '';
258 |   }
259 | 
260 |   /** Get window size as [width, height]. */
261 |   async getWindowSize(windowId: number | string): Promise<[number, number]> {
262 |     const response = await this.sendCommand('get_window_size', { window_id: windowId });
263 |     if (!response.success) {
264 |       throw new Error((response.error as string) || 'Failed to get window size');
265 |     }
266 |     return [Number(response.width) || 0, Number(response.height) || 0];
267 |   }
268 | 
269 |   /** Get window position as [x, y]. */
270 |   async getWindowPosition(windowId: number | string): Promise<[number, number]> {
271 |     const response = await this.sendCommand('get_window_position', { window_id: windowId });
272 |     if (!response.success) {
273 |       throw new Error((response.error as string) || 'Failed to get window position');
274 |     }
275 |     return [Number(response.x) || 0, Number(response.y) || 0];
276 |   }
277 | 
278 |   /** Set window size. */
279 |   async setWindowSize(windowId: number | string, width: number, height: number): Promise<void> {
280 |     const response = await this.sendCommand('set_window_size', {
281 |       window_id: windowId,
282 |       width,
283 |       height,
284 |     });
285 |     if (!response.success) {
286 |       throw new Error((response.error as string) || 'Failed to set window size');
287 |     }
288 |   }
289 | 
290 |   /** Set window position. */
291 |   async setWindowPosition(windowId: number | string, x: number, y: number): Promise<void> {
292 |     const response = await this.sendCommand('set_window_position', {
293 |       window_id: windowId,
294 |       x,
295 |       y,
296 |     });
297 |     if (!response.success) {
298 |       throw new Error((response.error as string) || 'Failed to set window position');
299 |     }
300 |   }
301 | 
302 |   /** Maximize a window. */
303 |   async maximizeWindow(windowId: number | string): Promise<void> {
304 |     const response = await this.sendCommand('maximize_window', { window_id: windowId });
305 |     if (!response.success) {
306 |       throw new Error((response.error as string) || 'Failed to maximize window');
307 |     }
308 |   }
309 | 
310 |   /** Minimize a window. */
311 |   async minimizeWindow(windowId: number | string): Promise<void> {
312 |     const response = await this.sendCommand('minimize_window', { window_id: windowId });
313 |     if (!response.success) {
314 |       throw new Error((response.error as string) || 'Failed to minimize window');
315 |     }
316 |   }
317 | 
318 |   /** Activate a window by id. */
319 |   async activateWindow(windowId: number | string): Promise<void> {
320 |     const response = await this.sendCommand('activate_window', { window_id: windowId });
321 |     if (!response.success) {
322 |       throw new Error((response.error as string) || 'Failed to activate window');
323 |     }
324 |   }
325 | 
326 |   /** Close a window by id. */
327 |   async closeWindow(windowId: number | string): Promise<void> {
328 |     const response = await this.sendCommand('close_window', { window_id: windowId });
329 |     if (!response.success) {
330 |       throw new Error((response.error as string) || 'Failed to close window');
331 |     }
332 |   }
333 | 
334 |   // Desktop Actions
335 |   /**
336 |    * Get the current desktop environment string (e.g., 'xfce4', 'gnome', 'kde', 'mac', 'windows').
337 |    */
338 |   async getDesktopEnvironment(): Promise<string> {
339 |     const response = await this.sendCommand('get_desktop_environment');
340 |     if (!response.success) {
341 |       throw new Error((response.error as string) || 'Failed to get desktop environment');
342 |     }
343 |     return (response.environment as string) || 'unknown';
344 |   }
345 | 
346 |   /**
347 |    * Set the desktop wallpaper image.
348 |    * @param path Absolute path to the image file on the VM
349 |    */
350 |   async setWallpaper(path: string): Promise<void> {
351 |     const response = await this.sendCommand('set_wallpaper', { path });
352 |     if (!response.success) {
353 |       throw new Error((response.error as string) || 'Failed to set wallpaper');
354 |     }
355 |   }
356 | 
357 |   // Clipboard Actions
358 |   /**
359 |    * Copy current selection to clipboard and return the content.
360 |    * @returns {Promise<string>} Clipboard content
361 |    * @throws {Error} If unable to get clipboard content
362 |    */
363 |   async copyToClipboard(): Promise<string> {
364 |     const response = await this.sendCommand('copy_to_clipboard');
365 |     if (!response.success || !response.content) {
366 |       throw new Error('Failed to get clipboard content');
367 |     }
368 |     return response.content as string;
369 |   }
370 | 
371 |   /**
372 |    * Set the clipboard content to the specified text.
373 |    * @param {string} text - Text to set in clipboard
374 |    * @returns {Promise<void>}
375 |    */
376 |   async setClipboard(text: string): Promise<void> {
377 |     await this.sendCommand('set_clipboard', { text });
378 |   }
379 | 
380 |   // File System Actions
381 |   /**
382 |    * Check if a file exists at the specified path.
383 |    * @param {string} path - Path to the file
384 |    * @returns {Promise<boolean>} True if file exists, false otherwise
385 |    */
386 |   async fileExists(path: string): Promise<boolean> {
387 |     const response = await this.sendCommand('file_exists', { path });
388 |     return (response.exists as boolean) || false;
389 |   }
390 | 
391 |   /**
392 |    * Check if a directory exists at the specified path.
393 |    * @param {string} path - Path to the directory
394 |    * @returns {Promise<boolean>} True if directory exists, false otherwise
395 |    */
396 |   async directoryExists(path: string): Promise<boolean> {
397 |     const response = await this.sendCommand('directory_exists', { path });
398 |     return (response.exists as boolean) || false;
399 |   }
400 | 
401 |   /**
402 |    * List the contents of a directory.
403 |    * @param {string} path - Path to the directory
404 |    * @returns {Promise<string[]>} Array of file and directory names
405 |    * @throws {Error} If unable to list directory
406 |    */
407 |   async listDir(path: string): Promise<string[]> {
408 |     const response = await this.sendCommand('list_dir', { path });
409 |     if (!response.success) {
410 |       throw new Error((response.error as string) || 'Failed to list directory');
411 |     }
412 |     return (response.files as string[]) || [];
413 |   }
414 | 
415 |   /**
416 |    * Get the size of a file in bytes.
417 |    * @param {string} path - Path to the file
418 |    * @returns {Promise<number>} File size in bytes
419 |    * @throws {Error} If unable to get file size
420 |    */
421 |   async getFileSize(path: string): Promise<number> {
422 |     const response = await this.sendCommand('get_file_size', { path });
423 |     if (!response.success) {
424 |       throw new Error((response.error as string) || 'Failed to get file size');
425 |     }
426 |     return (response.size as number) || 0;
427 |   }
428 | 
429 |   /**
430 |    * Read file content in chunks for large files.
431 |    * @private
432 |    * @param {string} path - Path to the file
433 |    * @param {number} offset - Starting byte offset
434 |    * @param {number} totalLength - Total number of bytes to read
435 |    * @param {number} [chunkSize=1048576] - Size of each chunk in bytes
436 |    * @returns {Promise<Buffer>} File content as Buffer
437 |    * @throws {Error} If unable to read file chunk
438 |    */
439 |   private async readBytesChunked(
440 |     path: string,
441 |     offset: number,
442 |     totalLength: number,
443 |     chunkSize: number = 1024 * 1024
444 |   ): Promise<Buffer> {
445 |     const chunks: Buffer[] = [];
446 |     let currentOffset = offset;
447 |     let remaining = totalLength;
448 | 
449 |     while (remaining > 0) {
450 |       const readSize = Math.min(chunkSize, remaining);
451 |       const response = await this.sendCommand('read_bytes', {
452 |         path,
453 |         offset: currentOffset,
454 |         length: readSize,
455 |       });
456 | 
457 |       if (!response.success) {
458 |         throw new Error((response.error as string) || 'Failed to read file chunk');
459 |       }
460 | 
461 |       const chunkData = Buffer.from(response.content_b64 as string, 'base64');
462 |       chunks.push(chunkData);
463 | 
464 |       currentOffset += readSize;
465 |       remaining -= readSize;
466 |     }
467 | 
468 |     return Buffer.concat(chunks);
469 |   }
470 | 
471 |   /**
472 |    * Write file content in chunks for large files.
473 |    * @private
474 |    * @param {string} path - Path to the file
475 |    * @param {Buffer} content - Content to write
476 |    * @param {boolean} [append=false] - Whether to append to existing file
477 |    * @param {number} [chunkSize=1048576] - Size of each chunk in bytes
478 |    * @returns {Promise<void>}
479 |    * @throws {Error} If unable to write file chunk
480 |    */
481 |   private async writeBytesChunked(
482 |     path: string,
483 |     content: Buffer,
484 |     append: boolean = false,
485 |     chunkSize: number = 1024 * 1024
486 |   ): Promise<void> {
487 |     const totalSize = content.length;
488 |     let currentOffset = 0;
489 | 
490 |     while (currentOffset < totalSize) {
491 |       const chunkEnd = Math.min(currentOffset + chunkSize, totalSize);
492 |       const chunkData = content.subarray(currentOffset, chunkEnd);
493 | 
494 |       // First chunk uses the original append flag, subsequent chunks always append
495 |       const chunkAppend = currentOffset === 0 ? append : true;
496 | 
497 |       const response = await this.sendCommand('write_bytes', {
498 |         path,
499 |         content_b64: chunkData.toString('base64'),
500 |         append: chunkAppend,
501 |       });
502 | 
503 |       if (!response.success) {
504 |         throw new Error((response.error as string) || 'Failed to write file chunk');
505 |       }
506 | 
507 |       currentOffset = chunkEnd;
508 |     }
509 |   }
510 | 
511 |   /**
512 |    * Read text from a file with specified encoding.
513 |    * @param {string} path - Path to the file to read
514 |    * @param {BufferEncoding} [encoding='utf8'] - Text encoding to use
515 |    * @returns {Promise<string>} The decoded text content of the file
516 |    */
517 |   async readText(path: string, encoding: BufferEncoding = 'utf8'): Promise<string> {
518 |     const contentBytes = await this.readBytes(path);
519 |     return contentBytes.toString(encoding);
520 |   }
521 | 
522 |   /**
523 |    * Write text to a file with specified encoding.
524 |    * @param {string} path - Path to the file to write
525 |    * @param {string} content - Text content to write
526 |    * @param {BufferEncoding} [encoding='utf8'] - Text encoding to use
527 |    * @param {boolean} [append=false] - Whether to append to the file instead of overwriting
528 |    * @returns {Promise<void>}
529 |    */
530 |   async writeText(
531 |     path: string,
532 |     content: string,
533 |     encoding: BufferEncoding = 'utf8',
534 |     append: boolean = false
535 |   ): Promise<void> {
536 |     const contentBytes = Buffer.from(content, encoding);
537 |     await this.writeBytes(path, contentBytes, append);
538 |   }
539 | 
540 |   /**
541 |    * Read bytes from a file, with optional offset and length.
542 |    * @param {string} path - Path to the file
543 |    * @param {number} [offset=0] - Starting byte offset
544 |    * @param {number} [length] - Number of bytes to read (reads entire file if not specified)
545 |    * @returns {Promise<Buffer>} File content as Buffer
546 |    * @throws {Error} If unable to read file
547 |    */
548 |   async readBytes(path: string, offset: number = 0, length?: number): Promise<Buffer> {
549 |     // For large files, use chunked reading
550 |     if (length === undefined) {
551 |       // Get file size first to determine if we need chunking
552 |       const fileSize = await this.getFileSize(path);
553 |       // If file is larger than 5MB, read in chunks
554 |       if (fileSize > 5 * 1024 * 1024) {
555 |         const readLength = offset > 0 ? fileSize - offset : fileSize;
556 |         return await this.readBytesChunked(path, offset, readLength);
557 |       }
558 |     }
559 | 
560 |     const response = await this.sendCommand('read_bytes', {
561 |       path,
562 |       offset,
563 |       length,
564 |     });
565 |     if (!response.success) {
566 |       throw new Error((response.error as string) || 'Failed to read file');
567 |     }
568 |     return Buffer.from(response.content_b64 as string, 'base64');
569 |   }
570 | 
571 |   /**
572 |    * Write bytes to a file.
573 |    * @param {string} path - Path to the file
574 |    * @param {Buffer} content - Content to write as Buffer
575 |    * @param {boolean} [append=false] - Whether to append to existing file
576 |    * @returns {Promise<void>}
577 |    * @throws {Error} If unable to write file
578 |    */
579 |   async writeBytes(path: string, content: Buffer, append: boolean = false): Promise<void> {
580 |     // For large files, use chunked writing
581 |     if (content.length > 5 * 1024 * 1024) {
582 |       // 5MB threshold
583 |       await this.writeBytesChunked(path, content, append);
584 |       return;
585 |     }
586 | 
587 |     const response = await this.sendCommand('write_bytes', {
588 |       path,
589 |       content_b64: content.toString('base64'),
590 |       append,
591 |     });
592 |     if (!response.success) {
593 |       throw new Error((response.error as string) || 'Failed to write file');
594 |     }
595 |   }
596 | 
597 |   /**
598 |    * Delete a file at the specified path.
599 |    * @param {string} path - Path to the file to delete
600 |    * @returns {Promise<void>}
601 |    * @throws {Error} If unable to delete file
602 |    */
603 |   async deleteFile(path: string): Promise<void> {
604 |     const response = await this.sendCommand('delete_file', { path });
605 |     if (!response.success) {
606 |       throw new Error((response.error as string) || 'Failed to delete file');
607 |     }
608 |   }
609 | 
610 |   /**
611 |    * Create a directory at the specified path.
612 |    * @param {string} path - Path where to create the directory
613 |    * @returns {Promise<void>}
614 |    * @throws {Error} If unable to create directory
615 |    */
616 |   async createDir(path: string): Promise<void> {
617 |     const response = await this.sendCommand('create_dir', { path });
618 |     if (!response.success) {
619 |       throw new Error((response.error as string) || 'Failed to create directory');
620 |     }
621 |   }
622 | 
623 |   /**
624 |    * Delete a directory at the specified path.
625 |    * @param {string} path - Path to the directory to delete
626 |    * @returns {Promise<void>}
627 |    * @throws {Error} If unable to delete directory
628 |    */
629 |   async deleteDir(path: string): Promise<void> {
630 |     const response = await this.sendCommand('delete_dir', { path });
631 |     if (!response.success) {
632 |       throw new Error((response.error as string) || 'Failed to delete directory');
633 |     }
634 |   }
635 | 
636 |   /**
637 |    * Execute a shell command and return stdout and stderr.
638 |    * @param {string} command - Command to execute
639 |    * @returns {Promise<[string, string]>} Tuple of [stdout, stderr]
640 |    * @throws {Error} If command execution fails
641 |    */
642 |   async runCommand(command: string): Promise<[string, string]> {
643 |     const response = await this.sendCommand('run_command', { command });
644 |     if (!response.success) {
645 |       throw new Error((response.error as string) || 'Failed to run command');
646 |     }
647 |     return [(response.stdout as string) || '', (response.stderr as string) || ''];
648 |   }
649 | 
650 |   // Accessibility Actions
651 |   /**
652 |    * Fetch the accessibility tree describing the current screen.
653 |    * @returns {Promise<AccessibilityNode>} The server response, cast as-is to the root node
654 |    * @throws {Error} If the server reports failure
655 |    */
656 |   async getAccessibilityTree(): Promise<AccessibilityNode> {
657 |     const result = await this.sendCommand('get_accessibility_tree');
658 |     if (result.success) {
659 |       return result as unknown as AccessibilityNode;
660 |     }
661 |     throw new Error((result.error as string) || 'Failed to get accessibility tree');
662 |   }
663 | 
664 |   /**
665 |    * Convert coordinates to screen coordinates.
666 |    * @param {number} x - X coordinate to convert
667 |    * @param {number} y - Y coordinate to convert
668 |    * @returns {Promise<[number, number]>} Converted screen coordinates as [x, y]
669 |    * @throws {Error} With the server's error text when provided, otherwise a generic message
670 |    */
671 |   async toScreenCoordinates(x: number, y: number): Promise<[number, number]> {
672 |     const response = await this.sendCommand('to_screen_coordinates', { x, y });
673 |     if (!response.success || !response.coordinates) {
674 |       throw new Error((response.error as string) || 'Failed to convert to screen coordinates');
675 |     }
676 |     return response.coordinates as [number, number];
677 |   }
678 | 
679 |   /**
680 |    * Convert coordinates to screenshot coordinates.
681 |    * @param {number} x - X coordinate to convert
682 |    * @param {number} y - Y coordinate to convert
683 |    * @returns {Promise<[number, number]>} Converted screenshot coordinates as [x, y]
684 |    * @throws {Error} With the server's error text when provided, otherwise a generic message
685 |    */
686 |   async toScreenshotCoordinates(x: number, y: number): Promise<[number, number]> {
687 |     const response = await this.sendCommand('to_screenshot_coordinates', {
688 |       x,
689 |       y,
690 |     });
691 |     if (!response.success || !response.coordinates) {
692 |       throw new Error((response.error as string) || 'Failed to convert to screenshot coordinates');
693 |     }
694 |     return response.coordinates as [number, number];
695 |   }
696 | }
697 | 
```

--------------------------------------------------------------------------------
/libs/lume/src/Server/Server.swift:
--------------------------------------------------------------------------------

```swift
  1 | import Darwin
  2 | import Foundation
  3 | import Network
  4 | 
  5 | // MARK: - Error Types
  6 | enum PortError: Error, LocalizedError {
  7 |     /// Raised when the requested TCP port is already held by another process.
  8 |     case alreadyInUse(port: UInt16)
  9 |     var errorDescription: String? {
 10 |         switch self {
 11 |         case let .alreadyInUse(occupiedPort):
 12 |             return "Port \(occupiedPort) is already in use by another process"
 13 |         }
 14 |     }
 15 | }
 16 | 
 17 | // MARK: - Server Class
 18 | @MainActor
 19 | final class Server {
 20 | 
 21 |     // MARK: - Route Type
 22 |     private struct Route {
 23 |         let method: String
 24 |         let path: String
 25 |         let handler: (HTTPRequest) async throws -> HTTPResponse
 26 | 
 27 |         /// Splits a request target into its "/"-separated path segments, dropping
 28 |         /// everything from the first "?" onward so a query string never takes part.
 29 |         private static func pathSegments(of target: String) -> [Substring] {
 30 |             target.prefix(while: { $0 != "?" }).split(separator: "/")
 31 |         }
 32 | 
 33 |         /// Whether this route handles `request`: the HTTP method must be equal and each
 34 |         /// literal route segment must equal the request segment at the same position.
 35 |         func matches(_ request: HTTPRequest) -> Bool {
 36 |             guard method == request.method else { return false }
 37 |             // Compare against the path only: with the query still attached,
 38 |             // "/lume/vms?storage=x" would not match the route "/lume/vms".
 39 |             let routeParts = path.split(separator: "/")
 40 |             let requestParts = Self.pathSegments(of: request.path)
 41 |             guard routeParts.count == requestParts.count else { return false }
 42 | 
 43 |             // A ":"-prefixed route segment is a placeholder and accepts any value.
 44 |             return zip(routeParts, requestParts).allSatisfy { routePart, requestPart in
 45 |                 routePart.hasPrefix(":") || routePart == requestPart
 46 |             }
 47 |         }
 48 | 
 49 |         /// Returns the request's value for every ":name" placeholder, keyed by `name`.
 50 |         func extractParams(_ request: HTTPRequest) -> [String: String] {
 51 |             var params: [String: String] = [:]
 52 |             let routeParts = path.split(separator: "/")
 53 |             // prefix(while:) cannot trap on an empty path, unlike indexing split(...)[0].
 54 |             let requestParts = Self.pathSegments(of: request.path)
 55 |             for (routePart, requestPart) in zip(routeParts, requestParts)
 56 |             where routePart.hasPrefix(":") {
 57 |                 params[String(routePart.dropFirst())] = String(requestPart)
 58 |             }
 59 |             return params
 60 |         }
 61 |     }
 62 | 
 63 |     // MARK: - Properties
 64 |     private let port: NWEndpoint.Port
 65 |     private let controller: LumeController
 66 |     private var isRunning = false
 67 |     private var listener: NWListener?
 68 |     private var routes: [Route]
 69 | 
 70 |     // MARK: - Initialization
 71 |     init(port: UInt16 = 7777) {
 72 |         self.port = NWEndpoint.Port(rawValue: port)!  // NOTE(review): traps if Port(rawValue:) returns nil
 73 |         self.controller = LumeController()
 74 |         self.routes = []
 75 |         // Start with an empty table: every stored property has to be set before a
 76 |         // method on self may be called, and setupRoutes() then fills `routes` in.
 77 |         self.setupRoutes()
 78 |     }
 79 | 
 80 |     // MARK: - Route Setup
 81 |     private func setupRoutes() {  // Fills `routes`; handleRequest uses the first entry that matches
 82 |         routes = [
 83 |             Route(
 84 |                 method: "GET", path: "/lume/vms",
 85 |                 handler: { [weak self] request in
 86 |                     guard let self else { throw HTTPError.internalError }
 87 |                     // Optional "storage" query parameter; nil when the request has none
 88 |                     let storage = self.extractQueryParam(request: request, name: "storage")
 89 |                     return try await self.handleListVMs(storage: storage)
 90 |                 }),
 91 |             Route(
 92 |                 method: "GET", path: "/lume/vms/:name",
 93 |                 handler: { [weak self] request in
 94 |                     guard let self else { throw HTTPError.internalError }
 95 |                     let params = Route(  // Throwaway Route, built only to reuse extractParams on this pattern
 96 |                         method: "GET", path: "/lume/vms/:name",
 97 |                         handler: { _ in
 98 |                             HTTPResponse(statusCode: .ok, body: "")
 99 |                         }
100 |                     ).extractParams(request)
101 |                     guard let name = params["name"] else {
102 |                         return HTTPResponse(statusCode: .badRequest, body: "Missing VM name")
103 |                     }
104 | 
105 |                     // Optional "storage" query parameter; nil when the request has none
106 |                     let storage = self.extractQueryParam(request: request, name: "storage")
107 | 
108 |                     return try await self.handleGetVM(name: name, storage: storage)
109 |                 }),
110 |             Route(
111 |                 method: "DELETE", path: "/lume/vms/:name",
112 |                 handler: { [weak self] request in
113 |                     guard let self else { throw HTTPError.internalError }
114 |                     let params = Route(
115 |                         method: "DELETE", path: "/lume/vms/:name",
116 |                         handler: { _ in
117 |                             HTTPResponse(statusCode: .ok, body: "")
118 |                         }
119 |                     ).extractParams(request)
120 |                     guard let name = params["name"] else {
121 |                         return HTTPResponse(statusCode: .badRequest, body: "Missing VM name")
122 |                     }
123 | 
124 |                     // Optional "storage" query parameter; nil when the request has none
125 |                     let storage = self.extractQueryParam(request: request, name: "storage")
126 | 
127 |                     return try await self.handleDeleteVM(name: name, storage: storage)
128 |                 }),
129 |             Route(
130 |                 method: "POST", path: "/lume/vms",
131 |                 handler: { [weak self] request in
132 |                     guard let self else { throw HTTPError.internalError }
133 |                     return try await self.handleCreateVM(request.body)
134 |                 }),
135 |             Route(
136 |                 method: "POST", path: "/lume/vms/clone",
137 |                 handler: { [weak self] request in
138 |                     guard let self else { throw HTTPError.internalError }
139 |                     return try await self.handleCloneVM(request.body)
140 |                 }),
141 |             Route(
142 |                 method: "PATCH", path: "/lume/vms/:name",
143 |                 handler: { [weak self] request in
144 |                     guard let self else { throw HTTPError.internalError }
145 |                     let params = Route(
146 |                         method: "PATCH", path: "/lume/vms/:name",
147 |                         handler: { _ in
148 |                             HTTPResponse(statusCode: .ok, body: "")
149 |                         }
150 |                     ).extractParams(request)
151 |                     guard let name = params["name"] else {
152 |                         return HTTPResponse(statusCode: .badRequest, body: "Missing VM name")
153 |                     }
154 |                     return try await self.handleSetVM(name: name, body: request.body)
155 |                 }),
156 |             Route(
157 |                 method: "POST", path: "/lume/vms/:name/run",
158 |                 handler: { [weak self] request in
159 |                     guard let self else { throw HTTPError.internalError }
160 |                     let params = Route(
161 |                         method: "POST", path: "/lume/vms/:name/run",
162 |                         handler: { _ in
163 |                             HTTPResponse(statusCode: .ok, body: "")
164 |                         }
165 |                     ).extractParams(request)
166 |                     guard let name = params["name"] else {
167 |                         return HTTPResponse(statusCode: .badRequest, body: "Missing VM name")
168 |                     }
169 |                     return try await self.handleRunVM(name: name, body: request.body)
170 |                 }),
171 |             Route(
172 |                 method: "POST", path: "/lume/vms/:name/stop",
173 |                 handler: { [weak self] request in
174 |                     guard let self else { throw HTTPError.internalError }
175 |                     let params = Route(
176 |                         method: "POST", path: "/lume/vms/:name/stop",
177 |                         handler: { _ in
178 |                             HTTPResponse(statusCode: .ok, body: "")
179 |                         }
180 |                     ).extractParams(request)
181 |                     guard let name = params["name"] else {
182 |                         return HTTPResponse(statusCode: .badRequest, body: "Missing VM name")
183 |                     }
184 | 
185 |                     Logger.info("Processing stop VM request", metadata: ["method": request.method, "path": request.path])
186 | 
187 |                     // Unlike the GET/DELETE routes, stop reads the optional storage name from a JSON body
188 |                     var storage: String? = nil
189 |                     if let bodyData = request.body, !bodyData.isEmpty {
190 |                         do {
191 |                             if let json = try JSONSerialization.jsonObject(with: bodyData) as? [String: Any],
192 |                                let bodyStorage = json["storage"] as? String {
193 |                                 storage = bodyStorage
194 |                                 Logger.info("Extracted storage from request body", metadata: ["storage": bodyStorage])
195 |                             }
196 |                         } catch {  // Malformed JSON is logged and otherwise ignored; storage stays nil
197 |                             Logger.error("Failed to parse request body JSON", metadata: ["error": error.localizedDescription])
198 |                         }
199 |                     }
200 | 
201 |                     return try await self.handleStopVM(name: name, storage: storage)
202 |                 }),
203 |             Route(
204 |                 method: "GET", path: "/lume/ipsw",
205 |                 handler: { [weak self] _ in
206 |                     guard let self else { throw HTTPError.internalError }
207 |                     return try await self.handleIPSW()
208 |                 }),
209 |             Route(
210 |                 method: "POST", path: "/lume/pull",
211 |                 handler: { [weak self] request in
212 |                     guard let self else { throw HTTPError.internalError }
213 |                     return try await self.handlePull(request.body)
214 |                 }),
215 |             Route(
216 |                 method: "POST", path: "/lume/prune",
217 |                 handler: { [weak self] _ in
218 |                     guard let self else { throw HTTPError.internalError }
219 |                     return try await self.handlePruneImages()
220 |                 }),
221 |             Route(
222 |                 method: "GET", path: "/lume/images",
223 |                 handler: { [weak self] request in
224 |                     guard let self else { throw HTTPError.internalError }
225 |                     return try await self.handleGetImages(request)
226 |                 }),
227 |             // Configuration endpoints: read/update config and manage storage locations
228 |             Route(
229 |                 method: "GET", path: "/lume/config",
230 |                 handler: { [weak self] _ in
231 |                     guard let self else { throw HTTPError.internalError }
232 |                     return try await self.handleGetConfig()
233 |                 }),
234 |             Route(
235 |                 method: "POST", path: "/lume/config",
236 |                 handler: { [weak self] request in
237 |                     guard let self else { throw HTTPError.internalError }
238 |                     return try await self.handleUpdateConfig(request.body)
239 |                 }),
240 |             Route(
241 |                 method: "GET", path: "/lume/config/locations",
242 |                 handler: { [weak self] _ in
243 |                     guard let self else { throw HTTPError.internalError }
244 |                     return try await self.handleGetLocations()
245 |                 }),
246 |             Route(
247 |                 method: "POST", path: "/lume/config/locations",
248 |                 handler: { [weak self] request in
249 |                     guard let self else { throw HTTPError.internalError }
250 |                     return try await self.handleAddLocation(request.body)
251 |                 }),
252 |             Route(
253 |                 method: "DELETE", path: "/lume/config/locations/:name",
254 |                 handler: { [weak self] request in
255 |                     guard let self else { throw HTTPError.internalError }
256 |                     let params = Route(
257 |                         method: "DELETE", path: "/lume/config/locations/:name",
258 |                         handler: { _ in
259 |                             HTTPResponse(statusCode: .ok, body: "")
260 |                         }
261 |                     ).extractParams(request)
262 |                     guard let name = params["name"] else {
263 |                         return HTTPResponse(statusCode: .badRequest, body: "Missing location name")
264 |                     }
265 |                     return try await self.handleRemoveLocation(name)
266 |                 }),
267 |             
268 |             // Logs retrieval route
269 |             Route(
270 |                 method: "GET", path: "/lume/logs",
271 |                 handler: { [weak self] request in
272 |                     guard let self else { throw HTTPError.internalError }
273 |                     
274 |                     // Both query parameters are optional and passed through as nil when absent
275 |                     let type = self.extractQueryParam(request: request, name: "type") // "info", "error", or "all"
276 |                     let linesParam = self.extractQueryParam(request: request, name: "lines")
277 |                     let lines = linesParam.flatMap { Int($0) } // nil when absent or not a valid integer
278 |                     
279 |                     return try await self.handleGetLogs(type: type, lines: lines)
280 |                 }),
281 |             Route(
282 |                 method: "POST", path: "/lume/config/locations/default/:name",
283 |                 handler: { [weak self] request in
284 |                     guard let self else { throw HTTPError.internalError }
285 |                     let params = Route(
286 |                         method: "POST", path: "/lume/config/locations/default/:name",
287 |                         handler: { _ in
288 |                             HTTPResponse(statusCode: .ok, body: "")
289 |                         }
290 |                     ).extractParams(request)
291 |                     guard let name = params["name"] else {
292 |                         return HTTPResponse(statusCode: .badRequest, body: "Missing location name")
293 |                     }
294 |                     return try await self.handleSetDefaultLocation(name)
295 |                 }),
296 |             Route(
297 |                 method: "POST", path: "/lume/vms/push",
298 |                 handler: { [weak self] request in
299 |                     guard let self else { throw HTTPError.internalError }
300 |                     return try await self.handlePush(request.body)
301 |                 }),
302 |         ]
303 |     }
304 | 
305 |     /// Returns the value of the first query item called `name` in the request target,
306 |     /// or nil when there is no query string or no such item.
307 |     private func extractQueryParam(request: HTTPRequest, name: String) -> String? {
308 |         // Everything after the first "?" is the raw (still percent-encoded) query string.
309 |         let parts = request.path.split(separator: "?", maxSplits: 1)
310 |         guard parts.count > 1 else { return nil }  // No query parameters
311 | 
312 |         // URLComponents needs a full URL to parse, so graft the query onto a placeholder host.
313 |         let placeholderURL = "http://placeholder.com?" + String(parts[1])
314 |         guard let queryItems = URLComponents(string: placeholderURL)?.queryItems else { return nil }
315 | 
316 |         // URLQueryItem values are already percent-decoded. Decoding them a second time
317 |         // would turn "%2541" into "A" and drop values containing a literal "%" entirely.
318 |         return queryItems.first(where: { $0.name == name })?.value
319 |     }
320 | 
321 |     // MARK: - Port Utilities
322 |     /// Reports whether `port` can be bound right now, by briefly binding a
323 |     /// throwaway IPv4 TCP socket to it on the wildcard address.
324 |     /// Returns false as well when the probe socket cannot be created or configured.
325 |     private func isPortAvailable(port: Int) async -> Bool {
326 |         // Failing to create the probe socket counts as "not available".
327 |         let probe = socket(AF_INET, SOCK_STREAM, 0)
328 |         guard probe != -1 else { return false }
329 | 
330 |         // The descriptor is released on every exit path below.
331 |         defer { close(probe) }
332 | 
333 |         // Enable SO_REUSEADDR on the probe before binding.
334 |         var reuseFlag: Int32 = 1
335 |         let flagSize = socklen_t(MemoryLayout<Int32>.size)
336 |         guard setsockopt(probe, SOL_SOCKET, SO_REUSEADDR, &reuseFlag, flagSize) != -1 else {
337 |             return false
338 |         }
339 | 
340 |         // Wildcard address (INADDR_ANY) with the port in network byte order.
341 |         var address = sockaddr_in()
342 |         address.sin_family = sa_family_t(AF_INET)
343 |         address.sin_port = UInt16(port).bigEndian
344 |         address.sin_addr.s_addr = INADDR_ANY.bigEndian
345 | 
346 |         // bind(2) expects a generic sockaddr pointer, so rebind the sockaddr_in memory.
347 |         let addressSize = socklen_t(MemoryLayout<sockaddr_in>.size)
348 |         let outcome = withUnsafePointer(to: &address) { inetPointer in
349 |             inetPointer.withMemoryRebound(to: sockaddr.self, capacity: 1) { genericPointer in
350 |                 Darwin.bind(probe, genericPointer, addressSize)
351 |             }
352 |         }
353 | 
354 |         // Zero means the bind succeeded; any failure is reported as "in use".
355 |         return outcome == 0
356 |     }
357 | 
358 |     // MARK: - Server Lifecycle
359 |     func start() async throws {
360 |         // Pre-flight probe: fail fast with PortError before the listener is created
361 |         if !(await isPortAvailable(port: Int(port.rawValue))) {
362 |             // Don't log anything here, just throw the error
363 |             throw PortError.alreadyInUse(port: port.rawValue)
364 |         }
365 | 
366 |         let parameters = NWParameters.tcp
367 |         listener = try NWListener(using: parameters, on: port)
368 | 
369 |         // Actor shared by the listener state callback (writer) and the polling loop below (reader)
370 |         actor StartupState {
371 |             var error: Error?
372 |             var isComplete = false
373 | 
374 |             func setError(_ error: Error) {
375 |                 self.error = error
376 |                 self.isComplete = true
377 |             }
378 | 
379 |             func setComplete() {
380 |                 self.isComplete = true
381 |             }
382 | 
383 |             func checkStatus() -> (isComplete: Bool, error: Error?) {
384 |                 return (isComplete, error)
385 |             }
386 |         }
387 | 
388 |         let startupState = StartupState()
389 |         // NOTE(review): the closure below captures self strongly, unlike newConnectionHandler further down
390 |         // Set up a state update handler to detect port binding errors
391 |         listener?.stateUpdateHandler = { state in
392 |             Task {
393 |                 switch state {
394 |                 case .setup:
395 |                     // Initial state, no action needed
396 |                     Logger.info("Listener setup", metadata: ["port": "\(self.port.rawValue)"])
397 |                     break
398 |                 case .waiting(let error):
399 |                     // Log the full error details to see what we're getting
400 |                     Logger.error(
401 |                         "Listener waiting",
402 |                         metadata: [
403 |                             "error": error.localizedDescription,
404 |                             "debugDescription": error.debugDescription,
405 |                             "localizedDescription": error.localizedDescription,
406 |                             "port": "\(self.port.rawValue)",
407 |                         ])
408 | 
409 |                     // Port conflicts are recognised by matching text in the error descriptions
410 |                     if error.debugDescription.contains("Address already in use")
411 |                         || error.localizedDescription.contains("in use")
412 |                         || error.localizedDescription.contains("address already in use")
413 |                     {
414 |                         Logger.error(
415 |                             "Port conflict detected", metadata: ["port": "\(self.port.rawValue)"])
416 |                         await startupState.setError(
417 |                             PortError.alreadyInUse(port: self.port.rawValue))
418 |                     } else {
419 |                         // Wait for a short period to see if the listener recovers
420 |                         // Some network errors are transient
421 |                         try? await Task.sleep(nanoseconds: 1_000_000_000)  // 1 second
422 | 
423 |                         // If we're still waiting after delay, consider it an error
424 |                         if case .waiting = await self.listener?.state {
425 |                             await startupState.setError(error)
426 |                         }
427 |                     }
428 |                 case .failed(let error):
429 |                     // Log the full error details
430 |                     Logger.error(
431 |                         "Listener failed",
432 |                         metadata: [
433 |                             "error": error.localizedDescription,
434 |                             "debugDescription": error.debugDescription,
435 |                             "port": "\(self.port.rawValue)",
436 |                         ])
437 |                     await startupState.setError(error)
438 |                 case .ready:
439 |                     // Listener successfully bound to port
440 |                     Logger.info("Listener ready", metadata: ["port": "\(self.port.rawValue)"])
441 |                     await startupState.setComplete()
442 |                 case .cancelled:
443 |                     // Cancelled: startupState is left untouched (NOTE(review): the wait loop below would keep polling if this came before .ready)
444 |                     Logger.info("Listener cancelled", metadata: ["port": "\(self.port.rawValue)"])
445 |                     break
446 |                 @unknown default:
447 |                     Logger.info(
448 |                         "Unknown listener state",
449 |                         metadata: ["state": "\(state)", "port": "\(self.port.rawValue)"])
450 |                     break
451 |                 }
452 |             }
453 |         }
454 | 
455 |         listener?.newConnectionHandler = { [weak self] connection in
456 |             Task { @MainActor [weak self] in
457 |                 guard let self else { return }
458 |                 self.handleConnection(connection)
459 |             }
460 |         }
461 | 
462 |         listener?.start(queue: .main)
463 | 
464 |         // Poll every 100 ms until the state handler records .ready or an error
465 |         var status: (isComplete: Bool, error: Error?) = (false, nil)
466 |         repeat {
467 |             try await Task.sleep(nanoseconds: 100_000_000)  // 100ms
468 |             status = await startupState.checkStatus()
469 |         } while !status.isComplete
470 | 
471 |         // If there was a startup error, cancel the listener via stop() and rethrow it
472 |         if let error = status.error {
473 |             self.stop()
474 |             throw error
475 |         }
476 | 
477 |         isRunning = true
478 | 
479 |         Logger.info("Server started", metadata: ["port": "\(port.rawValue)"])
480 | 
481 |         // Park here until stop() clears isRunning; Task.sleep throws if the task is cancelled
482 |         while isRunning {
483 |             try await Task.sleep(nanoseconds: 1_000_000_000)
484 |         }
485 |     }
486 | 
487 |     func stop() {
488 |         isRunning = false  // Ends the `while isRunning` loop at the bottom of start()
489 |         listener?.cancel()  // Cancels the NWListener, if one was created
490 |     }
491 | 
492 |     // MARK: - Connection Handling
493 |     /// Begins reading from an accepted connection once ready; cancels it on failure.
494 |     private func handleConnection(_ connection: NWConnection) {
495 |         connection.stateUpdateHandler = { [weak self] newState in
496 |             switch newState {
497 |             case .failed(let failure):
498 |                 let reason = failure.localizedDescription
499 |                 Logger.error("Connection failed", metadata: ["error": reason])
500 |                 connection.cancel()
501 |             case .ready:
502 |                 Task { @MainActor [weak self] in
503 |                     guard let self else { return }
504 |                     self.receiveData(connection)
505 |                 }
506 |             default:
507 |                 // Covers .cancelled as well: nothing further to do.
508 |                 break
509 |             }
510 |         }
511 |         connection.start(queue: .main)
512 |     }
513 | 
514 |     /// Performs one read of at most 65536 bytes and handles whatever arrives as a request.
515 |     /// NOTE(review): data spread over several reads is not reassembled here.
516 |     private func receiveData(_ connection: NWConnection) {
517 |         connection.receive(minimumIncompleteLength: 1, maximumLength: 65536) {
518 |             [weak self] payload, _, finished, failure in
519 |             if let failure {
520 |                 Logger.error("Receive error", metadata: ["error": failure.localizedDescription])
521 |                 connection.cancel()
522 |                 return
523 |             }
524 |             let bytes = payload ?? Data()
525 |             if bytes.isEmpty {
526 |                 // Nothing to parse; cancel only when the read was flagged complete.
527 |                 if finished { connection.cancel() }
528 |                 return
529 |             }
530 |             Task { @MainActor [weak self] in
531 |                 guard let self else { return }
532 |                 let reply: HTTPResponse
533 |                 do {
534 |                     reply = try await self.handleRequest(bytes)
535 |                 } catch {
536 |                     reply = self.errorResponse(error)
537 |                 }
538 |                 self.send(reply, on: connection)
539 |             }
540 |         }
541 |     }
542 | 
543 |     /// Writes the serialized response, then closes the connection whether or not it was sent.
544 |     private func send(_ response: HTTPResponse, on connection: NWConnection) {
545 |         let payload = response.serialize()
546 |         let printable = String(data: payload, encoding: .utf8) ?? ""
547 |         Logger.info("Serialized response", metadata: ["data": printable])
548 |         let onProcessed = NWConnection.SendCompletion.contentProcessed { [weak connection] failure in
549 |             if let failure {
550 |                 let reason = failure.localizedDescription
551 |                 Logger.error("Failed to send response", metadata: ["error": reason])
552 |             } else {
553 |                 Logger.info("Response sent successfully")
554 |             }
555 |             // Close after the single response unless the connection is already cancelled.
556 |             guard let connection, connection.state != .cancelled else { return }
557 |             connection.cancel()
558 |         }
559 |         connection.send(content: payload, completion: onProcessed)
560 |     }
561 | 
562 |     // MARK: - Request Handling
563 |     /// Parses raw bytes into a request, runs the first route that matches it, and returns
564 |     /// that route's response; request and response are logged in full at info level.
565 |     private func handleRequest(_ data: Data) async throws -> HTTPResponse {
566 |         // Renders optional bytes for logging; non-UTF-8 data becomes an empty string.
567 |         let logText: (Data?) -> String = { String(data: $0 ?? Data(), encoding: .utf8) ?? "" }
568 |         Logger.info("Received request data", metadata: ["data": logText(data)])
569 | 
570 |         guard let request = HTTPRequest(data: data) else {
571 |             Logger.error("Failed to parse request")
572 |             return HTTPResponse(statusCode: .badRequest, body: "Invalid request")
573 |         }
574 | 
575 |         Logger.info(
576 |             "Parsed request",
577 |             metadata: [
578 |                 "method": request.method,
579 |                 "path": request.path,
580 |                 "headers": "\(request.headers)",
581 |                 "body": logText(request.body),
582 |             ])
583 | 
584 |         // Routes are tried in registration order; the first match wins.
585 |         guard let matched = routes.first(where: { $0.matches(request) }) else {
586 |             return HTTPResponse(statusCode: .notFound, body: "Not found")
587 |         }
588 |         let reply = try await matched.handler(request)
589 | 
590 |         Logger.info(
591 |             "Sending response",
592 |             metadata: [
593 |                 "statusCode": "\(reply.statusCode.rawValue)",
594 |                 "headers": "\(reply.headers)",
595 |                 "body": logText(reply.body),
596 |             ])
597 |         return reply
598 |     }
599 | 
600 |     /// Wraps any error into a JSON-encoded `APIError` with status .internalServerError.
601 |     private func errorResponse(_ error: Error) -> HTTPResponse {
602 |         // NOTE(review): try! traps if APIError ever fails to encode.
603 |         let payload = try! JSONEncoder().encode(APIError(message: error.localizedDescription))
604 |         let jsonHeaders = ["Content-Type": "application/json"]
605 |         return HTTPResponse(statusCode: .internalServerError, headers: jsonHeaders, body: payload)
606 |     }
607 | }
608 | 
```