#
tokens: 49912/50000 5/500 files (page 15/21)
lines: on (toggle) GitHub
raw markdown copy reset
This is page 15 of 21. Use http://codebase.md/trycua/cua?lines=true&page={x} to view the full context.

# Directory Structure

```
├── .all-contributorsrc
├── .cursorignore
├── .devcontainer
│   ├── devcontainer.json
│   ├── post-install.sh
│   └── README.md
├── .dockerignore
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── scripts
│   │   ├── get_pyproject_version.py
│   │   └── tests
│   │       ├── __init__.py
│   │       ├── README.md
│   │       └── test_get_pyproject_version.py
│   └── workflows
│       ├── ci-lume.yml
│       ├── docker-publish-kasm.yml
│       ├── docker-publish-xfce.yml
│       ├── docker-reusable-publish.yml
│       ├── npm-publish-computer.yml
│       ├── npm-publish-core.yml
│       ├── publish-lume.yml
│       ├── pypi-publish-agent.yml
│       ├── pypi-publish-computer-server.yml
│       ├── pypi-publish-computer.yml
│       ├── pypi-publish-core.yml
│       ├── pypi-publish-mcp-server.yml
│       ├── pypi-publish-pylume.yml
│       ├── pypi-publish-som.yml
│       ├── pypi-reusable-publish.yml
│       └── test-validation-script.yml
├── .gitignore
├── .vscode
│   ├── docs.code-workspace
│   ├── launch.json
│   ├── libs-ts.code-workspace
│   ├── lume.code-workspace
│   ├── lumier.code-workspace
│   └── py.code-workspace
├── blog
│   ├── app-use.md
│   ├── assets
│   │   ├── composite-agents.png
│   │   ├── docker-ubuntu-support.png
│   │   ├── hack-booth.png
│   │   ├── hack-closing-ceremony.jpg
│   │   ├── hack-cua-ollama-hud.jpeg
│   │   ├── hack-leaderboard.png
│   │   ├── hack-the-north.png
│   │   ├── hack-winners.jpeg
│   │   ├── hack-workshop.jpeg
│   │   ├── hud-agent-evals.png
│   │   └── trajectory-viewer.jpeg
│   ├── bringing-computer-use-to-the-web.md
│   ├── build-your-own-operator-on-macos-1.md
│   ├── build-your-own-operator-on-macos-2.md
│   ├── composite-agents.md
│   ├── cua-hackathon.md
│   ├── hack-the-north.md
│   ├── hud-agent-evals.md
│   ├── human-in-the-loop.md
│   ├── introducing-cua-cloud-containers.md
│   ├── lume-to-containerization.md
│   ├── sandboxed-python-execution.md
│   ├── training-computer-use-models-trajectories-1.md
│   ├── trajectory-viewer.md
│   ├── ubuntu-docker-support.md
│   └── windows-sandbox.md
├── CONTRIBUTING.md
├── Development.md
├── Dockerfile
├── docs
│   ├── .gitignore
│   ├── .prettierrc
│   ├── content
│   │   └── docs
│   │       ├── agent-sdk
│   │       │   ├── agent-loops.mdx
│   │       │   ├── benchmarks
│   │       │   │   ├── index.mdx
│   │       │   │   ├── interactive.mdx
│   │       │   │   ├── introduction.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── osworld-verified.mdx
│   │       │   │   ├── screenspot-pro.mdx
│   │       │   │   └── screenspot-v2.mdx
│   │       │   ├── callbacks
│   │       │   │   ├── agent-lifecycle.mdx
│   │       │   │   ├── cost-saving.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── logging.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── pii-anonymization.mdx
│   │       │   │   └── trajectories.mdx
│   │       │   ├── chat-history.mdx
│   │       │   ├── custom-computer-handlers.mdx
│   │       │   ├── custom-tools.mdx
│   │       │   ├── customizing-computeragent.mdx
│   │       │   ├── integrations
│   │       │   │   ├── hud.mdx
│   │       │   │   └── meta.json
│   │       │   ├── message-format.mdx
│   │       │   ├── meta.json
│   │       │   ├── migration-guide.mdx
│   │       │   ├── prompt-caching.mdx
│   │       │   ├── supported-agents
│   │       │   │   ├── composed-agents.mdx
│   │       │   │   ├── computer-use-agents.mdx
│   │       │   │   ├── grounding-models.mdx
│   │       │   │   ├── human-in-the-loop.mdx
│   │       │   │   └── meta.json
│   │       │   ├── supported-model-providers
│   │       │   │   ├── index.mdx
│   │       │   │   └── local-models.mdx
│   │       │   └── usage-tracking.mdx
│   │       ├── computer-sdk
│   │       │   ├── cloud-vm-management.mdx
│   │       │   ├── commands.mdx
│   │       │   ├── computer-ui.mdx
│   │       │   ├── computers.mdx
│   │       │   ├── meta.json
│   │       │   └── sandboxed-python.mdx
│   │       ├── index.mdx
│   │       ├── libraries
│   │       │   ├── agent
│   │       │   │   └── index.mdx
│   │       │   ├── computer
│   │       │   │   └── index.mdx
│   │       │   ├── computer-server
│   │       │   │   ├── Commands.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── REST-API.mdx
│   │       │   │   └── WebSocket-API.mdx
│   │       │   ├── core
│   │       │   │   └── index.mdx
│   │       │   ├── lume
│   │       │   │   ├── cli-reference.mdx
│   │       │   │   ├── faq.md
│   │       │   │   ├── http-api.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── meta.json
│   │       │   │   └── prebuilt-images.mdx
│   │       │   ├── lumier
│   │       │   │   ├── building-lumier.mdx
│   │       │   │   ├── docker-compose.mdx
│   │       │   │   ├── docker.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   └── meta.json
│   │       │   ├── mcp-server
│   │       │   │   ├── client-integrations.mdx
│   │       │   │   ├── configuration.mdx
│   │       │   │   ├── index.mdx
│   │       │   │   ├── installation.mdx
│   │       │   │   ├── llm-integrations.mdx
│   │       │   │   ├── meta.json
│   │       │   │   ├── tools.mdx
│   │       │   │   └── usage.mdx
│   │       │   └── som
│   │       │       ├── configuration.mdx
│   │       │       └── index.mdx
│   │       ├── meta.json
│   │       ├── quickstart-cli.mdx
│   │       ├── quickstart-devs.mdx
│   │       └── telemetry.mdx
│   ├── next.config.mjs
│   ├── package-lock.json
│   ├── package.json
│   ├── pnpm-lock.yaml
│   ├── postcss.config.mjs
│   ├── public
│   │   └── img
│   │       ├── agent_gradio_ui.png
│   │       ├── agent.png
│   │       ├── cli.png
│   │       ├── computer.png
│   │       ├── som_box_threshold.png
│   │       └── som_iou_threshold.png
│   ├── README.md
│   ├── source.config.ts
│   ├── src
│   │   ├── app
│   │   │   ├── (home)
│   │   │   │   ├── [[...slug]]
│   │   │   │   │   └── page.tsx
│   │   │   │   └── layout.tsx
│   │   │   ├── api
│   │   │   │   └── search
│   │   │   │       └── route.ts
│   │   │   ├── favicon.ico
│   │   │   ├── global.css
│   │   │   ├── layout.config.tsx
│   │   │   ├── layout.tsx
│   │   │   ├── llms.mdx
│   │   │   │   └── [[...slug]]
│   │   │   │       └── route.ts
│   │   │   └── llms.txt
│   │   │       └── route.ts
│   │   ├── assets
│   │   │   ├── discord-black.svg
│   │   │   ├── discord-white.svg
│   │   │   ├── logo-black.svg
│   │   │   └── logo-white.svg
│   │   ├── components
│   │   │   ├── iou.tsx
│   │   │   └── mermaid.tsx
│   │   ├── lib
│   │   │   ├── llms.ts
│   │   │   └── source.ts
│   │   └── mdx-components.tsx
│   └── tsconfig.json
├── examples
│   ├── agent_examples.py
│   ├── agent_ui_examples.py
│   ├── cloud_api_examples.py
│   ├── computer_examples_windows.py
│   ├── computer_examples.py
│   ├── computer_ui_examples.py
│   ├── computer-example-ts
│   │   ├── .env.example
│   │   ├── .gitignore
│   │   ├── .prettierrc
│   │   ├── package-lock.json
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── README.md
│   │   ├── src
│   │   │   ├── helpers.ts
│   │   │   └── index.ts
│   │   └── tsconfig.json
│   ├── docker_examples.py
│   ├── evals
│   │   ├── hud_eval_examples.py
│   │   └── wikipedia_most_linked.txt
│   ├── pylume_examples.py
│   ├── sandboxed_functions_examples.py
│   ├── som_examples.py
│   ├── utils.py
│   └── winsandbox_example.py
├── img
│   ├── agent_gradio_ui.png
│   ├── agent.png
│   ├── cli.png
│   ├── computer.png
│   ├── logo_black.png
│   └── logo_white.png
├── libs
│   ├── kasm
│   │   ├── Dockerfile
│   │   ├── LICENSE
│   │   ├── README.md
│   │   └── src
│   │       └── ubuntu
│   │           └── install
│   │               └── firefox
│   │                   ├── custom_startup.sh
│   │                   ├── firefox.desktop
│   │                   └── install_firefox.sh
│   ├── lume
│   │   ├── .cursorignore
│   │   ├── CONTRIBUTING.md
│   │   ├── Development.md
│   │   ├── img
│   │   │   └── cli.png
│   │   ├── Package.resolved
│   │   ├── Package.swift
│   │   ├── README.md
│   │   ├── resources
│   │   │   └── lume.entitlements
│   │   ├── scripts
│   │   │   ├── build
│   │   │   │   ├── build-debug.sh
│   │   │   │   ├── build-release-notarized.sh
│   │   │   │   └── build-release.sh
│   │   │   └── install.sh
│   │   ├── src
│   │   │   ├── Commands
│   │   │   │   ├── Clone.swift
│   │   │   │   ├── Config.swift
│   │   │   │   ├── Create.swift
│   │   │   │   ├── Delete.swift
│   │   │   │   ├── Get.swift
│   │   │   │   ├── Images.swift
│   │   │   │   ├── IPSW.swift
│   │   │   │   ├── List.swift
│   │   │   │   ├── Logs.swift
│   │   │   │   ├── Options
│   │   │   │   │   └── FormatOption.swift
│   │   │   │   ├── Prune.swift
│   │   │   │   ├── Pull.swift
│   │   │   │   ├── Push.swift
│   │   │   │   ├── Run.swift
│   │   │   │   ├── Serve.swift
│   │   │   │   ├── Set.swift
│   │   │   │   └── Stop.swift
│   │   │   ├── ContainerRegistry
│   │   │   │   ├── ImageContainerRegistry.swift
│   │   │   │   ├── ImageList.swift
│   │   │   │   └── ImagesPrinter.swift
│   │   │   ├── Errors
│   │   │   │   └── Errors.swift
│   │   │   ├── FileSystem
│   │   │   │   ├── Home.swift
│   │   │   │   ├── Settings.swift
│   │   │   │   ├── VMConfig.swift
│   │   │   │   ├── VMDirectory.swift
│   │   │   │   └── VMLocation.swift
│   │   │   ├── LumeController.swift
│   │   │   ├── Main.swift
│   │   │   ├── Server
│   │   │   │   ├── Handlers.swift
│   │   │   │   ├── HTTP.swift
│   │   │   │   ├── Requests.swift
│   │   │   │   ├── Responses.swift
│   │   │   │   └── Server.swift
│   │   │   ├── Utils
│   │   │   │   ├── CommandRegistry.swift
│   │   │   │   ├── CommandUtils.swift
│   │   │   │   ├── Logger.swift
│   │   │   │   ├── NetworkUtils.swift
│   │   │   │   ├── Path.swift
│   │   │   │   ├── ProcessRunner.swift
│   │   │   │   ├── ProgressLogger.swift
│   │   │   │   ├── String.swift
│   │   │   │   └── Utils.swift
│   │   │   ├── Virtualization
│   │   │   │   ├── DarwinImageLoader.swift
│   │   │   │   ├── DHCPLeaseParser.swift
│   │   │   │   ├── ImageLoaderFactory.swift
│   │   │   │   └── VMVirtualizationService.swift
│   │   │   ├── VM
│   │   │   │   ├── DarwinVM.swift
│   │   │   │   ├── LinuxVM.swift
│   │   │   │   ├── VM.swift
│   │   │   │   ├── VMDetails.swift
│   │   │   │   ├── VMDetailsPrinter.swift
│   │   │   │   ├── VMDisplayResolution.swift
│   │   │   │   └── VMFactory.swift
│   │   │   └── VNC
│   │   │       ├── PassphraseGenerator.swift
│   │   │       └── VNCService.swift
│   │   └── tests
│   │       ├── Mocks
│   │       │   ├── MockVM.swift
│   │       │   ├── MockVMVirtualizationService.swift
│   │       │   └── MockVNCService.swift
│   │       ├── VM
│   │       │   └── VMDetailsPrinterTests.swift
│   │       ├── VMTests.swift
│   │       ├── VMVirtualizationServiceTests.swift
│   │       └── VNCServiceTests.swift
│   ├── lumier
│   │   ├── .dockerignore
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   └── src
│   │       ├── bin
│   │       │   └── entry.sh
│   │       ├── config
│   │       │   └── constants.sh
│   │       ├── hooks
│   │       │   └── on-logon.sh
│   │       └── lib
│   │           ├── utils.sh
│   │           └── vm.sh
│   ├── python
│   │   ├── agent
│   │   │   ├── agent
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── adapters
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── huggingfacelocal_adapter.py
│   │   │   │   │   ├── human_adapter.py
│   │   │   │   │   ├── mlxvlm_adapter.py
│   │   │   │   │   └── models
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── generic.py
│   │   │   │   │       ├── internvl.py
│   │   │   │   │       ├── opencua.py
│   │   │   │   │       └── qwen2_5_vl.py
│   │   │   │   ├── agent.py
│   │   │   │   ├── callbacks
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── budget_manager.py
│   │   │   │   │   ├── image_retention.py
│   │   │   │   │   ├── logging.py
│   │   │   │   │   ├── operator_validator.py
│   │   │   │   │   ├── pii_anonymization.py
│   │   │   │   │   ├── prompt_instructions.py
│   │   │   │   │   ├── telemetry.py
│   │   │   │   │   └── trajectory_saver.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── computers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cua.py
│   │   │   │   │   └── custom.py
│   │   │   │   ├── decorators.py
│   │   │   │   ├── human_tool
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   ├── server.py
│   │   │   │   │   └── ui.py
│   │   │   │   ├── integrations
│   │   │   │   │   └── hud
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── agent.py
│   │   │   │   │       └── proxy.py
│   │   │   │   ├── loops
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── anthropic.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── composed_grounded.py
│   │   │   │   │   ├── glm45v.py
│   │   │   │   │   ├── gta1.py
│   │   │   │   │   ├── holo.py
│   │   │   │   │   ├── internvl.py
│   │   │   │   │   ├── model_types.csv
│   │   │   │   │   ├── moondream3.py
│   │   │   │   │   ├── omniparser.py
│   │   │   │   │   ├── openai.py
│   │   │   │   │   ├── opencua.py
│   │   │   │   │   └── uitars.py
│   │   │   │   ├── proxy
│   │   │   │   │   ├── examples.py
│   │   │   │   │   └── handlers.py
│   │   │   │   ├── responses.py
│   │   │   │   ├── types.py
│   │   │   │   └── ui
│   │   │   │       ├── __init__.py
│   │   │   │       ├── __main__.py
│   │   │   │       └── gradio
│   │   │   │           ├── __init__.py
│   │   │   │           ├── app.py
│   │   │   │           └── ui_components.py
│   │   │   ├── benchmarks
│   │   │   │   ├── .gitignore
│   │   │   │   ├── contrib.md
│   │   │   │   ├── interactive.py
│   │   │   │   ├── models
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   └── gta1.py
│   │   │   │   ├── README.md
│   │   │   │   ├── ss-pro.py
│   │   │   │   ├── ss-v2.py
│   │   │   │   └── utils.py
│   │   │   ├── example.py
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── computer
│   │   │   ├── computer
│   │   │   │   ├── __init__.py
│   │   │   │   ├── computer.py
│   │   │   │   ├── diorama_computer.py
│   │   │   │   ├── helpers.py
│   │   │   │   ├── interface
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   ├── models.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── logger.py
│   │   │   │   ├── models.py
│   │   │   │   ├── providers
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── cloud
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── docker
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── lume
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── lume_api.py
│   │   │   │   │   ├── lumier
│   │   │   │   │   │   ├── __init__.py
│   │   │   │   │   │   └── provider.py
│   │   │   │   │   ├── types.py
│   │   │   │   │   └── winsandbox
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── provider.py
│   │   │   │   │       └── setup_script.ps1
│   │   │   │   ├── ui
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── __main__.py
│   │   │   │   │   └── gradio
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       └── app.py
│   │   │   │   └── utils.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── computer-server
│   │   │   ├── computer_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   ├── cli.py
│   │   │   │   ├── diorama
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── diorama_computer.py
│   │   │   │   │   ├── diorama.py
│   │   │   │   │   ├── draw.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── safezone.py
│   │   │   │   ├── handlers
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── factory.py
│   │   │   │   │   ├── generic.py
│   │   │   │   │   ├── linux.py
│   │   │   │   │   ├── macos.py
│   │   │   │   │   └── windows.py
│   │   │   │   ├── main.py
│   │   │   │   ├── server.py
│   │   │   │   └── watchdog.py
│   │   │   ├── examples
│   │   │   │   ├── __init__.py
│   │   │   │   └── usage_example.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   ├── run_server.py
│   │   │   └── test_connection.py
│   │   ├── core
│   │   │   ├── core
│   │   │   │   ├── __init__.py
│   │   │   │   └── telemetry
│   │   │   │       ├── __init__.py
│   │   │   │       └── posthog.py
│   │   │   ├── poetry.toml
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   ├── mcp-server
│   │   │   ├── mcp_server
│   │   │   │   ├── __init__.py
│   │   │   │   ├── __main__.py
│   │   │   │   └── server.py
│   │   │   ├── pyproject.toml
│   │   │   ├── README.md
│   │   │   └── scripts
│   │   │       ├── install_mcp_server.sh
│   │   │       └── start_mcp_server.sh
│   │   ├── pylume
│   │   │   ├── __init__.py
│   │   │   ├── pylume
│   │   │   │   ├── __init__.py
│   │   │   │   ├── client.py
│   │   │   │   ├── exceptions.py
│   │   │   │   ├── lume
│   │   │   │   ├── models.py
│   │   │   │   ├── pylume.py
│   │   │   │   └── server.py
│   │   │   ├── pyproject.toml
│   │   │   └── README.md
│   │   └── som
│   │       ├── LICENSE
│   │       ├── poetry.toml
│   │       ├── pyproject.toml
│   │       ├── README.md
│   │       ├── som
│   │       │   ├── __init__.py
│   │       │   ├── detect.py
│   │       │   ├── detection.py
│   │       │   ├── models.py
│   │       │   ├── ocr.py
│   │       │   ├── util
│   │       │   │   └── utils.py
│   │       │   └── visualization.py
│   │       └── tests
│   │           └── test_omniparser.py
│   ├── typescript
│   │   ├── .gitignore
│   │   ├── .nvmrc
│   │   ├── agent
│   │   │   ├── examples
│   │   │   │   ├── playground-example.html
│   │   │   │   └── README.md
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── client.ts
│   │   │   │   ├── index.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   └── client.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── biome.json
│   │   ├── computer
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── computer
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── providers
│   │   │   │   │   │   ├── base.ts
│   │   │   │   │   │   ├── cloud.ts
│   │   │   │   │   │   └── index.ts
│   │   │   │   │   └── types.ts
│   │   │   │   ├── index.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── base.ts
│   │   │   │   │   ├── factory.ts
│   │   │   │   │   ├── index.ts
│   │   │   │   │   ├── linux.ts
│   │   │   │   │   ├── macos.ts
│   │   │   │   │   └── windows.ts
│   │   │   │   └── types.ts
│   │   │   ├── tests
│   │   │   │   ├── computer
│   │   │   │   │   └── cloud.test.ts
│   │   │   │   ├── interface
│   │   │   │   │   ├── factory.test.ts
│   │   │   │   │   ├── index.test.ts
│   │   │   │   │   ├── linux.test.ts
│   │   │   │   │   ├── macos.test.ts
│   │   │   │   │   └── windows.test.ts
│   │   │   │   └── setup.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── core
│   │   │   ├── .editorconfig
│   │   │   ├── .gitattributes
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE
│   │   │   ├── package.json
│   │   │   ├── README.md
│   │   │   ├── src
│   │   │   │   ├── index.ts
│   │   │   │   └── telemetry
│   │   │   │       ├── clients
│   │   │   │       │   ├── index.ts
│   │   │   │       │   └── posthog.ts
│   │   │   │       └── index.ts
│   │   │   ├── tests
│   │   │   │   └── telemetry.test.ts
│   │   │   ├── tsconfig.json
│   │   │   ├── tsdown.config.ts
│   │   │   └── vitest.config.ts
│   │   ├── package.json
│   │   ├── pnpm-lock.yaml
│   │   ├── pnpm-workspace.yaml
│   │   └── README.md
│   └── xfce
│       ├── .dockerignore
│       ├── .gitignore
│       ├── Dockerfile
│       ├── README.md
│       └── src
│           ├── scripts
│           │   ├── resize-display.sh
│           │   ├── start-computer-server.sh
│           │   ├── start-novnc.sh
│           │   ├── start-vnc.sh
│           │   └── xstartup.sh
│           ├── supervisor
│           │   └── supervisord.conf
│           └── xfce-config
│               ├── helpers.rc
│               ├── xfce4-power-manager.xml
│               └── xfce4-session.xml
├── LICENSE.md
├── notebooks
│   ├── agent_nb.ipynb
│   ├── blog
│   │   ├── build-your-own-operator-on-macos-1.ipynb
│   │   └── build-your-own-operator-on-macos-2.ipynb
│   ├── composite_agents_docker_nb.ipynb
│   ├── computer_nb.ipynb
│   ├── computer_server_nb.ipynb
│   ├── customizing_computeragent.ipynb
│   ├── eval_osworld.ipynb
│   ├── ollama_nb.ipynb
│   ├── pylume_nb.ipynb
│   ├── README.md
│   ├── sota_hackathon_cloud.ipynb
│   └── sota_hackathon.ipynb
├── pdm.lock
├── pyproject.toml
├── pyrightconfig.json
├── README.md
├── samples
│   └── community
│       ├── global-online
│       │   └── README.md
│       └── hack-the-north
│           └── README.md
├── scripts
│   ├── build-uv.sh
│   ├── build.ps1
│   ├── build.sh
│   ├── cleanup.sh
│   ├── playground-docker.sh
│   ├── playground.sh
│   └── run-docker-dev.sh
└── tests
    ├── pytest.ini
    ├── shell_cmd.py
    ├── test_files.py
    ├── test_shell_bash.py
    ├── test_telemetry.py
    ├── test_venv.py
    └── test_watchdog.py
```

# Files

--------------------------------------------------------------------------------
/docs/content/docs/libraries/lume/http-api.mdx:
--------------------------------------------------------------------------------

```markdown
   1 | ---
   2 | title: HTTP Server API
   3 | description: Lume exposes a local HTTP API server that listens at localhost for programmatic management of VMs.
   4 | ---
   5 | 
   6 | import { Tabs, Tab } from 'fumadocs-ui/components/tabs';
   7 | import { Callout } from 'fumadocs-ui/components/callout';
   8 | 
   9 | ## Default URL
  10 | 
  11 | ```
  12 | http://localhost:7777
  13 | ```
  14 | 
  15 | <Callout type="info">
  16 |   The HTTP API service runs on port `7777` by default. If you'd like to use a
  17 |   different port, pass the `--port` option during installation or when running
  18 |   `lume serve`.
  19 | </Callout>
  20 | 
  21 | ## Endpoints
  22 | 
  23 | ---
  24 | 
  25 | ### Create VM
  26 | 
  27 | Create a new virtual machine.
  28 | 
  29 | `POST: /lume/vms`
  30 | 
  31 | #### Parameters
  32 | 
  33 | | Name     | Type    | Required | Description                          |
  34 | | -------- | ------- | -------- | ------------------------------------ |
  35 | | name     | string  | Yes      | Name of the VM                       |
  36 | | os       | string  | Yes      | Guest OS (`macOS`, `linux`, etc.)    |
  37 | | cpu      | integer | Yes      | Number of CPU cores                  |
  38 | | memory   | string  | Yes      | Memory size (e.g. `4GB`)             |
  39 | | diskSize | string  | Yes      | Disk size (e.g. `64GB`)              |
  40 | | display  | string  | No       | Display resolution (e.g. `1024x768`) |
  41 | | ipsw     | string  | No       | IPSW version (e.g. `latest`)         |
  42 | | storage  | string  | No       | Storage type (`ssd`, etc.)           |
  43 | 
  44 | #### Example Request
  45 | 
  46 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
  47 |   <Tab value="Curl">
  48 | 
  49 | ```bash
  50 | curl --connect-timeout 5000 \
  51 |   --max-time 6000 \
  52 |   -X POST \
  53 |   -H "Content-Type: application/json" \
  54 |   -d '{
  55 |     "name": "lume_vm",
  56 |     "os": "macOS",
  57 |     "cpu": 2,
  58 |     "memory": "4GB",
  59 |     "diskSize": "64GB",
  60 |     "display": "1024x768",
  61 |     "ipsw": "latest",
  62 |     "storage": "ssd"
  63 |   }' \
  64 |   http://localhost:7777/lume/vms
  65 | ```
  66 | 
  67 |   </Tab>
  68 |   <Tab value="Python">
  69 | 
  70 | ```python
  71 | import requests
  72 | 
  73 | payload = {
  74 |     "name": "lume_vm",
  75 |     "os": "macOS",
  76 |     "cpu": 2,
  77 |     "memory": "4GB",
  78 |     "diskSize": "64GB",
  79 |     "display": "1024x768",
  80 |     "ipsw": "latest",
  81 |     "storage": "ssd"
  82 | }
  83 | r = requests.post("http://localhost:7777/lume/vms", json=payload, timeout=50)
  84 | print(r.json())
  85 | ```
  86 | 
  87 |   </Tab>
  88 |   <Tab value="TypeScript">
  89 | 
  90 | ```typescript
  91 | const payload = {
  92 |   name: 'lume_vm',
  93 |   os: 'macOS',
  94 |   cpu: 2,
  95 |   memory: '4GB',
  96 |   diskSize: '64GB',
  97 |   display: '1024x768',
  98 |   ipsw: 'latest',
  99 |   storage: 'ssd',
 100 | };
 101 | 
 102 | const res = await fetch('http://localhost:7777/lume/vms', {
 103 |   method: 'POST',
 104 |   headers: { 'Content-Type': 'application/json' },
 105 |   body: JSON.stringify(payload),
 106 | });
 107 | console.log(await res.json());
 108 | ```
 109 | 
 110 |   </Tab>
 111 | </Tabs>
 112 | 
 113 | ---
 114 | 
 115 | ### Run VM
 116 | 
 117 | Run a virtual machine instance.
 118 | 
 119 | `POST: /lume/vms/:name/run`
 120 | 
 121 | #### Parameters
 122 | 
 123 | | Name              | Type            | Required | Description                                         |
 124 | | ----------------- | --------------- | -------- | --------------------------------------------------- |
 125 | | noDisplay         | boolean         | No       | If true, do not start VNC client                    |
 126 | | sharedDirectories | array of object | No       | List of shared directories (`hostPath`, `readOnly`) |
 127 | | recoveryMode      | boolean         | No       | Start in recovery mode                              |
 128 | | storage           | string          | No       | Storage type (`ssd`, etc.)                          |
 129 | 
 130 | #### Example Request
 131 | 
 132 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
 133 |   <Tab value="Curl">
 134 | 
 135 | ```bash
 136 | # Basic run
 137 | curl --connect-timeout 5000 \
 138 |   --max-time 6000 \
 139 |   -X POST \
 140 |   http://localhost:7777/lume/vms/my-vm-name/run
 141 | 
 142 | # Run with VNC client started and shared directory
 143 | curl --connect-timeout 5000 \
 144 |   --max-time 6000 \
 145 |   -X POST \
 146 |   -H "Content-Type: application/json" \
 147 |   -d '{
 148 |     "noDisplay": false,
 149 |     "sharedDirectories": [
 150 |       {
 151 |         "hostPath": "~/Projects",
 152 |         "readOnly": false
 153 |       }
 154 |     ],
 155 |     "recoveryMode": false,
 156 |     "storage": "ssd"
 157 |   }' \
 158 |   http://localhost:7777/lume/vms/lume_vm/run
 159 | ```
 160 | 
 161 |   </Tab>
 162 |   <Tab value="Python">
 163 | 
 164 | ```python
 165 | import requests
 166 | 
 167 | # Basic run
 168 | r = requests.post("http://localhost:7777/lume/vms/my-vm-name/run", timeout=50)
 169 | print(r.json())
 170 | 
 171 | # With VNC and shared directory
 172 | payload = {
 173 |     "noDisplay": False,
 174 |     "sharedDirectories": [
 175 |         {"hostPath": "~/Projects", "readOnly": False}
 176 |     ],
 177 |     "recoveryMode": False,
 178 |     "storage": "ssd"
 179 | }
 180 | r = requests.post("http://localhost:7777/lume/vms/lume_vm/run", json=payload, timeout=50)
 181 | print(r.json())
 182 | ```
 183 | 
 184 |   </Tab>
 185 |   <Tab value="TypeScript">
 186 | 
 187 | ```typescript
 188 | // Basic run
 189 | let res = await fetch('http://localhost:7777/lume/vms/my-vm-name/run', {
 190 |   method: 'POST',
 191 | });
 192 | console.log(await res.json());
 193 | 
 194 | // With VNC and shared directory
 195 | const payload = {
 196 |   noDisplay: false,
 197 |   sharedDirectories: [{ hostPath: '~/Projects', readOnly: false }],
 198 |   recoveryMode: false,
 199 |   storage: 'ssd',
 200 | };
 201 | res = await fetch('http://localhost:7777/lume/vms/lume_vm/run', {
 202 |   method: 'POST',
 203 |   headers: { 'Content-Type': 'application/json' },
 204 |   body: JSON.stringify(payload),
 205 | });
 206 | console.log(await res.json());
 207 | ```
 208 | 
 209 |   </Tab>
 210 | </Tabs>
 211 | 
 212 | ---
 213 | 
 214 | ### List VMs
 215 | 
 216 | List all virtual machines.
 217 | 
 218 | `GET: /lume/vms`
 219 | 
 220 | #### Example Request
 221 | 
 222 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
 223 |   <Tab value="Curl">
 224 | 
 225 | ```bash
 226 | curl --connect-timeout 5000 \
 227 |   --max-time 6000 \
 228 |   http://localhost:7777/lume/vms
 229 | ```
 230 | 
 231 |   </Tab>
 232 |   <Tab value="Python">
 233 | 
 234 | ```python
 235 | import requests
 236 | 
 237 | r = requests.get("http://localhost:7777/lume/vms", timeout=50)
 238 | print(r.json())
 239 | ```
 240 | 
 241 |   </Tab>
 242 |   <Tab value="TypeScript">
 243 | 
 244 | ```typescript
 245 | const res = await fetch('http://localhost:7777/lume/vms');
 246 | console.log(await res.json());
 247 | ```
 248 | 
 249 |   </Tab>
 250 | </Tabs>
 251 | 
 252 | ```json
 253 | [
 254 |   {
 255 |     "name": "my-vm",
 256 |     "state": "stopped",
 257 |     "os": "macOS",
 258 |     "cpu": 2,
 259 |     "memory": "4GB",
 260 |     "diskSize": "64GB"
 261 |   },
 262 |   {
 263 |     "name": "my-vm-2",
 264 |     "state": "stopped",
 265 |     "os": "linux",
 266 |     "cpu": 2,
 267 |     "memory": "4GB",
 268 |     "diskSize": "64GB"
 269 |   }
 270 | ]
 271 | ```
 272 | 
 273 | ---
 274 | 
 275 | ### Get VM Details
 276 | 
 277 | Get details for a specific virtual machine.
 278 | 
 279 | `GET: /lume/vms/:name`
 280 | 
 281 | #### Parameters
 282 | 
 283 | | Name    | Type   | Required | Description                |
 284 | | ------- | ------ | -------- | -------------------------- |
 285 | | storage | string | No       | Storage type (`ssd`, etc.) |
 286 | 
 287 | #### Example Request
 288 | 
 289 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
 290 |   <Tab value="Curl">
 291 | 
 292 | ```bash
 293 | # Basic get
 294 | curl --connect-timeout 5000 \
 295 |   --max-time 6000 \
 296 |   http://localhost:7777/lume/vms/lume_vm
 297 | 
 298 | # Get with specific storage
 299 | curl --connect-timeout 5000 \
 300 |   --max-time 6000 \
 301 |   http://localhost:7777/lume/vms/lume_vm?storage=ssd
 302 | ```
 303 | 
 304 |   </Tab>
 305 |   <Tab value="Python">
 306 | 
 307 | ```python
 308 | import requests
 309 | 
 310 | # Basic get
 311 | details = requests.get("http://localhost:7777/lume/vms/lume_vm", timeout=50)
 312 | print(details.json())
 313 | 
 314 | # Get with specific storage
 315 | details = requests.get("http://localhost:7777/lume/vms/lume_vm", params={"storage": "ssd"}, timeout=50)
 316 | print(details.json())
 317 | ```
 318 | 
 319 |   </Tab>
 320 |   <Tab value="TypeScript">
 321 | 
 322 | ```typescript
 323 | // Basic get
 324 | let res = await fetch('http://localhost:7777/lume/vms/lume_vm');
 325 | console.log(await res.json());
 326 | 
 327 | // Get with specific storage
 328 | res = await fetch('http://localhost:7777/lume/vms/lume_vm?storage=ssd');
 329 | console.log(await res.json());
 330 | ```
 331 | 
 332 |   </Tab>
 333 | </Tabs>
 334 | 
 335 | ```json
 336 | {
 337 |   "name": "lume_vm",
 338 |   "state": "stopped",
 339 |   "os": "macOS",
 340 |   "cpu": 2,
 341 |   "memory": "4GB",
 342 |   "diskSize": "64GB",
 343 |   "display": "1024x768",
 344 |   "ipAddress": "192.168.65.2",
 345 |   "vncPort": 5900,
 346 |   "sharedDirectories": [
 347 |     {
 348 |       "hostPath": "~/Projects",
 349 |       "readOnly": false,
 350 |       "tag": "com.apple.virtio-fs.automount"
 351 |     }
 352 |   ]
 353 | }
 354 | ```
 355 | 
 356 | ---
 357 | 
 358 | ### Update VM Configuration
 359 | 
 360 | Update the configuration of a virtual machine.
 361 | 
 362 | `PATCH: /lume/vms/:name`
 363 | 
 364 | #### Parameters
 365 | 
 366 | | Name     | Type    | Required | Description                           |
 367 | | -------- | ------- | -------- | ------------------------------------- |
 368 | | cpu      | integer | No       | Number of CPU cores                   |
 369 | | memory   | string  | No       | Memory size (e.g. `8GB`)              |
 370 | | diskSize | string  | No       | Disk size (e.g. `100GB`)              |
 371 | | display  | string  | No       | Display resolution (e.g. `1920x1080`) |
| storage  | string  | No       | Named storage location (e.g. `ssd`)   |
 373 | 
 374 | #### Example Request
 375 | 
 376 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
 377 |   <Tab value="Curl">
 378 | 
 379 | ```bash
 380 | curl --connect-timeout 6000 \
 381 |   --max-time 5000 \
 382 |   -X PATCH \
 383 |   -H "Content-Type: application/json" \
 384 |   -d '{
 385 |     "cpu": 4,
 386 |     "memory": "8GB",
 387 |     "diskSize": "100GB",
 388 |     "display": "1920x1080",
 389 |     "storage": "ssd"
 390 |   }' \
 391 |   http://localhost:7777/lume/vms/lume_vm
 392 | ```
 393 | 
 394 |   </Tab>
 395 |   <Tab value="Python">
 396 | 
 397 | ```python
 398 | import requests
 399 | 
 400 | payload = {
 401 |     "cpu": 4,
 402 |     "memory": "8GB",
 403 |     "diskSize": "100GB",
 404 |     "display": "1920x1080",
 405 |     "storage": "ssd"
 406 | }
 407 | r = requests.patch("http://localhost:7777/lume/vms/lume_vm", json=payload, timeout=50)
 408 | print(r.json())
 409 | ```
 410 | 
 411 |   </Tab>
 412 |   <Tab value="TypeScript">
 413 | 
 414 | ```typescript
 415 | const payload = {
 416 |   cpu: 4,
 417 |   memory: '8GB',
 418 |   diskSize: '100GB',
 419 |   display: '1920x1080',
 420 |   storage: 'ssd',
 421 | };
 422 | const res = await fetch('http://localhost:7777/lume/vms/lume_vm', {
 423 |   method: 'PATCH',
 424 |   headers: { 'Content-Type': 'application/json' },
 425 |   body: JSON.stringify(payload),
 426 | });
 427 | console.log(await res.json());
 428 | ```
 429 | 
 430 |   </Tab>
 431 | </Tabs>
 432 | 
 433 | ---
 434 | 
 435 | ### Stop VM
 436 | 
 437 | Stop a running virtual machine.
 438 | 
 439 | `POST: /lume/vms/:name/stop`
 440 | 
 441 | #### Parameters
 442 | 
 443 | | Name    | Type   | Required | Description                |
 444 | | ------- | ------ | -------- | -------------------------- |
 445 | | storage | string | No       | Storage type (`ssd`, etc.) |
 446 | 
 447 | #### Example Request
 448 | 
 449 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
 450 |   <Tab value="Curl">
 451 | 
 452 | ```bash
 453 | # Basic stop
 454 | curl --connect-timeout 6000 \
 455 |   --max-time 5000 \
 456 |   -X POST \
 457 |   http://localhost:7777/lume/vms/lume_vm/stop
 458 | 
 459 | # Stop with storage location specified
 460 | curl --connect-timeout 6000 \
 461 |   --max-time 5000 \
 462 |   -X POST \
 463 |   http://localhost:7777/lume/vms/lume_vm/stop?storage=ssd
 464 | ```
 465 | 
 466 |   </Tab>
 467 |   <Tab value="Python">
 468 | 
 469 | ```python
 470 | import requests
 471 | 
 472 | # Basic stop
 473 | r = requests.post("http://localhost:7777/lume/vms/lume_vm/stop", timeout=50)
 474 | print(r.json())
 475 | 
 476 | # Stop with storage location specified
 477 | r = requests.post("http://localhost:7777/lume/vms/lume_vm/stop", params={"storage": "ssd"}, timeout=50)
 478 | print(r.json())
 479 | ```
 480 | 
 481 |   </Tab>
 482 |   <Tab value="TypeScript">
 483 | 
 484 | ```typescript
 485 | // Basic stop
 486 | let res = await fetch('http://localhost:7777/lume/vms/lume_vm/stop', {
 487 |   method: 'POST',
 488 | });
 489 | console.log(await res.json());
 490 | 
 491 | // Stop with storage location specified
 492 | res = await fetch('http://localhost:7777/lume/vms/lume_vm/stop?storage=ssd', {
 493 |   method: 'POST',
 494 | });
 495 | console.log(await res.json());
 496 | ```
 497 | 
 498 |   </Tab>
 499 | </Tabs>
 500 | 
 501 | ---
 502 | 
 503 | ### Delete VM
 504 | 
 505 | Delete a virtual machine instance.
 506 | 
 507 | `DELETE: /lume/vms/:name`
 508 | 
 509 | #### Parameters
 510 | 
 511 | | Name    | Type   | Required | Description                |
 512 | | ------- | ------ | -------- | -------------------------- |
 513 | | storage | string | No       | Storage type (`ssd`, etc.) |
 514 | 
 515 | #### Example Request
 516 | 
 517 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
 518 |   <Tab value="Curl">
 519 | 
 520 | ```bash
 521 | # Basic delete
 522 | curl --connect-timeout 6000 \
 523 |   --max-time 5000 \
 524 |   -X DELETE \
 525 |   http://localhost:7777/lume/vms/lume_vm
 526 | 
 527 | # Delete with specific storage
 528 | curl --connect-timeout 6000 \
 529 |   --max-time 5000 \
 530 |   -X DELETE \
 531 |   http://localhost:7777/lume/vms/lume_vm?storage=ssd
 532 | ```
 533 | 
 534 |   </Tab>
 535 |   <Tab value="Python">
 536 | 
 537 | ```python
 538 | import requests
 539 | 
 540 | # Basic delete
 541 | r = requests.delete("http://localhost:7777/lume/vms/lume_vm", timeout=50)
 542 | print(r.status_code)
 543 | 
 544 | # Delete with specific storage
 545 | r = requests.delete("http://localhost:7777/lume/vms/lume_vm", params={"storage": "ssd"}, timeout=50)
 546 | print(r.status_code)
 547 | ```
 548 | 
 549 |   </Tab>
 550 |   <Tab value="TypeScript">
 551 | 
 552 | ```typescript
 553 | // Basic delete
 554 | let res = await fetch('http://localhost:7777/lume/vms/lume_vm', {
 555 |   method: 'DELETE',
 556 | });
 557 | console.log(res.status);
 558 | 
 559 | // Delete with specific storage
 560 | res = await fetch('http://localhost:7777/lume/vms/lume_vm?storage=ssd', {
 561 |   method: 'DELETE',
 562 | });
 563 | console.log(res.status);
 564 | ```
 565 | 
 566 |   </Tab>
 567 | </Tabs>
 568 | 
 569 | ---
 570 | 
 571 | ### Clone VM
 572 | 
 573 | Clone an existing virtual machine.
 574 | 
 575 | `POST: /lume/vms/clone`
 576 | 
 577 | #### Parameters
 578 | 
 579 | | Name           | Type   | Required | Description                         |
 580 | | -------------- | ------ | -------- | ----------------------------------- |
 581 | | name           | string | Yes      | Source VM name                      |
 582 | | newName        | string | Yes      | New VM name                         |
 583 | | sourceLocation | string | No       | Source storage location (`default`) |
 584 | | destLocation   | string | No       | Destination storage location        |
 585 | 
 586 | #### Example Request
 587 | 
 588 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
 589 |   <Tab value="Curl">
 590 | 
 591 | ```bash
 592 | curl --connect-timeout 6000 \
 593 |   --max-time 5000 \
 594 |   -X POST \
 595 |   -H "Content-Type: application/json" \
 596 |   -d '{
 597 |     "name": "source-vm",
 598 |     "newName": "cloned-vm",
 599 |     "sourceLocation": "default",
 600 |     "destLocation": "ssd"
 601 |   }' \
 602 |   http://localhost:7777/lume/vms/clone
 603 | ```
 604 | 
 605 |   </Tab>
 606 |   <Tab value="Python">
 607 | 
 608 | ```python
 609 | import requests
 610 | 
 611 | payload = {
 612 |     "name": "source-vm",
 613 |     "newName": "cloned-vm",
 614 |     "sourceLocation": "default",
 615 |     "destLocation": "ssd"
 616 | }
 617 | r = requests.post("http://localhost:7777/lume/vms/clone", json=payload, timeout=50)
 618 | print(r.json())
 619 | ```
 620 | 
 621 |   </Tab>
 622 |   <Tab value="TypeScript">
 623 | 
 624 | ```typescript
 625 | const payload = {
 626 |   name: 'source-vm',
 627 |   newName: 'cloned-vm',
 628 |   sourceLocation: 'default',
 629 |   destLocation: 'ssd',
 630 | };
 631 | const res = await fetch('http://localhost:7777/lume/vms/clone', {
 632 |   method: 'POST',
 633 |   headers: { 'Content-Type': 'application/json' },
 634 |   body: JSON.stringify(payload),
 635 | });
 636 | console.log(await res.json());
 637 | ```
 638 | 
 639 |   </Tab>
 640 | </Tabs>
 641 | 
 642 | ---
 643 | 
 644 | ### Pull VM Image
 645 | 
 646 | Pull a VM image from a registry.
 647 | 
 648 | `POST: /lume/pull`
 649 | 
 650 | #### Parameters
 651 | 
 652 | | Name         | Type   | Required | Description                           |
 653 | | ------------ | ------ | -------- | ------------------------------------- |
 654 | | image        | string | Yes      | Image name (e.g. `macos-sequoia-...`) |
 655 | | name         | string | No       | VM name for the pulled image          |
 656 | | registry     | string | No       | Registry host (e.g. `ghcr.io`)        |
 657 | | organization | string | No       | Organization name                     |
 658 | | storage      | string | No       | Storage type (`ssd`, etc.)            |
 659 | 
 660 | #### Example Request
 661 | 
 662 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
 663 |   <Tab value="Curl">
 664 | 
 665 | ```bash
 666 | curl --connect-timeout 6000 \
 667 |   --max-time 5000 \
 668 |   -X POST \
 669 |   -H "Content-Type: application/json" \
 670 |   -d '{
 671 |     "image": "macos-sequoia-vanilla:latest",
 672 |     "name": "my-vm-name",
 673 |     "registry": "ghcr.io",
 674 |     "organization": "trycua",
 675 |     "storage": "ssd"
 676 |   }' \
 677 |   http://localhost:7777/lume/pull
 678 | ```
 679 | 
 680 |   </Tab>
 681 |   <Tab value="Python">
 682 | 
 683 | ```python
 684 | import requests
 685 | 
 686 | payload = {
 687 |     "image": "macos-sequoia-vanilla:latest",
 688 |     "name": "my-vm-name",
 689 |     "registry": "ghcr.io",
 690 |     "organization": "trycua",
 691 |     "storage": "ssd"
 692 | }
 693 | r = requests.post("http://localhost:7777/lume/pull", json=payload, timeout=50)
 694 | print(r.json())
 695 | ```
 696 | 
 697 |   </Tab>
 698 |   <Tab value="TypeScript">
 699 | 
 700 | ```typescript
 701 | const payload = {
 702 |   image: 'macos-sequoia-vanilla:latest',
 703 |   name: 'my-vm-name',
 704 |   registry: 'ghcr.io',
 705 |   organization: 'trycua',
 706 |   storage: 'ssd',
 707 | };
 708 | const res = await fetch('http://localhost:7777/lume/pull', {
 709 |   method: 'POST',
 710 |   headers: { 'Content-Type': 'application/json' },
 711 |   body: JSON.stringify(payload),
 712 | });
 713 | console.log(await res.json());
 714 | ```
 715 | 
 716 |   </Tab>
 717 | </Tabs>
 718 | 
 719 | ---
 720 | 
 721 | ### Push VM Image
 722 | 
 723 | Push a VM to a registry as an image (asynchronous operation).
 724 | 
 725 | `POST: /lume/vms/push`
 726 | 
 727 | #### Parameters
 728 | 
 729 | | Name         | Type         | Required | Description                                     |
 730 | | ------------ | ------------ | -------- | ----------------------------------------------- |
 731 | | name         | string       | Yes      | Local VM name to push                           |
 732 | | imageName    | string       | Yes      | Image name in registry                          |
 733 | | tags         | array        | Yes      | Image tags (e.g. `["latest", "v1"]`)           |
 734 | | organization | string       | Yes      | Organization name                               |
 735 | | registry     | string       | No       | Registry host (e.g. `ghcr.io`)                  |
 736 | | chunkSizeMb  | integer      | No       | Chunk size in MB for upload                     |
| storage      | string/null  | No       | Named storage location (e.g. `ssd`); `null` = default |
 738 | 
 739 | #### Example Request
 740 | 
 741 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
 742 |   <Tab value="Curl">
 743 | 
 744 | ```bash
 745 | curl --connect-timeout 6000 \
 746 |   --max-time 5000 \
 747 |   -X POST \
 748 |   -H "Content-Type: application/json" \
 749 |   -d '{
 750 |     "name": "my-local-vm", 
 751 |     "imageName": "my-image",
 752 |     "tags": ["latest", "v1"],
 753 |     "organization": "my-org", 
 754 |     "registry": "ghcr.io",
 755 |     "chunkSizeMb": 512,
 756 |     "storage": null 
 757 |   }' \
 758 |   http://localhost:7777/lume/vms/push
 759 | ```
 760 | 
 761 |   </Tab>
 762 |   <Tab value="Python">
 763 | 
 764 | ```python
 765 | import requests
 766 | 
 767 | payload = {
 768 |     "name": "my-local-vm",
 769 |     "imageName": "my-image",
 770 |     "tags": ["latest", "v1"],
 771 |     "organization": "my-org",
 772 |     "registry": "ghcr.io",
 773 |     "chunkSizeMb": 512,
 774 |     "storage": None
 775 | }
 776 | r = requests.post("http://localhost:7777/lume/vms/push", json=payload, timeout=50)
 777 | print(r.json())
 778 | ```
 779 | 
 780 |   </Tab>
 781 |   <Tab value="TypeScript">
 782 | 
 783 | ```typescript
 784 | const payload = {
 785 |   name: 'my-local-vm',
 786 |   imageName: 'my-image',
 787 |   tags: ['latest', 'v1'],
 788 |   organization: 'my-org',
 789 |   registry: 'ghcr.io',
 790 |   chunkSizeMb: 512,
 791 |   storage: null,
 792 | };
 793 | const res = await fetch('http://localhost:7777/lume/vms/push', {
 794 |   method: 'POST',
 795 |   headers: { 'Content-Type': 'application/json' },
 796 |   body: JSON.stringify(payload),
 797 | });
 798 | console.log(await res.json());
 799 | ```
 800 | 
 801 |   </Tab>
 802 | </Tabs>
 803 | 
 804 | **Response (202 Accepted):**
 805 | 
 806 | ```json
 807 | {
 808 |   "message": "Push initiated in background",
 809 |   "name": "my-local-vm",
 810 |   "imageName": "my-image",
 811 |   "tags": [
 812 |     "latest",
 813 |     "v1"
 814 |   ]
 815 | }
 816 | ```
 817 | 
 818 | ---
 819 | 
 820 | ### List Images
 821 | 
 822 | List available VM images.
 823 | 
 824 | `GET: /lume/images`
 825 | 
 826 | #### Example Request
 827 | 
 828 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
 829 |   <Tab value="Curl">
 830 | 
 831 | ```bash
 832 | curl --connect-timeout 6000 \
 833 |   --max-time 5000 \
 834 |   http://localhost:7777/lume/images
 835 | ```
 836 | 
 837 |   </Tab>
 838 |   <Tab value="Python">
 839 | 
 840 | ```python
 841 | import requests
 842 | 
 843 | r = requests.get("http://localhost:7777/lume/images", timeout=50)
 844 | print(r.json())
 845 | ```
 846 | 
 847 |   </Tab>
 848 |   <Tab value="TypeScript">
 849 | 
 850 | ```typescript
 851 | const res = await fetch('http://localhost:7777/lume/images');
 852 | console.log(await res.json());
 853 | ```
 854 | 
 855 |   </Tab>
 856 | </Tabs>
 857 | 
 858 | ```json
 859 | {
 860 |   "local": [
 861 |     "macos-sequoia-xcode:latest",
 862 |     "macos-sequoia-vanilla:latest"
 863 |   ]
 864 | }
 865 | ```
 866 | 
 867 | ---
 868 | 
 869 | ### Prune Images
 870 | 
 871 | Remove unused VM images to free up disk space.
 872 | 
 873 | `POST: /lume/prune`
 874 | 
 875 | #### Example Request
 876 | 
 877 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
 878 |   <Tab value="Curl">
 879 | 
 880 | ```bash
 881 | curl --connect-timeout 6000 \
 882 |   --max-time 5000 \
 883 |   -X POST \
 884 |   http://localhost:7777/lume/prune
 885 | ```
 886 | 
 887 |   </Tab>
 888 |   <Tab value="Python">
 889 | 
 890 | ```python
 891 | import requests
 892 | 
 893 | r = requests.post("http://localhost:7777/lume/prune", timeout=50)
 894 | print(r.json())
 895 | ```
 896 | 
 897 |   </Tab>
 898 |   <Tab value="TypeScript">
 899 | 
 900 | ```typescript
 901 | const res = await fetch('http://localhost:7777/lume/prune', {
 902 |   method: 'POST',
 903 | });
 904 | console.log(await res.json());
 905 | ```
 906 | 
 907 |   </Tab>
 908 | </Tabs>
 909 | 
 910 | ---
 911 | 
 912 | ### Get Latest IPSW URL
 913 | 
 914 | Get the URL for the latest macOS IPSW file.
 915 | 
 916 | `GET: /lume/ipsw`
 917 | 
 918 | #### Example Request
 919 | 
 920 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
 921 |   <Tab value="Curl">
 922 | 
 923 | ```bash
 924 | curl --connect-timeout 6000 \
 925 |   --max-time 5000 \
 926 |   http://localhost:7777/lume/ipsw
 927 | ```
 928 | 
 929 |   </Tab>
 930 |   <Tab value="Python">
 931 | 
 932 | ```python
 933 | import requests
 934 | 
 935 | r = requests.get("http://localhost:7777/lume/ipsw", timeout=50)
 936 | print(r.json())
 937 | ```
 938 | 
 939 |   </Tab>
 940 |   <Tab value="TypeScript">
 941 | 
 942 | ```typescript
 943 | const res = await fetch('http://localhost:7777/lume/ipsw');
 944 | console.log(await res.json());
 945 | ```
 946 | 
 947 |   </Tab>
 948 | </Tabs>
 949 | 
 950 | ---
 951 | 
 952 | ## Configuration Management
 953 | 
 954 | ### Get Configuration
 955 | 
 956 | Get current Lume configuration settings.
 957 | 
 958 | `GET: /lume/config`
 959 | 
 960 | #### Example Request
 961 | 
 962 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
 963 |   <Tab value="Curl">
 964 | 
 965 | ```bash
 966 | curl --connect-timeout 6000 \
 967 |   --max-time 5000 \
 968 |   http://localhost:7777/lume/config
 969 | ```
 970 | 
 971 |   </Tab>
 972 |   <Tab value="Python">
 973 | 
 974 | ```python
 975 | import requests
 976 | 
 977 | r = requests.get("http://localhost:7777/lume/config", timeout=50)
 978 | print(r.json())
 979 | ```
 980 | 
 981 |   </Tab>
 982 |   <Tab value="TypeScript">
 983 | 
 984 | ```typescript
 985 | const res = await fetch('http://localhost:7777/lume/config');
 986 | console.log(await res.json());
 987 | ```
 988 | 
 989 |   </Tab>
 990 | </Tabs>
 991 | 
 992 | ```json
 993 | {
 994 |   "homeDirectory": "~/.lume",
 995 |   "cacheDirectory": "~/.lume/cache",
 996 |   "cachingEnabled": true
 997 | }
 998 | ```
 999 | 
1000 | ### Update Configuration
1001 | 
1002 | Update Lume configuration settings.
1003 | 
1004 | `POST: /lume/config`
1005 | 
1006 | #### Parameters
1007 | 
1008 | | Name            | Type    | Required | Description                      |
1009 | | --------------- | ------- | -------- | -------------------------------- |
1010 | | homeDirectory   | string  | No       | Lume home directory path         |
1011 | | cacheDirectory  | string  | No       | Cache directory path             |
1012 | | cachingEnabled  | boolean | No       | Enable or disable caching        |
1013 | 
1014 | #### Example Request
1015 | 
1016 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
1017 |   <Tab value="Curl">
1018 | 
1019 | ```bash
1020 | curl --connect-timeout 6000 \
1021 |   --max-time 5000 \
1022 |   -X POST \
1023 |   -H "Content-Type: application/json" \
1024 |   -d '{
1025 |     "homeDirectory": "~/custom/lume",
1026 |     "cacheDirectory": "~/custom/lume/cache",
1027 |     "cachingEnabled": true
1028 |   }' \
1029 |   http://localhost:7777/lume/config
1030 | ```
1031 | 
1032 |   </Tab>
1033 |   <Tab value="Python">
1034 | 
1035 | ```python
1036 | import requests
1037 | 
1038 | payload = {
1039 |     "homeDirectory": "~/custom/lume",
1040 |     "cacheDirectory": "~/custom/lume/cache",
1041 |     "cachingEnabled": True
1042 | }
1043 | r = requests.post("http://localhost:7777/lume/config", json=payload, timeout=50)
1044 | print(r.json())
1045 | ```
1046 | 
1047 |   </Tab>
1048 |   <Tab value="TypeScript">
1049 | 
1050 | ```typescript
1051 | const payload = {
1052 |   homeDirectory: '~/custom/lume',
1053 |   cacheDirectory: '~/custom/lume/cache',
1054 |   cachingEnabled: true,
1055 | };
1056 | const res = await fetch('http://localhost:7777/lume/config', {
1057 |   method: 'POST',
1058 |   headers: { 'Content-Type': 'application/json' },
1059 |   body: JSON.stringify(payload),
1060 | });
1061 | console.log(await res.json());
1062 | ```
1063 | 
1064 |   </Tab>
1065 | </Tabs>
1066 | 
1067 | ---
1068 | 
1069 | ## Storage Location Management
1070 | 
1071 | ### Get VM Storage Locations
1072 | 
1073 | List all configured VM storage locations.
1074 | 
1075 | `GET: /lume/config/locations`
1076 | 
1077 | #### Example Request
1078 | 
1079 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
1080 |   <Tab value="Curl">
1081 | 
1082 | ```bash
1083 | curl --connect-timeout 6000 \
1084 |   --max-time 5000 \
1085 |   http://localhost:7777/lume/config/locations
1086 | ```
1087 | 
1088 |   </Tab>
1089 |   <Tab value="Python">
1090 | 
1091 | ```python
1092 | import requests
1093 | 
1094 | r = requests.get("http://localhost:7777/lume/config/locations", timeout=50)
1095 | print(r.json())
1096 | ```
1097 | 
1098 |   </Tab>
1099 |   <Tab value="TypeScript">
1100 | 
1101 | ```typescript
1102 | const res = await fetch('http://localhost:7777/lume/config/locations');
1103 | console.log(await res.json());
1104 | ```
1105 | 
1106 |   </Tab>
1107 | </Tabs>
1108 | 
1109 | ```json
1110 | [
1111 |   {
1112 |     "name": "default",
1113 |     "path": "~/.lume/vms",
1114 |     "isDefault": true
1115 |   },
1116 |   {
1117 |     "name": "ssd",
1118 |     "path": "/Volumes/SSD/lume/vms",
1119 |     "isDefault": false
1120 |   }
1121 | ]
1122 | ```
1123 | 
1124 | ### Add VM Storage Location
1125 | 
1126 | Add a new VM storage location.
1127 | 
1128 | `POST: /lume/config/locations`
1129 | 
1130 | #### Parameters
1131 | 
1132 | | Name | Type   | Required | Description                  |
1133 | | ---- | ------ | -------- | ---------------------------- |
1134 | | name | string | Yes      | Storage location name        |
1135 | | path | string | Yes      | File system path for storage |
1136 | 
1137 | #### Example Request
1138 | 
1139 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
1140 |   <Tab value="Curl">
1141 | 
1142 | ```bash
1143 | curl --connect-timeout 6000 \
1144 |   --max-time 5000 \
1145 |   -X POST \
1146 |   -H "Content-Type: application/json" \
1147 |   -d '{
1148 |     "name": "ssd",
1149 |     "path": "/Volumes/SSD/lume/vms"
1150 |   }' \
1151 |   http://localhost:7777/lume/config/locations
1152 | ```
1153 | 
1154 |   </Tab>
1155 |   <Tab value="Python">
1156 | 
1157 | ```python
1158 | import requests
1159 | 
1160 | payload = {
1161 |     "name": "ssd",
1162 |     "path": "/Volumes/SSD/lume/vms"
1163 | }
1164 | r = requests.post("http://localhost:7777/lume/config/locations", json=payload, timeout=50)
1165 | print(r.json())
1166 | ```
1167 | 
1168 |   </Tab>
1169 |   <Tab value="TypeScript">
1170 | 
1171 | ```typescript
1172 | const payload = {
1173 |   name: 'ssd',
1174 |   path: '/Volumes/SSD/lume/vms',
1175 | };
1176 | const res = await fetch('http://localhost:7777/lume/config/locations', {
1177 |   method: 'POST',
1178 |   headers: { 'Content-Type': 'application/json' },
1179 |   body: JSON.stringify(payload),
1180 | });
1181 | console.log(await res.json());
1182 | ```
1183 | 
1184 |   </Tab>
1185 | </Tabs>
1186 | 
1187 | ### Remove VM Storage Location
1188 | 
1189 | Remove a VM storage location.
1190 | 
1191 | `DELETE: /lume/config/locations/:name`
1192 | 
1193 | #### Example Request
1194 | 
1195 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
1196 |   <Tab value="Curl">
1197 | 
1198 | ```bash
1199 | curl --connect-timeout 6000 \
1200 |   --max-time 5000 \
1201 |   -X DELETE \
1202 |   http://localhost:7777/lume/config/locations/ssd
1203 | ```
1204 | 
1205 |   </Tab>
1206 |   <Tab value="Python">
1207 | 
1208 | ```python
1209 | import requests
1210 | 
1211 | r = requests.delete("http://localhost:7777/lume/config/locations/ssd", timeout=50)
1212 | print(r.status_code)
1213 | ```
1214 | 
1215 |   </Tab>
1216 |   <Tab value="TypeScript">
1217 | 
1218 | ```typescript
1219 | const res = await fetch('http://localhost:7777/lume/config/locations/ssd', {
1220 |   method: 'DELETE',
1221 | });
1222 | console.log(res.status);
1223 | ```
1224 | 
1225 |   </Tab>
1226 | </Tabs>
1227 | 
1228 | ### Set Default VM Storage Location
1229 | 
1230 | Set a storage location as the default.
1231 | 
1232 | `POST: /lume/config/locations/default/:name`
1233 | 
1234 | #### Example Request
1235 | 
1236 | <Tabs groupId="language" persist items={['Curl', 'Python', 'TypeScript']}>
1237 |   <Tab value="Curl">
1238 | 
1239 | ```bash
1240 | curl --connect-timeout 6000 \
1241 |   --max-time 5000 \
1242 |   -X POST \
1243 |   http://localhost:7777/lume/config/locations/default/ssd
1244 | ```
1245 | 
1246 |   </Tab>
1247 |   <Tab value="Python">
1248 | 
1249 | ```python
1250 | import requests
1251 | 
1252 | r = requests.post("http://localhost:7777/lume/config/locations/default/ssd", timeout=50)
1253 | print(r.json())
1254 | ```
1255 | 
1256 |   </Tab>
1257 |   <Tab value="TypeScript">
1258 | 
1259 | ```typescript
1260 | const res = await fetch('http://localhost:7777/lume/config/locations/default/ssd', {
1261 |   method: 'POST',
1262 | });
1263 | console.log(await res.json());
1264 | ```
1265 | 
1266 |   </Tab>
1267 | </Tabs>
1268 | 
```

--------------------------------------------------------------------------------
/libs/python/computer-server/computer_server/main.py:
--------------------------------------------------------------------------------

```python
  1 | from fastapi import FastAPI, WebSocket, WebSocketDisconnect, Request, HTTPException, Header
  2 | from fastapi.responses import StreamingResponse, JSONResponse
  3 | from typing import List, Dict, Any, Optional, Union, Literal, cast
  4 | import uvicorn
  5 | import logging
  6 | import asyncio
  7 | import json
  8 | import traceback
  9 | import inspect
 10 | from contextlib import redirect_stdout, redirect_stderr
 11 | from io import StringIO
 12 | from .handlers.factory import HandlerFactory
 13 | import os
 14 | import aiohttp
 15 | import hashlib
 16 | import time
 17 | import platform
 18 | from fastapi.middleware.cors import CORSMiddleware
 19 | 
 20 | # Authentication session TTL (in seconds). Override via env var CUA_AUTH_TTL_SECONDS. Default: 60s
 21 | AUTH_SESSION_TTL_SECONDS: int = int(os.environ.get("CUA_AUTH_TTL_SECONDS", "60"))
 22 | 
try:
    # Optional dependency: the cua agent package. Importing here (rather than
    # lazily) both sets the feature flag and binds the ComputerAgent name at
    # module level — presumably used by agent endpoints later in this file;
    # TODO(review): confirm against the full module.
    from agent import ComputerAgent
    HAS_AGENT = True
except ImportError:
    # Run without agent features rather than failing at import time.
    HAS_AGENT = False
 28 | 
# Set up logging with more detail
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Configure WebSocket with larger message size
# (screenshots and file transfers travel over the socket as single messages).
WEBSOCKET_MAX_SIZE = 1024 * 1024 * 10  # 10MB limit

# Configure application with WebSocket settings
app = FastAPI(
    title="Computer API",
    description="API for the Computer project",
    version="0.1.0",
    websocket_max_size=WEBSOCKET_MAX_SIZE,
)

# CORS configuration
# NOTE(review): allow_origins=["*"] together with allow_credentials=True makes
# Starlette echo back any Origin header on credentialed requests, effectively
# letting any site make authenticated cross-origin calls — confirm this is
# intentional for this control server.
origins = ["*"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
 53 | 
 54 | protocol_version = 1
 55 | try:
 56 |     from importlib.metadata import version
 57 |     package_version = version("cua-computer-server")
 58 | except Exception:
 59 |     # Fallback for cases where package is not installed or importlib.metadata is not available
 60 |     try:
 61 |         import pkg_resources
 62 |         package_version = pkg_resources.get_distribution("cua-computer-server").version
 63 |     except Exception:
 64 |         package_version = "unknown"
 65 | 
# Build the platform-specific handler implementations (accessibility,
# automation, diorama, file I/O) via the factory.
accessibility_handler, automation_handler, diorama_handler, file_handler = HandlerFactory.create_handlers()
# Dispatch table mapping wire-command names to their implementations.
# Keys are the exact strings clients send over the WebSocket protocol —
# do not rename them. Values are callables (mostly handler methods).
handlers = {
    # Protocol/package version report for client compatibility checks.
    "version": lambda: {"protocol": protocol_version, "package": package_version},
    # App-Use commands
    "diorama_cmd": diorama_handler.diorama_cmd,
    # Accessibility commands
    "get_accessibility_tree": accessibility_handler.get_accessibility_tree,
    "find_element": accessibility_handler.find_element,
    # Shell commands
    "run_command": automation_handler.run_command,
    # File system commands
    "file_exists": file_handler.file_exists,
    "directory_exists": file_handler.directory_exists,
    "list_dir": file_handler.list_dir,
    "read_text": file_handler.read_text,
    "write_text": file_handler.write_text,
    "read_bytes": file_handler.read_bytes,
    "write_bytes": file_handler.write_bytes,
    "get_file_size": file_handler.get_file_size,
    "delete_file": file_handler.delete_file,
    "create_dir": file_handler.create_dir,
    "delete_dir": file_handler.delete_dir,
    # Mouse commands
    "mouse_down": automation_handler.mouse_down,
    "mouse_up": automation_handler.mouse_up,
    "left_click": automation_handler.left_click,
    "right_click": automation_handler.right_click,
    "double_click": automation_handler.double_click,
    "move_cursor": automation_handler.move_cursor,
    "drag_to": automation_handler.drag_to,
    "drag": automation_handler.drag,
    # Keyboard commands
    "key_down": automation_handler.key_down,
    "key_up": automation_handler.key_up,
    "type_text": automation_handler.type_text,
    "press_key": automation_handler.press_key,
    "hotkey": automation_handler.hotkey,
    # Scrolling actions
    "scroll": automation_handler.scroll,
    "scroll_down": automation_handler.scroll_down,
    "scroll_up": automation_handler.scroll_up,
    # Screen actions
    "screenshot": automation_handler.screenshot,
    "get_cursor_position": automation_handler.get_cursor_position,
    "get_screen_size": automation_handler.get_screen_size,
    # Clipboard actions
    "copy_to_clipboard": automation_handler.copy_to_clipboard,
    "set_clipboard": automation_handler.set_clipboard,
}
115 | 
116 | 
class AuthenticationManager:
    """Validates (container_name, api_key) pairs for cloud deployments.

    Results are cached per credential hash for AUTH_SESSION_TTL_SECONDS so
    repeated connections do not re-hit the TryCUA API. When CONTAINER_NAME is
    not set in the environment, authentication is disabled entirely (local
    development mode) and every request is allowed.
    """

    def __init__(self):
        # session_hash -> {"valid": bool, "expires_at": epoch-seconds}
        self.sessions: Dict[str, Dict[str, Any]] = {}
        # Expected container name; None/empty means local development mode.
        self.container_name = os.environ.get("CONTAINER_NAME")

    def _hash_credentials(self, container_name: str, api_key: str) -> str:
        """Create a hash of container name and API key for session identification."""
        combined = f"{container_name}:{api_key}"
        return hashlib.sha256(combined.encode()).hexdigest()

    def _is_session_valid(self, session_data: Dict[str, Any]) -> bool:
        """Check if a session is still valid based on expiration time."""
        if not session_data.get('valid', False):
            return False

        expires_at = session_data.get('expires_at', 0)
        return time.time() < expires_at

    def _cache_result(self, session_hash: str, is_valid: bool) -> None:
        """Cache an authentication outcome (success OR failure) with a TTL.

        Failures are cached too, deliberately, to avoid hammering the API
        with repeated bad credentials.
        """
        self.sessions[session_hash] = {
            'valid': is_valid,
            'expires_at': time.time() + AUTH_SESSION_TTL_SECONDS
        }

    async def auth(self, container_name: str, api_key: str) -> bool:
        """Authenticate container name and API key, using cached sessions when possible.

        Returns True when access should be granted.
        """
        # If no CONTAINER_NAME is set, always allow access (local development)
        if not self.container_name:
            logger.info("No CONTAINER_NAME set in environment. Allowing access (local development mode)")
            return True

        # Layer 1: VM Identity Verification — reject before any network call.
        if container_name != self.container_name:
            logger.warning(f"VM name mismatch. Expected: {self.container_name}, Got: {container_name}")
            return False

        # Create hash for session lookup
        session_hash = self._hash_credentials(container_name, api_key)

        # Check if we have a valid cached session
        if session_hash in self.sessions:
            session_data = self.sessions[session_hash]
            if self._is_session_valid(session_data):
                logger.info(f"Using cached authentication for container: {container_name}")
                return session_data['valid']
            else:
                # Remove expired session
                del self.sessions[session_hash]

        # No valid cached session, authenticate with API
        logger.info(f"Authenticating with TryCUA API for container: {container_name}")

        try:
            async with aiohttp.ClientSession() as session:
                headers = {
                    "Authorization": f"Bearer {api_key}"
                }

                async with session.get(
                    f"https://www.trycua.com/api/vm/auth?container_name={container_name}",
                    headers=headers,
                ) as resp:
                    # A 200 with a non-empty body counts as success.
                    is_valid = resp.status == 200 and bool((await resp.text()).strip())

                    self._cache_result(session_hash, is_valid)

                    if is_valid:
                        logger.info(f"Authentication successful for container: {container_name}")
                    else:
                        logger.warning(f"Authentication failed for container: {container_name}. Status: {resp.status}")

                    return is_valid

        except aiohttp.ClientError as e:
            logger.error(f"Failed to validate API key with TryCUA API: {str(e)}")
            self._cache_result(session_hash, False)
            return False
        except Exception as e:
            logger.error(f"Unexpected error during authentication: {str(e)}")
            self._cache_result(session_hash, False)
            return False
204 | 
205 | 
206 | class ConnectionManager:
207 |     def __init__(self):
208 |         self.active_connections: List[WebSocket] = []
209 | 
210 |     async def connect(self, websocket: WebSocket):
211 |         await websocket.accept()
212 |         self.active_connections.append(websocket)
213 | 
214 |     def disconnect(self, websocket: WebSocket):
215 |         self.active_connections.remove(websocket)
216 | 
217 | 
manager = ConnectionManager()
auth_manager = AuthenticationManager()

@app.get("/status")
async def status():
    """Report server health, the host OS family, and optional features."""
    system_name = platform.system().lower()
    # Normalize platform.system() to one of: macos / windows / linux.
    if "darwin" in system_name or system_name in ("macos", "mac"):
        os_type = "macos"
    elif "windows" in system_name:
        os_type = "windows"
    else:
        os_type = "linux"
    # Advertise the agent proxy only when the ComputerAgent import succeeded.
    features = ["agent"] if HAS_AGENT else []
    return {"status": "ok", "os_type": os_type, "features": features}
236 | 
async def _reject_and_close(websocket: WebSocket, error: str) -> None:
    """Send a failure payload, close the socket, and drop it from the manager.

    The send/close may themselves fail if the peer has already gone away;
    the socket is untracked regardless so active_connections cannot leak.
    """
    try:
        await websocket.send_json({"success": False, "error": error})
        await websocket.close()
    except Exception:
        # Peer already disconnected; nothing more we can do on this socket.
        pass
    finally:
        try:
            manager.disconnect(websocket)
        except ValueError:
            pass  # already removed


@app.websocket("/ws", name="websocket_endpoint")
async def websocket_endpoint(websocket: WebSocket):
    """Main command channel.

    When CONTAINER_NAME is set (cloud deployment), the first client frame
    must be an "authenticate" command carrying api_key and container_name
    params; anything else closes the socket. After the optional handshake,
    each JSON frame {"command": ..., "params": {...}} is dispatched to the
    matching entry in `handlers` and the result echoed back as
    {"success": bool, ...}.
    """
    global handlers

    # WebSocket message size is configured at the app or endpoint level, not on the instance
    await manager.connect(websocket)

    # Check if CONTAINER_NAME is set (indicating cloud provider)
    server_container_name = os.environ.get("CONTAINER_NAME")

    # If cloud provider, perform authentication handshake
    if server_container_name:
        try:
            logger.info(f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Waiting for authentication...")

            # The very first frame must be the authentication command.
            auth_data = await websocket.receive_json()

            if auth_data.get("command") != "authenticate":
                await _reject_and_close(websocket, "First message must be authentication")
                return

            # Extract credentials
            client_api_key = auth_data.get("params", {}).get("api_key")
            client_container_name = auth_data.get("params", {}).get("container_name")

            if not client_api_key:
                await _reject_and_close(websocket, "API key required")
                return

            if not client_container_name:
                await _reject_and_close(websocket, "Container name required")
                return

            # Use AuthenticationManager for validation
            is_authenticated = await auth_manager.auth(client_container_name, client_api_key)
            if not is_authenticated:
                await _reject_and_close(websocket, "Authentication failed")
                return

            logger.info(f"Authentication successful for VM: {client_container_name}")
            await websocket.send_json({
                "success": True,
                "message": "Authentication successful"
            })

        except Exception as e:
            logger.error(f"Error during authentication handshake: {str(e)}")
            await _reject_and_close(websocket, "Authentication failed")
            return

    try:
        while True:
            try:
                data = await websocket.receive_json()
                command = data.get("command")
                params = data.get("params", {})

                if command not in handlers:
                    await websocket.send_json(
                        {"success": False, "error": f"Unknown command: {command}"}
                    )
                    continue

                try:
                    # Only forward params the handler's signature actually accepts.
                    handler_func = handlers[command]
                    sig = inspect.signature(handler_func)
                    filtered_params = {k: v for k, v in params.items() if k in sig.parameters}

                    # Async handlers run inline; sync handlers go to a worker
                    # thread so they cannot block the event loop.
                    if asyncio.iscoroutinefunction(handler_func):
                        result = await handler_func(**filtered_params)
                    else:
                        result = await asyncio.to_thread(handler_func, **filtered_params)
                    await websocket.send_json({"success": True, **result})
                except Exception as cmd_error:
                    logger.error(f"Error executing command {command}: {str(cmd_error)}")
                    logger.error(traceback.format_exc())
                    await websocket.send_json({"success": False, "error": str(cmd_error)})

            except WebSocketDisconnect:
                # Re-raise so the outer handler performs cleanup exactly once.
                raise
            except json.JSONDecodeError as json_err:
                logger.error(f"JSON decode error: {str(json_err)}")
                await websocket.send_json(
                    {"success": False, "error": f"Invalid JSON: {str(json_err)}"}
                )
            except Exception as loop_error:
                logger.error(f"Error in message loop: {str(loop_error)}")
                logger.error(traceback.format_exc())
                await websocket.send_json({"success": False, "error": str(loop_error)})

    except WebSocketDisconnect:
        logger.info("Client disconnected")
        manager.disconnect(websocket)
    except Exception as e:
        logger.error(f"Fatal error in websocket connection: {str(e)}")
        logger.error(traceback.format_exc())
        try:
            await websocket.close()
        except Exception:
            pass
        manager.disconnect(websocket)
369 | 
@app.post("/cmd")
async def cmd_endpoint(
    request: Request,
    container_name: Optional[str] = Header(None, alias="X-Container-Name"),
    api_key: Optional[str] = Header(None, alias="X-API-Key")
):
    """
    Backup endpoint for when WebSocket connections fail.
    Accepts commands via HTTP POST with streaming response.

    Headers:
    - X-Container-Name: Container name for cloud authentication
    - X-API-Key: API key for cloud authentication

    Body:
    {
        "command": "command_name",
        "params": {...}
    }
    """
    global handlers

    # Anything unparseable (or a non-object body) is a client error.
    try:
        payload = await request.json()
        command = payload.get("command")
        params = payload.get("params", {})
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")

    if not command:
        raise HTTPException(status_code=400, detail="Command is required")

    # CONTAINER_NAME in the environment marks a cloud deployment that
    # requires header-based authentication.
    server_container_name = os.environ.get("CONTAINER_NAME")
    if server_container_name:
        logger.info(f"Cloud provider detected. CONTAINER_NAME: {server_container_name}. Performing authentication...")

        if not container_name:
            raise HTTPException(status_code=401, detail="Container name required")

        if not api_key:
            raise HTTPException(status_code=401, detail="API key required")

        if not await auth_manager.auth(container_name, api_key):
            raise HTTPException(status_code=401, detail="Authentication failed")

    if command not in handlers:
        raise HTTPException(status_code=400, detail=f"Unknown command: {command}")

    async def stream_result():
        """Run the handler and emit a single SSE-style data frame."""
        try:
            # Only forward params the handler's signature actually accepts.
            handler_func = handlers[command]
            accepted = inspect.signature(handler_func).parameters
            call_args = {k: v for k, v in params.items() if k in accepted}

            # Async handlers run inline; sync handlers go to a worker thread
            # so they cannot block the event loop.
            if asyncio.iscoroutinefunction(handler_func):
                result = await handler_func(**call_args)
            else:
                result = await asyncio.to_thread(handler_func, **call_args)

            yield f"data: {json.dumps({'success': True, **result})}\n\n"

        except Exception as cmd_error:
            logger.error(f"Error executing command {command}: {str(cmd_error)}")
            logger.error(traceback.format_exc())

            yield f"data: {json.dumps({'success': False, 'error': str(cmd_error)})}\n\n"

    return StreamingResponse(
        stream_result(),
        media_type="text/plain",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        }
    )
460 | 
@app.post("/responses")
async def agent_response_endpoint(
    request: Request,
    api_key: Optional[str] = Header(None, alias="X-API-Key"),
):
    """
    Minimal proxy to run ComputerAgent for up to 2 turns.

    Security:
    - If CONTAINER_NAME is set on the server, require X-API-Key
      and validate using AuthenticationManager unless CUA_ENABLE_PUBLIC_PROXY is true.

    Body JSON:
    {
      "model": "...",                 # required
      "input": "... or messages[]",   # required
      "agent_kwargs": { ... },         # optional, passed directly to ComputerAgent
      "env": { ... }                   # optional env overrides for agent
    }
    """
    if not HAS_AGENT:
        raise HTTPException(status_code=501, detail="ComputerAgent not available")

    # Authenticate via AuthenticationManager if running in cloud (CONTAINER_NAME set)
    container_name = os.environ.get("CONTAINER_NAME")
    if container_name:
        is_public = os.environ.get("CUA_ENABLE_PUBLIC_PROXY", "").lower().strip() in ["1", "true", "yes", "y", "on"]
        if not is_public:
            if not api_key:
                raise HTTPException(status_code=401, detail="Missing AGENT PROXY auth headers")
            ok = await auth_manager.auth(container_name, api_key)
            if not ok:
                raise HTTPException(status_code=401, detail="Unauthorized")

    # Parse request body
    try:
        body = await request.json()
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid JSON body: {str(e)}")

    model = body.get("model")
    input_data = body.get("input")
    if not model or input_data is None:
        raise HTTPException(status_code=400, detail="'model' and 'input' are required")

    agent_kwargs: Dict[str, Any] = body.get("agent_kwargs") or {}
    env_overrides: Dict[str, str] = body.get("env") or {}

    class _EnvOverride:
        """Context manager that applies env-var overrides for the duration of
        the agent run and restores (or removes) every touched key on exit."""
        def __init__(self, overrides: Dict[str, str]):
            self.overrides = overrides
            self._original: Dict[str, Optional[str]] = {}
        def __enter__(self):
            for k, v in (self.overrides or {}).items():
                self._original[k] = os.environ.get(k)
                os.environ[k] = str(v)
        def __exit__(self, exc_type, exc, tb):
            for k, old in self._original.items():
                if old is None:
                    os.environ.pop(k, None)
                else:
                    os.environ[k] = old

    def _to_messages(data: Union[str, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        """Normalize the 'input' field into a message list.

        A plain string becomes a single user message. Previously any other
        type fell through and returned None, which crashed later inside
        agent.run(); reject it with a 400 instead.
        """
        if isinstance(data, str):
            return [{"role": "user", "content": data}]
        if isinstance(data, list):
            return data
        raise HTTPException(status_code=400, detail="'input' must be a string or a list of messages")

    messages = _to_messages(input_data)

    # Define a direct computer tool that implements the AsyncComputerHandler protocol
    # and delegates to our existing automation/file/accessibility handlers.
    from agent.computers import AsyncComputerHandler  # runtime-checkable Protocol

    class DirectComputer(AsyncComputerHandler):
        """AsyncComputerHandler backed by the module-scope handler singletons
        created by HandlerFactory (automation/file/accessibility)."""

        def __init__(self):
            self._auto = automation_handler
            self._file = file_handler
            self._access = accessibility_handler

        async def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]:
            sys = platform.system().lower()
            if "darwin" in sys or sys in ("macos", "mac"):
                return "mac"
            if "windows" in sys:
                return "windows"
            return "linux"

        async def get_dimensions(self) -> tuple[int, int]:
            size = await self._auto.get_screen_size()
            return size["width"], size["height"]

        async def screenshot(self) -> str:
            img_b64 = await self._auto.screenshot()
            return img_b64["image_data"]

        async def click(self, x: int, y: int, button: str = "left") -> None:
            # Unknown button names deliberately fall back to a left click.
            if button == "left":
                await self._auto.left_click(x, y)
            elif button == "right":
                await self._auto.right_click(x, y)
            else:
                await self._auto.left_click(x, y)

        async def double_click(self, x: int, y: int) -> None:
            await self._auto.double_click(x, y)

        async def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None:
            # Move to the target point first so the scroll lands there.
            await self._auto.move_cursor(x, y)
            await self._auto.scroll(scroll_x, scroll_y)

        async def type(self, text: str) -> None:
            await self._auto.type_text(text)

        async def wait(self, ms: int = 1000) -> None:
            await asyncio.sleep(ms / 1000.0)

        async def move(self, x: int, y: int) -> None:
            await self._auto.move_cursor(x, y)

        async def keypress(self, keys: Union[List[str], str]) -> None:
            # Accept "ctrl+c" / "ctrl-c" strings as well as ["ctrl", "c"] lists.
            if isinstance(keys, str):
                parts = keys.replace("-", "+").split("+") if len(keys) > 1 else [keys]
            else:
                parts = keys
            if len(parts) == 1:
                await self._auto.press_key(parts[0])
            else:
                await self._auto.hotkey(parts)

        async def drag(self, path: List[Dict[str, int]]) -> None:
            if not path:
                return
            start = path[0]
            await self._auto.mouse_down(start["x"], start["y"])
            for pt in path[1:]:
                await self._auto.move_cursor(pt["x"], pt["y"])
            end = path[-1]
            await self._auto.mouse_up(end["x"], end["y"])

        async def get_current_url(self) -> str:
            # Not available in this server context
            return ""

        async def left_mouse_down(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
            await self._auto.mouse_down(x, y, button="left")

        async def left_mouse_up(self, x: Optional[int] = None, y: Optional[int] = None) -> None:
            await self._auto.mouse_up(x, y, button="left")

    # (Removed a large commented-out image-URL-inlining helper; see git history.)

    error = None

    with _EnvOverride(env_overrides):
        # Prepare tools: if caller did not pass tools, inject our DirectComputer
        tools = agent_kwargs.get("tools")
        if not tools:
            tools = [DirectComputer()]
            agent_kwargs = {**agent_kwargs, "tools": tools}
        # Instantiate agent with our tools
        agent = ComputerAgent(model=model, **agent_kwargs)  # type: ignore[arg-type]

        total_output: List[Any] = []
        total_usage: Dict[str, Any] = {}

        # call_ids of computer_call messages not yet matched by an output.
        pending_computer_call_ids = set()
        try:
            async for result in agent.run(messages):
                # Tolerate turns without an "output" key instead of raising.
                out_msgs = result.get("output", [])
                total_output += out_msgs
                # Merge usage counters if present; non-numeric values overwrite.
                if isinstance(result, dict) and "usage" in result and isinstance(result["usage"], dict):
                    for k, v in result["usage"].items():
                        if isinstance(v, (int, float)):
                            total_usage[k] = total_usage.get(k, 0) + v
                        else:
                            total_usage[k] = v
                for msg in out_msgs:
                    if msg.get("type") == "computer_call":
                        pending_computer_call_ids.add(msg["call_id"])
                    elif msg.get("type") == "computer_call_output":
                        pending_computer_call_ids.discard(msg["call_id"])
                # exit if no pending computer calls
                if not pending_computer_call_ids:
                    break
        except Exception as e:
            logger.error(f"Error running agent: {str(e)}")
            logger.error(traceback.format_exc())
            error = str(e)

    # Build response payload
    payload = {
        "model": model,
        "error": error,
        "output": total_output,
        "usage": total_usage,
        "status": "completed" if not error else "failed"
    }

    # CORS: allow any origin
    headers = {
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
    }

    return JSONResponse(content=payload, headers=headers)
706 | 
707 | 
# Entrypoint for running the server standalone: bind all interfaces on port 8000.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
710 | 
```

--------------------------------------------------------------------------------
/libs/lume/src/Server/Handlers.swift:
--------------------------------------------------------------------------------

```swift
  1 | import ArgumentParser
  2 | import Foundation
  3 | import Virtualization
  4 | 
  5 | @MainActor
  6 | extension Server {
  7 |     // MARK: - VM Management Handlers
  8 | 
  9 |     func handleListVMs(storage: String? = nil) async throws -> HTTPResponse {
 10 |         do {
 11 |             let vmController = LumeController()
 12 |             let vms = try vmController.list(storage: storage)
 13 |             return try .json(vms)
 14 |         } catch {
 15 |             print(
 16 |                 "ERROR: Failed to list VMs: \(error.localizedDescription), storage=\(String(describing: storage))"
 17 |             )
 18 |             return .badRequest(message: error.localizedDescription)
 19 |         }
 20 |     }
 21 | 
    /// Returns a single VM's details as JSON, or 400 on any failure.
    ///
    /// NOTE(review): the step-by-step `print` tracing below appears intentional,
    /// so crashes during detail serialization can be localized from server
    /// logs; behavior is deliberately left untouched here.
    func handleGetVM(name: String, storage: String? = nil) async throws -> HTTPResponse {
        print("Getting VM details: name=\(name), storage=\(String(describing: storage))")

        do {
            let vmController = LumeController()
            print("Created VM controller, attempting to get VM")
            let vm = try vmController.get(name: name, storage: storage)
            print("Successfully retrieved VM")

            // Check for nil values that might cause crashes
            if vm.vmDirContext.config.macAddress == nil {
                print("ERROR: VM has nil macAddress")
                return .badRequest(message: "VM configuration is invalid (nil macAddress)")
            }
            print("MacAddress check passed")

            // Log that we're about to access details
            print("Preparing VM details response")

            // Print the full details object for debugging
            let details = vm.details
            print("VM DETAILS: \(details)")
            print("  name: \(details.name)")
            print("  os: \(details.os)")
            print("  cpuCount: \(details.cpuCount)")
            print("  memorySize: \(details.memorySize)")
            print("  diskSize: \(details.diskSize)")
            print("  display: \(details.display)")
            print("  status: \(details.status)")
            print("  vncUrl: \(String(describing: details.vncUrl))")
            print("  ipAddress: \(String(describing: details.ipAddress))")
            print("  locationName: \(details.locationName)")

            // Serialize the VM details
            print("About to serialize VM details")
            let response = try HTTPResponse.json(vm.details)
            print("Successfully serialized VM details")
            return response

        } catch {
            // This will catch errors from both vmController.get and the json serialization
            print("ERROR: Failed to get VM details: \(error.localizedDescription)")
            return .badRequest(message: error.localizedDescription)
        }
    }
 67 | 
 68 |     func handleCreateVM(_ body: Data?) async throws -> HTTPResponse {
 69 |         guard let body = body,
 70 |             let request = try? JSONDecoder().decode(CreateVMRequest.self, from: body)
 71 |         else {
 72 |             return HTTPResponse(
 73 |                 statusCode: .badRequest,
 74 |                 headers: ["Content-Type": "application/json"],
 75 |                 body: try JSONEncoder().encode(APIError(message: "Invalid request body"))
 76 |             )
 77 |         }
 78 | 
 79 |         do {
 80 |             let sizes = try request.parse()
 81 |             let vmController = LumeController()
 82 |             try await vmController.create(
 83 |                 name: request.name,
 84 |                 os: request.os,
 85 |                 diskSize: sizes.diskSize,
 86 |                 cpuCount: request.cpu,
 87 |                 memorySize: sizes.memory,
 88 |                 display: request.display,
 89 |                 ipsw: request.ipsw,
 90 |                 storage: request.storage
 91 |             )
 92 | 
 93 |             return HTTPResponse(
 94 |                 statusCode: .ok,
 95 |                 headers: ["Content-Type": "application/json"],
 96 |                 body: try JSONEncoder().encode([
 97 |                     "message": "VM created successfully", "name": request.name,
 98 |                 ])
 99 |             )
100 |         } catch {
101 |             return HTTPResponse(
102 |                 statusCode: .badRequest,
103 |                 headers: ["Content-Type": "application/json"],
104 |                 body: try JSONEncoder().encode(APIError(message: error.localizedDescription))
105 |             )
106 |         }
107 |     }
108 | 
109 |     func handleDeleteVM(name: String, storage: String? = nil) async throws -> HTTPResponse {
110 |         do {
111 |             let vmController = LumeController()
112 |             try await vmController.delete(name: name, storage: storage)
113 |             return HTTPResponse(
114 |                 statusCode: .ok, headers: ["Content-Type": "application/json"], body: Data())
115 |         } catch {
116 |             return HTTPResponse(
117 |                 statusCode: .badRequest, headers: ["Content-Type": "application/json"],
118 |                 body: try JSONEncoder().encode(APIError(message: error.localizedDescription)))
119 |         }
120 |     }
121 | 
122 |     func handleCloneVM(_ body: Data?) async throws -> HTTPResponse {
123 |         guard let body = body,
124 |             let request = try? JSONDecoder().decode(CloneRequest.self, from: body)
125 |         else {
126 |             return HTTPResponse(
127 |                 statusCode: .badRequest,
128 |                 headers: ["Content-Type": "application/json"],
129 |                 body: try JSONEncoder().encode(APIError(message: "Invalid request body"))
130 |             )
131 |         }
132 | 
133 |         do {
134 |             let vmController = LumeController()
135 |             try vmController.clone(
136 |                 name: request.name,
137 |                 newName: request.newName,
138 |                 sourceLocation: request.sourceLocation,
139 |                 destLocation: request.destLocation
140 |             )
141 | 
142 |             return HTTPResponse(
143 |                 statusCode: .ok,
144 |                 headers: ["Content-Type": "application/json"],
145 |                 body: try JSONEncoder().encode([
146 |                     "message": "VM cloned successfully",
147 |                     "source": request.name,
148 |                     "destination": request.newName,
149 |                 ])
150 |             )
151 |         } catch {
152 |             return HTTPResponse(
153 |                 statusCode: .badRequest,
154 |                 headers: ["Content-Type": "application/json"],
155 |                 body: try JSONEncoder().encode(APIError(message: error.localizedDescription))
156 |             )
157 |         }
158 |     }
159 | 
160 |     // MARK: - VM Operation Handlers
161 | 
162 |     func handleSetVM(name: String, body: Data?) async throws -> HTTPResponse {
163 |         guard let body = body,
164 |             let request = try? JSONDecoder().decode(SetVMRequest.self, from: body)
165 |         else {
166 |             return HTTPResponse(
167 |                 statusCode: .badRequest,
168 |                 headers: ["Content-Type": "application/json"],
169 |                 body: try JSONEncoder().encode(APIError(message: "Invalid request body"))
170 |             )
171 |         }
172 | 
173 |         do {
174 |             let vmController = LumeController()
175 |             let sizes = try request.parse()
176 |             try vmController.updateSettings(
177 |                 name: name,
178 |                 cpu: request.cpu,
179 |                 memory: sizes.memory,
180 |                 diskSize: sizes.diskSize,
181 |                 display: sizes.display?.string,
182 |                 storage: request.storage
183 |             )
184 | 
185 |             return HTTPResponse(
186 |                 statusCode: .ok,
187 |                 headers: ["Content-Type": "application/json"],
188 |                 body: try JSONEncoder().encode(["message": "VM settings updated successfully"])
189 |             )
190 |         } catch {
191 |             return HTTPResponse(
192 |                 statusCode: .badRequest,
193 |                 headers: ["Content-Type": "application/json"],
194 |                 body: try JSONEncoder().encode(APIError(message: error.localizedDescription))
195 |             )
196 |         }
197 |     }
198 | 
    /// Stops a running VM, then waits for file locks to clear before responding.
    ///
    /// - Parameters:
    ///   - name: Name of the VM to stop.
    ///   - storage: Optional storage location; `nil` uses the default.
    /// - Returns: `200 OK` with a JSON message on success, `400 Bad Request` with
    ///   an `APIError` payload on failure.
    /// - Note: After stopping, this sleeps 5 seconds (logged per second) and then
    ///   re-checks the VM's status; a VM that still reports "running" only
    ///   produces a warning log — the handler still returns success.
    func handleStopVM(name: String, storage: String? = nil) async throws -> HTTPResponse {
        Logger.info(
            "Stopping VM", metadata: ["name": name, "storage": String(describing: storage)])

        do {
            Logger.info("Creating VM controller", metadata: ["name": name])
            let vmController = LumeController()

            Logger.info("Calling stopVM on controller", metadata: ["name": name])
            try await vmController.stopVM(name: name, storage: storage)

            Logger.info(
                "VM stopped, waiting 5 seconds for locks to clear", metadata: ["name": name])

            // Add a delay to ensure locks are fully released before returning
            // (five one-second sleeps, each logged for visibility; sleep errors
            // are deliberately ignored via try?).
            for i in 1...5 {
                try? await Task.sleep(nanoseconds: 1_000_000_000)
                Logger.info("Lock clearing delay", metadata: ["name": name, "seconds": "\(i)/5"])
            }

            // Verify the VM is really in a stopped state
            // (best-effort: a failed lookup yields nil and is treated as stopped).
            Logger.info("Verifying VM is stopped", metadata: ["name": name])
            let vm = try? vmController.get(name: name, storage: storage)
            if let vm = vm, vm.details.status == "running" {
                Logger.info(
                    "VM still reports as running despite stop operation",
                    metadata: ["name": name, "severity": "warning"])
            } else {
                Logger.info(
                    "Verification complete: VM is in stopped state", metadata: ["name": name])
            }

            Logger.info("Returning successful response", metadata: ["name": name])
            return HTTPResponse(
                statusCode: .ok,
                headers: ["Content-Type": "application/json"],
                body: try JSONEncoder().encode(["message": "VM stopped successfully"])
            )
        } catch {
            Logger.error(
                "Failed to stop VM",
                metadata: [
                    "name": name,
                    "error": error.localizedDescription,
                    "storage": String(describing: storage),
                ])
            return HTTPResponse(
                statusCode: .badRequest,
                headers: ["Content-Type": "application/json"],
                body: try JSONEncoder().encode(APIError(message: error.localizedDescription))
            )
        }
    }
252 | 
253 |     func handleRunVM(name: String, body: Data?) async throws -> HTTPResponse {
254 |         Logger.info("Running VM", metadata: ["name": name])
255 | 
256 |         // Log the raw body data if available
257 |         if let body = body, let bodyString = String(data: body, encoding: .utf8) {
258 |             Logger.info("Run VM raw request body", metadata: ["name": name, "body": bodyString])
259 |         } else {
260 |             Logger.info("No request body or could not decode as string", metadata: ["name": name])
261 |         }
262 | 
263 |         do {
264 |             Logger.info("Creating VM controller and parsing request", metadata: ["name": name])
265 |             let request =
266 |                 body.flatMap { try? JSONDecoder().decode(RunVMRequest.self, from: $0) }
267 |                 ?? RunVMRequest(
268 |                     noDisplay: nil, sharedDirectories: nil, recoveryMode: nil, storage: nil)
269 | 
270 |             Logger.info(
271 |                 "Parsed request",
272 |                 metadata: [
273 |                     "name": name,
274 |                     "noDisplay": String(describing: request.noDisplay),
275 |                     "sharedDirectories": "\(request.sharedDirectories?.count ?? 0)",
276 |                     "storage": String(describing: request.storage),
277 |                 ])
278 | 
279 |             Logger.info("Parsing shared directories", metadata: ["name": name])
280 |             let dirs = try request.parse()
281 |             Logger.info(
282 |                 "Successfully parsed shared directories",
283 |                 metadata: ["name": name, "count": "\(dirs.count)"])
284 | 
285 |             // Start VM in background
286 |             Logger.info("Starting VM in background", metadata: ["name": name])
287 |             startVM(
288 |                 name: name,
289 |                 noDisplay: request.noDisplay ?? false,
290 |                 sharedDirectories: dirs,
291 |                 recoveryMode: request.recoveryMode ?? false,
292 |                 storage: request.storage
293 |             )
294 |             Logger.info("VM start initiated in background", metadata: ["name": name])
295 | 
296 |             // Return response immediately
297 |             return HTTPResponse(
298 |                 statusCode: .accepted,
299 |                 headers: ["Content-Type": "application/json"],
300 |                 body: try JSONEncoder().encode([
301 |                     "message": "VM start initiated",
302 |                     "name": name,
303 |                     "status": "pending",
304 |                 ])
305 |             )
306 |         } catch {
307 |             Logger.error(
308 |                 "Failed to run VM",
309 |                 metadata: [
310 |                     "name": name,
311 |                     "error": error.localizedDescription,
312 |                 ])
313 |             return HTTPResponse(
314 |                 statusCode: .badRequest,
315 |                 headers: ["Content-Type": "application/json"],
316 |                 body: try JSONEncoder().encode(APIError(message: error.localizedDescription))
317 |             )
318 |         }
319 |     }
320 | 
321 |     // MARK: - Image Management Handlers
322 | 
323 |     func handleIPSW() async throws -> HTTPResponse {
324 |         do {
325 |             let vmController = LumeController()
326 |             let url = try await vmController.getLatestIPSWURL()
327 |             return HTTPResponse(
328 |                 statusCode: .ok,
329 |                 headers: ["Content-Type": "application/json"],
330 |                 body: try JSONEncoder().encode(["url": url.absoluteString])
331 |             )
332 |         } catch {
333 |             return HTTPResponse(
334 |                 statusCode: .badRequest,
335 |                 headers: ["Content-Type": "application/json"],
336 |                 body: try JSONEncoder().encode(APIError(message: error.localizedDescription))
337 |             )
338 |         }
339 |     }
340 | 
341 |     func handlePull(_ body: Data?) async throws -> HTTPResponse {
342 |         guard let body = body,
343 |             let request = try? JSONDecoder().decode(PullRequest.self, from: body)
344 |         else {
345 |             return HTTPResponse(
346 |                 statusCode: .badRequest,
347 |                 headers: ["Content-Type": "application/json"],
348 |                 body: try JSONEncoder().encode(APIError(message: "Invalid request body"))
349 |             )
350 |         }
351 | 
352 |         do {
353 |             let vmController = LumeController()
354 |             try await vmController.pullImage(
355 |                 image: request.image,
356 |                 name: request.name,
357 |                 registry: request.registry,
358 |                 organization: request.organization,
359 |                 storage: request.storage
360 |             )
361 | 
362 |             return HTTPResponse(
363 |                 statusCode: .ok,
364 |                 headers: ["Content-Type": "application/json"],
365 |                 body: try JSONEncoder().encode([
366 |                     "message": "Image pulled successfully",
367 |                     "image": request.image,
368 |                     "name": request.name ?? "default",
369 |                 ])
370 |             )
371 |         } catch {
372 |             return HTTPResponse(
373 |                 statusCode: .badRequest,
374 |                 headers: ["Content-Type": "application/json"],
375 |                 body: try JSONEncoder().encode(APIError(message: error.localizedDescription))
376 |             )
377 |         }
378 |     }
379 | 
380 |     func handlePruneImages() async throws -> HTTPResponse {
381 |         do {
382 |             let vmController = LumeController()
383 |             try await vmController.pruneImages()
384 |             return HTTPResponse(
385 |                 statusCode: .ok,
386 |                 headers: ["Content-Type": "application/json"],
387 |                 body: try JSONEncoder().encode(["message": "Successfully removed cached images"])
388 |             )
389 |         } catch {
390 |             return HTTPResponse(
391 |                 statusCode: .badRequest,
392 |                 headers: ["Content-Type": "application/json"],
393 |                 body: try JSONEncoder().encode(APIError(message: error.localizedDescription))
394 |             )
395 |         }
396 |     }
397 | 
398 |     func handlePush(_ body: Data?) async throws -> HTTPResponse {
399 |         guard let body = body,
400 |             let request = try? JSONDecoder().decode(PushRequest.self, from: body)
401 |         else {
402 |             return HTTPResponse(
403 |                 statusCode: .badRequest,
404 |                 headers: ["Content-Type": "application/json"],
405 |                 body: try JSONEncoder().encode(APIError(message: "Invalid request body"))
406 |             )
407 |         }
408 | 
409 |         // Trigger push asynchronously, return Accepted immediately
410 |         Task.detached { @MainActor @Sendable in
411 |             do {
412 |                 let vmController = LumeController()
413 |                 try await vmController.pushImage(
414 |                     name: request.name,
415 |                     imageName: request.imageName,
416 |                     tags: request.tags,
417 |                     registry: request.registry,
418 |                     organization: request.organization,
419 |                     storage: request.storage,
420 |                     chunkSizeMb: request.chunkSizeMb,
421 |                     verbose: false,  // Verbose typically handled by server logs
422 |                     dryRun: false,  // Default API behavior is likely non-dry-run
423 |                     reassemble: false  // Default API behavior is likely non-reassemble
424 |                 )
425 |                 print(
426 |                     "Background push completed successfully for image: \(request.imageName):\(request.tags.joined(separator: ","))"
427 |                 )
428 |             } catch {
429 |                 print(
430 |                     "Background push failed for image: \(request.imageName):\(request.tags.joined(separator: ",")) - Error: \(error.localizedDescription)"
431 |                 )
432 |             }
433 |         }
434 | 
435 |         return HTTPResponse(
436 |             statusCode: .accepted,
437 |             headers: ["Content-Type": "application/json"],
438 |             body: try JSONEncoder().encode([
439 |                 "message": AnyEncodable("Push initiated in background"),
440 |                 "name": AnyEncodable(request.name),
441 |                 "imageName": AnyEncodable(request.imageName),
442 |                 "tags": AnyEncodable(request.tags),
443 |             ])
444 |         )
445 |     }
446 | 
447 |     func handleGetImages(_ request: HTTPRequest) async throws -> HTTPResponse {
448 |         let pathAndQuery = request.path.split(separator: "?", maxSplits: 1)
449 |         let queryParams =
450 |             pathAndQuery.count > 1
451 |             ? pathAndQuery[1]
452 |                 .split(separator: "&")
453 |                 .reduce(into: [String: String]()) { dict, param in
454 |                     let parts = param.split(separator: "=", maxSplits: 1)
455 |                     if parts.count == 2 {
456 |                         dict[String(parts[0])] = String(parts[1])
457 |                     }
458 |                 } : [:]
459 | 
460 |         let organization = queryParams["organization"] ?? "trycua"
461 | 
462 |         do {
463 |             let vmController = LumeController()
464 |             let imageList = try await vmController.getImages(organization: organization)
465 | 
466 |             // Create a response format that matches the CLI output
467 |             let response = imageList.local.map {
468 |                 [
469 |                     "repository": $0.repository,
470 |                     "imageId": $0.imageId,
471 |                 ]
472 |             }
473 | 
474 |             return HTTPResponse(
475 |                 statusCode: .ok,
476 |                 headers: ["Content-Type": "application/json"],
477 |                 body: try JSONEncoder().encode(response)
478 |             )
479 |         } catch {
480 |             return HTTPResponse(
481 |                 statusCode: .badRequest,
482 |                 headers: ["Content-Type": "application/json"],
483 |                 body: try JSONEncoder().encode(APIError(message: error.localizedDescription))
484 |             )
485 |         }
486 |     }
487 | 
488 |     // MARK: - Config Management Handlers
489 | 
490 |     func handleGetConfig() async throws -> HTTPResponse {
491 |         do {
492 |             let vmController = LumeController()
493 |             let settings = vmController.getSettings()
494 |             return try .json(settings)
495 |         } catch {
496 |             return .badRequest(message: error.localizedDescription)
497 |         }
498 |     }
499 | 
    /// JSON body accepted by `handleUpdateConfig`. All fields are optional so a
    /// client may update any subset of settings in a single request; nil fields
    /// are left unchanged.
    struct ConfigRequest: Codable {
        // New VM home directory path, or nil to keep the current one.
        let homeDirectory: String?
        // New image cache directory path, or nil to keep the current one.
        let cacheDirectory: String?
        // Enables/disables image caching, or nil to keep the current setting.
        let cachingEnabled: Bool?
    }
505 | 
506 |     func handleUpdateConfig(_ body: Data?) async throws -> HTTPResponse {
507 |         guard let body = body,
508 |             let request = try? JSONDecoder().decode(ConfigRequest.self, from: body)
509 |         else {
510 |             return HTTPResponse(
511 |                 statusCode: .badRequest,
512 |                 headers: ["Content-Type": "application/json"],
513 |                 body: try JSONEncoder().encode(APIError(message: "Invalid request body"))
514 |             )
515 |         }
516 | 
517 |         do {
518 |             let vmController = LumeController()
519 | 
520 |             if let homeDir = request.homeDirectory {
521 |                 try vmController.setHomeDirectory(homeDir)
522 |             }
523 | 
524 |             if let cacheDir = request.cacheDirectory {
525 |                 try vmController.setCacheDirectory(path: cacheDir)
526 |             }
527 | 
528 |             if let cachingEnabled = request.cachingEnabled {
529 |                 try vmController.setCachingEnabled(cachingEnabled)
530 |             }
531 | 
532 |             return HTTPResponse(
533 |                 statusCode: .ok,
534 |                 headers: ["Content-Type": "application/json"],
535 |                 body: try JSONEncoder().encode(["message": "Configuration updated successfully"])
536 |             )
537 |         } catch {
538 |             return HTTPResponse(
539 |                 statusCode: .badRequest,
540 |                 headers: ["Content-Type": "application/json"],
541 |                 body: try JSONEncoder().encode(APIError(message: error.localizedDescription))
542 |             )
543 |         }
544 |     }
545 | 
546 |     func handleGetLocations() async throws -> HTTPResponse {
547 |         do {
548 |             let vmController = LumeController()
549 |             let locations = vmController.getLocations()
550 |             return try .json(locations)
551 |         } catch {
552 |             return .badRequest(message: error.localizedDescription)
553 |         }
554 |     }
555 | 
    /// JSON body accepted by `handleAddLocation`.
    struct LocationRequest: Codable {
        // Name under which the storage location is registered.
        let name: String
        // Filesystem path of the storage location.
        let path: String
    }
560 | 
561 |     func handleAddLocation(_ body: Data?) async throws -> HTTPResponse {
562 |         guard let body = body,
563 |             let request = try? JSONDecoder().decode(LocationRequest.self, from: body)
564 |         else {
565 |             return HTTPResponse(
566 |                 statusCode: .badRequest,
567 |                 headers: ["Content-Type": "application/json"],
568 |                 body: try JSONEncoder().encode(APIError(message: "Invalid request body"))
569 |             )
570 |         }
571 | 
572 |         do {
573 |             let vmController = LumeController()
574 |             try vmController.addLocation(name: request.name, path: request.path)
575 | 
576 |             return HTTPResponse(
577 |                 statusCode: .ok,
578 |                 headers: ["Content-Type": "application/json"],
579 |                 body: try JSONEncoder().encode([
580 |                     "message": "Location added successfully",
581 |                     "name": request.name,
582 |                     "path": request.path,
583 |                 ])
584 |             )
585 |         } catch {
586 |             return HTTPResponse(
587 |                 statusCode: .badRequest,
588 |                 headers: ["Content-Type": "application/json"],
589 |                 body: try JSONEncoder().encode(APIError(message: error.localizedDescription))
590 |             )
591 |         }
592 |     }
593 | 
594 |     func handleRemoveLocation(_ name: String) async throws -> HTTPResponse {
595 |         do {
596 |             let vmController = LumeController()
597 |             try vmController.removeLocation(name: name)
598 |             return HTTPResponse(
599 |                 statusCode: .ok,
600 |                 headers: ["Content-Type": "application/json"],
601 |                 body: try JSONEncoder().encode(["message": "Location removed successfully"])
602 |             )
603 |         } catch {
604 |             return HTTPResponse(
605 |                 statusCode: .badRequest,
606 |                 headers: ["Content-Type": "application/json"],
607 |                 body: try JSONEncoder().encode(APIError(message: error.localizedDescription))
608 |             )
609 |         }
610 |     }
611 | 
612 |     func handleSetDefaultLocation(_ name: String) async throws -> HTTPResponse {
613 |         do {
614 |             let vmController = LumeController()
615 |             try vmController.setDefaultLocation(name: name)
616 |             return HTTPResponse(
617 |                 statusCode: .ok,
618 |                 headers: ["Content-Type": "application/json"],
619 |                 body: try JSONEncoder().encode(["message": "Default location set successfully"])
620 |             )
621 |         } catch {
622 |             return HTTPResponse(
623 |                 statusCode: .badRequest,
624 |                 headers: ["Content-Type": "application/json"],
625 |                 body: try JSONEncoder().encode(APIError(message: error.localizedDescription))
626 |             )
627 |         }
628 |     }
629 | 
630 |     // MARK: - Log Handlers
631 | 
632 |     func handleGetLogs(type: String?, lines: Int?) async throws -> HTTPResponse {
633 |         do {
634 |             let logType = type?.lowercased() ?? "all"
635 |             let infoPath = "/tmp/lume_daemon.log"
636 |             let errorPath = "/tmp/lume_daemon.error.log"
637 | 
638 |             let fileManager = FileManager.default
639 |             var response: [String: String] = [:]
640 | 
641 |             // Function to read log files
642 |             func readLogFile(path: String) -> String? {
643 |                 guard fileManager.fileExists(atPath: path) else {
644 |                     return nil
645 |                 }
646 | 
647 |                 do {
648 |                     let content = try String(contentsOfFile: path, encoding: .utf8)
649 | 
650 |                     // If lines parameter is provided, return only the specified number of lines from the end
651 |                     if let lineCount = lines {
652 |                         let allLines = content.components(separatedBy: .newlines)
653 |                         let startIndex = max(0, allLines.count - lineCount)
654 |                         let lastLines = Array(allLines[startIndex...])
655 |                         return lastLines.joined(separator: "\n")
656 |                     }
657 | 
658 |                     return content
659 |                 } catch {
660 |                     return "Error reading log file: \(error.localizedDescription)"
661 |                 }
662 |             }
663 | 
664 |             // Get logs based on requested type
665 |             if logType == "info" || logType == "all" {
666 |                 response["info"] = readLogFile(path: infoPath) ?? "Info log file not found"
667 |             }
668 | 
669 |             if logType == "error" || logType == "all" {
670 |                 response["error"] = readLogFile(path: errorPath) ?? "Error log file not found"
671 |             }
672 | 
673 |             return try .json(response)
674 |         } catch {
675 |             return .badRequest(message: error.localizedDescription)
676 |         }
677 |     }
678 | 
679 |     // MARK: - Private Helper Methods
680 | 
    /// Launches `runVM` for the named VM on a detached background task so the
    /// HTTP handler (`handleRunVM`) can return immediately.
    ///
    /// - Parameters:
    ///   - name: Name of the VM to start.
    ///   - noDisplay: When true, the controller is asked to run without a display.
    ///   - sharedDirectories: Host directories to share into the guest.
    ///   - recoveryMode: When true, the controller is asked to boot in recovery mode.
    ///   - storage: Optional storage location; `nil` uses the default.
    /// - Note: Errors from the background task are logged, not propagated to
    ///   the caller.
    nonisolated private func startVM(
        name: String,
        noDisplay: Bool,
        sharedDirectories: [SharedDirectory] = [],
        recoveryMode: Bool = false,
        storage: String? = nil
    ) {
        Logger.info(
            "Starting VM in detached task",
            metadata: [
                "name": name,
                "noDisplay": "\(noDisplay)",
                "recoveryMode": "\(recoveryMode)",
                "storage": String(describing: storage),
            ])

        // Detached @MainActor @Sendable task, matching the pattern used by
        // handlePush, so the VM's lifetime is not tied to the request task.
        Task.detached { @MainActor @Sendable in
            Logger.info("Background task started for VM", metadata: ["name": name])
            do {
                Logger.info("Creating VM controller in background task", metadata: ["name": name])
                let vmController = LumeController()

                Logger.info(
                    "Calling runVM on controller",
                    metadata: [
                        "name": name,
                        "noDisplay": "\(noDisplay)",
                    ])
                try await vmController.runVM(
                    name: name,
                    noDisplay: noDisplay,
                    sharedDirectories: sharedDirectories,
                    recoveryMode: recoveryMode,
                    storage: storage
                )
                Logger.info("VM started successfully in background task", metadata: ["name": name])
            } catch {
                Logger.error(
                    "Failed to start VM in background task",
                    metadata: [
                        "name": name,
                        "error": error.localizedDescription,
                    ])
            }
        }
        Logger.info("Background task dispatched for VM", metadata: ["name": name])
    }
728 | }
729 | 
```

--------------------------------------------------------------------------------
/blog/build-your-own-operator-on-macos-2.md:
--------------------------------------------------------------------------------

```markdown
  1 | # Build Your Own Operator on macOS - Part 2
  2 | 
  3 | *Published on April 27, 2025 by Francesco Bonacci*
  4 | 
  5 | In our [previous post](build-your-own-operator-on-macos-1.md), we built a basic Computer-Use Operator from scratch using OpenAI's `computer-use-preview` model and our [cua-computer](https://pypi.org/project/cua-computer) package. While educational, implementing the control loop manually can be tedious and error-prone.
  6 | 
  7 | In this follow-up, we'll explore our [cua-agent](https://pypi.org/project/cua-agent) framework - a high-level abstraction that handles all the complexity of VM interaction, screenshot processing, model communication, and action execution automatically.
  8 | 
  9 | <div align="center">
 10 |   <video src="https://github.com/user-attachments/assets/0be7e3e3-eead-4646-a4a3-5bb392501ee7" width="600" controls></video>
 11 | </div>
 12 | 
 13 | ## What You'll Learn
 14 | 
 15 | By the end of this tutorial, you'll be able to:
 16 | - Set up the `cua-agent` framework with various agent loop types and model providers
 17 | - Understand the different agent loop types and their capabilities
 18 | - Work with local models for cost-effective workflows
 19 | - Use a simple UI for your operator
 20 | 
 21 | **Prerequisites:**
 22 | - Completed setup from Part 1 ([lume CLI installed](https://github.com/trycua/cua?tab=readme-ov-file#option-2-full-computer-use-agent-capabilities), macOS CUA image already pulled)
 23 | - Python 3.10+. We recommend using Conda (or Anaconda) to create an ad hoc Python environment.
 24 | - API keys for OpenAI and/or Anthropic (optional for local models)
 25 | 
 26 | **Estimated Time:** 30-45 minutes
 27 | 
 28 | ## Introduction to cua-agent
 29 | 
 30 | The `cua-agent` framework is designed to simplify building Computer-Use Agents. It abstracts away the complex interaction loop we built manually in Part 1, letting you focus on defining tasks rather than implementing the machinery. Among other features, it includes:
 31 | 
 32 | - **Multiple Provider Support**: Works with OpenAI, Anthropic, UI-Tars, local models (via Ollama), or any OpenAI-compatible model (e.g. LM Studio, vLLM, LocalAI, OpenRouter, Groq, etc.)
 33 | - **Flexible Loop Types**: Different implementations optimized for various models (e.g. OpenAI vs. Anthropic)
 34 | - **Structured Responses**: Clean, consistent output following the OpenAI Agent SDK specification we touched on in Part 1
 35 | - **Local Model Support**: Run cost-effectively with locally hosted models (Ollama, LM Studio, vLLM, LocalAI, etc.)
 36 | - **Gradio UI**: Optional visual interface for interacting with your agent
 37 | 
 38 | ## Installation
 39 | 
 40 | Let's start by installing the `cua-agent` package. You can install it with all features or selectively install only what you need.
 41 | 
 42 | From your python 3.10+ environment, run:
 43 | 
 44 | ```bash
 45 | # For all features
 46 | pip install "cua-agent[all]"
 47 | 
 48 | # Or selectively install only what you need
 49 | pip install "cua-agent[openai]"    # OpenAI support
 50 | pip install "cua-agent[anthropic]"  # Anthropic support
 51 | pip install "cua-agent[uitars]"    # UI-Tars support
 52 | pip install "cua-agent[omni]"       # OmniParser + VLMs support
 53 | pip install "cua-agent[ui]"         # Gradio UI
 54 | ```
 55 | 
 56 | ## Setting Up Your Environment
 57 | 
 58 | Before running any code examples, let's set up a proper environment:
 59 | 
 60 | 1. **Create a new directory** for your project:
 61 |    ```bash
 62 |    mkdir cua-agent-tutorial
 63 |    cd cua-agent-tutorial
 64 |    ```
 65 | 
 66 | 2. **Set up a Python environment** using one of these methods:
 67 | 
 68 |    **Option A: Using conda command line**
 69 |    ```bash
 70 |    # Using conda
 71 |    conda create -n cua-agent python=3.10
 72 |    conda activate cua-agent
 73 |    ```
 74 |    
 75 |    **Option B: Using Anaconda Navigator UI**
 76 |    - Open Anaconda Navigator
 77 |    - Click on "Environments" in the left sidebar
 78 |    - Click the "Create" button at the bottom
 79 |    - Name your environment "cua-agent"
 80 |    - Select Python 3.10
 81 |    - Click "Create"
 82 |    - Once created, select the environment and click "Open Terminal" to activate it
 83 |    
 84 |    **Option C: Using venv**
 85 |    ```bash
 86 |    python -m venv cua-env
 87 |    source cua-env/bin/activate  # On macOS/Linux
 88 |    ```
 89 | 
 90 | 3. **Install the cua-agent package**:
 91 |    ```bash
 92 |    pip install "cua-agent[all]"
 93 |    ```
 94 | 
 95 | 4. **Set up your API keys as environment variables**:
 96 |    ```bash
 97 |    # For OpenAI models
 98 |    export OPENAI_API_KEY=your_openai_key_here
 99 |    
100 |    # For Anthropic models (if needed)
101 |    export ANTHROPIC_API_KEY=your_anthropic_key_here
102 |    ```
103 | 
104 | 5. **Create a Python file or notebook**:
105 |    
106 |    **Option A: Create a Python script**
107 |    ```bash
108 |    # For a Python script
109 |    touch cua_agent_example.py
110 |    ```
111 |    
112 |    **Option B: Use VS Code notebooks**
113 |    - Open VS Code
114 |    - Install the Python extension if you haven't already
115 |    - Create a new file with a `.ipynb` extension (e.g., `cua_agent_tutorial.ipynb`)
116 |    - Select your Python environment when prompted
117 |    - You can now create and run code cells in the notebook interface
118 | 
119 | Now you're ready to run the code examples!
120 | 
121 | ## Understanding Agent Loops
122 | 
123 | If you recall from Part 1, we had to implement a custom interaction loop to interact with the computer-use-preview model.
124 | 
125 | In the `cua-agent` framework, an **Agent Loop** is the core abstraction that implements the continuous interaction cycle between an AI model and the computer environment. It manages the flow of:
126 | 1. Capturing screenshots of the computer's state
127 | 2. Processing these screenshots (with or without UI element detection)
128 | 3. Sending this visual context to an AI model along with the task instructions
129 | 4. Receiving the model's decisions on what actions to take
130 | 5. Safely executing these actions in the environment
131 | 6. Repeating this cycle until the task is complete
132 | 
133 | The loop handles all the complex error handling, retries, context management, and model-specific interaction patterns so you don't have to implement them yourself.
134 | 
135 | While the core concept remains the same across all agent loops, different AI models require specialized handling for optimal performance. To address this, the framework provides 4 different agent loop implementations, each designed for different computer-use modalities.
136 | | Agent Loop | Supported Models | Description | Set-Of-Marks |
137 | |:-----------|:-----------------|:------------|:-------------|
138 | | `AgentLoop.OPENAI` | • `computer_use_preview` | Use OpenAI Operator CUA Preview model | Not Required |
139 | | `AgentLoop.ANTHROPIC` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219` | Use Anthropic Computer-Use Beta Tools | Not Required |
140 | | `AgentLoop.UITARS` | • `ByteDance-Seed/UI-TARS-1.5-7B` | Uses ByteDance's UI-TARS 1.5 model | Not Required |
141 | | `AgentLoop.OMNI` | • `claude-3-5-sonnet-20240620`<br>• `claude-3-7-sonnet-20250219`<br>• `gpt-4.5-preview`<br>• `gpt-4o`<br>• `gpt-4`<br>• `phi4`<br>• `phi4-mini`<br>• `gemma3`<br>• `...`<br>• `Any Ollama or OpenAI-compatible model` | Use OmniParser for element pixel-detection (SoM) and any VLMs for UI Grounding and Reasoning | OmniParser |
142 | 
143 | Each loop handles the same basic pattern we implemented manually in Part 1:
144 | 1. Take a screenshot of the VM
145 | 2. Send the screenshot and task to the AI model
146 | 3. Receive an action to perform
147 | 4. Execute the action
148 | 5. Repeat until the task is complete
149 | 
150 | ### Why Different Agent Loops?
151 | 
152 | The `cua-agent` framework provides multiple agent loop implementations to abstract away the complexity of interacting with different CUA models. Each provider has unique API structures, response formats, conventions and capabilities that require specialized handling:
153 | 
154 | - **OpenAI Loop**: Uses the Responses API with a specific `computer_call_output` format for sending screenshots after actions. Requires handling safety checks and maintains a chain of requests using `previous_response_id`.
155 | 
156 | - **Anthropic Loop**: Implements a [multi-agent loop pattern](https://docs.anthropic.com/en/docs/agents-and-tools/computer-use#understanding-the-multi-agent-loop) with a sophisticated message handling system, supporting various API providers (Anthropic, Bedrock, Vertex) with token management and prompt caching capabilities.
157 | 
158 | - **UI-TARS Loop**: Requires custom message formatting and specialized parsing to extract actions from text responses using a "box token" system for UI element identification.
159 | 
160 | - **OMNI Loop**: Uses [Microsoft's OmniParser](https://github.com/microsoft/OmniParser) to create a [Set-of-Marks (SoM)](https://arxiv.org/abs/2310.11441) representation of the UI, enabling any vision-language model to interact with interfaces without specialized UI training.
161 | 
162 | - **AgentLoop.OMNI**: The most flexible option that works with virtually any vision-language model including local and open-source ones. Perfect for cost-effective development or when you need to use models without native computer-use capabilities.
163 | 
164 | These abstractions allow you to easily switch between providers without changing your application code. All loop implementations are available in the [cua-agent GitHub repository](https://github.com/trycua/cua/tree/main/libs/agent/agent/providers).
165 | 
166 | Choosing the right agent loop depends not only on your API access and technical requirements but also on the specific tasks you need to accomplish. To make an informed decision, it's helpful to understand how these underlying models perform across different computing environments – from desktop operating systems to web browsers and mobile interfaces.
167 | 
168 | ## Computer-Use Model Capabilities
169 | 
170 | The performance of different Computer-Use models varies significantly across tasks. These benchmark evaluations measure an agent's ability to follow instructions and complete real-world tasks in different computing environments.
171 | 
172 | | Benchmark type | Benchmark                                                                                                                                       | UI-TARS-1.5 | OpenAI CUA | Claude 3.7 | Previous SOTA       | Human       |
173 | |----------------|--------------------------------------------------------------------------------------------------------------------------------------------------|-------------|-------------|-------------|----------------------|-------------|
174 | | **Computer Use** | [OSworld](https://arxiv.org/abs/2404.07972) (100 steps)                                                                                        | **42.5**     | 36.4        | 28          | 38.1 (200 step)      | 72.4        |
175 | |                | [Windows Agent Arena](https://arxiv.org/abs/2409.08264) (50 steps)                                                                              | **42.1**     | -           | -           | 29.8                 | -           |
176 | | **Browser Use**  | [WebVoyager](https://arxiv.org/abs/2401.13919)                                                                                                 | 84.8         | **87**      | 84.1        | 87                   | -           |
177 | |                | [Online-Mind2web](https://arxiv.org/abs/2504.01382)                                                                                              | **75.8**     | 71          | 62.9        | 71                   | -           |
178 | | **Phone Use**    | [Android World](https://arxiv.org/abs/2405.14573)                                                                                              | **64.2**     | -           | -           | 59.5                 | -           |
179 | 
180 | ### When to Use Each Loop
181 | 
182 | - **AgentLoop.OPENAI**: Choose when you have OpenAI Tier 3 access and need the most capable computer-use agent for web-based tasks. Uses the same [OpenAI Computer-Use Loop](https://platform.openai.com/docs/guides/tools-computer-use) as Part 1, delivering strong performance on browser-based benchmarks.
183 | 
184 | - **AgentLoop.ANTHROPIC**: Ideal for users with Anthropic API access who need strong reasoning capabilities with computer-use abilities. Works with `claude-3-5-sonnet-20240620` and `claude-3-7-sonnet-20250219` models following [Anthropic's Computer-Use tools](https://docs.anthropic.com/en/docs/agents-and-tools/computer-use#understanding-the-multi-agent-loop).
185 | 
186 | - **AgentLoop.UITARS**: Best for scenarios requiring powerful OS/desktop automation and latency-sensitive workflows, as UI-TARS-1.5 leads OS-capability benchmarks. Requires running the model locally or accessing it through compatible endpoints (e.g. on Hugging Face).
187 | 
188 | - **AgentLoop.OMNI**: The most flexible option that works with virtually any vision-language model including local and open-source ones. Perfect for cost-effective development or when you need to use models without native computer-use capabilities.
189 | 
190 | Now that we understand the capabilities and strengths of different models, let's see how easy it is to implement a Computer-Use Agent using the `cua-agent` framework. Let's look at the implementation details.
191 | 
192 | ## Creating Your First Computer-Use Agent
193 | 
194 | With the `cua-agent` framework, creating a Computer-Use Agent becomes remarkably straightforward. The framework handles all the complexities of model interaction, screenshot processing, and action execution behind the scenes. Let's look at a simple example of how to build your first agent:
195 | 
196 | **How to run this example:**
197 | 
198 | 1. Create a new file named `simple_task.py` in your text editor or IDE (like VS Code, PyCharm, or Cursor)
199 | 2. Copy and paste the following code:
200 | 
201 | ```python
202 | import asyncio
203 | from computer import Computer
204 | from agent import ComputerAgent
205 | 
206 | async def run_simple_task():
207 |     async with Computer() as macos_computer:
208 |         # Create agent with OpenAI loop
209 |         agent = ComputerAgent(
210 |             model="openai/computer-use-preview",
211 |             tools=[macos_computer]
212 |         )
213 |         
214 |         # Define a simple task
215 |         task = "Open Safari and search for 'Python tutorials'"
216 |         
217 |         # Run the task and process responses
218 |         async for result in agent.run(task):
219 |             print(f"Action: {result.get('text')}")
220 | 
221 | # Run the example
222 | if __name__ == "__main__":
223 |     asyncio.run(run_simple_task())
224 | ```
225 | 
226 | 3. Save the file
227 | 4. Open a terminal, navigate to your project directory, and run:
228 |    ```bash
229 |    python simple_task.py
230 |    ```
231 | 
232 | 5. The code will initialize the macOS virtual machine, create an agent, and execute the task of opening Safari and searching for Python tutorials.
233 | 
234 | You can also run this in a VS Code notebook:
235 | 1. Create a new notebook in VS Code (.ipynb file)
236 | 2. Copy the code into a cell (without the `if __name__ == "__main__":` part)
237 | 3. Run the cell to execute the code
238 | 
239 | You can find the full code in our [notebook](https://github.com/trycua/cua/blob/main/notebooks/blog/build-your-own-operator-on-macos-2.ipynb).
240 | 
241 | Compare this to the manual implementation from Part 1 - we've reduced dozens of lines of code to just a few. The cua-agent framework handles all the complex logic internally, letting you focus on the overarching agentic system.
242 | 
243 | ## Working with Multiple Tasks
244 | 
245 | Another advantage of the cua-agent framework is easily chaining multiple tasks. Instead of managing complex state between tasks, you can simply provide a sequence of instructions to be executed in order:
246 | 
247 | **How to run this example:**
248 | 
249 | 1. Create a new file named `multi_task.py` with the following code:
250 | 
251 | ```python
252 | import asyncio
253 | from computer import Computer
254 | from agent import ComputerAgent
255 | 
256 | async def run_multi_task_workflow():
257 |     async with Computer() as macos_computer:
258 |         agent = ComputerAgent(
259 |             model="anthropic/claude-3-5-sonnet-20241022",
260 |             tools=[macos_computer]
261 |         )
262 |         
263 |         tasks = [
264 |             "Open Safari and go to github.com",
265 |             "Search for 'trycua/cua'",
266 |             "Open the repository page",
267 |             "Click on the 'Issues' tab",
268 |             "Read the first open issue"
269 |         ]
270 |         
271 |         for i, task in enumerate(tasks):
272 |             print(f"\nTask {i+1}/{len(tasks)}: {task}")
273 |             async for result in agent.run(task):
274 |                 # Print just the action description for brevity
275 |                 if result.get("text"):
276 |                     print(f"  → {result.get('text')}")
277 |             print(f"✅ Task {i+1} completed")
278 | 
279 | if __name__ == "__main__":
280 |     asyncio.run(run_multi_task_workflow())
281 | ```
282 | 
283 | 2. Save the file
284 | 3. Make sure you have set your Anthropic API key:
285 |    ```bash
286 |    export ANTHROPIC_API_KEY=your_anthropic_key_here
287 |    ```
288 | 4. Run the script:
289 |    ```bash
290 |    python multi_task.py
291 |    ```
292 | 
293 | This pattern is particularly useful for creating workflows that navigate through multiple steps of an application or process. The agent maintains visual context between tasks, making it more likely to successfully complete complex sequences of actions.
294 | 
295 | ## Understanding the Response Format
296 | 
297 | Each action taken by the agent returns a structured response following the OpenAI Agent SDK specification. This standardized format makes it easy to extract detailed information about what the agent is doing and why:
298 | 
299 | ```python
300 | async for result in agent.run(task):
301 |     # Basic information
302 |     print(f"Response ID: {result.get('id')}")
303 |     print(f"Response Text: {result.get('text')}")
304 |     
305 |     # Detailed token usage statistics
306 |     usage = result.get('usage')
307 |     if usage:
308 |         print(f"Input Tokens: {usage.get('input_tokens')}")
309 |         print(f"Output Tokens: {usage.get('output_tokens')}")
310 |     
311 |     # Reasoning and actions
312 |     for output in result.get('output', []):
313 |         if output.get('type') == 'reasoning':
314 |             print(f"Reasoning: {output.get('summary', [{}])[0].get('text')}")
315 |         elif output.get('type') == 'computer_call':
316 |             action = output.get('action', {})
317 |             print(f"Action: {action.get('type')} at ({action.get('x')}, {action.get('y')})")
318 | ```
319 | 
320 | This structured format allows you to:
321 | - Log detailed information about agent actions
322 | - Provide real-time feedback to users
323 | - Track token usage for cost monitoring
324 | - Access the reasoning behind decisions for debugging or user explanation
325 | 
326 | ## Using Local Models with OMNI
327 | 
328 | One of the most powerful features of the framework is the ability to use local models via the OMNI loop. This approach dramatically reduces costs while maintaining acceptable reliability for many agentic workflows:
329 | 
330 | **How to run this example:**
331 | 
332 | 1. First, you'll need to install Ollama for running local models:
333 |    - Visit [ollama.com](https://ollama.com) and download the installer for your OS
334 |    - Follow the installation instructions
335 |    - Pull the Gemma 3 model:
336 |      ```bash
337 |      ollama pull gemma3:4b-it-q4_K_M
338 |      ```
339 | 
340 | 2. Create a file named `local_model.py` with this code:
341 | 
342 | ```python
343 | import asyncio
344 | from computer import Computer
345 | from agent import ComputerAgent
346 | 
347 | async def run_with_local_model():
348 |     async with Computer() as macos_computer:
349 |         agent = ComputerAgent(
350 |             model="omniparser+ollama_chat/gemma3",
351 |             tools=[macos_computer]
352 |         )
353 |         
354 |         task = "Open the Calculator app and perform a simple calculation"
355 |         
356 |         async for result in agent.run(task):
357 |             print(f"Action: {result.get('text')}")
358 | 
359 | if __name__ == "__main__":
360 |     asyncio.run(run_with_local_model())
361 | ```
362 | 
363 | 3. Run the script:
364 |    ```bash
365 |    python local_model.py
366 |    ```
367 | 
368 | You can also use other local model servers with the OAICOMPAT provider, which enables compatibility with any API endpoint following the OpenAI API structure:
369 | 
370 | ```python
371 | agent = ComputerAgent(
372 |     model=LLM(
373 |         provider=LLMProvider.OAICOMPAT,
374 |         name="gemma-3-12b-it",
375 |         provider_base_url="http://localhost:1234/v1"  # LM Studio endpoint
376 |     ),
377 |     tools=[macos_computer]
378 | )
379 | ```
380 | 
381 | Common local endpoints include:
382 | - LM Studio: `http://localhost:1234/v1`
383 | - vLLM: `http://localhost:8000/v1`
384 | - LocalAI: `http://localhost:8080/v1`
385 | - Ollama with OpenAI compat: `http://localhost:11434/v1`
386 | 
387 | This approach is perfect for:
388 | - Development and testing without incurring API costs
389 | - Offline or air-gapped environments where API access isn't possible
390 | - Privacy-sensitive applications where data can't leave your network
391 | - Experimenting with different models to find the best fit for your use case
392 | 
393 | ## Deploying and Using UI-TARS
394 | 
395 | UI-TARS is ByteDance's Computer-Use model designed for navigating OS-level interfaces. It shows excellent performance on desktop OS tasks. To use UI-TARS, you'll first need to deploy the model.
396 | 
397 | ### Deployment Options
398 | 
399 | 1. **Local Deployment**: Follow the [UI-TARS deployment guide](https://github.com/bytedance/UI-TARS/blob/main/README_deploy.md) to run the model locally.
400 | 
401 | 2. **Hugging Face Endpoint**: Deploy UI-TARS on Hugging Face Inference Endpoints, which will give you a URL like:
402 |    `https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1`
403 | 
404 | 3. **Using with cua-agent**: Once deployed, you can use UI-TARS with the cua-agent framework:
405 | 
406 | ```python
407 | agent = ComputerAgent(
408 |     model=LLM(
409 |         provider=LLMProvider.OAICOMPAT, 
410 |         name="tgi", 
411 |         provider_base_url="https://**************.us-east-1.aws.endpoints.huggingface.cloud/v1"
412 |     ),
413 |     tools=[macos_computer]
414 | )
415 | ```
416 | 
417 | UI-TARS is particularly useful for desktop automation tasks, as it shows the highest performance on OS-level benchmarks like OSworld and Windows Agent Arena.
418 | 
419 | ## Understanding Agent Responses in Detail
420 | 
421 | The `run()` method of your agent yields structured responses that follow the OpenAI Agent SDK specification. This provides a rich set of information beyond just the basic action text:
422 | 
423 | ```python
424 | async for result in agent.run(task):
425 |     # Basic ID and text
426 |     print("Response ID:", result.get("id"))
427 |     print("Response Text:", result.get("text"))
428 | 
429 |     # Token usage statistics
430 |     usage = result.get("usage")
431 |     if usage:
432 |         print("\nUsage Details:")
433 |         print(f"  Input Tokens: {usage.get('input_tokens')}")
434 |         if "input_tokens_details" in usage:
435 |             print(f"  Input Tokens Details: {usage.get('input_tokens_details')}")
436 |         print(f"  Output Tokens: {usage.get('output_tokens')}")
437 |         if "output_tokens_details" in usage:
438 |             print(f"  Output Tokens Details: {usage.get('output_tokens_details')}")
439 |         print(f"  Total Tokens: {usage.get('total_tokens')}")
440 | 
441 |     # Detailed reasoning and actions
442 |     outputs = result.get("output", [])
443 |     for output in outputs:
444 |         output_type = output.get("type")
445 |         if output_type == "reasoning":
446 |             print("\nReasoning:")
447 |             for summary in output.get("summary", []):
448 |                 print(f"  {summary.get('text')}")
449 |         elif output_type == "computer_call":
450 |             action = output.get("action", {})
451 |             print("\nComputer Action:")
452 |             print(f"  Type: {action.get('type')}")
453 |             print(f"  Position: ({action.get('x')}, {action.get('y')})")
454 |             if action.get("text"):
455 |                 print(f"  Text: {action.get('text')}")
456 | ```
457 | 
458 | This detailed information is invaluable for debugging, logging, and understanding the agent's decision-making process in an agentic system. More details can be found in the [OpenAI Agent SDK Specification](https://platform.openai.com/docs/guides/responses-vs-chat-completions).
459 | 
460 | ## Building a Gradio UI
461 | 
462 | For a visual interface to your agent, the package also includes a Gradio UI:
463 | 
464 | **How to run the Gradio UI:**
465 | 
466 | 1. Create a file named `launch_ui.py` with the following code:
467 | 
468 | ```python
469 | from agent.ui.gradio.app import create_gradio_ui
470 | 
471 | # Create and launch the UI
472 | if __name__ == "__main__":
473 |     app = create_gradio_ui()
474 |     app.launch(share=False)  # Set share=False for local access only
475 | ```
476 | 
477 | 2. Install the UI dependencies if you haven't already:
478 |    ```bash
479 |    pip install "cua-agent[ui]"
480 |    ```
481 | 
482 | 3. Run the script:
483 |    ```bash
484 |    python launch_ui.py
485 |    ```
486 | 
487 | 4. Open your browser to the displayed URL (usually http://127.0.0.1:7860)
488 | 
489 | **Creating a Shareable Link (Optional):**
490 | 
491 | You can also create a temporary public URL to access your Gradio UI from anywhere:
492 | 
493 | ```python
494 | # In launch_ui.py
495 | if __name__ == "__main__":
496 |     app = create_gradio_ui()
497 |     app.launch(share=True)  # Creates a public link
498 | ```
499 | 
500 | When you run this, Gradio will display both a local URL and a public URL like:
501 | ```
502 | Running on local URL:  http://127.0.0.1:7860
503 | Running on public URL: https://abcd1234.gradio.live
504 | ```
505 | 
506 | **Security Note:** Be cautious when sharing your Gradio UI publicly:
507 | - The public URL gives anyone with the link full access to your agent
508 | - Consider using basic authentication for additional protection:
509 |   ```python
510 |   app.launch(share=True, auth=("username", "password"))
511 |   ```
512 | - Only use this feature for personal or team use, not for production environments
513 | - The temporary link expires when you stop the Gradio application
514 | 
515 | This provides:
516 | - Model provider selection
517 | - Agent loop selection
518 | - Task input field
519 | - Real-time display of VM screenshots
520 | - Action history
521 | 
522 | ### Setting API Keys for the UI
523 | 
524 | To use the UI with different providers, set your API keys as environment variables:
525 | 
526 | ```bash
527 | # For OpenAI models
528 | export OPENAI_API_KEY=your_openai_key_here
529 | 
530 | # For Anthropic models
531 | export ANTHROPIC_API_KEY=your_anthropic_key_here
532 | 
533 | # Launch with both keys set
534 | OPENAI_API_KEY=your_key ANTHROPIC_API_KEY=your_key python launch_ui.py
535 | ```
536 | 
537 | ### UI Settings Persistence
538 | 
539 | The Gradio UI automatically saves your configuration to maintain your preferences between sessions:
540 | 
541 | - Settings like Agent Loop, Model Choice, Custom Base URL, and configuration options are saved to `.gradio_settings.json` in the project's root directory
542 | - These settings are loaded automatically when you restart the UI
543 | - API keys entered in the custom provider field are **not** saved for security reasons
544 | - It's recommended to add `.gradio_settings.json` to your `.gitignore` file
545 | 
546 | ## Advanced Example: GitHub Repository Workflow
547 | 
548 | Let's look at a more complex example that automates a GitHub workflow:
549 | 
550 | **How to run this advanced example:**
551 | 
552 | 1. Create a file named `github_workflow.py` with the following code:
553 | 
554 | ```python
555 | import asyncio
556 | import logging
557 | from computer import Computer
558 | from agent import ComputerAgent
559 | 
560 | async def github_workflow():
561 |     async with Computer(verbosity=logging.INFO) as macos_computer:
562 |         agent = ComputerAgent(
563 |             model="openai/computer-use-preview",
564 |             save_trajectory=True,  # Save screenshots for debugging
565 |             only_n_most_recent_images=3,  # Only keep last 3 images in context
566 |             verbosity=logging.INFO,
567 |             tools=[macos_computer]
568 |         )
569 |         
570 |         tasks = [
571 |             "Look for a repository named trycua/cua on GitHub.",
572 |             "Check the open issues, open the most recent one and read it.",
573 |             "Clone the repository in users/lume/projects if it doesn't exist yet.",
574 |             "Open the repository with Cursor (on the dock, black background and white cube icon).",
575 |             "From Cursor, open Composer if not already open.",
576 |             "Focus on the Composer text area, then write and submit a task to help resolve the GitHub issue.",
577 |         ]
578 |         
579 |         for i, task in enumerate(tasks):
580 |             print(f"\nExecuting task {i+1}/{len(tasks)}: {task}")
581 |             async for result in agent.run(task):
582 |                 print(f"Action: {result.get('text')}")
583 |             print(f"✅ Task {i+1}/{len(tasks)} completed")
584 | 
585 | if __name__ == "__main__":
586 |     asyncio.run(github_workflow())
587 | ```
588 | 
589 | 2. Make sure your OpenAI API key is set:
590 |    ```bash
591 |    export OPENAI_API_KEY=your_openai_key_here
592 |    ```
593 | 
594 | 3. Run the script:
595 |    ```bash
596 |    python github_workflow.py
597 |    ```
598 | 
599 | 4. Watch as the agent completes the entire workflow:
600 |    - The agent will navigate to GitHub
601 |    - Find and investigate issues in the repository
602 |    - Clone the repository to the local machine
603 |    - Open it in Cursor
604 |    - Use Cursor's AI features to work on a solution
605 | 
606 | This example:
607 | 1. Searches GitHub for a repository
608 | 2. Reads an issue
609 | 3. Clones the repository
610 | 4. Opens it in an IDE
611 | 5. Uses AI to write a solution
612 | 
613 | ## Comparing Implementation Approaches
614 | 
615 | Let's compare our manual implementation from Part 1 with the framework approach:
616 | 
617 | ### Manual Implementation (Part 1)
618 | - Required writing custom code for the interaction loop
619 | - Needed explicit handling of different action types
620 | - Required direct management of the OpenAI API calls
621 | - Around 50-100 lines of code for basic functionality
622 | - Limited to OpenAI's computer-use model
623 | 
624 | ### Framework Implementation (Part 2)
625 | - Abstracts the interaction loop
626 | - Handles all action types automatically
627 | - Manages API calls internally
628 | - Only 10-15 lines of code for the same functionality
629 | - Works with multiple model providers
630 | - Includes UI capabilities
631 | 
632 | ## Conclusion
633 | 
634 | The `cua-agent` framework transforms what was a complex implementation task into a simple, high-level interface for building Computer-Use Agents. By abstracting away the technical details, it lets you focus on defining the tasks rather than the machinery.
635 | 
636 | ### When to Use Each Approach
637 | - **Manual Implementation (Part 1)**: When you need complete control over the interaction loop or are implementing a custom solution
638 | - **Framework (Part 2)**: For most applications where you want to quickly build and deploy Computer-Use Agents
639 | 
640 | ### Next Steps
641 | With the basics covered, you might want to explore:
642 | - Customizing the agent's behavior with additional parameters
643 | - Building more complex workflows spanning multiple applications
644 | - Integrating your agent into other applications
645 | - Contributing to the open-source project on GitHub
646 | 
647 | ### Resources
648 | - [cua-agent GitHub repository](https://github.com/trycua/cua/tree/main/libs/agent)
649 | - [Agent Notebook Examples](https://github.com/trycua/cua/blob/main/notebooks/agent_nb.ipynb)
650 | - [OpenAI Agent SDK Specification](https://platform.openai.com/docs/api-reference/responses)
651 | - [Anthropic API Documentation](https://docs.anthropic.com/en/api/getting-started)
652 | - [UI-TARS GitHub](https://github.com/ByteDance/UI-TARS)
653 | - [OmniParser GitHub](https://github.com/microsoft/OmniParser)
654 | 
```

--------------------------------------------------------------------------------
/libs/python/agent/agent/agent.py:
--------------------------------------------------------------------------------

```python
  1 | """
  2 | ComputerAgent - Main agent class that selects and runs agent loops
  3 | """
  4 | 
  5 | import asyncio
  6 | from pathlib import Path
  7 | from typing import Dict, List, Any, Optional, AsyncGenerator, Union, cast, Callable, Set, Tuple
  8 | 
  9 | from litellm.responses.utils import Usage
 10 | 
 11 | from .types import (
 12 |     Messages,
 13 |     AgentCapability,
 14 |     ToolError,
 15 |     IllegalArgumentError
 16 | )
 17 | from .responses import make_tool_error_item, replace_failed_computer_calls_with_function_calls
 18 | from .decorators import find_agent_config
 19 | import json
 20 | import litellm
 21 | import litellm.utils
 22 | import inspect
 23 | from .adapters import (
 24 |     HuggingFaceLocalAdapter,
 25 |     HumanAdapter,
 26 |     MLXVLMAdapter,
 27 | )
 28 | from .callbacks import (
 29 |     ImageRetentionCallback, 
 30 |     LoggingCallback, 
 31 |     TrajectorySaverCallback, 
 32 |     BudgetManagerCallback,
 33 |     TelemetryCallback,
 34 |     OperatorNormalizerCallback,
 35 |     PromptInstructionsCallback,
 36 | )
 37 | from .computers import (
 38 |     AsyncComputerHandler,
 39 |     is_agent_computer,
 40 |     make_computer_handler
 41 | )
 42 | 
 43 | def assert_callable_with(f, *args, **kwargs):
 44 |    """Check if function can be called with given arguments."""
 45 |    try:
 46 |        inspect.signature(f).bind(*args, **kwargs)
 47 |        return True
 48 |    except TypeError as e:
 49 |        sig = inspect.signature(f)
 50 |        raise IllegalArgumentError(f"Expected {sig}, got args={args} kwargs={kwargs}") from e
 51 | 
def get_json(obj: Any, max_depth: int = 10) -> Any:
    """Convert an arbitrary object graph into JSON-compatible data.

    Recursively serializes ``obj`` into dicts, lists, and scalars that
    ``json.dumps`` accepts. Recursion is bounded by ``max_depth`` and
    circular references are detected by object id; both cases are replaced
    with placeholder strings instead of raising. ``None`` values are dropped
    from dicts and lists both during serialization and in a final cleanup
    pass after a round-trip through ``json.dumps``/``json.loads``.

    Args:
        obj: Any Python object (objects with ``model_dump``, plain objects,
            containers, or scalars).
        max_depth: Maximum recursion depth before a placeholder string is
            substituted for deeper values.

    Returns:
        A JSON-compatible structure (dict/list/str/int/float/bool).
    """
    def custom_serializer(o: Any, depth: int = 0, seen: Optional[Set[int]] = None) -> Any:
        # ``seen`` holds ids of containers on the current traversal path.
        if seen is None:
            seen = set()
        
        # Use model_dump() if available (e.g. pydantic-style models).
        # NOTE(review): the dump is returned as-is without further recursion
        # and without depth/cycle checks; assumes model_dump() output is
        # already JSON-compatible — confirm.
        if hasattr(o, 'model_dump'):
            return o.model_dump()
        
        # Check depth limit
        if depth > max_depth:
            return f"<max_depth_exceeded:{max_depth}>"
        
        # Check for circular references using object id
        obj_id = id(o)
        if obj_id in seen:
            return f"<circular_reference:{type(o).__name__}>"
        
        # Computer objects are represented by an opaque tag rather than
        # serializing their (potentially large/unserializable) state.
        if hasattr(o, '__class__') and 'computer' in getattr(o, '__class__').__name__.lower():
            return f"<computer:{o.__class__.__name__}>"

        # Objects with __dict__: serialize attributes, skipping None values.
        if hasattr(o, '__dict__'):
            seen.add(obj_id)
            try:
                result = {}
                for k, v in o.__dict__.items():
                    if v is not None:
                        # Recursively serialize with updated depth and seen set
                        # (a copy, so sibling branches may share objects without
                        # being flagged as cycles).
                        serialized_value = custom_serializer(v, depth + 1, seen.copy())
                        result[k] = serialized_value
                return result
            finally:
                seen.discard(obj_id)
        
        # Handle common types that might contain nested objects
        elif isinstance(o, dict):
            seen.add(obj_id)
            try:
                return {
                    k: custom_serializer(v, depth + 1, seen.copy())
                    for k, v in o.items()
                    if v is not None
                }
            finally:
                seen.discard(obj_id)
        
        elif isinstance(o, (list, tuple, set)):
            # Tuples and sets are flattened to JSON arrays; None entries dropped.
            seen.add(obj_id)
            try:
                return [
                    custom_serializer(item, depth + 1, seen.copy())
                    for item in o
                    if item is not None
                ]
            finally:
                seen.discard(obj_id)
        
        # For basic types that json.dumps can handle
        elif isinstance(o, (str, int, float, bool)) or o is None:
            return o
        
        # Fallback to string representation
        else:
            return str(o)
    
    def remove_nones(obj: Any) -> Any:
        # Final sweep: strip None values that may have entered via the
        # model_dump() fast path, which bypasses the per-branch filtering.
        if isinstance(obj, dict):
            return {k: remove_nones(v) for k, v in obj.items() if v is not None}
        elif isinstance(obj, list):
            return [remove_nones(item) for item in obj if item is not None]
        return obj
    
    # Serialize with circular reference and depth protection
    serialized = custom_serializer(obj)
    
    # Convert to JSON string and back to ensure JSON compatibility
    json_str = json.dumps(serialized)
    parsed = json.loads(json_str)
    
    # Final cleanup of any remaining None values
    return remove_nones(parsed)
135 | 
def sanitize_message(msg: Any) -> Any:
    """Return a copy of the message with image_url omitted for computer_call_output messages."""
    # Only computer_call_output messages carry screenshots worth redacting.
    if msg.get("type") != "computer_call_output":
        return msg
    output = msg.get("output", {})
    if not isinstance(output, dict):
        return msg
    # Shallow-copy so the caller's message is left untouched.
    redacted = dict(msg)
    redacted["output"] = {**output, "image_url": "[omitted]"}
    return redacted
145 | 
def get_output_call_ids(messages: List[Dict[str, Any]]) -> List[str]:
    """Collect call_ids from tool-output messages.

    Args:
        messages: Message dicts in Responses format.

    Returns:
        The call_id of every ``computer_call_output`` / ``function_call_output``
        message. Messages missing a call_id are skipped so the result honors
        the declared ``List[str]`` contract (the original appended ``None``,
        which no caller could ever match against a real call_id).
    """
    output_types = {"computer_call_output", "function_call_output"}
    return [
        message["call_id"]
        for message in messages
        if message.get("type") in output_types and message.get("call_id") is not None
    ]
152 | 
class ComputerAgent:
    """
    Main agent class that automatically selects the appropriate agent loop
    based on the model and executes tool calls.
    """
    
    def __init__(
        self,
        model: str,
        tools: Optional[List[Any]] = None,
        custom_loop: Optional[Callable] = None,
        only_n_most_recent_images: Optional[int] = None,
        callbacks: Optional[List[Any]] = None,
        instructions: Optional[str] = None,
        verbosity: Optional[int] = None,
        trajectory_dir: Optional[str | Path | dict] = None,
        max_retries: Optional[int] = 3,
        screenshot_delay: Optional[float | int] = 0.5,
        use_prompt_caching: Optional[bool] = False,
        max_trajectory_budget: Optional[float | dict] = None,
        telemetry_enabled: Optional[bool] = True,
        trust_remote_code: Optional[bool] = False,
        **kwargs
    ):
        """
        Initialize ComputerAgent.
        
        Args:
            model: Model name (e.g., "claude-3-5-sonnet-20241022", "computer-use-preview", "omni+vertex_ai/gemini-pro")
            tools: List of tools (computer objects, decorated functions, etc.)
            custom_loop: Custom agent loop function to use instead of auto-selection
            only_n_most_recent_images: If set, only keep the N most recent images in message history. Adds ImageRetentionCallback automatically.
            callbacks: List of AsyncCallbackHandler instances for preprocessing/postprocessing
            instructions: Optional system instructions to be passed to the model
            verbosity: Logging level (logging.DEBUG, logging.INFO, etc.). If set, adds LoggingCallback automatically
            trajectory_dir: If set, saves trajectory data (screenshots, responses) to this directory. Adds TrajectorySaverCallback automatically.
            max_retries: Maximum number of retries for failed API calls
            screenshot_delay: Delay before screenshots in seconds
            use_prompt_caching: If set, use prompt caching to avoid reprocessing the same prompt. Intended for use with anthropic providers.
            max_trajectory_budget: If set, adds BudgetManagerCallback to track usage costs and stop when budget is exceeded
            telemetry_enabled: If set, adds TelemetryCallback to track anonymized usage data. Enabled by default.
            trust_remote_code: If set, trust remote code when loading local models. Disabled by default.
            **kwargs: Additional arguments passed to the agent loop
        """        
        # If the loop is "human/human", we need to prefix a grounding model fallback
        if model in ["human/human", "human"]:
            model = "openai/computer-use-preview+human/human"
        
        self.model = model
        self.tools = tools or []
        self.custom_loop = custom_loop
        self.only_n_most_recent_images = only_n_most_recent_images
        # Copy the caller's list: __init__ mutates self.callbacks below
        # (insert/append), and aliasing would leak those built-ins back into
        # the caller's list (and duplicate them across agents sharing it).
        self.callbacks = list(callbacks) if callbacks else []
        self.instructions = instructions
        self.verbosity = verbosity
        self.trajectory_dir = trajectory_dir
        self.max_retries = max_retries
        self.screenshot_delay = screenshot_delay
        self.use_prompt_caching = use_prompt_caching
        self.telemetry_enabled = telemetry_enabled
        self.kwargs = kwargs
        self.trust_remote_code = trust_remote_code

        # == Add built-in callbacks ==

        # Prepend operator normalizer callback
        self.callbacks.insert(0, OperatorNormalizerCallback())

        # Add prompt instructions callback if provided
        if self.instructions:
            self.callbacks.append(PromptInstructionsCallback(self.instructions))

        # Add telemetry callback if telemetry_enabled is set
        # (may be a bool, or a dict of TelemetryCallback kwargs)
        if self.telemetry_enabled:
            if isinstance(self.telemetry_enabled, bool):
                self.callbacks.append(TelemetryCallback(self))
            else:
                self.callbacks.append(TelemetryCallback(self, **self.telemetry_enabled))

        # Add logging callback if verbosity is set
        if self.verbosity is not None:
            self.callbacks.append(LoggingCallback(level=self.verbosity))

        # Add image retention callback if only_n_most_recent_images is set
        if self.only_n_most_recent_images:
            self.callbacks.append(ImageRetentionCallback(self.only_n_most_recent_images))
        
        # Add trajectory saver callback if trajectory_dir is set
        # (may be a str/Path directory, or a dict of TrajectorySaverCallback kwargs)
        if self.trajectory_dir:
            if isinstance(self.trajectory_dir, dict):
                self.callbacks.append(TrajectorySaverCallback(**self.trajectory_dir))
            elif isinstance(self.trajectory_dir, (str, Path)):
                self.callbacks.append(TrajectorySaverCallback(str(self.trajectory_dir)))
        
        # Add budget manager if max_trajectory_budget is set
        # (may be a float budget, or a dict of BudgetManagerCallback kwargs)
        if max_trajectory_budget:
            if isinstance(max_trajectory_budget, dict):
                self.callbacks.append(BudgetManagerCallback(**max_trajectory_budget))
            else:
                self.callbacks.append(BudgetManagerCallback(max_trajectory_budget))
        
        # == Enable local model providers w/ LiteLLM ==

        # Register local model providers
        hf_adapter = HuggingFaceLocalAdapter(
            device="auto",
            trust_remote_code=self.trust_remote_code or False
        )
        human_adapter = HumanAdapter()
        mlx_adapter = MLXVLMAdapter()
        # NOTE: this assignment is process-global litellm state, shared by all
        # ComputerAgent instances.
        litellm.custom_provider_map = [
            {"provider": "huggingface-local", "custom_handler": hf_adapter},
            {"provider": "human", "custom_handler": human_adapter},
            {"provider": "mlx", "custom_handler": mlx_adapter}
        ]
        litellm.suppress_debug_info = True

        # == Initialize computer agent ==

        # Find the appropriate agent loop
        if custom_loop:
            self.agent_loop = custom_loop
            self.agent_config_info = None
        else:
            config_info = find_agent_config(model)
            if not config_info:
                raise ValueError(f"No agent config found for model: {model}")
            # Instantiate the agent config class
            self.agent_loop = config_info.agent_class()
            self.agent_config_info = config_info
        
        # Populated lazily by _initialize_computers()
        self.tool_schemas: List[Dict[str, Any]] = []
        self.computer_handler = None
        
    async def _initialize_computers(self):
        """Lazily build tool schemas and the computer interface adapter (idempotent)."""
        if not self.tool_schemas:
            # Process tools and create tool schemas
            self.tool_schemas = self._process_tools()
            
            # Find the first computer tool and create an interface adapter for it
            computer_handler = None
            for schema in self.tool_schemas:
                if schema["type"] == "computer":
                    computer_handler = await make_computer_handler(schema["computer"])
                    break
            self.computer_handler = computer_handler
    
    def _process_input(self, input: Messages) -> List[Dict[str, Any]]:
        """Normalize input into a list of JSON-compatible message dicts."""
        if isinstance(input, str):
            return [{"role": "user", "content": input}]
        return [get_json(msg) for msg in input]

    def _process_tools(self) -> List[Dict[str, Any]]:
        """Process tools and create schemas for the agent loop.

        Computer objects become {"type": "computer"} schemas; plain callables
        have their schema extracted from the docstring via litellm. Unknown
        tool types are skipped with a warning (best-effort, not fatal).
        """
        schemas = []
        
        for tool in self.tools:
            # Check if it's a computer object (has interface attribute)
            if is_agent_computer(tool):
                # This is a computer tool - will be handled by agent loop
                schemas.append({
                    "type": "computer",
                    "computer": tool
                })
            elif callable(tool):
                # Use litellm.utils.function_to_dict to extract schema from docstring
                try:
                    function_schema = litellm.utils.function_to_dict(tool)
                    schemas.append({
                        "type": "function",
                        "function": function_schema
                    })
                except Exception as e:
                    print(f"Warning: Could not process tool {tool}: {e}")
            else:
                print(f"Warning: Unknown tool type: {tool}")
        
        return schemas
    
    def _get_tool(self, name: str) -> Optional[Callable]:
        """Look up a registered tool by its function name (or wrapped .func name)."""
        for tool in self.tools:
            if hasattr(tool, '__name__') and tool.__name__ == name:
                return tool
            elif hasattr(tool, 'func') and tool.func.__name__ == name:
                return tool
        return None
    
    # ============================================================================
    # AGENT RUN LOOP LIFECYCLE HOOKS
    # ============================================================================
    
    async def _on_run_start(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]]) -> None:
        """Initialize run tracking by calling callbacks."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_run_start'):
                await callback.on_run_start(kwargs, old_items)
    
    async def _on_run_end(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> None:
        """Finalize run tracking by calling callbacks."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_run_end'):
                await callback.on_run_end(kwargs, old_items, new_items)
    
    async def _on_run_continue(self, kwargs: Dict[str, Any], old_items: List[Dict[str, Any]], new_items: List[Dict[str, Any]]) -> bool:
        """Check if run should continue; the first callback returning falsy halts the run."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_run_continue'):
                should_continue = await callback.on_run_continue(kwargs, old_items, new_items)
                if not should_continue:
                    return False
        return True
    
    async def _on_llm_start(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Prepare messages for the LLM call by applying callbacks in order (each sees the previous result)."""
        result = messages
        for callback in self.callbacks:
            if hasattr(callback, 'on_llm_start'):
                result = await callback.on_llm_start(result)
        return result

    async def _on_llm_end(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Postprocess messages after the LLM call by applying callbacks in order."""
        result = messages
        for callback in self.callbacks:
            if hasattr(callback, 'on_llm_end'):
                result = await callback.on_llm_end(result)
        return result

    async def _on_responses(self, kwargs: Dict[str, Any], responses: Dict[str, Any]) -> None:
        """Called when responses are received."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_responses'):
                await callback.on_responses(get_json(kwargs), get_json(responses))
    
    async def _on_computer_call_start(self, item: Dict[str, Any]) -> None:
        """Called when a computer call is about to start."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_computer_call_start'):
                await callback.on_computer_call_start(get_json(item))
    
    async def _on_computer_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
        """Called when a computer call has completed."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_computer_call_end'):
                await callback.on_computer_call_end(get_json(item), get_json(result))
    
    async def _on_function_call_start(self, item: Dict[str, Any]) -> None:
        """Called when a function call is about to start."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_function_call_start'):
                await callback.on_function_call_start(get_json(item))
    
    async def _on_function_call_end(self, item: Dict[str, Any], result: List[Dict[str, Any]]) -> None:
        """Called when a function call has completed."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_function_call_end'):
                await callback.on_function_call_end(get_json(item), get_json(result))
    
    async def _on_text(self, item: Dict[str, Any]) -> None:
        """Called when a text message is encountered."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_text'):
                await callback.on_text(get_json(item))
    
    async def _on_api_start(self, kwargs: Dict[str, Any]) -> None:
        """Called when an LLM API call is about to start."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_api_start'):
                await callback.on_api_start(get_json(kwargs))
    
    async def _on_api_end(self, kwargs: Dict[str, Any], result: Any) -> None:
        """Called when an LLM API call has completed."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_api_end'):
                await callback.on_api_end(get_json(kwargs), get_json(result))

    async def _on_usage(self, usage: Dict[str, Any]) -> None:
        """Called when usage information is received."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_usage'):
                await callback.on_usage(get_json(usage))

    async def _on_screenshot(self, screenshot: Union[str, bytes], name: str = "screenshot") -> None:
        """Called when a screenshot is taken."""
        for callback in self.callbacks:
            if hasattr(callback, 'on_screenshot'):
                await callback.on_screenshot(screenshot, name)

    # ============================================================================
    # AGENT OUTPUT PROCESSING
    # ============================================================================
    
    async def _handle_item(self, item: Any, computer: Optional[AsyncComputerHandler] = None, ignore_call_ids: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """Handle each item; may cause a computer action + screenshot.

        Args:
            item: A single output item from the agent loop (message,
                computer_call, or function_call).
            computer: Interface adapter used to execute computer actions.
            ignore_call_ids: call_ids whose outputs were already produced by
                the loop itself; matching items are skipped.

        Returns:
            A list of output message dicts (possibly empty). ToolError during
            execution is converted into a tool-error item rather than raised.
        """
        call_id = item.get("call_id")
        if ignore_call_ids and call_id and call_id in ignore_call_ids:
            return []
        
        item_type = item.get("type", None)
        
        if item_type == "message":
            await self._on_text(item)
            # # Print messages
            # if item.get("content"):
            #     for content_item in item.get("content"):
            #         if content_item.get("text"):
            #             print(content_item.get("text"))
            return []
        
        try:
            if item_type == "computer_call":
                await self._on_computer_call_start(item)
                if not computer:
                    raise ValueError("Computer handler is required for computer calls")

                # Perform computer actions
                action = item.get("action")
                action_type = action.get("type")
                if action_type is None:
                    print(f"Action type cannot be `None`: action={action}, action_type={action_type}")
                    return []
                
                # Extract action arguments (all fields except 'type')
                action_args = {k: v for k, v in action.items() if k != "type"}
                
                # print(f"{action_type}({action_args})")
                
                # Execute the computer action; the handler method name must
                # match the action type (e.g. "click" -> computer.click)
                computer_method = getattr(computer, action_type, None)
                if computer_method:
                    assert_callable_with(computer_method, **action_args)
                    await computer_method(**action_args)
                else:
                    raise ToolError(f"Unknown computer action: {action_type}")
                
                # Take screenshot after action
                if self.screenshot_delay and self.screenshot_delay > 0:
                    await asyncio.sleep(self.screenshot_delay)
                screenshot_base64 = await computer.screenshot()
                await self._on_screenshot(screenshot_base64, "screenshot_after")
                
                # Handle safety checks: currently auto-acknowledged
                pending_checks = item.get("pending_safety_checks", [])
                acknowledged_checks = []
                for check in pending_checks:
                    check_message = check.get("message", str(check))
                    acknowledged_checks.append(check)
                    # TODO: implement a callback for safety checks
                    # if acknowledge_safety_check_callback(check_message, allow_always=True):
                    #     acknowledged_checks.append(check)
                    # else:
                    #     raise ValueError(f"Safety check failed: {check_message}")
                
                # Create call output
                call_output = {
                    "type": "computer_call_output",
                    "call_id": item.get("call_id"),
                    "acknowledged_safety_checks": acknowledged_checks,
                    "output": {
                        "type": "input_image",
                        "image_url": f"data:image/png;base64,{screenshot_base64}",
                    },
                }
                
                # # Additional URL safety checks for browser environments
                # if await computer.get_environment() == "browser":
                #     current_url = await computer.get_current_url()
                #     call_output["output"]["current_url"] = current_url
                #     # TODO: implement a callback for URL safety checks
                #     # check_blocklisted_url(current_url)
                
                result = [call_output]
                await self._on_computer_call_end(item, result)
                return result
            
            if item_type == "function_call":
                await self._on_function_call_start(item)
                # Perform function call
                function = self._get_tool(item.get("name"))
                if not function:
                    # NOTE: single quotes inside the f-string — reusing double
                    # quotes is a SyntaxError on Python < 3.12 (PEP 701).
                    raise ToolError(f"Function {item.get('name')} not found")
            
                args = json.loads(item.get("arguments"))

                # Validate arguments before execution
                assert_callable_with(function, **args)

                # Execute function - use asyncio.to_thread for non-async functions
                if inspect.iscoroutinefunction(function):
                    result = await function(**args)
                else:
                    result = await asyncio.to_thread(function, **args)
            
                # Create function call output
                call_output = {
                    "type": "function_call_output",
                    "call_id": item.get("call_id"),
                    "output": str(result),
                }
            
                result = [call_output]
                await self._on_function_call_end(item, result)
                return result
        except ToolError as e:
            return [make_tool_error_item(repr(e), call_id)]

        return []

    # ============================================================================
    # MAIN AGENT LOOP
    # ============================================================================
    
    async def run(
        self,
        messages: Messages,
        stream: bool = False,
        **kwargs
    ) -> AsyncGenerator[Dict[str, Any], None]:
        """
        Run the agent with the given messages using Computer protocol handler pattern.
        
        Args:
            messages: List of message dictionaries
            stream: Whether to stream the response
            **kwargs: Additional arguments
            
        Returns:
            AsyncGenerator that yields response chunks
        """
        if not self.agent_config_info:
            raise ValueError("Agent configuration not found")
        
        capabilities = self.get_capabilities()
        if "step" not in capabilities:
            raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support step predictions")

        await self._initialize_computers()
        
        # Merge kwargs
        merged_kwargs = {**self.kwargs, **kwargs}
        
        old_items = self._process_input(messages)
        new_items = []

        # Initialize run tracking
        run_kwargs = {
            "messages": messages,
            "stream": stream,
            "model": self.model,
            "agent_loop": self.agent_config_info.agent_class.__name__,
            **merged_kwargs
        }
        await self._on_run_start(run_kwargs, old_items)

        # Seed loop_kwargs so _on_run_end always has kwargs to report, even
        # when a callback halts the run before the first iteration (the
        # original raised NameError in that case).
        loop_kwargs: Dict[str, Any] = dict(run_kwargs)

        # Loop until the model produces a final assistant message
        while not (new_items and new_items[-1].get("role") == "assistant"):
            # Lifecycle hook: Check if we should continue based on callbacks (e.g., budget manager)
            should_continue = await self._on_run_continue(run_kwargs, old_items, new_items)
            if not should_continue:
                break

            # Lifecycle hook: Prepare messages for the LLM call
            # Use cases:
            # - PII anonymization
            # - Image retention policy
            combined_messages = old_items + new_items
            combined_messages = replace_failed_computer_calls_with_function_calls(combined_messages)
            preprocessed_messages = await self._on_llm_start(combined_messages)
            
            loop_kwargs = {
                "messages": preprocessed_messages,
                "model": self.model,
                "tools": self.tool_schemas,
                "stream": False,
                "computer_handler": self.computer_handler,
                "max_retries": self.max_retries,
                "use_prompt_caching": self.use_prompt_caching,
                **merged_kwargs
            }

            # Run agent loop iteration
            result = await self.agent_loop.predict_step(
                **loop_kwargs,
                _on_api_start=self._on_api_start,
                _on_api_end=self._on_api_end,
                _on_usage=self._on_usage,
                _on_screenshot=self._on_screenshot,
            )
            result = get_json(result)
            
            # Lifecycle hook: Postprocess messages after the LLM call
            # Use cases:
            # - PII deanonymization (if you want tool calls to see PII)
            result["output"] = await self._on_llm_end(result.get("output", []))
            await self._on_responses(loop_kwargs, result)
            
            # Yield agent response
            yield result

            # Add agent response to new_items (default to [] so a missing
            # "output" key cannot raise TypeError on concatenation)
            new_items += result.get("output", [])

            # Get output call ids already satisfied by the loop itself
            output_call_ids = get_output_call_ids(result.get("output", []))

            # Handle computer actions
            for item in result.get("output", []):
                partial_items = await self._handle_item(item, self.computer_handler, ignore_call_ids=output_call_ids)
                new_items += partial_items

                # Yield partial response
                yield {
                    "output": partial_items,
                    "usage": Usage(
                        prompt_tokens=0,
                        completion_tokens=0,
                        total_tokens=0,
                    )
                }
        
        await self._on_run_end(loop_kwargs, old_items, new_items)
    
    async def predict_click(
        self,
        instruction: str,
        image_b64: Optional[str] = None
    ) -> Optional[Tuple[int, int]]:
        """
        Predict click coordinates based on image and instruction.
        
        Args:
            instruction: Instruction for where to click
            image_b64: Base64 encoded image (optional, will take screenshot if not provided)
            
        Returns:
            None or tuple with (x, y) coordinates
        """
        if not self.agent_config_info:
            raise ValueError("Agent configuration not found")
        
        capabilities = self.get_capabilities()
        if "click" not in capabilities:
            raise ValueError(f"Agent loop {self.agent_config_info.agent_class.__name__} does not support click predictions")
        if hasattr(self.agent_loop, 'predict_click'):
            if not image_b64:
                if not self.computer_handler:
                    raise ValueError("Computer tool or image_b64 is required for predict_click")
                image_b64 = await self.computer_handler.screenshot()
            return await self.agent_loop.predict_click(
                model=self.model,
                image_b64=image_b64,
                instruction=instruction
            )
        return None
    
    def get_capabilities(self) -> List[AgentCapability]:
        """
        Get list of capabilities supported by the current agent config.
        
        Returns:
            List of capability strings (e.g., ["step", "click"])
        """
        if not self.agent_config_info:
            raise ValueError("Agent configuration not found")
        
        if hasattr(self.agent_loop, 'get_capabilities'):
            return self.agent_loop.get_capabilities()
        return ["step"]  # Default capability
```
Page 15/21FirstPrevNextLast